<div class="list-group" id="list-tab" role="tablist">
<h1 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0; color:#ff6666' role="tab" aria-controls="home"><center>Narrative Identification on Demonetization Tweets</center></h1>

In [1]:
keyword = "demonetization" 
number = 10000
filename = "demonetization-tweets_Clusters.csv"
file_count = "demonetization-tweets"

In [2]:
import pandas as pd
import numpy as np
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize,TweetTokenizer

# Step 1: Import data

In [3]:
# Identify the encoding of the data file
import chardet
with open('../input/demonetizationtweetscsv/demonetization-tweets.csv','rb') as f:
    result = chardet.detect(f.read())  
result #Windows-1252

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

# Step 2: Clean the tweets


In [4]:
# Import the data file
df=pd.read_csv('../input/demonetizationtweetscsv/demonetization-tweets.csv',encoding=result['encoding'])
df= df.drop(['Unnamed: 0'],axis=1)
df=df[0:number]
df=df['text']
df=pd.DataFrame({'tweet':df})

#clean the tweets
df['cleaned_tweet']= df['tweet'].replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n|"', '', regex=True)
df['cleaned_tweet']=df['cleaned_tweet'].replace("  "," ")

words_remove = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "on", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was"]

def cleantext(df, words_to_remove=words_remove):
    # removing emoticons from th tweets, wont help in topic modelling or semantic processing
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'<ed>','', regex = True)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\B<U+.*>|<U+.*>\B|<U+.*>','', regex = True)
    
    # convert tweets to lowercase
    df['cleaned_tweet']=df['cleaned_tweet'].str.lower()
    
    # remove user mentions
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(@\w+)',"", regex=True)
    
    # remove 'rt' or retweet in the beginning
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(rt @)',"",regex=True)
    
    #remove symbols
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[^a-zA-Z0-9]', " ",regex=True)
    
    #remove punctuations
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)
    
    #remove_URL(x)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https.*$',"",regex=True)
    
    # remove 'amp' in the text
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'amp',"",regex=True)
    
    #remove words of length 1 or 2
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\b[a-zA-Z]{1,2}\b',"",regex=True)
    
    #remove extra spaces in the tweet
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^\s+|\s+$'," ", regex=True)
    
    #remove stopwords and words_to_remove
    stop_words=set(stopwords.words('english'))
    mystopwords=[stop_words,'via',words_remove]
    
    # removing stopwords
    df['fully_cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in mystopwords]))
    
    return df

#get the processed tweets
df=cleantext(df)

  re.compile(obj)


**Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement. Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].**

In [5]:
# Sentiment Analysis
from textblob import TextBlob
df['sentiment']=df['fully_cleaned_tweet'].apply(lambda x:TextBlob(x).sentiment.polarity) #range -1 to 1


In [6]:
df.head(2)

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.15
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0


# Step 3: Vectorize the tweets

In [7]:
df['tokenized_tweet']=df['fully_cleaned_tweet'].apply(word_tokenize)
df.head(2)

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,0.15,"[rssurjewala, critical, question, was, paytm, ..."
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0,"[hemant, 80, did, you, vote, demonetization, m..."


In [8]:
#if a word has a digit, remove that word
df['tokenized_tweet']=df['tokenized_tweet'].apply(lambda x: [y for y in x if not any(c.isdigit() for c in y)])

In [9]:
# Set values for various parameters
num_features=100 # Word vector dimensionality
min_word_count=1 # minimum word count
num_workers=4  # Number of threads to run in parallel
context=10 # context window size

In [10]:
# Initilaize and train the model 
from gensim.models import word2vec
print('Training Model....')
model= word2vec.Word2Vec(df['tokenized_tweet'],workers=num_workers,size=num_features,min_count=min_word_count,
                        window=context)
# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

Training Model....


### Find vector corresponding to each tweet
Take the average of all word vectors in a tweet

In [11]:
vocab=list(model.wv.vocab)
def sentence_vector(sentence, model):
    nwords=0
    featureV=np.zeros(100, dtype='float32')
    for word in sentence:
        if word not in vocab:
            continue
        featureV=np.add(featureV, model[word])
        nwords=nwords+1
    if nwords>0:
        featureV=np.divide(featureV,nwords)
    return featureV

tweet_vector= df['tokenized_tweet'].apply(lambda x: sentence_vector(x,model))
tweet_vector= tweet_vector.apply(pd.Series)

  


In [12]:
# Tweet vector should vary from 0 to 1 (normalise the vector)
#Tweet vector should vary from 0 to 1 (normalize the vector)
for x in range(len(tweet_vector)):
    x_min = tweet_vector.iloc[x].min()
    x_max = tweet_vector.iloc[x].max()
    X  = tweet_vector.iloc[x]
    i = 0
    if (x_max - x_min) == 0:
        for y in X:
            tweet_vector.iloc[x][i] = (1/len(tweet_vector.iloc[x]))
            i = i + 1
    else:
        for y in X:
            tweet_vector.iloc[x][i] = ((y - x_min)/(x_max - x_min))
            i = i + 1



In [13]:
tweet_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.372837,0.545694,0.312134,0.406155,0.419439,0.131938,0.516665,0.455538,0.305514,0.341636,...,0.300012,0.382390,0.733279,0.027603,0.375121,0.293636,0.615567,0.155568,0.644286,0.663150
1,0.377996,0.521854,0.181112,0.599348,0.408489,0.395992,0.599420,0.261847,0.392707,0.225454,...,0.434284,0.505318,0.438099,0.246737,0.284222,0.398768,0.678347,0.239313,0.863538,0.617505
2,0.343447,0.531399,0.203889,0.381148,0.416401,0.272614,0.535161,0.117553,0.402884,0.249198,...,0.388092,0.473040,0.488666,0.314732,0.304895,0.406100,0.630012,0.078014,0.735415,0.631259
3,0.294992,0.522601,0.236966,0.390040,0.415191,0.324684,0.533012,0.089598,0.409144,0.237142,...,0.383553,0.467209,0.459293,0.388277,0.316471,0.414858,0.606502,0.049513,0.727050,0.664490
4,0.391451,0.329246,0.220099,0.279498,0.395203,0.402952,0.436287,0.013495,0.399426,0.325041,...,0.351659,0.475781,0.483794,0.390450,0.169674,0.343840,0.521581,0.145253,0.765088,0.522121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.341186,0.548719,0.232380,0.495930,0.497329,0.217148,0.574678,0.119535,0.385406,0.230178,...,0.519348,0.418279,0.539634,0.290040,0.312487,0.414291,0.538317,0.000000,0.649007,0.665764
9996,0.515508,0.531603,0.165854,0.416848,0.427287,0.379006,0.552947,0.225876,0.525852,0.287786,...,0.371843,0.614178,0.463436,0.258661,0.351038,0.518822,0.777970,0.262985,0.928042,0.648654
9997,0.494650,0.543870,0.280147,0.483720,0.497321,0.307066,0.628304,0.326802,0.406790,0.287471,...,0.544162,0.505814,0.519237,0.248153,0.375796,0.488674,0.675969,0.277095,0.752114,0.564119
9998,0.515508,0.531603,0.165854,0.416848,0.427287,0.379006,0.552947,0.225876,0.525852,0.287786,...,0.371843,0.614178,0.463436,0.258661,0.351038,0.518822,0.777970,0.262985,0.928042,0.648654


# Step 4: Add sentiment to the tweet vector

In [14]:
# Scale the 'sentiment' vector
# Sentiment varies from -1(Negative Sentiment) to +1(Positive Sentiment) polarity
def sentiment(x):
    if x < 0.04:
        return 0 #(Neutral sentiment)
    elif x>0.04:
        return 1 #(Positive Sentiment)
    else:
        return 0.5 #(Negative Sentiment)

tweet_vector[100]=df['sentiment'].apply(lambda x: sentiment(x)) # Adding 100 coumn for sentiment

In [15]:
tweet_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.372837,0.545694,0.312134,0.406155,0.419439,0.131938,0.516665,0.455538,0.305514,0.341636,...,0.382390,0.733279,0.027603,0.375121,0.293636,0.615567,0.155568,0.644286,0.663150,1
1,0.377996,0.521854,0.181112,0.599348,0.408489,0.395992,0.599420,0.261847,0.392707,0.225454,...,0.505318,0.438099,0.246737,0.284222,0.398768,0.678347,0.239313,0.863538,0.617505,0
2,0.343447,0.531399,0.203889,0.381148,0.416401,0.272614,0.535161,0.117553,0.402884,0.249198,...,0.473040,0.488666,0.314732,0.304895,0.406100,0.630012,0.078014,0.735415,0.631259,0
3,0.294992,0.522601,0.236966,0.390040,0.415191,0.324684,0.533012,0.089598,0.409144,0.237142,...,0.467209,0.459293,0.388277,0.316471,0.414858,0.606502,0.049513,0.727050,0.664490,0
4,0.391451,0.329246,0.220099,0.279498,0.395203,0.402952,0.436287,0.013495,0.399426,0.325041,...,0.475781,0.483794,0.390450,0.169674,0.343840,0.521581,0.145253,0.765088,0.522121,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.341186,0.548719,0.232380,0.495930,0.497329,0.217148,0.574678,0.119535,0.385406,0.230178,...,0.418279,0.539634,0.290040,0.312487,0.414291,0.538317,0.000000,0.649007,0.665764,0
9996,0.515508,0.531603,0.165854,0.416848,0.427287,0.379006,0.552947,0.225876,0.525852,0.287786,...,0.614178,0.463436,0.258661,0.351038,0.518822,0.777970,0.262985,0.928042,0.648654,1
9997,0.494650,0.543870,0.280147,0.483720,0.497321,0.307066,0.628304,0.326802,0.406790,0.287471,...,0.505814,0.519237,0.248153,0.375796,0.488674,0.675969,0.277095,0.752114,0.564119,1
9998,0.515508,0.531603,0.165854,0.416848,0.427287,0.379006,0.552947,0.225876,0.525852,0.287786,...,0.614178,0.463436,0.258661,0.351038,0.518822,0.777970,0.262985,0.928042,0.648654,1


In [16]:
#Updating the 'sentiment' column in df also
df['sentiment'] = tweet_vector[100]
df.head(3)

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,1,"[rssurjewala, critical, question, was, paytm, ..."
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0,"[hemant, did, you, vote, demonetization, modi,..."
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0,"[roshankar, former, finsec, rbi, governor, cbd..."


# Step 5: Cluster the narratives [= opinions + expressions]

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,silhouette_samples

range_n_clusters=[4,6,7,8,9,10,11,12,14]
X= tweet_vector
n_best_clusters=0
silhouette_best = 0
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.  
    clusterer=KMeans(n_clusters=n_clusters,random_state=42)
    cluster_labels=clusterer.fit_predict(X)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg=silhouette_score(X,cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    
    if silhouette_avg > silhouette_best:
        silhouette_best = silhouette_avg
        n_best_clusters = n_clusters

For n_clusters = 4 The average silhouette_score is : 0.2307716752455403
For n_clusters = 6 The average silhouette_score is : 0.25704949795338566
For n_clusters = 7 The average silhouette_score is : 0.28307576663345596
For n_clusters = 8 The average silhouette_score is : 0.2795376765963954
For n_clusters = 9 The average silhouette_score is : 0.3089161759532428
For n_clusters = 10 The average silhouette_score is : 0.306817928946136
For n_clusters = 11 The average silhouette_score is : 0.3116391187095521
For n_clusters = 12 The average silhouette_score is : 0.3274158684148053
For n_clusters = 14 The average silhouette_score is : 0.31695794523603954


In [18]:
print(n_best_clusters)
print(silhouette_best)

12
0.3274158684148053


In [19]:
clusterer=KMeans(n_clusters=n_best_clusters,random_state=42)
cluster_labels=clusterer.fit_predict(X)

In [20]:
np.unique(cluster_labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int32)

In [21]:
#Array of tweets, the corresponding cluster number, sentiment
finaldf = pd.DataFrame({'cl_num': cluster_labels,'fully_cleaned_tweet': df['fully_cleaned_tweet'], 'cleaned_tweet': df['cleaned_tweet'], 'tweet': df['tweet'],'sentiment': df['sentiment']})
finaldf = finaldf.sort_values(by=['cl_num'])

In [22]:
df['cl_num']=cluster_labels
df.head(3)

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num
0,RT @rssurjewala: Critical question: Was PayTM ...,rssurjewala critical question was paytm info...,rssurjewala critical question was paytm inform...,1,"[rssurjewala, critical, question, was, paytm, ...",10
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0,"[hemant, did, you, vote, demonetization, modi,...",4
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0,"[roshankar, former, finsec, rbi, governor, cbd...",1


In [23]:
dfOrdered = pd.DataFrame(df)

#Compute how many times a tweet has been 'retweeted' - that is, how many rows in dfOrdered are identical
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(tuple)
dfUnique = dfOrdered.groupby(['tweet', 'cleaned_tweet', 'fully_cleaned_tweet', 'sentiment','tokenized_tweet', 'cl_num']).size().reset_index(name="freq")
dfUnique = dfUnique.sort_values(by=['cl_num'])

In [24]:
dfUnique['tokenized_tweet'] = dfUnique['tokenized_tweet'].apply(list)
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(list)

In [25]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
1623,It's amusing to see people who have elected Tr...,its amusing see people who have elected trump...,its amusing see people who have elected trump ...,1,"[its, amusing, see, people, who, have, elected...",0,1
659,@NewsX @sushmitadevmp makes it very clear that...,sushmitadevmp makes very clear that except f...,sushmitadevmp makes very clear that except for...,1,"[sushmitadevmp, makes, very, clear, that, exce...",0,1
660,@Nishith1608 It's very balanced piece BJP has ...,its very balanced piece bjp has made gains tm...,its very balanced piece bjp has made gains tmc...,1,"[its, very, balanced, piece, bjp, has, made, g...",0,1
2683,RT @muglikar_: Ughhh new spin in the market: #...,muglikar ughhh new spin the market demone...,muglikar ughhh new spin the market demonetizat...,1,"[muglikar, ughhh, new, spin, the, market, demo...",0,9
662,"@NitishKumar Stop this fence sitting exercise,...",stop this fence sitting exercise make your s...,stop this fence sitting exercise make your sta...,1,"[stop, this, fence, sitting, exercise, make, y...",0,1
...,...,...,...,...,...,...,...
2768,RT @rpollard: More currency shortages across I...,rpollard more currency shortages across india...,rpollard more currency shortages across india ...,1,"[rpollard, more, currency, shortages, across, ...",11,1
407,#news #summary: #world #bank backs #demonet...,news summary world bank backs demonet...,news summary world bank backs demonetization s...,1,"[news, summary, world, bank, backs, demonetiza...",11,1
2809,RT @shilpakannan: India's Central Bank comes u...,shilpakannan indias central bank comes with ...,shilpakannan indias central bank comes with mo...,1,"[shilpakannan, indias, central, bank, comes, w...",11,3
1715,More currency shortages across India #demoneti...,more currency shortages across india demoneti...,more currency shortages across india demonetiz...,1,"[more, currency, shortages, across, india, dem...",11,1


### Discard the clusters with poor Silhouette score

In [26]:
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)

poor_cluster_indices = []
avg_cluster_sil_score = []

for i in range(n_best_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        avgscore = (np.mean(ith_cluster_silhouette_values))   #average silhouette score for each cluster
        avg_cluster_sil_score = np.append(avg_cluster_sil_score, avgscore)
        print('Cluster',i, ':', avgscore)
        if avgscore < 0.2:
            poor_cluster_indices = np.append(poor_cluster_indices, i)
            
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]

Cluster 0 : 0.23703967775473345
Cluster 1 : 0.3092625822574267
Cluster 2 : 0.3434782819851654
Cluster 3 : 0.5191241697375981
Cluster 4 : 0.13044861714973333
Cluster 5 : 0.4560368695953895
Cluster 6 : 0.41968844808574157
Cluster 7 : 0.31383930378836283
Cluster 8 : 1.0
Cluster 9 : 0.3252568605534971
Cluster 10 : 0.4907738588146729
Cluster 11 : 0.2169691656781262


In [27]:
poor_cluster_indices

array([4.])

In [28]:
#remove those rows where cluster value match poor_cluster_indices 
avg_cluster_sil_score_final = []
cluster_name = np.unique(dfOrdered['cl_num'])

if (len(poor_cluster_indices)!=0):
    n_final_clusters = n_best_clusters - len(poor_cluster_indices)
    for i in poor_cluster_indices:
        dfUnique = dfUnique[dfUnique['cl_num'] != i]
    for j in cluster_name:
        if j not in poor_cluster_indices:    
            avg_cluster_sil_score_final = np.append(avg_cluster_sil_score_final, avg_cluster_sil_score[j])
            
    cluster_name = np.unique(dfUnique['cl_num'])
    

In [29]:
dfUnique['cl_num']=abs(dfUnique['cl_num'])
dfUnique=dfUnique.sort_values(by=['cl_num'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Step 6: Calculate abstraction and expression for each narrative 
Note that each cluster represents a narrative

In [30]:
tweets_to_consider = 'fully_cleaned_tweet'

In [31]:
final_clusters= np.unique(dfUnique['cl_num'])
print(final_clusters)

[ 0  1  2  3  5  6  7  8  9 10 11]


In [32]:
# Store all tweets corrsponding to each cluster in a file
for i in final_clusters:
    with open('./tweets_Cluster_'+str(i)+'.txt','w') as out:
        y=''
        for x in dfUnique[tweets_to_consider][dfUnique.cl_num==i]:
            y=y+x+'. '
        out.write(y)
        out.close

In [33]:
#A combination of (Noun, adjective, cardinal number, foreign word and Verb) are being extracted now
#Extract chunks matching pattern. Patterns are:
#1) Noun phrase (2 or more nouns occurring together. Ex United states of America, Abdul Kalam etc)
#2) Number followed by Noun (Ex: 28 Terrorists, 45th President)
#3) Adjective followed by Noun (Ex: Economic impact, beautiful inauguration)
#4) Foreign word (Ex: Jallikattu, Narendra modi, Pappu)
#5) Noun followed by Verb (Ex: Terrorists arrested)
#And a combination of all 5
        
import re
import nltk

phrases = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})


A = '(CD|JJ)/\w+\s'  #cd or jj
B = '(NN|NNS|NNP|NNPS)/\w+\s'  #nouns
C = '(VB|VBD|VBG|VBN|VBP|VBZ)/\w+\s' #verbs
D = 'FW/\w+\s'  #foreign word
patterns = ['('+A+B+')+', '('+D+B+')+','('+D+')+', '('+B+')+', '('+D+A+B+')+', 
           '('+B+C+')+', '('+D+B+C+')+', '('+B+A+B+')+', '('+B+B+C+')+'] 


def extract_phrases(tag1, tag2, sentences):
    extract_phrase = []
    for sentence in sentences:
        phrase = []
        next_word = 0
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if next_word == 1:
                next_word = 0
                if pos == tag2:
                    extract_phrase = np.append(extract_phrase,phrase + ' ' + word) 
            
            if pos == tag1:
                next_word = 1
                phrase = word
    return extract_phrase

for i in cluster_name:
    File = open('./tweets_Cluster_'+str(i)+'.txt', 'r') #open file
    lines = File.read() #read all lines
    sentences = nltk.sent_tokenize(lines) #tokenize sentences

    for sentence in sentences: 
        f = nltk.pos_tag(nltk.word_tokenize(sentence))
        tag_seq = []
        for word, pos in f:
            tag_seq.append(pos+'/'+ word)
        X = " ".join(tag_seq)

        phrase = []
        for j in range(len(patterns)):
            if re.search(patterns[j], X):
                phrase.append(' '.join([word.split('/')[1] for word in re.search(patterns[j], X).group(0).split()]))
    
        k = pd.DataFrame({'extracted_phrases': np.unique(phrase), 'cluster_num': int(i)})
    
        phrases = pd.concat([phrases,k], ignore_index = True)

print(phrases)

              extracted_phrases  cluster_num
0                    brexit are          0.0
1                        people          0.0
2                    see people          0.0
3                          govt          0.0
4                    govt wants          0.0
...                         ...          ...
9617  positive impact long term         11.0
9618            economic impact         11.0
9619      impact demonetization         11.0
9620                 shops have         11.0
9621       showrooms shops have         11.0

[9622 rows x 2 columns]


### Keeping the largest phrase

In [34]:
#For each phrase identified replace all the substrings by the largest phrase 
#Ex: lakh looted,40 lakh looted and Rs 40 lakh looted, replace all by single largest phrase - Rs 40 lakh looted 
#i.e. instead of 3 different phrases, there will be only one large phrase

phrases_final = pd.DataFrame({'extracted_phrases': [], 'cluster_num': []})
for i in cluster_name:
    phrases_for_each_cluster = []
    cluster_phrases = phrases['extracted_phrases'][phrases.cluster_num == i]
    cluster_phrases = np.unique(np.array(cluster_phrases))
    for j in range(len(cluster_phrases)):
        
        phrase = cluster_phrases[j]
        updated_cluster_phrases = np.delete((cluster_phrases), j)
        if any(phrase in phr for phr in updated_cluster_phrases): 
            'y'
        else: 
            #considering phrases of length greater than 1 only
            if (len(phrase.split(' '))) > 1:
                phrases_for_each_cluster.append(phrase)
    k = pd.DataFrame({'extracted_phrases': phrases_for_each_cluster, 'cluster_num': int(i) })
    
    phrases_final = pd.concat([phrases_final,k], ignore_index = True)

In [35]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num
0,1 indians,0.0
1,1 request,0.0
2,10 currencies,0.0
3,10 sneezes,0.0
4,100 rupee,0.0
...,...,...
4447,smita dutta,11.0
4448,srinagartimes fresh currency,11.0
4449,supports note,11.0
4450,taxmannindia demonetization,11.0


### For each phrase in each cluster, calculate term frequency 

In [36]:
#Term-frequency : For each cluster, calculate the number of times a given phrase occur in the tweets of that cluster

phrases_final['term_freq'] = len(phrases_final)*[0]

for i in cluster_name:
    for phrase in phrases_final['extracted_phrases'][phrases_final.cluster_num == i]:
        tweets = dfUnique[tweets_to_consider][dfUnique.cl_num == i]
        for tweet in tweets:
            if phrase in tweet:
                phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['term_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [37]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num,term_freq
0,1 indians,0.0,1
1,1 request,0.0,1
2,10 currencies,0.0,1
3,10 sneezes,0.0,2
4,100 rupee,0.0,2
...,...,...,...
4447,smita dutta,11.0,1
4448,srinagartimes fresh currency,11.0,1
4449,supports note,11.0,2
4450,taxmannindia demonetization,11.0,1


In [38]:
#Document-frequency
phrases_final['doc_freq'] = len(phrases_final)*[0]


# for each phrase, compute the number of clusters that Sphrase occurs in
for phrase in phrases_final['extracted_phrases']:
    for i in cluster_name:
        all_tweets = ''
        for tweet in dfUnique[tweets_to_consider][dfUnique.cl_num == i]:
            all_tweets = all_tweets + tweet + '. ' 
        if phrase in all_tweets:
            phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] = phrases_final['doc_freq'][(phrases_final.extracted_phrases == phrase) & (phrases_final.cluster_num == i)] + 1
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [39]:
import math
phrases_final['doc_freq'] = phrases_final['doc_freq'].apply(lambda x: math.log10(n_best_clusters/(x)) )

### For each phrase in each cluster, calculate tf-idf

In [40]:
phrases_final['tf-idf'] = phrases_final['term_freq']*phrases_final['doc_freq']

In [41]:
phrases_final

Unnamed: 0,extracted_phrases,cluster_num,term_freq,doc_freq,tf-idf
0,1 indians,0.0,1,1.079181,1.079181
1,1 request,0.0,1,1.079181,1.079181
2,10 currencies,0.0,1,1.079181,1.079181
3,10 sneezes,0.0,2,1.079181,2.158362
4,100 rupee,0.0,2,1.079181,2.158362
...,...,...,...,...,...
4447,smita dutta,11.0,1,1.079181,1.079181
4448,srinagartimes fresh currency,11.0,1,1.079181,1.079181
4449,supports note,11.0,2,1.079181,2.158362
4450,taxmannindia demonetization,11.0,1,1.079181,1.079181


### For each cluster find top few phrases and respective sentiment
 

In [42]:
phrases_final['diff_tf-idf'] = len(phrases_final)*[0]

narrative = pd.DataFrame({'cl_num': [], 'abstraction': []})
for i in cluster_name: 
    # arrange in descending order of tf-idf score
    phrases_final = phrases_final.sort_values(['cluster_num','tf-idf'], ascending=[1,0])
    
    #Break this distribution at a point where the difference between any consecutive phrases is maximum
    #difference between consecutive values of tf-idf 
    phrases_final['diff_tf-idf'][phrases_final.cluster_num == i] = abs(phrases_final['tf-idf'][phrases_final.cluster_num == i] - phrases_final['tf-idf'][phrases_final.cluster_num == i].shift(1))

    #The last value for each cluster will be 'NaN'. Replacing it with '0'. 
    phrases_final = phrases_final.fillna(0)
    
    phrases_final = phrases_final.reset_index(drop = True) #to avoid old index being added as a new column
    if len(phrases_final[phrases_final.cluster_num == i]) != 0:
        
        #index corresponding to the highest difference
 
        ind = (phrases_final['diff_tf-idf'][phrases_final.cluster_num == i]).idxmax()
        
        abstract = phrases_final['extracted_phrases'][:ind+1][phrases_final.cluster_num == i]
    
    
        #store the abstraction corresponding to each cluster
        k = pd.DataFrame({'cl_num': int(i), 'abstraction': abstract})
        narrative = pd.concat([narrative,k], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the

In [43]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
1623,It's amusing to see people who have elected Tr...,its amusing see people who have elected trump...,its amusing see people who have elected trump ...,1,"[its, amusing, see, people, who, have, elected...",0,1
275,#NetasCASHIn Govt wants debate over #DeMonetiz...,netascashin govt wants debate over demonetiz...,netascashin govt wants debate over demonetizat...,1,"[netascashin, govt, wants, debate, over, demon...",0,1
276,#NetasCASHIn Govt wants debate over #DeMonetiz...,netascashin govt wants debate over demonetiz...,netascashin govt wants debate over demonetizat...,1,"[netascashin, govt, wants, debate, over, demon...",0,1
1752,New Delhi: @AamAadmiParty demonstration agains...,new delhi aamaadmiparty demonstration agains...,new delhi aamaadmiparty demonstration against ...,1,"[new, delhi, aamaadmiparty, demonstration, aga...",0,1
1753,New Delhi: Opposition leaders demonstration ag...,new delhi opposition leaders demonstration ag...,new delhi opposition leaders demonstration aga...,1,"[new, delhi, opposition, leaders, demonstratio...",0,1
...,...,...,...,...,...,...,...
1936,"RT @AshwinderRaj: ""Impact of Demonetization on...",ashwinderraj impact demonetization resident...,ashwinderraj impact demonetization residential...,1,"[ashwinderraj, impact, demonetization, residen...",11,1
2019,RT @DipendraDipzo: #Demonetization effect \r\n...,dipendradipzo demonetization effect here co...,dipendradipzo demonetization effect here comes...,1,"[dipendradipzo, demonetization, effect, here, ...",11,40
753,@Vernaculis This crap will just keep getting p...,this crap will just keep getting pushed more ...,this crap will just keep getting pushed more a...,1,"[this, crap, will, just, keep, getting, pushed...",11,1
1773,Note ban will have positive impact in long ter...,note ban will have positive impact long term ...,note ban will have positive impact long term w...,1,"[note, ban, will, have, positive, impact, long...",11,1


In [44]:
#Assigning polarity based on the sentiment for each tweet 2=negative, 1=positive, 3=neutral
dfUnique['polarity'] = np.NaN
dfUnique['polarity'][dfUnique.sentiment == 0.5] = "3"
dfUnique['polarity'][dfUnique.sentiment == 1] = "1"
dfUnique['polarity'][dfUnique.sentiment == 0] = "2"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Assign the sentiment to each extracted phrases
count the number of tweets, a phrase has occurred in positive, negative and neutral context. Assign the most occurred sentiment to the phrase

In [45]:
from collections import Counter

#find the highest occurring sentiment corresponding to each tweet
def find_mode(a):
    b = Counter(a).most_common(3)
    mode = []; c_max = 0
    for a,c in b:
        if c>c_max:
            c_max = c
        if c_max == c:
            mode.append(a)  
    print(mode)
    mode.sort()
    print(mode)
    
    ## if mode is 3&2 i.e. neutral and negative, assign the overall sentiment for that phrase as negative, 
    ## if mode is 3&1 i.e. neutral and positive, assign the overall sentiment for that phrase as positive,
    ## if mode is 2&1 i.e. negative and positive, assign the overall sentiment for that phrase as neutal, 
    ## if mode is 3&2&1 i.e. negative, positive and neutral, assign the overall sentiment for that phrase as neutral
    
    if len(mode) == 1:
        return mode[0]
    
    elif (len(mode) == 2) & (mode[1]=='3'):
        return mode[0]
    else:
        return 3
    
#1=>+ve 2=>-ve 3=>Neutral
narrative['expression'] = -1
dfUnique = dfUnique.reset_index(drop = True)
for i in cluster_name:
    tweets = dfUnique[tweets_to_consider][dfUnique.cl_num == i]
    abstracts = narrative['abstraction'][narrative.cl_num == i] 
    for abst in abstracts:
        sent = []
        for tweet, polarity in zip(dfUnique[tweets_to_consider][dfUnique.cl_num == i], dfUnique['polarity'][dfUnique.cl_num == i]):
            if abst in tweet:
                sent = np.append(sent, polarity)
        
        
        if len(sent)!=0:
            ## if mode is 3&2-2, 3&1-1, 2&1-3, 3&2&1 - 3
            senti = find_mode(sent)
            if senti == '2':
                sent_value = "Negative"
            elif senti == '1':
                sent_value = "Positive"
            else:
                sent_value = "Neutral"
            narrative['expression'][(narrative.abstraction == abst) & (narrative.cl_num == i)] = sent_value
        

['1']
['1']
['1']
['1']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['2']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']
['1']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




# Save the narratives in excel file
 With each sheet in the file representing 1 narrative ( == 1 cluster)

In [46]:
#sudo pip install xlwt
#sudo pip3 install openpyxl
from pandas import ExcelWriter

#Save the narratives in an excel file 

writer = pd.ExcelWriter('narrative.xlsx')
for i in cluster_name:
    df1 = pd.DataFrame(dfUnique[['tweet','freq']][dfUnique.cl_num == i]).sort_values(['freq'], ascending = [0])
    df1 = pd.DataFrame({'tweet': dfUnique['tweet'][dfUnique.cl_num == i], 'freq': dfUnique['freq'][dfUnique.cl_num == i]}) 
    df1 = df1.sort_values(['freq'], ascending = [0]) 

    df2 = pd.DataFrame({ 'abstraction': narrative['abstraction'][narrative.cl_num == i], 'expression': narrative['expression'][narrative.cl_num == i]})
    df3 = pd.DataFrame({'abstraction': (len(df1)-len(df2))*['-'], 'expression': (len(df1)-len(df2))*['-']})
    df2 = df2.append(df3)

    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    df1['abstraction'] = df2['abstraction']
    df1['expression'] = df2['expression']

    df1.to_excel(writer,'narrative_cluster'+str(i))

writer.save()
    

In [47]:
narrative

Unnamed: 0,cl_num,abstraction,expression
0,0.0,people have,Positive
1,0.0,amul milk,Positive
2,1.0,demonetization issue,Negative
3,1.0,flawless kick,Negative
4,1.0,mannkibaat jankibaat,Negative
5,1.0,hear navkendar,Negative
6,2.0,6 months,Negative
7,2.0,aap join,Negative
8,2.0,govt explains,Negative
9,2.0,samajwadi partys jaya bachchan,Negative


<div class="list-group" id="list-tab" role="tablist">
<h1 class="list-group-item list-group-item-action active" data-toggle="list" style='background:black; border:0; color:#ff6666' role="tab" aria-controls="home"><center>Thank You 🙏 </center></h1>
