# Latent Dirichlet Allocation
[SXSW Tweets Sentiment Analysis](https://github.com/czarinagluna/Twitter-Sentiment-Analysis/blob/main/sxsw-sentiment-analysis.ipynb)

Authors: Marcelo Scatena, Czarina Luna, Piotr Czolpik, Ross McKim

## Import dataset

In [1]:
%store -r X_train_processed

In [2]:
%store -r X_test_processed

In [3]:
%store -r y_train

In [4]:
%store -r y_test

In [5]:
import pandas as pd

#### Join all data

In [6]:
df_train=pd.concat([X_train_processed, y_train], axis=1)
df_test=pd.concat([X_test_processed, y_test], axis=1)

In [7]:
df_all=pd.concat([df_train, df_test])

In [8]:
df_all.head()

Unnamed: 0,text,target
6488,putting gun head give iphone,2
1944,virtualwallet nfc iphone5 bc standardization ...,0
6869,want win ticket 1 party rule simple android un...,1
3640,still big line outside apple pop shop 3 day ip...,1
7209,go without saying google bread going amazing g...,2


In [9]:
#Dependencies

import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
import regex as re

import warnings
warnings.simplefilter('ignore')
from itertools import chain

#### Perform data modeling for LDA:

In [10]:
def tokenize_row(text):
    split=re.split("\W+",text) 
    return split

In [12]:
def remove_blanks(tokens):
    no_blanks = [word for word in tokens if word]
    return no_blanks

In [13]:
def remove_punctuation_row(tokens):
    token_list = [''.join(letter for letter in word if letter not in string.punctuation) for word in tokens]
    return [word for word in token_list if word]

In [14]:
df_all['text'] = df_all['text'].apply(lambda x: tokenize_row(x))
df_all['text'] = df_all['text'].apply(lambda x: remove_punctuation_row(x))

In [15]:
df_all.head()

Unnamed: 0,text,target
6488,"[putting, gun, head, give, iphone]",2
1944,"[virtualwallet, nfc, iphone5, bc, standardizat...",0
6869,"[want, win, ticket, 1, party, rule, simple, an...",1
3640,"[still, big, line, outside, apple, pop, shop, ...",1
7209,"[go, without, saying, google, bread, going, am...",2


## LDA

##### Create Dictionary from the tweets

In [16]:
dictionary = corpora.Dictionary(df_all['text'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

83837


##### Create tweet term matrix

In [17]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all['text']]
print(len(doc_term_matrix))

8909


##### Instantiate LDA model

In [18]:
lda = gensim.models.ldamodel.LdaModel

##### Fit LDA model on the dataset

In [19]:
num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

##### Print the topics identified by LDA model

In [20]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.056*"google" + 0.019*"iphone" + 0.017*"new" + 0.014*"circle" + 0.014*"social" + 0.013*"android" + 0.012*"today" + 0.012*"amp" + 0.011*"app" + 0.011*"launch"'),
 (1,
  '0.050*"ipad" + 0.046*"apple" + 0.030*"store" + 0.024*"2" + 0.020*"austin" + 0.015*"iphone" + 0.012*"pop" + 0.010*"ipad2" + 0.009*"line" + 0.007*"get"')]

With two clusters, LDA identifies Google and IPAD as their clusters.

##### Visualize the LDA model results

In [21]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

##### Find which articles were marked in which cluster

In [22]:
lda_corpus = ldamodel[doc_term_matrix]

In [23]:
doc = [doc for doc in lda_corpus]

In [24]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.5000000000542548


In [25]:
cluster1 = [j for i,j in zip(lda_corpus,df_all.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df_all.index) if i[1][1] > threshold]

print(len(cluster1))
print(len(cluster2))

4292
4616


In [26]:
df_all.iloc[cluster1[:10]]

Unnamed: 0,text,target
2322,"[wait, ipad, 2, also, sale]",2
4792,"[omitting, flash, apple, seems, effective, eve...",2
88,"[thanks, trying, contact, friend, family, japa...",1
4795,"[todo, q, amp, google, amp, bing, ranking, mon...",1
2686,"[far, longest, line, apple, store]",1
7948,"[news, social, important, google, screw, big, ...",1
6520,"[download, free, music, mix, itunes, cc]",1
2853,"[guess, need, figure, session, ipad, app, kill...",2
8480,"[ringo, deathstarr, flooding, ear, canal, red,...",1
1113,"[stupid, apple, opening, temporary, store, aus...",2


In [27]:
df_all.iloc[cluster2[:10]]

Unnamed: 0,text,target
8662,"[google, rumored, unveil, new, social, network...",1
2763,"[new, iphone, new, ipad, 2, come, running, fas...",2
2165,"[congratulation, yes, gowalla, win, best, ando...",2
4914,"[1, lot, fun, hollergram, app, killing]",1
3716,"[good, morning, standing, line, ipad, 2, today...",1
5170,"[apple, elegant, fascist, corporation, america...",0
669,"[need, sweet, mac, goodness, apple, set, tempo...",2
38,"[false, alarm, google, circle, coming, nowand,...",0
4224,"[expect, lot, buzz, ipad, 2, since, come, tomo...",2
4175,"[staying, alive, indie, iphone, game, developm...",1


#### Adding more stopwords:

In [28]:
new_stopwords = ['store', 'amp', 'austin', 'launch', 'new', 'via', 'u', 'w', '3', '4', 'today', 'pop', 'circle', '2']

In [29]:
def remove_stopwords_row(text):
    text=[word for word in text if word not in new_stopwords]
    return text

In [31]:
df_all['text'] = df_all['text'].apply(lambda x: remove_stopwords_row(x))

### Two Clusters, all data model

In [32]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all['text']]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

8909
Wall time: 1min 28s


In [33]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.036*"iphone" + 0.033*"ipad" + 0.021*"google" + 0.019*"app" + 0.014*"social" + 0.014*"android" + 0.010*"network" + 0.009*"free" + 0.008*"called" + 0.007*"major"'),
 (1,
  '0.055*"apple" + 0.040*"google" + 0.024*"ipad" + 0.011*"ipad2" + 0.010*"line" + 0.010*"party" + 0.007*"map" + 0.006*"win" + 0.006*"temporary" + 0.006*"opening"')]

In [34]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### 4 Clusters, all data model

In [35]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all['text']]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=4
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

8909


In [36]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.040*"google" + 0.030*"ipad" + 0.025*"iphone" + 0.018*"app" + 0.015*"android" + 0.014*"mobile" + 0.009*"map" + 0.009*"check" + 0.008*"mayer" + 0.007*"marissa"'),
 (1,
  '0.098*"google" + 0.037*"social" + 0.026*"network" + 0.020*"called" + 0.018*"major" + 0.014*"possibly" + 0.014*"party" + 0.008*"iphone" + 0.007*"app" + 0.007*"bing"'),
 (2,
  '0.095*"apple" + 0.045*"ipad" + 0.020*"ipad2" + 0.018*"line" + 0.011*"temporary" + 0.010*"opening" + 0.010*"open" + 0.009*"downtown" + 0.009*"popup" + 0.007*"location"'),
 (3,
  '0.035*"iphone" + 0.032*"ipad" + 0.015*"free" + 0.012*"win" + 0.011*"app" + 0.008*"get" + 0.008*"android" + 0.008*"party" + 0.007*"come" + 0.007*"music"')]

In [37]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Two Clusters, Negative data model

In [38]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all[df_all['target']==0]['text']]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

569


In [39]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.018*"iphone" + 0.007*"app" + 0.006*"google" + 0.003*"android" + 0.003*"battery" + 0.002*"people" + 0.002*"phone" + 0.002*"say" + 0.002*"know" + 0.002*"go"'),
 (1,
  '0.024*"ipad" + 0.014*"google" + 0.014*"apple" + 0.007*"iphone" + 0.005*"like" + 0.004*"social" + 0.004*"apps" + 0.004*"need" + 0.004*"design" + 0.003*"get"')]

In [40]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Two Clusters, Positive data model

In [41]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all[df_all['target']==2]['text']]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

2968
Wall time: 39.2 s


In [42]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.036*"google" + 0.028*"ipad" + 0.018*"iphone" + 0.015*"app" + 0.009*"android" + 0.008*"party" + 0.007*"great" + 0.006*"social" + 0.006*"one" + 0.006*"map"'),
 (1,
  '0.063*"apple" + 0.031*"ipad" + 0.015*"ipad2" + 0.012*"iphone" + 0.009*"line" + 0.009*"get" + 0.007*"app" + 0.006*"downtown" + 0.006*"temporary" + 0.006*"opening"')]

In [43]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Two Clusters, Neutral data model

In [44]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_all[df_all['target']==1]['text']]
print(len(doc_term_matrix))

lda = gensim.models.ldamodel.LdaModel

num_topics=2
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

5372


In [45]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.046*"apple" + 0.042*"ipad" + 0.019*"iphone" + 0.010*"line" + 0.009*"ipad2" + 0.008*"android" + 0.006*"temporary" + 0.006*"open" + 0.006*"opening" + 0.006*"get"'),
 (1,
  '0.064*"google" + 0.018*"social" + 0.013*"network" + 0.013*"iphone" + 0.010*"called" + 0.010*"free" + 0.009*"party" + 0.009*"app" + 0.009*"major" + 0.007*"possibly"')]

In [46]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [47]:
def tweet_with_word(series, word):
    '''
    Input: Series of text, word (str)
    Output: Count of the number of tweets with word inputed in them
    Prints: Tweets with word in them
    '''
    count = 0
    for row in series:
        if word in row:
            count += 1
            print(row)
    return count

In [52]:
tweet_with_word(df_all['text'], 'major')

['google', 'major', 'social', 'network', 'called', 'possibly', 'nfusion']
['google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network', 'called', 'possibly', 'creo', 'que', 'sers', 'el', 'primero']
['google', 'major', 'social', 'network', 'called', 'google']
['google', 'major', 'social', 'network', 'called', 'cc']
['google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network', 'really', 'dont', 'need', 'another', 'social', 'network']
['enough', 'already', 'google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network', 'called', 'possibly', 'updated']
['google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network', 'called', 'google']
['watch', 'fb', 'big', 'brother', 'google', 'major', 'social', 'network', 'called', 'possibly']
['google', 'major', 'social', 'network'

295