# Topic modelling, with manual filtering of words

In [1]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
from tqdm import tqdm_notebook as tqdm

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

In [2]:
import pickle

In [3]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'. `nlp` is called in the cells
# below to lemmatise the filter words and to tokenise each line of text.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Extra stop words: mark each of these entries as a stop word in the
# pipeline's vocabulary.
my_stop_words = [u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying']
for stopword in my_stop_words:
    nlp.vocab[stopword].is_stop = True

# Hand-picked words to drop from the corpus before topic modelling.
filtered = ['barbara','snowden','iowa','emanuel','goldman','bring','america','like','chrysler','jerry','voter','political','american','president','poll','independent','democratic','campaign','progressive','want','socialist','thing','think','vote','party', 'debate', 'primary', 'nomination', 'presidential', 'caucus', 'history', 'candidate','bernie','sanders','hillary','clinton','barack','obama','donald','trump','republican','democrat','republicans','democrats','romney']
# Store each word as the lemma of its first token, because the preprocessing
# loop compares against token lemmas rather than surface forms.
filtered = [nlp(w)[0].lemma_ for w in filtered]
print(filtered)

# Membership is tested once per token, so use a set instead of scanning the
# list (which also contains duplicates after lemmatisation).
# NOTE: this is a snapshot -- re-run this cell after editing `filtered`.
_filtered_lookup = frozenset(filtered)


def is_filtered(w):
    """Return True if ``str(w)`` is one of the lemmatised filter words."""
    return str(w) in _filtered_lookup

['barbara', 'snowden', 'iowa', 'emanuel', 'goldman', 'bring', 'america', 'like', 'chrysler', 'jerry', 'voter', 'political', 'american', 'president', 'poll', 'independent', 'democratic', 'campaign', 'progressive', 'want', 'socialist', 'thing', 'think', 'vote', 'party', 'debate', 'primary', 'nomination', 'presidential', 'caucus', 'history', 'candidate', 'bernie', 'sander', 'hillary', 'clinton', 'barack', 'obama', 'donald', 'trump', 'republican', 'democrat', 'republican', 'democrat', 'romney']


In [5]:
# Load the pickled text dump; it is indexed by party name in the cells below.
# The context manager closes the file handle, which the bare
# `pickle.load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code -- only load dumps you produced
# yourself or otherwise trust.
with open('text_dumps.pkl', 'rb') as f:
    text_parties = pickle.load(f)

In [6]:
# Combine both parties' text, Republican first.
# NOTE(review): the per-party loop in the next cell rebinds `text`, so this
# combined value looks unused -- confirm before relying on it.
republican_text = text_parties['republican']
democrat_text = text_parties['democrat']
text = republican_text + democrat_text


In [7]:
for party in ['democrat','republican']:
    # Lower-case the party's raw dump before tokenising.
    text = text_parties[party].lower()

    ### Pre-processing data!
    # The former whole-dump parse (`doc = nlp(text)`) and the bare
    # `text.split('\n')[:10]` expression were removed: neither result was
    # used, and parsing the entire dump as a single document is slow and is
    # rejected by spaCy once the text is longer than `nlp.max_length`.

    texts_nlp, skl_texts = [], []

    # Each line of the dump is treated as one document.
    for phrase in tqdm(text.split('\n')):
        # Keep lemmas of alphabetic, non-stop, non-numeric tokens that are
        # longer than two characters and not in the manual filter list.
        article_nlp = [
            w.lemma_
            for w in nlp(phrase)
            if w.text != '\n'
            and not w.is_stop
            and not w.is_punct
            and not w.like_num
            and w.is_alpha
            and len(w.lemma_) > 2
            and not is_filtered(w.lemma_)
        ]
        skl_texts.append(' '.join(article_nlp))  # space-joined form of the same tokens
        texts_nlp.append(article_nlp)

    # NOTE(review): `texts` is rebound on every pass, so after the loop it
    # holds only the last party's documents ('republican'); the cells that
    # follow read `texts`.
    texts = texts_nlp

    # Persist one token dump per party.
    with open('tokenized_dump_%s.pkl' %party,'wb') as f:
        pickle.dump(texts,f)

HBox(children=(IntProgress(value=0, max=1001), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1047), HTML(value='')))




In [96]:
# Fit a gensim Phrases model on the tokenised documents.
bigram = gensim.models.Phrases(texts)

# Pass every document through the phrase model; joined tokens such as
# "wall_street" show up in the topic listings further down.
# NOTE(review): `texts` is rebound in place, so re-running this cell feeds
# already-transformed documents back through a newly fitted phrase model.
texts = [bigram[tokens] for tokens in texts]

texts[20][:100]

# Build the token <-> id mapping, then encode each document with doc2bow.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

corpus[1][:10]

### LSI

LSI stands for Latent Semantic Indexing - it is a popular information retrieval method which works by decomposing the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

In [80]:
# Fit an LSI model with 10 topics on the bag-of-words corpus.
lsimodel = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)

In [86]:
# NOTE(review): the model above was built with num_topics=10, so presumably at
# most 10 topics come back even though 15 are requested -- confirm.
lsimodel.show_topics(num_topics=15)  # requests up to 15 topics (not 5)

[(0,
  '0.269*"people" + 0.218*"need" + 0.188*"support" + 0.136*"right" + 0.123*"good" + 0.121*"new" + 0.115*"election" + 0.115*"state" + 0.114*"high" + 0.113*"change"'),
 (1,
  '0.336*"people" + 0.178*"kid" + 0.162*"gay" + 0.159*"sex_marriage" + 0.141*"property" + 0.138*"movement" + 0.136*"gay_right" + 0.136*"gay_sex" + -0.133*"israel" + -0.130*"need"'),
 (2,
  '-0.613*"state" + -0.171*"split" + -0.146*"unemployment_rate" + -0.130*"national" + -0.128*"unemployment" + 0.127*"people" + -0.119*"average" + -0.102*"large" + -0.097*"unicameral" + -0.095*"tax"'),
 (3,
  '0.542*"state" + 0.171*"split" + 0.145*"unemployment_rate" + -0.134*"money" + -0.129*"year" + -0.121*"supporter" + -0.121*"favorability" + -0.121*"general_election" + 0.121*"unemployment" + -0.113*"tax"'),
 (4,
  '-0.220*"government" + -0.202*"right" + -0.200*"tax" + -0.154*"spouse" + -0.147*"federal" + -0.140*"work" + -0.114*"post" + 0.111*"general_election" + 0.111*"favorability" + -0.110*"country"'),
 (5,
  '0.334*"right" 

 '0.336*"people" + 0.178*"kid" + 0.162*"gay" + 0.159*"sex_marriage" + 0.141*"property" + 0.138*"movement" + 0.136*"gay_right" + 0.136*"gay_sex" + -0.133*"israel" + -0.130*"need"'),
 
 *"unemployment_rate" + -0.130*"national" + -0.128*"unemployment"
 
 tax
 
 5*"unemployment_rate" + -0.134*"money" 
 
  0.121*"unemployment" + -0.113*"tax"'),
  
  general_election
  
  homosexual
  
  "tax"
  
  282*"war" 
  
  " + 0.163*"war"

### HDP

HDP, the Hierarchical Dirichlet Process, is an unsupervised topic model which figures out the number of topics on its own.

In [101]:
# Fit an HDP model on the bag-of-words corpus. HDP training is stochastic, so
# the seed is fixed to make the topics reproducible on Restart & Run All.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)

In [102]:
# Display 10 of the HDP topics as (topic id, weighted word string) pairs.
hdpmodel.show_topics(num_topics=10)

[(0,
  '0.001*sandy + 0.001*negate + 0.001*shove + 0.001*share + 0.001*testimony + 0.001*need + 0.001*man + 0.001*upvot + 0.001*slice + 0.001*west + 0.001*running + 0.001*grandchild + 0.001*worship + 0.001*respect + 0.001*quinnipiac + 0.001*adviser + 0.001*past + 0.001*reference + 0.001*win + 0.001*fil'),
 (1,
  '0.002*people + 0.001*forfend + 0.001*wambach + 0.001*gay_right + 0.001*tucker + 0.001*reagan + 0.001*save + 0.001*shapiro + 0.001*ban + 0.001*regardless + 0.001*fdr + 0.001*cousin + 0.001*native + 0.001*institution + 0.001*post + 0.001*eachother + 0.001*flavor + 0.001*sexism + 0.001*relationship + 0.001*rule'),
 (2,
  '0.002*upset + 0.002*shop + 0.002*begin + 0.001*andrew + 0.001*bernard + 0.001*jeanne + 0.001*few + 0.001*art + 0.001*thin + 0.001*disability + 0.001*lapd + 0.001*explain + 0.001*journalist + 0.001*paycheck + 0.001*bialek + 0.001*stick + 0.001*away + 0.001*hobble + 0.001*absence + 0.001*shell'),
 (3,
  '0.002*missing + 0.002*community + 0.001*supporter + 0.001*pr

sexism
tax
pro_life


### LDA

LDA, or Latent Dirichlet Allocation, is arguably the most famous topic modelling algorithm out there. Here we create a simple topic model with 10 topics.

In [103]:
# Fit a 10-topic LDA model on the bag-of-words corpus. LDA training is
# stochastic, so the seed is fixed to make the topics reproducible on
# Restart & Run All.
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)

In [104]:
# Display the LDA topics using show_topics' default arguments.
ldamodel.show_topics()

[(0,
  '0.025*"support" + 0.023*"defeat" + 0.022*"predict" + 0.022*"whoop" + 0.022*"foreign" + 0.022*"tactic" + 0.022*"hopeful" + 0.019*"opponent" + 0.018*"belief" + 0.017*"lead"'),
 (1,
  '0.028*"fund" + 0.021*"dollar" + 0.020*"ideal" + 0.020*"sach" + 0.020*"hedge" + 0.020*"man" + 0.019*"dnc" + 0.019*"ban" + 0.018*"conservative" + 0.018*"rich"'),
 (2,
  '0.020*"key" + 0.016*"know" + 0.012*"need" + 0.011*"win" + 0.011*"run" + 0.011*"left" + 0.011*"national" + 0.010*"wall_street" + 0.010*"play" + 0.010*"chair"'),
 (3,
  '0.024*"point" + 0.022*"half" + 0.022*"awkward" + 0.022*"breach" + 0.022*"increasingly" + 0.022*"frontrunner" + 0.022*"husband" + 0.022*"prove" + 0.020*"few" + 0.015*"finally"'),
 (4,
  '0.029*"trade" + 0.022*"fire" + 0.017*"support" + 0.015*"plan" + 0.015*"deal" + 0.015*"release" + 0.015*"goal" + 0.015*"day" + 0.015*"policy" + 0.015*"cost"'),
 (5,
  '0.057*"wall_street" + 0.019*"stand" + 0.019*"run" + 0.019*"stupid" + 0.019*"joe_biden" + 0.019*"officially" + 0.019*"hall

tax
gun control
gun
war