In [1]:
# Read in the document-term matrix
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data.head()

Unnamed: 0,aaron,abandon,abandonment,abbas,abdication,abduction,abhorrent,abide,ability,abilityour,...,zarfos,zarqawi,zeitchik,zero,zeroemission,zerooverall,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [2]:
# import gensim for LDA modeling
from gensim import matutils, models
# import scipy to process spare matrices
import scipy.sparse

In [3]:
# transpose the term document matrix
tdm = data.transpose()
tdm.head()

Unnamed: 0,Barack Obama_2010,Barack Obama_2011,Barack Obama_2012,Barack Obama_2013,Barack Obama_2014,Barack Obama_2015,Barack Obama_2016,Bill Clinton_1994,Bill Clinton_1995,Bill Clinton_1996,...,Richard M. Nixon_1971,Richard M. Nixon_1972,Richard M. Nixon_1974,Ronald Reagan_1982,Ronald Reagan_1983,Ronald Reagan_1984,Ronald Reagan_1985,Ronald Reagan_1986,Ronald Reagan_1987,Ronald Reagan_1988
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,0,1,0,0,0,1,0,1,0,3,...,0,0,1,0,0,1,1,2,1,0
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
abbas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdication,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# convert term document matrix to sparse matrix, then gensim corpus(part1)
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [6]:
# for the lda model specify 2 topics and print out topic composition, do it 10 times
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.004*"business" + 0.004*"pass" + 0.003*"care" + 0.003*"health" + 0.003*"act" + 0.003*"change" + 0.003*"citizen" + 0.003*"worker" + 0.003*"day" + 0.003*"strong"'),
 (1,
  '0.004*"billion" + 0.003*"union" + 0.003*"percent" + 0.003*"rate" + 0.003*"growth" + 0.003*"provide" + 0.003*"strong" + 0.003*"problem" + 0.003*"begin" + 0.003*"defense"')]

In [7]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.005*"business" + 0.005*"care" + 0.004*"health" + 0.004*"change" + 0.004*"pass" + 0.004*"pay" + 0.004*"challenge" + 0.004*"century" + 0.003*"strong" + 0.003*"create"'),
 (1,
  '0.003*"propose" + 0.003*"strong" + 0.003*"bring" + 0.003*"health" + 0.003*"day" + 0.003*"goal" + 0.003*"power" + 0.003*"defense" + 0.003*"begin" + 0.003*"provide"'),
 (2,
  '0.004*"billion" + 0.003*"act" + 0.003*"states" + 0.003*"law" + 0.003*"percent" + 0.003*"union" + 0.003*"problem" + 0.003*"believe" + 0.003*"policy" + 0.003*"power"')]

In [8]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.004*"union" + 0.004*"defense" + 0.004*"problem" + 0.003*"soviet" + 0.003*"growth" + 0.003*"states" + 0.003*"inflation" + 0.003*"policy" + 0.003*"strong" + 0.003*"provide"'),
 (1,
  '0.005*"health" + 0.005*"care" + 0.004*"crime" + 0.004*"challenge" + 0.004*"pass" + 0.003*"think" + 0.003*"welfare" + 0.003*"percent" + 0.003*"change" + 0.003*"pay"'),
 (2,
  '0.004*"act" + 0.004*"business" + 0.003*"pass" + 0.003*"health" + 0.003*"law" + 0.003*"strong" + 0.003*"change" + 0.003*"care" + 0.003*"day" + 0.003*"billion"'),
 (3,
  '0.000*"change" + 0.000*"opportunity" + 0.000*"day" + 0.000*"pass" + 0.000*"billion" + 0.000*"strong" + 0.000*"defense" + 0.000*"health" + 0.000*"power" + 0.000*"problem"')]

<b>The LDA does need certain adjustments, however, there are some patterns that can be gauged from the model output. We can see that if number of topics is 3. However, the topics seem a bit messy, we will need to explore more.</b>

In [9]:
# In order to increase the accuracy of the topic models, we will pull out all nouns instead this time.
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [10]:
# read in the clean data and apply the nouns function 
data_clean = pd.read_pickle('data_clean.pkl')
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns[:4]

Unnamed: 0,transcript
Barack Obama_2010,madam speaker vice president member congress a...
Barack Obama_2011,mr speaker vice president member congress amer...
Barack Obama_2012,mr speaker vice president member congress amer...
Barack Obama_2013,mr speaker vice president member congress citi...


In [11]:
# create a new document term matrix with nouns only
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# add stop words that will be removed based on previous models
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said','applause','year','america',
                 'world','nation','americans','country','month','work','congress','government']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# recreate the entirety of the document term matrix
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaron,abandon,abandonment,abdication,abduction,abhorrent,abide,ability,ablebodie,abolition,...,yugoslavia,zarfos,zarqawi,zeitchik,zero,zeroemission,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Barack Obama_2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1994,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1995,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,1,0,0
Bill Clinton_1996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))
# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [13]:
# 2 topics 10 passes
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"help" + 0.008*"child" + 0.007*"security" + 0.007*"family" + 0.006*"school" + 0.006*"life" + 0.006*"economy" + 0.006*"job" + 0.006*"community" + 0.005*"support"'),
 (1,
  '0.010*"job" + 0.009*"program" + 0.007*"help" + 0.007*"state" + 0.006*"tax" + 0.006*"family" + 0.006*"peace" + 0.005*"home" + 0.005*"president" + 0.005*"economy"')]

In [14]:
# 3 topics 10 passes
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"help" + 0.009*"job" + 0.008*"family" + 0.007*"child" + 0.006*"security" + 0.006*"way" + 0.006*"life" + 0.006*"tax" + 0.006*"economy" + 0.006*"program"'),
 (1,
  '0.009*"peace" + 0.007*"program" + 0.006*"states" + 0.006*"job" + 0.006*"president" + 0.006*"effort" + 0.005*"help" + 0.005*"union" + 0.005*"war" + 0.005*"state"'),
 (2,
  '0.013*"program" + 0.009*"tax" + 0.009*"job" + 0.008*"state" + 0.008*"energy" + 0.006*"increase" + 0.006*"economy" + 0.006*"help" + 0.005*"growth" + 0.005*"budget"')]

In [15]:
# 4 topics 10 passes
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.005*"home" + 0.004*"session" + 0.004*"freedom" + 0.004*"power" + 0.003*"war" + 0.003*"help" + 0.003*"budget" + 0.003*"investment" + 0.003*"program" + 0.003*"aggression"'),
 (1,
  '0.009*"help" + 0.008*"program" + 0.007*"job" + 0.007*"peace" + 0.007*"child" + 0.007*"family" + 0.007*"state" + 0.006*"budget" + 0.006*"way" + 0.006*"security"'),
 (2,
  '0.011*"job" + 0.008*"help" + 0.007*"family" + 0.007*"program" + 0.007*"tax" + 0.006*"home" + 0.006*"business" + 0.006*"state" + 0.005*"war" + 0.005*"increase"'),
 (3,
  '0.010*"job" + 0.007*"security" + 0.006*"economy" + 0.006*"help" + 0.006*"tax" + 0.005*"business" + 0.005*"freedom" + 0.005*"school" + 0.005*"home" + 0.005*"way"')]

<b>The LDA with nouns only shows that under 3 topics, the talking points revolve around: Family, Economy, State</b>

In [16]:
# Furthur clean up the document term matrix, this time including adjectives as well as nouns
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [17]:
# Apply the noun-adjective function
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))

In [18]:
# Create a new dt matrix with only nouns and adjectives, remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaron,abandon,abandonment,abdication,abduction,abhorrent,abide,ability,abilityour,abject,...,zarfos,zarqawi,zeitchik,zero,zeroemission,zerooverall,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Barack Obama_2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1994,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1995,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,1,0,0
Bill Clinton_1996,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [20]:
# 2 topics 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.004*"thing" + 0.003*"terrorist" + 0.003*"thank" + 0.003*"lead" + 0.003*"responsibility" + 0.003*"parent" + 0.003*"college" + 0.003*"drug" + 0.003*"democracy" + 0.003*"safe"'),
 (1,
  '0.003*"crime" + 0.003*"local" + 0.003*"inflation" + 0.003*"price" + 0.003*"resource" + 0.003*"administration" + 0.003*"control" + 0.003*"propose" + 0.003*"major" + 0.003*"soviet"')]

In [21]:
# 3 topics 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.004*"soviet" + 0.004*"price" + 0.004*"inflation" + 0.003*"crime" + 0.003*"foreign" + 0.003*"local" + 0.003*"common" + 0.003*"major" + 0.003*"resource" + 0.003*"propose"'),
 (1,
  '0.004*"terrorist" + 0.003*"iraq" + 0.003*"thank" + 0.003*"drug" + 0.003*"terror" + 0.003*"lead" + 0.003*"weapon" + 0.002*"crime" + 0.002*"measure" + 0.002*"cause"'),
 (2,
  '0.004*"thing" + 0.004*"college" + 0.004*"parent" + 0.003*"student" + 0.003*"value" + 0.003*"thank" + 0.003*"sure" + 0.003*"safe" + 0.003*"lead" + 0.003*"responsibility"')]

In [22]:
# 4 topics 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.003*"lead" + 0.003*"responsibility" + 0.003*"thing" + 0.003*"terrorist" + 0.003*"investment" + 0.003*"propose" + 0.002*"iraq" + 0.002*"big" + 0.002*"college" + 0.002*"hard"'),
 (1,
  '0.005*"terrorist" + 0.004*"insurance" + 0.004*"iraq" + 0.003*"enemy" + 0.003*"al" + 0.003*"qaeda" + 0.003*"oil" + 0.002*"iraqi" + 0.002*"fund" + 0.002*"standard"'),
 (2,
  '0.004*"crime" + 0.004*"welfare" + 0.003*"parent" + 0.003*"inflation" + 0.003*"responsibility" + 0.003*"spending" + 0.003*"thing" + 0.003*"value" + 0.003*"propose" + 0.002*"drug"'),
 (3,
  '0.004*"soviet" + 0.003*"thank" + 0.003*"crime" + 0.003*"try" + 0.003*"administration" + 0.003*"poverty" + 0.003*"vietnam" + 0.003*"price" + 0.003*"local" + 0.002*"terrorist"')]

<b> If we were to use the noun-adj LDA model, we'd pick 2 topics, which covers two issues:state level challenges, community level challenges </b>

In [29]:
# Our final LDA model (for now) 
# question, why does the LDA generate different models every time?
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=100, random_state = 3)
ldana.print_topics()

[(0,
  '0.003*"soviet" + 0.003*"crime" + 0.003*"price" + 0.003*"inflation" + 0.003*"major" + 0.003*"propose" + 0.003*"control" + 0.002*"local" + 0.002*"measure" + 0.002*"foreign"'),
 (1,
  '0.005*"thing" + 0.004*"parent" + 0.004*"college" + 0.004*"thank" + 0.003*"student" + 0.003*"lead" + 0.003*"responsibility" + 0.003*"company" + 0.003*"sure" + 0.003*"big"')]

In [310]:
# how to display the visualization of topics in this scenario?
# topics within each transcript
corpus_transformed = ldana[corpusna]

In [311]:
corpus_draft = pd.DataFrame(corpus_transformed)
corpus_draft.columns = ['V1','V2']

In [312]:
corpus_draft = corpus_draft.fillna('--')
corpus_draft.head()

Unnamed: 0,V1,V2
0,"(1, 0.9992398)",--
1,"(1, 0.99940425)",--
2,"(1, 0.9994518)",--
3,"(1, 0.9989334)",--
4,"(1, 0.9994402)",--


In [313]:
corpus_draft['First_Topic'], corpus_draft['First.Rep'] = zip(*corpus_draft.V1)
corpus_draft['Second_Topic'], corpus_draft['Second.Rep'] = zip(*corpus_draft.V2)

In [314]:
corpus_draft.drop(['V1', 'V2'], axis=1, inplace=True)
corpus_draft.head()

Unnamed: 0,First_Topic,First.Rep,Second_Topic,Second.Rep
0,1,0.99924,-,-
1,1,0.999404,-,-
2,1,0.999452,-,-
3,1,0.998933,-,-
4,1,0.99944,-,-


In [315]:
di = {0: "State", 1: "Community"}
corpus_draft['First_Topic'] = corpus_draft['First_Topic'].map(di)
corpus_draft['Second_Topic'] = corpus_draft['Second_Topic'].map(di)

In [320]:
import numpy as np
corpus_draft = corpus_draft.replace('-', np.nan)

In [321]:
cor_index = pd.DataFrame(data_dtmn.index)
cor_index.columns = ['President_SOUA']

In [322]:
cor_index.merge(corpus_draft,how='outer', left_index=True, right_index=True)

Unnamed: 0,President_SOUA,First_Topic,First.Rep,Second_Topic,Second.Rep
0,Barack Obama_2010,Community,0.99924,,
1,Barack Obama_2011,Community,0.999404,,
2,Barack Obama_2012,Community,0.999452,,
3,Barack Obama_2013,Community,0.998933,,
4,Barack Obama_2014,Community,0.99944,,
5,Barack Obama_2015,Community,0.999443,,
6,Barack Obama_2016,Community,0.99889,,
7,Bill Clinton_1994,Community,0.999295,,
8,Bill Clinton_1995,Community,0.999451,,
9,Bill Clinton_1996,Community,0.999266,,
