In [23]:
# Read in the document-term matrix
import pandas as pd
import pickle

# Load the pickled document-term matrix; per the preview below, each row is a
# "<president>_<year>" speech and each column is a term count.
# NOTE(review): read_pickle unpickles the file - only load pickles from a trusted source.
data = pd.read_pickle('dtm_stop.pkl')
data.head()

Unnamed: 0,aaron,abandon,abandonment,abbas,abdication,abduction,abhorrent,abide,ability,abilityour,...,zarfos,zarqawi,zeitchik,zero,zeroemission,zerooverall,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [24]:
# import gensim for LDA modeling
from gensim import matutils, models
# import scipy to process sparse matrices
import scipy.sparse

In [25]:
# transpose the document-term matrix into a term-document matrix
# (terms become rows, speeches become columns - see the preview below)
tdm = data.transpose()
tdm.head()

Unnamed: 0,Barack Obama_2010,Barack Obama_2011,Barack Obama_2012,Barack Obama_2013,Barack Obama_2014,Barack Obama_2015,Barack Obama_2016,Bill Clinton_1994,Bill Clinton_1995,Bill Clinton_1996,...,Richard M. Nixon_1971,Richard M. Nixon_1972,Richard M. Nixon_1974,Ronald Reagan_1982,Ronald Reagan_1983,Ronald Reagan_1984,Ronald Reagan_1985,Ronald Reagan_1986,Ronald Reagan_1987,Ronald Reagan_1988
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,0,1,0,0,0,1,0,1,0,3,...,0,0,1,0,0,1,1,2,1,0
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
abbas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdication,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# convert the term-document matrix to a scipy sparse (CSR) matrix, then wrap it as a gensim corpus
# NOTE(review): Sparse2Corpus presumably treats each column as one document by default,
# which is why the matrix was transposed first - confirm against the gensim docs.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [27]:
# Gensim also requires a dictionary of all terms and their respective location
# in the term-document matrix (index -> term).
# NOTE(review): pickle.load can execute arbitrary code; only load a pickle you created yourself.
with open("cv_stop.pkl", "rb") as cv_file:  # context manager closes the file handle
    cv = pickle.load(cv_file)
# cv.vocabulary_ is iterated as (term, index) pairs; invert it so the index is the key
id2word = {term_index: term for term, term_index in cv.vocabulary_.items()}

In [28]:
# fit an LDA model with 2 topics, making 10 passes over the corpus, and print the topic composition
# NOTE(review): no random_state is passed, so re-running this cell may produce different topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.004*"business" + 0.004*"citizen" + 0.003*"pass" + 0.003*"challenge" + 0.003*"health" + 0.003*"day" + 0.003*"strong" + 0.003*"states" + 0.003*"act" + 0.003*"change"'),
 (1,
  '0.004*"billion" + 0.003*"opportunity" + 0.003*"propose" + 0.003*"health" + 0.003*"begin" + 0.003*"believe" + 0.003*"percent" + 0.003*"act" + 0.003*"care" + 0.003*"strong"')]

In [29]:
# LDA for num_topics = 3 (10 passes; no fixed random_state, so topics may differ on re-run)
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.004*"business" + 0.004*"pass" + 0.004*"health" + 0.003*"care" + 0.003*"strong" + 0.003*"day" + 0.003*"citizen" + 0.003*"challenge" + 0.003*"change" + 0.003*"worker"'),
 (1,
  '0.005*"billion" + 0.003*"percent" + 0.003*"union" + 0.003*"act" + 0.003*"defense" + 0.003*"problem" + 0.003*"believe" + 0.003*"policy" + 0.003*"rate" + 0.003*"growth"'),
 (2,
  '0.004*"care" + 0.004*"health" + 0.003*"begin" + 0.003*"mean" + 0.003*"business" + 0.003*"change" + 0.003*"crime" + 0.003*"law" + 0.002*"create" + 0.002*"place"')]

In [30]:
# LDA for num_topics = 4 (10 passes; no fixed random_state, so topics may differ on re-run)
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.004*"health" + 0.004*"care" + 0.004*"pass" + 0.003*"day" + 0.003*"bring" + 0.003*"challenge" + 0.003*"change" + 0.003*"business" + 0.003*"strong" + 0.003*"pay"'),
 (1,
  '0.003*"billion" + 0.003*"shall" + 0.003*"growth" + 0.003*"area" + 0.003*"act" + 0.003*"price" + 0.002*"public" + 0.002*"foreign" + 0.002*"policy" + 0.002*"percent"'),
 (2,
  '0.004*"business" + 0.004*"strong" + 0.004*"change" + 0.003*"opportunity" + 0.003*"citizen" + 0.003*"power" + 0.003*"worker" + 0.003*"health" + 0.003*"states" + 0.003*"day"'),
 (3,
  '0.007*"billion" + 0.004*"vietnam" + 0.004*"shall" + 0.004*"believe" + 0.004*"act" + 0.004*"recommend" + 0.004*"think" + 0.004*"percent" + 0.003*"progress" + 0.003*"law"')]

<b>The LDA model still needs some adjustment, but a few patterns can already be gauged from its output, particularly when the number of topics is 3. However, the topics seem a bit messy, so we will need to explore further.</b>

In [31]:
# In order to increase the accuracy of the topic models, we will pull out all nouns instead this time.
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Return the nouns found in ``text`` as a single space-separated string.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    a token is kept when its tag starts with 'NN'. Token order is preserved.
    '''
    kept_tokens = []
    for token, tag in pos_tag(word_tokenize(text)):
        # tags beginning with 'NN' are the ones treated as nouns here
        if tag.startswith('NN'):
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [32]:
# Load the cleaned transcripts and reduce each one to its nouns
data_clean = pd.read_pickle('data_clean.pkl')
data_nouns = data_clean['transcript'].apply(nouns).to_frame()
data_nouns.iloc[:4]

Unnamed: 0,transcript
Barack Obama_2010,madam speaker vice president member congress a...
Barack Obama_2011,mr speaker vice president member congress amer...
Barack Obama_2012,mr speaker vice president member congress amer...
Barack Obama_2013,mr speaker vice president member congress citi...


In [33]:
# create a new document term matrix with nouns only
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# add stop words that will be removed based on previous models
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said','applause','year','america',
                 'world','nation','americans','country','month','work','congress','government']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# recreate the entirety of the document term matrix
# The stop words are passed as a list because recent scikit-learn releases
# validate this parameter and do not accept a frozenset.
cvn = CountVectorizer(stop_words=list(stop_words))
data_cvn = cvn.fit_transform(data_nouns.transcript)
# Column labels are the vocabulary terms ordered by their feature index. This is
# what get_feature_names() used to return, but it also works on scikit-learn
# versions where that method has been removed.
data_dtmn = pd.DataFrame(data_cvn.toarray(),
                         columns=sorted(cvn.vocabulary_, key=cvn.vocabulary_.get))
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaron,abandon,abandonment,abdication,abduction,abhorrent,abide,ability,ablebodie,abolition,...,yugoslavia,zarfos,zarqawi,zeitchik,zero,zeroemission,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Barack Obama_2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1994,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1995,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,1,0,0
Bill Clinton_1996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Build the gensim corpus from the transposed (term x document) noun counts
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.T))
# Build the index -> term lookup by inverting the vectorizer's vocabulary
id2wordn = {term_index: term for term, term_index in cvn.vocabulary_.items()}

In [35]:
# 2 topics, 10 passes, on the nouns-only corpus
# (no fixed random_state, so topics may differ on re-run)
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.008*"help" + 0.007*"life" + 0.006*"man" + 0.005*"child" + 0.005*"security" + 0.005*"war" + 0.005*"state" + 0.005*"economy" + 0.005*"health" + 0.005*"way"'),
 (1,
  '0.010*"job" + 0.008*"help" + 0.008*"program" + 0.007*"family" + 0.006*"tax" + 0.006*"peace" + 0.006*"state" + 0.006*"economy" + 0.006*"budget" + 0.006*"security"')]

In [36]:
# 3 topics, 10 passes, on the nouns-only corpus
# (no fixed random_state, so topics may differ on re-run)
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.010*"job" + 0.009*"help" + 0.007*"program" + 0.007*"family" + 0.006*"security" + 0.006*"tax" + 0.006*"economy" + 0.006*"state" + 0.006*"peace" + 0.006*"child"'),
 (1,
  '0.005*"tax" + 0.005*"states" + 0.004*"defense" + 0.004*"drug" + 0.004*"job" + 0.004*"man" + 0.004*"life" + 0.004*"freedom" + 0.004*"home" + 0.003*"end"'),
 (2,
  '0.007*"help" + 0.007*"man" + 0.007*"program" + 0.006*"state" + 0.005*"increase" + 0.005*"life" + 0.004*"effort" + 0.004*"war" + 0.004*"goal" + 0.004*"policy"')]

In [37]:
# 4 topics, 10 passes, on the nouns-only corpus
# (no fixed random_state, so topics may differ on re-run)
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"help" + 0.009*"family" + 0.008*"child" + 0.008*"job" + 0.007*"community" + 0.007*"school" + 0.007*"program" + 0.007*"peace" + 0.006*"state" + 0.006*"budget"'),
 (1,
  '0.009*"program" + 0.007*"increase" + 0.006*"tax" + 0.006*"family" + 0.006*"state" + 0.005*"act" + 0.005*"job" + 0.005*"pass" + 0.005*"law" + 0.005*"city"'),
 (2,
  '0.009*"program" + 0.008*"help" + 0.007*"peace" + 0.006*"war" + 0.006*"man" + 0.006*"security" + 0.006*"life" + 0.006*"increase" + 0.006*"tax" + 0.006*"president"'),
 (3,
  '0.016*"job" + 0.008*"business" + 0.008*"economy" + 0.008*"help" + 0.007*"family" + 0.007*"security" + 0.006*"tax" + 0.006*"change" + 0.006*"energy" + 0.005*"way"')]

<b>The LDA with nouns only shows that, with 3 topics, the talking points revolve around: Family, Economy, State.</b>

In [38]:
# Further clean up the document term matrix, this time including adjectives as well as nouns
def nouns_adj(text):
    '''Return the nouns and adjectives found in ``text`` as one space-separated string.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    a token is kept when the first two characters of its tag are 'NN' or 'JJ'.
    Token order is preserved.
    '''
    wanted_prefixes = ('NN', 'JJ')
    kept_tokens = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag[:2] in wanted_prefixes:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [39]:
# Reduce each cleaned transcript to its nouns and adjectives
data_nouns_adj = data_clean['transcript'].apply(nouns_adj).to_frame()

In [40]:
# Create a new document-term matrix with only nouns and adjectives; max_df=.8
# removes common words (terms appearing in more than 80% of the documents).
# The stop words are passed as a list because recent scikit-learn releases
# validate this parameter and do not accept a frozenset.
cvna = CountVectorizer(stop_words=list(stop_words), max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
# Column labels are the vocabulary terms ordered by their feature index. This is
# what get_feature_names() used to return, but it also works on scikit-learn
# versions where that method has been removed.
data_dtmna = pd.DataFrame(data_cvna.toarray(),
                          columns=sorted(cvna.vocabulary_, key=cvna.vocabulary_.get))
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaron,abandon,abandonment,abdication,abduction,abhorrent,abide,ability,abilityour,abject,...,zarfos,zarqawi,zeitchik,zero,zeroemission,zerooverall,zimbabwe,zion,zone,zoom
Barack Obama_2010,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2011,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2013,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama_2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Barack Obama_2016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1994,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Bill Clinton_1995,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,1,0,0
Bill Clinton_1996,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Build the gensim corpus from the transposed (term x document) noun+adjective counts
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.T))

# Build the index -> term lookup by inverting the vectorizer's vocabulary
id2wordna = {term_index: term for term, term_index in cvna.vocabulary_.items()}

In [42]:
# 2 topics, 10 passes (the code passes passes=10, not 100)
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.003*"terrorist" + 0.003*"thing" + 0.003*"thank" + 0.003*"college" + 0.003*"parent" + 0.003*"responsibility" + 0.003*"lead" + 0.003*"drug" + 0.003*"safe" + 0.003*"crime"'),
 (1,
  '0.004*"soviet" + 0.003*"inflation" + 0.003*"major" + 0.003*"price" + 0.003*"spending" + 0.003*"foreign" + 0.003*"administration" + 0.003*"crime" + 0.003*"propose" + 0.003*"control"')]

In [43]:
# 3 topics, 10 passes (the code passes passes=10, not 100)
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.004*"soviet" + 0.004*"inflation" + 0.003*"foreign" + 0.003*"price" + 0.003*"propose" + 0.003*"major" + 0.002*"area" + 0.002*"seek" + 0.002*"present" + 0.002*"income"'),
 (1,
  '0.004*"thing" + 0.004*"terrorist" + 0.003*"college" + 0.003*"parent" + 0.003*"lead" + 0.003*"sure" + 0.003*"student" + 0.003*"responsibility" + 0.003*"drug" + 0.003*"company"'),
 (2,
  '0.004*"crime" + 0.003*"responsibility" + 0.003*"local" + 0.003*"report" + 0.003*"thank" + 0.003*"propose" + 0.003*"fund" + 0.002*"measure" + 0.002*"resource" + 0.002*"price"')]

In [44]:
# 4 topics, 10 passes (the code passes passes=10, not 100)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.003*"seek" + 0.003*"terrorist" + 0.003*"benefit" + 0.003*"iraq" + 0.003*"terror" + 0.002*"cause" + 0.002*"society" + 0.002*"poverty" + 0.002*"vote" + 0.002*"insurance"'),
 (1,
  '0.003*"drug" + 0.003*"thing" + 0.003*"administration" + 0.003*"border" + 0.002*"welfare" + 0.002*"common" + 0.002*"safe" + 0.002*"second" + 0.002*"farm" + 0.002*"crime"'),
 (2,
  '0.004*"crime" + 0.003*"soviet" + 0.003*"propose" + 0.003*"responsibility" + 0.003*"inflation" + 0.003*"price" + 0.003*"local" + 0.003*"thank" + 0.003*"major" + 0.003*"resource"'),
 (3,
  '0.005*"terrorist" + 0.004*"thing" + 0.004*"lead" + 0.003*"iraq" + 0.003*"college" + 0.003*"student" + 0.003*"company" + 0.003*"democracy" + 0.003*"idea" + 0.003*"value"')]

<b>If we were to use the noun-adjective LDA model, we'd pick 2 topics, which cover two issues: state-level challenges and community-level challenges.</b>

In [45]:
# Our final LDA model (for now): 2 topics, 100 passes, fixed random_state
# Q: why does the LDA generate different models every time?
# A (NOTE(review): per the gensim docs - confirm): training is randomly initialised,
#   so runs differ unless random_state is fixed, as is done in this cell.
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=100, random_state = 3)
ldana.print_topics()

[(0,
  '0.003*"soviet" + 0.003*"crime" + 0.003*"price" + 0.003*"inflation" + 0.003*"major" + 0.003*"propose" + 0.003*"control" + 0.002*"local" + 0.002*"measure" + 0.002*"foreign"'),
 (1,
  '0.005*"thing" + 0.004*"parent" + 0.004*"college" + 0.004*"thank" + 0.003*"student" + 0.003*"lead" + 0.003*"responsibility" + 0.003*"company" + 0.003*"sure" + 0.003*"big"')]

In [46]:
# how to display the visualization of topics in this scenario?
# topics within each transcript: indexing the trained model with the corpus gives,
# for each document, a list of (topic_id, weight) pairs (see the printed output below)
corpus_transformed = ldana[corpusna]

In [83]:
# Print each document's position in the corpus alongside its topic mixture
for doc_position, topic_mixture in enumerate(corpus_transformed):
    print(doc_position, topic_mixture)

0 [(1, 0.9992198)]
1 [(1, 0.9994041)]
2 [(1, 0.99945176)]
3 [(1, 0.99892443)]
4 [(1, 0.99944013)]
5 [(1, 0.99944276)]
6 [(1, 0.9988806)]
7 [(1, 0.9992971)]
8 [(1, 0.9994493)]
9 [(1, 0.99926543)]
10 [(1, 0.99936676)]
11 [(1, 0.9983205)]
12 [(0, 0.029365992), (1, 0.970634)]
13 [(1, 0.9994332)]
14 [(1, 0.9990692)]
15 [(1, 0.9993366)]
16 [(0, 0.33513013), (1, 0.66486984)]
17 [(0, 0.9153077), (1, 0.084692314)]
18 [(0, 0.44029874), (1, 0.55970126)]
19 [(0, 0.728248), (1, 0.27175203)]
20 [(0, 0.96019024), (1, 0.03980976)]
21 [(0, 0.6757215), (1, 0.32427844)]
22 [(0, 0.74724275), (1, 0.25275728)]
23 [(0, 0.7744611), (1, 0.22553891)]
24 [(0, 0.80660915), (1, 0.1933909)]
25 [(0, 0.65604967), (1, 0.34395036)]
26 [(0, 0.9992937)]
27 [(0, 0.99663)]
28 [(0, 0.9993411)]
29 [(0, 0.99900657)]
30 [(0, 0.9988597)]
31 [(0, 0.9990139)]
32 [(0, 0.9994936)]
33 [(0, 0.999529)]
34 [(0, 0.9994276)]
35 [(0, 0.99853164)]
36 [(0, 0.99923044)]
37 [(0, 0.9994209)]
38 [(0, 0.9997581)]
39 [(0, 0.99965215)]
40 [(0, 0.9

In [59]:
# List the row position of every speech so the positions printed for the topic
# mixtures can be mapped back to a "<president>_<year>" label. The labels are
# read from data_dtmna, the matrix the noun+adjective corpus was built from,
# rather than from the nouns-only matrix.
for position, speech_label in enumerate(data_dtmna.index):
    print(position, speech_label)

0 Barack Obama_2010
1 Barack Obama_2011
2 Barack Obama_2012
3 Barack Obama_2013
4 Barack Obama_2014
5 Barack Obama_2015
6 Barack Obama_2016
7 Bill Clinton_1994
8 Bill Clinton_1995
9 Bill Clinton_1996
10 Bill Clinton_1997
11 Bill Clinton_1998
12 Bill Clinton_1999
13 Bill Clinton_2000
14 Donald Trump_2018
15 Donald Trump_2019
16 George H.W. Bush_1990
17 George H.W. Bush_1991
18 George H.W. Bush_1992
19 George W. Bush_2002
20 George W. Bush_2003
21 George W. Bush_2004
22 George W. Bush_2005
23 George W. Bush_2006
24 George W. Bush_2007
25 George W. Bush_2008
26 Gerald Ford_1975
27 Gerald Ford_1976
28 Gerald Ford_1977
29 Jimmy Carter_1978
30 Jimmy Carter_1979
31 Jimmy Carter_1980
32 John F. Kennedy_1961
33 John F. Kennedy_1962
34 John F. Kennedy_1963
35 Lyndon B. Johnson_1964
36 Lyndon B. Johnson_1965
37 Lyndon B. Johnson_1966
38 Lyndon B. Johnson_1967
39 Lyndon B. Johnson_1968
40 Lyndon B. Johnson_1969
41 Richard M. Nixon_1970
42 Richard M. Nixon_1971
43 Richard M. Nixon_1972
44 Richard M