In [1]:
import pandas as pd
import seaborn as sns
import gensim

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [67]:
test = pd.read_csv('../data/speeches.csv')

In [3]:
f = open('../data/State+of+the+Union+Addresses+1970-2016.txt')
lines = f.readlines()
bigline = " ".join(lines)
stars = bigline.split('***')
splits = [s.split('\r\n') for s in stars[1:]]
tups = [(s[2], s[3], s[4], "".join(s[5:])) for s in splits]
speech_df = pd.DataFrame(tups)

In [4]:
speech_df.columns = ['type', 'president', 'date', 'text']

In [5]:
# speech_df.to_csv('../data/speeches.csv', index=False)

### TF-IDF w/ Scikits

In [6]:
import nltk
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [7]:
speech_df['text'] = speech_df.text.apply(lambda s: s.lower())
speech_df['text'] = speech_df.text.apply(lambda s: s.translate(None, string.punctuation))
speech_df['date'] = speech_df.date.apply(pd.to_datetime)

In [8]:
final_df = speech_df.loc[speech_df.date > '1/1/1970'].reset_index()

In [73]:
final_df.to_csv('../data/modern_speeches.csv', index=False)

In [62]:
tfidf = TfidfVectorizer(stop_words='english', min_df=2)
tfidf_tfs = tfidf.fit_transform(final_df.text.as_matrix())

In [63]:
tfidf_df = pd.DataFrame(tfidf_tfs.toarray(), columns=tfidf.get_feature_names())

In [64]:
# tfidf_df.america.index = tfidf_df.date

In [66]:
# tfidf_df.america.plot(figsize=(10,4))

In [12]:
pres = list(final_df.president.unique())

In [13]:
parties = ['r', 'r', 'd', 'r', 'r', 'd', 'r', 'd']

In [14]:
pres_party = pd.DataFrame({'president': pres[:-2] + [' Barack Obama'], 'party': parties})

In [15]:
final_party = pd.merge(final_df, pres_party, how='left', on='president')

### Term Frequencies

In [16]:
tf_vectorizer = CountVectorizer(stop_words='english', min_df=2)
tfs = tf_vectorizer.fit_transform(final_df.text.as_matrix())

In [17]:
tf_df = pd.DataFrame(tfs.toarray(), columns=tf_vectorizer.get_feature_names())

In [18]:
tf_df = pd.DataFrame(tfs.toarray(), columns=tf_vectorizer.get_feature_names())

In [19]:
top_100 = tf_df.sum().order(ascending=False).index[:100]

In [50]:
# top_100

### Topic Modeling

In [21]:
from gensim import models, corpora
from nltk.corpus import stopwords

In [22]:
# corpus = gensim.matutils.Dense2Corpus(final_df.text.as_matrix())

In [23]:
docs = final_df.text.apply(lambda s: s.split(' '))

In [38]:
docs = docs.apply(lambda doc: [w.strip() for w in doc 
                               if w.strip() not in stopwords.words('english') + ['applause', 'laughter']])

In [39]:
dictionary = corpora.Dictionary(docs)

In [40]:
# stops = set("for a of the to and in is".split(" "))

In [41]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [42]:
tfidf = gensim.models.TfidfModel(corpus=corpus)

In [43]:
corpus_tfidf = tfidf[corpus]

In [74]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [75]:
lsi.print_topics(5)

[(0,
  u'0.079*"thats" + 0.069*"iraq" + 0.064*"welfare" + 0.064*"college" + 0.056*"oil" + 0.054*"lets" + 0.053*"terror" + 0.052*"iraqi" + 0.052*"terrorists" + 0.051*"weve"'),
 (1,
  u'-0.119*"shall" + -0.104*"soviet" + 0.096*"iraq" + 0.080*"thats" + 0.077*"iraqi" + 0.075*"terrorists" + -0.073*"1980" + 0.072*"al" + -0.071*"1974" + 0.071*"terror"'),
 (2,
  u'-0.157*"iraqi" + -0.156*"iraq" + -0.152*"qaeda" + -0.147*"terror" + -0.117*"terrorists" + -0.116*"iraqis" + 0.101*"thats" + -0.096*"terrorist" + -0.095*"saddam" + -0.084*"11th"'),
 (3,
  u'0.117*"welfare" + 0.116*"21st" + -0.115*"thats" + 0.111*"ought" + -0.094*"oil" + -0.072*"manufacturing" + 0.071*"bosnia" + 0.070*"millennium" + 0.070*"tobacco" + -0.067*"cory"'),
 (4,
  u'-0.201*"shall" + 0.123*"soviet" + -0.108*"92d" + -0.092*"seventies" + -0.088*"property" + -0.085*"sixties" + -0.082*"localities" + -0.067*"colleagues" + -0.063*"parks" + -0.062*"session"')]

In [47]:
lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)



In [48]:
lda.print_topics(10)

[(0,
  u'0.000*1974 + 0.000*canal + 0.000*shall + 0.000*thats + 0.000*inflation + 0.000*space + 0.000*gun + 0.000*brandon + 0.000*companies + 0.000*energy'),
 (1,
  u'0.000*thats + 0.000*iraq + 0.000*college + 0.000*rebekah + 0.000*banks + 0.000*ought + 0.000*lending + 0.000*childcare + 0.000*21st + 0.000*lobbyists'),
 (2,
  u'0.000*gun + 0.000*1973 + 0.000*college + 0.000*thats + 0.000*lets + 0.000*kids + 0.000*internet + 0.000*manufacturing + 0.000*hightech + 0.000*idea'),
 (3,
  u'0.000*terror + 0.000*regimes + 0.000*shall + 0.000*iraq + 0.000*11th + 0.000*terrorist + 0.000*homeland + 0.000*camps + 0.000*terrorists + 0.000*saddam'),
 (4,
  u'0.000*hussein + 0.000*saddam + 0.000*retreat + 0.000*aids + 0.000*iraqi + 0.000*inspectors + 0.000*disarm + 0.000*terrorist + 0.000*iraqis + 0.000*alqaida'),
 (5,
  u'0.000*cory + 0.000*1975 + 0.000*iraq + 0.000*qaeda + 0.000*iraqi + 0.000*al + 0.000*extremists + 0.000*oil + 0.000*direction + 0.000*iraqis'),
 (6,
  u'0.000*salt + 0.000*welfare +

In [49]:
# lda = models.LdaModel(corpus=tfidf, id2word=dictionary, num_topics=4, passes=20)

In [143]:
# lda.print_topics()

### Similarities

In [52]:
from gensim import similarities

In [53]:
index = similarities.MatrixSimilarity(lsi[corpus])



### Analysis Ideas
1. Topics / important terms over time
2. Topics / important terms by party
3. How parties are similar in terms speech type / topics
4. Speech similarities