# Attribution to https://www.kaggle.com/ellaphamvn/state-of-the-union-topic-modelling

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
path = '/content/drive/MyDrive/VD6/archive/sotu/'
dirs = os.listdir(path)

df = pd.DataFrame(columns=['year', 'president', 'text', 'party'])


for i in range(len(dirs)):
    components = dirs[i].split('_')
    name = components[0]
    year = components[1].split('.')[0]
    df.loc[i,'year'] = year
    df.loc[i,'president'] = name   
    
    filename = os.path.join(path, dirs[i])
    text_file = open(filename, "r")
    
    lines = text_file.read()
    df.loc[i, 'text'] = lines.replace('\n', ' ')
    
df.year = df.year.astype(int) 
df.president = df.president.astype(str)
df.text = df.text.astype(str)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228 entries, 0 to 227
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   year       228 non-null    int64 
 1   president  228 non-null    object
 2   text       228 non-null    object
 3   party      0 non-null      object
dtypes: int64(1), object(3)
memory usage: 18.9+ KB


In [None]:
# need to distinuish between Theodore Roosevelt and Franklin D. Roosevelt
#thank you to kaggle user mjmurphy28 for this code
indices = df.query("president =='Roosevelt' & year <= 1909").index
df.loc[indices,'president'] = 'Theodore Roosevelt'

indices = df.query("president == 'Roosevelt'").index
df.loc[indices,'president'] = 'Franklin D. Roosevelt'

indices = df.query("president =='Bush' & year <= 1992").index
df.loc[indices,'president'] = 'George H. W. Bush'

indices = df.query("president == 'Bush'").index
df.loc[indices,'president'] = 'George W. Bush'

indices = df.query("president =='Johnson' & year <= 1869").index
df.loc[indices,'president'] = 'Andrew Johnson'

indices = df.query("president == 'Johnson'").index
df.loc[indices,'president'] = 'Lyndon B. Johnson'

indices = df.query("president =='Adams' & year <= 1801").index
df.loc[indices,'president'] = 'John Adams'

indices = df.query("president == 'Adams'").index
df.loc[indices,'president'] = 'John Quincy Adams'

indices = df.query("president =='Harrison' & year <= 1841").index
df.loc[indices,'president'] = 'William Henry Harrison'

indices = df.query("president == 'Harrison'").index
df.loc[indices,'president'] = 'Benjamin Harrison'

#thank you to kaggle user mjmurphy28 for this code

In [None]:
#add party name to each year
#thank you to kaggle user mjmurphy28 for this code
def pres_to_party(name):
    republican = ['Lincoln', 'Grant', 'Hayes', 'Garfield', 'Arthur', 
                  'Benjamin Harrison', 'McKinley', 'Theodore Roosevelt', 
                  'Taft', 'Harding', 'Coolidge', 'Hoover', 'Eisenhower', 
                  'Nixon', 'Ford', 'Reagan', 'George H. W. Bush', 
                  'George W. Bush', 'Trump']
    if name in republican:
        return 'Republican'
    
    democratic = ['Jackson', 'Buren', 'Polk', 'Pierce', 
                  'Buchanan', 'Cleveland', 'Wilson', 'Franklin D. Roosevelt', 
                  'Truman', 'Kennedy', 'Lyndon B. Johnson', 'Carter', 'Clinton', 'Obama']
    if name in democratic:
        return 'Democratic'
    
    whig = ['William Henry Harrison', 'Taylor', 'Fillmore']
    if name in whig:
        return 'Whig'
    
    national_union = ['Andrew Johnson']
    if name in national_union:
        return 'National Union'
    
    unaffiliated = ['Washington', 'Tyler']
    if name in unaffiliated:
        return 'Unaffiliated'
    
    federalist = ['John Adams']
    if name in federalist:
        return 'Federalist'
    
    democratic_republican = ['Jefferson', 'Madison', 'Monroe', 'John Quincy Adams']
    if name in democratic_republican:
        return 'Democratic-Republican'
    
df.party = df.president.apply(pres_to_party)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Download stopwords list
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
# Thank you to kaggle user mjmurphy28 for this code
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', '--', '-','...', 'american', 'america', 'world']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]

# Lemmatize the stop words
tokenizer=LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))
documents = df['text']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

#Return pos tag in wordnetlemmatizer format

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

#Create the lemmatoken
# Interface lemma tokenizer from nltk with sklearn
class POSLemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', '--', '-', '...', 'american', 'america', 'world']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t, get_wordnet_pos(t)) for t in word_tokenize(doc) if t not in self.ignore_tokens]

# Lemmatize the stop words
pos_tokenizer=POSLemmaTokenizer()
pos_token_stop=pos_tokenizer(' '.join(stop_words))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Create TF-idf model
vectorizer = TfidfVectorizer(stop_words=token_stop, 
                              tokenizer=tokenizer)
#Fit transform current document
tfidf_doc = vectorizer.fit_transform(documents)

tfidf_array = tfidf_doc.toarray()
tfidf_df = pd.DataFrame(tfidf_array, columns = vectorizer.get_feature_names())
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Columns: 26795 entries, ! to zuloaga
dtypes: float64(26795)
memory usage: 46.6 MB


In [None]:
# Create TF-idf model
pos_vectorizer = TfidfVectorizer(stop_words=pos_token_stop, 
                              tokenizer=pos_tokenizer)
#Fit transform current document
pos_tfidf_doc = pos_vectorizer.fit_transform(documents)

pos_tfidf_array = pos_tfidf_doc.toarray()
pos_tfidf_df = pd.DataFrame(pos_tfidf_array, columns = pos_vectorizer.get_feature_names())
pos_tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Columns: 23483 entries, ! to zuloaga
dtypes: float64(23483)
memory usage: 40.8 MB


In [None]:
df['topic'] = np.arange(0,228)

#Print out the topic-representing words in each year SOTU speech: 
for i in range(0, 228):
    topc = []
    topic_words = tfidf_df.iloc[i, :].sort_values().tail(5).reset_index()
    for word in topic_words['index']:
        topc.append(word)
    df['topic'][i] = topc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
df['pos_topic'] = np.arange(0,228)

#Print out the topic-representing words in each year SOTU speech: 
for i in range(0, 228):
    topcs = []
    topic_words = pos_tfidf_df.iloc[i, :].sort_values().tail(5).reset_index()
    for word in topic_words['index']:
        topcs.append(word)
    df['pos_topic'][i] = topcs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
df.drop(['text'],axis=1).to_csv('SOTU topics.csv', index=False)

In [None]:
from pprint import pprint
from gensim.models.ldamulticore import LdaMulticore

In [None]:
import re
from gensim import models, corpora
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

NUM_TOPICS = 5
STOPWORDS = stopwords.words('english')

wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 

def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return lemmatize_sent(' '.join(cleaned_text))

In [None]:
df['tokens'] = df.text.apply(clean_text)
df.head()

Unnamed: 0,year,president,text,party,topic,pos_topic,tokens
0,1854,Pierce,Fellow-Citizens of the Senate and of the House...,Democratic,"[cyane, greytown, government, united, state]","[cyane, greytown, government, united, state]","[fellow-citizens, senate, house, representativ..."
1,1982,Reagan,"Mr. Speaker, Mr. President, distinguished Memb...",Republican,"[billion, federal, tax, government, program]","[billion, federal, tax, government, program]","[speaker, president, distinguish, member, cong..."
2,1823,Monroe,Fellow-Citizens of the Senate and House of Rep...,Democratic-Republican,"[united, $, post, government, state]","[post, great, make, government, state]","[fellow-citizens, senate, house, representativ..."
3,1840,Buren,Fellow-Citizens of the Senate and House of Rep...,Democratic,"[may, state, upon, public, government]","[may, upon, state, public, government]","[fellow-citizens, senate, house, representativ..."
4,1821,Monroe,Fellow-Citizens of the Senate and House of Rep...,Democratic-Republican,"[power, article, united, state, vessel]","[power, article, united, state, vessel]","[fellow-citizens, senate, house, representativ..."


In [None]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(df.tokens)
#dictionary.filter_extremes(no_below=3, no_above=.03)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in df.tokens]

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus,  
                            num_topics=20, 
                            id2word=dictionary)

print("LDA Model:")
 
for idx in range(20):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 20))

LDA Model:
Topic #0: 0.010*"government" + 0.009*"state" + 0.007*"year" + 0.007*"make" + 0.007*"congress" + 0.006*"great" + 0.006*"country" + 0.005*"united" + 0.005*"american" + 0.004*"time" + 0.004*"people" + 0.004*"would" + 0.004*"one" + 0.004*"upon" + 0.004*"work" + 0.004*"nation" + 0.004*"public" + 0.004*"may" + 0.003*"law" + 0.003*"war"
Topic #1: 0.011*"government" + 0.007*"state" + 0.006*"people" + 0.006*"congress" + 0.006*"year" + 0.006*"great" + 0.006*"make" + 0.005*"time" + 0.005*"american" + 0.005*"nation" + 0.005*"country" + 0.005*"may" + 0.004*"must" + 0.004*"united" + 0.004*"upon" + 0.004*"would" + 0.004*"new" + 0.004*"interest" + 0.004*"law" + 0.004*"war"
Topic #2: 0.012*"state" + 0.008*"government" + 0.006*"nation" + 0.006*"congress" + 0.005*"country" + 0.005*"make" + 0.005*"year" + 0.005*"may" + 0.005*"upon" + 0.005*"united" + 0.005*"public" + 0.004*"great" + 0.004*"would" + 0.004*"people" + 0.004*"present" + 0.004*"law" + 0.004*"act" + 0.003*"new" + 0.003*"one" + 0.003*

In [None]:
df['lda_topic'] = df['tokens']
#def get_most_popular_topic(index)
for j in np.arange(0, 228):
    df['lda_topic'][j] = [i[0] for i in lda_model.get_document_topics(dictionary.doc2bow(df.tokens[j]), minimum_probability=0.2)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
df.head(20)

Unnamed: 0,year,president,text,party,topic,pos_topic,tokens,lda_topic
0,1854,Pierce,Fellow-Citizens of the Senate and of the House...,Democratic,"[cyane, greytown, government, united, state]","[cyane, greytown, government, united, state]","[fellow-citizens, senate, house, representativ...","[15, 16]"
1,1982,Reagan,"Mr. Speaker, Mr. President, distinguished Memb...",Republican,"[billion, federal, tax, government, program]","[billion, federal, tax, government, program]","[speaker, president, distinguish, member, cong...","[4, 8]"
2,1823,Monroe,Fellow-Citizens of the Senate and House of Rep...,Democratic-Republican,"[united, $, post, government, state]","[post, great, make, government, state]","[fellow-citizens, senate, house, representativ...",[13]
3,1840,Buren,Fellow-Citizens of the Senate and House of Rep...,Democratic,"[may, state, upon, public, government]","[may, upon, state, public, government]","[fellow-citizens, senate, house, representativ...",[13]
4,1821,Monroe,Fellow-Citizens of the Senate and House of Rep...,Democratic-Republican,"[power, article, united, state, vessel]","[power, article, united, state, vessel]","[fellow-citizens, senate, house, representativ...","[13, 15]"
5,1810,Madison,Fellow-Citizens of the Senate and House of Rep...,Democratic-Republican,"[government, danish, seminary, state, blockade]","[united, danish, seminary, state, blockade]","[fellow-citizens, senate, house, representativ...","[6, 13]"
6,2008,George W. Bush,"Madam Speaker, Vice President Cheney, Members ...",Republican,"[al, empower, terrorist, iraqi, iraq]","[qaeda, al, terrorist, iraqi, iraq]","[madam, speaker, vice, president, cheney, memb...","[8, 19]"
7,1832,Jackson,Fellow Citizens of the Senate and of the House...,Democratic,"[general, public, may, government, state]","[make, public, may, government, state]","[fellow, citizen, senate, house, representativ...",[13]
8,1808,Jefferson,The Senate and House of Representatives of the...,Democratic-Republican,"[belligerent, state, suspension, decree, embargo]","[belligerent, state, suspension, decree, embargo]","[senate, house, representative, united, state,...",[2]
9,1962,Kennedy,"Mr. Vice President, my old colleague from Mass...",Democratic,"[job, farm, program, nation, new]","[farm, help, program, nation, new]","[vice, president, old, colleague, massachusett...","[8, 19]"


In [93]:
import re
from collections import defaultdict

def sotu_topic_finder(year, num_topics, num_words):
    """
    Find SOTU topics using LDA. The LDA model is only trained on the text of that year topic
    Input: index i of the speech
    Output: list 5 topics found by the model
    """
    # Clean the text
    sent_text = sent_tokenize(df.text[year - 1979])
    token_list = []
    for sent in sent_text:
        cleaned_sent = clean_text(sent)
        token_list.append(cleaned_sent)

    # Prepare the dictionary and corpus
    dictionary = corpora.Dictionary(token_list)
    corpus = [dictionary.doc2bow(text) for text in token_list]

    # Build the LDA model
    lda_model = models.LdaModel(corpus=corpus,  
                                num_topics=num_topics, 
                                id2word=dictionary)

    #Output model
    data = {}
    for idx in range(5):
        # Print the first 10 most representative topics
        res = lda_model.show_topic(idx, num_words)
        topic = idx
        data[idx] = ([{"keyword": keyword, "p": str(p)} for keyword, p in res])

    return data

In [94]:
res = {}
for year in range(1989, 2017):
    res[year] = sotu_topic_finder(year, 5, 5)

print(res)

{1989: {0: [{'keyword': 'year', 'p': '0.013158713'}, {'keyword': 'need', 'p': '0.010056836'}, {'keyword': 'new', 'p': '0.008481822'}, {'keyword': 'would', 'p': '0.006840083'}, {'keyword': 'war', 'p': '0.0061211023'}], 1: [{'keyword': 'alliance', 'p': '0.0083495015'}, {'keyword': 'tax', 'p': '0.00783541'}, {'keyword': 'help', 'p': '0.0065037985'}, {'keyword': 'nation', 'p': '0.0064482214'}, {'keyword': 'year', 'p': '0.0063752374'}], 2: [{'keyword': 'world', 'p': '0.012152851'}, {'keyword': 'nation', 'p': '0.011333406'}, {'keyword': 'defense', 'p': '0.00853749'}, {'keyword': 'economic', 'p': '0.006061211'}, {'keyword': 'free', 'p': '0.0059828907'}], 3: [{'keyword': 'year', 'p': '0.013210074'}, {'keyword': 'tax', 'p': '0.009830507'}, {'keyword': 'percent', 'p': '0.00979413'}, {'keyword': 'billion', 'p': '0.009168062'}, {'keyword': 'increase', 'p': '0.006769138'}], 4: [{'keyword': 'nation', 'p': '0.01495379'}, {'keyword': 'free', 'p': '0.0143929785'}, {'keyword': 'need', 'p': '0.006847537'

In [95]:
import json

with open('data.json', 'w') as fp:
    json.dump(res, fp)