In [1]:
import pandas as pd
import os
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS
from scipy import misc

In [2]:
path = 'state-of-the-union-corpus-1989-2017'
dirs = os.listdir(path)

In [6]:
from nltk.corpus import brown
 
data = []
 
for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)
    
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:2])

500


In [7]:
data[:1]



Let's mimic the above data creation, where we'll essentially put each speech as an element of data.

In [10]:
data = []

for i in range(len(dirs)):
    filename = os.path.join(path, dirs[i])
    text_file = open(filename, "r")
    
    lines = text_file.read()
    lines = lines.replace('\n', ' ')
    
    data.append(lines)

In [12]:
data[1]

'Gentlemen of the Senate and Gentlemen of the House of Representatives:  While with reverence and resignation we contemplate the dispensations of Divine Providence in the alarming and destructive pestilence with which several of our cities and towns have been visited, there is cause for gratitude and mutual congratulations that the malady has disappeared and that we are again permitted to assemble in safety at the seat of Government for the discharge of our important duties. But when we reflect that this fatal disorder has within a few years made repeated ravages in some of our principal sea ports, and with increased malignancy, and when we consider the magnitude of the evils arising from the interruption of public and private business, whereby the national interests are deeply affected, I think it my duty to invite the Legislature of the Union to examine the expediency of establishing suitable regulations in aid of the health laws of the respective States; for these being formed on th

Basically following https://nlpforhackers.io/topic-modeling/

In [13]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords



In [16]:
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

In [17]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [18]:
tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))

In [24]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data[:-2]) # holding out last example
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [20]:
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])

[(4, 3), (15, 2), (27, 3), (29, 1), (30, 2), (31, 7), (34, 1), (40, 1), (56, 1), (59, 1), (61, 1), (64, 9), (67, 1), (70, 3), (72, 1), (73, 1), (80, 1), (82, 1), (83, 1), (84, 2), (85, 1), (88, 2), (89, 2), (97, 1), (103, 2), (111, 10), (123, 2), (124, 2), (126, 1), (131, 2), (136, 4), (140, 1), (141, 1), (145, 6), (146, 2), (153, 1), (157, 1), (167, 1), (170, 4), (178, 1), (180, 6), (194, 1), (196, 1), (197, 1), (198, 2), (200, 1), (208, 6), (209, 2), (210, 1), (211, 2), (219, 3), (221, 1), (222, 2), (225, 8), (226, 1), (227, 6), (229, 7), (231, 1), (232, 8), (239, 2), (243, 4), (245, 3), (246, 10), (247, 1), (249, 3), (251, 1), (252, 1), (253, 1), (255, 2), (281, 1), (287, 1), (291, 2), (292, 1), (298, 1), (301, 4), (304, 5), (305, 1), (307, 5), (309, 1), (311, 3), (313, 3), (317, 2), (320, 4), (322, 15), (323, 1), (327, 7), (328, 2), (329, 1), (330, 1), (331, 1), (332, 2), (333, 5), (334, 1), (335, 1), (337, 2), (338, 1), (343, 4), (344, 3), (346, 4), (348, 3), (349, 49), (351, 13),

In [25]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [26]:
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

And now display inferred topics

In [27]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
    
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
    
print("=" * 20)

LDA Model:
Topic #0: 0.008*"government" + 0.008*"states" + 0.006*"congress" + 0.005*"year" + 0.005*"united" + 0.004*"country" + 0.004*"must" + 0.004*"may" + 0.004*"people" + 0.004*"upon"
Topic #1: 0.008*"government" + 0.007*"states" + 0.006*"congress" + 0.005*"year" + 0.005*"country" + 0.005*"people" + 0.005*"may" + 0.004*"must" + 0.004*"upon" + 0.004*"new"
Topic #2: 0.010*"government" + 0.008*"states" + 0.008*"upon" + 0.005*"would" + 0.005*"congress" + 0.005*"people" + 0.004*"public" + 0.004*"united" + 0.004*"year" + 0.004*"may"
Topic #3: 0.009*"government" + 0.009*"states" + 0.006*"united" + 0.006*"congress" + 0.005*"people" + 0.004*"year" + 0.004*"made" + 0.004*"country" + 0.004*"would" + 0.004*"upon"
Topic #4: 0.008*"government" + 0.007*"states" + 0.006*"united" + 0.006*"congress" + 0.005*"year" + 0.004*"people" + 0.004*"would" + 0.004*"may" + 0.004*"great" + 0.004*"upon"
Topic #5: 0.007*"states" + 0.006*"government" + 0.006*"united" + 0.006*"would" + 0.006*"congress" + 0.005*"new"

We held out the last example (Trump speeches) so we can use that.

In [28]:
data[-1]

'GENTLEMEN OF THE CONGRESS:  When I addressed myself to performing the duty laid upon the President by the Constitution to present to you an annual report on the state of the Union, I found my thought dominated by an immortal sentence of Abraham Lincoln\'s--"Let us have faith that right makes might, and in that faith let us dare to do our duty as we understand it"--a sentence immortal because it embodies in a form of utter simplicity and purity the essential faith of the nation, the faith in which it was conceived, and the faith in which it has grown to glory and power. With that faith and the birth of a nation founded upon it came the hope into the world that a new order would prevail throughout the affairs of mankind, an order in which reason and right would take precedence over covetousness and force; and I believe that I express the wish and purpose of every thoughtful American when I say that this sentence marks for us in the plainest manner the part we should play alike in the ar

In [29]:
example = "The year has been characterized by the progressive withdrawal of the Treasury from the domestic credit market and from a position of dominant influence in that market."

In [30]:
bow = dictionary.doc2bow(clean_text(example))

In [31]:
print(lsi_model[bow])

[(0, 0.29349655404355673), (1, -0.0706129875860652), (2, 0.3020381508604294), (3, -0.016723455123898324), (4, 0.11702346681629693), (5, -0.36698775326778427), (6, 0.22984280126426251), (7, 0.1947918056256692), (8, 0.17377632012800726), (9, -0.039528434166479165)]


LSI model suggests Topic 2. It's of note that the inferred topic includes the words "fiscal", "expenditures", "war" and "program"--there's definitely a good case for Topic 2 but some irrelevant features exist in the topic itself.

In [32]:
print(lda_model[bow])

[(3, 0.1997249), (4, 0.7387175)]


The LDA model tells us topic 4 has a strong representation--verifying with the text confirms this somewhat intuitively.