In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import datefinder
import spacy
import pandas as pd
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from textblob import TextBlob
import numpy as np
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
from gensim.summarization import (summarize, keywords)
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

print __version__ # requires version >= 1.9.0

from plotly.graph_objs import Scatter, Figure, Layout
import plotly.plotly as py
import plotly.graph_objs as go

2.0.6


In [4]:
init_notebook_mode(connected=True)

In [5]:
stops = set([s for s in STOPWORDS])

# adding corpus-specific stopwords to gensim's default stopwords:
customstops = set(['applause','booing','inaudible','cheers','laughter'])
stops = stops.union(customstops)

In [6]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in stops]

def get_speech(doc):
    length = 0
    segs = doc.split("\n\n\n")
    for i in segs:
        if len(i) > length:
            length = len(i)
    speech=[i for i in segs if len(i)==length]
    return speech[0]

def get_date(d):
    matches = list(datefinder.find_dates(d))
    if len(matches)>0:
        return matches[0]
    else:
        return "no date found"
    
def get_ne_list(doc):
    return [e.text for e in list(nlp(doc).ents)]

In [7]:
nlp = spacy.load('en')

In [8]:
html = requests.get("http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000")

soup = BeautifulSoup(html.text, 'html.parser')

links = []
for a in soup.find_all('a'):
    if a['href'].startswith("../ws/index.php?pid="):
        links.append(a['href'])

links = list(set([l.replace("../ws/index.php?","http://www.presidency.ucsb.edu/ws/?") for l in links]))

In [9]:
speeches = []
citations = []

for l in links:
    doc = requests.get(l)
    docsoup = BeautifulSoup(doc.text, 'html.parser')
    speech = get_speech(docsoup.get_text())
    speeches.append(speech.split("\nCitation:")[0])
    citations.append(speech.split("\nCitation:")[1])
    
dates = [get_date(c) for c in citations]

In [10]:
sentiment = [TextBlob(s).sentiment.polarity for s in speeches]

In [11]:
print("Highest Sentiment Speech:")
print(citations[pd.Series(sentiment).idxmax()])
print('\n======================================\n')
print("Lowest Sentiment Speech:")
print(citations[pd.Series(sentiment).idxmin()])

Highest Sentiment Speech:
 Donald J. Trump: "Remarks at Great Faith International Ministries in Detroit, Michigan," September 3, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119199.


Lowest Sentiment Speech:
 Donald J. Trump: "Remarks at the Jeffco Fairgrounds Event Center in Golden, Colorado," October 29, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119181.


In [12]:
entities = []

for s in speeches:
    ents = nlp(s).ents
    for e in ents:
        entities.append((e.label_, e.text))

s = pd.Series(entities)

In [13]:
speeches_token_list = [tokenize(s) for s in speeches]
# speeches_ne_list = [get_ne_list(s) for s in speeches]

# dictionary = corpora.Dictionary(speeches_ne_list)
dictionary = corpora.Dictionary(speeches_token_list)

# corpus = [dictionary.doc2bow(s) for s in speeches_ne_list]
corpus = [dictionary.doc2bow(s) for s in speeches_token_list]

In [14]:
# transform the corpus from bag-of-words to Tfidf:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [15]:
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=7)

# using pandas to collect topics in nice tabular format
topn=25
index = range(topn)
df = pd.DataFrame(index=index)
for i in range(lda.num_topics):
    t = [w[0] for w in lda.show_topic(i, topn=topn)]
    df['topic_%s' % i] = pd.Series(t)

print(df)

     topic_0       topic_1     topic_2   topic_3      topic_4       topic_5  \
0   donating         cyber          va  question       israel         folks   
1    indiana  philadelphia   hampshire     maine     michigan          ohio   
2     follow          navy    veterans     looks      detroit       winning   
3         ll       student       flint    donald        flint      fighting   
4       know  pennsylvania      legion    estate       growth         think   
5      tired       machine    michigan    pueblo       reince            ll   
6      folks       defense       gonna      code         baby          know   
7      think         guard       water  colorado  palestinian      ericsson   
8         ok           war         tpp   african        folks    corruption   
9        lot        health     veteran    future        guard         legal   
10     loans       attacks      follow    heroes        think         farms   
11  carolina         gonna   employers      said    

In [16]:
D = np.empty((0,300))

for d in speeches:
    D = np.vstack([D,nlp(d).vector])

In [17]:
# t-SNE embedding
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
D_tsne = tsne.fit_transform(D)

In [18]:
def get_keywords(s):
    try:
        return keywords(s, ratio=0.1)
    except:
        return 'na'

In [19]:
kwds = [get_keywords(s) for s in speeches]

In [20]:
ht = ["<br>".join([str(i),x]) for i,x in enumerate(kwds)]

trace = go.Scatter(
    x=D_tsne[:,0],
    y=D_tsne[:,1],
    mode='markers',
    marker= dict(size= 8,
                 line= dict(width=1),
                 opacity= 0.3
                ),
    text= ht,
    hoverinfo='text')

data = [trace]

iplot(data)

In [21]:
# print(citations[62])
# print("\n")
# print(kwds[62])
# print("\n")
# print(speeches[62])

print(" ".join(kwds[35].split("\n")))
print("\n")
print(" ".join(kwds[69].split("\n")))

americans americanism new trade taxes taxing tax jobs job american economic growth regulations regulating regulation plan plans million millions energy great america clinton incomes income people infrastructure policy policies federal including includes dollar dollars production product productive wealth years year come comes coming savings save saved calls completely complete nation national rate rates working work worked business reforms reform reduce reduces reduced single care economy cost costs property class power benefit earned earning entitlements


applause booing right american americans hillary people clinton clintons great state america national nations nation states office officers officer gonna said total totally says foreign time times law means mean meaning jobs job bad justice justices vote votes worse obamacare news new florida obama cubans cuban immigrant immigration immigrants thing like thank taxes tax saying country countries folks movement little school schools w

In [22]:
dist_out = 1-pairwise_distances(D_tsne, metric="l1")
dist_out[dist_out==1.0] = np.nan

In [23]:
maxpair = np.unravel_index(np.nanargmax(dist_out), dist_out.shape)
minpair = np.unravel_index(np.nanargmin(dist_out), dist_out.shape)

In [24]:
print(maxpair[0])
print(citations[maxpair[0]])
print(" ".join(kwds[maxpair[0]].split("\n")))
print("\n")
print(maxpair[1])
print(citations[maxpair[0]])
print(" ".join(kwds[maxpair[1]].split("\n")))

35
 Donald J. Trump: "Remarks to the Economic Club of New York at the Waldorf Astoria in New York City," September 15, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119209.
americans americanism new trade taxes taxing tax jobs job american economic growth regulations regulating regulation plan plans million millions energy great america clinton incomes income people infrastructure policy policies federal including includes dollar dollars production product productive wealth years year come comes coming savings save saved calls completely complete nation national rate rates working work worked business reforms reform reduce reduces reduced single care economy cost costs property class power benefit earned earning entitlements


68
 Donald J. Trump: "Remarks to the Economic Club of New York at the Waldorf Astoria in New York City," September 15, 2016. Online by Gerhard Peters and John T. Woolley, The American P

In [25]:
print(minpair[0])
print(citations[minpair[0]])
print(" ".join(kwds[minpair[0]].split("\n")))
print("\n")
print(minpair[1])
print(citations[minpair[0]])
print(" ".join(kwds[minpair[1]].split("\n")))

5
 Donald J. Trump: "Remarks at the Cross Insurance Center in Bangor, Maine," October 15, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119172.
drugs drug hillary clinton clintons american americans like jobs trade treatment treatments taxes tax country countries opioid america deal mike addicted addiction addictive criminals criminal started start starting prescribe opioids corruption corrupt deals reduce come justice justices recovery immigration immigrant immigrants media heroin prescribers prescribing prescribed bring bringing use regulations regulation times time expanding expand government governments traffickers nafta crime crimes end


33
 Donald J. Trump: "Remarks at the Cross Insurance Center in Bangor, Maine," October 15, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119172.
american americans millions million america sta