In [19]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [20]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import datefinder
import spacy
import pandas as pd
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from textblob import TextBlob

In [2]:
# Corpus-specific stopwords (transcript annotations such as crowd reactions)
# that are added on top of gensim's default stopword list.
customstops = {'applause', 'booing', 'inaudible', 'cheers', 'laughter'}

# Final stopword set used by tokenize(): gensim defaults plus the custom ones.
stops = set(STOPWORDS) | customstops

In [3]:
def tokenize(text):
    """Tokenize ``text`` and drop stopwords.

    The text is run through gensim's ``simple_preprocess``; every resulting
    token that appears in the module-level ``stops`` set is discarded.

    Returns:
        list: the remaining tokens, in their original order.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in stops:
            continue
        kept.append(word)
    return kept

def get_speech(doc):
    """Return the longest segment of ``doc`` when split on triple newlines.

    ``doc`` is split on ``"\\n\\n\\n"`` and the segment with the most
    characters is returned. When several segments share the maximum length,
    the first one wins.

    Args:
        doc (str): full text to search.

    Returns:
        str: the longest segment (``doc`` itself if it has no triple newline).
    """
    segments = doc.split("\n\n\n")
    # max() returns the first maximal item, so ties resolve to the earliest
    # segment.
    return max(segments, key=len)

def get_date(d):
    """Return the first date that ``datefinder`` locates in ``d``.

    All matches from ``datefinder.find_dates(d)`` are collected first.

    Returns:
        The first match, or the string ``"no date found"`` when there are no
        matches. Callers therefore have to handle both kinds of value.
    """
    found = list(datefinder.find_dates(d))
    if not found:
        return "no date found"
    return found[0]
    
def get_ne_list(doc):
    """Return the text of each entity the ``nlp`` pipeline finds in ``doc``.

    Uses the module-level ``nlp`` object, which must be loaded before this
    function is called.

    Returns:
        list: ``.text`` of every item in ``nlp(doc).ents``, in order.
    """
    entity_texts = []
    for entity in nlp(doc).ents:
        entity_texts.append(entity.text)
    return entity_texts

In [4]:
# Load the spaCy pipeline once; get_ne_list() and the entity-extraction cell
# both call it through the module-level name `nlp`.
# NOTE(review): 'en' is a shortcut model name -- confirm it still resolves
# under the installed spaCy version.
nlp = spacy.load('en')

In [5]:
# Download the speech index page (the URL query selects campaign=2016TRUMP).
html = requests.get("http://www.presidency.ucsb.edu/2016_election_speeches.php?candidate=45&campaign=2016TRUMP&doctype=5000")

soup = BeautifulSoup(html.text, 'html.parser')

# Collect the relative links that point at individual documents.
# href=True restricts the search to <a> tags that actually carry an href
# attribute; indexing a['href'] on a tag without one raises KeyError.
links = []
for a in soup.find_all('a', href=True):
    if a['href'].startswith("../ws/index.php?pid="):
        links.append(a['href'])

# Rewrite the relative links into absolute URLs that can be requested directly.
links = [link.replace("../ws/index.php?","http://www.presidency.ucsb.edu/ws/?") for link in links]

In [6]:
# Parallel lists: speeches[i] is the speech text and citations[i] is the
# citation text taken from the same page.
speeches = []
citations = []

for link in links:
    doc = requests.get(link)
    docsoup = BeautifulSoup(doc.text, 'html.parser')
    # The longest triple-newline-delimited chunk of the page text is treated
    # as the speech (see get_speech).
    speech = get_speech(docsoup.get_text())
    # Split once and guard the citation lookup: a page without the
    # "\nCitation:" marker used to raise IndexError *after* its text had
    # already been appended to `speeches`, leaving the two lists misaligned.
    parts = speech.split("\nCitation:")
    speeches.append(parts[0])
    citations.append(parts[1] if len(parts) > 1 else "")

# One entry per citation; get_date returns its "no date found" fallback when
# no date is detected in the citation text.
dates = [get_date(c) for c in citations]

In [18]:
# TextBlob polarity score for every speech, in the same order as `speeches`.
sentiment = []
for speech_text in speeches:
    sentiment.append(TextBlob(speech_text).sentiment.polarity)

In [30]:
# Position of the lowest-polarity speech. pd.Series built from a list gets a
# default 0..n-1 index, so the result can be used to index `speeches` and
# `citations`.
pd.Series(sentiment).idxmin()

61

In [34]:
# Build the Series once and reuse it for both lookups; its default integer
# index lines up with the positions in `citations`.
sentiment_series = pd.Series(sentiment)

print("Highest Sentiment Speech:")
print(citations[sentiment_series.idxmax()])
print('\n======================================\n')
print("Lowest Sentiment Speech:")
print(citations[sentiment_series.idxmin()])

Highest Sentiment Speech:
 Donald J. Trump: "Remarks at Great Faith International Ministries in Detroit, Michigan," September 3, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119199.


Lowest Sentiment Speech:
 Donald J. Trump: "Remarks at the Jeffco Fairgrounds Event Center in Golden, Colorado," October 29, 2016. Online by Gerhard Peters and John T. Woolley, The American Presidency Project. http://www.presidency.ucsb.edu/ws/?pid=119181.


In [7]:
# One (label, text) tuple per entity found by the `nlp` pipeline, across all
# speeches in order.
entities = [
    (entity.label_, entity.text)
    for speech_text in speeches
    for entity in nlp(speech_text).ents
]

# Series of the (label, text) tuples.
s = pd.Series(entities)

In [8]:
# Stopword-filtered tokens for every speech.
speeches_token_list = list(map(tokenize, speeches))

# Alternative (disabled): build the dictionary and corpus from named entities
# instead of tokens.
# speeches_ne_list = [get_ne_list(s) for s in speeches]
# dictionary = corpora.Dictionary(speeches_ne_list)
# corpus = [dictionary.doc2bow(s) for s in speeches_ne_list]

# gensim dictionary built from the tokenized speeches.
dictionary = corpora.Dictionary(speeches_token_list)

# Bag-of-words representation of each speech.
corpus = [dictionary.doc2bow(tokens) for tokens in speeches_token_list]

In [9]:
# Build a TF-IDF model from the bag-of-words corpus, then apply it to that
# same corpus to get the TF-IDF representation that the LDA cell consumes:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [10]:
# LDA training is stochastic: without a fixed seed the topics change on every
# run, so the table below could not be reproduced after a kernel restart.
LDA_RANDOM_STATE = 42
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=7,
                      random_state=LDA_RANDOM_STATE)

# using pandas to collect topics in nice tabular format:
# one column per topic, one row per rank among that topic's top words
topn=25
index = range(topn)
df = pd.DataFrame(index=index)
for i in range(lda.num_topics):
    # keep only the first element (the word) of each entry from show_topic
    t = [w[0] for w in lda.show_topic(i, topn=topn)]
    df['topic_%s' % i] = pd.Series(t)

print(df)

        topic_0       topic_1       topic_2     topic_3      topic_4  \
0       indiana         cyber         maine    michigan           va   
1         flint  philadelphia      michigan       haiti     veterans   
2       jackson          navy      donating       gonna      defense   
3         party  pennsylvania      question   venezuela      missile   
4       african       arizona           tpp      legion        guard   
5         black         gonna         flint   wikileaks    employers   
6         looks        health         fight      castro  mississippi   
7            ll      premiums      speeches     florida          war   
8         water       college      virginia         dnc      bonuses   
9      audience     brilliant  universities     student         duty   
10          ubs      colorado         drink      mexico      savings   
11       bishop      deported           ban       flint         navy   
12       police     immigrant        opioid    clintons      vet