In [1]:
import datetime
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [11]:
def lemmatize_stemming(stemmer, text):
    '''lemmatize and stem the text to get key tokens'''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text, names):
    '''preprocess the quotation list and extract tokens to dictionary'''
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in names:
            result.append(lemmatize_stemming(SnowballStemmer("english"), token))
    return result



In [4]:
# import all quotations data of US
USA_DATA = '../data/quotes_mentions_USA_compact.json.bz2'
df = pd.read_json(USA_DATA, lines=True, compression='bz2' )

In [5]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,phase,mentions,mentions_qids,urls
0,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,Q359442,2015-10-25 14:12:35,1,E,[Bill Clinton],[Q1124],[examiner.com]
1,2015-08-07-005048,All we ╒ re asking for here is a discussion an...,John Boehner,Q11702,2015-08-07 12:52:52,1,E,[Barack Obama],[Q76],[liveblog.irishtimes.com]
2,2015-10-01-005722,An email included in the latest tranche of Cli...,Hillary Clinton,Q6294,2015-10-01 14:56:48,2,E,[Bill Clinton],[Q1124],"[feeds.foxnews.com, www.foxnews.com]"
3,2015-11-17-006368,"and in fact, Secretary of State Kerry was earl...",Phil Bryant,Q887898,2015-11-17 20:03:05,1,E,[John Kerry],[Q22316],[hottytoddy.com]
4,2015-02-14-014011,I have fought Obamacare from Day One and will ...,John Cornyn,Q719568,2015-02-14 21:01:51,2,E,[Barack Obama],[Q76],"[www.politico.com, politico.com]"


In [9]:
df_2020 = df[df["date"] > pd.Timestamp(2020,1,1)]
df_2020.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,phase,mentions,mentions_qids,urls
423280,2020-02-07-012379,but [ President ] Trump (was) eager to make a ...,President Donald Trump,Q22686,2020-02-07 23:05:05,1,E,[Donald Trump],[Q22686],[uspolitics.einnews.com]
423281,2020-01-29-019304,"Even if it's all true, exactly in the worst-ca...",Kevin Cramer,Q3957020,2020-01-29 01:01:16,1,E,"[John R. Bolton, Joe Biden]","[Q311269, Q6279]",[www.washingtonexaminer.com]
423282,2020-01-02-019876,I enjoy your analysis and instruction on polli...,Steve Bartlett,Q185480,2020-01-02 00:00:00,1,E,[Donald Trump],[Q22686],[feeds.foxnews.com]
423283,2020-01-27-036296,"If your last name was not Biden, do you think ...",Joe Biden,Q6279,2020-01-27 14:33:49,14,E,[Joe Biden],[Q6279],[www.nytimes.com]
423284,2020-04-09-027891,"In short, the Clinton administration's policy ...",President Bill Clinton,Q1124,2020-04-09 17:54:15,1,E,[Bill Clinton],[Q1124],[www.globalresearch.ca]


In [14]:
# import politician properties and get their names
POLITICIAN = "../data/filtered_politician_labeled_v3.json.bz2"
pol_df = pd.read_json(POLITICIAN, lines=True, compression='bz2')

# global
process_names = [gensim.utils.simple_preprocess(name) for name in pol_df["name"].tolist()]
global_pol_list = [n for names in process_names for n in names]
global_pol_names = frozenset(global_pol_list)

# US
us_pol_df = pol_df[pol_df["nationality"] == "Q30"]
process_names = [gensim.utils.simple_preprocess(name) for name in us_pol_df["name"].tolist()]
# flatten list
us_pol_list = [n for names in process_names for n in names]
us_pol_names = frozenset(us_pol_list)

len(us_pol_names), len(global_pol_names)

(14712, 168493)

In [16]:
# speed test 
%timeit "clinton" in us_pol_names
%timeit "clinton" in us_pol_list
%timeit "clinton" in global_pol_names
%timeit "clinton" in global_pol_list # idk why this is the fastest, maybe cache?


22.1 ns ± 0.0374 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
38.9 µs ± 97.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
28.2 ns ± 0.208 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
9.56 µs ± 12.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [18]:
# pass in name list to remove politician names
sample_quote = df_2020['quotation'].iloc[0]
print("Sample quotation: ", sample_quote)
preprocess(sample_quote, us_pol_names)

Sample quotation:  but [ President ] Trump (was) eager to make a symbol of the Army officer sooner after the Senate acquitted him of the impeachment charges approved by House Democrats.


['presid',
 'eager',
 'symbol',
 'armi',
 'sooner',
 'senat',
 'acquit',
 'impeach',
 'charg',
 'approv',
 'democrat']

In [19]:
# process 2020 data
processed_docs = []
for quotation in df_2020['quotation'].tolist():
    processed_docs.append(preprocess(quotation, us_pol_names))
dictionary = gensim.corpora.Dictionary(processed_docs) #words into corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs] #convert to bag of words

In [20]:
# save model
num_topics = 8
lda_2020 =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, 
                                   id2word = dictionary,                                    
                                   passes = 10)
MODEL_PATH = "./lda_model_2020"
lda_2020.save(MODEL_PATH)

In [22]:
# analyze model
for idx, topic in lda_2020.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.040*"presid" + 0.028*"american" + 0.014*"peopl" + 0.011*"state" + 0.009*"administr" + 0.008*"countri" + 0.007*"america" + 0.007*"iran" + 0.006*"unit" + 0.006*"work"


Topic: 1 
Words: 0.036*"go" + 0.029*"think" + 0.020*"beat" + 0.019*"peopl" + 0.018*"presid" + 0.013*"work" + 0.011*"come" + 0.010*"candid" + 0.010*"like" + 0.010*"defeat"


Topic: 2 
Words: 0.020*"administr" + 0.015*"say" + 0.012*"peopl" + 0.012*"state" + 0.011*"presid" + 0.009*"want" + 0.007*"protect" + 0.006*"need" + 0.006*"american" + 0.006*"govern"


Topic: 3 
Words: 0.033*"presid" + 0.032*"senat" + 0.019*"vote" + 0.017*"democrat" + 0.014*"trial" + 0.013*"impeach" + 0.012*"republican" + 0.012*"wit" + 0.012*"say" + 0.010*"support"


Topic: 4 
Words: 0.028*"presid" + 0.015*"state" + 0.012*"know" + 0.012*"like" + 0.011*"work" + 0.011*"think" + 0.011*"peopl" + 0.011*"want" + 0.008*"year" + 0.006*"democrat"


Topic: 5 
Words: 0.049*"presid" + 0.016*"know" + 0.010*"say" + 0.010*"state" + 0.009*"year" + 0.

In [30]:
# use the model to predict
test_document = df_2020['quotation'].iloc[0]
test_processed = preprocess(test_document, us_pol_names)
bow_vector = dictionary.doc2bow(test_processed)

print('original quotation: "{}"'.format(test_document))
print('processed input: "{}"'.format(test_processed))
for index, score in sorted(lda_2020[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_2020.print_topic(index, 5)))

original quotation: "but [ President ] Trump (was) eager to make a symbol of the Army officer sooner after the Senate acquitted him of the impeachment charges approved by House Democrats."
processed input: "['presid', 'eager', 'symbol', 'armi', 'sooner', 'senat', 'acquit', 'impeach', 'charg', 'approv', 'democrat']"
Score: 0.7480118870735168	 Topic: 0.049*"presid" + 0.016*"know" + 0.010*"say" + 0.010*"state" + 0.009*"year"
Score: 0.1893889158964157	 Topic: 0.033*"presid" + 0.032*"senat" + 0.019*"vote" + 0.017*"democrat" + 0.014*"trial"
Score: 0.010440495796501637	 Topic: 0.019*"democrat" + 0.017*"presid" + 0.014*"elect" + 0.012*"investig" + 0.011*"talk"
Score: 0.010437259450554848	 Topic: 0.020*"administr" + 0.015*"say" + 0.012*"peopl" + 0.012*"state" + 0.011*"presid"
Score: 0.010431854985654354	 Topic: 0.028*"presid" + 0.015*"state" + 0.012*"know" + 0.012*"like" + 0.011*"work"
Score: 0.010430732741951942	 Topic: 0.026*"presid" + 0.024*"think" + 0.011*"need" + 0.010*"like" + 0.009*"tell