In [1]:
import datetime
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, TfidfModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
def lemmatize_stemming(stemmer, text):
    '''lemmatize and stem the text to get key tokens'''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text, names):
    '''preprocess the quotation list and extract tokens to dictionary'''
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in names:
            result.append(lemmatize_stemming(SnowballStemmer("english"), token))
    return result



In [3]:
# import all quotations data of US
USA_DATA = '../data/quotes_mentions_USA_compact.json.bz2'
df = pd.read_json(USA_DATA, lines=True, compression='bz2' )

In [4]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,phase,mentions,mentions_qids,urls
0,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,Q359442,2015-10-25 14:12:35,1,E,[Bill Clinton],[Q1124],[examiner.com]
1,2015-08-07-005048,All we ╒ re asking for here is a discussion an...,John Boehner,Q11702,2015-08-07 12:52:52,1,E,[Barack Obama],[Q76],[liveblog.irishtimes.com]
2,2015-10-01-005722,An email included in the latest tranche of Cli...,Hillary Clinton,Q6294,2015-10-01 14:56:48,2,E,[Bill Clinton],[Q1124],"[feeds.foxnews.com, www.foxnews.com]"
3,2015-11-17-006368,"and in fact, Secretary of State Kerry was earl...",Phil Bryant,Q887898,2015-11-17 20:03:05,1,E,[John Kerry],[Q22316],[hottytoddy.com]
4,2015-02-14-014011,I have fought Obamacare from Day One and will ...,John Cornyn,Q719568,2015-02-14 21:01:51,2,E,[Barack Obama],[Q76],"[www.politico.com, politico.com]"


In [5]:
df_2020 = df[df["date"] > pd.Timestamp(2020,1,1)]
df_2020.head(5)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,phase,mentions,mentions_qids,urls
423280,2020-02-07-012379,but [ President ] Trump (was) eager to make a ...,President Donald Trump,Q22686,2020-02-07 23:05:05,1,E,[Donald Trump],[Q22686],[uspolitics.einnews.com]
423281,2020-01-29-019304,"Even if it's all true, exactly in the worst-ca...",Kevin Cramer,Q3957020,2020-01-29 01:01:16,1,E,"[John R. Bolton, Joe Biden]","[Q311269, Q6279]",[www.washingtonexaminer.com]
423282,2020-01-02-019876,I enjoy your analysis and instruction on polli...,Steve Bartlett,Q185480,2020-01-02 00:00:00,1,E,[Donald Trump],[Q22686],[feeds.foxnews.com]
423283,2020-01-27-036296,"If your last name was not Biden, do you think ...",Joe Biden,Q6279,2020-01-27 14:33:49,14,E,[Joe Biden],[Q6279],[www.nytimes.com]
423284,2020-04-09-027891,"In short, the Clinton administration's policy ...",President Bill Clinton,Q1124,2020-04-09 17:54:15,1,E,[Bill Clinton],[Q1124],[www.globalresearch.ca]


In [6]:
# import politician properties and get their names
POLITICIAN = "../data/filtered_politician_labeled_v3.json.bz2"
pol_df = pd.read_json(POLITICIAN, lines=True, compression='bz2')

pol_df['aliases_all'] = [a+[b] for a,b in zip(pol_df['aliases'],pol_df['name'])]
pol_df.head()

Unnamed: 0,qid,name,gender,nationality,aliases,parties,positions held,religion,us_congress_id,candidacy_election,aliases_all
0,Q207,George W. Bush,male,Q30,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[Republican Party],"[[Governor of Texas, [+1995-01-17T00:00:00Z]],...",Q329646,,"[2000 United States presidential election, 200...","[George Walker Bush, Bush Jr., Dubya, GWB, Bus..."
1,Q946,Donald Tusk,male,Q36,[Donald Franciszek Tusk],"[Civic Platform, European People's Party]","[[Prime Minister of Poland, [+2007-11-16T00:00...",Q9592,,[2005 Polish presidential election],"[Donald Franciszek Tusk, Donald Tusk]"
2,Q1058,Narendra Modi,male,Q668,"[Modi, Narendra Bhai, Narendra Damodardas Modi...",[Bharatiya Janata Party],"[[Chief Minister of Gujarat, [+2001-10-07T00:0...",Q9089,,[2014 Indian general election in Vadodara Lok ...,"[Modi, Narendra Bhai, Narendra Damodardas Modi..."
3,Q1253,Ban Ki-moon,male,Q884,"[Ban Kimoon, Ban Ki Moon]",[independent politician],"[[United Nations Secretary-General, [+2007-01-...",Q9581,,[],"[Ban Kimoon, Ban Ki Moon, Ban Ki-moon]"
4,Q3996,V. P. Kalairajan,male,Q668,[],[All India Anna Dravida Munnetra Kazhagam],[[Member of the Tamil Nadu Legislative Assembl...,,,[],[V. P. Kalairajan]


In [11]:
# global
# pol_name_global = pol_df["aliases_all"].tolist()
# process_names = [gensim.utils.simple_preprocess(n) for names in pol_name_global for n in names]
# pol_name_global = frozenset([n for names in process_names for n in names])
# print(pol_name_global)

# US
pol_name_us = pol_df[pol_df["nationality"] == "Q30"]["aliases_all"].tolist()
# process_names = [gensim.utils.simple_preprocess(n) for names in pol_name_us for n in names]
pol_name_us = frozenset([n for names in pol_name_us for n in names])
pol_name_us

frozenset({'Bruce H. McMillan',
           'William Wallace Woodman',
           'A. B. Breed',
           'Francis Winnie Qua',
           'Charles P. Brewer',
           'George Burke',
           'Maurice Sullivan',
           'Michelle Stennett',
           'Joel Kleefisch',
           'Carol Chumney',
           'Michael Padilla',
           'Harold J. Brubaker',
           'Amos Lawrence',
           'Eugene E. Donovan',
           'Joseph Patrick Kennedy II',
           'Karen Castor Dentel',
           'Charles Spittal "Chuck" Robb',
           'Arnold Schwarzenegger',
           'George Munsell',
           'Bill Richardson',
           'A. N. Blood',
           'Eric Carl Bauman',
           'Nomiki Konst',
           'Samuel A. Clark',
           'Larry Faircloth',
           'Agnes Charbonneau',
           'A. Holton',
           'Lynn Slaby',
           'Dave Hunt',
           'Don Miller',
           'William Louis Johnson',
           'Eben F. Phillips',
           'Davi

In [60]:
# # import politician properties and get their names
# POLITICIAN = "../data/filtered_politician_labeled_v3.json.bz2"
# pol_df = pd.read_json(POLITICIAN, lines=True, compression='bz2')

# # global
# process_names = [gensim.utils.simple_preprocess(name) for name in pol_df["name"].tolist()]
# global_pol_list = [n for names in process_names for n in names]
# global_pol_names = frozenset(global_pol_list)

# # US
# us_pol_df = pol_df[pol_df["nationality"] == "Q30"]
# process_names = [gensim.utils.simple_preprocess(name) for name in us_pol_df["name"].tolist()]
# # flatten list
# us_pol_list = [n for names in process_names for n in names]
# us_pol_names = frozenset(us_pol_list)

# len(us_pol_names), len(global_pol_names)

(14712, 168493)

In [9]:
# # speed test 
# %timeit "zxczxc" in pol_name_us
# %timeit "zxczxc" in pol_name_global

20.7 ns ± 0.0495 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
17.3 ns ± 0.041 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [14]:
import spacy

# initialize NLP pipeline
nlp = spacy.load('en_core_web_sm')
# customize pipeline
nlp.remove_pipe('lemmatizer') # reduce words to basic form, eg. talking -> talk, president -> presid
nlp.remove_pipe('tagger') # tag the part of speech for the token i.e. noun, verb, etc
# nlp.remove_pipe('parser') # dependency parser, maybe removed if adding in bigram 

('tagger', <spacy.pipeline.tagger.Tagger at 0x7ff852c4a700>)

In [43]:
from nltk.corpus import PlaintextCorpusReader

def get_chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

size = 50
# convert quotations to corpus by year
quotations = "\n".join(df_2020["quotation"].to_list())
print('number of chars {}'.format(len(quotations)))

QUOTE_FILE = './quotations_2020.txt'
corpus_root = "./"
with open(QUOTE_FILE, 'w') as f:
    f.write(quotations)
quotations_corpus = PlaintextCorpusReader(corpus_root, "quotations.*.txt")
# doc = nlp(quotations)
# print("number of sentences in 2020 quotebank: {}".format(len(doc.sents)))
# chunks = get_chunks(doc.sents, size)

number of chars 5227285


In [44]:
quotations_corpus.fileids()

['quotations_2020.txt']

In [46]:
# Get the chunks again (into smaller chunks)
corpus_id = {f:n for n,f in enumerate(quotations_corpus.fileids())} # dictionary of books
chunks = list()
chunk_class = list() # this list contains the original book of the chunk, for evaluation

limit = 500 # how many chunks total
size = 50 # how many sentences per chunk/page

for f in quotations_corpus.fileids():
    sentences = quotations_corpus.sents(f)
    print(f)
    print('Number of sentences:',len(sentences))

    # create chunks
    chunks_of_sents = [x for x in get_chunks(sentences,size)] # this is a list of lists of sentences, which are a list of tokens
    chs = list()

    # regroup so to have a list of chunks which are strings
    for c in chunks_of_sents:
        grouped_chunk = list()
        for s in c:
            grouped_chunk.extend(s)
        chs.append(" ".join(grouped_chunk))
    print("Number of chunks:",len(chs),'\n')

    # filter to the limit, to have the same number of chunks per book
    chunks.extend(chs[:limit])
    chunk_class.extend([corpus_id[f] for _ in range(len(chs[:limit]))])

quotations_2020.txt
Number of sentences: 37162
Number of chunks: 744 



In [94]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_docs = []
for doc in nlp.pipe(chunks, n_process=5, batch_size=10):
     # Process document using Spacy NLP pipeline.
    ents = doc.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    # doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    # if remove lemmentizer in nlp pipe, what happens here

    # convert token to string while remove punctuation and stopwords
    doc = [token.text for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list and keep only words of length 3 or more.
    doc = [token for token in doc if token not in STOPWORDS and len(token) > 2]

    # Remove politician names
#     doc = [token for token in doc if token not in pol_name_us]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(doc)

docs = processed_docs
del processed_docs


In [89]:
processed_docs = []

for doc in nlp.pipe(chunks, n_process=5, batch_size=10):
     # Process document using Spacy NLP pipeline.
    ents = doc.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    # doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    # if remove lemmentizer in nlp pipe, what happens here

    # convert token to string while remove punctuation and stopwords
    doc = [token.text for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list and keep only words of length 3 or more.
    doc = [token for token in doc if token not in STOPWORDS and len(token) > 2]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    # remove names
    doc = [token for token in doc if token not in pol_name_us and len(token) > 2]

    processed_docs.append(doc)
noname_docs = processed_docs
del processed_docs

In [96]:
# 'John Bolton' in exclusion
# te = ['John Bolton', 'Donald', 'Trump', 'Donald Trump']
# [n for n in te if n not in exclusion]
# set.union({'1'}, {'1', '2'})
# STOPWORDS
for chk in docs:
    # print('John Bolton' in noname_docs)
    if 'Donald Trump' in chk:
        print('found')

found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
found
foun

In [91]:
# # Add bigrams to docs (only ones that appear 10 times or more).
# bigram = Phrases(noname_docs, min_count=10)

# for idx in range(len(noname_docs)):
#     for token in bigram[noname_docs[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             noname_docs[idx].append(token)

In [93]:
noname_docs[0]

['President',
 'eager',
 'symbol',
 'Army',
 'officer',
 'sooner',
 'Senate',
 'acquitted',
 'impeachment',
 'charges',
 'approved',
 'House',
 'Democrats',
 'true',
 'exactly',
 'worst',
 'case',
 'scenario',
 'John',
 'Bolton',
 'doesn',
 'change',
 'facts',
 'don',
 'think',
 'learn',
 'new',
 'know',
 'president',
 'concerned',
 'role',
 'Joe',
 'vice',
 'president',
 'United',
 'States',
 'possible',
 'corrupt',
 'activity',
 'Ukraine',
 'proof',
 'know',
 'enjoy',
 'analysis',
 'instruction',
 'polling',
 'hope',
 'future',
 'continue',
 'pithy',
 'insight',
 'topics',
 'polling',
 'registered',
 'voters',
 'versus',
 'likely',
 'voters',
 'polling',
 'sample',
 'size',
 'polling',
 'sample',
 'demographics',
 'regards',
 'President',
 'believe',
 'points',
 'added',
 'percentages',
 'factors',
 'supporters',
 'subject',
 'shaming',
 'public',
 'ridicule',
 'assault',
 'announce',
 'support',
 'Second',
 'disgusted',
 'juvenile',
 'narcissistic',
 'rants',
 'Twitter',
 'impulsive

In [51]:
# # Add bigrams too
# from gensim.models.phrases import Phrases

# # Add bigrams to docs (only ones that appear 15 times or more).
# bigram = Phrases(docs, min_count=10)

# for idx in range(len(docs)):
#     for token in bigram[docs[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             docs[idx].append(token)

In [98]:
docs[0]

['President',
 'Trump',
 'eager',
 'symbol',
 'Army',
 'officer',
 'sooner',
 'Senate',
 'acquitted',
 'impeachment',
 'charges',
 'approved',
 'House',
 'Democrats',
 'true',
 'exactly',
 'worst',
 'case',
 'scenario',
 'John',
 'Bolton',
 'doesn',
 'change',
 'facts',
 'don',
 'think',
 'learn',
 'new',
 'know',
 'president',
 'concerned',
 'Biden',
 'role',
 'Joe',
 'Biden',
 'vice',
 'president',
 'United',
 'States',
 'possible',
 'corrupt',
 'activity',
 'Ukraine',
 'proof',
 'know',
 'enjoy',
 'analysis',
 'instruction',
 'polling',
 'hope',
 'future',
 'continue',
 'pithy',
 'insight',
 'topics',
 'polling',
 'registered',
 'voters',
 'versus',
 'likely',
 'voters',
 'polling',
 'sample',
 'size',
 'polling',
 'sample',
 'demographics',
 'regards',
 'President',
 'Trump',
 'believe',
 'points',
 'added',
 'percentages',
 'factors',
 'Trump',
 'supporters',
 'subject',
 'shaming',
 'public',
 'ridicule',
 'assault',
 'announce',
 'support',
 'Second',
 'disgusted',
 'juvenile',


In [99]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.
from gensim.corpora import Dictionary
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 6528
Number of chunks: 500


In [101]:
dictionary = Dictionary(noname_docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
noname_corpus = [dictionary.doc2bow(doc) for doc in noname_docs]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 6387
Number of chunks: 500


In [100]:
from gensim.models import LdaMulticore

seed = 1
# random.seed(seed)
np.random.seed(seed)

params = {'passes': 10, 'random_state': seed}
base_models = dict()
model_2020 = LdaMulticore(corpus=corpus, num_topics=4, id2word=dictionary, workers=6,
                passes=params['passes'], random_state=params['random_state'])
model_2020.show_topics(num_topics=5)

[(0,
  '0.003*"office" + 0.002*"Mitch" + 0.002*"party" + 0.002*"witnesses" + 0.002*"economy" + 0.002*"Mitch McConnell" + 0.002*"forward" + 0.002*"military" + 0.002*"continue" + 0.002*"let"'),
 (1,
  '0.002*"won" + 0.002*"coronavirus" + 0.002*"Hunter" + 0.002*"Ukraine" + 0.002*"able" + 0.002*"long" + 0.002*"continue" + 0.002*"Hunter Biden" + 0.002*"important" + 0.002*"sure"'),
 (2,
  '0.002*"coronavirus" + 0.002*"world" + 0.002*"voters" + 0.002*"care" + 0.002*"Ukraine" + 0.002*"let" + 0.002*"job" + 0.002*"day" + 0.002*"didn" + 0.002*"office"'),
 (3,
  '0.002*"year" + 0.002*"Vice" + 0.002*"day" + 0.002*"percent" + 0.002*"didn" + 0.002*"China" + 0.002*"voters" + 0.002*"won" + 0.002*"Ukraine" + 0.002*"sure"')]

In [102]:
seed = 1
# random.seed(seed)
np.random.seed(seed)

params = {'passes': 10, 'random_state': seed}
base_models = dict()
model_2020_noname = LdaMulticore(corpus=noname_corpus, num_topics=4, id2word=dictionary, workers=6,
                passes=params['passes'], random_state=params['random_state'])
model_2020_noname.show_topics(num_topics=5)

[(0,
  '0.003*"person" + 0.002*"Ukraine" + 0.002*"coronavirus" + 0.002*"didn" + 0.002*"crisis" + 0.002*"long" + 0.002*"office" + 0.002*"day" + 0.002*"voters" + 0.002*"end"'),
 (1,
  '0.003*"didn" + 0.002*"coronavirus" + 0.002*"won" + 0.002*"saying" + 0.002*"continue" + 0.002*"care" + 0.002*"nation" + 0.002*"witnesses" + 0.002*"Elizabeth" + 0.002*"important"'),
 (2,
  '0.002*"witnesses" + 0.002*"office" + 0.002*"nation" + 0.002*"voters" + 0.002*"party" + 0.002*"let" + 0.002*"forward" + 0.002*"Senator" + 0.002*"Mitch" + 0.002*"states"'),
 (3,
  '0.002*"Ukraine" + 0.002*"coronavirus" + 0.002*"party" + 0.002*"fact" + 0.002*"Hunter" + 0.002*"Democrat" + 0.002*"won" + 0.002*"national" + 0.002*"military" + 0.002*"world"')]

In [104]:
tfidf = TfidfModel(corpus=noname_docs, id2word=dictionary)
tfidf_corpus = tfidf[noname_docs] 

params = {'passes': 10, 'random_state': seed}
base_models = dict()
model_2020_noname_tfidf = LdaMulticore(corpus=tfidf_corpus, num_topics=4, id2word=dictionary, workers=6,
                passes=params['passes'], random_state=params['random_state'])
model_2020_noname_tfidf.show_topics(num_topics=5)

ValueError: too many values to unpack (expected 2)

In [61]:
# pass in name list to remove politician names
sample_quote = df_2020['quotation'].iloc[0]
print("Sample quotation: ", sample_quote)
preprocess(sample_quote, us_pol_names)

Sample quotation:  but [ President ] Trump (was) eager to make a symbol of the Army officer sooner after the Senate acquitted him of the impeachment charges approved by House Democrats.


['presid',
 'eager',
 'symbol',
 'armi',
 'sooner',
 'senat',
 'acquit',
 'impeach',
 'charg',
 'approv',
 'democrat']

In [62]:
# process 2020 data
processed_docs = []
for quotation in df_2020['quotation'].tolist():
    processed_docs.append(preprocess(quotation, pol_name_us))
#words into corpus
dictionary = gensim.corpora.Dictionary(processed_docs)
#convert to bag of words corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [63]:
tfidf = TfidfModel(bow_corpus, id2word=dictionary)
tfidf_corpus = tfidf[bow_corpus] 

In [78]:
# train LDA model with bow corpus
num_topics = 5
lda_2020_bow =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, 
                                   id2word = dictionary,                                    
                                   passes = 10)
MODEL_PATH = "./lda_bow_2020"
lda_2020_bow.save(MODEL_PATH)

In [79]:
# analyze model
for idx, topic in lda_2020_bow.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.015*"administr" + 0.009*"need" + 0.008*"like" + 0.008*"state" + 0.008*"iran" + 0.008*"american" + 0.007*"america" + 0.006*"china" + 0.006*"work" + 0.006*"continu"


Topic: 1 
Words: 0.013*"know" + 0.010*"ukrain" + 0.010*"peopl" + 0.010*"want" + 0.010*"go" + 0.010*"tell" + 0.010*"investig" + 0.009*"think" + 0.008*"time" + 0.008*"senat"


Topic: 2 
Words: 0.020*"say" + 0.012*"administr" + 0.010*"time" + 0.010*"work" + 0.009*"democrat" + 0.008*"coronavirus" + 0.008*"vote" + 0.008*"health" + 0.007*"american" + 0.006*"thing"


Topic: 3 
Words: 0.012*"senat" + 0.012*"impeach" + 0.010*"american" + 0.010*"year" + 0.010*"trial" + 0.009*"vote" + 0.009*"democrat" + 0.008*"countri" + 0.008*"republican" + 0.008*"support"


Topic: 4 
Words: 0.026*"go" + 0.025*"peopl" + 0.024*"think" + 0.019*"like" + 0.018*"democrat" + 0.017*"know" + 0.013*"want" + 0.013*"beat" + 0.010*"say" + 0.009*"support"




In [68]:
# train LDA model with tfidf corpus
num_topics = 5
lda_2020_tfidf =  gensim.models.LdaMulticore(tfidf_corpus, 
                                   num_topics = num_topics, 
                                   id2word = dictionary,                                    
                                   passes = 10)
MODEL_PATH = "./lda_tfidf_2020"
lda_2020_tfidf.save(MODEL_PATH)

In [69]:
# analyze model
for idx, topic in lda_2020_tfidf.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.011*"defeat" + 0.009*"go" + 0.007*"want" + 0.006*"think" + 0.006*"need" + 0.006*"know" + 0.005*"countri" + 0.005*"peopl" + 0.005*"like" + 0.004*"help"


Topic: 1 
Words: 0.012*"beat" + 0.012*"think" + 0.007*"go" + 0.007*"vote" + 0.006*"democrat" + 0.006*"peopl" + 0.005*"candid" + 0.005*"elect" + 0.005*"say" + 0.005*"want"


Topic: 2 
Words: 0.008*"democrat" + 0.007*"go" + 0.006*"right" + 0.006*"think" + 0.006*"know" + 0.006*"thing" + 0.005*"say" + 0.005*"peopl" + 0.005*"parti" + 0.004*"want"


Topic: 3 
Words: 0.008*"senat" + 0.006*"impeach" + 0.005*"administr" + 0.005*"elect" + 0.005*"say" + 0.005*"republican" + 0.004*"like" + 0.004*"vote" + 0.004*"trial" + 0.004*"peopl"


Topic: 4 
Words: 0.008*"know" + 0.007*"talk" + 0.007*"peopl" + 0.006*"work" + 0.006*"like" + 0.006*"year" + 0.005*"say" + 0.005*"time" + 0.005*"american" + 0.005*"support"




In [74]:
# use the model to predict
test_document = df_2020['quotation'].iloc[0]
test_processed = preprocess(test_document, us_pol_names)
bow_vector = dictionary.doc2bow(test_processed)

# bow model
print('original quotation: "{}"'.format(test_document))
print('processed input: "{}"'.format(test_processed))
for index, score in sorted(lda_2020_bow[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_2020_bow.print_topic(index, 5)))

original quotation: "but [ President ] Trump (was) eager to make a symbol of the Army officer sooner after the Senate acquitted him of the impeachment charges approved by House Democrats."
processed input: "['presid', 'eager', 'symbol', 'armi', 'sooner', 'senat', 'acquit', 'impeach', 'charg', 'approv', 'democrat']"
Score: 0.9314004182815552	 Topic: 0.025*"think" + 0.022*"go" + 0.020*"democrat" + 0.015*"vote" + 0.015*"senat"
Score: 0.01739906333386898	 Topic: 0.019*"administr" + 0.011*"work" + 0.009*"like" + 0.009*"health" + 0.009*"year"
Score: 0.01719622313976288	 Topic: 0.022*"peopl" + 0.014*"american" + 0.012*"say" + 0.008*"attack" + 0.007*"administr"
Score: 0.017031699419021606	 Topic: 0.016*"know" + 0.013*"right" + 0.013*"go" + 0.012*"want" + 0.011*"thing"
Score: 0.016972605139017105	 Topic: 0.012*"democrat" + 0.012*"countri" + 0.011*"know" + 0.011*"american" + 0.011*"state"


In [70]:
# tfidf model 
print('original quotation: "{}"'.format(test_document))
print('processed input: "{}"'.format(test_processed))
for index, score in sorted(lda_2020_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_2020_tfidf.print_topic(index, 5)))

original quotation: "but [ President ] Trump (was) eager to make a symbol of the Army officer sooner after the Senate acquitted him of the impeachment charges approved by House Democrats."
processed input: "['presid', 'eager', 'symbol', 'armi', 'sooner', 'senat', 'acquit', 'impeach', 'charg', 'approv', 'democrat']"
Score: 0.9315000772476196	 Topic: 0.008*"senat" + 0.006*"impeach" + 0.005*"administr" + 0.005*"elect" + 0.005*"say"
Score: 0.017363138496875763	 Topic: 0.008*"democrat" + 0.007*"go" + 0.006*"right" + 0.006*"think" + 0.006*"know"
Score: 0.01717216707766056	 Topic: 0.008*"know" + 0.007*"talk" + 0.007*"peopl" + 0.006*"work" + 0.006*"like"
Score: 0.01702079363167286	 Topic: 0.011*"defeat" + 0.009*"go" + 0.007*"want" + 0.006*"think" + 0.006*"need"
Score: 0.01694382168352604	 Topic: 0.012*"beat" + 0.012*"think" + 0.007*"go" + 0.007*"vote" + 0.006*"democrat"
