In [56]:
import json
from nltk.tokenize import RegexpTokenizer
with open('./trump_tweets.json') as f:
    data = json.load(f)
documents = []
for tweet in data:
    # remove retweets
    if tweet['text'][:2] != "RT":
        documents.append(tweet['text'])


glancing at the data:

In [57]:
print(len(documents))
print(documents[:5])

44729
['Will be doing show with @RushLimbaughEIB at 12:00 P.M. TALK RADIO.  ENJOY!!!', 'Covid Relief Negotiations are moving along. Go Big!', 'Just got a briefing on Hurricane Delta rushing toward Louisiana and Mississippi. @fema is there and ready!!!', 'Crazy Nancy Pelosi is looking at the 25th Amendment in order to replace Joe Biden with Kamala Harris. The Dems want that to happen fast because Sleepy Joe is out of it!!!', 'Steve Scully, the second Debate Moderator, is a Never Trumper, just like the son of the great Mike Wallace. Fix!!!']


In [58]:
import gensim 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import string
np.random.seed(2018)

import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to /Users/erafkin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A function to perform lemmatize and stem preprocessing steps on the data set:
- no @'s
- no links

In [59]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    temp = []
    for token in text.split(" "):
        token = token.lower()
        if token[:1] != '@' and token[:4] != 'http':
            token = token.translate(str.maketrans('', '', string.punctuation))
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(lemmatize_stemming(token))
    return result


picking a doc to preview after preprocessing:

In [68]:
doc_sample = documents[4000]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['.@dagenmcdowell,', '“Every', 'Democrat', 'is', 'running', 'to', 'raise', 'taxes.', 'She', '(Pocahontas)', 'lied', 'about', 'her', 'ethnicity,', 'that', 'was', 'her', 'problem.”', '@MariaBartiromo', '', 'So', 'true', 'Dagen!']


 tokenized and lemmatized document: 
['dagenmcdowel', '“everi', 'democrat', 'run', 'rais', 'tax', 'pocahonta', 'lie', 'ethnic', 'problem”', 'true', 'dagen']


Preprocess the headline text, saving the results as ‘processed_docs’

In [69]:
processed_docs = []
for doc in documents:
    processed_docs.append(preprocess(doc))
processed_docs[:10]

[['1200', 'pm', 'talk', 'radio', 'enjoy'],
 ['covid', 'relief', 'negoti', 'move', 'along', 'big'],
 ['brief', 'hurrican', 'delta', 'rush', 'louisiana', 'mississippi', 'readi'],
 ['crazi',
  'nanci',
  'pelosi',
  'look',
  '25th',
  'amend',
  'order',
  'replac',
  'biden',
  'kamala',
  'harri',
  'dem',
  'want',
  'happen',
  'fast',
  'sleepi',
  'it'],
 ['steve',
  'sculli',
  'second',
  'debat',
  'moder',
  'trumper',
  'like',
  'great',
  'mike',
  'wallac',
  'fix'],
 ['vote'],
 ['hello'],
 ['save', 'second', 'amend', 'virginia', 'go', 'away', 'vote', 'trump'],
 ['nick',
  'complet',
  'total',
  'endors',
  'warrior',
  'virginia',
  'usa',
  'protect',
  'second',
  'amend'],
 ['gallup',
  'poll',
  'come',
  'incred',
  'find',
  'better',
  'today',
  'pandem',
  'year',
  'obiden',
  'highest',
  'number',
  'record',
  'pretti',
  'amaz']]

Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [71]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 1200
1 enjoy
2 pm
3 radio
4 talk
5 along
6 big
7 covid
8 move
9 negoti
10 relief


"Filter out everything that's dumb"
- tokens that appear in less than 15 docs
- tokens that appear in more half of the docs
- keep only the first 100000 most frequent tokens


In [72]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=20000)

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [73]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[100]

[(17, 1),
 (32, 1),
 (112, 1),
 (125, 1),
 (162, 1),
 (289, 1),
 (293, 1),
 (303, 1),
 (334, 1),
 (404, 1),
 (414, 1),
 (428, 1),
 (478, 1),
 (479, 1),
 (480, 1),
 (481, 1)]

preview our bag o' words:

In [74]:
bow_doc_4310 = bow_corpus[100]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 17 ("biden") appears 1 time.
Word 32 ("fix") appears 1 time.
Word 112 ("countri") appears 1 time.
Word 125 ("interview") appears 1 time.
Word 162 ("absolut") appears 1 time.
Word 289 ("hall") appears 1 time.
Word 293 ("town") appears 1 time.
Word 303 ("corrupt") appears 1 time.
Word 334 ("way") appears 1 time.
Word 404 ("pay") appears 1 time.
Word 414 ("disgrac") appears 1 time.
Word 428 ("time") appears 1 time.
Word 478 ("dnc") appears 1 time.
Word 479 ("free") appears 1 time.
Word 480 ("host") appears 1 time.
Word 481 ("public") appears 1 time.


Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.
tf-idf stands for "term frequency inverse document frequency". It is an idicator of the importance of a word. A high score means the word is important, and a low score means less important. It's based on the idea that a word that is very frequent in an individual doc and is very frequent over the entire corpus of documents is unimportant (ex. the, of). 

In [75]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.41109697707787884),
 (1, 0.47409539618980856),
 (2, 0.6734830347647046),
 (3, 0.3907088847635566)]


Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [93]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

For each topic, we will explore the words occuring in that topic and its relative weight.

In [94]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.035*"great" + 0.019*"trump" + 0.016*"apprentic" + 0.016*"tonight" + 0.016*"interview" + 0.015*"look" + 0.013*"night" + 0.011*"golf" + 0.011*"celebr" + 0.011*"america"
Topic: 1 
Words: 0.015*"china" + 0.014*"obama" + 0.012*"amp" + 0.009*"vote" + 0.007*"job" + 0.007*"total" + 0.007*"tax" + 0.007*"countri" + 0.007*"record" + 0.007*"republican"
Topic: 2 
Words: 0.016*"great" + 0.014*"trump" + 0.013*"presid" + 0.013*"work" + 0.012*"obama" + 0.012*"go" + 0.009*"time" + 0.009*"good" + 0.008*"peopl" + 0.007*"amp"
Topic: 3 
Words: 0.030*"trump" + 0.020*"donald" + 0.017*"dont" + 0.013*"think" + 0.011*"like" + 0.010*"great" + 0.010*"deal" + 0.009*"watch" + 0.009*"presid" + 0.008*"obamacar"
Topic: 4 
Words: 0.079*"thank" + 0.032*"you" + 0.021*"great" + 0.018*"trump" + 0.018*"need" + 0.012*"love" + 0.011*"true" + 0.011*"amp" + 0.010*"america" + 0.010*"debt"


running LDA using TF-IDF:

In [95]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"obama" + 0.008*"trump" + 0.006*"want" + 0.005*"amp" + 0.005*"think" + 0.005*"great" + 0.004*"like" + 0.004*"hes" + 0.004*"donald" + 0.004*"peopl"
Topic: 1 Word: 0.016*"great" + 0.009*"america" + 0.008*"you" + 0.008*"thank" + 0.008*"interview" + 0.007*"tonight" + 0.007*"trump" + 0.006*"again" + 0.006*"go" + 0.006*"enjoy"
Topic: 2 Word: 0.008*"cont" + 0.008*"great" + 0.007*"dont" + 0.006*"trump" + 0.006*"love" + 0.005*"best" + 0.005*"like" + 0.005*"news" + 0.005*"amp" + 0.005*"peopl"
Topic: 3 Word: 0.062*"thank" + 0.019*"you" + 0.015*"true" + 0.014*"great" + 0.006*"amp" + 0.006*"apprentic" + 0.006*"trump" + 0.005*"presid" + 0.005*"amaz" + 0.005*"deal"
Topic: 4 Word: 0.014*"trump" + 0.012*"presid" + 0.009*"donald" + 0.009*"thank" + 0.008*"great" + 0.007*"makeamericagreatagain" + 0.007*"luck" + 0.006*"good" + 0.006*"nice" + 0.005*"amp"


Performance evaluation by classifying sample document using LDA Bag of Words model

We will check where our test document would be classified.

In [96]:
processed_docs[120]

['stock',
 'market',
 'big',
 'point',
 '28149',
 'great',
 'news',
 'america',
 'job',
 'job',
 'job']

In [97]:
for index, score in sorted(lda_model[bow_corpus[120]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6696403622627258	 
Topic: 0.015*"china" + 0.014*"obama" + 0.012*"amp" + 0.009*"vote" + 0.007*"job" + 0.007*"total" + 0.007*"tax" + 0.007*"countri" + 0.007*"record" + 0.007*"republican"

Score: 0.2747897505760193	 
Topic: 0.035*"great" + 0.019*"trump" + 0.016*"apprentic" + 0.016*"tonight" + 0.016*"interview" + 0.015*"look" + 0.013*"night" + 0.011*"golf" + 0.011*"celebr" + 0.011*"america"

Score: 0.018671849742531776	 
Topic: 0.079*"thank" + 0.032*"you" + 0.021*"great" + 0.018*"trump" + 0.018*"need" + 0.012*"love" + 0.011*"true" + 0.011*"amp" + 0.010*"america" + 0.010*"debt"

Score: 0.018513984978199005	 
Topic: 0.030*"trump" + 0.020*"donald" + 0.017*"dont" + 0.013*"think" + 0.011*"like" + 0.010*"great" + 0.010*"deal" + 0.009*"watch" + 0.009*"presid" + 0.008*"obamacar"

Score: 0.018384063616394997	 
Topic: 0.016*"great" + 0.014*"trump" + 0.013*"presid" + 0.013*"work" + 0.012*"obama" + 0.012*"go" + 0.009*"time" + 0.009*"good" + 0.008*"peopl" + 0.007*"amp"


try one more:

In [101]:
processed_docs[170]


['half',
 'year',
 'secur',
 'america',
 'border',
 'rebuild',
 'awesom',
 'power',
 'us',
 'militari',
 'obliter',
 'isi',
 'caliph',
 'fix',
 'disastr',
 'trade',
 'deal',
 'bring',
 'job',
 'home',
 'america',
 'minnesota',
 'maga']

In [102]:
for index, score in sorted(lda_model[bow_corpus[170]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7578873038291931	 
Topic: 0.015*"china" + 0.014*"obama" + 0.012*"amp" + 0.009*"vote" + 0.007*"job" + 0.007*"total" + 0.007*"tax" + 0.007*"countri" + 0.007*"record" + 0.007*"republican"

Score: 0.21526038646697998	 
Topic: 0.035*"great" + 0.019*"trump" + 0.016*"apprentic" + 0.016*"tonight" + 0.016*"interview" + 0.015*"look" + 0.013*"night" + 0.011*"golf" + 0.011*"celebr" + 0.011*"america"


Performance evaluation by classifying sample document using LDA TF-IDF model:
- we like tf-idf because we think it is classifiying better

In [103]:
for index, score in sorted(lda_model_tfidf[corpus_tfidf[170]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8469293117523193	 
Topic: 0.014*"trump" + 0.012*"presid" + 0.009*"donald" + 0.009*"thank" + 0.008*"great" + 0.007*"makeamericagreatagain" + 0.007*"luck" + 0.006*"good" + 0.006*"nice" + 0.005*"amp"

Score: 0.03890892118215561	 
Topic: 0.008*"obama" + 0.008*"trump" + 0.006*"want" + 0.005*"amp" + 0.005*"think" + 0.005*"great" + 0.004*"like" + 0.004*"hes" + 0.004*"donald" + 0.004*"peopl"

Score: 0.03819483518600464	 
Topic: 0.016*"great" + 0.009*"america" + 0.008*"you" + 0.008*"thank" + 0.008*"interview" + 0.007*"tonight" + 0.007*"trump" + 0.006*"again" + 0.006*"go" + 0.006*"enjoy"

Score: 0.03815516456961632	 
Topic: 0.062*"thank" + 0.019*"you" + 0.015*"true" + 0.014*"great" + 0.006*"amp" + 0.006*"apprentic" + 0.006*"trump" + 0.005*"presid" + 0.005*"amaz" + 0.005*"deal"

Score: 0.0378117561340332	 
Topic: 0.008*"cont" + 0.008*"great" + 0.007*"dont" + 0.006*"trump" + 0.006*"love" + 0.005*"best" + 0.005*"like" + 0.005*"news" + 0.005*"amp" + 0.005*"peopl"


Real deal: testing on unseen document

In [109]:
unseen_document = 'if i win the election the rich won\'t have to pay taxes'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8330960273742676	 Topic: 0.015*"china" + 0.014*"obama" + 0.012*"amp" + 0.009*"vote" + 0.007*"job"
Score: 0.043201327323913574	 Topic: 0.030*"trump" + 0.020*"donald" + 0.017*"dont" + 0.013*"think" + 0.011*"like"
Score: 0.042434994131326675	 Topic: 0.079*"thank" + 0.032*"you" + 0.021*"great" + 0.018*"trump" + 0.018*"need"
Score: 0.04113909974694252	 Topic: 0.016*"great" + 0.014*"trump" + 0.013*"presid" + 0.013*"work" + 0.012*"obama"
Score: 0.040128543972969055	 Topic: 0.035*"great" + 0.019*"trump" + 0.016*"apprentic" + 0.016*"tonight" + 0.016*"interview"


### Conclusion

this one makes a little more sense. but the parser needs to be better. a lot of words that don't matter were left over. 
