In [3]:
import json
from nltk.tokenize import RegexpTokenizer
with open('./trump_tweets.json') as f:
    data = json.load(f)
documents = []
for tweet in data:
    # remove retweets
    if tweet['text'][:2] != "RT":
        documents.append(tweet['text'])


glancing at the data:

In [4]:
print(len(documents))
print(documents[:5])

44729
['Will be doing show with @RushLimbaughEIB at 12:00 P.M. TALK RADIO.  ENJOY!!!', 'Covid Relief Negotiations are moving along. Go Big!', 'Just got a briefing on Hurricane Delta rushing toward Louisiana and Mississippi. @fema is there and ready!!!', 'Crazy Nancy Pelosi is looking at the 25th Amendment in order to replace Joe Biden with Kamala Harris. The Dems want that to happen fast because Sleepy Joe is out of it!!!', 'Steve Scully, the second Debate Moderator, is a Never Trumper, just like the son of the great Mike Wallace. Fix!!!']


In [5]:
import gensim 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import string
np.random.seed(2018)

import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to /Users/erafkin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A function to perform lemmatize and stem preprocessing steps on the data set:
- no @'s
- no links

In [6]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    temp = []
    for token in text.split(" "):
        token = token.lower()
        if token[:1] != '@' and token[:4] != 'http':
            token = token.translate(str.maketrans('', '', string.punctuation))
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(lemmatize_stemming(token))
    return result


picking a doc to preview after preprocessing:

In [10]:
doc_sample = documents[100]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Did', 'anyone', 'get', 'to', 'see', 'that', 'absolute', '“Joke”', 'of', 'a', 'Town', 'Hall', 'interview', 'that', 'Joe', 'Biden', 'did', 'with', 'Concast', '@NBCNews', ',', 'hosted', 'by', 'Lester', 'Holt?', 'What', 'a', 'disgrace', 'to', 'our', 'Country', 'that', 'FREE', 'public', 'airwaves', 'can', 'be', 'used', 'that', 'way.', 'All', 'SOFTBALLS.', 'A', 'big', 'FIX.', 'Time', 'should', 'be', 'paid', 'by', 'the', 'corrupt', 'DNC!']


 tokenized and lemmatized document: 
['absolut', '“joke”', 'town', 'hall', 'interview', 'biden', 'concast', 'host', 'lester', 'holt', 'disgrac', 'countri', 'free', 'public', 'airwav', 'softbal', 'time', 'pay', 'corrupt']


Preprocess the headline text, saving the results as ‘processed_docs’

In [11]:
processed_docs = []
for doc in documents:
    processed_docs.append(preprocess(doc))
processed_docs[:10]

[['1200', 'talk', 'radio', 'enjoy'],
 ['covid', 'relief', 'negoti', 'move'],
 ['brief', 'hurrican', 'delta', 'rush', 'louisiana', 'mississippi', 'readi'],
 ['crazi',
  'nanci',
  'pelosi',
  'look',
  '25th',
  'amend',
  'order',
  'replac',
  'biden',
  'kamala',
  'harri',
  'dem',
  'want',
  'happen',
  'fast',
  'sleepi'],
 ['steve',
  'sculli',
  'second',
  'debat',
  'moder',
  'trumper',
  'like',
  'great',
  'mike',
  'wallac'],
 ['vote'],
 ['hello'],
 ['save', 'second', 'amend', 'virginia', 'go', 'away', 'vote', 'trump'],
 ['nick',
  'complet',
  'total',
  'endors',
  'warrior',
  'virginia',
  'protect',
  'second',
  'amend'],
 ['gallup',
  'poll',
  'come',
  'incred',
  'find',
  'better',
  'today',
  'pandem',
  'year',
  'obiden',
  'highest',
  'number',
  'record',
  'pretti',
  'amaz']]

Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 1200
1 enjoy
2 radio
3 talk
4 covid
5 move
6 negoti
7 relief
8 brief
9 delta
10 hurrican


"Filter out everything that's dumb"
- tokens that appear in less than 15 docs
- tokens that appear in more half of the docs
- keep only the first 100000 most frequent tokens


In [14]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=20000)

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[100]

[(14, 1),
 (101, 1),
 (111, 1),
 (146, 1),
 (262, 1),
 (266, 1),
 (276, 1),
 (367, 1),
 (376, 1),
 (390, 1),
 (436, 1),
 (437, 1),
 (438, 1)]

preview our bag o' words:

In [16]:
bow_doc_4310 = bow_corpus[100]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 14 ("biden") appears 1 time.
Word 101 ("countri") appears 1 time.
Word 111 ("interview") appears 1 time.
Word 146 ("absolut") appears 1 time.
Word 262 ("hall") appears 1 time.
Word 266 ("town") appears 1 time.
Word 276 ("corrupt") appears 1 time.
Word 367 ("pay") appears 1 time.
Word 376 ("disgrac") appears 1 time.
Word 390 ("time") appears 1 time.
Word 436 ("free") appears 1 time.
Word 437 ("host") appears 1 time.
Word 438 ("public") appears 1 time.


Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.
tf-idf stands for "term frequency inverse document frequency". It is an idicator of the importance of a word. A high score means the word is important, and a low score means less important. It's based on the idea that a word that is very frequent in an individual doc and is very frequent over the entire corpus of documents is unimportant (ex. the, of). 

In [17]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.4669044380529935), (1, 0.7649100709039481), (2, 0.4437486103145963)]


Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [18]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

For each topic, we will explore the words occuring in that topic and its relative weight.

In [19]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"great" + 0.017*"deal" + 0.015*"obama" + 0.014*"america" + 0.014*"china" + 0.009*"vote" + 0.009*"obamacar" + 0.007*"iran" + 0.007*"spend" + 0.006*"need"
Topic: 1 
Words: 0.023*"cont" + 0.012*"like" + 0.011*"year" + 0.010*"great" + 0.008*"trump" + 0.007*"go" + 0.007*"debt" + 0.007*"success" + 0.007*"happi" + 0.007*"peopl"
Topic: 2 
Words: 0.031*"great" + 0.025*"dont" + 0.012*"thank" + 0.012*"good" + 0.012*"today" + 0.010*"hotel" + 0.010*"beauti" + 0.010*"work" + 0.010*"miss" + 0.009*"look"
Topic: 3 
Words: 0.060*"trump" + 0.030*"donald" + 0.020*"watch" + 0.020*"interview" + 0.017*"apprentic" + 0.017*"tonight" + 0.014*"great" + 0.013*"night" + 0.010*"celebr" + 0.009*"thank"
Topic: 4 
Words: 0.053*"thank" + 0.022*"presid" + 0.017*"great" + 0.015*"trump" + 0.015*"countri" + 0.013*"need" + 0.010*"america" + 0.010*"peopl" + 0.010*"want" + 0.010*"true"


running LDA using TF-IDF:

In [20]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.023*"great" + 0.011*"good" + 0.010*"america" + 0.010*"trump" + 0.009*"thank" + 0.007*"golf" + 0.007*"work" + 0.006*"agre" + 0.006*"luck" + 0.006*"enjoy"
Topic: 1 Word: 0.014*"presid" + 0.012*"trump" + 0.009*"dont" + 0.008*"tonight" + 0.007*"interview" + 0.007*"great" + 0.007*"makeamericagreatagain" + 0.007*"watch" + 0.006*"trump2016" + 0.006*"thank"
Topic: 2 Word: 0.076*"thank" + 0.012*"great" + 0.010*"happi" + 0.007*"vote" + 0.006*"birthday" + 0.006*"support" + 0.005*"trump" + 0.005*"peopl" + 0.005*"love" + 0.004*"congratul"
Topic: 3 Word: 0.010*"china" + 0.007*"obama" + 0.006*"time" + 0.006*"countri" + 0.006*"trump" + 0.005*"great" + 0.004*"presid" + 0.004*"peopl" + 0.004*"stop" + 0.004*"think"
Topic: 4 Word: 0.016*"true" + 0.012*"trump" + 0.010*"donald" + 0.009*"cont" + 0.007*"love" + 0.007*"apprentic" + 0.007*"2016" + 0.007*"great" + 0.007*"presid" + 0.006*"obama"


Performance evaluation by classifying sample document using LDA Bag of Words model

We will check where our test document would be classified.

In [21]:
processed_docs[120]

['stock',
 'market',
 'point',
 '28149',
 'great',
 'news',
 'america',
 'job',
 'job',
 'job']

In [22]:
for index, score in sorted(lda_model[bow_corpus[120]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7242588400840759	 
Topic: 0.053*"thank" + 0.022*"presid" + 0.017*"great" + 0.015*"trump" + 0.015*"countri" + 0.013*"need" + 0.010*"america" + 0.010*"peopl" + 0.010*"want" + 0.010*"true"

Score: 0.2147570252418518	 
Topic: 0.023*"cont" + 0.012*"like" + 0.011*"year" + 0.010*"great" + 0.008*"trump" + 0.007*"go" + 0.007*"debt" + 0.007*"success" + 0.007*"happi" + 0.007*"peopl"

Score: 0.020439056679606438	 
Topic: 0.023*"great" + 0.017*"deal" + 0.015*"obama" + 0.014*"america" + 0.014*"china" + 0.009*"vote" + 0.009*"obamacar" + 0.007*"iran" + 0.007*"spend" + 0.006*"need"

Score: 0.020310547202825546	 
Topic: 0.060*"trump" + 0.030*"donald" + 0.020*"watch" + 0.020*"interview" + 0.017*"apprentic" + 0.017*"tonight" + 0.014*"great" + 0.013*"night" + 0.010*"celebr" + 0.009*"thank"

Score: 0.02023455873131752	 
Topic: 0.031*"great" + 0.025*"dont" + 0.012*"thank" + 0.012*"good" + 0.012*"today" + 0.010*"hotel" + 0.010*"beauti" + 0.010*"work" + 0.010*"miss" + 0.009*"look"


try one more:

In [23]:
processed_docs[170]


['half',
 'year',
 'secur',
 'america',
 'border',
 'rebuild',
 'awesom',
 'power',
 'militari',
 'obliter',
 'isi',
 'caliph',
 'fix',
 'disastr',
 'trade',
 'deal',
 'bring',
 'job',
 'home',
 'america',
 'minnesota',
 'maga']

In [24]:
for index, score in sorted(lda_model[bow_corpus[170]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.922659695148468	 
Topic: 0.023*"great" + 0.017*"deal" + 0.015*"obama" + 0.014*"america" + 0.014*"china" + 0.009*"vote" + 0.009*"obamacar" + 0.007*"iran" + 0.007*"spend" + 0.006*"need"

Score: 0.04938477277755737	 
Topic: 0.060*"trump" + 0.030*"donald" + 0.020*"watch" + 0.020*"interview" + 0.017*"apprentic" + 0.017*"tonight" + 0.014*"great" + 0.013*"night" + 0.010*"celebr" + 0.009*"thank"


Performance evaluation by classifying sample document using LDA TF-IDF model:
- we like tf-idf because we think it is classifiying better

In [25]:
for index, score in sorted(lda_model_tfidf[corpus_tfidf[170]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6920978426933289	 
Topic: 0.010*"china" + 0.007*"obama" + 0.006*"time" + 0.006*"countri" + 0.006*"trump" + 0.005*"great" + 0.004*"presid" + 0.004*"peopl" + 0.004*"stop" + 0.004*"think"

Score: 0.19135954976081848	 
Topic: 0.014*"presid" + 0.012*"trump" + 0.009*"dont" + 0.008*"tonight" + 0.007*"interview" + 0.007*"great" + 0.007*"makeamericagreatagain" + 0.007*"watch" + 0.006*"trump2016" + 0.006*"thank"

Score: 0.03958211466670036	 
Topic: 0.023*"great" + 0.011*"good" + 0.010*"america" + 0.010*"trump" + 0.009*"thank" + 0.007*"golf" + 0.007*"work" + 0.006*"agre" + 0.006*"luck" + 0.006*"enjoy"

Score: 0.038685038685798645	 
Topic: 0.076*"thank" + 0.012*"great" + 0.010*"happi" + 0.007*"vote" + 0.006*"birthday" + 0.006*"support" + 0.005*"trump" + 0.005*"peopl" + 0.005*"love" + 0.004*"congratul"

Score: 0.03827548027038574	 
Topic: 0.016*"true" + 0.012*"trump" + 0.010*"donald" + 0.009*"cont" + 0.007*"love" + 0.007*"apprentic" + 0.007*"2016" + 0.007*"great" + 0.007*"presid" + 0.006*

Real deal: testing on unseen document

In [26]:
unseen_document = 'if i win the election the rich won\'t have to pay taxes'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8375043272972107	 Topic: 0.023*"great" + 0.017*"deal" + 0.015*"obama" + 0.014*"america" + 0.014*"china"
Score: 0.0410543717443943	 Topic: 0.023*"cont" + 0.012*"like" + 0.011*"year" + 0.010*"great" + 0.008*"trump"
Score: 0.04051666334271431	 Topic: 0.053*"thank" + 0.022*"presid" + 0.017*"great" + 0.015*"trump" + 0.015*"countri"
Score: 0.04047143831849098	 Topic: 0.031*"great" + 0.025*"dont" + 0.012*"thank" + 0.012*"good" + 0.012*"today"
Score: 0.0404532290995121	 Topic: 0.060*"trump" + 0.030*"donald" + 0.020*"watch" + 0.020*"interview" + 0.017*"apprentic"


### Conclusion

this one makes a little more sense. but the parser needs to be better. a lot of words that don't matter were left over. 
