In [1]:
import pandas as pd
import warnings
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

In [2]:
# Load both CSV files; the named timestamp column of each is parsed to datetime while reading
obama = pd.read_csv("./data/obama.csv", parse_dates=["Date"])
trump = pd.read_csv("./data/trump.csv", parse_dates=["created_at"])

In [57]:
# Discard every row that has a missing value in at least one column
obama, trump = [
    frame.dropna(axis="index", how="any")
    for frame in (obama, trump)
]

In [36]:
# Create corpus of Obama tweets from the first column (position 0) of the frame.
# The whole column is selected at once: a per-row `.iloc[i, 0]` lookup inside a
# Python loop yields the same list but is far slower on tens of thousands of rows.
obama_tweets = obama.iloc[:, 0].tolist()

# Create corpus of Trump tweets from the second column (position 1) of the frame
trump_tweets = trump.iloc[:, 1].tolist()

In [37]:
# One count vectorizer per corpus, both configured identically:
# lowercase the text, drop English stop words, and count 1- and 2-word n-grams
vectorizer_options = dict(lowercase=True, stop_words='english', ngram_range=(1, 2))
cv_obama = CountVectorizer(**vectorizer_options)
cv_trump = CountVectorizer(**vectorizer_options)

In [38]:
# Build one document-term matrix per corpus: each row is a tweet, each column
# an n-gram from the fitted vocabulary.
# NOTE: despite the `tfidf_` prefix these hold raw counts (they come from
# CountVectorizer); the names are kept because later cells refer to them.
tfidf_obama = cv_obama.fit_transform(obama_tweets)
print("Obama: ", tfidf_obama.shape)

tfidf_trump = cv_trump.fit_transform(trump_tweets)
print("Trump: ", tfidf_trump.shape)

Obama:  (6734, 48770)
Trump:  (34579, 256869)


In [39]:
# Create one LDA model with 20 topics per corpus.
# LDA fitting is stochastic, so random_state is pinned; without it the learned
# topics (and the printed top words) change on every re-run of the notebook.
lda_obama = LatentDirichletAllocation(n_components=20, random_state=0)
lda_trump = LatentDirichletAllocation(n_components=20, random_state=0)

In [40]:
# Silence DeprecationWarning messages so they do not clutter the cell output
warnings.simplefilter("ignore", DeprecationWarning)

# Fit each LDA model on its document-term count matrix and keep the transformed
# per-tweet output (note: this takes around ten minutes, depending on the machine)
X_lda_obama = lda_obama.fit_transform(tfidf_obama)
X_lda_trump = lda_trump.fit_transform(tfidf_trump)

In [45]:
# An auxiliary function to print out the most likely terms for each topic
# Taken from https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each row is treated as one
        topic's weights over the vocabulary.
    feature_names : sequence
        Vocabulary terms, indexed by column position in ``components_``.
    n_top_words : int
        Number of terms to show for each topic.
    """
    for position, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print("Topic {:#2d}: ".format(position + 1) + " ".join(top_terms))

In [46]:
# Look up the vocabulary (column index -> n-gram) learned by each vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
obama_features = (
    cv_obama.get_feature_names_out()
    if hasattr(cv_obama, "get_feature_names_out")
    else cv_obama.get_feature_names()
)
trump_features = (
    cv_trump.get_feature_names_out()
    if hasattr(cv_trump, "get_feature_names_out")
    else cv_trump.get_feature_names()
)

# Show the ten highest-weighted terms of every topic, one model at a time
print("\nTopics in Barack Obama LDA model:\n")
print_top_words(lda_obama, obama_features, 10)
print("\nTopics in Donald Trump LDA model:\n")
print_top_words(lda_trump, trump_features, 10)


Topics in Barack Obama LDA model:

Topic  1: ofa bo ofa bo http ofa http change climate actonclimate climate change read
Topic  2: president ve america world fact got pass year ve got budget
Topic  3: jobs job 000 economy education growth months private added sector
Topic  4: obama president obama president live live president speaking obama speaking sotu opportunityforall just
Topic  5: insurance health health insurance million americans country 1st obamacare enroll 1st http
Topic  6: cuts immigrants madeinamerica dream living balanced manufacturing debate budget cuts standwithourfuture
Topic  7: tell congress renewui helped comes political peace momentum thisiswhypic twitter thisiswhypic giving
Topic  8: wage minimum minimum wage families raisethewage hard working taking forward raising
Topic  9: watch et tune watch president http speak et watch watch live tune http 11
Topic 10: immigrationreform word spread spread word 200 john improving face challenges supports
Topic 11: president