# Table of Contents
 <p>

In this exercise we will go over `realDonaldTrump_tweets` and perform topic modeling. Each line in this file is a tweet. 

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import re

**Task 1: Load the data**

Consider each tweet as a document. Load the tweets. Strip away symbols and web links in the tweets. If the tweet becomes empty string after preprocessing, then discard the tweet from analysis.


In [2]:
file_path = '/dsa/data/all_datasets/linguistic/realDonaldTrump_tweets.txt'

In [3]:
# load each tweet as a document

with open(file_path, 'r') as f:
    tweets = f.read().splitlines()
    tweets = [re.sub(r'[^\w]|https.*\b', ' ', t) for t in tweets]

print(tweets[0:10])

['It was a great honor to have spoken before the countries of the world at the United Nations ', ' USAatUNGA UNGA  ', 'God bless the people of Mexico City  We are with you and will be there for you ', 'As President of the United States of America  I will ALWAYS put  AmericaFirst UNGA', 'Full remarks   ', 'Thehas great strength  amp  patience  but if it is forced to defend itself or its allies  we will have no choice but  ', 'RT  IvankaTrump  I have long respected India s accomplished and charismatic Foreign Minister  SushmaSwaraj  and it was an honor to meet her', 'Big day at the United Nations   many good things  and some tricky ones  happening  We have a great team  Big speech at 10 00 A M ', ' USAatUNGA  UNGA  ', 'We call for the full restoration of democracy and political freedoms in Venezuela  and we want it to happen very  v  ']


**Task 2: Create term frequency matrix for these tweets.**


In [4]:
count_vectorizer = CountVectorizer(stop_words='english')
term_frequency = count_vectorizer.fit_transform(tweets)
feature_names = count_vectorizer.get_feature_names()

In [5]:
print(f"Shape of term freq matrix = {term_frequency.shape}")
print(f"Num of features identified = {len(feature_names)}")

Shape of term freq matrix = (3998, 6058)
Num of features identified = 6058


**Task 3: Apply LDA topic modeling method with 5 topics**

Fit an LDA model with 5 topics on these tweets. 


In [6]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(term_frequency)  

LatentDirichletAllocation(n_components=5, random_state=0)

**Task 4: Print the top 10 words for each of the topics**

In [7]:
print(f"Num of topics = {len(lda.components_)}")
lda.components_[0]

Num of topics = 5


array([22.4773036 , 27.94216467,  1.19494424, ...,  0.20000724,
        1.18896449,  0.20000306])

In [8]:
feature_names[:25]

['00',
 '000',
 '00am',
 '00pm',
 '00pme',
 '02',
 '10',
 '100',
 '100th',
 '100yrs',
 '109',
 '10am',
 '10k',
 '10p',
 '10pe',
 '10pm',
 '10pme',
 '11',
 '110',
 '113',
 '116',
 '119',
 '11a',
 '11pm',
 '11pme']

In [10]:
def display_topics(model, feature_names, no_top_words):
    
    for topic_idx, term_weights in enumerate(model.components_):
        sorted_indx = term_weights.argsort()

        topk_words = [feature_names[i] for i in sorted_indx[-no_top_words :]]
        print(f"Topic {topic_idx}:", end=None)
        print(";".join(topk_words))

In [11]:
display_topics(lda, feature_names, 10)

Topic 0:
foxandfriends;trump;amp;clinton;make;hillary;america;thank;rt;great
Topic 1:
china;tickets;president;watch;nytimes;failing;korea;north;amp;great
Topic 2:
just;trump;people;obamacare;news;clinton;fake;hillary;media;amp
Topic 3:
realdonaldtrump;country;today;just;amp;american;great;jobs;people;rt
Topic 4:
tonight;big;live;ohio;today;florida;tomorrow;amp;thank;join


**Task 5: Name each of the topic (No right answer)**

After observing top-10 words in each topic, do these topics make sense to you? Can you name each of the topic? 

Topic 0: debate
Topic 1: foreign policy
Topic 2: domestic policy
Topic 3: campaign
Topic 4: rallies

**Task 6: Create a TFIDF matrix**

Create TFIDF matrix for these tweets.

In [12]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(tweets)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [13]:
print(f"Shape of tfidf matrix = {tfidf.shape}")
print(f"Num of features identified = {len(tfidf_feature_names)}")

Shape of tfidf matrix = (3998, 6058)
Num of features identified = 6058


**Task 6: Apply NMF topic modeling with 5 topics**

In [14]:
nmf = NMF(n_components=5, random_state=0)
nmf.fit(term_frequency)



NMF(n_components=5, random_state=0)

**Task 7: Print the top 10 words for each of the topics**

In [15]:
display_topics(nmf, tfidf_feature_names, 10)

Topic 0:
people;today;vote;replace;obamacare;repeal;time;american;jobs;amp
Topic 1:
day;going;honor;big;today;state;people;make;america;great
Topic 2:
cnn;bad;fake;news;people;just;media;crooked;clinton;hillary
Topic 3:
potus;obama;hillaryclinton;new;teamtrump;foxandfriends;president;realdonaldtrump;trump;rt
Topic 4:
carolina;north;going;ohio;vote;join;florida;new;maga;thank


**Task 8: Perform a comparison between the topics identified by LDA and NMF methods.**

In [17]:
topic = lda.components_[1]  # take the corona topic
no_top_words = 10

weights_lda = {}
for i in topic.argsort()[:-no_top_words - 1:-1]:
    print(feature_names[i], topic[i])
    weights_lda[feature_names[i]] = topic[i]

great 78.89910958182654
amp 56.66443628383125
north 55.244651225148786
korea 46.98816793538438
failing 44.262999777514366
nytimes 40.66761631398792
watch 36.916403716213324
president 36.87990434595544
tickets 35.10042679194915
china 33.18958130309385


In [18]:
topic = nmf.components_[4]  # take the corona topic
no_top_words = 10

weights_nmf = {}
for i in topic.argsort()[:-no_top_words - 1:-1]:
    weights_nmf[tfidf_feature_names[i]] = topic[i]
weights_nmf

{'thank': 3.9436984526635266,
 'maga': 0.6665638337722626,
 'new': 0.49661884962907593,
 'florida': 0.46414260713944505,
 'join': 0.43832052060621124,
 'vote': 0.3399501232859374,
 'ohio': 0.33621088538121086,
 'going': 0.3263236472326938,
 'north': 0.30042240796729647,
 'carolina': 0.28697893022218507}

In [19]:
import pandas as pd
df1 = pd.DataFrame(weights_lda.items())
df2 = pd.DataFrame(weights_nmf.items())

df = pd.concat([df1, df2], axis=1)
df

Unnamed: 0,0,1,0.1,1.1
0,great,78.89911,thank,3.943698
1,amp,56.664436,maga,0.666564
2,north,55.244651,new,0.496619
3,korea,46.988168,florida,0.464143
4,failing,44.263,join,0.438321
5,nytimes,40.667616,vote,0.33995
6,watch,36.916404,ohio,0.336211
7,president,36.879904,going,0.326324
8,tickets,35.100427,north,0.300422
9,china,33.189581,carolina,0.286979
