In [1]:
import pandas as pd
import numpy as np

# set seed for reproducibility
np.random.seed(493)

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# GET THE DATA

In [2]:
import json 
import csv

In [3]:
df = pd.read_json('../data/in/trump-archive.json')

In [4]:
df.head()

Unnamed: 0,date,favorites,id,isRetweet,retweets,text
0,2020-12-16 19:01:06,163597,1339284435456421888,False,42842,https://t.co/GgwnkrGz9U
1,2020-12-16 18:51:14,222413,1339281950490697728,False,52018,"Chris Krebs was totally excoriated and proven wrong at the Senate Hearing on the Fraudulent 2020 Election. Massive FRAUD took place with machines, people voting from out of state, illegals, dead people, no signatures—and so much more!"
2,2020-12-16 18:33:10,246740,1339277405458997248,False,51116,Former United States Solicitor General Ken Starr: Pennsylvania “Flagrantly Violated” Laws Ahead of Election.
3,2020-12-16 18:27:02,179518,1339275859841134592,False,38198,"Senate Hearings going on LIVE @OANN, as to the Fraudulent 2020 Election that just took place. @SenRonJohnson doing an excellent job. Nevada must be flipped based on testimony!"
4,2020-12-16 15:06:47,258927,1339225465584840704,False,53928,"Perhaps the biggest difference between 2016 and 2020 is @FoxNews, despite the fact that I went from 63,000,000 Votes to 75,000,000 Votes, a record 12,000,000 Vote increase. Obama went down 3,000,000 Votes, and won. Rigged Election!!!"


# PRE-PROCESS TEXT

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
ps = nltk.porter.PorterStemmer()

import unicodedata
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# remove  null values
df = df.loc[df.text.notnull()]

In [7]:
def basic_clean(original):
    word = original.lower()
    word = unicodedata.normalize('NFKD', word)\
                                .encode('ascii', 'ignore')\
                                .decode('utf-8', 'ignore')
    word = re.sub(r"[^a-z0-9'\s]", '', word)
    word = word.replace('\n',' ')
    word = word.replace('\t',' ')
    return word

In [8]:
def remove_stopwords(original, extra_words=[], exclude_words=[]):
    stopword_list = stopwords.words('english')

    for word in extra_words:
        stopword_list.append(word)
    for word in exclude_words:
        stopword_list.remove(word)

    words = original.split()
    filtered_words = [w for w in words if w not in stopword_list]

    original_nostop = ' '.join(filtered_words)

    return original_nostop

In [9]:
def stem(original):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in original.split()]
    original_stemmed = ' '.join(stems)
    return original_stemmed

In [10]:
docs = []
for sentence in df.text:
    words = word_tokenize(stem(remove_stopwords(basic_clean(sentence))))
    docs.append(words)

# GSDMM MODEL TRAINING

In [11]:
from gsdmm import MovieGroupProcess

In [12]:
mgp = MovieGroupProcess(K=15, alpha=0.1, beta=1, n_iters=30)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

y = mgp.fit(docs, n_terms)

In stage 0: transferred 784 clusters with 15 clusters populated
In stage 1: transferred 576 clusters with 15 clusters populated
In stage 2: transferred 400 clusters with 15 clusters populated
In stage 3: transferred 332 clusters with 14 clusters populated
In stage 4: transferred 299 clusters with 13 clusters populated
In stage 5: transferred 237 clusters with 10 clusters populated
In stage 6: transferred 247 clusters with 10 clusters populated
In stage 7: transferred 235 clusters with 11 clusters populated
In stage 8: transferred 218 clusters with 10 clusters populated
In stage 9: transferred 256 clusters with 9 clusters populated
In stage 10: transferred 240 clusters with 9 clusters populated
In stage 11: transferred 227 clusters with 9 clusters populated
In stage 12: transferred 220 clusters with 9 clusters populated
In stage 13: transferred 220 clusters with 9 clusters populated
In stage 14: transferred 239 clusters with 10 clusters populated
In stage 15: transferred 230 clusters wi

In [13]:
# import pickle

# # Save model
# with open('../data/out/v493_trump_archive_k5.model', 'wb') as f:
#     pickle.dump(mgp, f)
#     f.close()

In [14]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [173   0   1   7   0  37   0   0   4  14   2 539 151   0   4]


In [15]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [11  0 12  5  9  3 14  8 10  2 13  7  6  4  1]


In [16]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print('-'*120)

In [17]:
# Show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 5)

Cluster 11 : [('elect', 212), ('vote', 184), ('state', 108), ('peopl', 66), ('fraud', 61)]
------------------------------------------------------------------------------------------------------------------------
Cluster 0 : [('great', 26), ('thank', 20), ('vaccin', 16), ('peopl', 15), ('news', 13)]
------------------------------------------------------------------------------------------------------------------------
Cluster 12 : [('georgia', 65), ('signatur', 45), ('briankempga', 41), ('state', 40), ('governor', 37)]
------------------------------------------------------------------------------------------------------------------------
Cluster 5 : [('nation', 20), ('defens', 12), ('section', 12), ('230', 12), ('amp', 11)]
------------------------------------------------------------------------------------------------------------------------
Cluster 9 : [('peopl', 8), ('attack', 6), ('back', 5), ('antifa', 5), ('dc', 4)]
-----------------------------------------------------------------

In [18]:
topic_dict = {}
topic_names = ['Topic #1',
               'Topic #2',
               'Topic #3',
               'Topic #4',
               'Topic #5',
               'Topic #6',
               'Topic #7',
               'Topic #8',
               'Topic #9',
               'Topic #10',
               'Topic #11',
               'Topic #12',
               'Topic #13',
               'Topic #14',
               'Topic #15'
              ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i] 

In [19]:
def create_topics_dataframe(data_text=df.text,  mgp=mgp, threshold=0.3, topic_dict=topic_dict, stem_text=docs):
    result = pd.DataFrame(columns=['text', 'topic', 'stems'])
    for i, text in enumerate(data_text):
        result.at[i, 'text'] = text
        result.at[i, 'stems'] = stem_text[i]
        prob = mgp.choose_best_label(stem_text[i])
        if prob[1] >= threshold:
            result.at[i, 'topic'] = topic_dict[prob[0]]
        else:
            result.at[i, 'topic'] = 'Other'
    return result

In [20]:
dfx = create_topics_dataframe(data_text=df.text,  mgp=mgp, threshold=0.3, topic_dict=topic_dict, stem_text=docs)

In [21]:
dfx.head()

Unnamed: 0,text,topic,stems
0,https://t.co/GgwnkrGz9U,Topic #1,[httpstcoggwnkrgz9u]
1,"Chris Krebs was totally excoriated and proven wrong at the Senate Hearing on the Fraudulent 2020 Election. Massive FRAUD took place with machines, people voting from out of state, illegals, dead people, no signatures—and so much more!",Topic #1,"[chri, kreb, total, excori, proven, wrong, senat, hear, fraudul, 2020, elect, massiv, fraud, took, place, machin, peopl, vote, state, illeg, dead, peopl, signaturesand, much]"
2,Former United States Solicitor General Ken Starr: Pennsylvania “Flagrantly Violated” Laws Ahead of Election.,Topic #1,"[former, unit, state, solicitor, gener, ken, starr, pennsylvania, flagrantli, violat, law, ahead, elect]"
3,"Senate Hearings going on LIVE @OANN, as to the Fraudulent 2020 Election that just took place. @SenRonJohnson doing an excellent job. Nevada must be flipped based on testimony!",Topic #1,"[senat, hear, go, live, oann, fraudul, 2020, elect, took, place, senronjohnson, excel, job, nevada, must, flip, base, testimoni]"
4,"Perhaps the biggest difference between 2016 and 2020 is @FoxNews, despite the fact that I went from 63,000,000 Votes to 75,000,000 Votes, a record 12,000,000 Vote increase. Obama went down 3,000,000 Votes, and won. Rigged Election!!!",Topic #1,"[perhap, biggest, differ, 2016, 2020, foxnew, despit, fact, went, 63000000, vote, 75000000, vote, record, 12000000, vote, increas, obama, went, 3000000, vote, rig, elect]"


In [22]:
dfx.topic.value_counts(dropna=False)

Topic #1    575
Topic #2    180
Topic #3    137
Topic #4     18
Topic #5      8
Topic #6      6
Topic #7      3
Topic #8      2
Topic #9      2
Other         1
Name: topic, dtype: int64