In [1]:
import pandas as pd
import numpy as np

# set seed for reproducibility
np.random.seed(493)

import json 
import csv

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
ps = nltk.porter.PorterStemmer()

import unicodedata
import re

from gsdmm import MovieGroupProcess

# set seed for reproducibility
np.random.seed(493)

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_json('../data/in/trump_archive.json')

In [3]:
df.head()

Unnamed: 0,date,favorites,id,isRetweet,retweets,text
0,2021-01-06 23:01:04,0,1346954970910707712,False,0,These are the things and events that happen when a sacred landslide election victory is so unceremoniously &amp; viciously stripped away from great patriots who have been badly &amp; unfairly treated for so long. Go home with love &amp; in peace. Remember this day forever!
1,2021-01-06 21:17:24,0,1346928882595885056,False,0,https://t.co/Pm2PKV0Fp3
2,2021-01-06 20:13:26,730357,1346912780700577792,False,156100,"I am asking for everyone at the U.S. Capitol to remain peaceful. No violence! Remember, WE are the Party of Law &amp; Order – respect the Law and our great men and women in Blue. Thank you!"
3,2021-01-06 19:38:58,582183,1346904110969315328,False,107460,Please support our Capitol Police and Law Enforcement. They are truly on the side of our Country. Stay peaceful!
4,2021-01-06 19:24:22,0,1346900434540240896,False,0,"Mike Pence didn’t have the courage to do what should have been done to protect our Country and our Constitution, giving States a chance to certify a corrected set of facts, not the fraudulent or inaccurate ones which they were asked to previously certify. USA demands the truth!"


In [4]:
# remove  null values
df = df.loc[df.text.notnull()]

In [5]:
def basic_clean(original):
    word = original.lower()
    word = unicodedata.normalize('NFKD', word)\
                                .encode('ascii', 'ignore')\
                                .decode('utf-8', 'ignore')
    word = re.sub(r"[^a-z0-9'\s]", '', word)
    word = word.replace('\n',' ')
    word = word.replace('\t',' ')
    return word

In [6]:
def remove_stopwords(original, extra_words=[], exclude_words=[]):
    stopword_list = stopwords.words('english')

    for word in extra_words:
        stopword_list.append(word)
    for word in exclude_words:
        stopword_list.remove(word)

    words = original.split()
    filtered_words = [w for w in words if w not in stopword_list]

    original_nostop = ' '.join(filtered_words)

    return original_nostop

In [7]:
def stem(original):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in original.split()]
    original_stemmed = ' '.join(stems)
    return original_stemmed

In [8]:
docs = []
for sentence in df.text:
    words = word_tokenize(stem(remove_stopwords(basic_clean(sentence))))
    docs.append(words)

In [9]:
mgp = MovieGroupProcess(K=15, alpha=0.1, beta=1, n_iters=30)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

y = mgp.fit(docs, n_terms)

In stage 0: transferred 835 clusters with 15 clusters populated
In stage 1: transferred 590 clusters with 15 clusters populated
In stage 2: transferred 363 clusters with 11 clusters populated
In stage 3: transferred 295 clusters with 10 clusters populated
In stage 4: transferred 231 clusters with 8 clusters populated
In stage 5: transferred 242 clusters with 7 clusters populated
In stage 6: transferred 196 clusters with 7 clusters populated
In stage 7: transferred 177 clusters with 7 clusters populated
In stage 8: transferred 186 clusters with 7 clusters populated
In stage 9: transferred 180 clusters with 7 clusters populated
In stage 10: transferred 181 clusters with 7 clusters populated
In stage 11: transferred 196 clusters with 7 clusters populated
In stage 12: transferred 186 clusters with 6 clusters populated
In stage 13: transferred 193 clusters with 5 clusters populated
In stage 14: transferred 195 clusters with 5 clusters populated
In stage 15: transferred 205 clusters with 7 c

In [10]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [239   0   1   0   0  45   0   0 606  22   1   0   0   0  18]


In [11]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [ 8  0  5  9 14 10  2 13 12 11  7  6  4  3  1]


In [12]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print('-'*120)

In [13]:
# Show the top 7 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 7)

Cluster 8 : [('elect', 215), ('vote', 191), ('state', 129), ('georgia', 85), ('amp', 77), ('ballot', 75), ('republican', 72)]
------------------------------------------------------------------------------------------------------------------------
Cluster 0 : [('vaccin', 39), ('great', 38), ('thank', 33), ('get', 26), ('news', 19), ('peopl', 17), ('trump', 15)]
------------------------------------------------------------------------------------------------------------------------
Cluster 5 : [('nation', 20), ('section', 13), ('230', 13), ('amp', 12), ('defens', 11), ('termin', 10), ('secur', 9)]
------------------------------------------------------------------------------------------------------------------------
Cluster 9 : [('peopl', 8), ('antifa', 6), ('attack', 6), ('dc', 5), ('back', 4), ('polic', 4), ('left', 4)]
------------------------------------------------------------------------------------------------------------------------
Cluster 14 : [('need', 3), ('today', 3), ('recog

In [14]:
topic_dict = {}
topic_names = ['Topic #1',
               'Topic #2',
               'Topic #3',
               'Topic #4',
               'Topic #5',
               'Topic #6',
               'Topic #7',
               'Topic #8',
               'Topic #9',
               'Topic #10',
               'Topic #11',
               'Topic #12',
               'Topic #13',
               'Topic #14',
               'Topic #15'
              ]
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i] 

In [15]:
def create_topics_dataframe(data_text=df.text,  mgp=mgp, threshold=0.3, topic_dict=topic_dict, stem_text=docs):
    result = pd.DataFrame(columns=['text', 'topic', 'stems'])
    for i, text in enumerate(data_text):
        result.at[i, 'text'] = text
        result.at[i, 'stems'] = stem_text[i]
        prob = mgp.choose_best_label(stem_text[i])
        if prob[1] >= threshold:
            result.at[i, 'topic'] = topic_dict[prob[0]]
        else:
            result.at[i, 'topic'] = 'Other'
    return result

In [16]:
dfx = create_topics_dataframe(data_text=df.text,  mgp=mgp, threshold=0.3, topic_dict=topic_dict, stem_text=docs)

In [17]:
dfx.head()

Unnamed: 0,text,topic,stems
0,These are the things and events that happen when a sacred landslide election victory is so unceremoniously &amp; viciously stripped away from great patriots who have been badly &amp; unfairly treated for so long. Go home with love &amp; in peace. Remember this day forever!,Topic #1,"[thing, event, happen, sacr, landslid, elect, victori, unceremoni, amp, vicious, strip, away, great, patriot, badli, amp, unfairli, treat, long, go, home, love, amp, peac, rememb, day, forev]"
1,https://t.co/Pm2PKV0Fp3,Topic #2,[httpstcopm2pkv0fp3]
2,"I am asking for everyone at the U.S. Capitol to remain peaceful. No violence! Remember, WE are the Party of Law &amp; Order – respect the Law and our great men and women in Blue. Thank you!",Topic #1,"[ask, everyon, us, capitol, remain, peac, violenc, rememb, parti, law, amp, order, respect, law, great, men, women, blue, thank]"
3,Please support our Capitol Police and Law Enforcement. They are truly on the side of our Country. Stay peaceful!,Topic #4,"[pleas, support, capitol, polic, law, enforc, truli, side, countri, stay, peac]"
4,"Mike Pence didn’t have the courage to do what should have been done to protect our Country and our Constitution, giving States a chance to certify a corrected set of facts, not the fraudulent or inaccurate ones which they were asked to previously certify. USA demands the truth!",Topic #1,"[mike, penc, didnt, courag, done, protect, countri, constitut, give, state, chanc, certifi, correct, set, fact, fraudul, inaccur, one, ask, previous, certifi, usa, demand, truth]"


In [18]:
dfx.topic.value_counts(dropna=False)

Topic #1    648
Topic #2    238
Topic #3     26
Topic #4     10
Topic #5      9
Topic #7      1
Name: topic, dtype: int64

In [19]:
dfx.to_csv('../data/out/trump_archive_with_topics.csv', index=False)