In [3]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import pickle


In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the three pickled data splits.
train = _load_pickle("../pickle/train.pickle")
val = _load_pickle("../pickle/val.pickle")
test = _load_pickle("../pickle/test.pickle")

In [7]:
# Training features and labels: each tweet is rendered as text and its first
# and last characters are trimmed (presumably the list brackets — confirm).
X_tr, y_tr = train["tweet"].apply(lambda tokens: str(tokens)[1:-1]), train["target"]

# Validation features and labels, taken exactly as stored.
# NOTE(review): unlike X_tr, X_val is not bracket-stripped — confirm this is intended.
X_val, y_val = val["tweet"], val["target"]

X_tr

0        'reject', 'constantly', 'house', 'threaten', '...
1        'convince', 'lame', 'nigger', 'liver', 'believ...
2        'peace', 'fag', 'remember', 'best', 'lux', 'su...
3                  'haha', 'ight', 'nig', 'calm', 'yoself'
4        'tits', 'better', 'look', 'face', 'make', 'lik...
                               ...                        
18581                               'miss', 'lil', 'bitch'
18582    'gotta', 'hoe', 'smh', 'aint', 'captain', 'sav...
18583        'lmao', 'yeah', 'bitch', 'lil', 'shit', 'rip'
18584                                'tbt', 'bad', 'bitch'
18585                  'hoe', 'act', 'know', 'imma', 'let'
Name: tweet, Length: 18586, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, 
                                   use_idf=True, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(X_tr) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 705 ms, sys: 34.2 ms, total: 739 ms
Wall time: 750 ms
(18586, 1)


In [9]:
# Vocabulary learned by the vectorizer, as a plain list of strings.
# get_feature_names() no longer exists in recent scikit-learn releases, so
# prefer its replacement and fall back only on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine distance = 1 - cosine similarity. The pairwise matrix is dense
# (n_docs x n_docs), so the subtraction is done in place rather than
# allocating a second full-size array; the resulting values are identical.
dist = cosine_similarity(tfidf_matrix)
np.subtract(1, dist, out=dist)
print(dist)

[[0. 1. 1. ... 0. 0. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [0. 1. 1. ... 0. 0. 1.]
 [0. 1. 1. ... 0. 0. 1.]
 [1. 1. 1. ... 1. 1. 1.]]


In [12]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 1.29 s, sys: 216 ms, total: 1.51 s
Wall time: 252 ms


In [None]:
import numpy as np
import pandas as pd
import pickle
import re

from mgp import MovieGroupProcess
from tqdm import tqdm

# Load the preprocessed tweets when they are not already in memory, so this
# cell also works on a fresh kernel (Restart & Run All) instead of raising
# NameError. An existing tweets_df is left untouched.
data_dir = '../data/preprocessed_tweets.csv'
if 'tweets_df' not in globals():
    tweets_df = pd.read_csv(data_dir)
tweets_df.head(5)

In [None]:
# convert string of tokens into tokens list
def _split_tokens(text):
    """Split a whitespace-delimited token string into a list of tokens.

    A value that is already a list is returned unchanged, so re-running the
    cell does not fail on a column that was tokenized by an earlier run.
    """
    if isinstance(text, list):
        return text
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.split(r'\s', text)


tweets_df['tokens'] = tweets_df.tokens.apply(_split_tokens)
tweets_df.head()

In [None]:
# Gather every tweet's token list into one plain Python list
docs = tweets_df.tokens.to_list()
docs[0:3]  # peek at the first three documents

In [None]:
%%time
# Train STTM model
#    K = number of potential topics
#    alpha = controls completeness
#    beta =  controls homogeneity
#    n_iters = number of iterations

# Seed for reproducible cluster assignments.
# Assumes MovieGroupProcess samples from NumPy's global RNG — TODO confirm.
np.random.seed(42)

mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.5, n_iters=5)
# Vocabulary size = number of distinct tokens across all documents.
vocab = {token for doc in docs for token in doc}
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

# Save model (the with-block closes the file; no explicit close() needed)
with open('../pickle/10clusters.model', 'wb') as f:
    pickle.dump(mgp, f)

In [None]:
# load in trained model
# NOTE: pickle.load can execute arbitrary code — only load model files you trust.
# The with-block guarantees the file handle is closed after loading.
with open('../pickle/10clusters.model', 'rb') as filehandler:
    mgp = pickle.load(filehandler)

In [None]:
# define helper functions
def top_words(cluster_word_distribution, top_cluster, values):
    '''Print the highest-count words of each requested cluster.

    Parameters
    ----------
    cluster_word_distribution : indexable of dict
        ``cluster_word_distribution[c]`` maps each word of cluster ``c`` to
        its count.
    top_cluster : iterable
        Cluster indices to report, in the order they are printed.
    values : int
        Maximum number of (word, count) pairs printed per cluster.
    '''
    for cluster in top_cluster:
        # Read from the argument rather than the global ``mgp`` so the
        # function reports on whichever distribution the caller passes in.
        word_counts = cluster_word_distribution[cluster].items()
        sort_dicts = sorted(word_counts, key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')
        
def cluster_importance(mgp):
    '''Return the word-topic matrix ``phi`` as a list of dicts.

    ``phi[i][w]`` is the importance of word ``w`` in topic ``i``, computed as
    ``(count of w in i + beta) / (total word count in i + vocab_size * beta)``.
    Only words present in a cluster's word distribution get an entry.

    Parameters
    ----------
    mgp : object
        Model exposing ``cluster_word_distribution``, ``beta``,
        ``vocab_size`` and ``K``.

    Returns
    -------
    list of dict
        One ``{word: importance}`` dict per cluster, ``K`` in total.
    '''
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]
    for z in range(K):
        word_counts = n_z_w[z]
        # The denominator is identical for every word in the cluster, so
        # compute it once instead of re-summing the counts per word.
        denominator = sum(word_counts.values()) + V * beta
        for w, count in word_counts.items():
            phi[z][w] = (count + beta) / denominator
    return phi

def topic_allocation(df, docs, mgp, topic_dict):
    '''Label every document with its best cluster and that cluster's name.

    Writes two columns onto ``df`` in place: ``cluster`` (the label returned
    by ``mgp.choose_best_label`` for each entry of ``docs``) and
    ``topic_name`` (that label looked up in ``topic_dict``). Prints a
    completion message and returns nothing.
    '''
    # choose_best_label yields a (label, score) pair; only the label is kept
    best_labels = [label for label, _score in map(mgp.choose_best_label, tqdm(docs))]

    df['cluster'] = best_labels
    df['topic_name'] = df['cluster'].apply(lambda label: get_topic_name(label, topic_dict))

    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))

def get_topic_name(doc, topic_dict):
    '''Look up ``doc`` in ``topic_dict`` and return the stored topic name.

    A missing key propagates the lookup error (``KeyError`` for a dict).
    '''
    return topic_dict[doc]

In [None]:
# Number of documents assigned to each cluster, indexed by cluster number
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Cluster indices ordered from most to fewest documents (at most ten)
top_index = np.argsort(doc_count)[::-1][:10]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Print the 5 highest-count words of every cluster, in index order
topic_indices = np.arange(len(doc_count))
top_words(mgp.cluster_word_distribution, topic_indices, 5)

In [None]:
# Build the per-cluster word-importance mapping
phi = cluster_importance(mgp)

# Importance of 'wuhan' within cluster 1, then of 'china' within cluster 0
for cluster_id, word in ((1, 'wuhan'), (0, 'china')):
    print(phi[cluster_id][word])

In [None]:
# Human-readable topic names, listed in the same sequential order
# as the clusters resulting from the gsdmm model
topic_names = ['china',
               'wuhan',
               'fuck',
               'communist',
               'cancer & heart disease',
               'diet & exercise',
               'health & medical workers',
               'abortion',
               'vaping & cigarettes',
               'drug costs & opioid crisis']

# Pair every cluster index with the name at the same position
topic_dict = {topic_num: topic_names[i] for i, topic_num in enumerate(topic_indices)}

# Write cluster number and topic name back onto the original data frame
topic_allocation(tweets_df, docs, mgp, topic_dict)

In [None]:
# Preview five random tweets alongside their assigned cluster and topic name
tweets_df.loc[:, ['tweet', 'username', 'tokens', 'cluster', 'topic_name']].sample(n=5)