In [20]:
import json
import numpy as np
import pandas as pd

import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from sklearn.metrics.pairwise import cosine_distances

import preprocessor as p # 
import string # use string.punctuation to clean out punctuation

from pprint import pprint

from datetime import datetime
import pickle



from tqdm import tqdm

pd.set_option('display.max_colwidth', -1)

In [2]:
with open("clean_tweets_sample.pkl", 'rb') as picklefile: 
    sampleDF = pickle.load(picklefile)

In [3]:
with open("clean_tweets_full.pkl", 'rb') as picklefile: 
    fullDF = pickle.load(picklefile)

## Preprocessing

In [4]:
def just_text(tweetsDF):
    cleanDF = pd.DataFrame(columns = ['id','full_text'])
    cleanDF['full_text'] = tweetsDF['full_text']
    cleanDF.drop('id',axis = 1).head()
    return cleanDF

In [5]:
sampletextDF = just_text(sampleDF)

In [6]:
fulltextDF = just_text(fullDF)

# Set Stopwords

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop += ['climate', 'change', 'global', 'warming', 'climatechange', 'globalwarming']

## Count Vectorizer

In [8]:
def run_countvec(tweetDF):

    # Create a CountVectorizer for parsing/counting words
    count_vectorizer = CountVectorizer(ngram_range=(1, 2), #max_df = 0.8,  
                                   stop_words=stop, token_pattern="\\b[a-z][a-z]+\\b")
    count_vectorizer.fit(tweetDF.full_text)
    
    # Create the term-document matrix
    # Transpose it so the terms are the rows
    counts = count_vectorizer.transform(tweetDF.full_text).transpose()
    return count_vectorizer, counts

In [9]:
sample_count_vectorizer, sample_counts = run_countvec(sampletextDF)

In [10]:
full_count_vectorizer, full_counts = run_countvec(fulltextDF)

## TF-IDF

In [11]:
def run_tfidf(counts):
    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(sample_counts)
    #xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)
    return x_tfidf

In [12]:
sample_tfidf = run_tfidf(sample_counts)

  idf = np.log(float(n_samples) / df) + 1.0


In [13]:
sample_counts.shape

(51177, 17579)

In [14]:
full_tfidf = run_tfidf(full_counts)

  idf = np.log(float(n_samples) / df) + 1.0


In [15]:
full_counts.shape

(73998, 180920)

In [16]:
terms = sample_count_vectorizer.get_feature_names()

In [17]:
dist = cosine_distances(sample_tfidf)

KeyboardInterrupt: 

In [None]:
dist

In [22]:


SSEs = []
Sil_coefs = []

for k in range(2,20):
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(sample_tfidf.T)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(sample_tfidf.T,labels, metric='euclidean'))
    SSEs.append(get_SSE(cvec, labels)) # The SSE is just inertia, we
                                           # could have just said km.inertia_

NameError: name 'get_SSE' is not defined

In [None]:
# number of clusters for kmeans
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True)
k_clusters = range(2,20)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# Sum of Square Error
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [43]:

from sklearn.cluster import KMeans

num_clusters = 3

km = KMeans(n_clusters=num_clusters)

%time km.fit(sample_tfidf.T)

clusters = km.labels_.tolist()

CPU times: user 3.84 s, sys: 30.3 ms, total: 3.87 s
Wall time: 3.89 s


In [44]:
sample_tfidf.shape

(51177, 17579)

In [45]:
len(clusters)

17579

In [46]:
sampletextDF['cluster'] = clusters

In [47]:
sampletextDF.cluster.value_counts()

0    17562
1    16   
2    1    
Name: cluster, dtype: int64

In [49]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % terms[ind], end=',')
    print() #add whitespace
    print() #add whitespace
    

    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: world, trees, arctic, science, new, million,



Cluster 1 words: increasing messing, appreciate people, system going, entire system, excited blizzards, living moment,



Cluster 2 words: useful, public fear, wondered useful, governments actually, useful foil, mass hysteria,







In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout

#uncomment below to save figure
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters