In [5]:
import pickle
import pandas as pd   
import numpy as np

import datetime
from sklearn.cluster import KMeans
from sklearn import cluster, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.manifold import TSNE

import re, string 
from collections import defaultdict
import spacy
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

%matplotlib inline


In [6]:
disaster_specific = ['hurricaneharvey', 'harvey', 'houston', 'texas','hurricanharvey']
# custom_stopwords = custom_words + disaster_specific
custom_words = list(set(
    list(ENGLISH_STOP_WORDS) + list(stopwords.words('english')) +
    ['en', 'amp', 'instagram', 'hurricaneharvey', 'harvey', 'houston', 'texas', 'com', 'county', 'org',
     'www', 'https', 'http', 'rt']))
def process_data_by_tweet(file):
    df = pd.read_file(file)
    # convert to lowercase
    self._remove_extraneous(df, custom_stopwords)
    df['cleaned_tweets'] = self._clean_tweets(df['text'])
    df['lemmatized_tweets'] = self._lemmatize_tweets_spacy(df['cleaned_tweets'])
    return df


In [21]:
# bring in csv and convert each tweet to an item in a list
df = pd.read_csv('./data/hurricaneharvey/twitter_retrieval/hurricaneharvey_10000.csv')
tweets = [row.split() for row in df['lemmatized_tweets']]

In [8]:
# create word2vec model finding vectors for each word in the quorum
w2v_model = Word2Vec(sentences=tweets,
                     min_count=1,
                 window=5,
                 size=100,
                 workers=4)

# w2v_model.build_vocab(sentences)
w2v_model.train(tweets, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter)
w2v_model.init_sims(replace=True)


  w2v_model.train(tweets, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter)


In [14]:
# calculate the average vector of all words in the tweet and create tweet dictionary as {tweet: vector}
vect_score = []
for tweet in tweets:
    tweet_vect = []
    for word in tweet:
        # append the word vector from the model into the tweet_vect list. We will Sum these next. 
        tweet_vect.append(w2v_model.wv[word])
    vect_score.append(sum(tweet_vect)/len(tweet_vect))
len(vect_score)

10000
9173
10000


In [6]:
w2v_model.wv['hurricanharvey']

array([ 1.40223265e-01,  2.81652156e-02, -1.09964222e-01, -1.64557770e-01,
        1.04905386e-02,  2.73856074e-02,  9.52607840e-02, -1.69634163e-01,
        1.28236383e-01, -1.26077458e-01, -3.52558680e-02,  9.53566208e-02,
       -1.63610242e-02,  1.46799326e-01, -7.64902607e-02,  4.31606025e-02,
        6.00487590e-02,  5.03449142e-02,  5.29862642e-02,  3.94417942e-02,
        6.00875989e-02, -6.24612533e-03, -4.95768478e-03,  1.69669002e-01,
        2.11169850e-02,  1.13319561e-01, -1.63763031e-01,  5.37509061e-02,
       -1.14197629e-02, -2.41173729e-02, -3.25833727e-03,  1.05494156e-03,
        2.35415623e-03,  5.86211458e-02, -3.48350815e-02, -1.62090316e-01,
       -1.48385838e-01,  1.34264380e-01,  3.92028168e-02, -1.69060137e-02,
        1.94213949e-02, -3.07429284e-02,  1.53678328e-01,  8.32565352e-02,
        1.07282326e-01,  8.77068564e-02,  1.98589891e-01, -2.76821610e-02,
       -9.03183818e-02,  3.98838706e-02, -7.48796165e-02, -1.21097550e-01,
        1.37586117e-01, -

In [7]:
for tweet in tweets:
    if len(tweet) == 2:
        print(tweet)

['horrible', 'hurricanharvey']
['hurricanharvey', 'beaumont']
['hurricanharvey', 'hurricaneirma']
['hurricanharvey', 'irmarecovery']
['hurricaneirma', 'hurricanharvey']
['hurrcaneirma', 'hurricanharvey']
['hurricanharvey', 'prayerforhurricanharveyvictims']
['pray', 'hurricanharvey']
['hurricaneirma', 'hurricanharvey']
['hurricanharvey', 'hurricaneirma']
['hurricanharvey', 'portaransas']
['awful', 'hurricanharvey']
['bioshock', 'hurricanharvey']
['hurricanharvey', 'harveyrelief']
['migos', 'hurricanharvey']
['relief', 'hurricanharvey']
['isaiah', 'hurricanharvey']
['hurricanharvey', 'lagrangetexas']
['prayforhouston', 'hurricanharvey']
['check', 'hurricanharvey']
['getinvolved', 'hurricanharvey']
['joelosteen', 'hurricanharvey']
['prayfortexas', 'hurricanharvey']
['party', 'home']
['hurricanharvey', 'pleasedontcomethisway']
['hurricanharvey', 'calebcity']
['pray', 'hurricanharvey']
['heroes', 'hurricanharvey']
['hurricanharvey', 'afterparty']
['hurricanharvey', 'animal']
['troopsdirect'

In [8]:
print('TWEETS')
print(f'Word Count: {sum([len(tweet) for tweet in tweets])}')
print(f'Shortest Tweet: {min([len(tweet) for tweet in tweets])}')
print('\nMODEL')
print(f'Type: {type(w2v_model.wv.vocab)}')
print(f'Length: {len(list(w2v_model.wv.vocab))}')
print('\nVECT_SCORE')
print(f'Type: {type(vect_score)}')
print(f'Length: {len(list(vect_score))}')

TWEETS
Word Count: 83360
Shortest Tweet: 1

MODEL
Type: <class 'dict'>
Length: 14889

VECT_SCORE
Type: <class 'dict'>
Length: 9173


In [9]:
len([len(tweet) for tweet in tweets if len(tweet) < 2])

225

In [10]:
# Word2Vec into Kmeans from https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/

X = w2v_model[w2v_model.wv.vocab]
X2 = list(vect_score.values())

from nltk.cluster import KMeansClusterer
import nltk
kclusterer = KMeansClusterer(num_means=20, distance=nltk.cluster.util.cosine_distance, repeats=25,avoid_empty_clusters=True)
assigned_clusters = kclusterer.cluster(X2, assign_clusters=True)

# n_clusters = 20

# kmeans = KMeans(n_clusters=n_clusters).fit(X)

# clusters = kmeans.cluster_centers_
# current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# filename = f"../data/mendocinocomplex/kmeans/kmeans_{n_clusters}_{current_time}.pkl"
# with open(filename, 'wb') as file:
#     pickle.dump(kmeans, file)
#     file.close()
# df['cluster'] = kmeans.predict(X)
# print(df[['cluster', 'id', 'text']])
# dataframe.to_csv(f"../data/mendocinocomplex/kmeans/mendocinocomplex_{n_clusters}_{current_time}.csv", header=True, index=False)

  This is separate from the ipykernel package so we can avoid doing imports until


KeyboardInterrupt: 

In [None]:
# Print words from model
words = list(w2v_model.wv.vocab)
for i, word in enumerate(words):  
    print (word + ":" + str(assigned_clusters[i]))

In [None]:
kmeans = cluster.KMeans(n_clusters=20)
kmeans.fit(X2)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
print ("Centroids data")
print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)

In [None]:
# from https://towardsdatascience.com/understanding-word2vec-embedding-in-practice-3e9b8985953

def tsne_plot(model):
    "Create TSNE model and plot it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(18, 18)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
   
tsne_plot(w2v_model)