In [1]:
import json, time, re, nltk, hdbscan, spacy, string
import psycopg2 as pg2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sys import getsizeof
from datetime import datetime
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from psycopg2.extras import RealDictCursor, Json
from spacy.lang.en.examples import sentences
from scipy import sparse

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans

np.random.seed(42)

In [2]:
def filename_format_log(file_path, 
                        logfile = 'assets/file_log.txt', 
                        now = round(time.time()), 
                        file_description = None): 
   
    try:
        ext = re.search('(?<!^)(?<!\.)\.(?!\.)', file_path).start() 
    except:
        raise NameError('Please enter a relative path with a file extension.') 
    
    stamp = re.search('(?<!^)(?<!\.)[a-z]+_[a-z]+(?=\.)', file_path).start()
    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'  
    if not file_description:
        file_description = f'Word list saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [3]:
sandy_df = pd.read_csv('data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv')

In [4]:
sandy_df.head()

Unnamed: 0,tweet id,tweet,label
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic


In [5]:
sandy_df['type'] = 'hurricane'

In [6]:
sandy_df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic,hurricane


In [7]:
sandy_df.columns

Index(['tweet id', ' tweet', ' label', 'type'], dtype='object')

In [8]:
sandy_df.rename(columns={' tweet':'tweet'}, inplace=True)

In [9]:
sandy_df.rename(columns={' label':'label'}, inplace=True)

In [10]:
sandy_df['label'] = sandy_df['label'].map(lambda x: 1 if x=='on-topic' else 0)

In [11]:
sandy_df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane


In [12]:
sandy_df.shape

(10008, 4)

In [13]:
alberta_df = pd.read_csv('data/CrisisLexT6/2013_Alberta_Floods/2013_Alberta_Floods-ontopic_offtopic.csv')

In [14]:
alberta_df.head()

Unnamed: 0,tweet id,tweet,label
0,'348351442404376578',@Jay1972Jay Nope. Mid 80's. It's off Metallica...,off-topic
1,'348167215536803841',Nothing like a :16 second downpour to give us ...,off-topic
2,'348644655786778624',@NelsonTagoona so glad that you missed the flo...,on-topic
3,'350519668815036416',"Party hard , suns down , still warm , lovin li...",off-topic
4,'351446519733432320',@Exclusionzone if you compare yourself to wate...,off-topic


In [15]:
alberta_df['type'] = 'flood'

In [16]:
alberta_df.columns

Index(['tweet id', ' tweet', ' label', 'type'], dtype='object')

In [17]:
alberta_df.rename(columns={' tweet':'tweet'}, inplace=True)

In [18]:
alberta_df.rename(columns={' label':'label'}, inplace=True)

In [19]:
alberta_df['label'] = alberta_df['label'].map(lambda x: 1 if x=='on-topic' else 0)

In [20]:
alberta_df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'348351442404376578',@Jay1972Jay Nope. Mid 80's. It's off Metallica...,0,flood
1,'348167215536803841',Nothing like a :16 second downpour to give us ...,0,flood
2,'348644655786778624',@NelsonTagoona so glad that you missed the flo...,1,flood
3,'350519668815036416',"Party hard , suns down , still warm , lovin li...",0,flood
4,'351446519733432320',@Exclusionzone if you compare yourself to wate...,0,flood


In [21]:
alberta_df.shape

(10031, 4)

In [22]:
oklahoma_df = pd.read_csv('data/CrisisLexT6/2013_Oklahoma_Tornado/2013_Oklahoma_Tornado-ontopic_offtopic.csv')

In [None]:
oklahoma_df.head()

Unnamed: 0,tweet id,tweet,label
0,'336908711324962817',@HeatleyJheat44 its barley even raining where ...,on-topic
1,'337052158035890176',Sorry I can't do anything right.,off-topic
2,'339338021751103488',@mrwendell29: @BradSowderWX says we have the ...,on-topic
3,'336339509077762051',#honestyhour I like to wear half split running...,off-topic
4,'337734129972035584',I'm too stressed to have a good summer,off-topic


In [None]:
oklahoma_df['type'] = 'tornado'

In [None]:
oklahoma_df.columns

Index(['tweet id', ' tweet', ' label', 'type'], dtype='object')

In [None]:
oklahoma_df.rename(columns={' tweet':'tweet'}, inplace=True)

In [None]:
oklahoma_df.rename(columns={' label':'label'}, inplace=True)

In [None]:
oklahoma_df['label'] = oklahoma_df['label'].map(lambda x: 1 if x=='on-topic' else 0)

In [None]:
oklahoma_df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'336908711324962817',@HeatleyJheat44 its barley even raining where ...,1,tornado
1,'337052158035890176',Sorry I can't do anything right.,0,tornado
2,'339338021751103488',@mrwendell29: @BradSowderWX says we have the ...,1,tornado
3,'336339509077762051',#honestyhour I like to wear half split running...,0,tornado
4,'337734129972035584',I'm too stressed to have a good summer,0,tornado


In [None]:
oklahoma_df.shape

(9992, 4)

In [None]:
queensland_df = pd.read_csv('data/CrisisLexT6/2013_Queensland_Floods/2013_Queensland_Floods-ontopic_offtopic.csv')

In [None]:
queensland_df.head()

Unnamed: 0,tweet id,tweet,label
0,'296728042179534848',"@MarkSDobson I always thought that, big lad ai...",off-topic
1,'296085045645570048',@thamonstar a lot of water moving around and a...,on-topic
2,'296811076400603136',Craig Thompson to be extradited to Victoria on...,off-topic
3,'295357934387486720',"Sunshine state, sort your shit out.",off-topic
4,'296390762210398210',@MarkPhilippi yeah I saw it. He's a wanker. Pa...,off-topic


In [None]:
queensland_df['type'] = 'flood'

In [None]:
queensland_df.columns

Index(['tweet id', ' tweet', ' label', 'type'], dtype='object')

In [None]:
queensland_df.rename(columns={' tweet':'tweet'}, inplace=True)

In [None]:
queensland_df.rename(columns={' label':'label'}, inplace=True)

In [None]:
queensland_df['label'] = queensland_df['label'].map(lambda x: 1 if x=='on-topic' else 0)

In [None]:
queensland_df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'296728042179534848',"@MarkSDobson I always thought that, big lad ai...",0,flood
1,'296085045645570048',@thamonstar a lot of water moving around and a...,1,flood
2,'296811076400603136',Craig Thompson to be extradited to Victoria on...,0,flood
3,'295357934387486720',"Sunshine state, sort your shit out.",0,flood
4,'296390762210398210',@MarkPhilippi yeah I saw it. He's a wanker. Pa...,0,flood


In [None]:
queensland_df.shape

(10033, 4)

In [None]:
crisislex_df = pd.concat([sandy_df, alberta_df, oklahoma_df, queensland_df])

In [None]:
crisislex_df

Unnamed: 0,tweet id,tweet,label,type
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane
5,'263101347421888513',Neighborly duties. @Cory_Kennedy arrives to th...,1,hurricane
6,'262763437325684736',And that's it until the spring.,0,hurricane
7,'263298821189156865',I don't know how I'm getting back to Jersey si...,1,hurricane
8,'262813023515865088',@NaeemPeena We were asked to get off the plane...,0,hurricane
9,'262998165282762752',@jaytee_96 you must be crazy! &amp; omg you tw...,0,hurricane


In [None]:
df = crisislex_df

In [None]:
df.head()

Unnamed: 0,tweet id,tweet,label,type
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane


In [None]:
df.columns

Index(['tweet id', 'tweet', 'label', 'type'], dtype='object')

In [None]:
def processTweet(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Eliminate hastags
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove @ signs
    tweet = re.sub('@', '', tweet)
    return tweet

In [None]:
tweet_list = [i for i in df['tweet']]

In [None]:
tweet_list[:5]

["I've got enough candles to supply a Mexican family",
 'Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy',
 '@ibexgirl thankfully Hurricane Waugh played it cool and waited this one out. Ready to go at any moment tho.',
 '@taos you never got that magnificent case of Burgundy I sent you to thank you for your tweets?',
 "I'm at Mad River Bar &amp; Grille (New York, NY) http://t.co/VSiZrzKP"]

Create a column of processed tweets utilizing the created function above:

In [None]:
df['processed'] = [processTweet(i) for i in tweet_list]

In [None]:
df['processed'].str.contains('timberlake').sum()

0

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
df['clean_processed'] = df['processed'].map(lambda x: tokenizer.tokenize(x))

In [None]:
df.head()

Unnamed: 0,tweet id,tweet,label,type,processed,clean_processed
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane,i've got enough candles to supply a mexican fa...,"[i, ve, got, enough, candles, to, supply, a, m..."
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane,sandy be soooo mad that she be shattering our ...,"[sandy, be, soooo, mad, that, she, be, shatter..."
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane,ibexgirl thankfully hurricane waugh played it ...,"[ibexgirl, thankfully, hurricane, waugh, playe..."
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane,taos you never got that magnificent case of bu...,"[taos, you, never, got, that, magnificent, cas..."
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane,"i'm at mad river bar &amp; grille (new york, n...","[i, m, at, mad, river, bar, amp, grille, new, ..."


In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
df['lemm_clean_processed'] = df['clean_processed'].map(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

formatted_name, now, file_description = filename_format_log(file_path ='assets/crisislex_df.csv')
df.to_csv(formatted_name, index=False)

In [None]:
lemm_clean_processed_list = [i for i in df['lemm_clean_processed']]

Utilize countvectorizer:

min_df will remember words that are in a document at least once. For example, if min_df = 3, will only remember words that are a part of 3 documents in a corpus. max_df refers to the %-age of documents within a corpus that a word occurs in. For example, if max_df = 0.9, words that occur in more than 90% of my documents will be ignored. By default, max_df is equal to 1.

In [None]:
cv = CountVectorizer(ngram_range = (1,2),
                     stop_words = 'english',
                     min_df = 15,
                     max_df = 1.0)

In [None]:
df_cv = pd.SparseDataFrame(cv.fit_transform(df['lemm_clean_processed']), 
                     columns = cv.get_feature_names())

In [None]:
getsizeof(df_cv)

In [None]:
df_cv.fillna(0, inplace=True)

In [None]:
df_cv.columns

Perform SVD to reduce dimensionality to about ~1000 (Currently ~1800)...Have to run an instance with about ~8GB of RAM (~0.08cents to perform one calculation); shut off the instance and restart this instance.. 

tfidf dataframe

custom_stop = ['url',
               'rt',
               'mitchellvii',
               'wa',
               'ha',
               'just',
               'good',
               'free',
               'purchase',
               'shipping',
               'don',
               'buy',
               'sale',
               'snkrconnecthq',
               
              ]

stop_words = ENGLISH_STOP_WORDS.union(custom_stop)

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,2), 
                        stop_words = 'english', 
                        min_df = 25, 
                        max_df = 1.0)

In [None]:
df_tfidf = pd.SparseDataFrame(tfidf.fit_transform(df['lemm_clean_processed']),
                              columns = tfidf.get_feature_names())

In [None]:
df_tfidf.shape

In [None]:
df_tfidf.fillna(0, inplace = True)

In [None]:
df_tfidf.head()

crisislexTFIDF_coo = df_tfidf.to_coo()

formatted_name, now, file_description = filename_format_log(file_path ='assets/crisislexTFIDF_coo.npz')
sparse.save_npz('assets/crisislexTFIDF_coo.npz', crisislexTFIDF_coo)

columns = pd.Series(tfidf.get_feature_names())

formatted_name, now, file_description = filename_format_log(file_path ='/assets/TFIDF_col.csv')
columns.to_csv('assets/TFIDF_col.csv', index=False)

### SVD

In [None]:
SVD = TruncatedSVD(n_components=1000) 
# doesn't center out data...versus PCA which it does...
# ##If we didn't fit before train_test_split (WHY DO WE NEED TO FIT TRANSFORM BEFORE TRAIN TEST SPLIT...)
svd_matrix = SVD.fit_transform(df_tfidf)
svd_matrix.shape

In [None]:
component_names = pd.Series(["component_"+str(i+1) for i in range(1000)])

In [None]:
SVD.components_.shape

In [None]:
fig = plt.figure(figsize=(20,10))
plt.bar(np.array(range(1000))+1, 
        SVD.explained_variance_ratio_, 
        color='g', 
        label='Explained Variance')
plt.plot(np.array(range(1000))+1, 
         np.cumsum(SVD.explained_variance_ratio_), 
         label='Cumulative Explained Variance')
plt.legend(fontsize=16)
plt.xlabel('Component', fontsize=20)
plt.ylabel('Variance Ratio', fontsize=20)
plt.title('Explained Variance by Component', fontsize=36);

In [None]:
np.cumsum(SVD.explained_variance_)

In [None]:
component_names = ["component_"+str(i+1) for i in range(1000)]
svd_df = pd.SparseDataFrame(svd_matrix,columns=component_names)

In [None]:
svd_df.head()

In [None]:
svd_df.shape

In [None]:
loadings = pd.SparseDataFrame(SVD.components_,
                              index=component_names,
                              columns=df_tfidf.columns).T

In [None]:
loadings['abs_component_1'] = np.abs(loadings.component_1)
loadings['abs_component_2'] = np.abs(loadings.component_2)
loadings['abs_component_3'] = np.abs(loadings.component_3)
loadings['abs_component_4'] = np.abs(loadings.component_4)

In [None]:
loadings.head()

In [None]:
loadings.sort_values('abs_component_1',ascending=False).head(20)[['component_1']]

In [None]:
loadings.sort_values('abs_component_2',ascending=False).head(20)[['component_2']]

In [None]:
loadings.sort_values('abs_component_3',ascending=False).head(20)[['component_3']]

In [None]:
loadings.sort_values('abs_component_4',ascending=False).head(20)[['component_4']]

In [None]:
# K-Means Clustering

In [None]:
km = KMeans(n_clusters = 4, n_init = 20, max_iter = 600, random_state=42)
km.fit(svd_df)

In [None]:
km.labels_

In [None]:
len(km.labels_)

In [None]:
crisislex_df['labels_km'] = km.labels_

In [None]:
crisislex_df.head()

In [None]:
crisislex_df['label'].value_counts()

In [None]:
crisislex_df['type'].value_counts(normalize=True)

In [None]:
crisislex_df['labels_km'].value_counts()

In [None]:
crisislex_df[crisislex_df['labels_km'] == 0]

In [None]:
crisislex_df[['label']][crisislex_df['labels_km'] == 1].sum()

In [None]:
crisislex_df[crisislex_df['labels_km'] == 1]

In [None]:
crisislex_df[crisislex_df['labels_km'] == 2]

In [None]:
crisislex_df[crisislex_df['labels_km'] == 3]

# HDBSCAN

In [None]:
cos_mat = pairwise_distances(svd_df, metric='cosine')

getsizeof(cos_mat) / 1e9

In [None]:
cluster = hdbscan.HDBSCAN(min_cluster_size = 150,
                          min_samples = 100, 
                          gen_min_span_tree = True,
                          approx_min_span_tree = False,
                          prediction_data = True,
                          metric = 'precomputed', 
                         )

In [None]:
cluster.fit(cos_mat.astype('float64')) 

In [None]:
cluster.labels_

In [None]:
len(cluster.labels_)

In [None]:
crisislex_df['labels_hdbs'] = cluster.labels_

In [None]:
crisislex_df.head()

In [None]:
crisislex_df['labels_hdbs'].value_counts()

In [None]:
crisislex_df[crisislex_df['labels_hdbs'] == 0]

In [None]:
crisislex_df[crisislex_df['labels_hdbs'] == 1]

In [None]:
crisislex_df[crisislex_df['labels_hdbs'] == 2]

In [None]:
crisislex_df[crisislex_df['labels_hdbs'] == 3]

In [None]:
plt.figure(figsize=(20,10))
cluster.minimum_spanning_tree_.plot(edge_cmap='viridis',
                                    edge_alpha=0.6,
                                    node_size=80,
                                    edge_linewidth=2)

## We can also visually represent the clustering hierarchy

plt.figure(figsize=(20,10))
cluster.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

## If we condense this plot, we can get a better idea of our final clusters

In [None]:
plt.figure(figsize=(20,10))
cluster.condensed_tree_.plot()

## And then we can actually visually represent which clusters are selected

In [None]:
plt.figure(figsize=(20,10))
cluster.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())

In [None]:
hdbs_null_index = crisislex_df[crisislex_df['labels_hdbs'] == -1].index

In [None]:
km2 = KMeans(n_clusters = 2)
km2.fit(crisislex_df['lemm_clean_processed'][crisislex_df['labels_hdbs'] == -1])