In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%matplotlib inline

In [2]:
# Load the stacked tweet/sentiment dataset; the cell displays its (rows, columns) shape
df = pd.read_csv('./datasets/stacked_sentiment_again.csv')
df.shape

(1130, 20)

In [49]:
# Trying to find the optimal number of clusters
def kmeans_check_ks(df, ks, text_col='text_nourl', random_state=2018):
    """Fit K-Means for every k in 2..ks on TF-IDF features and score each fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents to cluster.
    ks : int
        Largest number of clusters to try (inclusive). Values below 2 give
        an empty result because the loop starts at k = 2.
    text_col : str, default 'text_nourl'
        Column of ``df`` that contains the raw text.
    random_state : int, default 2018
        Seed passed to ``KMeans`` so repeated runs give the same clusters.

    Returns
    -------
    pandas.DataFrame
        One row per k with columns ``K``, ``SilhouetteScore`` and ``Inertia``,
        sorted by ``SilhouetteScore`` from highest to lowest.
    """
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    X = vectorizer.fit_transform(df[text_col])
    # No extra scaling step: TfidfVectorizer L2-normalises each row by default.

    scores = []
    for k in range(2, ks + 1):
        km = KMeans(n_clusters=k, random_state=random_state)
        km.fit(X)
        scores.append((k, silhouette_score(X, km.labels_), km.inertia_))

    scores_df = pd.DataFrame(scores, columns=['K', 'SilhouetteScore', 'Inertia'])
    # Return a sorted copy instead of mutating in place.
    return scores_df.sort_values('SilhouetteScore', ascending=False)

### Basic k-means on all text - We are not using any filters
- Followed steps in [this](https://towardsdatascience.com/applying-machine-learning-to-classify-an-unsupervised-text-document-e7bb6265f52) article

In [3]:
# Vectorize the tweets into TF-IDF features (English stop words removed).
# NOTE: TfidfVectorizer only tokenizes; it does not stem or lemmatize words.
vectorizer_a = TfidfVectorizer(stop_words='english')
X_all = vectorizer_a.fit_transform(df['text_nourl'])

# Checking with 3 clusters
true_k = 3
model_all = KMeans(n_clusters=true_k, random_state=2018)
# Fit on X_all so the centroid columns line up with vectorizer_a's vocabulary
model_all.fit(X_all)

# Rank every term by its weight in each centroid, largest first
order_centroids = model_all.cluster_centers_.argsort()[:, ::-1]
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
terms = vectorizer_a.get_feature_names()

# Print the 10 highest-weighted terms of each cluster
for i in range(true_k):
    print(f'Cluster {i}:')
    for ind in order_centroids[i, :10]:
        print(terms[ind])

### Basic k-means on fire-related tweets
- Followed the steps in [this](https://towardsdatascience.com/applying-machine-learning-to-classify-an-unsupervised-text-document-e7bb6265f52) article

In [18]:
# Every tweet was hand-coded for whether it relates to the fire.
# Keep the rows flagged 1 as an independent copy of the main frame.
is_fire_related = df['is-fire-related'] == 1
fire = df.loc[is_fire_related].copy()
fire.shape

(616, 20)

In [52]:
# Vectorize the fire-related tweets and fit K-Means with 3 clusters
vectorizer_f = TfidfVectorizer(stop_words='english')
X_fire = vectorizer_f.fit_transform(fire['text_nourl'])

true_k = 3
model_3f = KMeans(n_clusters=true_k, random_state=2018)
# Fit on X_fire so the centroid columns line up with vectorizer_f's
# vocabulary; terms[ind] below indexes into that same vocabulary.
model_3f.fit(X_fire)

# Rank every term by its weight in each centroid, largest first
order_centroids = model_3f.cluster_centers_.argsort()[:, ::-1]
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
terms = vectorizer_f.get_feature_names()

# Print the 10 highest-weighted terms of each cluster
for i in range(true_k):
    print(f'Cluster {i}:')
    for ind in order_centroids[i, :10]:
        print(terms[ind])

Cluster 0:
rd
traffic
closure
ca
nb
sb
camp
89
hwy
70
Cluster 1:
took
forget
hoping
away
return
lost
day
replace
able
paradise
Cluster 2:
california
campfire
chico
paradise
ca
buttecounty
000
acres
camp
contained


### 3 clusters
- cluster 0: Traffic and road related
- cluster 1: Emotional
- cluster 2: California

In [53]:
# Re-cluster the fire tweets, this time asking for 5 groups
true_k = 5
model_5f = KMeans(n_clusters=true_k, random_state=2018)
model_5f.fit(X_fire)

terms = vectorizer_f.get_feature_names()
# Column indices of each centroid ordered from heaviest to lightest term
order_centroids = model_5f.cluster_centers_.argsort()[:, ::-1]

# Print the ten highest-weighted terms of every cluster
for cluster_id, top_terms in enumerate(order_centroids[:, :10]):
    print(f'Cluster {cluster_id}:')
    for term_idx in top_terms:
        print(terms[term_idx])

Cluster 0:
rd
closure
traffic
pentz
wb
eb
durham
pearson
camp
neal
Cluster 1:
took
forget
hoping
away
return
lost
day
replace
able
paradise
Cluster 2:
california
chico
paradise
campfire
camp
just
calfire
people
help
today
Cluster 3:
ca
nb
sb
89
hwy
70
traffic
closure
camp
oroville
Cluster 4:
ca
info
unofficial
disclaimers
incorrect
officials
safety
images
sacramento
update


### 5 clusters:
- 0: Traffic 
- 1: Emotional (negative sentiment)
- 2: California
- 3: Traffic
- 4: Informational

In [29]:
# Assign every fire tweet to a cluster of the 5-cluster model.
# One batched transform/predict call handles all rows at once, rather than
# vectorizing and predicting each tweet separately.
fire_tfidf = vectorizer_f.transform(fire['text_nourl'])
fire['clusters'] = model_5f.predict(fire_tfidf).astype(int)
fire.head()

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,...,day,hour,minute,during_fire,is-fire-related,text_nourl,key_score,from_locations,sent,clusters
0,366.0,"Paradise, CA",1.06e+18,"::paradise baby:: meet jd, the grandson of my ...",2018-11-16 16:31:06+00:00,#campfire,trekcass,@JulieRems,0,0,...,16,16,31,1,1,"::paradise baby:: meet JD, the grandson of my ...",2,"Paradise, CA",1.0,0
1,700.0,"Paradise, CA",1.06e+18,"¡coño! #campfire @chico, california https://ww...",2018-11-09 19:29:06+00:00,#CampFire,VXO,,0,0,...,9,19,29,1,1,"¡Coño! #CampFire @Chico, California",1,"Chico, CA",1.0,0
3,53.0,"Paradise, CA",1.07e+18,.@stucam7771 nailed it! the man in the white h...,2018-11-24 11:29:40+00:00,,XLComedy,@stucam7771,1,0,...,24,11,29,1,1,.@Stucam7771 nailed it! The man in the White H...,1,"Paradise, CA",1.0,0
5,429.0,"Paradise, CA",1.06e+18,"“and the peace of god, which transcends all un...",2018-11-15 01:20:54+00:00,#firefighter #calfire #firstresponders,AaronComfortDog,,1,0,...,15,1,20,1,1,"“And the peace of God, which transcends all un...",2,"Chico, CA",1.0,0
8,185.0,"Paradise, CA",1.07e+18,"“from the ashes, a fire shall be woken, a ligh...",2018-11-20 23:28:54+00:00,,Marcella_renai,,0,0,...,20,23,28,1,1,"“From the ashes, a fire shall be woken, A ligh...",1,"Chico, CA",1.0,0


In [41]:
# Export the clustered fire tweets; the row index is not written
fire.to_csv('./datasets/clustered_fires.csv', index=False)

In [None]:
# Fire tweets with a positive key_score whose text contains 'fire',
# shown with just the columns needed to eyeball the cluster assignment
has_key_score = fire['key_score'] > 0
mentions_fire = fire['text_nourl'].str.contains('fire')
fire.loc[has_key_score & mentions_fire, ['text_nourl', 'key_score', 'clusters']]

In [50]:
# Score K-Means fits for k = 2..20 on the fire-related tweets
fire_scores_2 = kmeans_check_ks(fire, 20)

fire_scores_2

In [46]:
# Score K-Means fits for k = 2..30 on the full tweet set
all_clustering = kmeans_check_ks(df, 30)
all_clustering

Unnamed: 0,K,SilhouetteScore,Inertia
27,29,0.129636,849.411539
28,30,0.129616,847.973657
26,28,0.129585,850.790604
25,27,0.129316,852.337106
24,26,0.128798,854.373349
23,25,0.12851,856.101311
22,24,0.125481,860.104913
19,21,0.12316,865.866615
14,16,0.123042,883.718962
13,15,0.122643,885.470893


In [None]:
# Dataframe viewing options for double checking
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)


In [54]:
# Mean key_score of the tweets in each cluster
fire.groupby('clusters')['key_score'].mean()

clusters
0    1.319372
1    1.162162
2    1.000000
3    1.085714
4    0.884615
Name: key_score, dtype: float64

### chart k-means

In [32]:
# Start locs_no as a copy of from_locations; the replace cells below recode it to 1/0
df['locs_no'] = df['from_locations']
df.head()

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,...,day,hour,minute,during_fire,is-fire-related,text_nourl,key_score,from_locations,sent,locs_no
0,366.0,"Paradise, CA",1.06e+18,"::paradise baby:: meet jd, the grandson of my ...",2018-11-16 16:31:06+00:00,#campfire,trekcass,@JulieRems,0,0,...,16,16,31,1,1,"::paradise baby:: meet JD, the grandson of my ...",2,"Paradise, CA",1.0,"Paradise, CA"
1,700.0,"Paradise, CA",1.06e+18,"¡coño! #campfire @chico, california https://ww...",2018-11-09 19:29:06+00:00,#CampFire,VXO,,0,0,...,9,19,29,1,1,"¡Coño! #CampFire @Chico, California",1,"Chico, CA",1.0,"Chico, CA"
2,494.0,"Paradise, CA",1.06e+18,. . . . . #photography #instapics #photographe...,2018-11-13 15:51:31+00:00,#photography #instapics #photographersofig #pi...,sidewayseightp,,0,0,...,13,15,51,1,0,. . . . . #photography #instapics #photographe...,0,"Chico, CA",1.0,"Chico, CA"
3,53.0,"Paradise, CA",1.07e+18,.@stucam7771 nailed it! the man in the white h...,2018-11-24 11:29:40+00:00,,XLComedy,@stucam7771,1,0,...,24,11,29,1,1,.@Stucam7771 nailed it! The man in the White H...,1,"Paradise, CA",1.0,"Paradise, CA"
4,201.0,"Oroville, CA",1.06e+18,‘merurica... #voted #uklastudios #uklapictures...,2018-11-07 02:02:51+00:00,#voted #uklastudios #uklapictures,UKLA_Music,,0,0,...,7,2,2,0,0,‘Merurica... #voted #uklastudios #uklapictures...,0,"Gridley, CA",1.0,"Gridley, CA"


In [34]:
# Every location value handled by the recoding cells below
all_cities = [
    'Chico, CA', 'California, USA', 'Sacramento, CA', 'Reno, NV',
    'Rocklin, CA', 'Folsom, CA', 'Oroville, CA', 'Florin, CA',
    'Nevada City, CA', 'Yountville, CA', 'Redding, CA', 'Linda, CA',
    'Magalia, CA', 'Paradise, CA', 'nolocationfound', 'Durham, CA',
    'Napa, CA', 'Glen Ellen, CA', 'Oroville East, CA',
    'South Oroville, CA', 'Thermalito, CA', 'Gridley, CA',
]

# Locations grouped as Butte County
butte_cty = [
    'Chico, CA',
    'Oroville, CA',
    'Magalia, CA',
    'Paradise, CA',
    'Durham, CA',
    'Oroville East, CA',
    'South Oroville, CA',
    'Thermalito, CA',
    'Gridley, CA',
]

# Everything else, including the 'nolocationfound' placeholder
not_butte = [
    'California, USA',
    'Sacramento, CA',
    'Reno, NV',
    'Rocklin, CA',
    'Folsom, CA',
    'Florin, CA',
    'Nevada City, CA',
    'Yountville, CA',
    'Redding, CA',
    'Linda, CA',
    'nolocationfound',
    'Napa, CA',
    'Glen Ellen, CA',
]

In [35]:
# Recode Butte County locations to 1, reusing the butte_cty list defined
# above rather than repeating the same city names inline.
df['locs_no'] = df['locs_no'].replace(butte_cty, 1)

In [36]:
# Recode every non-Butte location (and 'nolocationfound') to 0, reusing the
# not_butte list defined above rather than repeating the names inline.
df['locs_no'] = df['locs_no'].replace(not_butte, 0)

In [None]:
# Spot-check the first rows after recoding locs_no
df.head()

In [37]:
# Two-column frame: key_score plus the 1/0 location flag
df_cluster = df.loc[:, ['key_score', 'locs_no']]

In [39]:
# List the timestamps from earliest to latest to see the period covered.
# The column is stored as text (dtype object in the output below), so this is a string sort.
df['timestamp'].sort_values()

376     2018-11-01 00:10:34+00:00
554     2018-11-01 00:12:17+00:00
551     2018-11-01 00:12:58+00:00
697     2018-11-01 00:18:43+00:00
698     2018-11-01 00:19:37+00:00
                  ...            
881     2018-11-25 23:45:36+00:00
968     2018-11-25 23:48:52+00:00
419     2018-11-25 23:51:06+00:00
1104    2018-11-25 23:54:12+00:00
735     2018-11-25 23:58:04+00:00
Name: timestamp, Length: 1130, dtype: object

In [None]:
import chart_studio.plotly as py 
import plotly.graph_objs as go 
import plotly.express as px

In [None]:
# Line chart of key_score over time.
# Parse the text timestamps and sort by them first: px.line joins points in
# row order, so unsorted rows would draw a line that doubles back in time.
df_by_time = (
    df.assign(timestamp=pd.to_datetime(df['timestamp'], utc=True))
      .sort_values('timestamp')
)
fig = px.line(df_by_time,              # time-ordered data
              x='timestamp',           # tweet time on the x-axis
              y='key_score',           # key_score on the y-axis
              title='key_score over time')
fig.show()