In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%matplotlib inline

In [2]:
# Load the two tweet extracts that the later cells stack into a single frame.
cities, pre = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/cities_clean.csv', './data/pre-fire.csv')
)

In [3]:
# (rows, columns) of each input frame, one per line.
print(cities.shape, pre.shape, sep='\n')

(1809, 11)
(183, 10)


In [4]:
# Keep only the first row seen for each tweet id.
# The result is rebound rather than applied with inplace=True, so the frame is
# never mutated behind another reference and the call stays chainable.
cities = cities.drop_duplicates(subset="id", keep="first")
cities.shape

(947, 11)

In [6]:
# Stack the de-duplicated city tweets on top of the pre-fire tweets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own row labels, so labels repeat across the two halves
# and a label lookup such as df.loc[0] can match more than one row.
# Columns present in only one input are filled with NaN for the other's rows.
df = pd.concat([cities, pre], ignore_index=True)
df.shape

(1130, 11)

In [8]:
# Inspect column dtypes of the stacked frame (timestamp is not yet converted to a datetime).
df.dtypes

tweet_count    float64
City            object
id               int64
tweet_text      object
timestamp       object
hashtags        object
username        object
mentions        object
rewtweets        int64
replies          int64
link            object
dtype: object

In [11]:
# Parse the timestamp strings into datetimes.
# utc=True pins the column to a single tz-aware UTC dtype even if the inputs
# carry differing offsets (or none), instead of relying on inference.
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

In [12]:
# Re-check dtypes to confirm that timestamp is now a datetime column.
df.dtypes

tweet_count                float64
City                        object
id                           int64
tweet_text                  object
timestamp      datetime64[ns, UTC]
hashtags                    object
username                    object
mentions                    object
rewtweets                    int64
replies                      int64
link                        object
dtype: object

In [15]:
# Preview the frame, then the first rows of each datetime component exposed by
# the .dt accessor, each under its own heading.
print("Original Dataframe:")
print(df.head())
print(
    *(
        f"{heading}\n{getattr(df.timestamp.dt, field).head()!s}"
        for heading, field in (
            ("\nYear:", "year"),
            ("\nMonth:", "month"),
            ("\nDay:", "day"),
            ("\nHour:", "hour"),
            ("\nMinute:", "minute"),
            ("\nSecond:", "second"),
        )
    ),
    sep="\n",
)

Original Dataframe:
   tweet_count          City                   id  \
0          0.0  Paradise, CA  1066843491555205120   
1          1.0  Paradise, CA  1066842521601400832   
2          2.0  Paradise, CA  1066841740060098562   
3          3.0  Paradise, CA  1066841178782482433   
4          4.0  Paradise, CA  1066839617196961792   

                                          tweet_text  \
0  Lord Farquad Quad Squat Squad @Chico, Californ...   
1  winter edition #queenadailypic released. @Soda...   
2  Drinking a Def Leppard Pale by @ElysianBrewing...   
3  The forgotten. #LimeBike #Reno @Reno, Nevada h...   
4  SSCC is United and moving Forward !!! The powe...   

                  timestamp          hashtags        username  \
0 2018-11-25 23:58:04+00:00               NaN     areoandmilk   
1 2018-11-25 23:54:12+00:00   #queenadailypic  leenathequeena   
2 2018-11-25 23:51:06+00:00               NaN  geradellsworth   
3 2018-11-25 23:48:52+00:00   #LimeBike #Reno   alittlegordie   

In [16]:
# Materialise the day-of-month, hour and minute of each timestamp as their own
# columns. All three are read from the .dt accessor before any is assigned.
df['day'], df['hour'], df['minute'] = (
    getattr(df['timestamp'].dt, part) for part in ('day', 'hour', 'minute')
)

In [17]:
# Show the first rows to confirm the new day / hour / minute columns are present.
df.head()

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link,day,hour,minute
0,0.0,"Paradise, CA",1066843491555205120,"Lord Farquad Quad Squat Squad @Chico, Californ...",2018-11-25 23:58:04+00:00,,areoandmilk,,0,0,https://twitter.com/areoandmilk/status/1066843...,25,23,58
1,1.0,"Paradise, CA",1066842521601400832,winter edition #queenadailypic released. @Soda...,2018-11-25 23:54:12+00:00,#queenadailypic,leenathequeena,,0,0,https://twitter.com/leenathequeena/status/1066...,25,23,54
2,2.0,"Paradise, CA",1066841740060098562,Drinking a Def Leppard Pale by @ElysianBrewing...,2018-11-25 23:51:06+00:00,,geradellsworth,@ElysianBrewing @Golden1Center,0,0,https://twitter.com/geradellsworth/status/1066...,25,23,51
3,3.0,"Paradise, CA",1066841178782482433,"The forgotten. #LimeBike #Reno @Reno, Nevada h...",2018-11-25 23:48:52+00:00,#LimeBike #Reno,alittlegordie,,0,0,https://twitter.com/alittlegordie/status/10668...,25,23,48
4,4.0,"Paradise, CA",1066839617196961792,SSCC is United and moving Forward !!! The powe...,2018-11-25 23:42:40+00:00,#heritageoffaith,LesSimmons,@sscc7710,1,0,https://twitter.com/LesSimmons/status/10668396...,25,23,42


In [21]:
# Count missing values per column before saving.
df.isna().sum()

tweet_count     183
City              0
id                0
tweet_text        0
timestamp         0
hashtags        559
username          0
mentions       1003
rewtweets         0
replies           0
link              0
day               0
hour              0
minute            0
dtype: int64

In [22]:
# Persist the stacked frame. With to_csv's default settings the row index is
# written as the first, unnamed column of the file.
df.to_csv(path_or_buf='./data/stacked_v1.csv')