In [1]:
import pandas as pd
import GetOldTweets3 as got
# https://github.com/Mottl/GetOldTweets3

In [2]:
# function to get twitter info and organize in dataframe
def get_twitter_info():
    tweet_df["id"] = tweet_df["got_criteria"].apply(lambda x: x.id)
    tweet_df["tweet_text"] = tweet_df["got_criteria"].apply(lambda x: x.text)
    tweet_df["timestamp"] = tweet_df["got_criteria"].apply(lambda x: x.date)
    tweet_df["hashtags"] = tweet_df["got_criteria"].apply(lambda x: x.hashtags)
    tweet_df["username"] = tweet_df["got_criteria"].apply(lambda x: x.username)
    tweet_df["mentions"] = tweet_df["got_criteria"].apply(lambda x: x.mentions)
    tweet_df["rewtweets"] = tweet_df["got_criteria"].apply(lambda x: x.retweets)
    tweet_df["replies"] = tweet_df["got_criteria"].apply(lambda x: x.replies)
    tweet_df["link"] = tweet_df["got_criteria"].apply(lambda x: x.permalink)
# https://medium.com/@robbiegeoghegan/download-twitter-data-with-10-lines-of-code-42eb2ba1ab0f

*NOTE:* the code below creates a dateframe based on the features in the function above. The dataframes are location-based. We received error messages when saving all cities into the `locations` variable. As manual workaround, we updated each city in the `locations` variable manually.

The cities included are as follows: `Butte County`, `Paradise`, `Chico`, `Durham`, `Oroville`, `Magalia`. This is pulled from the location tag of a given tweet.

In [18]:
#specify location
locations = ["Paradise, CA"]

#get old tweets
##date range pulled from for fire: 11/8/2018 - 11/26/2018
##date range pulled from pre fire: 11/1/2018 - 11/7/2018
tweetCriteria_list = []
for location in locations:
    tweetCriteria = got.manager.TweetCriteria().setSince('2018-11-08')\
                                            .setUntil('2018-11-26')\
                                            .setNear(location)\
                                            .setWithin('100mi')
    tweetCriteria_list.append(tweetCriteria)
    
#create twitter info for each city
tweet_dict = {}
for criteria, location in zip(tweetCriteria_list, locations):
    tweets = got.manager.TweetManager.getTweets(criteria)
    tweet_dict[location] = tweets
    
#create df
tweet_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in tweet_dict.items() ]))
tweet_df['tweet_count'] = tweet_df.index
tweet_df = pd.melt(tweet_df, id_vars=["tweet_count"], var_name='City', value_name='got_criteria')
# tweet_df = tweet_df.dropna()

#extract twitter information
get_twitter_info()
tweet_df = tweet_df.drop("got_criteria", 1)

In [19]:
#save copy of df based on city specified in location
paradise = tweet_df.copy()
paradise.shape

(160, 11)

In [20]:
#save out raw city csv 
paradise.to_csv('./data/paradise.csv', index=False)

In [25]:
## raw dataframes shape summary

## during fire: 
# paradise (817, 11)
# butte (902, 11)
# chico (811, 11)
# oroville (202, 11)
# magalia (764, 11)
# durham (862, 11)

## pre-fire:
# paradise_pre (145, 11)
# butte_pre (166, 11)
# chico_pre (141, 11)
# oroville_pre (53, 11)
# magalia_pre (144, 11)
# durham_pre (160, 11)


### Create single dataframe for tweets during fire

In [20]:
# read in raw city files
paradise = pd.read_csv('./data/paradise.csv')
butte = pd.read_csv('./data/butte.csv')
chico = pd.read_csv('./data/chico.csv')
magalia = pd.read_csv('./data/magalia.csv')
oroville = pd.read_csv('./data/oroville.csv')

In [21]:
# concatenate raw city files into single df
cities = pd.concat([paradise, butte, chico, durham, magalia, oroville])
cities.shape

(4358, 11)

In [22]:
# save out raw, concatenated cities file
cities.to_csv('./data/cities_raw.csv', index=False)

In [26]:
# check duplicates
dups = cities[(cities['id'].duplicated() == True)]
dups

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link
0,0,Butte County,1066843491555205120,"Lord Farquad Quad Squat Squad @Chico, Californ...",2018-11-25 23:58:04+00:00,,areoandmilk,,0,0,https://twitter.com/areoandmilk/status/1066843...
1,1,Butte County,1066842521601400832,winter edition #queenadailypic released. @Soda...,2018-11-25 23:54:12+00:00,#queenadailypic,leenathequeena,,0,0,https://twitter.com/leenathequeena/status/1066...
2,2,Butte County,1066841740060098562,Drinking a Def Leppard Pale by @ElysianBrewing...,2018-11-25 23:51:06+00:00,,geradellsworth,@ElysianBrewing @Golden1Center,0,0,https://twitter.com/geradellsworth/status/1066...
3,3,Butte County,1066841178782482433,"The forgotten. #LimeBike #Reno @Reno, Nevada h...",2018-11-25 23:48:52+00:00,#LimeBike #Reno,alittlegordie,,0,0,https://twitter.com/alittlegordie/status/10668...
5,5,Butte County,1066839617196961792,SSCC is United and moving Forward !!! The powe...,2018-11-25 23:42:40+00:00,#heritageoffaith,LesSimmons,@sscc7710,1,0,https://twitter.com/LesSimmons/status/10668396...
...,...,...,...,...,...,...,...,...,...,...,...
196,196,"Oroville, CA",1060237957473464320,4th stop on #FarmCityWeek at Look Ahead Veteri...,2018-11-07 18:30:01+00:00,#FarmCityWeek #sacvalley,WCWDwebcam,,0,0,https://twitter.com/WCWDwebcam/status/10602379...
197,197,"Oroville, CA",1060216596306620416,Morse Mandarin Farm visit on #FarmCityWeek tou...,2018-11-07 17:05:09+00:00,#FarmCityWeek #SacValley,WCWDwebcam,,0,0,https://twitter.com/WCWDwebcam/status/10602165...
198,198,"Oroville, CA",1060191924236226561,Feather: Running http://whatsrunning.billdurr....,2018-11-07 15:27:06+00:00,,whats_running,,0,0,https://twitter.com/whats_running/status/10601...
199,199,"Oroville, CA",1060056825175961600,The sunset the other night on my drive home fr...,2018-11-07 06:30:16+00:00,,SeanSullivan_23,,0,0,https://twitter.com/SeanSullivan_23/status/106...


In [29]:
# drop duplicates
cities.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True) 
cities.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True) 
cities.shape

(1809, 11)

In [30]:
cities.head()

Unnamed: 0,tweet_count,City,id,tweet_text,timestamp,hashtags,username,mentions,rewtweets,replies,link
0,0,"Paradise, CA",1066843491555205120,"Lord Farquad Quad Squat Squad @Chico, Californ...",2018-11-25 23:58:04+00:00,,areoandmilk,,0,0,https://twitter.com/areoandmilk/status/1066843...
1,1,"Paradise, CA",1066842521601400832,winter edition #queenadailypic released. @Soda...,2018-11-25 23:54:12+00:00,#queenadailypic,leenathequeena,,0,0,https://twitter.com/leenathequeena/status/1066...
2,2,"Paradise, CA",1066841740060098562,Drinking a Def Leppard Pale by @ElysianBrewing...,2018-11-25 23:51:06+00:00,,geradellsworth,@ElysianBrewing @Golden1Center,0,0,https://twitter.com/geradellsworth/status/1066...
3,3,"Paradise, CA",1066841178782482433,"The forgotten. #LimeBike #Reno @Reno, Nevada h...",2018-11-25 23:48:52+00:00,#LimeBike #Reno,alittlegordie,,0,0,https://twitter.com/alittlegordie/status/10668...
4,4,"Paradise, CA",1066839617196961792,SSCC is United and moving Forward !!! The powe...,2018-11-25 23:42:40+00:00,#heritageoffaith,LesSimmons,@sscc7710,1,0,https://twitter.com/LesSimmons/status/10668396...


In [31]:
# save out cleaned cities file 
cities.to_csv('./data/cities_clean.csv', index=False)

### Create single dataframe for tweets 1-week before fire
*repeat same process as above*

In [22]:
paradise_pre = pd.read_csv('./data/paradise_pre.csv')
butte_pre = pd.read_csv('./data/butte_pre.csv')
chico_pre = pd.read_csv('./data/chico_pre.csv')
magalia_pre = pd.read_csv('./data/magalia_pre.csv')
oroville_pre = pd.read_csv('./data/oroville_pre.csv')

In [23]:
pre = pd.concat([paradise_pre, butte_pre, chico_pre, durham_pre, magalia_pre, oroville_pre])
pre.shape

(809, 11)

In [24]:
pre.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True) 
pre.shape

(342, 11)

In [25]:
pre.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True) 
pre.shape

(342, 11)

In [27]:
pre.to_csv('./data/pre-fire.csv', index=False)

In [29]:
check_dups = pd.read_csv('./data/pre-fire.csv', index_col=0)

In [30]:
check_dups.shape

(342, 10)

In [31]:
check_dups.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True) 
check_dups.shape

(183, 10)

In [32]:
check_dups.to_csv('./data/pre-fire.csv', index=False)

### Create single dataframe for with tweets before and during fire

In [None]:
cities = pd.read_csv('./data/cities_clean.csv')
pre = pd.read_csv('./data/pre-fire.csv')

In [None]:
df = pd.concat([cities, pre])
df.shape

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
df['day'] = df.timestamp.dt.day
df['hour'] = df.timestamp.dt.hour
df['minute'] = df.timestamp.dt.minute

In [None]:
df.to_csv('./data/stacked_v1.csv')