In [2]:
import numpy as np
import pandas as pd
import requests, json, time, os
import tweepy
import GetOldTweets3 as got

from datetime import datetime

### Function: scrap()
- Search for tweets matching a keyword within a time period, in the area surrounding a city.
- Return the results as a dataframe; optionally save them as a CSV.
____
**Parameters**
- `hashtag`: *string* | keyword to search for
- `since`: *string* | *"yyyy-mm-dd"* | start of the date range
- `until`: *string* | *"yyyy-mm-dd"* | end of the date range
- `location_str`: *string* | city name to search around
- `radius_str`: *string* | *numkm* for km or *nummi* for miles | search radius around `location_str`
- `save_csv`: *boolean* | if `True`, save the dataframe as a CSV (default `False`)

In [17]:
def scrap(hashtag, since, until, location_str='los angeles', radius_str= '80km', save_csv = False):
    """
    Scrape tweets using 'GetOldTweets3' and return them as a dataframe.

    Searches for tweets matching `hashtag`, created between `since` and
    `until`, near `location_str` within `radius_str`.

    Parameters:
    hashtag: string/ keyword or hashtag to search for
    since: 'yyyy-mm-dd'/ dates the tweets were created since
    until: 'yyyy-mm-dd'/ dates the tweets were created until
    location_str: string/ place name to search around
    radius_str: string/ search radius around location_str, e.g. '80km'
    save_csv: boolean/ if True, also save the dataframe as
              ./tweets/{hashtag}{hour}_{month}_{day}.csv
              (the ./tweets directory is created if it is missing)

    Returns:
    DataFrame with the columns text, hashtags, user_name, date and
    user_location (filled from the tweet's `geo` attribute), one row per
    tweet. The columns are present even when no tweet matched.
    """
    columns = ["text", "hashtags", "user_name", "date", "user_location"]

    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(hashtag)\
                                                   .setTopTweets(False)\
                                                   .setSince(since).setUntil(until)\
                                                   .setNear(location_str)\
                                                   .setWithin(radius_str)

    list_of_dicts = [
        {
            "text": tweet.text,
            "hashtags": tweet.hashtags,
            "user_name": tweet.username,
            "date": tweet.date,
            "user_location": tweet.geo,
        }
        for tweet in got.manager.TweetManager.getTweets(tweetCriteria)
    ]

    # Passing the columns explicitly keeps the schema stable for an empty result.
    df = pd.DataFrame(list_of_dicts, columns=columns)

    if save_csv:
        out_dir = "./tweets"
        os.makedirs(out_dir, exist_ok=True)
        now = datetime.now()
        # Same unpadded hour_month_day stamp as strftime('%-H_%-m_%-d'), but
        # without the '-' flag, which is not supported on every platform.
        stamp = f"{now.hour}_{now.month}_{now.day}"
        df.to_csv(f"{out_dir}/{hashtag}{stamp}.csv", index=False)

    return df

### Runs

In [46]:
# Scrape "Tick Fire" tweets from 2019-10-01 to 2019-10-28 (default location and radius).
tick_df = scrap(hashtag="Tick Fire", since="2019-10-01", until="2019-10-28", save_csv=False)

In [48]:
# Persist the scraped "Tick Fire" tweets without the row index.
tick_df.to_csv("./tweets/tickfire_103019.csv", index=False)

In [8]:
# Scrape "wild fire" tweets from 2019-10-01 to 2019-10-28 (default location and radius).
wild_df = scrap(hashtag="wild fire", since="2019-10-01", until="2019-10-28", save_csv=False)

In [49]:
# Persist the scraped "wild fire" tweets without the row index.
wild_df.to_csv("./tweets/wildfire_103019.csv", index=False)

In [50]:
# Scrape "Getty Fire" tweets from 2019-10-15 to 2019-10-28 (default location and radius).
getty_df = scrap(hashtag="Getty Fire", since="2019-10-15", until="2019-10-28", save_csv=False)

In [52]:
# Persist the scraped "Getty Fire" tweets without the row index.
getty_df.to_csv("./tweets/gettyfire_103019.csv", index=False)

### Function: label_tweet()
- Show each observation in the `tweet_col` column
- For each observation, prompt for a label (1 = disaster, 0 = no disaster)
- Store the entered values in an additional column `label`
- Enter **stop** to end the function early. This assigns the value 999 to all remaining unlabeled rows.
____
**Parameters**
- `dataframe`: *dataframe* | dataframe with text column to review and label.
- `tweet_col`: *string* | name of the text column to review

In [57]:
def label_tweet(dataframe, tweet_col = 'text'):
    """
    Interactively label every row of a dataframe.

    Prints each value of the `tweet_col` column in row order and stores the
    answer typed at the prompt. Entering "stop" ends the session early and
    fills the current row and all rows after it with 999.

    Parameters:
    dataframe: dataframe/ data to label; a 'label' column is added in place
    tweet_col: string/ name of the text column to show

    Returns:
    The same dataframe object with the new 'label' column. Answers are kept
    exactly as typed (strings); the fill value 999 is an integer.
    """
    total = len(dataframe)
    label = []

    for i in range(total):
        print("")
        print("")
        print(f"{i+1}/{total}")
        # Positional lookup: `i` counts rows, it is not an index label, so
        # this also works for frames whose index is not 0..n-1.
        print(dataframe[tweet_col].iloc[i])
        print("")
        print("")
        ans = input("disaster: 1, no disaster: 0 ")
        if ans == "stop":
            # Mark the current row and everything after it as unlabeled.
            label.extend([999] * (total - i))
            print("Stopped")
            break
        label.append(ans)

    dataframe['label'] = label

    return dataframe

---

2728 tweets were scraped using the GetOldTweets3 scraper function, **scrap()**, and manually labeled using the **label_tweet()** function. This labeled data was used as training data for EDA and modeling, and was saved as 'train_w_label.csv' in the data directory.

In [3]:
# Load the manually labeled tweets; as the cell's last expression, the frame is displayed.
pd.read_csv('../data/train_w_label.csv')

Unnamed: 0,text,hashtags,user_name,date,user_location,label
0,Congrats pre,,MasterAR9012502,2019-10-24 15:05:01+00:00,,0
1,Wassup?,,theprejon,2019-10-21 20:54:33+00:00,,0
2,"Ini mbc korea yang komen orang indo semueh,, b...",,KhoyrR,2019-10-24 14:19:01+00:00,,0
3,Meron pre?,,Zendriccccc,2019-10-26 13:00:29+00:00,,0
4,this is their pre-p3p5 fam love story thanks,,akihikologist,2019-10-21 02:44:47+00:00,,0
...,...,...,...,...,...,...
2723,What does North of Soledad Cyn Rd mean? Maybe ...,#TICKFIRE #lacofdpio,NormadyThomas,2019-10-25 04:02:37+00:00,,1
2724,Closing freeway to traffic coming north in cer...,#KincadeFire #evacuate,clairefelicite2,2019-10-26 23:20:44+00:00,,1
2725,Nuove evacuazioni I saw this on the BBC and th...,#Californiawildfires,paoloigna1,2019-10-26 22:20:48+00:00,,0
2726,Stay safe everyone and be careful goodnight #T...,#TickFire,jessica2sweet,2019-10-25 05:53:44+00:00,,0
