In [2]:
import glob
import json
import math
import os
import time
from datetime import datetime

import pandas as pd
import twitter

### 1. Authorization

- **Step 1** Save your credentials in a JSON file with the following structure:

```
{"consumer_key":        "CONSUMER_KEY",
 "consumer_key_secret": "CONSUMER_SECRET",
 "access_token":        "ACCESS_TOKEN",
 "access_token_secret": "ACCESS_TOKEN_SECRET"}
```

- **Step 2** Name the JSON file "creds.json" and save it in the same directory as this Jupyter notebook.

In [6]:
# Load the Twitter API credentials from creds.json. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open('./creds.json', 'r') as creds_file:
    twitter_keys = json.load(creds_file)

In [8]:
# Build the python-twitter client from the credentials in `twitter_keys`.
# tweet_mode='extended' is needed to get the full_text of each tweet.
api = twitter.Api(
    consumer_key=twitter_keys['consumer_key'],
    consumer_secret=twitter_keys['consumer_key_secret'],
    access_token_key=twitter_keys['access_token'],
    access_token_secret=twitter_keys['access_token_secret'],
    tweet_mode='extended',
)

### 2. Function: get_tweets()
- Search for tweets with `keyword` and `geo_list`
- Saves the result for each location as a csv file in the `save_dir` folder
____
**Parameters**
- `keyword`: *string* | keyword to search for
- `geo_list`: *list of string* | *"latitude,longitude,radius"* (e.g. `"34.098191,-118.478717,1km"`) | list of geocodes to search around
- `save_dir`: *string* | folder to save the csv files in (default: `'geo_tweets'`)

In [105]:
def get_tweets(keyword, geo_list, save_dir = 'geo_tweets'):
    """Search tweets for `keyword` around each geocode and save one csv per geocode.

    Parameters
    ----------
    keyword : str
        Term to search for.
    geo_list : list of str
        Geocodes formatted as "latitude,longitude,radius"
        (e.g. "34.098191,-118.478717,1km"). One search is run per entry.
    save_dir : str
        Folder that receives the csv files; it is created when missing.

    Notes
    -----
    Each search result is written to
    ``<save_dir>/<keyword>_<position>_<mmddyy>.csv`` where ``position`` is the
    1-based index of the geocode in `geo_list`. The function uses the
    module-level ``api`` client and returns nothing.
    """
    columns = ["created_at", "user_id", "user_location", "user_name",
               "location_type", "location", "coordinate",
               "hashtags", "text", "re_text"]

    # Make sure the output folder exists so to_csv does not fail on a fresh checkout
    out_dir = os.path.join(".", save_dir)
    os.makedirs(out_dir, exist_ok=True)

    for prog, geo in enumerate(geo_list, start=1):

        # Search for tweets ('en' is the ISO 639-1 code the search API expects)
        tweets = api.GetSearch(term = keyword,
                               geocode = geo,
                               lang='en')

        # Grab result from the search, one record per tweet
        rows = []
        for tweet in tweets:
            # `place` (and its bounding box) can be missing; fall back to empty
            # dicts so such tweets are kept with blank location fields
            place = tweet.place or {}
            bounding_box = place.get("bounding_box") or {}
            retweet = tweet.retweeted_status

            rows.append({
                "created_at": tweet.created_at,
                # NOTE(review): this stores the tweet (status) id, not the
                # author's id. Left unchanged so previously saved csv files
                # stay comparable -- confirm before renaming or replacing it.
                "user_id": tweet.id,
                "user_location": tweet.user.location,
                "user_name": tweet.user.screen_name,
                "location_type": place.get("place_type"),
                "location": place.get("full_name"),
                "coordinate": bounding_box.get("coordinates"),
                "hashtags": [hg.text for hg in (tweet.hashtags or [])],
                "text": tweet.full_text,
                "re_text": retweet.full_text if retweet else "",
            })

        # Save the result in a dataframe (explicit columns keep the header
        # even when the search returned no tweets)
        result = pd.DataFrame(rows, columns=columns)

        # Save the dataframe as csv
        now = datetime.now()
        file_name = keyword + "_" + str(prog) + now.strftime("_%m%d%y") + ".csv"
        result.to_csv(os.path.join(out_dir, file_name), index=False)

        # Notify the progress
        print(f"{prog}/{len(geo_list)} finished")

        # Wait 5 seconds to avoid reaching rate limit
        time.sleep(5)

### 3. Function: get_coord()
- Return a list of geographical coordinates around `latitude` and `longitude` as the origin.
- The size of the grid is 15 km by 15 km and each geographical coordinate point is 1km apart.
____
**Parameters**
- `latitude`: *float* or *integer* | latitude of the origin to grab the geographical coordinates surrounding it.
- `longitude`: *float* or *integer* | longitude of the origin to grab the geographical coordinates surrounding it.

In [1]:
def get_coord(latitude, longitude):
    """Return a 15 x 15 grid of geocode strings centred on the given origin.

    Parameters
    ----------
    latitude : float or int
        Latitude of the origin in degrees; must lie strictly between -90 and 90.
    longitude : float or int
        Longitude of the origin in degrees.

    Returns
    -------
    list of str
        225 strings of the form "latitude,longitude,1km". Points are 1 km apart
        in both directions; the origin itself is the first element.

    Raises
    ------
    ValueError
        If `latitude` is not strictly between -90 and 90.
    """
    if not -90 < latitude < 90:
        raise ValueError(f"latitude must be between -90 and 90 (exclusive), got {latitude}")

    # One kilometre expressed in degrees of latitude (about 1 / 111.32)
    lat_step = .008983
    # A degree of longitude shrinks with cos(latitude), so 1 km spans more
    # degrees of longitude the further the origin is from the equator
    long_step = lat_step / math.cos(math.radians(latitude))

    #creating a 15x15 grid (225 points!): offsets 0..7 km, then -1..-7 km
    n_vals = list(range(0, 8)) + [-i for i in range(1, 8)]

    #for twitter scrape geo coords have to be of the form "latitude,longitude,1km"
    longs = [str(round(longitude + long_step * n, 6)) + ",1km" for n in n_vals]
    lats = [str(round(latitude + lat_step * n, 6)) + "," for n in n_vals]

    return [lat+long for lat in lats for long in longs ]

### Test run

In [2]:
# Geographical coordinates surrounding "34.098191,-118.478717"
geo_list = get_coord(34.098191,-118.478717)

In [106]:
# Test run: search "fire" at the same geocode twice, which writes one csv per list entry
get_tweets("fire", ["34.098191,-118.478717,1km", "34.098191,-118.478717,1km"]);

1/2 finished
2/2 finished


### 4. Function: merge_by_keyword()
- Merge all csv files generated from **get_tweets()** by keyword.
- Save the merged files into a dataframe.
- If `save_csv` is True, it also saves the dataframe in a csv file.
- Note: Create a directory in 'geo_tweets' and name it 'merged'.
---
**Parameters**
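- `keyword`: *string* | keyword whose csv files (generated by **get_tweets()**) are merged
- `save_csv`: *boolean* | if True, it saves the merged dataframe as a csv file in 'geo_tweets/merged' (default: False)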


In [49]:
def merge_by_keyword(keyword, save_csv = False, save_dir = 'geo_tweets'):
    """Merge every csv written by get_tweets() for `keyword` into one dataframe.

    Parameters
    ----------
    keyword : str
        Keyword whose files ``<save_dir>/<keyword>_*.csv`` are merged.
    save_csv : bool
        If True, also write the merged dataframe to
        ``<save_dir>/merged/<keyword>_<mmddyy>.csv`` (the folder is created
        when missing).
    save_dir : str
        Folder that holds the per-location csv files.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, concatenated in sorted file-name order
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no csv file matches `keyword`.
    """
    # Escape glob metacharacters in the folder/keyword and require the "_"
    # separator so that e.g. "fire" does not also pick up "fireworks_..." files
    pattern = glob.escape(os.path.join(".", save_dir, keyword)) + "_*.csv"
    # glob returns files in arbitrary order; sort for a reproducible result
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        raise ValueError(f"No csv files found for keyword {keyword!r} (pattern: {pattern})")

    merged_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

    if save_csv:
        merged_dir = os.path.join(".", save_dir, "merged")
        os.makedirs(merged_dir, exist_ok=True)
        now = datetime.now()
        merged_df.to_csv(os.path.join(merged_dir, keyword + now.strftime("_%m%d%y") + ".csv"), index=False)
    return merged_df

### Test run

In [53]:
# Merge all "fire" csv files and also save the merged result as a csv
df = merge_by_keyword("fire", save_csv= True)

In [54]:
# Display the merged dataframe
df

Unnamed: 0,created_at,user_id,user_location,user_name,location_type,location,coordinate,hashtags,text
0,Mon Oct 28 19:17:07 +0000 2019,1188897501278900224,Los Angeles,abc7robhayes,neighborhood,"Brentwood, Los Angeles","[[[-118.528736, 34.041356], [-118.457499, 34.0...",[],Before/After the Getty Fire. \n\nThis is 1510 ...
1,Wed Oct 30 07:50:37 +0000 2019,1189449512961495040,"Los Angeles, CA",jintakhan,poi,Mount St. Mary's College Campus Center,"[[[-118.48168318584761, 34.08396202249944], [-...",[],Up at Mount St. Mary’s University. The wind is...
2,Tue Oct 29 16:31:26 +0000 2019,1189218195116158976,santa monica . maine . boston,runawaykat,neighborhood,"Brentwood, Los Angeles","[[[-118.528736, 34.041356], [-118.457499, 34.0...",[],"los angeles renters, i just asked my leasing o..."


The following parameters were used in the `get_tweets()` function.
- `Keyword` used in query: 
     - fire, tick fire, wild fire, california fire, los angeles fire, la fire, getty fire, easy fire, maria fire

- `geo_list` used in query:
     - maria_coord = get_coord(34.342314, -118.991682)
     - tick_coord = get_coord(34.47, -118.37)
     - getty_coord = get_coord(34.098191, -118.478717)

As a result, 90 unique tweets were scraped.

The coordinates returned by the Python Twitter scraper were a list of four coordinate points that formed a boundary. A random coordinate point (latitude and longitude) within that boundary was selected and assigned to the 'lat' and 'long' columns.

The result was saved as `tweets_w_geo_python_twitter.csv`.
- See `tweets_w_geo_python_twitter.csv` below.

In [16]:
# Preview the first rows of the saved geotagged tweets
pd.read_csv('../data/tweets_w_geo_python_twitter.csv').head()

Unnamed: 0,text,lat,long,user_name,location,user_location
0,Before/After the Getty Fire. \n\nThis is 1510 ...,34.09635,-118.502207,abc7robhayes,"Brentwood, Los Angeles",Los Angeles
1,Up at Mount St. Mary’s University. The wind is...,34.083962,-118.481683,jintakhan,Mount St. Mary's College Campus Center,"Los Angeles, CA"
2,"los angeles renters, i just asked my leasing o...",34.070677,-118.486314,runawaykat,"Brentwood, Los Angeles",santa monica . maine . boston
3,Horrible situation on #tigertail right now in ...,34.076485,-118.488554,abc7JoshHaskell,Crestwood Hills Recreation Center,"Los Angeles, CA"
4,Good morning from near the fire lines. We are ...,34.081526,-118.506668,ChristinaKTLA,Mandeville Canyon,Los Angeles


Due to time constraints, we were unable to scrape more tweets with this scraper. Therefore, we merged `tweets_w_geo_python_twitter.csv` with disaster-related tweets with geo-coordinates that another team had previously scraped. To see that team's work, visit the link: https://github.com/csinatra/Twitter-Disaster-Repo.

- Previous team data (github username: 'csinatra')

In [12]:
# Preview the previous team's data, dropping the 'Unnamed: 0' column
pd.read_csv('../data/tweets_w_geo_previous_team.csv').head().drop('Unnamed: 0', axis=1)

Unnamed: 0,tweet,label,lat,long
0,@jackshope They are airlifting our crew into C...,off-topic,34.082321,-117.335853
1,"#Skywire was boring, what was exciting was whe...",off-topic,34.122635,-117.243592
2,@WBrettWilson @TarzanDan whenever works for yo...,off-topic,34.092197,-117.266696
3,@joshclassenCTV you are dead wrong sir there i...,on-topic,34.049241,-117.378255
4,"I'm at Canada Olympic Park (Calgary, AB) http:...",off-topic,33.70454,-117.192101
