# Data Collection

In [1]:
!pip install python-twitter



In [2]:
!pip install psycopg2-binary



In [3]:
import json
import os
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import psycopg2 as pg2
import twitter
from psycopg2.extras import RealDictCursor, Json

In [4]:
# Executes the external script sql_test.py from within the notebook.
# NOTE(review): the script is not part of this notebook — confirm which names,
# if any, the cells below rely on from it.
%run sql_test.py

In [5]:
# Connection settings for the Postgres server. Each value can be overridden
# with the matching libpq environment variable; the literals are fallbacks, so
# the notebook behaves exactly as before when nothing is set.
# NOTE(review): the fallback password is still stored in plain text — rotate it
# and drop the literal once the environment variables are in place.
IP_ADDRESS = os.environ.get('PGHOST', '34.220.55.180')
DBNAME = os.environ.get('PGDATABASE', 'postgres')
USER = os.environ.get('PGUSER', 'postgres')
PASSWORD = os.environ.get('PGPASSWORD', 'foobar1')

## Configure Postgres Server with Docker

Define functions to programmatically connect to and insert data into database:
-  **con_cur_to_db**: returns both a connection and a cursor object for database
-  **execute_query**: executes query directly to database, without having to create a cursor and connection each time
-  **insert_entry_json**: inserts data into database

In [6]:
def con_cur_to_db(dbname=DBNAME, dict_cur=None):
    '''
    Open a new Postgres connection and create a cursor on it.

    Parameters
    ----------
    dbname : str
        Database to connect to; defaults to DBNAME.
    dict_cur : optional
        When truthy, the cursor is created with RealDictCursor as its
        cursor factory; otherwise a default cursor is created.

    Returns
    -------
    tuple
        (connection, cursor). Neither is closed here; that is left to the caller.
    '''
    connection = pg2.connect(
        host=IP_ADDRESS,
        dbname=dbname,
        user=USER,
        password=PASSWORD,
    )
    # Only ask for a cursor factory when a dict cursor was requested.
    cursor_options = {'cursor_factory': RealDictCursor} if dict_cur else {}
    return connection, connection.cursor(**cursor_options)
    
def execute_query(query, dbname=DBNAME, dict_cur=None, command=False):
    '''
    Run one SQL statement on a fresh connection and always close it afterwards.

    Parameters
    ----------
    query : str
        SQL text to execute.
    dbname : str
        Database to connect to; defaults to DBNAME.
    dict_cur : optional
        Forwarded to con_cur_to_db to choose the cursor type.
    command : bool
        False (default): the statement is expected to produce rows, which are
        fetched and returned. True: the statement is committed and nothing is
        returned.

    Returns
    -------
    list or None
        The result of cur.fetchall() when `command` is falsy, otherwise None.
    '''
    con, cur = con_cur_to_db(dbname, dict_cur)
    try:
        cur.execute(query)
        if not command:
            return cur.fetchall()
        con.commit()  # send the command's changes to the server
    finally:
        # Close on every path, including when execute/fetchall raises, so a
        # failing query does not leave the connection open.
        con.close()

def insert_entry_json(data, tablename=None):
    '''
    Insert every item of `data` as one row into the `data` column of `tablename`.

    Parameters
    ----------
    data : iterable
        Items to store; each one is wrapped in psycopg2's Json adapter.
    tablename : str
        Target table. It is formatted into the statement text because a table
        name cannot be sent as a bound parameter, so it must come from trusted code.

    All rows are committed together after the last insert. The connection is
    closed whether or not the inserts succeed.
    '''
    con, cur = con_cur_to_db()
    insert_sql = f'INSERT INTO {tablename} (data) VALUES (%s);'
    try:
        for entry in data:
            # Bind the JSON value as a query parameter so the driver quotes it
            # with the live connection instead of splicing it into the SQL text.
            cur.execute(insert_sql, (Json(entry),))
        con.commit()
    finally:
        con.close()

## Application Token

Define API keys and instantiate twitter API

In [7]:
# Twitter application credentials. Each entry can be overridden through the
# environment variable named next to it; the literals are fallbacks, so the
# notebook behaves exactly as before when nothing is set.
# NOTE(review): the fallback secrets are still stored in plain text — revoke
# them and drop the literals once the environment variables are in place.
twitter_keys = {
    'consumer_key':        os.environ.get('TWITTER_CONSUMER_KEY', 'WuBAkr5TGQmgadzpHmOeSzPWk'),
    'consumer_secret':     os.environ.get('TWITTER_CONSUMER_SECRET', 'pfim3bjV2X6ONw1Xf7qktrgLZ54gCZku7e2BcjT61Fz5SKCvUz'),
    'access_token_key':    os.environ.get('TWITTER_ACCESS_TOKEN_KEY', '1080999232427909120-pWDWD3VwbiYwlfCIo05cKCLXmKNooH'),
    'access_token_secret': os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'HYwBAbszupAT56B6giElUv2IVsNRBx5scB3LvdseFMOPP'),
}

# Client object used by every API call in this notebook.
api = twitter.Api(
    consumer_key=twitter_keys['consumer_key'],
    consumer_secret=twitter_keys['consumer_secret'],
    access_token_key=twitter_keys['access_token_key'],
    access_token_secret=twitter_keys['access_token_secret'],
)

In [8]:
# Ask the API to verify the configured credentials and print the response.
# NOTE(review): the saved output of this cell identifies the account — clear it
# before sharing the notebook.
print(api.VerifyCredentials())

{"created_at": "Fri Jan 04 01:27:55 +0000 2019", "default_profile": true, "default_profile_image": true, "geo_enabled": true, "id": 1080999232427909120, "id_str": "1080999232427909120", "lang": "en", "name": "connie", "profile_background_color": "F5F8FA", "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png", "profile_image_url_https": "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png", "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "screen_name": "connie99418347"}


## Collect Tweets

Collect tweets and store them in the database:
-  `geocode`: geolocation within which to search for tweets
-  `term`: term to search by
-  `result_type`: type of results returned (mixed, recent or popular); `streamTweets` does not take this argument
-  `since`: search for tweets since the specified date
-  `count`: number of results returned (100 max)
-  `sql_db`: table to save tweets to

In [9]:
def streamTweets(term, geocode, since, count, sql_db='raw_tweets'):
    '''
    Search one week of tweets, one day at a time, and store the raw results.

    Starting at `since`, seven consecutive one-day windows are searched with
    api.GetSearch. The `statuses` of every response are handed to
    insert_entry_json.

    Parameters
    ----------
    term : str
        Search term.
    geocode : str
        Forwarded unchanged as the search `geocode`.
    since : str
        First day to search, formatted 'YYYY-MM-DD'.
    count : int
        Forwarded as the `count` of every search.
    sql_db : str
        Table the statuses are inserted into.

    Raises
    ------
    ValueError
        If `since` is not a valid 'YYYY-MM-DD' date.
    '''
    day_format = '%Y-%m-%d'
    one_day = timedelta(days=1)
    # Real date arithmetic, so month and year boundaries (e.g. a `since` on the
    # first of a month) are handled.
    window_start = datetime.strptime(since, day_format)

    # NOTE(review): windows run forward from `since`, following the parameter's
    # documented meaning ("tweets since the specified date") — confirm that this
    # is the intended direction.
    for _ in range(7):
        window_end = window_start + one_day
        results = api.GetSearch(
            term=term,
            geocode=geocode,
            since=window_start.strftime(day_format),
            until=window_end.strftime(day_format),
            count=count,
            return_json=True,
        )
        insert_entry_json(data=results['statuses'], tablename=sql_db)
        window_start = window_end

Define a function that runs `streamTweets` in a loop to collect tweets programmatically:
-  Repeat 15 times by default (`repeats`), requesting 100 tweets (`count`) each time
-  Pause for 40 seconds after each run to avoid exceeding the rate limit

In [10]:
def tweet_repeater(term, geocode, since, repeats=15, count=100, sql_db='raw_tweets', pause=40):
    '''
    Call streamTweets repeatedly, pausing after every run.

    Parameters
    ----------
    term, geocode, since, count, sql_db
        Passed straight through to streamTweets on every run.
    repeats : int
        Number of runs.
    pause : int or float
        Seconds to sleep after each run, including the last one, so that
        back-to-back calls of this function stay spaced out as well.

    A progress line is printed after each run and a final line once all runs
    are done.
    '''
    for i in range(repeats):
        streamTweets(term, geocode, since, count, sql_db)
        print(f'Loop {i+1} complete. Raw tweets pushed to {sql_db}.')
        time.sleep(pause)

    print('All tweets pulled.')

In [1]:
# Disaster events of interest, all values stored as strings. Each row appears
# to be [location, disaster type, date (YYYY-MM-DD), latitude, longitude]; the
# first row's date and coordinates match the `since` and `geocode` used in the
# tweet_repeater call below.
# NOTE(review): the last two rows carry identical coordinates — confirm the
# values for sandiego_county_ca.
disaster_list = [
    ['malibu_ca', 'flash_flood/mudslide', '2019-01-06', '34.0249999', '-118.773830238'], 
    ['riverside_ca', 'mudslide', '2019-01-13', '33.9806', '-117.3755'], 
    ['orange_county_ca', 'flash_flood', '2019-01-14', '33.7175', '-117.8311'], 
    ['sandiego_county_ca', 'flash_flood', '2019-01-14', '33.7175', '-117.8311'],
]

Collect the most recent tweets:

-  within a 15-mile radius of the location (the call below uses the Malibu coordinates)
-  run the function 100 times, collecting up to 700 tweets (1 week x 100 tweets) each time
-  save into the `raw_tweets` table

In [None]:
# Collect 'storm' tweets around the Malibu coordinates into raw_tweets.
tweet_repeater(
    term='storm',
    geocode='34.0249999,-118.773830238,15mi',
    since='2019-01-06',
    repeats=100,
    count=100,
    sql_db='raw_tweets',
)

## Retrieve Data from PostgreSQL database

In [None]:
# Fetch the `text` field of every stored tweet from the JSON `data` column.
# dict_cur=True makes execute_query build its cursor with RealDictCursor.
# NOTE(review): the selected expression has no alias, so the resulting column
# is presumably named '?column?' like the coordinates query's — confirm.
query = """SELECT data ->> 'text'
FROM raw_tweets;
"""
response = execute_query(query, dict_cur=True)

# One row per tweet, in the order the query returned them.
df_text = pd.DataFrame(response)

- retrieve text from tweets

In [None]:
# Fetch each tweet's place bounding-box coordinates from the JSON `data` column
# by following the path place -> bounding_box -> coordinates.
# The selected expression has no alias; the cells below read the result from a
# column named '?column?'.
query = """SELECT data#>'{place,bounding_box,coordinates}'
FROM raw_tweets;
"""
response = execute_query(query, dict_cur=True)

# One row per tweet, in the order the query returned them.
df_geo = pd.DataFrame(response)

- retrieve geo coordinates from tweets

In [None]:
# Remove, in place, every row whose coordinates column is missing.
df_geo.dropna(subset=['?column?'], inplace=True)

- drop tweets without coordinates from geo dataframe

In [None]:
# Centre point of each tweet's place bounding box, taken as the midpoint of
# the corners at index 1 and 3 of the first ring.
# Twitter stores every corner as [longitude, latitude], so element 0 of a
# corner is its longitude and element 1 its latitude.
latitude = []
longitude = []

for tweet in df_geo['?column?']:
    inside = tweet[0][1]
    outside = tweet[0][3]
    long = (inside[0] + outside[0]) / 2
    lat = (inside[1] + outside[1]) / 2
    latitude.append(lat)
    longitude.append(long)

- take the midpoint of bounding-box corners 1 and 3 as each tweet's location

In [None]:
# Attach the computed centre points as `lat` and `long` columns.
df_geo = df_geo.assign(lat=latitude, long=longitude)

- add latitude and longitude to dataframe

In [None]:
# Join tweet text with its coordinates on the row index; only index values
# present in both frames are kept.
# NOTE(review): df_text and df_geo come from two separate queries without an
# ORDER BY, so pairing them by position assumes both returned the rows in the
# same order — confirm, or select text and coordinates in a single query.
df = df_text.merge(df_geo[['lat', 'long']], left_index=True, right_index=True)

- merge text dataframe and geo dataframe through the index

In [None]:
# Remove fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)

- drop duplicate tweets