## Obtain tweets geotagged to Prospect Park starting January 2020

In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import tweepy
import time

# # Twitter API credentials
# from auth import (
#     api_key,
#     api_secret_key,)

# # Auths
# auth = tweepy.AppAuthHandler(api_key, api_secret_key)
# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Write dict of strings defining non-overlapping circles that "partition" Prospect Park

In [2]:
# Tweepy's search function only accepts circular search areas, so to collect
# all and only the tweets published from within Prospect Park we cover the
# park with 26 disjoint circles. (This is not a true partition: the small
# gaps between neighbouring circles are left out.)

# centers[i] is the "latitude,longitude" string for the center of circle i.
centers = dict(enumerate([
    "40.671104,-73.968465",
    "40.671850,-73.965368",
    "40.670769,-73.965293",
    "40.669673,-73.971451",
    "40.669288,-73.965786",
    "40.668409,-73.963533",
    "40.664121,-73.969430",
    "40.666716,-73.962611",
    "40.665301,-73.962074",
    "40.664031,-73.962010",
    "40.663495,-73.976905",
    "40.661436,-73.977198",
    "40.660841,-73.963457",
    "40.659645,-73.974967",
    "40.659025,-73.964253",
    "40.659100,-73.966612",
    "40.658856,-73.971547",
    "40.658343,-73.973121",
    "40.655710,-73.968853",
    "40.657098,-73.973500",
    "40.656878,-73.963422",
    "40.655183,-73.963351",
    "40.652970,-73.972335",
    "40.652125,-73.971362",
    "40.651342,-73.967057",
    "40.650105,-73.970340",
]))

# radii[i] is the radius of circle i, written as a string with a "km" suffix.
# Entry i here pairs with entry i of the centers dict.
radii = dict(enumerate([
    "0.21997km",
    "0.04828km",
    "0.05169km",
    "0.07364km",
    "0.08708km",
    "0.12566km",
    "0.56544km",
    "0.08047km",
    "0.07141km",
    "0.06437km",
    "0.06960km",
    "0.16093km",
    "0.05713km",
    "0.12107km",
    "0.15429km",
    "0.04103km",
    "0.04444km",
    "0.09251km",
    "0.37845km",
    "0.04466km",
    "0.09656km",
    "0.08569km",
    "0.04828km",
    "0.08047km",
    "0.14484km",
    "0.13708km",
]))

### Create and save a single master dataframe consisting of all the tweets 
### from Prospect Park since January 1, 2020

In [3]:
import GetOldTweets3 as got
import datetime as dt
import numpy as np

In [4]:
# Today's local date formatted as "YYYY-MM-DD"
today = dt.date.today().isoformat()

In [5]:
# criteria[i] holds the search parameters (date window, center, radius) that
# select the tweets published within circle i.
# NOTE(review): the earliest rows collected are dated 2019-12-31 in US/Eastern,
# so the since-date is presumably interpreted in UTC — confirm.

criteria = {}

for i in range(26):
    criteria[i] = (
        got.manager.TweetCriteria()
        .setSince("2020-01-01")
        .setUntil(today)
        .setNear(centers[i])
        .setWithin(radii[i])
    )

In [6]:
# tweets maps each circle index i to the list of tweets published within
# circle i since January 1, 2020.

tweets = {}

# Issue one request per circle, using the criteria recorded for that circle
for i, circle_criteria in criteria.items():
    tweets[i] = got.manager.TweetManager.getTweets(circle_criteria)

In [61]:
# Flatten the per-circle tweet lists into three parallel columns

# Tweet dates are UTC by default; we convert them to US Eastern time
import pytz
est = pytz.timezone('US/Eastern')

# Column accumulators, filled in the same order
dates = []
locations = []
texts = []

for i in range(26):
    for tweet in tweets[i]:
        # For more human-parseable dates, add: .strftime("%Y-%m-%d %H:%M:%S")
        dates.append(tweet.date.astimezone(tz=est))
        locations.append(tweet.geo)
        texts.append(tweet.text)

In [62]:
# Assemble the collected columns into one dataframe: df
data = dict(date=dates, location=locations, text=texts)
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])

# Checkpoint the untouched frame before deriving anything from it
df.to_pickle("./park_tweets_raw.pkl")

# Order the rows chronologically through a date index, while keeping
# date available as an ordinary column for easy manipulation
df = df.set_index('date', drop=False).sort_index()

# Derived calendar columns: weekday name and hour of day
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour

In [63]:
# Summarize df: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1023 entries, 2019-12-31 22:00:04-05:00 to 2020-04-07 15:43:24-04:00
Data columns (total 5 columns):
date           1023 non-null datetime64[ns, US/Eastern]
location       1023 non-null object
text           1023 non-null object
day_of_week    1023 non-null object
hour           1023 non-null int64
dtypes: datetime64[ns, US/Eastern](1), int64(1), object(3)
memory usage: 88.0+ KB


In [64]:
# Preview the first rows of df (the earliest tweets, since df is sorted by date)
df.head()

Unnamed: 0_level_0,date,location,text,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-12-31 22:00:04-05:00,2019-12-31 22:00:04-05:00,,人还很少 #NYE2020 – at Prospect Park,Tuesday,22
2019-12-31 23:00:44-05:00,2019-12-31 23:00:44-05:00,,Baby 出现了，家庭友好的新年庆祝 #NYE2020 #brooklyn #NYC – a...,Tuesday,23
2019-12-31 23:39:12-05:00,2019-12-31 23:39:12-05:00,,Flirty Leo & Aquarius – at Grand Army Plaza Gr...,Tuesday,23
2019-12-31 23:51:00-05:00,2019-12-31 23:51:00-05:00,,"""New Year in Space"" Illo for the Washington Po...",Tuesday,23
2019-12-31 23:52:13-05:00,2019-12-31 23:52:13-05:00,,"""New Year Countdown"" Illo for NEW YORK PRESS D...",Tuesday,23


In [65]:
# Preview the last rows of df (the most recent tweets, since df is sorted by date)
df.tail()

Unnamed: 0_level_0,date,location,text,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-07 13:40:27-04:00,2020-04-07 13:40:27-04:00,,Just posted a photo @Prospect Park https://www...,Tuesday,13
2020-04-07 14:09:05-04:00,2020-04-07 14:09:05-04:00,,"Greetings from Brooklyn, miss you all! Stay sa...",Tuesday,14
2020-04-07 14:45:51-04:00,2020-04-07 14:45:51-04:00,,tuesday jules – at Prospect Park,Tuesday,14
2020-04-07 15:41:49-04:00,2020-04-07 15:41:49-04:00,,@darakass hi dr. Kass. I too am a Brooklyn res...,Tuesday,15
2020-04-07 15:43:24-04:00,2020-04-07 15:43:24-04:00,,Work * at NYU Langone. And have seen the devas...,Tuesday,15


In [66]:
# The location column appears to be empty; count its distinct values
# (missing values included) to confirm.
df["location"].nunique(dropna=False)

1

In [67]:
# Get rid of the empty location column.
# NOTE: We know anyway that these tweets were published from
# within Prospect Park.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# running it again is a no-op instead of raising a KeyError.
df = df.drop(columns=['location'], errors='ignore')

In [68]:
# Persist the cleaned dataframe as a pickle file
df.to_pickle("./park_tweets.pkl")

In [35]:
# Reload the dataframe from ./park_tweets.pkl when starting a new session.
# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself.
df = pd.read_pickle("./park_tweets.pkl")