## Obtain tweets geotagged to Prospect Park starting January 2020

In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import tweepy
import time

# # Twitter API credentials
# from auth import (
#     api_secret_key,)

# # Auths
# auth = tweepy.AppAuthHandler(api_key, api_secret_key)
# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Write dict of strings defining non-overlapping circles that "partition" Prospect Park

In [2]:
# Tweepy's search function only allows you to search circular areas. 
# So to obtain all and only the tweets published from within Prospect Park, 
# we "partition" the park into many disjoint circles, not including
# Grand Army Plaza greenmarket, the Brooklyn Botanical Garden, or the Parade
# Grounds. (Not a true partition since the tiny areas between the circles 
# are left out.)

import re
import numpy as np

# We use the tool at the following URL to partition the park. 
# Note that the centers and radii of our partition circles can conveniently 
# be obtained from the URL! 
url = "https://www.mapdevelopers.com/draw-circle-tool.php?circles=%5B%5B502.02%2C40.664381%2C-73.9699879%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B362.14%2C40.655873%2C-73.968789%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B154.29%2C40.6590246%2C-73.9642525%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B160.93%2C40.6614359%2C-73.9771982%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B112.92%2C40.6597918%2C-73.9748271%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B92.51%2C40.6583267%2C-73.9731641%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B100.18%2C40.6568699%2C-73.9633579%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B79.35%2C40.6552887%2C-73.9628255%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B98.59%2C40.6521246%2C-73.9713724%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B131.2%2C40.6709736%2C-73.9697845%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B83.46%2C40.6694346%2C-73.9682861%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B88.13%2C40.6695266%2C-73.9715369%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B69.6%2C40.6634627%2C-73.9766691%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B82.06%2C40.6592781%2C-73.971593%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B69.6%2C40.6632254%2C-73.9633747%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B64.37%2C40.6620616%2C-73.9640184%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B64.37%2C40.6610035%2C-73.9646407%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.07%2C40.6607268%2C-73.9633962%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B54.44%2C40.6545443%2C-73.9641562%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B41.03%2C40.6523385%2C-73.9697245%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B53.72%2C40.6532772%2C-73.9723029%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6570506%2C-73.9733757%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6599112%2C-73.9729307%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B42.84%2C40.6595286%2C-73.9690469%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B39.22%2C40.6644199%2C-73.9764176%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6554667%2C-73.9643161%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6592242%2C-73.9666618%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6693261%2C-73.969892%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B42.84%2C40.6688134%2C-73.972778%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B45.56%2C40.6687565%2C-73.9670274%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B29.26%2C40.6704409%2C-73.9680252%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B24.03%2C40.6705549%2C-73.9715228%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B32.19%2C40.6640906%2C-73.9636425%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B21.32%2C40.6648348%2C-73.9638249%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B19.54%2C40.6547593%2C-73.9730587%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B16.09%2C40.6575064%2C-73.9738755%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B26.06%2C40.6596598%2C-73.976497%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B22.22%2C40.6611419%2C-73.9793515%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B16.09%2C40.6633706%2C-73.9777464%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B18.81%2C40.6654976%2C-73.9760047%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B23.34%2C40.6681528%2C-73.9737409%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B21.53%2C40.6540927%2C-73.9648659%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B16.09%2C40.6526519%2C-73.972537%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B34.69%2C40.6604114%2C-73.9655446%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B42.15%2C40.6594319%2C-73.9700899%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B48.28%2C40.6596598%2C-73.9679227%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B39.22%2C40.6611899%2C-73.9748213%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B27.2%2C40.6574391%2C-73.9646728%2C%22%23AAAAAA%22%2C%22%23000000%22%2C0.4%5D%2C%5B21.31%2C40.6578827%2C-73.9628221%2C%22%23AAAAAA%22%"

centers = re.findall("%5B\d+\.\d+%2C(\d+\.\d+)%2C(-\d+\.\d+)%2C",url)
radii = re.findall("%5B(\d+\.\d+)%2C\d+\.\d+%2C-\d+\.\d+%2C",url)

for i in range(0,len(radii)):
    radii[i] = str(np.round(float(radii[i])/1000,5)) + 'km'

### Create and save a single master dataframe consisting of all the tweets 
### from Prospect Park since March 1, 2020

In [3]:
import GetOldTweets3 as got
import datetime as dt
import numpy as np

In [4]:
today = dt.datetime.now().strftime("%Y-%m-%d")

In [5]:
# Store the properties that we use to specify which tweets we are seeking as our request's tweet criteria

criteria = {}

# For each circle i, record tweet criteria for requesting all tweets published within circle i
for i in range(0,26):
    
    criteria[i] = got.manager.TweetCriteria().setSince("2020-01-01")\
                                               .setUntil(today)\
                                               .setNear(centers[i][0]+','+centers[i][1])\
                                               .setWithin(radii[i])

In [6]:
# Initialize an empty dict: tweets. For each circle i, tweets[i] will contain all 
# tweets published within circle i since January 1, 2020.

tweets = {}

# Request the tweets and store them in a list: tweets[i]
for i in range(0,26):
    tweets[i] = got.manager.TweetManager.getTweets(criteria[i])

In [7]:
# Build a dataframe out of all of our tweets

# We will need to convert the timezone to EST (it is UTC by default)
import pytz
est = pytz.timezone('US/Eastern')

# Initialize empty columns
dates = []
locations = []
texts = []
users = []

for i in range(0,26):
    for j in range(0,len(tweets[i])):
            dates.append(tweets[i][j].date.astimezone(tz= est))
            # If we want dates to be more human parseable, add: .strftime("%Y-%m-%d %H:%M:%S")
            locations.append(tweets[i][j].geo)
            texts.append(tweets[i][j].text)
            users.append(tweets[i][j].username)

In [16]:
# Build a dataframe out of all of our tweets: df
data = {'date':dates, 'location':locations, 'text':texts, 'username':users}
df = pd.DataFrame(data=data)
df.date = pd.to_datetime(df.date)

# Save what we have so far before we do anything to it
df.to_pickle(path="./park_tweets_raw.pkl")

# Sort the data by time, but leave date as a column for easy manipulation
df = df.set_index('date',drop=False)
df = df.sort_index()

# Create a column for day of the week
df['day_of_week'] = df.date.dt.day_name()

# Create a column for the hour
df['hour'] = df.date.dt.hour

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 840 entries, 2019-12-31 22:00:04-05:00 to 2020-04-09 14:28:46-04:00
Data columns (total 6 columns):
date           840 non-null datetime64[ns, US/Eastern]
location       840 non-null object
text           840 non-null object
username       840 non-null object
day_of_week    840 non-null object
hour           840 non-null int64
dtypes: datetime64[ns, US/Eastern](1), int64(1), object(4)
memory usage: 85.9+ KB


In [18]:
df.head()

Unnamed: 0_level_0,date,location,text,username,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-31 22:00:04-05:00,2019-12-31 22:00:04-05:00,,人还很少 #NYE2020 – at Prospect Park,waynesun09,Tuesday,22
2019-12-31 23:00:44-05:00,2019-12-31 23:00:44-05:00,,Baby 出现了，家庭友好的新年庆祝 #NYE2020 #brooklyn #NYC – a...,waynesun09,Tuesday,23
2019-12-31 23:51:00-05:00,2019-12-31 23:51:00-05:00,,"""New Year in Space"" Illo for the Washington Po...",dannyhellman,Tuesday,23
2019-12-31 23:52:13-05:00,2019-12-31 23:52:13-05:00,,"""New Year Countdown"" Illo for NEW YORK PRESS D...",dannyhellman,Tuesday,23
2020-01-01 00:11:44-05:00,2020-01-01 00:11:44-05:00,,2019 再见，2020 新的十年开始。19年还是不错，追求不多所以收获也没多少，只是简单的...,waynesun09,Wednesday,0


In [19]:
df.tail()

Unnamed: 0_level_0,date,location,text,username,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-09 00:30:46-04:00,2020-04-09 00:30:46-04:00,,Keep your distance methane boy @Prospect Park ...,JonezyKarona,Thursday,0
2020-04-09 07:18:41-04:00,2020-04-09 07:18:41-04:00,,@BestBuy @BestBuySupport if our store is open ...,fatespeakstruth,Thursday,7
2020-04-09 08:57:37-04:00,2020-04-09 08:57:37-04:00,,Are you blind or just insane? Social distancin...,Kuch516,Thursday,8
2020-04-09 12:11:57-04:00,2020-04-09 12:11:57-04:00,,Eastern phoebe in @prospect_park. Day 15 of po...,allieri,Thursday,12
2020-04-09 14:28:46-04:00,2020-04-09 14:28:46-04:00,,I'm on the roof #fuk #that Still #zonedout @bw...,joelmylesjr,Thursday,14


In [20]:
# Looks like the location column is empty. Double check.
len(pd.unique(df.location))

1

In [21]:
# Get rid of the empty column. 
# NOTE: We know anyway that these tweets were published from 
# within Prospect Park. 
df = df.drop(['location'], axis=1)

In [22]:
# Save df to pkl
df.to_pickle(path="./park_tweets.pkl")

In [23]:
# Reload for new session
df = pd.read_pickle("./park_tweets.pkl")