## Obtain tweets geotagged to Prospect Park starting January 2020

In [2]:
# Import necessary modules
import pandas as pd
import numpy as np
import tweepy
import time

# Twitter API credentials
# AppAuthHandler below takes both the API key and the API secret key, so both
# must be imported; previously only `api_secret_key` was imported and `api_key`
# raised NameError on a fresh kernel.
# assumes the local `auth` module exposes `api_key` alongside `api_secret_key` — TODO confirm
from auth import (
    api_key,
    api_secret_key,)

# Auths
# Application-only authentication built from the key/secret pair.
auth = tweepy.AppAuthHandler(api_key, api_secret_key)
# wait_on_rate_limit=True asks Tweepy to wait when a rate limit is hit instead of raising.
# NOTE(review): `wait_on_rate_limit_notify` appears to exist only in Tweepy 3.x —
# confirm the installed version (or pin tweepy<4) before re-running.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Define dicts of strings (centers and radii) for the non-overlapping circles that "partition" Prospect Park

In [3]:
# The search interface used here can only query circular areas, so Prospect Park
# is covered with 26 disjoint circles. This is not a true partition: the small
# gaps between neighbouring circles are left out.
#
# Each entry is (center as "lat,lon", radius with a "km" suffix); the position
# in the list is the circle's id (0..25).
circles = [
    ("40.671104,-73.968465", "0.21997km"),
    ("40.671850,-73.965368", "0.04828km"),
    ("40.670769,-73.965293", "0.05169km"),
    ("40.669673,-73.971451", "0.07364km"),
    ("40.669288,-73.965786", "0.08708km"),
    ("40.668409,-73.963533", "0.12566km"),
    ("40.664121,-73.969430", "0.56544km"),
    ("40.666716,-73.962611", "0.08047km"),
    ("40.665301,-73.962074", "0.07141km"),
    ("40.664031,-73.962010", "0.06437km"),
    ("40.663495,-73.976905", "0.06960km"),
    ("40.661436,-73.977198", "0.16093km"),
    ("40.660841,-73.963457", "0.05713km"),
    ("40.659645,-73.974967", "0.12107km"),
    ("40.659025,-73.964253", "0.15429km"),
    ("40.659100,-73.966612", "0.04103km"),
    ("40.658856,-73.971547", "0.04444km"),
    ("40.658343,-73.973121", "0.09251km"),
    ("40.655710,-73.968853", "0.37845km"),
    ("40.657098,-73.973500", "0.04466km"),
    ("40.656878,-73.963422", "0.09656km"),
    ("40.655183,-73.963351", "0.08569km"),
    ("40.652970,-73.972335", "0.04828km"),
    ("40.652125,-73.971362", "0.08047km"),
    ("40.651342,-73.967057", "0.14484km"),
    ("40.650105,-73.970340", "0.13708km"),
]

# Circle id -> "lat,lon" center string
centers = {circle_id: center for circle_id, (center, _) in enumerate(circles)}

# Circle id -> radius string
radii = {circle_id: radius for circle_id, (_, radius) in enumerate(circles)}

### Create and save a single master dataframe consisting of all the tweets from Prospect Park since January 1, 2020

In [4]:
import GetOldTweets3 as got
import datetime as dt
import numpy as np

In [5]:
# Current local date as a "YYYY-MM-DD" string; used as the upper bound of the search window.
today = dt.date.today().isoformat()

In [6]:
# Search criteria, one entry per circle: criteria[k] describes the request for
# every tweet posted inside circle k between 2020-01-01 and `today`.

criteria = {}

for circle_id in range(26):
    # Build the request step by step: time window first, then the circular area.
    query = got.manager.TweetCriteria()
    query = query.setSince("2020-01-01")
    query = query.setUntil(today)
    query = query.setNear(centers[circle_id])
    query = query.setWithin(radii[circle_id])
    criteria[circle_id] = query

In [7]:
# tweets maps a circle id to the list of tweets returned for that circle's
# criteria, i.e. the tweets published within that circle since January 1, 2020.

tweets = {}

# One request per circle, issued in circle-id order.
fetch_tweets = got.manager.TweetManager.getTweets
for circle_id in range(26):
    tweets[circle_id] = fetch_tweets(criteria[circle_id])

In [28]:
# Flatten the per-circle tweet lists into a single dataframe: df

# Column buffers, filled circle by circle (ids 0..25), tweet by tweet.
dates = []
locations = []
texts = []

for circle_id in range(26):
    for tweet in tweets[circle_id]:
        # Timestamps are written as second-resolution strings here and parsed
        # back into datetimes once the frame exists.
        dates.append(tweet.date.strftime("%Y-%m-%d %H:%M:%S"))
        locations.append(tweet.geo)
        texts.append(tweet.text)

# Assemble the frame in one shot from the column lists
data = {'date': dates, 'location': locations, 'text': texts}
df = pd.DataFrame(data=data)
df['date'] = pd.to_datetime(df['date'])

# Checkpoint the untouched frame before any reshaping
df.to_pickle(path="./park_tweets_raw.pkl")

# Order chronologically via a datetime index; `date` also stays available as a column
df = df.set_index('date', drop=False).sort_index()

# Derived columns: weekday name and hour of day
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour

In [29]:
# Print a summary of df: index range, column dtypes, non-null counts, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1005 entries, 2020-01-01 03:00:04 to 2020-04-05 22:10:12
Data columns (total 5 columns):
date           1005 non-null datetime64[ns]
location       1005 non-null object
text           1005 non-null object
day_of_week    1005 non-null object
hour           1005 non-null int64
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 87.1+ KB


In [30]:
# Show the first rows (the earliest tweets, since df is sorted by its date index).
df.head()

Unnamed: 0_level_0,date,location,text,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 03:00:04,2020-01-01 03:00:04,,人还很少 #NYE2020 – at Prospect Park,Wednesday,3
2020-01-01 04:00:44,2020-01-01 04:00:44,,Baby 出现了，家庭友好的新年庆祝 #NYE2020 #brooklyn #NYC – a...,Wednesday,4
2020-01-01 04:39:12,2020-01-01 04:39:12,,Flirty Leo & Aquarius – at Grand Army Plaza Gr...,Wednesday,4
2020-01-01 04:51:00,2020-01-01 04:51:00,,"""New Year in Space"" Illo for the Washington Po...",Wednesday,4
2020-01-01 04:52:13,2020-01-01 04:52:13,,"""New Year Countdown"" Illo for NEW YORK PRESS D...",Wednesday,4


In [31]:
# Show the last rows (the most recent tweets, since df is sorted by its date index).
df.tail()

Unnamed: 0_level_0,date,location,text,day_of_week,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-05 19:43:28,2020-04-05 19:43:28,,,Sunday,19
2020-04-05 20:06:22,2020-04-05 20:06:22,,"I'm at Prospect Park - @nycparks in Brooklyn, ...",Sunday,20
2020-04-05 20:25:53,2020-04-05 20:25:53,,Prospect park is packed right now- not even 25...,Sunday,20
2020-04-05 21:50:17,2020-04-05 21:50:17,,Still getting out there – at Prospect Park,Sunday,21
2020-04-05 22:10:12,2020-04-05 22:10:12,,Sensational walk with #PeteyPiro this afternoo...,Sunday,22


In [32]:
# The location column looks empty in the previews above. Confirm by counting
# its distinct values (missing values included).
df["location"].nunique(dropna=False)

1

In [33]:
# Remove the empty location column.
# NOTE: Nothing is lost: every tweet here was requested from a circle
# inside Prospect Park, so its origin is already known.
df = df.drop(columns=['location'])

In [34]:
# Save df to pkl
# This is the cleaned frame (location column dropped); the unprocessed frame
# was written separately to ./park_tweets_raw.pkl.
df.to_pickle(path="./park_tweets.pkl")

In [35]:
# Reload for new session
# Reads back the file written by the save cell above, so a new session can
# skip re-requesting the tweets. Only unpickle files from a trusted source.
df = pd.read_pickle("./park_tweets.pkl")