In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import datetime
import pandas as pd
import os
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

### Get Data Files

In [2]:
# URL of the NYC MTA turnstile data index page; the cells below fetch this page
# and scan its links for the weekly data files.
url = 'http://web.mta.info/developers/turnstile.html'

In [3]:
# Fetch the index page. A timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() stops the notebook on an HTTP
# error instead of letting the following cells parse an error page.
rsp = requests.get(url, timeout=30)
rsp.raise_for_status()
# Show the final URL (after any redirects) as a sanity check
print(rsp.url)

http://web.mta.info/developers/turnstile.html


In [4]:
# Build a navigable document tree from the response body using Python's
# built-in HTML parser.
soup = BeautifulSoup(markup=rsp.text, features='html.parser')

In [5]:
# Collect every <a> element that carries an href attribute (any value).
a_tags = soup.find_all(name='a', attrs={'href': True})

In [6]:
# Build the list of download URLs for every turnstile data file linked on the page.
file_list = []
# Everything in the page URL before 'turnstile' is the directory the relative
# links are appended to.
base_url = url.split('turnstile')[0]
# Raw string: '\d' and '\.' are regex escapes, not string escapes. Written as a
# plain string they are invalid escape sequences, which Python warns about.
# Compiling once also avoids re-parsing the pattern for every anchor.
file_pattern = re.compile(r'data/nyct/turnstile/turnstile_\d*\.txt')
for a in a_tags:
    href = a['href']
    # Keep only links that point at a turnstile data file; the full href is
    # appended to the base URL.
    if file_pattern.search(href) is not None:
        file_list.append(base_url + href)

In [7]:
# # Note: This only needs to be run once; comment out afterward.
# # Download txt files from NYC MTA website to local machine
# for file in file_list:
#     !wget -P ./turnstile_data/ {file}

### Compile Raw Data

In [51]:
# Note: This cell only needs to be run once; comment out afterward.
# Path to local directory with files; this needs to be set by the user.
data_path = '/home/cneiderer/Metis/turnstile_data'
# Get list of data files, sorted by file name
file_names = sorted(os.listdir(data_path))
# Read every file first and concatenate once at the end. Calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration, which
# makes the load time grow quadratically with the number of files.
frames = [pd.read_csv(os.path.join(data_path, file)) for file in file_names]
# ignore_index=True gives the combined frame a fresh 0..n-1 index; each per-file
# frame carries its own index, which would otherwise produce duplicate labels.
# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no files.
df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
# Save the compiled dataframe as a pickle for easy retrieval (CSV export is disabled).
# NOTE(review): this writes 'raw_turnstile_data_small.pkl', while the load cell
# below reads 'raw_turnstile_data.pkl' -- confirm which file is intended.
#df.to_csv('raw_turnstile_data.csv')
df.to_pickle('raw_turnstile_data_small.pkl')

### Data Munging / Preprocessing

#### Load Data

In [52]:
# Load raw turnstile data. The context manager closes the file handle as soon as
# unpickling finishes instead of leaving it open for the life of the kernel.
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced locally by this notebook.
with open('raw_turnstile_data.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)
# df = pickle.load(open('raw_turnstile_data_small.pkl', 'rb'))

# Remove whitespace from column names and make lowercase
df.columns = [x.strip().lower() for x in df.columns]

# Get rid of unnecessary columns
df = df.drop(['c/a', 'unit', 'scp', 'linename', 'division'], axis=1)

In [53]:
# Read the HH200K table from CSV
df2 = pd.read_csv('HH200K.csv')

# Normalize the header: trim surrounding whitespace and lowercase every column name
df2.columns = df2.columns.map(lambda name: name.strip().lower())

In [54]:
# Read the stations table from CSV
df3 = pd.read_csv('Stations.csv')

# Normalize the header: trim surrounding whitespace and lowercase every column name
df3.columns = df3.columns.map(lambda name: name.strip().lower())

#### Join data

In [55]:
# Rename the 'stop name' column to 'station' in both lookup tables so it can be
# used as the join key. index=str additionally converts the row labels to strings.
station_key = {'stop name': 'station'}
df2 = df2.rename(index=str, columns=station_key)
df3 = df3.rename(index=str, columns=station_key)

In [56]:
# Lowercase the station names in the turnstile data and in both lookup tables
# so that the join key matches regardless of capitalization
df['station'] = df['station'].str.lower()
df2['station'] = df2['station'].str.lower()
df3['station'] = df3['station'].str.lower()

In [57]:
# Left-join the turnstile rows with df2, matching on the lowercased station name.
# NOTE(review): df2 is the frame read from 'HH200K.csv'; df3 (read from
# 'Stations.csv') is not joined anywhere in the visible cells -- confirm that
# this is intended.
# NOTE(review): the df.head() output further down shows each '59 st' reading
# twice with different 'gtfs stop id' values, so station names appear not to be
# unique in df2 and this join multiplies turnstile rows -- confirm, and
# de-duplicate or refine the join key if so.
df = df.join(df2.set_index('station'), on='station')

In [58]:
# Remove rows that contain a missing value, which includes turnstile rows whose
# station had no match in the joined table.
# NOTE(review): dropna() without `subset` drops a row when ANY column is NaN, so
# rows with an unrelated missing field are removed as well -- consider
# restricting it to a column that comes from the joined table.
df = df.dropna()

In [59]:
# Replace the index of the joined/filtered dataframe with a fresh 0..n-1 range;
# drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)  

#### Calculate Count Stats

In [33]:
# Per-row change in the 'entries' and 'exits' columns: within each station
# group, diff() subtracts the previous row, and the first row of every group
# (NaN after diff) is filled with 0.
# NOTE(review): 'c/a', 'unit' and 'scp' were dropped when the data was loaded, so
# rows are grouped by station name only; if those columns identify individual
# devices, consecutive rows in a group may belong to different counters. The
# negative daily sums in the sandbox output below (e.g. -202135.0 for '1 av')
# look like a symptom -- confirm the grouping keys and row order.
# NOTE(review): this cell's execution count (33) is lower than those of the
# cells above it, so its result was not recomputed after they last ran.
df[['entries_cnt', 'exits_cnt']] = df.groupby('station')[['entries', 'exits']].diff().fillna(0)

#### Convert Date and Time to DateTime Objs
This allows the dataframe to be subset by date and/or time.

In [60]:
# Convert the date and time strings into a single datetime column. The two
# strings are concatenated without a separator, which is why the format string
# has nothing between %Y and %H.
df['date_time'] = pd.to_datetime(
    df['date'] + df['time'],
    format='%m/%d/%Y%H:%M:%S',
)

In [61]:
df.head()

Unnamed: 0,station,date,time,desc,entries,exits,gtfs stop id,zip_code,gtfs latitude,gtfs longitude,perhh,bachdeg,graddeg,totgrad,2015 median household income,date_time
0,59 st,04/02/2016,00:00:00,REGULAR,5600838.0,1896290.0,R42,11220.0,40.634967,-74.023377,0.036585,0.18196,0.105754,0.287714,43903.0,2016-04-02 00:00:00
1,59 st,04/02/2016,00:00:00,REGULAR,5600838.0,1896290.0,635,10003.0,40.734673,-73.989951,0.277815,0.416931,0.417836,0.834767,150949.0,2016-04-02 00:00:00
2,59 st,04/02/2016,04:00:00,REGULAR,5600863.0,1896293.0,R42,11220.0,40.634967,-74.023377,0.036585,0.18196,0.105754,0.287714,43903.0,2016-04-02 04:00:00
3,59 st,04/02/2016,04:00:00,REGULAR,5600863.0,1896293.0,635,10003.0,40.734673,-73.989951,0.277815,0.416931,0.417836,0.834767,150949.0,2016-04-02 04:00:00
4,59 st,04/02/2016,08:00:00,REGULAR,5600877.0,1896313.0,R42,11220.0,40.634967,-74.023377,0.036585,0.18196,0.105754,0.287714,43903.0,2016-04-02 08:00:00


### Sandbox Below

In [44]:
# Sandbox: keep only the rows stamped on or after 2017-01-01.
# NOTE(review): if the commented-out upper bound at the end of the line is ever
# re-enabled, each comparison must be wrapped in parentheses, because `&` binds
# tighter than `>=` / `<=`.
# df[(df['date'] > '2013-01-01') & (df['date'] < '2013-02-01')]
df_tmp2 = df[df['date_time'] >= '2017-01-01'] # & df['date_time'] <= '2015-06-15']

In [46]:
len(df_tmp2)

4925768

In [36]:
# Sandbox: total entry/exit count changes per station and calendar date.
# NOTE(review): 'date' is still the raw MM/DD/YYYY string here, so the groups
# sort lexically rather than chronologically (the output below lists 01/30/2016
# between 01/29/2017 and 01/30/2017).
df_tmp = df.groupby(['station', 'date'])[['entries_cnt', 'exits_cnt']].sum()

In [37]:
df_tmp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,entries_cnt,exits_cnt
station,date,Unnamed: 2_level_1,Unnamed: 3_level_1
1 av,01/28/2017,-202135.0,-138760.0
1 av,01/29/2017,11918.0,10702.0
1 av,01/30/2016,-700405.0,-203386.0
1 av,01/30/2017,20926.0,22617.0
1 av,01/31/2016,17393.0,18294.0


In [16]:
# Sandbox: parse the date and the time columns separately into datetime columns.
# The time-only format carries no date part, so the parsed values land on
# 1900-01-01 (visible in the time_obj column of the output further down).
df['date_obj'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
df['time_obj'] = pd.to_datetime(df['time'], format='%H:%M:%S')

In [17]:
# Sandbox: take the first 20 rows as a small sample to experiment on.
# NOTE(review): this rebinds df2, which earlier in the notebook held the frame
# read from 'HH200K.csv'; re-running the join cell after this one would use the
# sample instead -- consider a distinct name.
df2 = df.head(20)

In [18]:
# Select the sample rows whose time of day is 08:00:00. time_obj values all sit
# on the placeholder date 1900-01-01, so the comparison value uses that date too.
df2[df2['time_obj'] == datetime.datetime(1900, 1, 1, 8)]

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,entries_diff,exits_diff,date_time,date_obj,time_obj
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,08:00:00,REGULAR,5600877,1896313,14.0,20.0,2016-04-02 08:00:00,2016-04-02,1900-01-01 08:00:00
8,A002,R051,02-00-00,59 ST,NQR456,BMT,04/03/2016,08:00:00,REGULAR,5601643,1896578,5.0,12.0,2016-04-03 08:00:00,2016-04-03,1900-01-01 08:00:00
14,A002,R051,02-00-00,59 ST,NQR456,BMT,04/04/2016,08:00:00,REGULAR,5602297,1896893,41.0,100.0,2016-04-04 08:00:00,2016-04-04,1900-01-01 08:00:00


In [19]:
# Gap between consecutive time-of-day stamps. Because time_obj carries no real
# date, the gap goes negative whenever the readings cross midnight (see the
# '-1 days' entries in the output). The leading NaT is filled with an explicit
# zero Timedelta: filling a timedelta Series with the integer 0 is deprecated.
df2.time_obj.diff().fillna(pd.Timedelta(0))

0             00:00:00
1             04:00:00
2             04:00:00
3             04:00:00
4             04:00:00
5             04:00:00
6    -1 days +04:00:00
7             04:00:00
8             04:00:00
9             04:00:00
10            04:00:00
11            04:00:00
12   -1 days +04:00:00
13            04:00:00
14            04:00:00
15            00:35:08
16            03:24:52
17            04:00:00
18            04:00:00
19   -1 days +08:00:00
Name: time_obj, dtype: timedelta64[ns]

In [20]:
# Gap between consecutive full timestamps; unlike the time-only version this
# stays positive across midnight. The leading NaT is filled with an explicit
# zero Timedelta: filling a timedelta Series with the integer 0 is deprecated.
df2.date_time.diff().fillna(pd.Timedelta(0))

0    00:00:00
1    04:00:00
2    04:00:00
3    04:00:00
4    04:00:00
5    04:00:00
6    04:00:00
7    04:00:00
8    04:00:00
9    04:00:00
10   04:00:00
11   04:00:00
12   04:00:00
13   04:00:00
14   04:00:00
15   00:35:08
16   03:24:52
17   04:00:00
18   04:00:00
19   08:00:00
Name: date_time, dtype: timedelta64[ns]

In [24]:
# Time strings of every sample row except the first, renumbered from 0
t2 = df2['time'].iloc[1:].reset_index(drop=True)

In [25]:
# Time strings of every sample row except the last, renumbered from 0
t1 = df2['time'].iloc[:-1].reset_index(drop=True)

In [26]:
# Put each reading's time (t1) next to the following reading's time (t2)
pd.DataFrame({'t1': t1, 't2': t2})

Unnamed: 0,t1,t2
0,00:00:00,04:00:00
1,04:00:00,08:00:00
2,08:00:00,12:00:00
3,12:00:00,16:00:00
4,16:00:00,20:00:00
5,20:00:00,00:00:00
6,00:00:00,04:00:00
7,04:00:00,08:00:00
8,08:00:00,12:00:00
9,12:00:00,16:00:00


nan