In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import datetime
import pandas as pd
import os
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

### Get Data Files

In [2]:
# Index page of the NYC MTA site that links to the weekly turnstile data files
url = "http://web.mta.info/developers/turnstile.html"

In [3]:
# Fetch the turnstile index page. The timeout keeps the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() fails loudly
# on a 4xx/5xx reply instead of handing an error page to the HTML parser.
rsp = requests.get(url, timeout=30)
rsp.raise_for_status()
# Print the URL of the response to verify which page was actually fetched
print(rsp.url)

http://web.mta.info/developers/turnstile.html


In [4]:
# Build a navigable tree from the response body with Python's built-in HTML parser
soup = BeautifulSoup(markup=rsp.text, features='html.parser')

In [5]:
# Collect every <a> element that carries an href attribute
a_tags = soup.find_all(name='a', attrs={'href': True})

In [6]:
# Build the download URL of every turnstile data file linked on the page.
# The pattern is a raw string: '\d' and '\.' inside a normal string literal are
# invalid escape sequences (DeprecationWarning, SyntaxWarning on Python 3.12+).
# It is compiled once instead of being re-parsed for every anchor.
turnstile_file_pattern = re.compile(r'data/nyct/turnstile/turnstile_\d*\.txt')
# Everything in the page URL that precedes the first 'turnstile'
base_url = url.split('turnstile')[0]
# Keep the hrefs that contain a turnstile file path and prefix them with base_url
file_list = [
    ''.join([base_url, a['href']])
    for a in a_tags
    if turnstile_file_pattern.search(a['href']) is not None
]

In [7]:
# # Note: This only needs to be run once; comment out afterward.
# # Download txt files from NYC MTA website to local machine
# for file in file_list:
#     !wget -P ./turnstile_data/ {file}

### Compile Raw Data

In [8]:
# # Note: This cell only needs to be run once; comment out afterward.
# # Path to local directory with files; this needs to be set by the user.
# data_path = '/home/cneiderer/Metis/turnstile_data' 
# # Initialize empty dataframe
# df = pd.DataFrame()
# # Get list of data files
# file_names = os.listdir(data_path)
# for file in file_names:
#     file_path = os.path.join(data_path, file)
#     # Load data and concatenate with existing dataframe
#     df = pd.concat([df, pd.read_csv(file_path)], axis=0)  
# # Reset index for concatenated dataframe since each dataframe has own index, 
# # which causes duplicate indices in concatenated dataframe  
# df = df.reset_index(drop=True)   
# # Save raw compiled dataframe as *.CSV and *.PKL for easy data retrieval
# df.to_csv('raw_turnstile_data.csv')
# df.to_pickle('raw_turnstile_data.pkl')

### Working DataFrame

In [46]:
# Load the raw compiled dataframe.
# pd.read_pickle opens and closes the file itself (the previous
# pickle.load(open(...)) never closed its handle) and is the counterpart of
# DataFrame.to_pickle.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# was produced locally by this notebook.
df = pd.read_pickle('raw_turnstile_data.pkl')

CPU times: user 2.01 s, sys: 680 ms, total: 2.69 s
Wall time: 2.69 s


In [38]:
# Normalise the column labels: trim surrounding whitespace and lowercase them
df.columns = df.columns.map(lambda name: name.strip().lower())

In [9]:
# Preview the first five rows after the column names were normalised
df.head(n=5)

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits
0,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,00:00:00,REGULAR,5600838,1896290
1,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,04:00:00,REGULAR,5600863,1896293
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,08:00:00,REGULAR,5600877,1896313
3,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,12:00:00,REGULAR,5600922,1896375
4,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,16:00:00,REGULAR,5601172,1896445


In [10]:
# Calculate count differences between consecutive audits and add them to df.
# The entries/exits columns are running counter readings of an individual
# turnstile, identified by c/a + unit + scp + station. Grouping by station
# alone subtracts one turnstile's counter from another's wherever rows of two
# devices at the same station are adjacent, so the diff is taken per turnstile.
turnstile_keys = ['c/a', 'unit', 'scp', 'station']
# Diff in chronological order: row order in the compiled frame is not
# guaranteed to be time order. mergesort is stable, so rows with equal
# timestamps keep their original relative order.
audit_time = pd.to_datetime(df['date'] + df['time'], format='%m/%d/%Y%H:%M:%S')
chronological = audit_time.sort_values(kind='mergesort').index
count_diffs = (
    df.loc[chronological]
    .groupby(turnstile_keys)[['entries', 'exits']]
    .diff()
    .fillna(0)  # the first audit of each turnstile has no predecessor
    .reindex(df.index)  # back to df's row order; assumes df has a unique index
)
df[['entries_diff', 'exits_diff']] = count_diffs

In [11]:
# Preview the first five rows, now including entries_diff / exits_diff
df.head(n=5)

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,entries_diff,exits_diff
0,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,00:00:00,REGULAR,5600838,1896290,0.0,0.0
1,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,04:00:00,REGULAR,5600863,1896293,25.0,3.0
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,08:00:00,REGULAR,5600877,1896313,14.0,20.0
3,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,12:00:00,REGULAR,5600922,1896375,45.0,62.0
4,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,16:00:00,REGULAR,5601172,1896445,250.0,70.0


In [12]:
# Convert the date and time strings into a single datetime column
date_time_text = df['date'] + df['time']
df['date_time'] = pd.to_datetime(date_time_text, format='%m/%d/%Y%H:%M:%S')

In [13]:
# Preview the first five rows, now including the combined date_time column
df.head(n=5)

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,entries_diff,exits_diff,date_time
0,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,00:00:00,REGULAR,5600838,1896290,0.0,0.0,2016-04-02 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,04:00:00,REGULAR,5600863,1896293,25.0,3.0,2016-04-02 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,08:00:00,REGULAR,5600877,1896313,14.0,20.0,2016-04-02 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,12:00:00,REGULAR,5600922,1896375,45.0,62.0,2016-04-02 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,16:00:00,REGULAR,5601172,1896445,250.0,70.0,2016-04-02 16:00:00


### Sandbox Below

In [16]:
# Parse the date and time strings separately into their own datetime columns.
# The time-only format carries no date, so the parsed values land on 1900-01-01.
for target, source, fmt in (
    ('date_obj', 'date', '%m/%d/%Y'),
    ('time_obj', 'time', '%H:%M:%S'),
):
    df[target] = pd.to_datetime(df[source], format=fmt)

In [17]:
# Small working sample for experiments: the first 20 rows of df
df2 = df.iloc[:20]

In [18]:
# Select the sample rows whose audit time is 08:00; time_obj values carry the
# placeholder date 1900-01-01, so the comparison value must use it too
eight_am = datetime.datetime(1900, 1, 1, 8)
df2.loc[df2['time_obj'] == eight_am]

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,entries_diff,exits_diff,date_time,date_obj,time_obj
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/02/2016,08:00:00,REGULAR,5600877,1896313,14.0,20.0,2016-04-02 08:00:00,2016-04-02,1900-01-01 08:00:00
8,A002,R051,02-00-00,59 ST,NQR456,BMT,04/03/2016,08:00:00,REGULAR,5601643,1896578,5.0,12.0,2016-04-03 08:00:00,2016-04-03,1900-01-01 08:00:00
14,A002,R051,02-00-00,59 ST,NQR456,BMT,04/04/2016,08:00:00,REGULAR,5602297,1896893,41.0,100.0,2016-04-04 08:00:00,2016-04-04,1900-01-01 08:00:00


In [19]:
# Gap between consecutive audits based on time of day only. Every time_obj
# value sits on the same placeholder date, so a step across midnight comes out
# negative (e.g. "-1 days +04:00:00").
# Fill the leading NaT with a zero Timedelta rather than the integer 0: an
# integer fill relies on an implicit int -> timedelta conversion that pandas
# has deprecated, while a Timedelta keeps the result timedelta64.
df2.time_obj.diff().fillna(pd.Timedelta(0))

0             00:00:00
1             04:00:00
2             04:00:00
3             04:00:00
4             04:00:00
5             04:00:00
6    -1 days +04:00:00
7             04:00:00
8             04:00:00
9             04:00:00
10            04:00:00
11            04:00:00
12   -1 days +04:00:00
13            04:00:00
14            04:00:00
15            00:35:08
16            03:24:52
17            04:00:00
18            04:00:00
19   -1 days +08:00:00
Name: time_obj, dtype: timedelta64[ns]

In [20]:
# Gap between consecutive audits based on the full timestamp, which stays
# positive across midnight.
# Fill the leading NaT with a zero Timedelta rather than the integer 0: an
# integer fill relies on an implicit int -> timedelta conversion that pandas
# has deprecated, while a Timedelta keeps the result timedelta64.
df2.date_time.diff().fillna(pd.Timedelta(0))

0    00:00:00
1    04:00:00
2    04:00:00
3    04:00:00
4    04:00:00
5    04:00:00
6    04:00:00
7    04:00:00
8    04:00:00
9    04:00:00
10   04:00:00
11   04:00:00
12   04:00:00
13   04:00:00
14   04:00:00
15   00:35:08
16   03:24:52
17   04:00:00
18   04:00:00
19   08:00:00
Name: date_time, dtype: timedelta64[ns]

In [24]:
# Audit times from the second row onward, renumbered from 0
t2 = df2['time'].iloc[1:].reset_index(drop=True)

In [25]:
# Audit times of every row except the last, renumbered from 0
t1 = df2['time'].iloc[:-1].reset_index(drop=True)

In [26]:
# Put each audit time (t1) next to the audit time that follows it (t2)
pd.concat([t1, t2], axis=1, keys=['t1', 't2'])

Unnamed: 0,t1,t2
0,00:00:00,04:00:00
1,04:00:00,08:00:00
2,08:00:00,12:00:00
3,12:00:00,16:00:00
4,16:00:00,20:00:00
5,20:00:00,00:00:00
6,00:00:00,04:00:00
7,04:00:00,08:00:00
8,08:00:00,12:00:00
9,12:00:00,16:00:00
