## Determine URLs needed to extract Turnstile Data
##### Date Range: 
>April to May (2016 to 2019)

In [34]:
import datetime as dt
import pandas as pd
import numpy as np

In [2]:
url_prefix = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_'

In [13]:
start_date = '160402'
date = dt.date(int(start_date[:2]), int(start_date[2:4]), int(start_date[4:]))

In [14]:
date2 = date + dt.timedelta(days=7)

In [15]:
str(date2)[2:4]+str(date2)[5:7]+str(date2)[8:]

'160409'

In [30]:
# Create list of dates (by week) from April to end of May 2016
date_txt_2016 = '160402'
date_2016 = dt.date(int(start_date[:2]), int(start_date[2:4]), int(start_date[4:]))

date_list = [date_txt_2016]

In [31]:
    
while date_2016 < dt.date(16,6,1):
    date_2016 += dt.timedelta(days=7)
    date_list.append(str(date_2016)[2:4]+str(date_2016)[5:7]+str(date_2016)[8:])


In [33]:
date_list

['160402',
 '160409',
 '160416',
 '160423',
 '160430',
 '160507',
 '160514',
 '160521',
 '160528',
 '160604']

In [None]:
urls = []
for date in date_list:
        
    

## Data Frame to Summarize Turnstile Entries & Exits

In [2]:
!cwd

/bin/sh: cwd: command not found


In [2]:
import os

import pandas as pd
import matplotlib as plt

def get_data(folder):
    """
    Reads in turnstile data from a specified folder in Data
    
    Input: turnstile data file i.e. 2016-2017_turnstile_data
    Output: a DataFrame with all rows from all files in folder
    """
    
    col_names = ['C/A',
                 'UNIT',
                 'SCP',
                 'STATION',
                 'LINENAME',
                 'DIVISION',
                 'DATE',
                 'TIME',
                 'DESC',
                 'ENTRIES',
                 'EXITS                                                               ']

    ## absolute path to Data folder
    data_dir = os.getcwd()+"/Data/" 
    
    return_df = pd.DataFrame(columns=col_names)
    for file in os.listdir(data_dir+folder):
        if not file.startswith('.'):
            file_path = "Data/"+folder+'/'+file
            return_df = pd.concat([return_df, pd.read_csv(file_path)],axis=0)
            
    return_df.rename(columns={return_df.columns[10]:'EXITS'},inplace=True)
    
    return(return_df)

In [7]:
df = get_data("2016-2019_turnstile_data")

In [65]:
df['ENTRIES'] = df['ENTRIES'].astype(np.int)
df['EXITS'] = df['EXITS'].astype(np.int)
df['DATE'] = df['DATE'].astype('datetime64[ns]')

In [41]:
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [150]:
# Create dataframe that consolidates each Turnstile at each station for each date; take the max entries & exits
# This will be used to determine the difference between Entries & Exits for each day
df2 = df.groupby(['UNIT', 'SCP', 'STATION', 'DATE']).agg({'ENTRIES':'max', 'EXITS':'max'}).reset_index()

In [151]:
df2.sort_values(['UNIT', 'SCP', 'STATION', 'DATE'], inplace=True)

In [152]:
df2

Unnamed: 0,UNIT,SCP,STATION,DATE,ENTRIES,EXITS
0,R001,00-00-00,WHITEHALL S-FRY,2016-02-27,1699746,1615404
1,R001,00-00-00,WHITEHALL S-FRY,2016-02-28,1700253,1615607
2,R001,00-00-00,WHITEHALL S-FRY,2016-02-29,1700940,1615845
3,R001,00-00-00,WHITEHALL S-FRY,2016-03-01,1701609,1616151
4,R001,00-00-00,WHITEHALL S-FRY,2016-03-02,1702421,1616404
...,...,...,...,...,...,...
1710013,R572,01-03-04,96 ST-2 AVE,2019-05-20,1083156,357664
1710014,R572,01-03-04,96 ST-2 AVE,2019-05-21,1084711,358132
1710015,R572,01-03-04,96 ST-2 AVE,2019-05-22,1086098,358636
1710016,R572,01-03-04,96 ST-2 AVE,2019-05-23,1087521,359095


In [153]:
df2['Entries Counter'] = df2['ENTRIES'].rolling(2).apply(lambda x: x[1]-x[0] if abs(x[1]-x[0]) < 5000 else np.nan ,raw=True)
#df2['Exits Counter'] = df2['EXITS'].diff()

In [154]:
df2.shape

(1710018, 7)

In [155]:
df2

Unnamed: 0,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,Entries Counter
0,R001,00-00-00,WHITEHALL S-FRY,2016-02-27,1699746,1615404,
1,R001,00-00-00,WHITEHALL S-FRY,2016-02-28,1700253,1615607,507.0
2,R001,00-00-00,WHITEHALL S-FRY,2016-02-29,1700940,1615845,687.0
3,R001,00-00-00,WHITEHALL S-FRY,2016-03-01,1701609,1616151,669.0
4,R001,00-00-00,WHITEHALL S-FRY,2016-03-02,1702421,1616404,812.0
...,...,...,...,...,...,...,...
1710013,R572,01-03-04,96 ST-2 AVE,2019-05-20,1083156,357664,1429.0
1710014,R572,01-03-04,96 ST-2 AVE,2019-05-21,1084711,358132,1555.0
1710015,R572,01-03-04,96 ST-2 AVE,2019-05-22,1086098,358636,1387.0
1710016,R572,01-03-04,96 ST-2 AVE,2019-05-23,1087521,359095,1423.0


In [160]:
#df2['Entries Counter'] = df2['ENTRIES'].transform(lamhrolling(2).apply(lambda x: x[1]-x[0] if abs(x[1]-x[0]) < 5000 else np.nan ,raw=True))


In [156]:
df2[df2['Entries Counter'] < 0]

Unnamed: 0,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,Entries Counter
10188,R001,02-06-01,SOUTH FERRY,2016-02-27,1,167,-1781.0
10734,R001,02-06-03,SOUTH FERRY,2018-02-24,2,23,-190.0
11098,R001,02-06-05,SOUTH FERRY,2018-02-24,1,26,-178.0
20007,R007,00-00-02,104 ST,2016-02-28,2026156571,622219061,-215.0
20008,R007,00-00-02,104 ST,2016-02-29,2026156412,622218961,-159.0
...,...,...,...,...,...,...,...
1693422,R552,00-01-00,JOURNAL SQUARE,2017-04-07,2875,8623,-1590.0
1693424,R552,00-01-00,JOURNAL SQUARE,2017-04-09,2419,5168,-600.0
1694095,R552,00-01-02,JOURNAL SQUARE,2018-05-17,1565,4513,-4414.0
1696520,R552,00-02-00,JOURNAL SQUARE,2018-04-26,101,1535,-4982.0


In [158]:
df2['Entries Counter'].min()

-4997.0

In [157]:
df2.iloc[10185:10195, :]

Unnamed: 0,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,Entries Counter
10185,R001,02-06-00,SOUTH FERRY,2019-05-22,1777,0,4.0
10186,R001,02-06-00,SOUTH FERRY,2019-05-23,1780,0,3.0
10187,R001,02-06-00,SOUTH FERRY,2019-05-24,1782,0,2.0
10188,R001,02-06-01,SOUTH FERRY,2016-02-27,1,167,-1781.0
10189,R001,02-06-01,SOUTH FERRY,2016-02-28,1,167,0.0
10190,R001,02-06-01,SOUTH FERRY,2016-02-29,1,167,0.0
10191,R001,02-06-01,SOUTH FERRY,2016-03-01,1,167,0.0
10192,R001,02-06-01,SOUTH FERRY,2016-03-02,1,167,0.0
10193,R001,02-06-01,SOUTH FERRY,2016-03-03,1,169,0.0
10194,R001,02-06-01,SOUTH FERRY,2016-03-04,1,169,0.0


In [115]:
df[(df['DATE']=='2016-02-27') & (df['C/A']=='A002') & (df['SCP']=='02-00-01')]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
45,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,03:00:00,REGULAR,5114878,1116740
46,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,07:00:00,REGULAR,5114889,1116760
47,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,11:00:00,REGULAR,5114945,1116819
48,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,15:00:00,REGULAR,5115178,1116872
49,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,19:00:00,REGULAR,5115572,1116929
50,A002,R051,02-00-01,59 ST,NQR456,BMT,2016-02-27,23:00:00,REGULAR,5115796,1116962


In [93]:
df[df['SCP']=='00-00-00'].groupby(['UNIT', 'SCP', 'DATE'])


471

381