In [3]:
import sys
print("Python Version:", sys.version)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
pd.set_option('display.max_rows', 500)
%matplotlib inline

Python Version: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]


In [4]:
# yymmdd format
dates = ['190803',
        '190810',
        '190817',
        '190824',
        '190831',
        '190907',
        '190914',
        '190921',
        '190928']

# iterate through dates and pull in csv and cat dataframes together
df_turns = []
for date in dates:
    df_turns.append(pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_'+date+'.txt'))
    
# Concat all dataframes
df_turns = pd.concat(df_turns)  

In [5]:
#Clean up the column Names
new_col_names = [name.strip() for name in df_turns.columns]
df_turns.columns = new_col_names



In [8]:
# Remove audited rows
df_turns=df_turns[df_turns.DESC!='RECOVR AUD']
# remove column?
del df_turns['DESC']

In [10]:
# Create a DateTime column that can be used for timeseries
df_turns.loc[:,'DATETIME'] = pd.to_datetime(df_turns['DATE'] + ' ' + df_turns['TIME'])

#Convert Date column to proper date type.
df_turns['DATE']  = pd.to_datetime(df_turns['DATE'],format='%m/%d/%Y')

#convert time to datetime type
df_turns['TIME']=pd.to_datetime(df_turns['TIME'], format='%H:%M:%S')

In [11]:
#backup
df_backup=df_turns.copy()

In [12]:
#Sort in order of datetime
df_turns = df_turns.sort_values(by=['DATETIME']).reset_index()

In [14]:
# Caluclate Entries Aggregated at station level by DateTime
df_turns['Entry_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['ENTRIES'].transform(pd.Series.diff)['ENTRIES']

# Caluclate Exits Aggregated at station level by DateTime
df_turns['Exit_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['EXITS'].transform(pd.Series.diff)['EXITS']



In [16]:
#Absolute Value to deal with counting backward issues 
df_turns['Entry_Diff'] = abs(df_turns['Entry_Diff'])
df_turns['Exit_Diff']=abs(df_turns['Exit_Diff'])

# Calculate both
df_turns['Total_Traffic']=df_turns['Entry_Diff']+df_turns['Exit_Diff']

In [32]:
# Filter out values in calculated fields outside of upper quartile range


Unnamed: 0,index,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,Entry_Diff,Exit_Diff,Total_Traffic
622,163598,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 00:00:00,569229,2989306,2019-07-27 00:00:00,,,
6184,163599,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 04:00:00,569232,2989334,2019-07-27 04:00:00,3.0,28.0,31.0
11489,163600,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 08:00:00,569232,2989365,2019-07-27 08:00:00,0.0,31.0,31.0
14734,163601,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 12:00:00,569236,2989419,2019-07-27 12:00:00,4.0,54.0,58.0
19671,163602,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 16:00:00,569237,2989481,2019-07-27 16:00:00,1.0,62.0,63.0
26186,163603,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-27,1900-01-01 20:00:00,569241,2989546,2019-07-27 20:00:00,4.0,65.0,69.0
29924,163604,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-28,1900-01-01 00:00:00,569245,2989572,2019-07-28 00:00:00,4.0,26.0,30.0
36447,163605,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-28,1900-01-01 04:00:00,569245,2989588,2019-07-28 04:00:00,0.0,16.0,16.0
40254,163606,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-28,1900-01-01 08:00:00,569245,2989610,2019-07-28 08:00:00,0.0,22.0,22.0
44962,163607,R244,R050,00-00-04,59 ST,456NQRW,IRT,2019-07-28,1900-01-01 12:00:00,569246,2989653,2019-07-28 12:00:00,1.0,43.0,44.0


In [41]:
df_per_station = df_turns.groupby(['STATION','DATE']).sum()
df_per_station.sort_values('Total_Traffic', ascending=False).head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,ENTRIES,EXITS,Entry_Diff,Exit_Diff,Total_Traffic
STATION,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
72 ST,2019-08-07,25374223,4877497882,11166630474,2755329000.0,2837989000.0,5593318000.0
14 ST,2019-08-30,28611345,22402819204,17151035865,2150588000.0,1175634000.0,3326222000.0
HUNTS POINT AV,2019-07-29,10814370,4361265469,4801125493,1111574000.0,1971931000.0,3083504000.0
CYPRESS AV,2019-09-21,6025699,3181269503,2369163252,2013357000.0,839006700.0,2852364000.0
59 ST,2019-08-14,32473359,12540745137,9854615315,2029705000.0,702686100.0,2732391000.0
161/YANKEE STAD,2019-09-06,27525051,14339713748,13654450495,718583400.0,1886418000.0,2605002000.0
3 AV-149 ST,2019-07-31,15688665,18213189687,10669202828,1819048000.0,555769400.0,2374817000.0
LAFAYETTE AV,2019-08-07,3058159,5966150260,1009544995,1953784000.0,317486600.0,2271271000.0
5 AVE,2019-09-11,10199060,3828678301,1560523407,991555700.0,504199600.0,1495755000.0
GRD CNTRL-42 ST,2019-08-06,62890363,15203808919,20552437283,654962000.0,616400500.0,1271362000.0
