In [3]:
import sys
print("Python Version:", sys.version)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
pd.set_option('display.max_rows', 500)
%matplotlib inline

Python Version: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]


In [4]:
# yymmdd format
dates = ['190803',
        '190810',
        '190817',
        '190824',
        '190831',
        '190907',
        '190914',
        '190921',
        '190928']

# iterate through dates and pull in csv and cat dataframes together
df_turns = []
for date in dates:
    df_turns.append(pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_'+date+'.txt'))
    
# Concat all dataframes
df_turns = pd.concat(df_turns)  

In [5]:
#Clean up the column Names
new_col_names = [name.strip() for name in df_turns.columns]
df_turns.columns = new_col_names



In [8]:
# Remove audited rows
df_turns=df_turns[df_turns.DESC!='RECOVR AUD']
# remove column?
del df_turns['DESC']

In [10]:
# Create a DateTime column that can be used for timeseries
df_turns.loc[:,'DATETIME'] = pd.to_datetime(df_turns['DATE'] + ' ' + df_turns['TIME'])

#Convert Date column to proper date type.
df_turns['DATE']  = pd.to_datetime(df_turns['DATE'],format='%m/%d/%Y')

#convert time to datetime type
df_turns['TIME']=pd.to_datetime(df_turns['TIME'], format='%H:%M:%S')

In [11]:
#backup
df_backup=df_turns.copy()

In [12]:
#Sort in order of datetime
df_turns = df_turns.sort_values(by=['DATETIME']).reset_index()

In [14]:
# Caluclate Entries Aggregated at station level by DateTime
df_turns['Entry_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['ENTRIES'].transform(pd.Series.diff)['ENTRIES']

# Caluclate Exits Aggregated at station level by DateTime
df_turns['Exit_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['EXITS'].transform(pd.Series.diff)['EXITS']



In [16]:
#Absolute Value to deal with counting backward issues 
df_turns['Entry_Diff'] = abs(df_turns['Entry_Diff'])
df_turns['Exit_Diff']=abs(df_turns['Exit_Diff'])

# Calculate both
df_turns['Total_Traffic']=df_turns['Entry_Diff']+df_turns['Exit_Diff']

In [91]:
# I think we can use this as a cut off point for station traffic totals.
df_turns.groupby(['STATION'])['Total_Traffic'].quantile(.99)


STATION
1 AV               2069.55
103 ST             1434.00
103 ST-CORONA      1439.56
104 ST              369.00
110 ST             1295.20
111 ST             1090.15
116 ST             1063.00
116 ST-COLUMBIA    1737.90
121 ST              229.39
125 ST             1796.00
135 ST              867.00
137 ST CITY COL    1008.10
138/GRAND CONC      848.40
14 ST              1670.26
14 ST-UNION SQ     2344.00
145 ST             1857.89
149/GRAND CONC     1615.12
14TH STREET        1280.40
15 ST-PROSPECT      600.54
155 ST              595.72
157 ST             1204.07
161/YANKEE STAD    1053.82
163 ST-AMSTERDM     473.54
167 ST             1491.33
168 ST             1366.70
169 ST              667.68
170 ST             1016.10
174 ST              998.52
174-175 STS         410.34
175 ST              989.67
176 ST              692.00
18 AV               890.70
18 ST              1284.73
181 ST             1509.91
182-183 STS         670.69
183 ST              778.23
190 ST              

In [77]:
# Filter out values in calculated fields outside of upper quartile range
df_turns.dropna(subset=["Entry_Diff","Exit_Diff",'Total_Traffic'], axis=0, inplace=True)

q_9 = df_turns['Total_Traffic'].quantile(.99)

df_turns_filt = df_turns[df_turns.Total_Traffic<= q_9]



In [78]:
df_per_station = df_turns_filt.groupby(['STATION']).sum()
df_per_station.sort_values('Total_Traffic', ascending=False).head(100)

Unnamed: 0_level_0,index,ENTRIES,EXITS,Entry_Diff,Exit_Diff,Total_Traffic
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34 ST-PENN STA,3401972217,1460365699649,1538055094977,8254361.0,6992507.0,15246868.0
GRD CNTRL-42 ST,3752729625,919546458460,1105055715021,5679047.0,4802427.0,10481474.0
23 ST,2607575657,2091252915203,1812031229789,5750528.0,4288698.0,10039226.0
FULTON ST,3245309619,1787737426085,1745103246838,5078558.0,4419547.0,9498105.0
34 ST-HERALD SQ,1402968886,1646402859827,2064647805215,4784950.0,4558398.0,9343348.0
TIMES SQ-42 ST,2162196120,2135387457561,1652034183983,4821028.0,4298798.0,9119826.0
42 ST-PORT AUTH,975287980,2571982579745,2230353121601,5268731.0,3669247.0,8937978.0
14 ST-UNION SQ,787362551,88115964177,57426714905,3903559.0,3995590.0,7899149.0
86 ST,2581504936,599888099997,877774250646,4261140.0,3569073.0,7830213.0
125 ST,1950357794,2515036418386,1554869348054,4047442.0,3446371.0,7493813.0
