In [3]:
import sys
print("Python Version:", sys.version)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
pd.set_option('display.max_rows', 500)
%matplotlib inline

Python Version: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]


In [4]:
# yymmdd format
dates = ['190803',
        '190810',
        '190817',
        '190824',
        '190831',
        '190907',
        '190914',
        '190921',
        '190928']

# iterate through dates and pull in csv and cat dataframes together
df_turns = []
for date in dates:
    df_turns.append(pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_'+date+'.txt'))
    
# Concat all dataframes
df_turns = pd.concat(df_turns)  

In [5]:
#Clean up the column Names
new_col_names = [name.strip() for name in df_turns.columns]
df_turns.columns = new_col_names



In [8]:
# Remove audited rows
df_turns=df_turns[df_turns.DESC!='RECOVR AUD']
# remove column?
del df_turns['DESC']

In [10]:
# Create a DateTime column that can be used for timeseries
df_turns.loc[:,'DATETIME'] = pd.to_datetime(df_turns['DATE'] + ' ' + df_turns['TIME'])

#Convert Date column to proper date type.
df_turns['DATE']  = pd.to_datetime(df_turns['DATE'],format='%m/%d/%Y')

#convert time to datetime type
df_turns['TIME']=pd.to_datetime(df_turns['TIME'], format='%H:%M:%S')

In [11]:
#backup
df_backup=df_turns.copy()

In [12]:
#Sort in order of datetime
df_turns = df_turns.sort_values(by=['DATETIME']).reset_index()

In [14]:
# Caluclate Entries Aggregated at station level by DateTime
df_turns['Entry_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['ENTRIES'].transform(pd.Series.diff)['ENTRIES']

# Caluclate Exits Aggregated at station level by DateTime
df_turns['Exit_Diff']=df_turns.groupby(['STATION', 'C/A', 'UNIT', 'SCP'],as_index=False)['EXITS'].transform(pd.Series.diff)['EXITS']



In [16]:
#Absolute Value to deal with counting backward issues 
df_turns['Entry_Diff'] = abs(df_turns['Entry_Diff'])
df_turns['Exit_Diff']=abs(df_turns['Exit_Diff'])

# Calculate both
df_turns['Total_Traffic']=df_turns['Entry_Diff']+df_turns['Exit_Diff']

In [63]:
# Filter out values in calculated fields outside of upper quartile range
turnstiles_daily.dropna(subset=["Entry_Diff","Exit_Diff",''], axis=0, inplace=True)

q_9 = df_turns['Total_Traffic'].quantile(.95)

df_turns_filt = df_turns[df_turns.Total_Traffic<= q_9]


In [64]:
df_per_station = df_turns_filt.groupby(['STATION']).sum()
df_per_station.sort_values('Total_Traffic', ascending=False).head(100)

Unnamed: 0_level_0,index,ENTRIES,EXITS,Entry_Diff,Exit_Diff,Total_Traffic
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34 ST-PENN STA,3078693904,1257301059440,1306997116935,6079560.0,4999645.0,11079205.0
FULTON ST,3140364218,1750191113720,1698389761918,4147188.0,3712055.0,7859243.0
23 ST,2433534862,1813834659997,1562346431004,4119658.0,3395568.0,7515226.0
TIMES SQ-42 ST,1899419531,1820295799988,1531300707290,3271587.0,2811010.0,6082597.0
GRD CNTRL-42 ST,3224243370,797738981093,891551109355,3208546.0,2783020.0,5991566.0
86 ST,2394732623,569403272649,853932780047,3319410.0,2654236.0,5973646.0
CANAL ST,784349905,1148138236746,1497651025647,3075409.0,2821580.0,5896989.0
42 ST-PORT AUTH,853837601,1919482120362,1701123794014,3555846.0,2339638.0,5895484.0
34 ST-HERALD SQ,1233184883,1376928671543,1665971555220,2812154.0,2931871.0,5744025.0
125 ST,1772806511,2402360486249,1466854773661,2931941.0,2445409.0,5377350.0
