In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype

In [2]:
#Format days of the week to not appear alphabetically on visuals
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cat_type = CategoricalDtype(categories=cats, ordered=True)

In [12]:
# Source: http://web.mta.info/developers/turnstile.html
df = pd.concat(map(pd.read_csv, (['http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190511.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190518.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190525.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190601.txt'])),ignore_index=True)

df.shape

(1026784, 11)

In [13]:
df.rename(columns = {
    'C/A': 'area',
    'UNIT': 'unit',
    'SCP': 'scp',
    'STATION': 'station',
    'DATE': 'date',
    'TIME': 'time',
    'ENTRIES': 'entries',
    'EXITS                                                               ': 'exits'},
    inplace = True)

In [14]:
df.sort_values(['station','area','unit','scp','date','time'], inplace=True)
df.head(30)

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800
31054,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,20:00:00,REGULAR,14647395,16373826
31055,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,00:00:00,REGULAR,14647395,16373849
31056,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,04:00:00,REGULAR,14647395,16373865
31057,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,08:00:00,REGULAR,14647395,16373883
31058,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,12:00:00,REGULAR,14647395,16373909


In [15]:
#We can combine all three to create a unique ID for any turnstile
df['unit_id'] = df.scp+' '+df.area +' '+df.unit

# Create datetime variable
df['DateTime'] = pd.to_datetime(df.date+' '+df.time)

#Record what day of the week each entry is from
df['Day'] = df.DateTime.dt.day_name()

#Calculate the entries/exits in a four hour period by finding the difference between rows
df['four hour entries'] = df.groupby('unit_id').entries.diff().values
df['four hour exits'] = df.groupby('unit_id').exits.diff().values

#Replace NaN, negative values, and impossibly large values by the average of the turnstile
df['four hour entries'].fillna(-1, inplace=True)
df['four hour exits'].fillna(-1, inplace=True)
df['four hour entries'] = df.groupby(['unit_id','area'])['four hour entries'].transform(
        lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))
df['four hour exits'] = df.groupby(['unit_id','area'])['four hour exits'].transform(
        lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))
df.head()

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.0,15.0
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.0,23.0
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.0,34.0
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.0,34.0


In [16]:
# Find total flow over 4 hour blocks
df['total flow'] = df['four hour entries']+df['four hour exits']
df.head()

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits,total flow
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731,786.753304
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.0,15.0,15.0
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.0,23.0,24.0
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.0,34.0,34.0
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.0,34.0,34.0


In [17]:
#Find daily sum for each turnstile for each day of the week
df_daily = df.groupby(['unit_id', 'station','date'])
df_daily.head(50)

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits,total flow
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731,786.753304
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.000000,15.000000,15.000000
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.000000,23.000000,24.000000
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.000000,34.000000,34.000000
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.000000,34.000000,34.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002164,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,05:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 05:00:00,Friday,0.000000,0.000000,0.000000
1002165,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,09:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 09:00:00,Friday,0.000000,0.000000,0.000000
1002166,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,13:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 13:00:00,Friday,0.000000,0.000000,0.000000
1002167,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,17:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 17:00:00,Friday,0.000000,0.000000,0.000000


In [19]:
#Find daily sum for each STATION for each day of the week
df_daily = df.groupby(['station','date'])['total flow'].sum()
df_daily.head(20)

station  date      
1 AV     04/27/2019    16904.392871
         04/28/2019    12444.000000
         04/29/2019    36454.000000
         04/30/2019    39009.000000
         05/01/2019    40345.000000
         05/02/2019    40741.000000
         05/03/2019    42397.000000
         05/04/2019    17728.000000
         05/05/2019    13728.000000
         05/06/2019    36691.000000
         05/07/2019    39146.000000
         05/08/2019    40615.000000
         05/09/2019    41254.000000
         05/10/2019    41024.000000
         05/11/2019    18131.000000
         05/12/2019    11859.000000
         05/13/2019    36908.000000
         05/14/2019    40150.000000
         05/15/2019    41294.000000
         05/16/2019    40763.000000
Name: total flow, dtype: float64

In [20]:
ridership = df.groupby(['station'])['total flow'].sum()
top_10 = ridership.sort_values(ascending=False).reset_index().round(0).head(10)
top_10

Unnamed: 0,station,total flow
0,34 ST-PENN STA,9764486.0
1,GRD CNTRL-42 ST,8228594.0
2,34 ST-HERALD SQ,6902750.0
3,14 ST-UNION SQ,6174981.0
4,TIMES SQ-42 ST,5997518.0
5,23 ST,5912547.0
6,FULTON ST,5341551.0
7,42 ST-PORT AUTH,5320527.0
8,86 ST,4878903.0
9,125 ST,4311430.0
