In [128]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Directories containing the raw turnstile text files
# (the directory names suggest May 2016 and May 2017).
data16 = './data/turnstile/201605'
data17 = './data/turnstile/201705'

# Column -> dtype mapping passed to pd.read_csv: the nine descriptive columns are
# kept as 'object' (strings); the two counter columns are read as 64-bit integers.
datatypes = {
    **dict.fromkeys(
        ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
         'DIVISION', 'DATE', 'TIME', 'DESC'],
        'object',
    ),
    'ENTRIES': 'int64',
    'EXITS': 'int64',
}

In [3]:
def concat_data(datadir):
    """Read every data file directly inside ``datadir`` into one DataFrame.

    Each file is parsed as comma-separated text using the module-level
    ``datatypes`` mapping. Column names are stripped of surrounding whitespace
    and lower-cased, and a ``datetime`` column is added by parsing the ``date``
    and ``time`` columns joined with a space.

    Parameters
    ----------
    datadir : str
        Directory to scan. Sub-directories and hidden entries (names starting
        with '.') are skipped.

    Returns
    -------
    pandas.DataFrame
        All files stacked in file-name order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``datadir`` contains no readable data files.
    """
    # os.listdir gives no ordering guarantee; sorting keeps the row order (and
    # therefore any later first-occurrence selection) reproducible across runs.
    paths = sorted(
        path
        for path in (os.path.join(datadir, name) for name in os.listdir(datadir))
        if os.path.isfile(path) and not os.path.basename(path).startswith('.')
    )
    if not paths:
        raise ValueError('No turnstile files found in {!r}'.format(datadir))

    frames = []
    for path in paths:
        df = pd.read_csv(path, sep=',', dtype=datatypes)
        # Normalise the header before touching any column so that padded
        # header names cannot cause a KeyError below.
        df.columns = [col.strip().lower() for col in df.columns]
        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
        frames.append(df)

    # throw away the per-file indices when concatenating
    return pd.concat(frames, ignore_index=True)

In [4]:
df16 = concat_data(data16)

In [5]:
df17 = concat_data(data17)

In [142]:
len(df16)

777903

In [139]:
len(df17)

785089

In [140]:
# Drop repeated readings so that each device keeps one row per timestamp
# (see: www.jbencina.com/blog/2015/06/25/cleaning-nyc-turnstile-usage-data/).
# 'c/a' is part of the key so that rows from different control areas which happen to
# share 'unit' and 'scp' are never collapsed into one.
# NOTE(review): assumes a turnstile is identified by c/a + unit + scp - confirm against
# the MTA field description.
df16 = df16.drop_duplicates(subset=['c/a', 'unit', 'scp', 'datetime'])
df17 = df17.drop_duplicates(subset=['c/a', 'unit', 'scp', 'datetime'])

In [141]:
len(df17)

785087

In [147]:
# Keep only rows where both the 'entries' and 'exits' readings exceed 10,000;
# smaller readings are treated as suspiciously low.
# Note: nothing here filters on the hour of the reading.
df16 = df16[df16['entries'].gt(10000) & df16['exits'].gt(10000)]
df17 = df17[df17['entries'].gt(10000) & df17['exits'].gt(10000)]

In [148]:
len(df16)

698779

In [149]:
len(df17)

698779

In [6]:
df16.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
0,A002,R051,02-00-00,59 ST,NQR456,BMT,04/30/2016,00:00:00,REGULAR,5639941,1909983,2016-04-30 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456,BMT,04/30/2016,04:00:00,REGULAR,5639991,1909993,2016-04-30 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456,BMT,04/30/2016,08:00:00,REGULAR,5640014,1910024,2016-04-30 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456,BMT,04/30/2016,12:00:00,REGULAR,5640158,1910134,2016-04-30 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456,BMT,04/30/2016,16:00:00,REGULAR,5640454,1910197,2016-04-30 16:00:00


In [167]:
df16[df16['station']=='GRD CNTRL-42 ST']

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
148097,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,01:00:00,REGULAR,321993,806628,2016-04-30 01:00:00
148098,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,05:00:00,REGULAR,321995,806635,2016-04-30 05:00:00
148099,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,09:00:00,REGULAR,322022,806810,2016-04-30 09:00:00
148100,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,13:00:00,REGULAR,322138,807335,2016-04-30 13:00:00
148101,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,17:00:00,REGULAR,322351,807965,2016-04-30 17:00:00
148102,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,21:00:00,REGULAR,322539,808537,2016-04-30 21:00:00
148103,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,05/01/2016,01:00:00,REGULAR,322600,808783,2016-05-01 01:00:00
148104,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,05/01/2016,05:00:00,REGULAR,322602,808784,2016-05-01 05:00:00
148105,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,05/01/2016,09:00:00,REGULAR,322623,808891,2016-05-01 09:00:00
148106,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,05/01/2016,13:00:00,REGULAR,322686,809394,2016-05-01 13:00:00


In [7]:
df16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777938 entries, 0 to 777937
Data columns (total 12 columns):
c/a         777938 non-null object
unit        777938 non-null object
scp         777938 non-null object
station     777938 non-null object
linename    777938 non-null object
division    777938 non-null object
date        777938 non-null object
time        777938 non-null object
desc        777938 non-null object
entries     777938 non-null int64
exits       777938 non-null int64
datetime    777938 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 71.2+ MB


In [8]:
df17.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315,2017-04-29 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319,2017-04-29 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353,2017-04-29 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453,2017-04-29 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529,2017-04-29 16:00:00


In [9]:
df17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785089 entries, 0 to 785088
Data columns (total 12 columns):
c/a         785089 non-null object
unit        785089 non-null object
scp         785089 non-null object
station     785089 non-null object
linename    785089 non-null object
division    785089 non-null object
date        785089 non-null object
time        785089 non-null object
desc        785089 non-null object
entries     785089 non-null int64
exits       785089 non-null int64
datetime    785089 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 71.9+ MB


In [10]:
df16.columns

Index(['c/a', 'unit', 'scp', 'station', 'linename', 'division', 'date', 'time',
       'desc', 'entries', 'exits', 'datetime'],
      dtype='object')

In [151]:
# Rank (station, linename) pairs by the sum of every 'entries' / 'exits' reading.
# NOTE(review): in the head() output these columns grow from one reading to the next for
# the same device, so they look like cumulative counters; adding up every reading is
# therefore not a ridership figure - treat this table as rough exploration only.
df16.groupby(['station','linename'])[["entries", "exits"]].sum().sort_values(by=['entries', 'exits'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,entries,exits
station,linename,Unnamed: 2_level_1,Unnamed: 3_level_1
42 ST-PORT AUTH,ACENQRS1237,1297581449562,1060105329948
57 ST-7 AV,NQR,1238021517150,1074078621210
TIMES SQ-42 ST,1237ACENQRS,993969061064,779875240289
GRD CNTRL-42 ST,4567S,744709015088,146795161001
CANAL ST,JNQRZ6,648819317822,464853116818
125 ST,23,626658006510,390685329886
34 ST-HERALD SQ,BDFMNQR,563480365520,596749602738
FULTON ST,ACJZ2345,546001314810,352031990885
23 ST,FM,530529867750,556425643066
23 ST,6,524971518434,456699877380


In [12]:
df16.groupby(['station'])['datetime'].max()

station
1 AV              2016-05-27 20:00:00
103 ST            2016-05-27 21:00:00
103 ST-CORONA     2016-05-27 20:00:00
104 ST            2016-05-27 20:00:00
110 ST            2016-05-27 20:00:00
111 ST            2016-05-27 21:00:00
116 ST            2016-05-27 21:00:00
116 ST-COLUMBIA   2016-05-27 21:00:00
121 ST            2016-05-27 20:00:00
125 ST            2016-05-27 21:00:00
135 ST            2016-05-27 21:00:00
137 ST CITY COL   2016-05-27 21:00:00
138/GRAND CONC    2016-05-27 20:00:00
14 ST             2016-05-27 21:00:00
14 ST-UNION SQ    2016-05-27 21:00:00
145 ST            2016-05-27 20:00:00
149/GRAND CONC    2016-05-27 20:00:00
14TH STREET       2016-05-27 23:59:33
15 ST-PROSPECT    2016-05-27 20:30:00
155 ST            2016-05-27 21:00:00
157 ST            2016-05-27 21:00:00
161/YANKEE STAD   2016-05-27 20:22:00
163 ST-AMSTERDM   2016-05-27 21:00:00
167 ST            2016-05-27 20:00:00
168 ST            2016-05-27 20:00:00
169 ST            2016-05-27 20:00:00
170 

In [152]:
# For every (station, linename, division) group keep the single row with the earliest datetime.
# NOTE(review): only one row per group survives, so it need not belong to the same
# c/a / unit / scp as the row chosen by the idxmax counterpart. The GRD CNTRL-42 ST outputs
# in this notebook show R237/R046/01-00-00 here versus R236/R045/00-00-00 for the max row.
# Differences taken between the two frames can therefore mix counters of two devices;
# consider selecting first/last readings per turnstile and aggregating afterwards.
df16_min = df16.loc[df16.groupby(['station', 'linename', 'division'])['datetime'].idxmin()]

In [153]:
df16_min.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
29818,H007,R248,00-00-00,1 AV,L,BMT,04/30/2016,00:00:00,REGULAR,11020304,12018000,2016-04-30 00:00:00
135210,R170,R191,00-00-00,103 ST,1,IRT,04/30/2016,01:00:00,REGULAR,12592318,4496078,2016-04-30 01:00:00
155402,R252,R180,00-00-00,103 ST,6,IRT,04/30/2016,01:00:00,REGULAR,33822764,503630608,2016-04-30 01:00:00
45389,N037,R314,00-00-00,103 ST,BC,IND,04/30/2016,01:00:00,REGULAR,11851043,10101965,2016-04-30 01:00:00
178918,R529,R208,00-00-00,103 ST-CORONA,7,IRT,04/30/2016,00:00:00,REGULAR,13778232,21798134,2016-04-30 00:00:00


In [154]:
# For every (station, linename, division) group keep the single row with the latest datetime.
# NOTE(review): only one row per group survives, so it need not belong to the same
# c/a / unit / scp as the row chosen by the idxmin counterpart. The GRD CNTRL-42 ST outputs
# in this notebook show R236/R045/00-00-00 here versus R237/R046/01-00-00 for the min row.
# Differences taken between the two frames can therefore mix counters of two devices;
# consider selecting first/last readings per turnstile and aggregating afterwards.
df16_max = df16.loc[df16.groupby(['station', 'linename', 'division'])['datetime'].idxmax()]

In [166]:
df16_max[df16_max['station']=='GRD CNTRL-42 ST']

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime,stationlinediv
731211,R236,R045,00-00-00,GRD CNTRL-42 ST,4567S,IRT,05/27/2016,21:00:00,REGULAR,347011,933521,2016-05-27 21:00:00,GRD CNTRL-42 ST-4567S-IRT


In [165]:
df16_min[df16_min['station']=='GRD CNTRL-42 ST']

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime,stationlinediv
148601,R237,R046,01-00-00,GRD CNTRL-42 ST,4567S,IRT,04/30/2016,00:00:00,REGULAR,1057788,3274938,2016-04-30,GRD CNTRL-42 ST-4567S-IRT


In [157]:
len(df16_min)

476

In [158]:
len(df16_max)

476

In [159]:
# Composite key "<station>-<linename>-<division>" used to look rows up by station later on.
df16_max['stationlinediv'] = df16_max['station'].str.cat(
    [df16_max['linename'], df16_max['division']], sep='-'
)

In [160]:
# Composite key "<station>-<linename>-<division>" used to look rows up by station later on.
df16_min['stationlinediv'] = df16_min['station'].str.cat(
    [df16_min['linename'], df16_min['division']], sep='-'
)

In [161]:
df16_max.shape

(476, 13)

In [171]:
# calculate difference for each station in each month
def calculate_monthly(df_min, df_max):
    """Build a per-station table of absolute counter differences.

    Parameters
    ----------
    df_min, df_max : pandas.DataFrame
        Frames with a 'stationlinediv' key column plus 'entries' and 'exits'.
        ``df_max`` must also carry 'station', 'linename' and 'division'.
        When a key occurs more than once in a frame, only its first row is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'stationlinediv' of ``df_min`` (in order of first
        appearance) with the columns 'station', 'linename', 'division',
        'month_entry_total' and 'month_exit_total'.

    Raises
    ------
    IndexError
        If a key present in ``df_min`` has no row in ``df_max``.

    Notes
    -----
    The subtraction is only meaningful when the two rows paired for a key are
    readings of the same counter; this function has no way to verify that.
    """
    key = 'stationlinediv'
    headers = ['station', 'linename', 'division', 'month_entry_total', 'month_exit_total']

    # First row per key on each side; descriptive columns are taken from df_max.
    opening = df_min.drop_duplicates(subset=key)[[key, 'entries', 'exits']]
    closing = df_max.drop_duplicates(subset=key)[
        [key, 'station', 'linename', 'division', 'entries', 'exits']
    ]

    missing = opening.loc[~opening[key].isin(closing[key]), key].tolist()
    if missing:
        raise IndexError(
            'stationlinediv value(s) missing from df_max: {}'.format(missing)
        )

    # One keyed join instead of several full-frame boolean scans per station.
    # A left join keeps df_min's key order; every key is known to match here.
    paired = opening.merge(closing, on=key, how='left', suffixes=('_min', '_max'))

    # evidently an absolute calculation gives us a more approximate sense where there
    # are turnstile errors in counting (resets, "time traveling turnstiles")
    paired['month_entry_total'] = (paired['entries_max'] - paired['entries_min']).abs()
    paired['month_exit_total'] = (paired['exits_max'] - paired['exits_min']).abs()

    return paired[headers]

In [169]:
df16_diff = calculate_monthly(df16_min, df16_max)

In [170]:
df16_diff.sort_values(by=['month_entry_total', 'month_exit_total'], ascending=False)

Unnamed: 0,station,linename,division,month_entry_total,month_exit_total
306,FULTON ST,ACJZ2345,IND,7259506,4143438
104,42 ST-PORT AUTH,ACENQRS1237,IND,6070261,33949622
176,ATL AV-BARCLAY,BDNQR2345,BMT,5706210,12080456
303,FULTON ST,2345ACJZ,IND,2598881,4316267
167,9TH STREET,1,PTH,1672765,1479241
407,PATH WTC,1,PTH,1390661,487061
452,TWENTY THIRD ST,1,PTH,1265757,47219
473,WORLD TRADE CTR,ACE23,IND,1073927,12542
323,HARRISON,1,PTH,1048603,539330
315,GRD CNTRL-42 ST,4567S,IRT,710777,2341417


In [173]:
# For every (station, linename, division) group keep the single row with the earliest datetime.
# NOTE(review): only one row per group survives, so it may come from a different
# c/a / unit / scp than the row picked by the idxmax counterpart; differences taken between
# the two frames can then mix counters of two devices. Consider selecting first/last
# readings per turnstile and aggregating afterwards.
df17_min = df17.loc[df17.groupby(['station', 'linename', 'division'])['datetime'].idxmin()]

In [174]:
# For every (station, linename, division) group keep the single row with the latest datetime.
# NOTE(review): only one row per group survives, so it may come from a different
# c/a / unit / scp than the row picked by the idxmin counterpart; differences taken between
# the two frames can then mix counters of two devices. Consider selecting first/last
# readings per turnstile and aggregating afterwards.
df17_max = df17.loc[df17.groupby(['station', 'linename', 'division'])['datetime'].idxmax()]

In [175]:
# Composite key "<station>-<linename>-<division>" used to look rows up by station later on.
df17_min['stationlinediv'] = df17_min['station'].str.cat(
    [df17_min['linename'], df17_min['division']], sep='-'
)

In [176]:
# Composite key "<station>-<linename>-<division>" used to look rows up by station later on.
df17_max['stationlinediv'] = df17_max['station'].str.cat(
    [df17_max['linename'], df17_max['division']], sep='-'
)

In [177]:
df17_diff = calculate_monthly(df17_min, df17_max)

In [178]:
df17_diff.sort_values(by=['month_entry_total', 'month_exit_total'], ascending=False)

Unnamed: 0,station,linename,division,month_entry_total,month_exit_total
73,23 ST,CE,IND,102439453,1359248134
448,SUTTER AV,L,BMT,11753905,15777686
267,CRESCENT ST,JZ,BMT,7868497,11835897
104,42 ST-PORT AUTH,ACENQRS1237W,IND,5235520,35350331
306,FULTON ST,2345ACJZ,IND,2318143,4652172
147,79 ST,1,IRT,1544275,319354
326,HARRISON,1,PTH,1296318,189645
456,TWENTY THIRD ST,1,PTH,851078,561350
449,THIRTY ST,1,PTH,789263,738965
450,THIRTY THIRD ST,1,PTH,417204,282474
