In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [79]:
data_dir = './mta_data/'

In [80]:
dname = '200627'
df = pd.read_csv(data_dir+'turnstile_'+dname+'.txt', parse_dates=[['DATE','TIME']])

In [122]:
df.columns = list(map((lambda x: x.strip() if isinstance(x, str) else x), df.columns.values))

In [81]:
# Create UID to uniquely identify a turnstile by (C/A, UNIT, SCP, STATION)
df['TUID'] = pd.factorize(df['C/A'] + df['UNIT'] + df['SCP'] + df['STATION'])[0]

In [82]:
# Create date column to make grouping by date easier
df['DATE'] = df['DATE_TIME'].dt.date

In [127]:
# Sort by turnstile TUID and DATE_TIME
df.sort_values(['TUID','DATE_TIME'])

# Group by TUID and DATE_TIME and calculate deltas between rows
tuid_groups = df.groupby(['TUID'])
df['I_ENTRIES'] = tuid_groups['ENTRIES'].diff().shift(-1)
df['I_EXITS'] = tuid_groups['EXITS'].diff().shift(-1)
df.head(5)

Unnamed: 0,DATE_TIME,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,TUID,DATE,I_ENTRIES,I_EXITS
0,2020-06-20 00:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7424218,2522558,0,2020-06-20,2.0,1.0
1,2020-06-20 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7424220,2522559,0,2020-06-20,11.0,13.0
2,2020-06-20 08:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7424231,2522572,0,2020-06-20,34.0,18.0
3,2020-06-20 12:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7424265,2522590,0,2020-06-20,75.0,14.0
4,2020-06-20 16:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7424340,2522604,0,2020-06-20,75.0,8.0


In [130]:
df.groupby(['TUID', 'DATE']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES,EXITS,I_ENTRIES,I_EXITS
TUID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2020-06-20,44545689,15135495,223.0,64.0
0,2020-06-21,44546873,15135841,143.0,52.0
0,2020-06-22,44548086,15136432,377.0,153.0
0,2020-06-23,44550245,15137335,346.0,139.0
0,2020-06-24,44552381,15138204,368.0,156.0
...,...,...,...,...,...
4942,2020-06-22,33324,3084,0.0,0.0
4942,2020-06-23,33324,3084,0.0,0.0
4942,2020-06-24,33324,3084,0.0,0.0
4942,2020-06-25,33324,3084,0.0,0.0


In [32]:
# Gets total number of entries and exits for each turnstile on a given day
df_by_turnstile = df.groupby(['DATE','UID']).sum()
df_by_turnstile

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES,EXITS
DATE,UID,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-20,0,44545689,15135495
2020-06-20,1,39586492,8921343
2020-06-20,2,8274656,31156934
2020-06-20,3,8714597,13543898
2020-06-20,4,39298331,52448931
...,...,...,...
2020-06-26,4938,20967751,263907
2020-06-26,4939,4518162,184170
2020-06-26,4940,18800934,2384406
2020-06-26,4941,18546,372


In [38]:
# Get densest station by ridership / num_turnstiles


['1 AV',
 '103 ST',
 '103 ST-CORONA',
 '104 ST',
 '110 ST',
 '111 ST',
 '116 ST',
 '116 ST-COLUMBIA',
 '121 ST',
 '125 ST',
 '135 ST',
 '137 ST CITY COL',
 '138/GRAND CONC',
 '14 ST',
 '14 ST-UNION SQ',
 '145 ST',
 '149/GRAND CONC',
 '14TH STREET',
 '15 ST-PROSPECT',
 '155 ST',
 '157 ST',
 '161/YANKEE STAD',
 '163 ST-AMSTERDM',
 '167 ST',
 '168 ST',
 '169 ST',
 '170 ST',
 '174 ST',
 '174-175 STS',
 '175 ST',
 '176 ST',
 '18 AV',
 '18 ST',
 '181 ST',
 '182-183 STS',
 '183 ST',
 '190 ST',
 '191 ST',
 '2 AV',
 '20 AV',
 '207 ST',
 '21 ST',
 '21 ST-QNSBRIDGE',
 '215 ST',
 '219 ST',
 '225 ST',
 '23 ST',
 '231 ST',
 '233 ST',
 '238 ST',
 '25 AV',
 '25 ST',
 '28 ST',
 '3 AV',
 '3 AV 138 ST',
 '3 AV-149 ST',
 '30 AV',
 '33 ST',
 '33 ST-RAWSON ST',
 '34 ST-HERALD SQ',
 '34 ST-HUDSON YD',
 '34 ST-PENN STA',
 '36 AV',
 '36 ST',
 '39 AV',
 '4 AV-9 ST',
 '40 ST LOWERY ST',
 '42 ST-BRYANT PK',
 '42 ST-PORT AUTH',
 '45 ST',
 '46 ST',
 '46 ST BLISS ST',
 '47-50 STS ROCK',
 '49 ST',
 '4AV-9 ST',
 '5 AV

In [19]:
df['STATION'].unique().shape

(379,)