## Data Frame to Summarize Turnstile Entries & Exits

In [118]:
import datetime as dt
import pandas as pd
import numpy as np

In [119]:
# Read pickled datafile
turnstile_df = pd.read_pickle('data.pkl')

In [120]:
turnstile_df['ENTRIES'] = turnstile_df['ENTRIES'].astype(np.int)

In [121]:
turnstile_df['EXITS'] = turnstile_df['EXITS'].astype(np.int)

In [122]:
turnstile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10303675 entries, 0 to 10303674
Data columns (total 11 columns):
C/A         object
UNIT        object
SCP         object
STATION     object
LINENAME    object
DIVISION    object
DATE        object
TIME        object
DESC        object
ENTRIES     int64
EXITS       int64
dtypes: int64(2), object(9)
memory usage: 864.7+ MB


In [123]:
turnstile_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

#### Extract Date & Time Features:

In [124]:
date = pd.to_datetime(turnstile_df['DATE'], format='%m/%d/%Y')
time = pd.to_datetime(turnstile_df['TIME'], format='%H:%M:%S')

turnstile_df['year'] = date.dt.year
turnstile_df['month'] = date.dt.month
turnstile_df['day'] = date.dt.day
turnstile_df['day_of_week'] = date.dt.weekday
turnstile_df['hour'] = time.dt.hour
turnstile_df['minute'] = time.dt.minute

In [125]:
# Create concatenated field for Turnstile Key (UNIT + SCP)
turnstile_df.insert(0, 'TURNSTILE', turnstile_df['UNIT']+'|'+turnstile_df['SCP'])

In [126]:
# Create field to flag weekday or weekend transactions
turnstile_df.loc[turnstile_df['day_of_week'] < 5, 'WD vs WE'] = 'WD'
turnstile_df.loc[turnstile_df['day_of_week'] > 4, 'WD vs WE'] = 'WE'

In [127]:
turnstile_df.head(10)

Unnamed: 0,TURNSTILE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,year,month,day,day_of_week,hour,minute,WD vs WE
0,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,00:00:00,REGULAR,6989774,2370411,2019,3,23,5,0,0,WE
1,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,04:00:00,REGULAR,6989795,2370413,2019,3,23,5,4,0,WE
2,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,08:00:00,REGULAR,6989813,2370436,2019,3,23,5,8,0,WE
3,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,12:00:00,REGULAR,6989924,2370512,2019,3,23,5,12,0,WE
4,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,16:00:00,REGULAR,6990200,2370573,2019,3,23,5,16,0,WE
5,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/23/2019,20:00:00,REGULAR,6990562,2370623,2019,3,23,5,20,0,WE
6,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/24/2019,00:00:00,REGULAR,6990734,2370648,2019,3,24,6,0,0,WE
7,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/24/2019,04:00:00,REGULAR,6990758,2370653,2019,3,24,6,4,0,WE
8,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/24/2019,08:00:00,REGULAR,6990772,2370676,2019,3,24,6,8,0,WE
9,R051|02-00-00,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/24/2019,12:00:00,REGULAR,6990860,2370731,2019,3,24,6,12,0,WE


# Calculate number of entries in each time period (`entries_delta`)

- Group DataFrame by turnstile and sort by date
- Define an appropriate time period
- Check for inconsistent data

In order to identify unique turnstiles, we use two key definitions from the [MTA transit toolkit](http://transitdatatoolkit.com/lessons/subway-turnstile-data/):

- `UNIT`: The remote unit is a collection of turnstiles... there can be multiple remote units one station
- `SCP`: Subunit channel position represents a turnstile... the same number can be used at different stations

Together, `UNIT` and `SCP` make a unique identifier for NYC turnstiles. 

In [128]:
# Create dataframe that consolidates each Turnstile (Unit + SCP); take the max entries & exits
# This will be used to determine the difference between Entries & Exits for each day
cons_turnstile_df = turnstile_df    \
                        .groupby(['TURNSTILE', 'STATION', 'year','month','day','hour', 'WD vs WE'])    \
                        .agg({'ENTRIES':'max', 'EXITS':'max'}).reset_index()

In [129]:
cons_turnstile_df.sort_values(['TURNSTILE', 'STATION', 'year','month','day','hour', 'WD vs WE'], inplace=True)

In [130]:
cons_turnstile_df.head(10)

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472


In [131]:
cons_turnstile_df['Entries Delta'] = cons_turnstile_df['ENTRIES']    \
                                            .rolling(2)    \
                                            .apply(lambda x: x[1]-x[0] if abs(x[1]-x[0]) < 5000 else np.nan ,raw=True)


In [132]:
cons_turnstile_df.head(10)

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186,12.0
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191,23.0
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404,230.0
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423,46.0
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426,7.0
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432,18.0
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0


In [133]:
cons_turnstile_df[cons_turnstile_df['Entries Delta'] < 0]

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta
61519,R001|02-06-01,SOUTH FERRY,2016,2,27,0,WE,1,167,-1781.0
64824,R001|02-06-03,SOUTH FERRY,2018,2,24,0,WE,2,23,-190.0
67042,R001|02-06-05,SOUTH FERRY,2018,2,24,0,WE,1,26,-178.0
120368,R007|00-00-02,104 ST,2016,2,27,7,WE,2026156755,622219189,-31.0
120369,R007|00-00-02,104 ST,2016,2,27,11,WE,2026156690,622219172,-65.0
120370,R007|00-00-02,104 ST,2016,2,27,15,WE,2026156624,622219149,-66.0
120371,R007|00-00-02,104 ST,2016,2,27,19,WE,2026156583,622219099,-41.0
120372,R007|00-00-02,104 ST,2016,2,27,23,WE,2026156573,622219079,-10.0
120373,R007|00-00-02,104 ST,2016,2,28,3,WE,2026156571,622219061,-2.0
120374,R007|00-00-02,104 ST,2016,2,28,7,WE,2026156555,622219058,-16.0


In [134]:
cons_turnstile_df.iloc[120360:120375, :]

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta
120360,R007|00-00-01,104 ST,2019,5,23,20,WD,1121292500,1907495231,57.0
120361,R007|00-00-01,104 ST,2019,5,24,0,WD,1121292518,1907495171,18.0
120362,R007|00-00-01,104 ST,2019,5,24,4,WD,1121292523,1907495166,5.0
120363,R007|00-00-01,104 ST,2019,5,24,8,WD,1121292725,1907495133,202.0
120364,R007|00-00-01,104 ST,2019,5,24,12,WD,1121292859,1907495096,134.0
120365,R007|00-00-01,104 ST,2019,5,24,16,WD,1121292940,1907495022,81.0
120366,R007|00-00-01,104 ST,2019,5,24,20,WD,1121293020,1907494900,80.0
120367,R007|00-00-02,104 ST,2016,2,27,3,WE,2026156786,622219196,
120368,R007|00-00-02,104 ST,2016,2,27,7,WE,2026156755,622219189,-31.0
120369,R007|00-00-02,104 ST,2016,2,27,11,WE,2026156690,622219172,-65.0


In [135]:
cons_turnstile_df2 = cons_turnstile_df.copy()
cons_turnstile_df2.drop(columns=['Entries Delta'], inplace=True)

In [136]:
cons_turnstile_df2['Entries Delta'] = cons_turnstile_df2.groupby(['TURNSTILE', 'year'])['ENTRIES']    \
                        .apply(lambda x: x.rolling(2)    \
                        .apply(lambda x: x[1] - x[0] if ((x[1] - x[0]) >= 0) and    \
                               ((x[1] - x[0]) <= 4800) else np.nan, raw=True))

In [137]:
cons_turnstile_df2['Exits Delta'] = cons_turnstile_df2.groupby(['TURNSTILE', 'year'])['EXITS']    \
                        .apply(lambda x: x.rolling(2)    \
                        .apply(lambda x: x[1] - x[0] if ((x[1] - x[0]) >= 0) and    \
                               ((x[1] - x[0]) <= 4800) else np.nan, raw=True))

In [138]:
cons_turnstile_df2['Cum. Entries & Exits'] = cons_turnstile_df2['Entries Delta'] + cons_turnstile_df2['Exits Delta']

In [139]:
cons_turnstile_df2.head(10)

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,,,
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186,12.0,7.0,19.0
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191,23.0,5.0,28.0
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0,62.0,202.0
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0,75.0,297.0
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404,230.0,76.0,306.0
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423,46.0,19.0,65.0
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426,7.0,3.0,10.0
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432,18.0,6.0,24.0
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0,40.0,110.0


In [140]:
# Create Key in cons_turnstile_df2 for TURNSTILE + hour
cons_turnstile_df2['TURNSTILE'] = cons_turnstile_df2['TURNSTILE'].astype(str)
cons_turnstile_df2['hour'] = cons_turnstile_df2['hour'].astype(str)
cons_turnstile_df2['turnstile_hour_key'] = cons_turnstile_df2['TURNSTILE'] + '|' + cons_turnstile_df2['hour']
cons_turnstile_df2.head(10)

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits,turnstile_hour_key
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,,,,R001|00-00-00|0
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186,12.0,7.0,19.0,R001|00-00-00|4
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191,23.0,5.0,28.0,R001|00-00-00|8
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0,62.0,202.0,R001|00-00-00|12
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0,75.0,297.0,R001|00-00-00|16
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404,230.0,76.0,306.0,R001|00-00-00|20
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423,46.0,19.0,65.0,R001|00-00-00|0
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426,7.0,3.0,10.0,R001|00-00-00|4
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432,18.0,6.0,24.0,R001|00-00-00|8
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0,40.0,110.0,R001|00-00-00|12


##### NOTE:
**"cons_turnstile_df2"** to be used further down for merging with **"avg_volume_df"**

### CALCULATE THE AVERAGE VOLUME (by Turnstile for each hour timeslot)

In [141]:
cons_turnstile_df3 = cons_turnstile_df2.groupby(['TURNSTILE', 'STATION', 'WD vs WE', 'hour'])    \
                                       .agg({'Cum. Entries & Exits':'sum'}).reset_index()

In [142]:
cons_turnstile_df3['Cum. Entries & Exits'].isnull().sum()

0

In [143]:
# Create Key in cons_turnstile_df3 for TURNSTILE + hour
cons_turnstile_df3['TURNSTILE'] = cons_turnstile_df3['TURNSTILE'].astype(str)
cons_turnstile_df3['hour'] = cons_turnstile_df3['hour'].astype(str)
cons_turnstile_df3['turnstile_hour_key'] = cons_turnstile_df3['TURNSTILE'] + '|' + cons_turnstile_df3['hour']
cons_turnstile_df3

Unnamed: 0,TURNSTILE,STATION,WD vs WE,hour,Cum. Entries & Exits,turnstile_hour_key
0,R001|00-00-00,WHITEHALL S-FRY,WD,0,8959.0,R001|00-00-00|0
1,R001|00-00-00,WHITEHALL S-FRY,WD,1,42085.0,R001|00-00-00|1
2,R001|00-00-00,WHITEHALL S-FRY,WD,10,174.0,R001|00-00-00|10
3,R001|00-00-00,WHITEHALL S-FRY,WD,11,0.0,R001|00-00-00|11
4,R001|00-00-00,WHITEHALL S-FRY,WD,12,29651.0,R001|00-00-00|12
5,R001|00-00-00,WHITEHALL S-FRY,WD,13,135373.0,R001|00-00-00|13
6,R001|00-00-00,WHITEHALL S-FRY,WD,16,22942.0,R001|00-00-00|16
7,R001|00-00-00,WHITEHALL S-FRY,WD,17,162958.0,R001|00-00-00|17
8,R001|00-00-00,WHITEHALL S-FRY,WD,18,620.0,R001|00-00-00|18
9,R001|00-00-00,WHITEHALL S-FRY,WD,19,26.0,R001|00-00-00|19


In [144]:
# Create average_volume_df to contain average total entries & exits for each Turnstile for every hour timeslot
# Create Key in cons_turnstile_df3 for TURNSTILE + hour
average_volume_df = cons_turnstile_df3.groupby(['TURNSTILE', 'hour']).agg({'Cum. Entries & Exits':'mean'}).reset_index()
average_volume_df.rename(columns={'Cum. Entries & Exits': 'Mean Entries & Exits'}, inplace=True)
average_volume_df['TURNSTILE'] = average_volume_df['TURNSTILE'].astype(str)
average_volume_df['hour'] = average_volume_df['hour'].astype(str)
average_volume_df['turnstile_hour_key'] = average_volume_df['TURNSTILE'] + '|' + average_volume_df['hour']
average_volume_df.drop(columns=['TURNSTILE', 'hour'], inplace=True)
average_volume_df

Unnamed: 0,Mean Entries & Exits,turnstile_hour_key
0,7179.0,R001|00-00-00|0
1,30921.5,R001|00-00-00|1
2,174.0,R001|00-00-00|10
3,156.0,R001|00-00-00|11
4,18602.5,R001|00-00-00|12
5,87005.0,R001|00-00-00|13
6,16859.5,R001|00-00-00|16
7,108286.0,R001|00-00-00|17
8,620.0,R001|00-00-00|18
9,26.0,R001|00-00-00|19


### CREATE FINAL MERGED TABLE WITH ORIGINAL DATA + AVG ENTRIES & EXITS

In [145]:
cons_turnstile_df4 = pd.merge(cons_turnstile_df2, average_volume_df, on='turnstile_hour_key', how='left')
cons_turnstile_df4

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits,turnstile_hour_key,Mean Entries & Exits
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,,,,R001|00-00-00|0,7179.0
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186,12.0,7.0,19.0,R001|00-00-00|4,1766.5
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191,23.0,5.0,28.0,R001|00-00-00|8,11694.5
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0,62.0,202.0,R001|00-00-00|12,18602.5
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0,75.0,297.0,R001|00-00-00|16,16859.5
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404,230.0,76.0,306.0,R001|00-00-00|20,22781.5
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423,46.0,19.0,65.0,R001|00-00-00|0,7179.0
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426,7.0,3.0,10.0,R001|00-00-00|4,1766.5
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432,18.0,6.0,24.0,R001|00-00-00|8,11694.5
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0,40.0,110.0,R001|00-00-00|12,18602.5


In [146]:
cons_turnstile_df4[np.isnan(cons_turnstile_df4['Cum. Entries & Exits'])]

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits,turnstile_hour_key,Mean Entries & Exits
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,,,,R001|00-00-00|0,7179.0
551,R001|00-00-00,WHITEHALL S-FRY,2017,2,25,0,WE,2243265,1838586,,,,R001|00-00-00|0,7179.0
1102,R001|00-00-00,WHITEHALL S-FRY,2018,2,24,0,WE,3097287,2197711,,,,R001|00-00-00|0,7179.0
1647,R001|00-00-00,WHITEHALL S-FRY,2019,2,23,0,WE,4104196,2551855,,,,R001|00-00-00|0,7179.0
2199,R001|00-00-01,WHITEHALL S-FRY,2016,2,27,0,WE,1411875,1122986,,,,R001|00-00-01|0,6934.0
2751,R001|00-00-01,WHITEHALL S-FRY,2017,2,25,0,WE,1873943,1370803,,,,R001|00-00-01|0,6934.0
3303,R001|00-00-01,WHITEHALL S-FRY,2018,2,24,0,WE,2568734,1749953,,,,R001|00-00-01|0,6934.0
3848,R001|00-00-01,WHITEHALL S-FRY,2019,2,23,0,WE,3320946,2137303,,,,R001|00-00-01|0,6934.0
4401,R001|00-00-02,WHITEHALL S-FRY,2016,2,27,0,WE,467954,416107,,,,R001|00-00-02|0,6528.0
4953,R001|00-00-02,WHITEHALL S-FRY,2017,2,25,0,WE,873995,728968,,,,R001|00-00-02|0,6528.0


### REPLACE ALL NaN Values in "Cum. Entries & Exits" with MEAN

In [147]:
cons_turnstile_df5 = cons_turnstile_df4.copy()

In [148]:
cons_turnstile_df5.loc[np.isnan(cons_turnstile_df5['Cum. Entries & Exits']),    \
                                            'Cum. Entries & Exits'] = cons_turnstile_df5['Mean Entries & Exits']

In [149]:
cons_turnstile_df5['hour'] = cons_turnstile_df5['hour'].astype(int)

In [150]:
cons_turnstile_df5

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits,turnstile_hour_key,Mean Entries & Exits
0,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,0,WE,1699119,1615179,,,7179.0,R001|00-00-00|0,7179.0
1,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,4,WE,1699131,1615186,12.0,7.0,19.0,R001|00-00-00|4,1766.5
2,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,8,WE,1699154,1615191,23.0,5.0,28.0,R001|00-00-00|8,11694.5
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0,62.0,202.0,R001|00-00-00|12,18602.5
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0,75.0,297.0,R001|00-00-00|16,16859.5
5,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,20,WE,1699746,1615404,230.0,76.0,306.0,R001|00-00-00|20,22781.5
6,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,0,WE,1699792,1615423,46.0,19.0,65.0,R001|00-00-00|0,7179.0
7,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,4,WE,1699799,1615426,7.0,3.0,10.0,R001|00-00-00|4,1766.5
8,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,8,WE,1699817,1615432,18.0,6.0,24.0,R001|00-00-00|8,11694.5
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0,40.0,110.0,R001|00-00-00|12,18602.5


In [151]:
cons_turnstile_df5['hour'].unique()

array([ 0,  4,  8, 12, 16, 20,  5,  9, 13, 17, 21,  1,  7, 10, 11, 18, 19,
       14,  3, 15, 23,  6, 22,  2])

### Create new DF to remove rows with timestamp below 8am and above 8pm

In [152]:
cons_turnstile_df6 = cons_turnstile_df5[(cons_turnstile_df5['hour'] > 8) & (cons_turnstile_df5['hour'] < 20)]
cons_turnstile_df6['hour'].unique()

array([12, 16,  9, 13, 17, 10, 11, 18, 19, 14, 15])

In [153]:
cons_turnstile_df6

Unnamed: 0,TURNSTILE,STATION,year,month,day,hour,WD vs WE,ENTRIES,EXITS,Entries Delta,Exits Delta,Cum. Entries & Exits,turnstile_hour_key,Mean Entries & Exits
3,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,12,WE,1699294,1615253,140.0,62.0,202.0,R001|00-00-00|12,18602.5
4,R001|00-00-00,WHITEHALL S-FRY,2016,2,27,16,WE,1699516,1615328,222.0,75.0,297.0,R001|00-00-00|16,16859.5
9,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,12,WE,1699887,1615472,70.0,40.0,110.0,R001|00-00-00|12,18602.5
10,R001|00-00-00,WHITEHALL S-FRY,2016,2,28,16,WE,1700070,1615557,183.0,85.0,268.0,R001|00-00-00|16,16859.5
15,R001|00-00-00,WHITEHALL S-FRY,2016,2,29,12,WD,1700602,1615748,216.0,96.0,312.0,R001|00-00-00|12,18602.5
16,R001|00-00-00,WHITEHALL S-FRY,2016,2,29,16,WD,1700725,1615781,123.0,33.0,156.0,R001|00-00-00|16,16859.5
21,R001|00-00-00,WHITEHALL S-FRY,2016,3,1,12,WD,1701261,1616002,202.0,126.0,328.0,R001|00-00-00|12,18602.5
22,R001|00-00-00,WHITEHALL S-FRY,2016,3,1,16,WD,1701408,1616073,147.0,71.0,218.0,R001|00-00-00|16,16859.5
27,R001|00-00-00,WHITEHALL S-FRY,2016,3,2,12,WD,1702020,1616282,244.0,101.0,345.0,R001|00-00-00|12,18602.5
28,R001|00-00-00,WHITEHALL S-FRY,2016,3,2,16,WD,1702179,1616327,159.0,45.0,204.0,R001|00-00-00|16,16859.5


In [154]:
cons_turnstile_df6.to_pickle('data_clean.pkl')

## PLOTTING DATA:

In [None]:
import seaborn as sns
import matplotlib.plyplot as plt

In [None]:
# Av