# MTA Turnstile Data Exploratory Data Analysis

Initial Exploratory Data Analysis on smaller sample of data

In [1]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [2]:
# Focus just on 3 weeks worth of data
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)
        
week_nums = [210109, 210116, 210123]
df = get_data(week_nums)

In [3]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,03:00:00,REGULAR,7511653,2558871
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,07:00:00,REGULAR,7511655,2558877
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,11:00:00,REGULAR,7511677,2558930
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,15:00:00,REGULAR,7511766,2558968
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,19:00:00,REGULAR,7511912,2558999


In [4]:
print(df.dtypes)
print(df.columns)
print(df.shape)

C/A                                                                     object
UNIT                                                                    object
SCP                                                                     object
STATION                                                                 object
LINENAME                                                                object
DIVISION                                                                object
DATE                                                                    object
TIME                                                                    object
DESC                                                                    object
ENTRIES                                                                  int64
EXITS                                                                    int64
dtype: object
Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXI

### Data Cleaning
**1. Identifying problems with the dataset**
   - Extra whitespace in EXITS column
   - Inconsistent datatype: 
       - DATE, TIME (datetime object)
       - ENTRIES, EXITS (int)
   - Any Missing Values
   - Any duplicates of the same datetime
   - Drop any unnecessary columns or rows

In [5]:
# rename columns to remove whitespace
df.columns = [column.strip() for column in df.columns]
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [6]:
from datetime import datetime, timedelta
# Create DATETIME column from DATE, TIME columns
df['DATETIME'] = pd.to_datetime(df['DATE'] + " " + df['TIME'],
                                   format="%m/%d/%Y %H:%M:%S")
# Create DAY_OF_WEEK column from DATETIME
df['DAY_OF_WEEK'] = df[['DATETIME']]\
    .apply(lambda x: datetime.strftime(x['DATETIME'], "%A"), axis=1)

In [7]:
#change dtypes
df['ENTRIES']=df.ENTRIES.astype('int')
df['EXITS']=df.EXITS.astype('int')

In [8]:
#Checking for duplicates 
(df
.groupby(['C/A','UNIT','SCP','STATION','DATE','TIME'])
.ENTRIES.count()
.reset_index()
.sort_values("ENTRIES", ascending=False)).head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TIME,ENTRIES
308795,N512,R163,00-00-00,14 ST,01/15/2021,11:00:00,2
56629,B028,R136,01-00-01,SHEEPSHEAD BAY,01/08/2021,04:00:00,2
0,A002,R051,02-00-00,59 ST,01/02/2021,03:00:00,1
419264,R138,R293,00-02-05,34 ST-PENN STA,01/02/2021,17:00:00,1
419258,R138,R293,00-02-04,34 ST-PENN STA,01/22/2021,17:00:00,1


In [9]:
#double checking on 01/15/21, seems to be double entries which does not affect the regular entry
mask = ((df['C/A'] == 'N512') &
       (df['UNIT'] == 'R163') &
       (df['SCP'] == '00-00-00') &
       (df['STATION'] == '14 ST') &
       (df['DATE'] == "01/15/2021"))
df[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
102924,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,03:00:00,REGULAR,3755758,8020534,2021-01-15 03:00:00,Friday
102925,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,07:00:00,REGULAR,3755762,8020550,2021-01-15 07:00:00,Friday
102926,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,11:00:00,REGULAR,3755777,8020627,2021-01-15 11:00:00,Friday
102927,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,11:00:00,RECOVR AUD,4,0,2021-01-15 11:00:00,Friday
102928,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,15:00:00,REGULAR,3755807,8020684,2021-01-15 15:00:00,Friday
102929,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,19:00:00,REGULAR,3755860,8020748,2021-01-15 19:00:00,Friday
102930,N512,R163,00-00-00,14 ST,FLM123,IND,01/15/2021,23:00:00,REGULAR,3755878,8020763,2021-01-15 23:00:00,Friday


In [10]:
df.DESC.value_counts()

REGULAR       626747
RECOVR AUD      2145
Name: DESC, dtype: int64

In [11]:
# Appears there are extra entries from RECOVR AUD, which are not relevant 
# for the analysis
# Remove non-REGULAR values from 'DESC'
df = df.drop(df.loc[df.DESC != 'REGULAR'].index)

In [12]:
#Sanity Check again
(df
.groupby(['C/A','UNIT','SCP','STATION','DATE','TIME'])
.ENTRIES.count()
.reset_index()
.sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,01/02/2021,03:00:00,1
414977,R138,R293,00-02-01,34 ST-PENN STA,01/16/2021,13:00:00,1
414979,R138,R293,00-02-01,34 ST-PENN STA,01/16/2021,21:00:00,1
414980,R138,R293,00-02-01,34 ST-PENN STA,01/17/2021,01:00:00,1
414981,R138,R293,00-02-01,34 ST-PENN STA,01/17/2021,05:00:00,1


In [13]:
#Drop rows for PATH, RIT, and SRT 
#since only checking on subway data
df.drop(df.loc[df['DIVISION'].isin(['PTH','RIT','SRT'])].index, 
        inplace=True)
df.shape

(572172, 13)

In [14]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,03:00:00,REGULAR,7511653,2558871,2021-01-02 03:00:00,Saturday
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,07:00:00,REGULAR,7511655,2558877,2021-01-02 07:00:00,Saturday
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,11:00:00,REGULAR,7511677,2558930,2021-01-02 11:00:00,Saturday
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,15:00:00,REGULAR,7511766,2558968,2021-01-02 15:00:00,Saturday
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/02/2021,19:00:00,REGULAR,7511912,2558999,2021-01-02 19:00:00,Saturday


In [15]:
# Drop unnecessary columns
df = df.drop(['LINENAME', 'DIVISION', 'DESC', 'DATE', 'TIME'], axis=1)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
0,A002,R051,02-00-00,59 ST,7511653,2558871,2021-01-02 03:00:00,Saturday
1,A002,R051,02-00-00,59 ST,7511655,2558877,2021-01-02 07:00:00,Saturday
2,A002,R051,02-00-00,59 ST,7511677,2558930,2021-01-02 11:00:00,Saturday
3,A002,R051,02-00-00,59 ST,7511766,2558968,2021-01-02 15:00:00,Saturday
4,A002,R051,02-00-00,59 ST,7511912,2558999,2021-01-02 19:00:00,Saturday


In [16]:
# Detecting any missing values
df.isna().sum()

C/A            0
UNIT           0
SCP            0
STATION        0
ENTRIES        0
EXITS          0
DATETIME       0
DAY_OF_WEEK    0
dtype: int64

In [17]:
# Checking on first formatted dataset
df.sort_values(['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME'],
              inplace=True, ascending=True)
print(df.shape)
df.head()

(572172, 8)


Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
0,A002,R051,02-00-00,59 ST,7511653,2558871,2021-01-02 03:00:00,Saturday
1,A002,R051,02-00-00,59 ST,7511655,2558877,2021-01-02 07:00:00,Saturday
2,A002,R051,02-00-00,59 ST,7511677,2558930,2021-01-02 11:00:00,Saturday
3,A002,R051,02-00-00,59 ST,7511766,2558968,2021-01-02 15:00:00,Saturday
4,A002,R051,02-00-00,59 ST,7511912,2558999,2021-01-02 19:00:00,Saturday


**2. Entry and Exit counts**
   - some counters resets or count backwards

In [18]:
#Create previous datetime, entries and entries columns
df[['PREV_DATETIME', "PREV_ENTRIES", "PREV_EXITS"]] = (df
                                                .groupby(['C/A', 'UNIT', 'SCP', 'STATION'])\
                                                       ['DATETIME','ENTRIES','EXITS']
                                                .apply(lambda grp: grp.shift(1)))

  df[['PREV_DATETIME', "PREV_ENTRIES", "PREV_EXITS"]] = (df


In [19]:
# Drop the rows for the earliest times in the df
df.dropna(subset=["PREV_DATETIME"], axis=0, inplace=True)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
1,A002,R051,02-00-00,59 ST,7511655,2558877,2021-01-02 07:00:00,Saturday,2021-01-02 03:00:00,7511653.0,2558871.0
2,A002,R051,02-00-00,59 ST,7511677,2558930,2021-01-02 11:00:00,Saturday,2021-01-02 07:00:00,7511655.0,2558877.0
3,A002,R051,02-00-00,59 ST,7511766,2558968,2021-01-02 15:00:00,Saturday,2021-01-02 11:00:00,7511677.0,2558930.0
4,A002,R051,02-00-00,59 ST,7511912,2558999,2021-01-02 19:00:00,Saturday,2021-01-02 15:00:00,7511766.0,2558968.0
5,A002,R051,02-00-00,59 ST,7511996,2559007,2021-01-02 23:00:00,Saturday,2021-01-02 19:00:00,7511912.0,2558999.0


In [20]:
# problem with counter being in reverse
df[df["ENTRIES"] < df["PREV_ENTRIES"]].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
1639,A011,R080,01-03-00,57 ST-7 AV,885629549,489699476,2021-01-02 07:00:00,Saturday,2021-01-02 03:00:00,885629550.0,489699496.0
1640,A011,R080,01-03-00,57 ST-7 AV,885629539,489699340,2021-01-02 11:00:00,Saturday,2021-01-02 07:00:00,885629549.0,489699476.0
1641,A011,R080,01-03-00,57 ST-7 AV,885629523,489699138,2021-01-02 15:00:00,Saturday,2021-01-02 11:00:00,885629539.0,489699340.0
1642,A011,R080,01-03-00,57 ST-7 AV,885629468,489698994,2021-01-02 19:00:00,Saturday,2021-01-02 15:00:00,885629523.0,489699138.0
1643,A011,R080,01-03-00,57 ST-7 AV,885629436,489698939,2021-01-02 23:00:00,Saturday,2021-01-02 19:00:00,885629468.0,489698994.0


In [21]:
# Let's see how many stations have this problem
(df[df["ENTRIES"] < df["PREV_ENTRIES"]]
    .groupby(["C/A", "UNIT", "SCP", "STATION"])
    .size())

C/A   UNIT  SCP       STATION        
A011  R080  01-03-00  57 ST-7 AV         125
A031  R083  00-00-01  23 ST                1
A042  R086  01-00-04  PRINCE ST            2
A043  R462  00-03-03  CANAL ST             1
A049  R088  02-05-00  CORTLANDT ST         7
                                        ... 
R401  R445  00-00-00  3 AV 138 ST        122
R412  R146  00-00-00  HUNTS POINT AV      70
R523  R147  00-00-04  61 ST WOODSIDE     111
R622  R123  00-00-00  FRANKLIN AV        123
R730  R431  00-00-04  EASTCHSTER/DYRE     34
Length: 71, dtype: int64

In [22]:
# Functions for entry and exit counts
def get_entry_counts(row, max_counter):
    counter = abs(row['ENTRIES'] - row['PREV_ENTRIES'])
    # Set anomaly values due to reset of counters to the uniform NaN values
    if counter > max_counter:
        counter = np.nan
    return counter

def get_exit_counts(row, max_counter):
    counter = abs(row['EXITS'] - row['PREV_EXITS'])
    # Set anomaly values due to reset of counters to the uniform NaN values
    if counter > max_counter:
        counter = np.nan
    return counter

# Set max_counter = 86400, assuming only 1/person/sec/turnstile at a time
df['entry_count'] = df.apply(get_entry_counts, axis=1, max_counter=14400)
df['exit_count'] = df.apply(get_exit_counts, axis=1, max_counter=14400)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count
1,A002,R051,02-00-00,59 ST,7511655,2558877,2021-01-02 07:00:00,Saturday,2021-01-02 03:00:00,7511653.0,2558871.0,2.0,6.0
2,A002,R051,02-00-00,59 ST,7511677,2558930,2021-01-02 11:00:00,Saturday,2021-01-02 07:00:00,7511655.0,2558877.0,22.0,53.0
3,A002,R051,02-00-00,59 ST,7511766,2558968,2021-01-02 15:00:00,Saturday,2021-01-02 11:00:00,7511677.0,2558930.0,89.0,38.0
4,A002,R051,02-00-00,59 ST,7511912,2558999,2021-01-02 19:00:00,Saturday,2021-01-02 15:00:00,7511766.0,2558968.0,146.0,31.0
5,A002,R051,02-00-00,59 ST,7511996,2559007,2021-01-02 23:00:00,Saturday,2021-01-02 19:00:00,7511912.0,2558999.0,84.0,8.0


In [23]:
# Checking NaN values created by previous function
# NaN values are counters that are negative or exceeds threshold
print("NaN in Entry: ", df.entry_count.isna().sum())
print("NaN in Exit: ", df.entry_count.isna().sum())

NaN in Entry:  26
NaN in Exit:  26


In [24]:
df[df.entry_count.isna() == 1].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count
4717,A031,R083,00-00-01,23 ST,16,12,2021-01-17 11:00:00,Sunday,2021-01-16 03:00:00,5102094.0,1719658.0,,
7306,A043,R462,00-03-03,CANAL ST,2,8,2021-01-10 12:00:00,Sunday,2021-01-10 08:00:00,196944.0,155111.0,,
10857,A058,R001,01-00-01,WHITEHALL S-FRY,32,25,2021-01-11 12:00:00,Monday,2021-01-11 08:00:00,2172459.0,1717560.0,,
14970,B015,R098,01-03-00,CHURCH AV,196632,8,2021-01-22 11:00:00,Friday,2021-01-21 19:00:00,13806.0,14983.0,,
29663,E011,R371,00-06-01,79 ST,262158,19,2021-01-07 15:00:00,Thursday,2021-01-07 11:00:00,526761.0,2570.0,,2551.0


In [25]:
# Fill NaN values with the mean of values before and after NaN value
# AKA take the mean of the daily count

entry_list = list(df['entry_count'])
ind = 0
for i in entry_list:
    if np.isnan(i) == 1:
        entry_list[ind] = np.nanmean([entry_list[ind-2],entry_list[ind-1],entry_list[ind+1],entry_list[ind+2]])
    ind += 1
    
df['ENTRY_DIFF'] = entry_list

In [26]:
df['ENTRY_DIFF'].isna().sum()

0

In [27]:
# Cleaning NaN values in exit_count
# For each NaN values, replace it with the mean of values 
                                        #before and after the NaN value
exit_list = list(df['exit_count'])
ind = 0
for i in exit_list:
    if np.isnan(i) == 1:
        exit_list[ind] = np.nanmean([exit_list[ind-2],exit_list[ind-1],exit_list[ind+1],exit_list[ind+2]])
    ind += 1
    
df['EXIT_DIFF'] = exit_list

In [28]:
df['EXIT_DIFF'].isna().sum()

0

In [29]:
# Get total traffic count
df['TOTAL_TRAFFIC'] = df['ENTRY_DIFF'] + df['EXIT_DIFF']
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
1,A002,R051,02-00-00,59 ST,7511655,2558877,2021-01-02 07:00:00,Saturday,2021-01-02 03:00:00,7511653.0,2558871.0,2.0,6.0,2.0,6.0,8.0
2,A002,R051,02-00-00,59 ST,7511677,2558930,2021-01-02 11:00:00,Saturday,2021-01-02 07:00:00,7511655.0,2558877.0,22.0,53.0,22.0,53.0,75.0
3,A002,R051,02-00-00,59 ST,7511766,2558968,2021-01-02 15:00:00,Saturday,2021-01-02 11:00:00,7511677.0,2558930.0,89.0,38.0,89.0,38.0,127.0
4,A002,R051,02-00-00,59 ST,7511912,2558999,2021-01-02 19:00:00,Saturday,2021-01-02 15:00:00,7511766.0,2558968.0,146.0,31.0,146.0,31.0,177.0
5,A002,R051,02-00-00,59 ST,7511996,2559007,2021-01-02 23:00:00,Saturday,2021-01-02 19:00:00,7511912.0,2558999.0,84.0,8.0,84.0,8.0,92.0


In [30]:
#Sanity Check
(df
 .groupby(['C/A','UNIT','SCP','STATION','DATETIME','DAY_OF_WEEK'])
 .sum()
 .reset_index())

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,DAY_OF_WEEK,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-01-02 07:00:00,Saturday,7511655,2558877,7.511653e+06,2.558871e+06,2.0,6.0,2.0,6.0,8.0
1,A002,R051,02-00-00,59 ST,2021-01-02 11:00:00,Saturday,7511677,2558930,7.511655e+06,2.558877e+06,22.0,53.0,22.0,53.0,75.0
2,A002,R051,02-00-00,59 ST,2021-01-02 15:00:00,Saturday,7511766,2558968,7.511677e+06,2.558930e+06,89.0,38.0,89.0,38.0,127.0
3,A002,R051,02-00-00,59 ST,2021-01-02 19:00:00,Saturday,7511912,2558999,7.511766e+06,2.558968e+06,146.0,31.0,146.0,31.0,177.0
4,A002,R051,02-00-00,59 ST,2021-01-02 23:00:00,Saturday,7511996,2559007,7.511912e+06,2.558999e+06,84.0,8.0,84.0,8.0,92.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567552,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 04:00:00,Friday,1559826404,1728238157,1.559826e+09,1.728238e+09,0.0,0.0,0.0,0.0,0.0
567553,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 08:00:00,Friday,1559826356,1728238172,1.559826e+09,1.728238e+09,48.0,15.0,48.0,15.0,63.0
567554,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 12:00:00,Friday,1559826324,1728238186,1.559826e+09,1.728238e+09,32.0,14.0,32.0,14.0,46.0
567555,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 16:00:00,Friday,1559826293,1728238195,1.559826e+09,1.728238e+09,31.0,9.0,31.0,9.0,40.0


**3. Inconsistent time intervals**
   - unifying into same time intervals (4-hours)

In [31]:
# Check TIME intervals
df['TIME'] = df['DATETIME'].apply(lambda x: x.time())
print(df.TIME.unique())
print(df.TIME.value_counts())

[datetime.time(7, 0) datetime.time(11, 0) datetime.time(15, 0)
 datetime.time(19, 0) datetime.time(23, 0) datetime.time(3, 0)
 datetime.time(5, 0) datetime.time(9, 0) datetime.time(13, 0)
 datetime.time(17, 0) datetime.time(21, 0) datetime.time(1, 0)
 datetime.time(17, 14, 50) datetime.time(17, 16, 10)
 datetime.time(17, 17, 31) datetime.time(17, 18, 51)
 datetime.time(17, 20, 12) datetime.time(17, 21, 33)
 datetime.time(17, 22, 53) datetime.time(17, 24, 13)
 datetime.time(17, 25, 34) datetime.time(17, 26, 55)
 datetime.time(17, 28, 15) datetime.time(17, 29, 36)
 datetime.time(17, 30, 56) datetime.time(17, 32, 17)
 datetime.time(17, 33, 38) datetime.time(4, 0) datetime.time(8, 0)
 datetime.time(12, 0) datetime.time(16, 0) datetime.time(20, 0)
 datetime.time(0, 0) datetime.time(16, 8, 52) datetime.time(16, 5, 55)
 datetime.time(20, 37, 45) datetime.time(20, 38, 55)
 datetime.time(8, 4, 29) datetime.time(8, 5, 39) datetime.time(6, 0)
 datetime.time(10, 0) datetime.time(14, 0) datetime.ti

In [32]:
# Turn all time intervals to 4-hour frequency
df = df.groupby(['C/A', 'UNIT', 'SCP', 'STATION',
                  pd.Grouper(key='DATETIME', freq='4H'),
                  'DAY_OF_WEEK', 'TIME']).sum().reset_index()
# Double checking on time intervals
df['TIME'] = df['DATETIME'].apply(lambda x: x.time())
print(df.TIME.unique())

[datetime.time(4, 0) datetime.time(8, 0) datetime.time(12, 0)
 datetime.time(16, 0) datetime.time(20, 0) datetime.time(0, 0)]


In [33]:
# Syncing Day_of_week column to the new datetime
df['DAY_OF_WEEK'] = (df['DATETIME']
                    .apply(lambda x : datetime.strftime(x, "%A")))

In [34]:
#Sanity Check
(df
 .groupby(['C/A','UNIT','SCP','STATION','DATETIME','DAY_OF_WEEK'])
 .sum()
 .reset_index())

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,DAY_OF_WEEK,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-01-02 04:00:00,Saturday,7511655,2558877,7.511653e+06,2.558871e+06,2.0,6.0,2.0,6.0,8.0
1,A002,R051,02-00-00,59 ST,2021-01-02 08:00:00,Saturday,7511677,2558930,7.511655e+06,2.558877e+06,22.0,53.0,22.0,53.0,75.0
2,A002,R051,02-00-00,59 ST,2021-01-02 12:00:00,Saturday,7511766,2558968,7.511677e+06,2.558930e+06,89.0,38.0,89.0,38.0,127.0
3,A002,R051,02-00-00,59 ST,2021-01-02 16:00:00,Saturday,7511912,2558999,7.511766e+06,2.558968e+06,146.0,31.0,146.0,31.0,177.0
4,A002,R051,02-00-00,59 ST,2021-01-02 20:00:00,Saturday,7511996,2559007,7.511912e+06,2.558999e+06,84.0,8.0,84.0,8.0,92.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562958,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 04:00:00,Friday,1559826404,1728238157,1.559826e+09,1.728238e+09,0.0,0.0,0.0,0.0,0.0
562959,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 08:00:00,Friday,1559826356,1728238172,1.559826e+09,1.728238e+09,48.0,15.0,48.0,15.0,63.0
562960,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 12:00:00,Friday,1559826324,1728238186,1.559826e+09,1.728238e+09,32.0,14.0,32.0,14.0,46.0
562961,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-01-22 16:00:00,Friday,1559826293,1728238195,1.559826e+09,1.728238e+09,31.0,9.0,31.0,9.0,40.0


In [35]:
# Final clean up
# Drop unnecessary columns
turnstile_df = df.drop(['C/A', 'UNIT', 'SCP',
                         'ENTRIES', 'EXITS', 'TIME',
                         'PREV_ENTRIES', 'PREV_EXITS', 
                         'entry_count', 'exit_count'], axis=1)
turnstile_df.head()

Unnamed: 0,STATION,DATETIME,DAY_OF_WEEK,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,59 ST,2021-01-02 04:00:00,Saturday,2.0,6.0,8.0
1,59 ST,2021-01-02 08:00:00,Saturday,22.0,53.0,75.0
2,59 ST,2021-01-02 12:00:00,Saturday,89.0,38.0,127.0
3,59 ST,2021-01-02 16:00:00,Saturday,146.0,31.0,177.0
4,59 ST,2021-01-02 20:00:00,Saturday,84.0,8.0,92.0


In [36]:
#Save file as csv
#turnstile_df.to_csv('./data/cleaned_mta_df.csv')