In [62]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine

import geopy

import matplotlib.pyplot as plt
import seaborn as sns

# Configure image format to be ‘svg’
%config InlineBackend.figure_format = 'svg'

# allow visuals to render within notebook
%matplotlib inline 

In [63]:
# Make some room to see stuff (i.e. drop display limits on Pandas rows & cols - be careful w/ big df's!)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 1. Import MTA Data From SQLite Database

In [64]:
engine = create_engine('sqlite:///../_PROJECT/mta_data.db')

engine.table_names()

  engine.table_names()


['mta_data']

In [65]:
# Check for any NULL values

pd.read_sql('''
            SELECT *
            FROM mta_data
            WHERE unit IS NULL 
               OR scp IS NULL
               OR station IS NULL
               OR linename IS NULL
               OR date IS NULL
               OR time IS NULL
               OR desc IS NULL
               OR entries IS NULL
               OR exits IS NULL;
            ''', con=engine)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS


In [66]:
# Bring in just the data for May-June 2019 (order by ascending 'DATE'; don't bother importing 'EXITS' col)

# df_2019 = pd.read_sql('''
#                       SELECT (STATION || ' - ' || LINENAME || ' - ' || DIVISION) AS STATION_ID,
#                          STATION AS STATION_NAME, [C/A], UNIT, SCP,  DATE, TIME, DESC, ENTRIES
#                       FROM mta_data
#                       WHERE DATE = '04/30/2019' OR DATE LIKE '05%19' OR DATE LIKE '06%19'
#                       ORDER BY DATE ASC;
#                       ''', con=engine)

df_2019 = pd.read_sql('''
                      SELECT *
                      FROM mta_data
                      WHERE DATE = '04/30/2019' OR DATE LIKE '05%19' OR DATE LIKE '06%19'
                      ORDER BY DATE ASC;
                      ''', con=engine)

In [67]:
# Add a 'DATETIME' column (and drop 'DATE' & 'TIME' - can get either of these from 'DATETIME' if needed)

df_2019['DATETIME'] = pd.to_datetime(df_2019['DATE'] + ' ' + df_2019['TIME'])

# Make 'DATE' column a proper date format
df_2019['DATE'] = df_2019['DATETIME'].dt.date

df_2019.drop(columns=['TIME', 'EXITS'], inplace=True)

In [68]:
print(df_2019.shape)
display(df_2019.head())
display(df_2019.tail())

(3639340, 10)


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038242,2019-04-30 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038261,2019-04-30 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038308,2019-04-30 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038474,2019-04-30 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038775,2019-04-30 16:00:00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME
3639335,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-06-30,REGULAR,5554,2019-06-30 05:00:00
3639336,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-06-30,REGULAR,5554,2019-06-30 09:00:00
3639337,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-06-30,REGULAR,5554,2019-06-30 13:00:00
3639338,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-06-30,REGULAR,5554,2019-06-30 17:00:00
3639339,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-06-30,REGULAR,5554,2019-06-30 21:00:00


In [69]:
# Bring in just the data for May-June 2021 (order by ascending 'DATE'; don't bother importing 'EXITS' col)

# df_2021 = pd.read_sql('''
#                       SELECT (STATION || ' - ' || LINENAME || ' - ' || DIVISION) AS STATION_ID,
#                          STATION AS STATION_NAME, [C/A], UNIT, SCP,  DATE, TIME, DESC, ENTRIES
#                       FROM mta_data
#                       WHERE DATE = '04/30/2021' OR DATE LIKE '05%21' OR DATE LIKE '06%21'
#                       ORDER BY DATE ASC;
#                       ''', con=engine)

df_2021 = pd.read_sql('''
                      SELECT *
                      FROM mta_data
                      WHERE DATE = '04/30/2021' OR DATE LIKE '05%21' OR DATE LIKE '06%21'
                      ORDER BY DATE ASC;
                      ''', con=engine)

In [70]:
# Add a 'DATETIME' column (and drop 'DATE' & 'TIME' - can get either of these from 'DATETIME' if needed)

df_2021['DATETIME'] = pd.to_datetime(df_2021['DATE'] + ' ' + df_2021['TIME'])

# Make 'DATE' column a proper date format
df_2021['DATE'] = df_2021['DATETIME'].dt.date

df_2021.drop(columns=['TIME', 'EXITS'], inplace=True)

In [71]:
print(df_2021.shape)
display(df_2021.head())
display(df_2021.tail())

(3706606, 10)


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564048,2021-04-30 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564059,2021-04-30 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564073,2021-04-30 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564118,2021-04-30 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564286,2021-04-30 16:00:00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME
3706601,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-06-30,REGULAR,5554,2021-06-30 05:00:00
3706602,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-06-30,REGULAR,5554,2021-06-30 09:00:00
3706603,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-06-30,REGULAR,5554,2021-06-30 13:00:00
3706604,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-06-30,REGULAR,5554,2021-06-30 17:00:00
3706605,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2021-06-30,REGULAR,5554,2021-06-30 21:00:00


# 2. Basic Cleaning

### 2a) Check for Duplicates
#### A single turnstile somwhere in the system is uniquely identified by C/A (Booth) + UNIT (Station) + SCP (Turnstile)
#### Add a new column 'ID' that creates a unique identifier for grouping on these three

In [72]:
df_2019['ID'] = df_2019['C/A'] + ' - ' + df_2019['UNIT'] + ' - ' + df_2019['SCP']
df_2019.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038242,2019-04-30 00:00:00,A002 - R051 - 02-00-00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038261,2019-04-30 04:00:00,A002 - R051 - 02-00-00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038308,2019-04-30 08:00:00,A002 - R051 - 02-00-00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038474,2019-04-30 12:00:00,A002 - R051 - 02-00-00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-30,REGULAR,7038775,2019-04-30 16:00:00,A002 - R051 - 02-00-00


In [73]:
df_2021['ID'] = df_2021['C/A'] + ' - ' + df_2021['UNIT'] + ' - ' + df_2021['SCP']
df_2021.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564048,2021-04-30 00:00:00,A002 - R051 - 02-00-00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564059,2021-04-30 04:00:00,A002 - R051 - 02-00-00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564073,2021-04-30 08:00:00,A002 - R051 - 02-00-00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564118,2021-04-30 12:00:00,A002 - R051 - 02-00-00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2021-04-30,REGULAR,7564286,2021-04-30 16:00:00,A002 - R051 - 02-00-00


#### Now check for duplicates (Start with df_2019):

In [74]:
check = df_2019.groupby(['ID', 'DATETIME'])['ENTRIES'] \
                        .count().sort_values(ascending=False)
check.reset_index().head()

Unnamed: 0,ID,DATETIME,ENTRIES
0,R249 - R179 - 01-05-00,2019-06-27 00:00:00,4
1,N045 - R187 - 01-06-00,2019-06-02 17:00:00,4
2,R174 - R034 - 00-00-03,2019-06-08 00:00:00,4
3,N045 - R187 - 01-00-00,2019-06-02 17:00:00,4
4,R174 - R034 - 00-00-02,2019-06-08 00:00:00,4


In [75]:
for i in range(0, 6):
    display(df_2019[(df_2019['ID'] == check.reset_index().iloc[i, 0:1].values[0]) & 
                    (df_2019['DATETIME'] == check.reset_index().iloc[i, 1:2].values[0])])

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
3427686,R249,R179,01-05-00,86 ST,456,IRT,2019-06-27,REGULAR,6,2019-06-27,R249 - R179 - 01-05-00
3427687,R249,R179,01-05-00,86 ST,456,IRT,2019-06-27,RECOVR AUD,11,2019-06-27,R249 - R179 - 01-05-00
3457150,R249,R179,01-05-00,86 ST,456,IRT,2019-06-27,REGULAR,6,2019-06-27,R249 - R179 - 01-05-00
3457151,R249,R179,01-05-00,86 ST,456,IRT,2019-06-27,RECOVR AUD,11,2019-06-27,R249 - R179 - 01-05-00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
1940800,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-02,REGULAR,2254050,2019-06-02 17:00:00,N045 - R187 - 01-06-00
1940801,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-02,RECOVR AUD,2254049,2019-06-02 17:00:00,N045 - R187 - 01-06-00
1969768,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-02,REGULAR,2254050,2019-06-02 17:00:00,N045 - R187 - 01-06-00
1969769,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-02,RECOVR AUD,2254049,2019-06-02 17:00:00,N045 - R187 - 01-06-00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
2306436,R174,R034,00-00-03,125 ST,1,IRT,2019-06-08,REGULAR,3828801,2019-06-08,R174 - R034 - 00-00-03
2306437,R174,R034,00-00-03,125 ST,1,IRT,2019-06-08,RECOVR AUD,3828804,2019-06-08,R174 - R034 - 00-00-03
2335466,R174,R034,00-00-03,125 ST,1,IRT,2019-06-08,REGULAR,3828801,2019-06-08,R174 - R034 - 00-00-03
2335467,R174,R034,00-00-03,125 ST,1,IRT,2019-06-08,RECOVR AUD,3828804,2019-06-08,R174 - R034 - 00-00-03


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
1940782,N045,R187,01-00-00,81 ST-MUSEUM,BC,IND,2019-06-02,REGULAR,5254570,2019-06-02 17:00:00,N045 - R187 - 01-00-00
1940783,N045,R187,01-00-00,81 ST-MUSEUM,BC,IND,2019-06-02,RECOVR AUD,5254568,2019-06-02 17:00:00,N045 - R187 - 01-00-00
1969750,N045,R187,01-00-00,81 ST-MUSEUM,BC,IND,2019-06-02,REGULAR,5254570,2019-06-02 17:00:00,N045 - R187 - 01-00-00
1969751,N045,R187,01-00-00,81 ST-MUSEUM,BC,IND,2019-06-02,RECOVR AUD,5254568,2019-06-02 17:00:00,N045 - R187 - 01-00-00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
2306430,R174,R034,00-00-02,125 ST,1,IRT,2019-06-08,REGULAR,6091547,2019-06-08,R174 - R034 - 00-00-02
2306431,R174,R034,00-00-02,125 ST,1,IRT,2019-06-08,RECOVR AUD,6091548,2019-06-08,R174 - R034 - 00-00-02
2335460,R174,R034,00-00-02,125 ST,1,IRT,2019-06-08,REGULAR,6091547,2019-06-08,R174 - R034 - 00-00-02
2335461,R174,R034,00-00-02,125 ST,1,IRT,2019-06-08,RECOVR AUD,6091548,2019-06-08,R174 - R034 - 00-00-02


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
2467844,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-11,REGULAR,2258033,2019-06-11 21:00:00,N045 - R187 - 01-06-00
2467845,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-11,RECOVR AUD,2258031,2019-06-11 21:00:00,N045 - R187 - 01-06-00
2496957,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-11,REGULAR,2258033,2019-06-11 21:00:00,N045 - R187 - 01-06-00
2496958,N045,R187,01-06-00,81 ST-MUSEUM,BC,IND,2019-06-11,RECOVR AUD,2258031,2019-06-11 21:00:00,N045 - R187 - 01-06-00


#### Looks like accidental data duplication and some RECOV AUDs virtually identical to REGULARs; will just drop all the duplicates

In [76]:
print(df_2019.shape)

df_2019 = df_2019.sort_values(by=['ID', 'DATETIME'], 
                              ascending=False) \
                             .drop_duplicates(subset=['ID', 'DATETIME'])

df_2019.reset_index(drop=True, inplace=True)
print(df_2019.shape)

(3639340, 11)
(1819646, 11)


In [77]:
# Re-check for duplicates

check = df_2019.groupby(['ID', 'DATETIME'])['ENTRIES'] \
                        .count().sort_values(ascending=False)
check.reset_index().head()

Unnamed: 0,ID,DATETIME,ENTRIES
0,A002 - R051 - 02-00-00,2019-04-30 00:00:00,1
1,R142 - R293 - 01-00-01,2019-05-12 18:00:00,1
2,R142 - R293 - 01-00-01,2019-05-14 06:00:00,1
3,R142 - R293 - 01-00-01,2019-05-14 02:00:00,1
4,R142 - R293 - 01-00-01,2019-05-13 22:00:00,1


#### Now deal with duplicates in df_2021:

In [78]:
check = df_2021.groupby(['ID', 'DATETIME'])['ENTRIES'] \
                         .count().sort_values(ascending=False)
check.reset_index().head()

Unnamed: 0,ID,DATETIME,ENTRIES
0,B024 - R211 - 00-05-00,2021-05-02 12:00:00,4
1,B024 - R211 - 00-05-00,2021-05-05 12:00:00,4
2,N329 - R201 - 00-03-02,2021-05-08 05:00:00,4
3,N329 - R201 - 00-03-02,2021-05-08 01:00:00,4
4,N335 - R158 - 01-00-03,2021-06-20 18:30:00,4


In [79]:
for i in range(0, 6):
    display(df_2021[(df_2021['ID'] == check.reset_index().iloc[i, 0:1].values[0]) & 
                    (df_2021['DATETIME'] == check.reset_index().iloc[i, 1:2].values[0])])

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
121785,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-02,REGULAR,262152,2021-05-02 12:00:00,B024 - R211 - 00-05-00
121786,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-02,RECOVR AUD,13,2021-05-02 12:00:00,B024 - R211 - 00-05-00
151574,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-02,REGULAR,262152,2021-05-02 12:00:00,B024 - R211 - 00-05-00
151575,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-02,RECOVR AUD,13,2021-05-02 12:00:00,B024 - R211 - 00-05-00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
301051,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-05,REGULAR,262152,2021-05-05 12:00:00,B024 - R211 - 00-05-00
301052,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-05,RECOVR AUD,21,2021-05-05 12:00:00,B024 - R211 - 00-05-00
330870,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-05,REGULAR,262152,2021-05-05 12:00:00,B024 - R211 - 00-05-00
330871,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-05,RECOVR AUD,21,2021-05-05 12:00:00,B024 - R211 - 00-05-00


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
490387,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,REGULAR,4825280,2021-05-08 05:00:00,N329 - R201 - 00-03-02
490388,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,RECOVR AUD,12538860,2021-05-08 05:00:00,N329 - R201 - 00-03-02
520213,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,REGULAR,4825280,2021-05-08 05:00:00,N329 - R201 - 00-03-02
520214,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,RECOVR AUD,12538860,2021-05-08 05:00:00,N329 - R201 - 00-03-02


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
490385,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,REGULAR,4825276,2021-05-08 01:00:00,N329 - R201 - 00-03-02
490386,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,RECOVR AUD,12538859,2021-05-08 01:00:00,N329 - R201 - 00-03-02
520211,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,REGULAR,4825276,2021-05-08 01:00:00,N329 - R201 - 00-03-02
520212,N329,R201,00-03-02,WOODHAVEN BLVD,MR,IND,2021-05-08,RECOVR AUD,12538859,2021-05-08 01:00:00,N329 - R201 - 00-03-02


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
3061698,N335,R158,01-00-03,KEW GARDENS,EF,IND,2021-06-20,REGULAR,5201958,2021-06-20 18:30:00,N335 - R158 - 01-00-03
3061699,N335,R158,01-00-03,KEW GARDENS,EF,IND,2021-06-20,RECOVR AUD,5201960,2021-06-20 18:30:00,N335 - R158 - 01-00-03
3091640,N335,R158,01-00-03,KEW GARDENS,EF,IND,2021-06-20,REGULAR,5201958,2021-06-20 18:30:00,N335 - R158 - 01-00-03
3091641,N335,R158,01-00-03,KEW GARDENS,EF,IND,2021-06-20,RECOVR AUD,5201960,2021-06-20 18:30:00,N335 - R158 - 01-00-03


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,DESC,ENTRIES,DATETIME,ID
181380,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-03,REGULAR,262152,2021-05-03 20:00:00,B024 - R211 - 00-05-00
181381,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-03,RECOVR AUD,16,2021-05-03 20:00:00,B024 - R211 - 00-05-00
211194,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-03,REGULAR,262152,2021-05-03 20:00:00,B024 - R211 - 00-05-00
211195,B024,R211,00-05-00,KINGS HWY,BQ,BMT,2021-05-03,RECOVR AUD,16,2021-05-03 20:00:00,B024 - R211 - 00-05-00


#### Looks like a similar situation as with df_2019; some RECOVR AUDs look like trivial tests of some sort; will just drop the duplicates

In [80]:
print(df_2021.shape)

df_2021 = df_2021.sort_values(by=['ID', 'DATETIME'], 
                              ascending=False) \
                             .drop_duplicates(subset=['ID', 'DATETIME'])

df_2021.reset_index(drop=True, inplace=True)
print(df_2021.shape)

(3706606, 11)
(1853257, 11)


In [81]:
# Re-check for duplicates

check = df_2021.groupby(['ID', 'DATETIME'])['ENTRIES'] \
                         .count().sort_values(ascending=False)
check.reset_index().head()

Unnamed: 0,ID,DATETIME,ENTRIES
0,A002 - R051 - 02-00-00,2021-04-30 00:00:00,1
1,R138 - R293 - 00-03-04,2021-05-08 22:00:00,1
2,R138 - R293 - 00-03-04,2021-05-13 06:00:00,1
3,R138 - R293 - 00-03-04,2021-05-13 02:00:00,1
4,R138 - R293 - 00-03-04,2021-05-12 22:00:00,1


### 2b) Find common set of Stations in 2019 and 2021 datasets (Station roughly equates to the 'UNIT' column)

In [82]:
print(df_2019['UNIT'].unique().shape)
print(df_2021['UNIT'].unique().shape)

print(np.setdiff1d(df_2019['UNIT'].unique(), df_2021['UNIT'].unique()))
print(np.setdiff1d(df_2021['UNIT'].unique(), df_2019['UNIT'].unique()))

(469,)
(469,)
['R260']
['R094']


#### Each year's data has one station not present in the other - will find the common set between the two years

In [83]:
common_units = np.intersect1d(df_2019['UNIT'].unique(), df_2021['UNIT'].unique())
common_units.shape

(468,)

#### Now limit both year's datasets down to these common stations (UNITs)

In [84]:
print(df_2019.shape)
df_2019 = df_2019[df_2019['UNIT'].isin(common_units)]
df_2019.reset_index(drop=True, inplace=True)
print(df_2019.shape)

(1819646, 11)
(1817754, 11)


In [85]:
print(df_2021.shape)
df_2021 = df_2021[df_2021['UNIT'].isin(common_units)]
df_2021.reset_index(drop=True, inplace=True)
print(df_2021.shape)

(1853257, 11)
(1848056, 11)


### 2c) Find NET_DAILY_ENTRIES per unique turnstile ('ID')

#### Assumption 1: Some turnstiles count backwards, accounting for the negatives; just take absolute value of everything
#### Assumption 2: Even considering rush hr., >180 entries/hr ON AVERAGE over the full day (24 hours) is unlikely, so daily entries count > 24*180 = 4320 is probably due to counter rollover or other asynchronous counter reset -> just set these values to 0

In [86]:
daily_per_turnstile_2019 = df_2019.groupby(['ID', 'DATE']).first().reset_index().drop(columns='ENTRIES')

daily_per_turnstile_2019['NET_DAILY_ENTRIES'] = np.abs(df_2019.groupby(['ID', 'DATE'])['ENTRIES'].first() - 
                                                       df_2019.groupby(['ID', 'DATE'])['ENTRIES'].last()) \
                                                        .values

daily_per_turnstile_2019['NET_DAILY_ENTRIES'] = daily_per_turnstile_2019['NET_DAILY_ENTRIES'] \
                                                        .apply(lambda x: 0 if x > 180*24 else x)

daily_per_turnstile_2019.head()

Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES
0,A002 - R051 - 02-00-00,2019-04-30,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-04-30 20:00:00,1278
1,A002 - R051 - 02-00-00,2019-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-01 20:00:00,1346
2,A002 - R051 - 02-00-00,2019-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-02 20:00:00,1327
3,A002 - R051 - 02-00-00,2019-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-03 20:00:00,1326
4,A002 - R051 - 02-00-00,2019-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-04 20:00:00,560


In [87]:
# This looks reasonable:

daily_per_turnstile_2019['NET_DAILY_ENTRIES'].describe()

count    300165.000000
mean        915.348638
std         851.082939
min           0.000000
25%         222.000000
50%         704.000000
75%        1394.000000
max        4320.000000
Name: NET_DAILY_ENTRIES, dtype: float64

#### Repeat this whole process for df_2021

In [88]:
daily_per_turnstile_2021 = df_2021.groupby(['ID', 'DATE']).first().reset_index().drop(columns='ENTRIES')

daily_per_turnstile_2021['NET_DAILY_ENTRIES'] = np.abs(df_2021.groupby(['ID', 'DATE'])['ENTRIES'].first() - 
                                                       df_2021.groupby(['ID', 'DATE'])['ENTRIES'].last()) \
                                                        .values

daily_per_turnstile_2021['NET_DAILY_ENTRIES'] = daily_per_turnstile_2021['NET_DAILY_ENTRIES'] \
                                                        .apply(lambda x: 0 if x > 180*24 else x)

daily_per_turnstile_2021.head()

Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES
0,A002 - R051 - 02-00-00,2021-04-30,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-04-30 20:00:00,469
1,A002 - R051 - 02-00-00,2021-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-01 20:00:00,361
2,A002 - R051 - 02-00-00,2021-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-02 20:00:00,250
3,A002 - R051 - 02-00-00,2021-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-03 20:00:00,472
4,A002 - R051 - 02-00-00,2021-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-04 20:00:00,501


In [89]:
# This looks reasonable:

daily_per_turnstile_2021['NET_DAILY_ENTRIES'].describe()

count    308646.000000
mean        320.566260
std         327.177385
min           0.000000
25%          73.000000
50%         233.000000
75%         466.000000
max        3808.000000
Name: NET_DAILY_ENTRIES, dtype: float64

#### While we're here, let's add a WEEKDAY column (1 for M-F, 0 for Sat or Sun)

In [90]:
# pd.Series.dt.weekday: 0 = Mon, 6 = Sun

daily_per_turnstile_2019['WEEKDAY'] = pd.to_datetime(daily_per_turnstile_2019['DATE']) \
                                                        .dt.weekday.apply(lambda x: 1 if x < 5 else 0)
display(daily_per_turnstile_2019.sample(10))

Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
162848,N607 - R025 - 01-00-00,2019-05-16,N607,R025,01-00-00,JAMAICA CENTER,EJZ,IND,REGULAR,2019-05-16 20:00:00,322,1
173495,PTH06 - R546 - 00-00-04,2019-06-07,PTH06,R546,00-00-04,PAVONIA/NEWPORT,1,PTH,REGULAR,2019-06-07 19:59:53,1179,1
66544,N023 - R332 - 01-00-00,2019-06-27,N023,R332,01-00-00,135 ST,BC,IND,REGULAR,2019-06-27 21:00:00,88,1
58354,J037 - R009 - 00-00-01,2019-05-12,J037,R009,00-00-01,121 ST,JZ,BMT,REGULAR,2019-05-12 20:00:00,0,0
279916,R533 - R055 - 00-03-05,2019-05-16,R533,R055,00-03-05,FLUSHING-MAIN,7,IRT,REGULAR,2019-05-16 20:00:00,2533,1
49159,H026 - R137 - 00-03-02,2019-05-06,H026,R137,00-03-02,MYRTLE-WYCKOFF,LM,BMT,REGULAR,2019-05-06 20:00:00,2492,1
103382,N192 - R336 - 00-00-00,2019-06-02,N192,R336,00-00-00,BEACH 60 ST,A,IND,REGULAR,2019-06-02 20:00:00,347,0
195370,R122 - R290 - 02-05-01,2019-06-10,R122,R290,02-05-01,HOUSTON ST,1,IRT,REGULAR,2019-06-10 21:00:00,0,1
89186,N098 - R028 - 00-00-03,2019-05-28,N098,R028,00-00-03,FULTON ST,2345ACJZ,IND,REGULAR,2019-05-28 23:00:00,1238,1
101829,N184 - R416 - 00-00-00,2019-05-29,N184,R416,00-00-00,BEACH 90 ST,AS,IND,REGULAR,2019-05-29 21:00:00,369,1


In [91]:
daily_per_turnstile_2021['WEEKDAY'] = pd.to_datetime(daily_per_turnstile_2021['DATE']) \
                                                        .dt.weekday.apply(lambda x: 1 if x < 5 else 0)
display(daily_per_turnstile_2021.sample(10))

Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
100701,N120A - R153 - 01-00-03,2021-06-23,N120A,R153,01-00-03,UTICA AV,AC,IND,REGULAR,2021-06-23 21:00:00,318,1
70319,N017 - R331 - 00-00-03,2021-06-03,N017,R331,00-00-03,155 ST,C,IND,REGULAR,2021-06-03 21:00:00,197,1
60031,J016 - R381 - 00-00-01,2021-06-23,J016,R381,00-00-01,HALSEY ST,J,BMT,REGULAR,2021-06-23 21:00:00,545,1
216898,R169 - R168 - 01-00-03,2021-05-07,R169,R168,01-00-03,96 ST,123,IRT,REGULAR,2021-05-07 21:00:00,295,1
305026,R727 - R430 - 00-00-02,2021-06-07,R727,R430,00-00-02,PELHAM PKWY,5,IRT,REGULAR,2021-06-07 21:00:00,424,1
292565,R606 - R225 - 00-00-01,2021-05-28,R606,R225,00-00-01,HOYT ST,23,IRT,REGULAR,2021-05-28 20:00:00,357,1
120959,N307 - R359 - 00-05-01,2021-06-14,N307,R359,00-05-01,COURT SQ-23 ST,EMG,IND,REGULAR,2021-06-14 20:00:00,0,1
188183,PTH19 - R549 - 02-01-03,2021-05-13,PTH19,R549,02-01-03,NEWARK C,1,PTH,REGULAR,2021-05-13 20:41:05,77,1
214814,R161B - R452 - 00-05-03,2021-05-31,R161B,R452,00-05-03,72 ST,123,IRT,REGULAR,2021-05-31 21:00:00,0,1
100696,N120A - R153 - 01-00-03,2021-06-18,N120A,R153,01-00-03,UTICA AV,AC,IND,REGULAR,2021-06-18 21:00:00,334,1


#### Drop the April 30th Data

In [92]:
print(daily_per_turnstile_2019.shape)
daily_per_turnstile_2019 = daily_per_turnstile_2019[ \
                                pd.to_datetime(daily_per_turnstile_2019['DATE']) != pd.to_datetime('2019-04-30')]
daily_per_turnstile_2019.reset_index(drop=True, inplace=True)
print(daily_per_turnstile_2019.shape)
display(daily_per_turnstile_2019.head())
display(daily_per_turnstile_2019.tail())

(300165, 12)
(295338, 12)


Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
0,A002 - R051 - 02-00-00,2019-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-01 20:00:00,1346,1
1,A002 - R051 - 02-00-00,2019-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-02 20:00:00,1327,1
2,A002 - R051 - 02-00-00,2019-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-03 20:00:00,1326,1
3,A002 - R051 - 02-00-00,2019-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-04 20:00:00,560,0
4,A002 - R051 - 02-00-00,2019-05-05,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2019-05-05 20:00:00,449,0


Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
295333,TRAM2 - R469 - 00-05-01,2019-06-26,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2019-06-26 21:00:00,0,1
295334,TRAM2 - R469 - 00-05-01,2019-06-27,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2019-06-27 21:00:00,0,1
295335,TRAM2 - R469 - 00-05-01,2019-06-28,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2019-06-28 21:00:00,0,1
295336,TRAM2 - R469 - 00-05-01,2019-06-29,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2019-06-29 21:00:00,0,0
295337,TRAM2 - R469 - 00-05-01,2019-06-30,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2019-06-30 21:00:00,0,0


In [93]:
# Drop the 4/30/2021 rows

print(daily_per_turnstile_2021.shape)
daily_per_turnstile_2021 = daily_per_turnstile_2021[ \
                                pd.to_datetime(daily_per_turnstile_2021['DATE']) != pd.to_datetime('2021-04-30')]
daily_per_turnstile_2019.reset_index(drop=True, inplace=True)
print(daily_per_turnstile_2021.shape)
display(daily_per_turnstile_2021.head())
display(daily_per_turnstile_2021.tail())

(308646, 12)
(303670, 12)


Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
1,A002 - R051 - 02-00-00,2021-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-01 20:00:00,361,0
2,A002 - R051 - 02-00-00,2021-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-02 20:00:00,250,0
3,A002 - R051 - 02-00-00,2021-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-03 20:00:00,472,1
4,A002 - R051 - 02-00-00,2021-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-04 20:00:00,501,1
5,A002 - R051 - 02-00-00,2021-05-05,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,2021-05-05 20:00:00,522,1


Unnamed: 0,ID,DATE,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,DATETIME,NET_DAILY_ENTRIES,WEEKDAY
308641,TRAM2 - R469 - 00-05-01,2021-06-26,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2021-06-26 21:00:00,0,0
308642,TRAM2 - R469 - 00-05-01,2021-06-27,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2021-06-27 21:00:00,0,0
308643,TRAM2 - R469 - 00-05-01,2021-06-28,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2021-06-28 21:00:00,0,1
308644,TRAM2 - R469 - 00-05-01,2021-06-29,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2021-06-29 21:00:00,0,1
308645,TRAM2 - R469 - 00-05-01,2021-06-30,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,REGULAR,2021-06-30 21:00:00,0,1


### 2e) Figure out which turnstiles all belong to the same station and sum up station-level NET_DAILY_ENTRIES

#### While UNIT ("Remote Unit") generally denotes a station, certain stations actually have several UNITs associated with them; the term "Complex" describes the physical location where a Remote Unit resides.

Import MTA Complex List Dataset (based on MTA-provided data & augmented slightly by Chris Whong); found at the following link:

https://qri.cloud/nyc-transit-data/remote_complex_lookup

In [94]:
# NOTE: The 'remote' column in this dataset refers to the same entity as 'UNIT' in the turnstile data
#         (the full name is technically 'Remote Unit'); Also, 'booth' below refers to the same entity
#         as 'C/A' in the turnstile data

complex_list = pd.read_csv('nyc_transit_data_remote_complex_lookup.csv')

print(complex_list.shape)
complex_list

(776, 6)


Unnamed: 0,remote,booth,complex_id,station,line_name,division
0,R001,A060,635.0,WHITEHALL ST,R1,BMT
1,R001,A058,635.0,WHITEHALL ST,R1,BMT
2,R001,R101S,635.0,SOUTH FERRY,R1,IRT
3,R002,A077,628.0,FULTON ST,ACJZ2345,BMT
4,R002,A081,628.0,FULTON ST,ACJZ2345,BMT
5,R002,A082,628.0,FULTON ST,ACJZ2345,BMT
6,R003,J025,86.0,CYPRESS HILLS,J,BMT
7,R004,J028,85.0,ELDERTS LANE,JZ,BMT
8,R005,J030,84.0,FOREST PARKWAY,J,BMT
9,R006,J031,83.0,WOODHAVEN BLVD,JZ,BMT


Now import the dataset that maps complex_id to physical location of each "Complex" (MTA-provided data but again slightly modified by Chris Whong). Can be found at the following link: 

https://qri.cloud/nyc-transit-data/turnstiles_station_list

In [95]:
station_locations = pd.read_csv('nyc_transit_turnstiles_station_list.csv')

print(station_locations.shape)
station_locations

(511, 14)


Unnamed: 0,ogc_fid,station_id,complex_id,gtfs_stop_id,division,line,stop_name,borough,daytime_routes,structure,gtfs_latitude,gtfs_longitude,north_direction_label,south_direction_label
0,1,1.0,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2.0,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3.0,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4.0,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5.0,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan
5,6,6.0,6,R08,BMT,Astoria,39 Av,Q,N W,Elevated,40.752882,-73.932755,Astoria - Ditmars Blvd,Manhattan
6,7,7.0,613,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.76266,-73.967258,Queens,Downtown & Brooklyn
7,8,8.0,8,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
8,9,9.0,9,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658,Uptown & Queens,Downtown & Brooklyn
9,10,10.0,10,R15,BMT,Broadway - Brighton,49 St,M,N R W,Subway,40.759901,-73.984139,Uptown & Queens,Downtown & Brooklyn


In [96]:
# Let's drop some unnecessary columns from station_locations

station_locations = station_locations[['complex_id', 'division', 'line', 'stop_name', 
                                       'borough', 'gtfs_latitude', 'gtfs_longitude']]
station_locations.head()

Unnamed: 0,complex_id,division,line,stop_name,borough,gtfs_latitude,gtfs_longitude
0,1,BMT,Astoria,Astoria - Ditmars Blvd,Q,40.775036,-73.912034
1,2,BMT,Astoria,Astoria Blvd,Q,40.770258,-73.917843
2,3,BMT,Astoria,30 Av,Q,40.766779,-73.921479
3,4,BMT,Astoria,Broadway,Q,40.76182,-73.925508
4,5,BMT,Astoria,36 Av,Q,40.756804,-73.929575


#### Now merge complex_list with station_locations

In [97]:
station_locations['complex_id'].dtype

dtype('O')

In [98]:
# Need to convert 'complex_id' in complex_list to same format

print(complex_list['complex_id'].dtype)

complex_list['complex_id'] = complex_list['complex_id'].apply(lambda x: str(x).replace('.0','')).astype('str')
print(complex_list['complex_id'].dtype)

float64
object


In [99]:
complex_locations = complex_list.merge(right=station_locations, 
                                       how='inner', 
                                       on=['complex_id', 'division'])

In [100]:
print(complex_locations.shape)
complex_locations.head(20)

(809, 11)


Unnamed: 0,remote,booth,complex_id,station,line_name,division,line,stop_name,borough,gtfs_latitude,gtfs_longitude
0,R001,A060,635,WHITEHALL ST,R1,BMT,Broadway,Whitehall St,M,40.703087,-74.012994
1,R001,A058,635,WHITEHALL ST,R1,BMT,Broadway,Whitehall St,M,40.703087,-74.012994
2,R001,R101S,635,SOUTH FERRY,R1,IRT,Broadway - 7Av,South Ferry,M,40.702068,-74.013664
3,R002,A077,628,FULTON ST,ACJZ2345,BMT,Jamaica,Fulton St,M,40.710374,-74.007582
4,R002,A081,628,FULTON ST,ACJZ2345,BMT,Jamaica,Fulton St,M,40.710374,-74.007582
5,R002,A082,628,FULTON ST,ACJZ2345,BMT,Jamaica,Fulton St,M,40.710374,-74.007582
6,R003,J025,86,CYPRESS HILLS,J,BMT,Jamaica,Cypress Hills,Bk,40.689941,-73.87255
7,R004,J028,85,ELDERTS LANE,JZ,BMT,Jamaica,75 St,Q,40.691324,-73.867139
8,R005,J030,84,FOREST PARKWAY,J,BMT,Jamaica,85 St - Forest Pkwy,Q,40.692435,-73.86001
9,R006,J031,83,WOODHAVEN BLVD,JZ,BMT,Jamaica,Woodhaven Blvd,Q,40.693879,-73.851576


#### Now we can merge complex_locations with the daily_per_turnstile datasets for 2019 & 2021

In [101]:
complex_locations.columns = [col.upper() for col in complex_locations.columns]
complex_locations.head()

Unnamed: 0,REMOTE,BOOTH,COMPLEX_ID,STATION,LINE_NAME,DIVISION,LINE,STOP_NAME,BOROUGH,GTFS_LATITUDE,GTFS_LONGITUDE
0,R001,A060,635,WHITEHALL ST,R1,BMT,Broadway,Whitehall St,M,40.703087,-74.012994
1,R001,A058,635,WHITEHALL ST,R1,BMT,Broadway,Whitehall St,M,40.703087,-74.012994
2,R001,R101S,635,SOUTH FERRY,R1,IRT,Broadway - 7Av,South Ferry,M,40.702068,-74.013664
3,R002,A077,628,FULTON ST,ACJZ2345,BMT,Jamaica,Fulton St,M,40.710374,-74.007582
4,R002,A081,628,FULTON ST,ACJZ2345,BMT,Jamaica,Fulton St,M,40.710374,-74.007582


In [102]:
temp = complex_locations.groupby(['COMPLEX_ID', 'GTFS_LATITUDE', 'GTFS_LONGITUDE'])\
                                                                    .count().reset_index() \
                                                                    .sort_values(by=['STATION', 'COMPLEX_ID'],
                                                                                 ascending=False) \
                                                                    .drop(columns=['REMOTE', 'BOOTH', 'STATION',
                                                                                   'LINE_NAME', 'DIVISION', 'LINE',
                                                                                   'STOP_NAME', 'BOROUGH'])

temp['COMPLEX_ID'].value_counts()

611    5
628    4
623    4
624    4
601    3
617    3
620    3
610    3
606    3
602    3
629    2
605    2
607    2
615    2
616    2
619    2
622    2
625    2
630    2
635    2
636    2
603    2
608    2
604    2
614    2
618    2
609    2
613    2
383    1
369    1
37     1
370    1
371    1
377    1
378    1
379    1
380    1
381    1
382    1
384    1
367    1
385    1
386    1
387    1
388    1
391    1
392    1
395    1
396    1
399    1
368    1
366    1
41     1
365    1
341    1
343    1
344    1
346    1
347    1
348    1
349    1
350    1
351    1
352    1
353    1
354    1
355    1
356    1
357    1
36     1
360    1
361    1
362    1
363    1
364    1
4      1
416    1
127    1
417    1
453    1
455    1
456    1
457    1
459    1
46     1
461    1
463    1
47     1
48     1
103    1
49     1
5      1
50     1
52     1
53     1
59     1
6      1
60     1
100    1
10     1
452    1
451    1
450    1
431    1
418    1
419    1
420    1
421    1
422    1
423    1
425    1
4

In [103]:
complex_locations[complex_locations['COMPLEX_ID'] == '611']

Unnamed: 0,REMOTE,BOOTH,COMPLEX_ID,STATION,LINE_NAME,DIVISION,LINE,STOP_NAME,BOROUGH,GTFS_LATITUDE,GTFS_LONGITUDE
14,R010,N062A,611,42 ST-PA BUS TE,ACENQRS1237,IND,8th Av - Fulton St,42 St - Port Authority Bus Terminal,M,40.757308,-73.989735
15,R010,N060,611,42 ST-PA BUS TE,ACENQRS1237,IND,8th Av - Fulton St,42 St - Port Authority Bus Terminal,M,40.757308,-73.989735
16,R011,N063A,611,42 ST-PA BUS TE,ACENQRS1237,IND,8th Av - Fulton St,42 St - Port Authority Bus Terminal,M,40.757308,-73.989735
17,R011,N062,611,42 ST-PA BUS TE,ACENQRS1237,IND,8th Av - Fulton St,42 St - Port Authority Bus Terminal,M,40.757308,-73.989735
18,R011,N063,611,42 ST-PA BUS TE,ACENQRS1237,IND,8th Av - Fulton St,42 St - Port Authority Bus Terminal,M,40.757308,-73.989735
81,R032,R145,611,42 ST-TIMES SQ,1237ACENQRS,IRT,Broadway - 7Av,Times Sq - 42 St,M,40.75529,-73.987495
82,R032,R145,611,42 ST-TIMES SQ,1237ACENQRS,IRT,Flushing,Times Sq - 42 St,M,40.755477,-73.987691
83,R032,R145,611,42 ST-TIMES SQ,1237ACENQRS,IRT,Lexington - Shuttle,Times Sq - 42 St,M,40.755983,-73.986229
84,R032,R143,611,42 ST-TIMES SQ,ACENQRS1237,IRT,Broadway - 7Av,Times Sq - 42 St,M,40.75529,-73.987495
85,R032,R143,611,42 ST-TIMES SQ,ACENQRS1237,IRT,Flushing,Times Sq - 42 St,M,40.755477,-73.987691


In [104]:
# Drop unnecessary column
daily_per_turnstile_2019.drop(columns=['DATETIME'], inplace=True)

# Get consistent column names between the two dataframes to be merged

daily_per_turnstile_2019.columns = ['ID', 'DATE', 'BOOTH', 'REMOTE', 'SCP', 'STATION', 
                                    'LINE_NAME', 'DIVISION', 'DESC', 'NET_DAILY_ENTRIES', 'WEEKDAY']
daily_per_turnstile_2019.head()

Unnamed: 0,ID,DATE,BOOTH,REMOTE,SCP,STATION,LINE_NAME,DIVISION,DESC,NET_DAILY_ENTRIES,WEEKDAY
0,A002 - R051 - 02-00-00,2019-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,1346,1
1,A002 - R051 - 02-00-00,2019-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,1327,1
2,A002 - R051 - 02-00-00,2019-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,1326,1
3,A002 - R051 - 02-00-00,2019-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,560,0
4,A002 - R051 - 02-00-00,2019-05-05,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,449,0


In [105]:
daily_per_turnstile_2019[(daily_per_turnstile_2019['REMOTE'] == 'R032') & 
                         (daily_per_turnstile_2019['BOOTH'] == 'R145')].head()

Unnamed: 0,ID,DATE,BOOTH,REMOTE,SCP,STATION,LINE_NAME,DIVISION,DESC,NET_DAILY_ENTRIES,WEEKDAY
198821,R145 - R032 - 00-00-00,2019-05-01,R145,R032,00-00-00,TIMES SQ-42 ST,1237ACENQRSW,IRT,REGULAR,854,1
198822,R145 - R032 - 00-00-00,2019-05-02,R145,R032,00-00-00,TIMES SQ-42 ST,1237ACENQRSW,IRT,REGULAR,884,1
198823,R145 - R032 - 00-00-00,2019-05-03,R145,R032,00-00-00,TIMES SQ-42 ST,1237ACENQRSW,IRT,REGULAR,780,1
198824,R145 - R032 - 00-00-00,2019-05-04,R145,R032,00-00-00,TIMES SQ-42 ST,1237ACENQRSW,IRT,REGULAR,495,0
198825,R145 - R032 - 00-00-00,2019-05-05,R145,R032,00-00-00,TIMES SQ-42 ST,1237ACENQRSW,IRT,REGULAR,386,0


In [106]:
daily_per_turnstile_loc_2019 = complex_locations.merge(right=daily_per_turnstile_2019, 
                                                       how='inner',
                                                       on=['BOOTH', 'REMOTE', 'DIVISION'])

In [107]:
# Didn't even lose 5000 rows here - pretty good!

print(daily_per_turnstile_2019.shape)
daily_per_turnstile_loc_2019.shape

(295338, 11)


(290643, 19)

In [108]:
# Check for NaNs

daily_per_turnstile_loc_2019[daily_per_turnstile_loc_2019.isna().any(axis=1)].shape[0]

0

In [109]:
# STATION_x and STATION_y seem to match up well! (LINE_NAME_x and LINE_NAME_y, too!)

daily_per_turnstile_loc_2019.sample(n=20)

Unnamed: 0,REMOTE,BOOTH,COMPLEX_ID,STATION_x,LINE_NAME_x,DIVISION,LINE,STOP_NAME,BOROUGH,GTFS_LATITUDE,GTFS_LONGITUDE,ID,DATE,SCP,STATION_y,LINE_NAME_y,DESC,NET_DAILY_ENTRIES,WEEKDAY
80893,R068,R633,351,VAN SICLEN AVE,3,IRT,Eastern Pky,Van Siclen Av,Bk,40.665449,-73.889395,R633 - R068 - 00-00-02,2019-06-19,00-00-02,VAN SICLEN AV,3,REGULAR,607,1
121383,R120,R415,368,MORRISON AVE,6,IRT,Pelham,Morrison Av- Sound View,Bx,40.829521,-73.874516,R415 - R120 - 00-00-01,2019-06-15,00-00-01,MORISN AV/SNDVW,6,REGULAR,918,0
211895,R259,N602,222,ROOSEVELT IS,F,IND,63rd St,Roosevelt Island,M,40.759145,-73.95326,N602 - R259 - 00-05-01,2019-06-24,00-05-01,ROOSEVELT ISLND,F,REGULAR,0,1
160283,R170,A033,602,14 ST-UNION SQ,456LNQR,BMT,Canarsie,Union Sq - 14 St,M,40.734789,-73.99073,A033 - R170 - 02-00-03,2019-05-24,02-00-03,14 ST-UNION SQ,LNQR456W,REGULAR,2977,1
123646,R126,N010,147,175 ST,A,IND,8th Av - Fulton St,175 St,M,40.847391,-73.939704,N010 - R126 - 00-00-02,2019-06-21,00-00-02,175 ST,A,REGULAR,2233,1
58007,R046,R238,610,42 ST-GRD CNTRL,4567S,IRT,Flushing,Grand Central - 42 St,M,40.751431,-73.976041,R238 - R046 - 00-00-07,2019-05-28,00-00-07,GRD CNTRL-42 ST,4567S,REGULAR,275,1
215713,R268,N409,629,METROPOLITAN AV,GL,IND,Crosstown,Metropolitan Av,Bk,40.712792,-73.951418,N409 - R268 - 00-00-05,2019-06-03,00-00-05,METROPOLITAN AV,GL,REGULAR,664,1
36165,R030,R116,327,CHAMBERS ST,123,IRT,Broadway - 7Av,Chambers St,M,40.715478,-74.009266,R116 - R030 - 00-06-01,2019-06-09,00-06-01,CHAMBERS ST,123,REGULAR,354,0
58847,R046,R238,610,42 ST-GRD CNTRL,4567S,IRT,Lexington - Shuttle,Grand Central - 42 St,M,40.752769,-73.979189,R238 - R046 - 00-00-03,2019-05-14,00-00-03,GRD CNTRL-42 ST,4567S,REGULAR,225,1
147537,R153,N120,181,UTICA AVE,AC,IND,8th Av - Fulton St,Utica Av,Bk,40.679364,-73.930729,N120 - R153 - 00-00-01,2019-05-12,00-00-01,UTICA AV,AC,REGULAR,786,0


#### Reorder the columns for more readability (might be useful not to drop any for future troubleshooting)

In [110]:
daily_per_turnstile_loc_2019 = daily_per_turnstile_loc_2019[['STATION_x', 'STATION_y', 'STOP_NAME', 
                                                             'LINE_NAME_x', 'LINE_NAME_y', 'DIVISION', 
                                                             'BOROUGH', 'DATE', 'NET_DAILY_ENTRIES', 'WEEKDAY', 
                                                             'DESC', 'GTFS_LATITUDE', 'GTFS_LONGITUDE', 'LINE', 
                                                             'REMOTE', 'BOOTH', 'SCP', 'COMPLEX_ID', 'ID']]
print(daily_per_turnstile_loc_2019.shape)
daily_per_turnstile_loc_2019.head()

(290643, 19)


Unnamed: 0,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,DATE,NET_DAILY_ENTRIES,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,COMPLEX_ID,ID
0,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2019-05-01,2604,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
1,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2019-05-02,2676,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
2,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2019-05-03,2768,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
3,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2019-05-04,1717,0,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
4,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2019-05-05,1184,0,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00


##### Repeat all of the above steps for the 2021 data

In [111]:
# Drop unnecessary column
daily_per_turnstile_2021.drop(columns=['DATETIME'], inplace=True)

# Get consistent column names between the two dataframes to be merged

daily_per_turnstile_2021.columns = ['ID', 'DATE', 'BOOTH', 'REMOTE', 'SCP', 'STATION', 
                                    'LINE_NAME', 'DIVISION', 'DESC', 'NET_DAILY_ENTRIES', 'WEEKDAY']
daily_per_turnstile_2021.head()

Unnamed: 0,ID,DATE,BOOTH,REMOTE,SCP,STATION,LINE_NAME,DIVISION,DESC,NET_DAILY_ENTRIES,WEEKDAY
1,A002 - R051 - 02-00-00,2021-05-01,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,361,0
2,A002 - R051 - 02-00-00,2021-05-02,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,250,0
3,A002 - R051 - 02-00-00,2021-05-03,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,472,1
4,A002 - R051 - 02-00-00,2021-05-04,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,501,1
5,A002 - R051 - 02-00-00,2021-05-05,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,522,1


In [112]:
daily_per_turnstile_loc_2021 = complex_locations.merge(right=daily_per_turnstile_2021, 
                                                       how='inner',
                                                       on=['BOOTH', 'REMOTE', 'DIVISION'])

In [113]:
# Again, only lost ~5000 rows here - pretty good!

print(daily_per_turnstile_2021.shape)
daily_per_turnstile_loc_2021.shape

(303670, 11)


(298140, 19)

In [114]:
# Check for NaNs

daily_per_turnstile_loc_2021[daily_per_turnstile_loc_2021.isna().any(axis=1)].shape[0]

0

In [115]:
# Again, STATION_x and STATION_y seem to match up well! (LINE_NAME_x and LINE_NAME_y, too!)

daily_per_turnstile_loc_2021.sample(n=20)

Unnamed: 0,REMOTE,BOOTH,COMPLEX_ID,STATION_x,LINE_NAME_x,DIVISION,LINE,STOP_NAME,BOROUGH,GTFS_LATITUDE,GTFS_LONGITUDE,ID,DATE,SCP,STATION_y,LINE_NAME_y,DESC,NET_DAILY_ENTRIES,WEEKDAY
250385,R336,N192,205,BEACH 60 ST,A,IND,Rockaway,Beach 60 St,Q,40.592374,-73.788522,N192 - R336 - 00-00-01,2021-06-19,00-00-01,BEACH 60 ST,A,REGULAR,166,0
92760,R085,A039,16,8 ST-B'WAY NYU,NR,BMT,Broadway - Brighton,8 St - NYU,M,40.730328,-73.992629,A039 - R085 - 01-00-00,2021-05-04,01-00-00,8 ST-NYU,NRW,REGULAR,144,1
12244,R014,R208,628,FULTON ST,2345ACJZ,IRT,Lexington Av,Fulton St,M,40.710368,-74.009509,R208 - R014 - 03-03-00,2021-05-03,03-03-00,FULTON ST,2345ACJZ,REGULAR,9,1
219781,R267,N316A,270,46 ST,MR,IND,Queens Blvd,46 St,Q,40.756312,-73.913333,N316A - R267 - 01-00-02,2021-06-16,01-00-02,46 ST,MR,REGULAR,77,1
232834,R297,N114,627,FRANKLIN AVE,ACS,IND,8th Av - Fulton St,Franklin Av,Bk,40.68138,-73.956848,N114 - R297 - 01-05-00,2021-05-03,01-05-00,FRANKLIN AV,ACS,REGULAR,1,1
41216,R033,R151,611,42 ST-TIMES SQ,1237ACENQRS,IRT,Broadway - 7Av,Times Sq - 42 St,M,40.75529,-73.987495,R151 - R033 - 00-00-08,2021-06-20,00-00-08,TIMES SQ-42 ST,1237ACENQRSW,REGULAR,476,0
171843,R177,R246,399,68ST-HUNTER COL,6,IRT,Lexington Av,68 St - Hunter College,M,40.768141,-73.96387,R246 - R177 - 00-03-01,2021-05-30,00-03-01,68ST-HUNTER CO,6,REGULAR,37,0
260590,R359,N400A,606,COURT SQ,EMG,IND,Crosstown,Court Sq,Q,40.746554,-73.943832,N400A - R359 - 02-00-01,2021-06-05,02-00-01,COURT SQ,EMG,REGULAR,34,0
39174,R032,R145,611,42 ST-TIMES SQ,1237ACENQRS,IRT,Lexington - Shuttle,Times Sq - 42 St,M,40.755983,-73.986229,R145 - R032 - 00-06-02,2021-05-22,00-06-02,TIMES SQ-42 ST,1237ACENQRSW,REGULAR,133,0
52068,R044,R210A,622,BROOKLYN BRIDGE,JZ456,IRT,Lexington Av,Brooklyn Bridge - City Hall,M,40.713065,-74.004131,R210A - R044 - 03-03-00,2021-05-19,03-03-00,BROOKLYN BRIDGE,456JZ,REGULAR,239,1


#### Reorder the columns for more readability (might be useful not to drop any for future troubleshooting)

In [116]:
daily_per_turnstile_loc_2021 = daily_per_turnstile_loc_2021[['STATION_x', 'STATION_y', 'STOP_NAME', 
                                                             'LINE_NAME_x', 'LINE_NAME_y', 'DIVISION', 
                                                             'BOROUGH', 'DATE', 'NET_DAILY_ENTRIES', 'WEEKDAY', 
                                                             'DESC', 'GTFS_LATITUDE', 'GTFS_LONGITUDE', 'LINE', 
                                                             'REMOTE', 'BOOTH', 'SCP', 'COMPLEX_ID', 'ID']]
print(daily_per_turnstile_loc_2021.shape)
daily_per_turnstile_loc_2021.head()

(298140, 19)


Unnamed: 0,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,DATE,NET_DAILY_ENTRIES,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,COMPLEX_ID,ID
0,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2021-05-01,692,0,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
1,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2021-05-02,578,0,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
2,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2021-05-03,895,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
3,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2021-05-04,947,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00
4,WHITEHALL ST,WHITEHALL S-FRY,Whitehall St,R1,R1W,BMT,M,2021-05-05,905,1,REGULAR,40.703087,-74.012994,Broadway,R001,A060,00-00-00,635,A060 - R001 - 00-00-00


### 2f) Now generate NET_DAILY_STATION_ENTRIES (i.e. aggregate from per-turnstile to per-station basis)

#### (Recall from previous section that 'COMPLEX_ID' is the unifying element that identifies all turnstiles residing "under one roof")

In [117]:
daily_per_complex_2019 = daily_per_turnstile_loc_2019.groupby(['COMPLEX_ID', 'DATE'])['NET_DAILY_ENTRIES'] \
                                                     .sum().reset_index()

daily_per_complex_2019.columns = ['COMPLEX_ID', 'DATE', 'NET_DAILY_STATION_ENTRIES']
print(daily_per_complex_2019.shape)
display(daily_per_complex_2019.head())
display(daily_per_complex_2019.tail())

(25567, 3)


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES
0,1,2019-05-01,18999
1,1,2019-05-02,19257
2,1,2019-05-03,18661
3,1,2019-05-04,9577
4,1,2019-05-05,6630


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES
25562,99,2019-06-26,5117
25563,99,2019-06-27,5155
25564,99,2019-06-28,4908
25565,99,2019-06-29,3587
25566,99,2019-06-30,3284


#### Now merge in the other desired columns

In [118]:
daily_per_complex_2019 = daily_per_complex_2019.merge(right=daily_per_turnstile_loc_2019
                                                      .groupby(['COMPLEX_ID', 'DATE'])
                                                      .first().reset_index(),
                                                      how='inner',
                                                      on=['COMPLEX_ID', 'DATE']) \
                                                      .drop(columns='NET_DAILY_ENTRIES')

In [119]:
print(daily_per_complex_2019.shape)
display(daily_per_complex_2019.head(10))
display(daily_per_complex_2019.tail(10))

(25567, 19)


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID
0,1,2019-05-01,18999,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
1,1,2019-05-02,19257,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
2,1,2019-05-03,18661,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
3,1,2019-05-04,9577,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
4,1,2019-05-05,6630,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
5,1,2019-05-06,18272,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
6,1,2019-05-07,19146,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
7,1,2019-05-08,19359,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
8,1,2019-05-09,19073,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
9,1,2019-05-10,18527,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID
25557,99,2019-06-21,5253,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25558,99,2019-06-22,4194,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,RECOVR AUD,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25559,99,2019-06-23,3647,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25560,99,2019-06-24,5175,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25561,99,2019-06-25,5099,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25562,99,2019-06-26,5117,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25563,99,2019-06-27,5155,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25564,99,2019-06-28,4908,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25565,99,2019-06-29,3587,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25566,99,2019-06-30,3284,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00


##### Do the same for 2021 data

In [120]:
daily_per_complex_2021 = daily_per_turnstile_loc_2021.groupby(['COMPLEX_ID', 'DATE'])['NET_DAILY_ENTRIES'] \
                                                     .sum().reset_index()

daily_per_complex_2021.columns = ['COMPLEX_ID', 'DATE', 'NET_DAILY_STATION_ENTRIES']
print(daily_per_complex_2021.shape)
display(daily_per_complex_2021.head())
display(daily_per_complex_2021.tail())

(25615, 3)


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES
0,1,2021-05-01,3485
1,1,2021-05-02,2378
2,1,2021-05-03,4990
3,1,2021-05-04,5290
4,1,2021-05-05,5196


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES
25610,99,2021-06-26,1132
25611,99,2021-06-27,981
25612,99,2021-06-28,1663
25613,99,2021-06-29,1542
25614,99,2021-06-30,1669


#### Now merge in the other desired columns

In [121]:
daily_per_complex_2021 = daily_per_complex_2021.merge(right=daily_per_turnstile_loc_2021
                                                      .groupby(['COMPLEX_ID', 'DATE'])
                                                      .first().reset_index(),
                                                      how='inner',
                                                      on=['COMPLEX_ID', 'DATE']) \
                                                      .drop(columns='NET_DAILY_ENTRIES')

In [122]:
print(daily_per_complex_2021.shape)
display(daily_per_complex_2021.head(10))
display(daily_per_complex_2021.tail(10))

(25615, 19)


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID
0,1,2021-05-01,3485,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
1,1,2021-05-02,2378,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
2,1,2021-05-03,4990,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
3,1,2021-05-04,5290,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
4,1,2021-05-05,5196,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
5,1,2021-05-06,5470,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
6,1,2021-05-07,5448,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
7,1,2021-05-08,3192,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
8,1,2021-05-09,2362,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,0,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00
9,1,2021-05-10,4986,DITMARS BL-31 S,ASTORIA DITMARS,Astoria - Ditmars Blvd,NQ,NQW,BMT,Q,1,REGULAR,40.775036,-73.912034,Astoria,R095,R515,00-00-00,R515 - R095 - 00-00-00


Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID
25605,99,2021-06-21,1711,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25606,99,2021-06-22,1705,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25607,99,2021-06-23,1909,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25608,99,2021-06-24,1836,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25609,99,2021-06-25,1868,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25610,99,2021-06-26,1132,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25611,99,2021-06-27,981,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25612,99,2021-06-28,1663,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25613,99,2021-06-29,1542,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00
25614,99,2021-06-30,1669,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,1,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00


#### Verify that we've still got a common set of stations (COMPLEX_IDs) between the two years

In [123]:
print(daily_per_complex_2019['COMPLEX_ID'].unique().shape)
print(daily_per_complex_2021['COMPLEX_ID'].unique().shape)
print(np.setdiff1d(daily_per_complex_2019['COMPLEX_ID'].unique(), daily_per_complex_2021['COMPLEX_ID'].unique()))
print(np.setdiff1d(daily_per_complex_2021['COMPLEX_ID'].unique(), daily_per_complex_2019['COMPLEX_ID'].unique()))

(420,)
(420,)
[]
[]


### 2g) Now add ZIPCODEs corresponding to the latitude/longitude information

In [124]:
# Just using WEEKDAY column as a placeholder during grouping; will drop immediately

locations = daily_per_complex_2019.groupby(['COMPLEX_ID', 'GTFS_LATITUDE', 'GTFS_LONGITUDE']) \
                                  ['WEEKDAY'].first().reset_index()

locations.drop(columns='WEEKDAY', inplace=True)
print(locations.shape)
locations

(421, 3)


Unnamed: 0,COMPLEX_ID,GTFS_LATITUDE,GTFS_LONGITUDE
0,1,40.775036,-73.912034
1,10,40.759901,-73.984139
2,100,40.70687,-73.953431
3,101,40.708359,-73.957757
4,103,40.72028,-73.993915
5,107,40.706476,-74.011056
6,108,40.711396,-73.889601
7,109,40.706186,-73.895877
8,110,40.704423,-73.903077
9,111,40.702762,-73.90774


In [125]:
# This implementation is based on the info posted here:
#  https://stackoverflow.com/questions/66144427/how-do-i-get-zipcodes-from-longitude-and-latitude-on-python

# geo = geopy.Nominatim(user_agent="check_1")

def get_zipcode(df):
    geo = geopy.Nominatim(user_agent="check_1")
    try:
        return geo.reverse('{}, {}'.format(df['GTFS_LATITUDE'], df['GTFS_LONGITUDE'])) \
                                   .raw['address']['postcode']
    except:
        return np.nan

locations['ZIPCODE'] = locations.apply(get_zipcode, axis=1)
    
# locations['ZIPCODE'] = locations.apply(lambda x: geo.reverse('{}, {}'
#                                 .format(x['GTFS_LATITUDE'], x['GTFS_LONGITUDE'])) \
#                                 .raw['address']['postcode'], axis=1)

# geo.reverse("{}, {}".format(40.641362, -74.017881)).raw['address']['neighbourhood']

In [126]:
print(locations.shape)
display(locations[locations['ZIPCODE'].isna()])

(421, 4)


Unnamed: 0,COMPLEX_ID,GTFS_LATITUDE,GTFS_LONGITUDE,ZIPCODE
186,313,40.778453,-73.98197,


In [127]:
# Not sure why the coordinates above failed in Geopy; manual lookup reveals the ZIPCODE to be 10023

locations.loc[186, 'ZIPCODE'] = 10023
display(locations[locations['ZIPCODE'].isna()])

Unnamed: 0,COMPLEX_ID,GTFS_LATITUDE,GTFS_LONGITUDE,ZIPCODE


#### Strip off the -xxxx extension present in some of the ZIPCODEs

In [128]:
locations['ZIPCODE'] = locations['ZIPCODE'].apply(lambda x: str(x).split(sep = '-')[0])

locations[locations['ZIPCODE'].map(len) > 5]

Unnamed: 0,COMPLEX_ID,GTFS_LATITUDE,GTFS_LONGITUDE,ZIPCODE
313,448,40.754622,-73.845625,111354


In [129]:
# Geopy fails again: Manual lookup of the coordinates above reveals ZIPCODE to be 11368

locations.loc[313, 'ZIPCODE'] = 11368
locations.loc[313, 'ZIPCODE']

11368

In [130]:
# Another known error (don't worry, Geopy - I still like you better than the GoogleMaps API)

locations[locations['ZIPCODE'] == '11227']

Unnamed: 0,COMPLEX_ID,GTFS_LATITUDE,GTFS_LONGITUDE,ZIPCODE
205,337,40.688246,-73.980492,11227


In [131]:
# 11227 is not a valid ZIPCODE; manual lookup of the coordinates above reveals ZIPCODE to be 11217

locations.loc[205, 'ZIPCODE'] = 11217
locations.loc[205, 'ZIPCODE']

11217

#### Add ZIPCODE column to daily_per_complex datasets for both years

In [132]:
print(daily_per_complex_2019.shape)
daily_per_complex_2019 =daily_per_complex_2019.merge(right=locations, 
                                                     how='left', 
                                                     on=['COMPLEX_ID', 'GTFS_LATITUDE', 'GTFS_LONGITUDE'])
print(daily_per_complex_2019.shape)

(25567, 19)
(25567, 20)


In [133]:
daily_per_complex_2019.sample(10)

Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID,ZIPCODE
23585,66,2019-06-22,3915,18 AVE,18 AV,18 Av,D,D,BMT,Bk,0,REGULAR,40.607954,-74.001736,West End,R372,E012,00-00-00,E012 - R372 - 00-00-00,11214
21236,6,2019-05-22,2724,BEEBE-39 AVE,39 AV,39 Av,NQ,NQW,BMT,Q,1,REGULAR,40.752882,-73.932755,Astoria,R090,R510,00-00-00,R510 - R090 - 00-00-00,11101
20551,5,2019-05-08,6216,WASHINGTON-36 A,36 AV,36 Av,NQ,NQW,BMT,Q,1,REGULAR,40.756804,-73.929575,Astoria,R091,R511,00-00-00,R511 - R091 - 00-00-00,11106
25559,99,2019-06-23,3647,LORIMER ST,LORIMER ST,Lorimer St,JM,JM,BMT,Bk,0,REGULAR,40.703869,-73.947408,Jamaica,R353,J005,00-00-00,J005 - R353 - 00-00-00,11207
6720,232,2019-05-20,13551,2 AVE,2 AV,2 Av,F,F,IND,M,1,REGULAR,40.723402,-73.989938,6th Av - Culver,R300,N523,00-00-00,N523 - R300 - 00-00-00,10003
18215,433,2019-06-18,5163,JACKSON AVE,JACKSON AV,Jackson Av,25,25,IRT,Bx,1,REGULAR,40.81649,-73.907807,Lenox - White Plains Rd,R405,R312,00-00-00,R312 - R405 - 00-00-00,10459
1959,141,2019-05-15,1667,PARK PLACE,PARK PLACE,Park Pl,S,S,BMT,Bk,1,REGULAR,40.674772,-73.957624,Franklin Shuttle,R411,B009,00-00-00,B009 - R411 - 00-00-00,11233
9690,287,2019-05-01,1741,FLUSHING AVE,FLUSHING AV,Flushing Av,G,G,IND,Bk,1,REGULAR,40.700377,-73.950234,Crosstown,R316,N414,00-00-00,N414 - R316 - 00-00-00,11211
10185,295,2019-05-08,9869,231 ST,231 ST,231 St,1,1,IRT,Bx,1,REGULAR,40.878856,-73.904834,Broadway - 7Av,R040,R194,00-00-00,R194 - R040 - 00-00-00,10463
10295,296,2019-06-26,4900,MARBLE HILL-225,MARBLE HILL-225,Marble Hill - 225 St,1,1,IRT,M,1,REGULAR,40.874561,-73.909831,Broadway - 7Av,R039,R192,00-00-00,R192 - R039 - 00-00-00,10463


#### Double check for NaNs, then save daily_per_complex_2019 to file (can reload below and avoid running all of above in future)

In [134]:
daily_per_complex_2019[daily_per_complex_2019.isna().any(axis=1)].shape[0]

0

In [136]:
# Okay, write this dataframe to file and then we don't have to run any of the above again!

daily_per_complex_2019.to_csv('daily_per_complex_2019.csv', index=False)

##### Repeat the above steps for the 2021 data

In [137]:
# Add ZIPCODE column

print(daily_per_complex_2021.shape)
daily_per_complex_2021 =daily_per_complex_2021.merge(right=locations, 
                                                     how='left', 
                                                     on=['COMPLEX_ID', 'GTFS_LATITUDE', 'GTFS_LONGITUDE'])
print(daily_per_complex_2021.shape)

(25615, 19)
(25615, 20)


In [138]:
daily_per_complex_2021.sample(10)

Unnamed: 0,COMPLEX_ID,DATE,NET_DAILY_STATION_ENTRIES,STATION_x,STATION_y,STOP_NAME,LINE_NAME_x,LINE_NAME_y,DIVISION,BOROUGH,WEEKDAY,DESC,GTFS_LATITUDE,GTFS_LONGITUDE,LINE,REMOTE,BOOTH,SCP,ID,ZIPCODE
17469,422,2021-05-25,1305,BURKE AVE,BURKE AV,Burke Av,25,25,IRT,Bx,1,REGULAR,40.871356,-73.867164,Lenox - White Plains Rd,R363,R330,00-00-00,R330 - R363 - 00-00-00,10467
18483,439,2021-05-02,3657,149 ST-GR CONC,149/GRAND CONC,125 St,245,245,IRT,M,0,REGULAR,40.807754,-73.945495,Lenox - White Plains Rd,R205,R260,01-05-00,R260 - R205 - 01-05-00,10027
21128,58,2021-05-24,5847,STILLWELL AVE,CONEY IS-STILLW,Coney Island - Stillwell Av,DFNQ,DFNQ,BMT,Bk,1,REGULAR,40.577422,-73.981233,Sea Beach / West End / Culver / Brighton,R151,G001,00-00-00,G001 - R151 - 00-00-00,11224
7644,250,2021-05-20,1713,KINGS HIGHWAY,KINGS HWY,Kings Hwy,F,F,IND,Bk,1,REGULAR,40.603217,-73.972361,6th Av - Culver,R130,N557,00-00-00,N557 - R130 - 00-00-00,11223
18425,438,2021-05-05,4865,135 ST,135 ST,135 St,23,23,IRT,M,1,REGULAR,40.814229,-73.94077,Lenox - White Plains Rd,R207,R306,00-00-00,R306 - R207 - 00-00-00,10030
20092,464,2021-05-25,4869,VERNON/JACKSON,VERNON-JACKSON,Vernon Blvd - Jackson Av,7,7,IRT,Q,1,REGULAR,40.742626,-73.953581,Flushing,R276,R504,00-00-00,R504 - R276 - 00-00-00,11101
13501,353,2021-05-21,1424,PRESIDENT ST,PRESIDENT ST,President St,25,25,IRT,Bk,1,REGULAR,40.667883,-73.950683,Nostrand,R277,R635,00-00-00,R635 - R277 - 00-00-00,11225
24429,80,2021-05-31,619,121 ST,121 ST,121 St,JZ,JZ,BMT,Q,1,REGULAR,40.700492,-73.828294,Jamaica,R009,J037,00-00-00,J037 - R009 - 00-00-00,11418
3223,164,2021-06-21,22448,34 ST-PENN STA,34 ST-PENN STA,34 St - Penn Station,ACE,ACE,IND,M,1,REGULAR,40.752287,-73.993391,8th Av - Fulton St,R012,N067,00-00-00,N067 - R012 - 00-00-00,10001
4915,198,2021-06-05,917,HOWARD BCH-JFK,HOWARD BCH JFK,Howard Beach - JFK Airport,A,A,IND,Q,0,REGULAR,40.660476,-73.830301,Rockaway,R414,N182,00-00-00,N182 - R414 - 00-00-00,11414


#### Double check for NaNs, then save daily_per_complex_2021 to file (can reload below and avoid running all of above in future)

In [139]:
daily_per_complex_2021[daily_per_complex_2021.isna().any(axis=1)].shape[0]

0

In [140]:
# Okay, write this dataframe to file and then we don't have to run any of the above again!

daily_per_complex_2021.to_csv('daily_per_complex_2021.csv', index=False)

## Move to Project_Notebook_Visualizations & start by loading in the 2 datasets just saved!