In [41]:
import pandas as pd
import numpy as np
import geopandas as gpd
import glob
from sklearn.metrics.pairwise import haversine_distances
from math import radians
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / NumPy diagnostics
# (e.g. SettingWithCopyWarning, RuntimeWarning) that can point at real
# problems — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
# render matplotlib figures inline in the notebook output
%matplotlib inline

In [20]:
# MTA subway ridership, aggregated on 4-hour intervals;
# only the rows whose `flag` column is True are kept

mta = pd.read_csv('subway_2021_ridership.csv').loc[lambda df: df['flag'].eq(True)]
mta.head()

Unnamed: 0,STATION,DATE_time,ENTRIES,ridership,flag
0,1 AV,2020-12-26 04:00:00,15510689,3.0,True
1,1 AV,2020-12-26 08:00:00,15510717,28.0,True
2,1 AV,2020-12-26 12:00:00,15510758,41.0,True
3,1 AV,2020-12-26 16:00:00,15510831,73.0,True
4,1 AV,2020-12-26 20:00:00,15510877,46.0,True


In [21]:
# summary statistics for the numeric columns of the flag-filtered ridership table
mta.describe()

Unnamed: 0,ENTRIES,ridership
count,610875.0,610875.0
mean,31497460.0,85.28743
std,175744500.0,108.394265
min,0.0,0.0
25%,822501.5,11.0
50%,4247213.0,49.0
75%,8190512.0,122.0
max,2147419000.0,9934.0


In [22]:
# station locations: one row per origin station with its id and coordinates,
# plus the names / ids / coordinates of its destination stations

mta_loc = pd.read_csv(filepath_or_buffer='subway_locations_connections.csv')
mta_loc.head()

Unnamed: 0,origin_name,origin_id,origin_lat,origin_long,dest_name,dest_id,dest_lat,dest_long
0,1 AV,L06,40.730953,-73.981628,"['3 Av', 'Bedford Av']","['L05', 'L08']","[40.732849, 40.717304]","[-73.986122, -73.956872]"
1,103 ST,119,40.799446,-73.968379,"['96 St', 'Cathedral Pkwy']","['120', '118']","[40.793919, 40.803967]","[-73.972323, -73.966847]"
2,103 ST - CORONA PLAZA,706,40.749865,-73.8627,"['Junction Blvd', '111 St']","['707', '705']","[40.749145, 40.75173]","[-73.869527, -73.855334]"
3,104 ST,A63,40.681711,-73.837683,"['111 St', 'Rockaway Blvd']","['A64', 'A61']","[40.684331, 40.680429]","[-73.832163, -73.843853]"
4,110 ST,623,40.79502,-73.94425,"['116 St', '103 St']","['622', '624']","[40.798629, 40.7906]","[-73.941617, -73.947478]"


In [28]:
# get daily ridership counts for each station

# parse the 4-hour timestamps and derive the calendar date of each reading
mta['DATE_time'] = pd.to_datetime(mta['DATE_time'])
mta['DATE'] = mta['DATE_time'].dt.date

# Sum only `ridership`. Summing the whole frame would also try to add up the
# datetime column `DATE_time`, which raises a TypeError on pandas >= 2.0
# (and is wasted work on older versions).
mta_rider = mta.groupby(by=['DATE', 'STATION'], as_index=False)['ridership'].sum()

# Attach station id / coordinates by station name and build the daily table in
# one chain, so no column is assigned on a slice of another frame.
# NOTE(review): station names may not be unique in mta_loc; drop_duplicates keeps
# the first matching location row per (DATE, STATION) — confirm that is the
# intended station.
mta_daily = (
    mta_rider
    .merge(mta_loc, left_on='STATION', right_on='origin_name')
    .drop_duplicates(subset=['DATE', 'STATION'])
    .loc[:, ['DATE', 'STATION', 'ridership', 'origin_id', 'origin_lat', 'origin_long']]
    .assign(DATE=lambda df: pd.to_datetime(df['DATE']))
    .assign(day=lambda df: df['DATE'].dt.day_of_year)
)
mta_daily = mta_daily[mta_daily['DATE'] > '2020-12-31']  # keep data just for 2021
mta_daily.head()

Unnamed: 0,DATE,STATION,ridership,origin_id,origin_lat,origin_long,day
6,2021-01-01,1 AV,130.0,L06,40.730953,-73.981628,1
7,2021-01-02,1 AV,199.0,L06,40.730953,-73.981628,2
8,2021-01-03,1 AV,161.0,L06,40.730953,-73.981628,3
9,2021-01-04,1 AV,329.0,L06,40.730953,-73.981628,4
10,2021-01-05,1 AV,379.0,L06,40.730953,-73.981628,5


In [34]:
# connection list: edge list of station ids, one (origin, dest) pair per row
links = pd.read_csv(
    'https://raw.githubusercontent.com/CUSP2021ADS/Data/main/NYCSubwayEdges.csv',
    header=0,
    index_col=None,
)
links.head()

Unnamed: 0,origin,dest
0,G26,G24
1,G26,G28
2,G24,G26
3,G24,G22
4,G22,G24


In [31]:
# hold out 30% of the station ids as masked nodes on which to test

# train_test_split is called without random_state, so it draws from NumPy's
# global RNG; seeding it here, in the same cell, keeps the split repeatable
np.random.seed(2002)

sub_list = mta_daily['origin_id'].unique()
nodes_known, nodes_mask = train_test_split(sub_list, test_size=0.3)

print(f"{len(nodes_known)} {len(nodes_mask)}")

166 72


In [16]:
# NOTE(review): the output recorded for this cell has no `day` column and starts
# at 2020-12-26, so it appears to pre-date the current daily-ridership cell
# (which adds `day` and keeps only 2021) — re-run to refresh it.
mta_daily.head()

Unnamed: 0,DATE,STATION,ridership,origin_id,origin_lat,origin_long
0,2020-12-26,1 AV,191.0,L06,40.730953,-73.981628
1,2020-12-27,1 AV,159.0,L06,40.730953,-73.981628
2,2020-12-28,1 AV,310.0,L06,40.730953,-73.981628
3,2020-12-29,1 AV,390.0,L06,40.730953,-73.981628
4,2020-12-30,1 AV,402.0,L06,40.730953,-73.981628


In [32]:
# station ids held out by the train/test split (the masked nodes)
nodes_mask

array(['A40', 'B21', 'R04', '234', 'G28', '406', '505', 'L14', 'G24',
       'R40', '402', '210', '213', 'G19', 'S04', '123', '602', '241',
       'M19', '111', '246', 'A11', 'F33', '224', '603', 'F02', 'J22',
       'L16', '130', 'D06', 'M04', 'H06', 'J29', '705', 'G11', '615',
       '226', 'H08', 'R42', '618', 'G09', 'L08', '134', '242', '205',
       '114', 'J30', 'R32', 'R03', 'F21', '256', '248', 'M16', 'F34',
       'M23', 'A48', '215', '607', 'F29', 'F18', 'A52', '409', '719',
       'D22', 'R15', 'B10', 'H13', 'D08', 'D27', '206', '255', 'L11'],
      dtype=object)

In [74]:
n_days = 30  # how many days (in order of first appearance in mta_daily) to evaluate

# Baseline: predict each masked station's daily ridership as the average ridership
# of its directly linked stations whose ridership is known, then score the
# predictions with R^2 separately for every day.
# (R^2 below zero means the baseline does worse than predicting the mean.)
r2_by_day = {}  # day of year -> R^2 of the neighbour-average baseline

for d in mta_daily.day.unique()[:n_days]:

    pred = []

    day_rows = mta_daily[mta_daily.day == d]
    # index by station id so ridership can be looked up with .loc;
    # drop=False keeps `origin_id` available as a column as well
    mta_mask = day_rows[day_rows.origin_id.isin(nodes_mask)].set_index('origin_id', drop=False)
    mta_known = day_rows[day_rows.origin_id.isin(nodes_known)].set_index('origin_id', drop=False)
    links_temp = links[links.dest.isin(mta_known.origin_id)] # keep only links whose destination has known ridership on this day

    # group the links once per day instead of re-scanning them for every masked station
    known_neighbors = {origin: dests.values for origin, dests in links_temp.groupby('origin')['dest']}

    # get neighbors and avg ridership for each masked node
    for stat in mta_mask.origin_id.values:

        neib = known_neighbors.get(stat) # get neighbors
        if neib is None:
            # no neighbour with known ridership: there is nothing to average,
            # so the station is left out of the score for this day
            continue

        stat_ridership = mta_mask.loc[stat].ridership
        neib_ridership = np.mean(mta_known.loc[neib].ridership.values) # avg. ridership

        pred.append([stat_ridership, neib_ridership])

    # column 0 = actual ridership, column 1 = neighbour-average prediction
    x = pd.DataFrame(pred, columns=[0, 1])
    x.dropna(inplace=True)
    r2_by_day[d] = r2_score(x[0], x[1])
    print(r2_by_day[d])

-0.37976316204468685
-0.48976027518543663
-0.5648080396918322
-0.4178522915898708
-0.46639590965383393
-0.39516817453081177
-0.41507168096577285
-0.2933645924532584
-0.47852904709679156
-0.1668667119509477
-0.28842101765591477
-0.32129831829033506
-0.2721286501881921
-0.19537993051735558
-0.20134947471952946
-0.47548134806295805
-0.49844657972168105
-0.34004911129732274
-0.1869962992705252
-0.38561581562040836
-0.3515555773604795
-0.40405141655257615
-0.4377338240043176
-0.5055378548776914
-0.33311617340252053
-0.36595894553532027
-0.4327830256968306
-0.33586493943802376
-0.2836653619453473
-0.376787966885354
