In [70]:
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import networkx as nx
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

In [71]:
# mta ridership - aggregated on 4hr intervals

mta = pd.read_csv('subway_2021_ridership.csv')
mta = mta[mta['flag'] == True]
mta.head()

Unnamed: 0,STATION,DATE_time,ENTRIES,ridership,flag
0,1 AV,2020-12-26 04:00:00,15510689,3.0,True
1,1 AV,2020-12-26 08:00:00,15510717,28.0,True
2,1 AV,2020-12-26 12:00:00,15510758,41.0,True
3,1 AV,2020-12-26 16:00:00,15510831,73.0,True
4,1 AV,2020-12-26 20:00:00,15510877,46.0,True


In [72]:
# aggregate on daily 

mta['DATE_time'] = pd.to_datetime(mta['DATE_time'])
mta['DATE'] = mta['DATE_time'].dt.date
mta_rider = mta.groupby(by=['DATE', 'STATION'], as_index=False).sum()[['DATE', 'STATION', 'ridership']]
mta_rider

Unnamed: 0,DATE,STATION,ridership
0,2020-12-26,1 AV,191.0
1,2020-12-26,103 ST,317.0
2,2020-12-26,103 ST-CORONA,703.0
3,2020-12-26,104 ST,20.0
4,2020-12-26,110 ST,239.0
...,...,...,...
102331,2021-12-24,WOODHAVEN BLVD,1546.0
102332,2021-12-24,WOODLAWN,488.0
102333,2021-12-24,WORLD TRADE CTR,233.0
102334,2021-12-24,YORK ST,989.0


In [73]:
daily_rider = pd.pivot(data=mta_rider, index='DATE', columns=['STATION'], values='ridership')
daily_rider.head()

STATION,1 AV,103 ST,103 ST-CORONA,104 ST,110 ST,111 ST,116 ST,121 ST,125 ST,135 ST,...,WAKEFIELD/241,WALL ST,WHITLOCK AV,WILSON AV,WINTHROP ST,WOODHAVEN BLVD,WOODLAWN,WORLD TRADE CTR,YORK ST,ZEREGA AV
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-26,191.0,317.0,703.0,20.0,239.0,70.0,84.0,216.0,782.0,1.0,...,267.0,53.0,35.0,299.0,224.0,800.0,362.0,71.0,7309.0,44.0
2020-12-27,159.0,366.0,649.0,19.0,229.0,87.0,65.0,213.0,822.0,0.0,...,315.0,46.0,20.0,253.0,188.0,696.0,349.0,60.0,447.0,47.0
2020-12-28,310.0,662.0,953.0,36.0,399.0,145.0,139.0,407.0,1226.0,536.0,...,495.0,218.0,55.0,478.0,542.0,1079.0,663.0,173.0,867.0,100.0
2020-12-29,390.0,678.0,1002.0,54.0,368.0,157.0,157.0,420.0,1187.0,57.0,...,478.0,227.0,48.0,461.0,573.0,1430.0,686.0,178.0,896.0,123.0
2020-12-30,402.0,680.0,1169.0,50.0,407.0,143.0,177.0,445.0,1116.0,,...,484.0,238.0,52.0,454.0,538.0,1195.0,696.0,173.0,894.0,121.0


In [74]:
# keep last 60 days test, fist 300 training

test_dates = daily_rider.index[300:]
train_dates = daily_rider.index[:299]

In [75]:
# train-test split for nodes

np.random.seed(2002)

sub_list = mta_rider.STATION.unique()

nodes_known, nodes_mask = train_test_split(sub_list, test_size=0.25)

print(len(nodes_known), len(nodes_mask))

215 72


$T_{i} = \sum w_{i,j}T_{j}$

$w_{i,j}$ estimated with regression

training for $t<T_0$

In [117]:
# run the model - using all stations T_j

def baseline4(model, station, data=daily_rider):

    print(station)
    # create train and test data
    data_train = daily_rider[daily_rider.index.isin(train_dates)]
    data_train.fillna(0, inplace=True)
    
    X_train = data_train.drop(columns={station}).values
    y_train = data_train[station].values  # y values as station ridership counts
    model.fit(X_train, y_train)
    pred = model.predict(X_train)
    
    print('train r2:', model.score(X_train, y_train))
    
    # test data X - all training nodes
    data_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                        (daily_rider.columns.isin(nodes_known))]

    data_test.fillna(0, inplace=True)
    
    # create dataframe for the weights for each T_ij
    weights = pd.DataFrame(columns=data_train.drop(columns={station}).columns.values)
    weights.loc[0] = model.coef_
    
    weights = weights[data_test.columns]
    
    # y test (t>T0)
    y_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                        station].values
    
    # get predictions for test set(T_i = sum w_i,j*T_j)
    pred = np.sum(weights.values*data_test.values, axis=1)
    
    print('test r2:', r2_score(y_test, pred))
    
    print('stations with high weights:', weights.sort_values(by=0, axis=1, ascending=False).columns[:5].values)
    print('\n')
    
    return (station, model.score(X_train, y_train), r2_score(y_test, pred))
    

all_stat = [baseline4(LinearRegression(fit_intercept=False, positive=True), stat) for stat in nodes_mask]

14 ST
train r2: 0.2714289962104798
test r2: 0.8008463959925135
stations with high weights: ['BROADWAY JCT' 'AQUEDUCT RACETR' 'CYPRESS HILLS' '3 AV' '80 ST']


157 ST
train r2: 0.9080740628943865
test r2: 0.7565003978638514
stations with high weights: ['80 ST' 'PELHAM PKWY' 'AQUEDUCT RACETR' '168 ST' 'FRESH POND RD']


EUCLID AV
train r2: 0.816498896046304
test r2: 0.30092516346966314
stations with high weights: ['AQUEDUCT RACETR' '80 ST' 'INWOOD-207 ST' '168 ST' 'BAYCHESTER AV']


HUNTS POINT AV
train r2: 0.3789809791617498
test r2: 0.22781486222236091
stations with high weights: ['INWOOD-207 ST' '80 ST' 'PELHAM PKWY' 'DITMAS AV' 'W 8 ST-AQUARIUM']


MARCY AV
train r2: 0.8817328422211904
test r2: -1.8326423788258364
stations with high weights: ['INWOOD-207 ST' 'AQUEDUCT RACETR' '50 ST' '55 ST' 'AVENUE I']


FLUSHING AV
train r2: 0.9594835141226639
test r2: -0.7074755092917819
stations with high weights: ['DITMAS AV' 'FRESH POND RD' '80 ST' 'COURT SQ' '135 ST']


18 ST
train r2: 0.94798

stations with high weights: ['PELHAM PKWY' 'DITMAS AV' 'LEXINGTON AV/53' 'WHITLOCK AV' 'FRESH POND RD']


BOWERY
train r2: 0.7480846058022641
test r2: -0.1002823285403942
stations with high weights: ['80 ST' '8 ST-NYU' 'CLARK ST' 'CORTLANDT ST' '135 ST']


JAY ST-METROTEC
train r2: 0.9307321284629455
test r2: 0.7487983157514962
stations with high weights: ['AQUEDUCT RACETR' 'CITY HALL' 'CHAMBERS ST' 'FT HAMILTON PKY'
 'LEXINGTON AV/53']


18 AV
train r2: 0.9400009127318536
test r2: -0.758334149358014
stations with high weights: ['PELHAM PKWY' '168 ST' 'BROAD CHANNEL' 'DITMAS AV' '25 AV']


LIBERTY AV
train r2: 0.15537116681479446
test r2: -2.792826179743885
stations with high weights: ['BROAD CHANNEL' 'CYPRESS HILLS' 'AVENUE I' 'METS-WILLETS PT' '1 AV']


SUTTER AV
train r2: 0.8598840020622578
test r2: 0.8013304186805643
stations with high weights: ['80 ST' 'PELHAM PKWY' '168 ST' 'ATLANTIC AV' 'FRESH POND RD']


CARROLL ST
train r2: 0.8890091124855992
test r2: 0.35222693490044155
stati

In [77]:
#connection list
links = pd.read_csv( 'https://raw.githubusercontent.com/CUSP2021ADS/Data/main/NYCSubwayEdges.csv' , index_col=None, header=0 )
links.head()

Unnamed: 0,origin,dest
0,G26,G24
1,G26,G28
2,G24,G26
3,G24,G22
4,G22,G24


In [78]:
# locations

mta_loc = pd.read_csv('subway_locations_connections.csv')
mta_loc.head()

Unnamed: 0,origin_name,origin_id,origin_lat,origin_long,dest_name,dest_id,dest_lat,dest_long
0,1 AV,L06,40.730953,-73.981628,"['3 Av', 'Bedford Av']","['L05', 'L08']","[40.732849, 40.717304]","[-73.986122, -73.956872]"
1,103 ST,119,40.799446,-73.968379,"['96 St', 'Cathedral Pkwy']","['120', '118']","[40.793919, 40.803967]","[-73.972323, -73.966847]"
2,103 ST - CORONA PLAZA,706,40.749865,-73.8627,"['Junction Blvd', '111 St']","['707', '705']","[40.749145, 40.75173]","[-73.869527, -73.855334]"
3,104 ST,A63,40.681711,-73.837683,"['111 St', 'Rockaway Blvd']","['A64', 'A61']","[40.684331, 40.680429]","[-73.832163, -73.843853]"
4,110 ST,623,40.79502,-73.94425,"['116 St', '103 St']","['622', '624']","[40.798629, 40.7906]","[-73.941617, -73.947478]"


In [123]:
# get 5 neighbors for each station

X_train, y_train = mta_loc[['origin_lat', 'origin_long']], mta_loc[['origin_id']]
mod = KNeighborsRegressor(n_neighbors=6, metric='haversine')
mod.fit(X_train, y_train)
dist, ind = mod.kneighbors(X_train)

In [124]:
# run the model - using just 5 neighboring stations T_j

def baseline4(model, station, data=daily_rider):

    print(station)
    
    # get 
    try:
        neib = mta_loc[mta_loc.index.isin(ind[mta_loc[mta_loc.origin_name == station].index][0][1:])]['origin_name'].values
        print(neib)
    except:
        neib = []
    # create train and test data
    data_train = daily_rider[daily_rider.index.isin(train_dates)]
    data_train.fillna(0, inplace=True)
    
    if len(neib) != 0:
        X_train = data_train[[i for i in neib if i in data_train.columns]]
        y_train = data_train[station].values  # y values as station ridership counts
        print(X_train.shape)
        if X_train.shape[1] != 0:     

            model.fit(X_train, y_train)
            pred = model.predict(X_train)

            print('train r2:', model.score(X_train, y_train))
            
            data_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                        (daily_rider.columns.isin(nodes_known))]

            data_test.fillna(0, inplace=True)

            # create dataframe for the weights for each T_ij
            weights = pd.DataFrame(columns=X_train.columns)
            weights.loc[0] = model.coef_

            data_test = data_test[[i for i in neib if i in data_test.columns]]

            weights = weights[data_test.columns]

            # y test (t>T0)
            y_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                                station].values

            # get predictions for test set(T_i = sum w_i,j*T_j)
            pred = np.sum(weights.values*data_test.values, axis=1)

            print('test r2:', r2_score(y_test, pred))

            print('stations with high weights:', weights.sort_values(by=0, axis=1, ascending=False).columns[:5].values)
            print('\n')

            return (station, model.score(X_train, y_train), r2_score(y_test, pred))
        else:
            pass
    else:
        pass

stat_5 = [baseline4(LinearRegression(fit_intercept=False, positive=True), stat) for stat in nodes_mask]

14 ST
['18 ST' '6 AV' '8 AV' 'CHRISTOPHER ST - SHERIDAN SQ' 'W 4 ST']
(299, 3)
train r2: 0.14853452066591932
test r2: -4.357961734418532
stations with high weights: ['6 AV' '8 AV']


157 ST
['145 ST' '155 ST' '163 ST - AMSTERDAM AV' '168 ST'
 '168 ST - WASHINGTON HTS']
(299, 3)
train r2: 0.5045654025563787
test r2: 0.23862469918709195
stations with high weights: ['168 ST' '155 ST' '145 ST']


EUCLID AV
['CLEVELAND ST' 'CRESCENT ST' 'GRANT AV' 'NORWOOD AV' 'SHEPHERD AV']
(299, 5)
train r2: 0.7195720073907732
test r2: 0.7271662110767482
stations with high weights: ['GRANT AV' 'SHEPHERD AV' 'NORWOOD AV' 'CRESCENT ST' 'CLEVELAND ST']


HUNTS POINT AV
['FREEMAN ST' 'INTERVALE AV' 'LONGWOOD AV' 'SIMPSON ST' 'WHITLOCK AV']
(299, 5)
train r2: 0.16947096723306854
test r2: -1.208375798868257
stations with high weights: ['INTERVALE AV' 'FREEMAN ST' 'SIMPSON ST' 'WHITLOCK AV']


MARCY AV
['BEDFORD AV' 'BROADWAY' 'HEWES ST' 'LORIMER ST' 'METROPOLITAN AV']
(299, 5)
train r2: 0.6658203595338219
test 

train r2: 0.8947687515882534
test r2: -5.202605991166177
stations with high weights: ['NEW UTRECHT AV' '25 AV' '71 ST' 'BAY PKWY']


LIBERTY AV
['ALABAMA AV' 'ATLANTIC AV' 'BROADWAY JCT' 'PENNSYLVANIA AV' 'SUTTER AV']
(299, 5)
train r2: 0.0670422539941915
test r2: -2.885986971900971
stations with high weights: ['ATLANTIC AV' 'BROADWAY JCT']


SUTTER AV
['ALABAMA AV' 'ATLANTIC AV' 'JUNIUS ST' 'LIBERTY AV' 'LIVONIA AV']
(299, 5)
train r2: 0.7808552325699611
test r2: 0.8889848401749676
stations with high weights: ['ATLANTIC AV' 'JUNIUS ST' 'LIVONIA AV']


CARROLL ST
['4 AV' '9 ST' 'HOYT - SCHERMERHORN STS' 'SMITH - 9 STS' 'UNION ST']
(299, 1)
train r2: 0.7253918065666294
test r2: -5.494777937150419
stations with high weights: []


WALL ST
['BOWLING GREEN' 'BROAD ST' 'FULTON ST' 'RECTOR ST' 'WHITEHALL ST']
(299, 4)
train r2: 0.9291016968270536
test r2: 0.8484799292132286
stations with high weights: ['BOWLING GREEN' 'RECTOR ST' 'FULTON ST']


PROSPECT AV
['E 149 ST' 'INTERVALE AV' 'JACKSON 

In [121]:
stat_all = pd.DataFrame(all_stat).rename(columns={0:'station', 1:'train_r2_allStat', 2:'test_r2_allStat'})
stat5 = pd.DataFrame(stat_5).rename(columns={0:'station', 1:'train_r2_5Stat', 2:'test_r2_5Stat'})

In [122]:
stat_all.merge(stat5, on='station')

Unnamed: 0,station,train_r2_allStat,test_r2_allStat,train_r2_5Stat,test_r2_5Stat
0,14 ST,0.271429,0.800846,0.148535,-4.357962
1,157 ST,0.908074,0.7565,0.339241,-2.043952
2,EUCLID AV,0.816499,0.300925,0.719572,0.727166
3,HUNTS POINT AV,0.378981,0.227815,0.169467,-1.2116
4,MARCY AV,0.881733,-1.832642,0.659288,0.621457
5,FLUSHING AV,0.959484,-0.707476,0.829046,-2.620356
6,18 ST,0.947987,-0.431764,0.889247,0.591439
7,GUN HILL RD,0.219778,-6.153055,-0.001582,-9.101403
8,25 ST,0.886595,-0.166524,0.54375,0.497269
9,MOSHOLU PKWY,0.862734,0.753089,0.739449,-12.18232
