In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import networkx as nx
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')



In [2]:
# mta ridership - aggregated on 4hr intervals

mta = pd.read_csv('subway_2021_ridership.csv')
mta = mta[mta['flag'] == True]
mta.head()

Unnamed: 0,STATION,DATE_time,ENTRIES,ridership,flag
0,1 AV,2020-12-26 04:00:00,15510689,3.0,True
1,1 AV,2020-12-26 08:00:00,15510717,28.0,True
2,1 AV,2020-12-26 12:00:00,15510758,41.0,True
3,1 AV,2020-12-26 16:00:00,15510831,73.0,True
4,1 AV,2020-12-26 20:00:00,15510877,46.0,True


In [3]:
# aggregate on daily 

mta['DATE_time'] = pd.to_datetime(mta['DATE_time'])
mta['DATE'] = mta['DATE_time'].dt.date
mta_rider = mta.groupby(by=['DATE', 'STATION'], as_index=False).sum()[['DATE', 'STATION', 'ridership']]
mta_rider

Unnamed: 0,DATE,STATION,ridership
0,2020-12-26,1 AV,191.0
1,2020-12-26,103 ST,317.0
2,2020-12-26,103 ST-CORONA,703.0
3,2020-12-26,104 ST,20.0
4,2020-12-26,110 ST,239.0
...,...,...,...
102331,2021-12-24,WOODHAVEN BLVD,1546.0
102332,2021-12-24,WOODLAWN,488.0
102333,2021-12-24,WORLD TRADE CTR,233.0
102334,2021-12-24,YORK ST,989.0


In [4]:
daily_rider = pd.pivot(data=mta_rider, index='DATE', columns=['STATION'], values='ridership')
daily_rider.head()

STATION,1 AV,103 ST,103 ST-CORONA,104 ST,110 ST,111 ST,116 ST,121 ST,125 ST,135 ST,...,WAKEFIELD/241,WALL ST,WHITLOCK AV,WILSON AV,WINTHROP ST,WOODHAVEN BLVD,WOODLAWN,WORLD TRADE CTR,YORK ST,ZEREGA AV
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-26,191.0,317.0,703.0,20.0,239.0,70.0,84.0,216.0,782.0,1.0,...,267.0,53.0,35.0,299.0,224.0,800.0,362.0,71.0,7309.0,44.0
2020-12-27,159.0,366.0,649.0,19.0,229.0,87.0,65.0,213.0,822.0,0.0,...,315.0,46.0,20.0,253.0,188.0,696.0,349.0,60.0,447.0,47.0
2020-12-28,310.0,662.0,953.0,36.0,399.0,145.0,139.0,407.0,1226.0,536.0,...,495.0,218.0,55.0,478.0,542.0,1079.0,663.0,173.0,867.0,100.0
2020-12-29,390.0,678.0,1002.0,54.0,368.0,157.0,157.0,420.0,1187.0,57.0,...,478.0,227.0,48.0,461.0,573.0,1430.0,686.0,178.0,896.0,123.0
2020-12-30,402.0,680.0,1169.0,50.0,407.0,143.0,177.0,445.0,1116.0,,...,484.0,238.0,52.0,454.0,538.0,1195.0,696.0,173.0,894.0,121.0


In [5]:
# keep last 60 days test, fist 300 training

test_dates = daily_rider.index[300:]
train_dates = daily_rider.index[:299]

In [6]:
# train-test split for nodes

np.random.seed(2002)

sub_list = mta_rider.STATION.unique()

nodes_known, nodes_mask = train_test_split(sub_list, test_size=0.25)

print(len(nodes_known), len(nodes_mask))

215 72


$T_{i} = \sum w_{i,j}T_{j}$

$w_{i,j}$ estimated with regression

training for $t<T_0$

In [10]:
# run the model

def baseline4(model, station, data=daily_rider):

    print(station)
    # create train and test data
    data_train = daily_rider[daily_rider.index.isin(train_dates)]
    data_train.fillna(0, inplace=True)
    
    X_train = data_train.drop(columns={station}).values
    y_train = data_train[station].values  # y values as station ridership counts
    model.fit(X_train, y_train)
    pred = model.predict(X_train)
    
    print('train r2:', model.score(X_train, y_train))
    
    # test data X - all training nodes
    data_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                        (daily_rider.columns.isin(nodes_known))]

    data_test.fillna(0, inplace=True)
    
    # create dataframe for the weights for each T_ij
    weights = pd.DataFrame(columns=data_train.drop(columns={station}).columns.values)
    weights.loc[0] = model.coef_
    
    weights = weights[data_test.columns]
    
    # y test (t>T0)
    y_test = daily_rider.loc[daily_rider.index.isin(test_dates),
                        station].values
    
    # get predictions for test set(T_i = sum w_i,j*T_j)
    pred = np.sum(weights.values*data_test.values, axis=1)
    
    print('test r2:', r2_score(y_test, pred))
    
    print('stations with high weights:', weights.sort_values(by=0, axis=1, ascending=False).columns[:5].values)
    print('\n')
    
    return (station, r2_score(y_test, pred))
    

[baseline4(LinearRegression(fit_intercept=False, positive=True), stat) for stat in nodes_mask]

14 ST
train r2: 0.2714289962104798
test r2: 0.8008463959925135
stations with high weights: ['BROADWAY JCT' 'AQUEDUCT RACETR' 'CYPRESS HILLS' '3 AV' '80 ST']


157 ST
train r2: 0.9080740628943865
test r2: 0.7565003978638514
stations with high weights: ['80 ST' 'PELHAM PKWY' 'AQUEDUCT RACETR' '168 ST' 'FRESH POND RD']


EUCLID AV
train r2: 0.816498896046304
test r2: 0.30092516346966314
stations with high weights: ['AQUEDUCT RACETR' '80 ST' 'INWOOD-207 ST' '168 ST' 'BAYCHESTER AV']


HUNTS POINT AV
train r2: 0.3789809791617498
test r2: 0.22781486222236091
stations with high weights: ['INWOOD-207 ST' '80 ST' 'PELHAM PKWY' 'DITMAS AV' 'W 8 ST-AQUARIUM']


MARCY AV
train r2: 0.8817328422211904
test r2: -1.8326423788258364
stations with high weights: ['INWOOD-207 ST' 'AQUEDUCT RACETR' '50 ST' '55 ST' 'AVENUE I']


FLUSHING AV
train r2: 0.9594835141226639
test r2: -0.7074755092917819
stations with high weights: ['DITMAS AV' 'FRESH POND RD' '80 ST' 'COURT SQ' '135 ST']


18 ST
train r2: 0.94798

train r2: 0.9307321284629455
test r2: 0.7487983157514962
stations with high weights: ['AQUEDUCT RACETR' 'CITY HALL' 'CHAMBERS ST' 'FT HAMILTON PKY'
 'LEXINGTON AV/53']


18 AV
train r2: 0.9400009127318536
test r2: -0.758334149358014
stations with high weights: ['PELHAM PKWY' '168 ST' 'BROAD CHANNEL' 'DITMAS AV' '25 AV']


LIBERTY AV
train r2: 0.15537116681479446
test r2: -2.792826179743885
stations with high weights: ['BROAD CHANNEL' 'CYPRESS HILLS' 'AVENUE I' 'METS-WILLETS PT' '1 AV']


SUTTER AV
train r2: 0.8598840020622578
test r2: 0.8013304186805643
stations with high weights: ['80 ST' 'PELHAM PKWY' '168 ST' 'ATLANTIC AV' 'FRESH POND RD']


CARROLL ST
train r2: 0.8890091124855992
test r2: 0.35222693490044155
stations with high weights: ['PELHAM PKWY' '51 ST' '168 ST' 'BOTANIC GARDEN' 'FRANKLIN AV']


WALL ST
train r2: 0.9556458772063658
test r2: 0.7064335561232861
stations with high weights: ['AQUEDUCT RACETR' 'LEXINGTON AV/53' 'ST. GEORGE' '168 ST' 'CITY HALL']


PROSPECT AV
train

[('14 ST', 0.8008463959925135),
 ('157 ST', 0.7565003978638514),
 ('EUCLID AV', 0.30092516346966314),
 ('HUNTS POINT AV', 0.22781486222236091),
 ('MARCY AV', -1.8326423788258364),
 ('FLUSHING AV', -0.7074755092917819),
 ('18 ST', -0.4317641127224563),
 ('GUN HILL RD', -6.153054532881758),
 ('25 ST', -0.16652429213210262),
 ('MYRTLE-WYCKOFF', 0.40992489286489797),
 ('MOSHOLU PKWY', 0.7530887029109739),
 ('JAMAICA VAN WK', 0.5058850203475107),
 ('PELHAM BAY PARK', 0.7782339985340796),
 ('ELMHURST AV', -0.23704530448487593),
 ('GRAND ARMY PLAZ', 0.6581650612581423),
 ('WAKEFIELD/241', 0.2962612374057457),
 ('NOSTRAND AV', 0.16334014561299204),
 ('MARBLE HILL-225', 0.902464715809745),
 ('34 ST-HERALD SQ', -0.5754747281806156),
 ('KINGS HWY', -0.48698741536563994),
 ('GRAND-NEWTOWN', -2.4228782781953897),
 ('ALABAMA AV', 0.6351384508788318),
 ('GATES AV', -0.8056816504823796),
 ('111 ST', 0.845122515202001),
 ('KOSCIUSZKO ST', -0.5586084601897385),
 ('SENECA AVE', -0.4479959087344725),
 ('W