In [1]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb

In [12]:
# Day to forecast: later cells expand this into the 24 hourly slots of that
# calendar day and use it to select the stations that were in service.
timestamp = pd.Timestamp(year=2015, month=5, day=1)

In [49]:
# Pre-trained boosters, one per target (departures / arrivals).
departures_model = xgb.Booster(model_file='../models/boosterDepartures.xgbm')
arrivals_model = xgb.Booster(model_file='../models/boosterArrivals.xgbm')

# Load the additional (weather / calendar) features, one row per hour.
# The dropped columns are excluded from the model input; 'date_hour' is parsed
# element-wise into Timestamps and becomes the index. tz_localize(None) strips
# any timezone from that index so it lines up with the timezone-naive hourly
# index used for the join further down.
additional_features_df = (
    pd.read_csv('../data/modelInput/additionalFeatures.csv')
    .drop(['icon', 'precipType', 'summary', 'date', 'holiday_description'], axis=1)
    .assign(date_hour=lambda features: features['date_hour'].apply(pd.Timestamp))
    .set_index('date_hour')
    .tz_localize(None)
)

In [50]:
# Preview the first rows of the hourly feature table (indexed by date_hour).
additional_features_df.head()

Unnamed: 0_level_0,apparentTemperature,cloudCover,dewPoint,humidity,precipAccumulation,precipIntensity,precipProbability,pressure,temperature,uvIndex,visibility,windBearing,windSpeed,hour,weekday,month,year,is_holiday,is_weekend,is_weekend_or_holiday
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2015-01-01 00:00:00,-10.83,0.22,-11.59,0.63,,0.0,0.0,1019.96,-5.76,0.0,15.4,229,3.42,0,3,1,2015,1,0,1
2015-01-01 01:00:00,-11.83,0.17,-12.09,0.62,,0.0,0.0,1019.03,-5.94,0.0,15.26,226,4.31,1,3,1,2015,1,0,1
2015-01-01 02:00:00,-12.07,0.06,-12.44,0.6,,0.0,0.0,1018.15,-6.03,0.0,15.48,232,4.47,2,3,1,2015,1,0,1
2015-01-01 03:00:00,-11.53,0.07,-12.67,0.57,,0.0,0.0,1017.48,-5.43,0.0,15.42,236,4.74,3,3,1,2015,1,0,1
2015-01-01 04:00:00,-11.01,0.34,-12.96,0.53,,0.0,0.0,1016.75,-4.78,0.0,15.51,242,5.17,4,3,1,2015,1,0,1


In [65]:
# Number of station clusters; cluster ids are 0 .. cluster_number - 1.
cluster_number = 50

# Mapping used below as: str(cluster id) -> {str(station id): weight}.
with open('../models/stationClusterMap.json', 'r') as cluster_map_file:
    cluster_station_map = json.load(cluster_map_file)

In [52]:
# Station metadata; the first/last usage dates are parsed into Timestamps so
# they can be compared against the forecast timestamp.
stations = pd.read_csv('../data/modelInput/stations.csv')
for usage_column in ('first_used', 'last_used'):
    stations[usage_column] = stations[usage_column].apply(pd.Timestamp)
stations.head()

Unnamed: 0,station_id,station_name,latitude,longitude,first_used,last_used
0,3,Colleges of the Fenway,42.340021,-71.100812,2015-04-17,2016-11-30
1,4,Tremont St. at Berkeley St.,42.345392,-71.069616,2015-01-02,2016-11-30
2,5,Northeastern U / North Parking Lot,42.341814,-71.090179,2015-04-17,2016-11-30
3,6,Cambridge St. at Joy St.,42.361174,-71.065142,2015-01-01,2016-11-30
4,7,Fan Pier,42.353287,-71.044389,2015-04-17,2016-11-30


In [56]:
# Skeleton frame with one row per (hour of the forecast day, cluster id).
# 'arrivals' and 'departures' start at 0 as placeholder columns.
clusters = np.arange(cluster_number)

# First and last hourly slot (00:00 and 23:00) of the day containing `timestamp`.
timestamp_start = pd.Timestamp(
    year=timestamp.year, month=timestamp.month, day=timestamp.day, hour=0
)
timestamp_end = pd.Timestamp(
    year=timestamp.year, month=timestamp.month, day=timestamp.day, hour=23
)
date_hours = pd.date_range(start=timestamp_start, end=timestamp_end, freq='H')

cluster_time_index = pd.MultiIndex.from_product(
    [date_hours, clusters],
    names=['date_hour', 'cluster_id'],
)

# Flatten the MultiIndex, then keep only date_hour as the index so the frame
# can be joined on the hour; cluster_id becomes a regular column.
cluster_time_df = (
    pd.DataFrame({'arrivals': 0, 'departures': 0}, index=cluster_time_index)
    .reset_index()
    .set_index('date_hour')
)
# NOTE(review): an earlier revision localized this index to 'EST'; that step is
# disabled, so the index stays timezone-naive.

In [57]:
# Preview the (hour, cluster) skeleton before the features are joined in.
cluster_time_df.head()

Unnamed: 0_level_0,cluster_id,arrivals,departures
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-05-01,0,0,0
2015-05-01,1,0,0
2015-05-01,2,0,0
2015-05-01,3,0,0
2015-05-01,4,0,0


In [58]:
# Attach the hourly features to every (hour, cluster) row. Both frames are
# indexed by date_hour; a left join keeps all skeleton rows even when no
# feature row exists for that hour.
model_input = pd.merge(
    cluster_time_df,
    additional_features_df,
    how='left',
    left_index=True,
    right_index=True,
)
# NOTE(review): timezone conversion of the merged index (tz_convert('EST') /
# tz_localize(None)) was tried here previously and is disabled.
model_input.head()

Unnamed: 0_level_0,cluster_id,arrivals,departures,apparentTemperature,cloudCover,dewPoint,humidity,precipAccumulation,precipIntensity,precipProbability,...,visibility,windBearing,windSpeed,hour,weekday,month,year,is_holiday,is_weekend,is_weekend_or_holiday
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-05-01,0,0,0,5.9,0.16,3.72,0.8,,0.0,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
2015-05-01,1,0,0,5.9,0.16,3.72,0.8,,0.0,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
2015-05-01,2,0,0,5.9,0.16,3.72,0.8,,0.0,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
2015-05-01,3,0,0,5.9,0.16,3.72,0.8,,0.0,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
2015-05-01,4,0,0,5.9,0.16,3.72,0.8,,0.0,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0


In [121]:
# Score every (hour, cluster) row with both boosters.
# NOTE(review): the DMatrix is built from all columns of model_input, i.e. it
# includes cluster_id and the zero-filled arrivals/departures placeholders —
# confirm the models were trained on the same column layout.
x_model_input = xgb.DMatrix(model_input)
departure_predictions = departures_model.predict(x_model_input)
arrival_predictions = arrivals_model.predict(x_model_input)

# Overwrite the placeholder columns with the predictions and turn the
# date_hour index back into a regular column for the per-cluster filtering below.
model_input = model_input.assign(
    departures=departure_predictions,
    arrivals=arrival_predictions,
).reset_index()
model_input.head()

Unnamed: 0,date_hour,cluster_id,arrivals,departures,apparentTemperature,cloudCover,dewPoint,humidity,precipAccumulation,precipIntensity,...,visibility,windBearing,windSpeed,hour,weekday,month,year,is_holiday,is_weekend,is_weekend_or_holiday
0,2015-05-01,0,0.666281,0.407924,5.9,0.16,3.72,0.8,,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
1,2015-05-01,1,1.177789,0.772143,5.9,0.16,3.72,0.8,,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
2,2015-05-01,2,0.360593,0.259184,5.9,0.16,3.72,0.8,,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
3,2015-05-01,3,0.174178,0.083475,5.9,0.16,3.72,0.8,,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0
4,2015-05-01,4,0.025554,0.024951,5.9,0.16,3.72,0.8,,0.0,...,15.51,64,1.73,0,4,5,2015,0,0,0


In [122]:
# Ids of the stations whose usage window contains the forecast timestamp.
# NOTE(review): both comparisons are strict, so a station whose first_used or
# last_used equals `timestamp` exactly is excluded — confirm this is intended.
started_before = stations['first_used'] < timestamp
ended_after = stations['last_used'] > timestamp
active_stations = stations.loc[started_before & ended_after, 'station_id'].unique()

In [134]:
# --- Cluster -> station weight matrix --------------------------------------
# Rows are clusters, columns are the active stations (in `active_stations`
# order). A plain ndarray replaces np.matlib/np.matrix: `numpy.matlib` is a
# submodule that older NumPy releases do not load on `import numpy`, so the
# cell only ran when something else had imported it, and np.matrix itself is
# no longer recommended by NumPy.
weight_matrix = np.zeros((cluster_number, active_stations.size))
for cluster_index in range(cluster_number):
    cluster_stations = cluster_station_map[str(cluster_index)]
    for station_index, station_id in enumerate(active_stations):
        # Stations that are not listed for this cluster keep a weight of 0.
        weight_matrix[cluster_index, station_index] = cluster_stations.get(str(station_id), 0.0)

# --- Invert the mapping ----------------------------------------------------
# np.matrix.I used inv() for square matrices and pinv() otherwise. The
# Moore-Penrose pseudo-inverse matches both cases for well-conditioned input
# and, unlike inv(), does not raise when a square matrix is singular.
# Resulting shape: (active stations, clusters).
inv_weight_matrix = np.linalg.pinv(weight_matrix)

# Only weights above this threshold contribute; this also discards the
# negative entries a pseudo-inverse can contain.
weight_threshold = 1e-3

# --- Distribute cluster predictions to stations ----------------------------
# Collect the weighted per-station frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0 and growing a frame inside a loop
# is quadratic. `assign` builds new frames, so no column is written onto a
# filtered slice of model_input (avoids SettingWithCopyWarning).
prediction_columns = ['date_hour', 'arrivals', 'departures']
weighted_frames = []
for cluster_index in range(cluster_number):
    # Hourly predictions of this cluster; invariant across the station loop.
    cluster_predictions = model_input.loc[
        model_input['cluster_id'] == cluster_index, prediction_columns
    ]
    for station_index, station_id in enumerate(active_stations):
        weight = inv_weight_matrix[station_index, cluster_index]
        if weight > weight_threshold:
            weighted_frames.append(
                cluster_predictions.assign(
                    arrivals=cluster_predictions['arrivals'] * weight,
                    departures=cluster_predictions['departures'] * weight,
                    station_id=station_id,
                )
            )

if weighted_frames:
    trips = pd.concat(weighted_frames, ignore_index=True)
else:
    # No weight exceeded the threshold: keep an empty frame with the expected
    # columns so the aggregation below still works.
    trips = pd.DataFrame([], columns=['date_hour', 'station_id', 'arrivals', 'departures'])

# A station can receive contributions from several clusters: sum them per
# (hour, station), then derive the net flow (arrivals minus departures).
trips = trips.groupby(['date_hour', 'station_id']).sum()
trips['flow'] = trips['arrivals'] - trips['departures']

In [135]:
# Show the predicted arrivals, departures and net flow per (date_hour, station_id).
trips

Unnamed: 0_level_0,Unnamed: 1_level_0,arrivals,departures,flow
date_hour,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-01 00:00:00,3,0.076614,0.050039,0.026575
2015-05-01 00:00:00,4,0.121665,0.068767,0.052898
2015-05-01 00:00:00,5,0.143325,0.102205,0.041121
2015-05-01 00:00:00,6,0.115175,0.070666,0.044509
2015-05-01 00:00:00,7,0.056254,0.056898,-0.000644
2015-05-01 00:00:00,8,0.064178,0.030524,0.033654
2015-05-01 00:00:00,9,0.453147,0.258992,0.194155
2015-05-01 00:00:00,10,0.141177,0.091699,0.049478
2015-05-01 00:00:00,11,0.076645,0.050025,0.026620
2015-05-01 00:00:00,12,0.147554,0.105220,0.042334


In [136]:
# Same table rounded to whole trips (round() defaults to 0 decimals).
trips.round()

Unnamed: 0_level_0,Unnamed: 1_level_0,arrivals,departures,flow
date_hour,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-01 00:00:00,3,0.0,0.0,0.0
2015-05-01 00:00:00,4,0.0,0.0,0.0
2015-05-01 00:00:00,5,0.0,0.0,0.0
2015-05-01 00:00:00,6,0.0,0.0,0.0
2015-05-01 00:00:00,7,0.0,0.0,-0.0
2015-05-01 00:00:00,8,0.0,0.0,0.0
2015-05-01 00:00:00,9,0.0,0.0,0.0
2015-05-01 00:00:00,10,0.0,0.0,0.0
2015-05-01 00:00:00,11,0.0,0.0,0.0
2015-05-01 00:00:00,12,0.0,0.0,0.0
