In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [2]:
#read trips data and join it with weather and public holidays dataset
trips = pd.read_csv('../data/modelInput/flowPerHourAndStation.csv')
additional_features = pd.read_csv('../data/modelInput/additionalFeatures.csv')
trips = trips.merge(additional_features, how='left', on=('date_hour'))
del additional_features

stations = pd.read_csv('../data/modelInput/stations_201505_201611.csv')
trips = trips.merge(stations, how='left', on=('station_id'))

trips['date_hour'] = trips['date_hour'].apply(pd.Timestamp) 
trips = trips.set_index('date_hour')

In [3]:
trips.columns

Index(['station_id', 'arrivals', 'departures', 'flow', 'apparentTemperature',
       'cloudCover', 'dewPoint', 'humidity', 'icon', 'precipAccumulation',
       'precipIntensity', 'precipProbability', 'precipType', 'pressure',
       'summary', 'temperature', 'uvIndex', 'visibility', 'windBearing',
       'windSpeed', 'date', 'hour', 'weekday', 'month', 'year',
       'holiday_description', 'is_holiday', 'is_weekend',
       'is_weekend_or_holiday', 'station_name', 'latitude', 'longitude',
       'first_used', 'last_used', 'latitude_pca', 'longitude_pca'],
      dtype='object')

In [4]:
#Features not used: cloudCover and uvIndex (both contain NaN) 
features = ['latitude_pca', 'longitude_pca',
            'apparentTemperature', 'dewPoint', 'humidity', 'precipIntensity',
            'precipProbability', 'pressure', 'temperature',
            'visibility', 'windBearing', 'windSpeed', 'hour',
            'weekday', 'is_holiday', 'is_weekend', 'is_weekend_or_holiday']

#Split the dataset at 2016-5-1 into training and test data
test_begin_date = pd.Timestamp(2016, 5, 1)

X_train = trips[:test_begin_date][features]
X_test = trips[test_begin_date:][features]
y_train = trips[:test_begin_date]['departures']
y_test = trips[test_begin_date:]['departures']

X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

print('Training set size: ', X_train.shape[0])
print('Test set size: ', X_test.shape[0])

Training set size:  1045415
Test set size:  611184


In [5]:
#Train the decision tree for departures
benchmark = tree.DecisionTreeRegressor()
benchmark.fit(X_train, y_train)

print('Results on test set for departures')
print('Root mean squared error: %2.3f' % math.sqrt(mean_squared_error(benchmark.predict(X_test), y_test)))
print('R^2 score: %2.3f' % benchmark.score(X_test, y_test))

Results on test set for departures
Root mean squared error: 2.193
R^2 score: 0.182


In [6]:
#Train the decision tree for arrivals
y_train = trips[:test_begin_date]['arrivals']
y_test = trips[test_begin_date:]['arrivals']

X_train = trips[:test_begin_date][features]
X_test = trips[test_begin_date:][features]
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

benchmark = tree.DecisionTreeRegressor()
benchmark.fit(X_train, y_train)

print('Results on test set for arrivals')
print('Root mean squared error: %2.3f' % math.sqrt(mean_squared_error(benchmark.predict(X_test), y_test)))
print('R^2 score: %2.3f' % benchmark.score(X_test, y_test))

Results on test set for arrivals
Root mean squared error: 2.163
R^2 score: 0.256
