In [18]:
import pandas as pd
import numpy as np
import math
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [2]:
trips = pd.read_csv('../data/modelInput/flowPerHourAndCluster.csv')
additional_features = pd.read_csv('../data/modelInput/additionalFeatures.csv')
trips = trips.merge(additional_features, how='left', on=('date_hour'))
del additional_features
trips['date_hour'] = trips['date_hour'].apply(pd.Timestamp) 
trips = trips.set_index('date_hour')

In [3]:
trips.columns

Index(['cluster_id', 'arrivals', 'departures', 'apparentTemperature',
       'cloudCover', 'dewPoint', 'humidity', 'icon', 'precipAccumulation',
       'precipIntensity', 'precipProbability', 'precipType', 'pressure',
       'summary', 'temperature', 'uvIndex', 'visibility', 'windBearing',
       'windSpeed', 'date', 'hour', 'weekday', 'month', 'year',
       'holiday_description', 'is_holiday', 'is_weekend',
       'is_weekend_or_holiday'],
      dtype='object')

In [12]:
#Features not used: cloudCover and uvIndex (both contain NaN) 
features = ['cluster_id',
            'apparentTemperature', 'dewPoint', 'humidity', 'precipIntensity',
            'precipProbability', 'pressure', 'temperature',
            'visibility', 'windBearing', 'windSpeed', 'hour',
            'weekday', 'is_holiday', 'is_weekend', 'is_weekend_or_holiday']

test_begin_date = pd.Timestamp(2016, 5, 1)

X_train = trips[:test_begin_date][features]
X_test = trips[test_begin_date:][features]
y_train = trips[:test_begin_date]['departures']
y_test = trips[test_begin_date:]['departures']

X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

print('Training set size: ', X_train.shape[0])
print('Test set size: ', X_test.shape[0])

Training set size:  439250
Test set size:  256800


In [26]:
benchmark = tree.DecisionTreeRegressor()
benchmark.fit(X_train, y_train)

print('Results on test set for departures')
print('Root mean squared error: %2.3f' % math.sqrt(mean_squared_error(benchmark.predict(X_test), y_test)))
print('R^2 score: %2.3f' % benchmark.score(X_test, y_test))

Results on test set for departures
Root mean squared error: 4.376
R^2 score: 0.739


In [28]:
y_train = trips[:test_begin_date]['arrivals']
y_test = trips[test_begin_date:]['arrivals']

X_train = trips[:test_begin_date][features]
X_test = trips[test_begin_date:][features]
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

benchmark = tree.DecisionTreeRegressor()
benchmark.fit(X_train, y_train)

print('Results on test set for arrivals')
print('Root mean squared error: %2.3f' % math.sqrt(mean_squared_error(benchmark.predict(X_test), y_test)))
print('R^2 score: %2.3f' % benchmark.score(X_test, y_test))

Results on test set for arrivals
Root mean squared error: 4.310
R^2 score: 0.748
