In [1]:
"""
You need to run this cell for the code in following cells to work.
"""

# Enable module reloading
%load_ext autoreload
%autoreload 2

import os
os.chdir("..")

In [2]:
import math

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor

from src.data.preprocessing.utils import filter_data, merge_data
from src.data.feature_unions import merged_fu, merged_fu_without_outliers

In [3]:
def cv_mean_std_score(cv_result, scoring=None):
    """Summarize the per-fold test scores of a cross-validation run.

    Args:
        cv_result: Mapping of result names to per-fold arrays (any object
            offering ``mean()`` and ``std()``), e.g. the dict returned by
            ``cross_validate``.
        scoring: Not used by the computation; kept (now optional) so that
            existing two-argument calls keep working.

    Returns:
        dict mapping every key of ``cv_result`` that starts with ``test_``
        to ``{'mean': ..., 'std': ...}`` computed over the folds.
    """
    return {
        key: {'mean': value.mean(), 'std': value.std()}
        for key, value in cv_result.items()
        # Prefix match on purpose: a substring test would also pick up e.g. a
        # ``train_`` entry whose scorer name happens to contain ``test_``.
        if key.startswith('test_')
    }

def nrmse(y_true, y_pred):
    """Return the RMSE normalized by the range of ``y_true``, as a percentage.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``RMSE / (max(y_true) - min(y_true)) * 100``.

    Note:
        The division is not guarded: a constant ``y_true`` has zero range.
    """
    # Take the square root explicitly instead of passing ``squared=False``,
    # which newer scikit-learn releases deprecate and then remove. For
    # single-output targets the value is the same.
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    return rmse / (max(y_true) - min(y_true)) * 100

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error of ``y_pred`` against ``y_true``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return math.sqrt(msle)

# Scorers handed to `cross_validate`; each key is the name that appears after
# the `test_` prefix in its result dict. NRMSE is wrapped with
# `greater_is_better=False`, so its reported values are sign-flipped.
scoring = dict(
    neg_root_mean_squared_error='neg_root_mean_squared_error',
    neg_normalized_root_mean_squared_error=make_scorer(nrmse, greater_is_better=False),
#     neg_root_mean_squared_log_error=make_scorer(rmsle, greater_is_better=False),
)

In [4]:
%%time
# Read the three raw CSV inputs; the paths are relative to the working directory.
csv_paths = (
    'data/building_metadata.csv',
    'data/weather_train.csv',
    'data/train.csv',
)
buildings_data, weather_data, meter_data = map(pd.read_csv, csv_paths)

CPU times: user 9.29 s, sys: 860 ms, total: 10.1 s
Wall time: 10.2 s


# Experiments

## Experiments with `site_id` 0

In [5]:
%%time
# Run `filter_data` over the three tables with the settings below, then
# combine the filtered tables with `merge_data`.
filter_kwargs = {'site_id': 0, 'meter': 0, 'meter_reading': 200}
buildings_data, weather_data, meter_data = filter_data(
    buildings_data,
    weather_data,
    meter_data,
    **filter_kwargs,
)
data = merge_data(buildings_data, weather_data, meter_data)

CPU times: user 4.75 s, sys: 708 ms, total: 5.46 s
Wall time: 5.46 s


In [6]:
# Target: the `meter_reading` column as a flat 1-D array.
y = data['meter_reading'].values
len(y), y

(605288,
 array([  0.    ,   0.    ,   0.    , ...,  47.2332, 142.245 , 132.802 ]))

In [17]:
%%time
# Build the feature matrix from the merged data with the `merged_fu` feature
# union and display it.
# NOTE(review): `x` is only inspected here; the cross-validation cells in this
# notebook pass `data` to pipelines that contain the feature union themselves.
x = merged_fu.union_features(data)
x

CPU times: user 1.46 s, sys: 544 ms, total: 2 s
Wall time: 2min 29s


Unnamed: 0,Education,Entertainment/public assembly,Lodging/residential,Office,Other,square_feet,meter_reading_scaled_1,meter_reading_scaled_2,meter_reading_scaled_3,meter_reading_scaled_4,meter_reading_scaled_5,air_temperature,dew_temperature,wind_direction,wind_speed
0,1.0,0.0,0.0,0.0,0.0,-0.779911,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
1,1.0,0.0,0.0,0.0,0.0,-0.755156,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
2,1.0,0.0,0.0,0.0,0.0,-0.584502,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
3,1.0,0.0,0.0,0.0,0.0,0.281602,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
4,1.0,0.0,0.0,0.0,0.0,-0.730698,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605283,0.0,0.0,0.0,1.0,0.0,-0.784199,-0.261517,-0.305532,-0.205217,-0.176896,-0.196572,0.822506,1.086596,-0.139979,-0.127903
605284,0.0,0.0,1.0,0.0,0.0,-0.577315,1.926144,2.053517,1.989830,2.080827,1.971634,0.822506,1.086596,-0.139979,-0.127903
605285,0.0,0.0,0.0,1.0,0.0,-0.629474,0.292889,0.242847,0.306538,0.374780,0.361132,0.822506,1.086596,-0.139979,-0.127903
605286,1.0,0.0,0.0,0.0,0.0,-0.603404,1.741883,1.710039,1.662266,1.653168,1.500767,0.822506,1.086596,-0.139979,-0.127903


### Extremely Randomized Trees

#### With outliers

In [7]:
# `merged_fu` feature union followed by an Extra-Trees regressor. The step
# keeps its original name 'classifier' (although the estimator is a
# regressor) so that any `classifier__*` parameter references stay valid.
pipeline = Pipeline([
    ('features', merged_fu),
    ('classifier', ExtraTreesRegressor(
        n_jobs=-1,
        # Fixed seed: tree construction is randomized, so without it the
        # cross-validation scores change on every run.
        random_state=42,
    ))
])

In [8]:
%%time
# 5-fold cross-validation of the pipeline on the merged data, evaluated with
# every scorer in `scoring`.
result = cross_validate(
    estimator=pipeline,
    X=data,
    y=y,
    cv=5,
    scoring=scoring,
)

CPU times: user 1min 8s, sys: 16.1 s, total: 1min 24s
Wall time: 16min 57s


In [9]:
# Raw `cross_validate` output: fit/score times and per-fold test scores.
result

{'fit_time': array([219.8563323 , 188.39170861, 178.42246723, 200.68005228,
        190.4943397 ]),
 'score_time': array([ 4.13445902,  6.01872849,  8.24122906, 13.21035266,  7.60145688]),
 'test_neg_root_mean_squared_error': array([-30.41032752, -34.67210464, -59.32714906, -24.16964255,
        -25.32649584]),
 'test_neg_normalized_root_mean_squared_error': array([-15.6437359 , -17.76125681, -29.66505778, -12.08542555,
        -12.66343787]),
 'test_neg_root_mean_squared_log_error': array([-2.39625613, -2.55994908, -3.11067965, -0.66217591, -0.56967891])}

In [10]:
# Mean and standard deviation of each test score across the folds.
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -34.781143923057456,
  'std': 12.834458504648522},
 'test_neg_normalized_root_mean_squared_error': {'mean': -17.56378278145746,
  'std': 6.390496239371232},
 'test_neg_root_mean_squared_log_error': {'mean': -1.8597479365086678,
  'std': 1.0432075249847157}}

#### Without outliers

In [7]:
# `merged_fu_without_outliers` feature union followed by an Extra-Trees
# regressor. The step keeps its original name 'classifier' (although the
# estimator is a regressor) so that any `classifier__*` parameter references
# stay valid.
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', ExtraTreesRegressor(
        n_jobs=-1,
        # Fixed seed: tree construction is randomized, so without it the
        # cross-validation scores change on every run.
        random_state=42,
    ))
])

In [8]:
%%time
# 5-fold cross-validation of the pipeline on the merged data, evaluated with
# every scorer in `scoring`.
result = cross_validate(
    estimator=pipeline,
    X=data,
    y=y,
    cv=5,
    scoring=scoring,
)

CPU times: user 1min 6s, sys: 16.1 s, total: 1min 22s
Wall time: 16min 4s


In [9]:
# Raw `cross_validate` output: fit/score times and per-fold test scores.
result

{'fit_time': array([184.97140241, 183.02505994, 171.70970035, 180.8711586 ,
        204.79703856]),
 'score_time': array([ 3.95679188,  7.4586904 ,  8.17715549, 10.08739924,  9.48864102]),
 'test_neg_root_mean_squared_error': array([-29.28447081, -35.14438758, -60.24574366, -24.1713793 ,
        -25.33199547]),
 'test_neg_normalized_root_mean_squared_error': array([-15.06457064, -18.00319016, -30.12437805, -12.08629397,
        -12.66618773])}

In [10]:
# Mean and standard deviation of each test score across the folds.
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -34.83559536458503,
  'std': 13.271246272136471},
 'test_neg_normalized_root_mean_squared_error': {'mean': -17.588924109668923,
  'std': 6.605964793249112}}

### Gradient Boosting

#### With outliers

In [8]:
# `merged_fu` feature union followed by a gradient-boosting regressor. The
# step keeps its original name 'classifier' (although the estimator is a
# regressor) so that any `classifier__*` parameter references stay valid.
pipeline = Pipeline([
    ('features', merged_fu),
    # Fixed seed so repeated runs of the experiment give the same scores.
    ('classifier', GradientBoostingRegressor(random_state=42))
])

In [12]:
%%time
# 5-fold cross-validation of the pipeline on the merged data, evaluated with
# every scorer in `scoring`.
result = cross_validate(
    estimator=pipeline,
    X=data,
    y=y,
    cv=5,
    scoring=scoring,
)

CPU times: user 11min 47s, sys: 779 ms, total: 11min 48s
Wall time: 18min 15s


In [13]:
# Raw `cross_validate` output: fit/score times and per-fold test scores.
result

{'fit_time': array([229.45887518, 235.7858367 , 212.88394046, 189.19264197,
        194.28082752]),
 'score_time': array([3.67976141, 6.59404635, 9.59670663, 9.030231  , 4.92878103]),
 'test_neg_root_mean_squared_error': array([ -8.7156448 ,  -7.73164527, -16.99786573, -29.69650662,
        -32.12221959]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.48351782,  -3.96064037,  -8.49935783, -14.84899576,
        -16.06135072])}

In [14]:
# Mean and standard deviation of each test score across the folds.
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -19.052776404173873,
  'std': 10.230799539832114},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.570772500169276,
  'std': 5.0692981468776255}}

#### Without outliers

In [15]:
# `merged_fu_without_outliers` feature union followed by a gradient-boosting
# regressor. The step keeps its original name 'classifier' (although the
# estimator is a regressor) so that any `classifier__*` parameter references
# stay valid.
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    # Fixed seed so repeated runs of the experiment give the same scores.
    ('classifier', GradientBoostingRegressor(random_state=42))
])

In [16]:
%%time
# 5-fold cross-validation of the pipeline on the merged data, evaluated with
# every scorer in `scoring`.
result = cross_validate(
    estimator=pipeline,
    X=data,
    y=y,
    cv=5,
    scoring=scoring,
)

CPU times: user 11min 15s, sys: 888 ms, total: 11min 15s
Wall time: 18min 8s


In [17]:
# Raw `cross_validate` output: fit/score times and per-fold test scores.
result

{'fit_time': array([228.72421312, 230.65235376, 220.70947409, 180.56716204,
        193.54192305]),
 'score_time': array([3.96224117, 7.47788644, 6.86959887, 9.73576498, 6.43519139]),
 'test_neg_root_mean_squared_error': array([ -7.830535  ,  -7.6937339 , -17.08321472, -29.73671728,
        -32.15582047]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.02819803,  -3.94121975,  -8.54203446, -14.86910209,
        -16.07815141])}

In [18]:
# Mean and standard deviation of each test score across the folds.
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -18.900004272651746,
  'std': 10.436142791331061},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.491741148661252,
  'std': 5.174115283168382}}