In [99]:
import numpy as np
import pandas as pd

import random

from sklearn.base import BaseEstimator
from sklearn.dummy import DummyRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict

from tscv import GapKFold

from time import time, localtime, strftime

%matplotlib inline

The aim is to build a training pipeline and apply it to a small number of (building, meter) pairs.

We won't use a scikit-learn `Pipeline`, as it does not allow cleaning the data by deleting rows.<br>
(as explained here: https://stackoverflow.com/questions/25539311/custom-transformer-for-sklearn-pipeline-that-alters-both-x-and-y)

In [2]:
# Load the test set; the 'timestamp' column is parsed into datetimes.
test_df = pd.read_csv(
    '../data/raw/csvs/test.csv',
    parse_dates=['timestamp'],
)

In [3]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01
1,1,1,0,2017-01-01
2,2,2,0,2017-01-01
3,3,3,0,2017-01-01
4,4,4,0,2017-01-01


In [4]:
# Building metadata reduced to a building_id -> site_id lookup table
# (building_id becomes the index, site_id is the only column kept).
bdata = pd.read_csv(
    '../data/raw/csvs/building_metadata.csv',
    usecols=['building_id', 'site_id'],
    index_col='building_id',
)
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [5]:
# Attach the site_id of each row's building (left join on building_id).
# A site_id column left over from a previous execution of this cell is dropped
# first: joining a second time would otherwise raise because the column
# already exists on both sides, which made the cell impossible to re-run.
test_df = (
    test_df
    .drop(columns='site_id', errors='ignore')
    .join(bdata, on='building_id', how='left')
)

In [6]:
# Check that the join added the site_id column.
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id
0,0,0,0,2017-01-01,0
1,1,1,0,2017-01-01,0
2,2,2,0,2017-01-01,0
3,3,3,0,2017-01-01,0
4,4,4,0,2017-01-01,0


<b>Pipeline functions</b>

In [33]:
def load_and_prepare_site_data(site_id, data_folder_path, timeframes=(24,)):
    """Load the weather of one site and turn it into an hourly feature table.

    Steps:
      1. read ``weather_train.csv`` from ``data_folder_path`` and keep only the
         rows of ``site_id`` and the air/dew temperature columns;
      2. re-index on a gap-free hourly range between the first and the last
         timestamp of the site;
      3. linearly interpolate missing values (at most 3 consecutive hours);
      4. add ``day_hour`` and ``day_of_week`` time features;
      5. add trailing moving averages of both temperatures for every window
         in ``timeframes`` (columns named ``<feature>_ma_<window>H``);
      6. drop every row that still contains a NaN.

    Parameters
    ----------
    site_id : int
        Identifier of the site, as found in the ``site_id`` column of the file.
    data_folder_path : str
        Folder containing ``weather_train.csv`` (with or without a trailing
        path separator).
    timeframes : iterable of int, default=(24,)
        Window lengths, in hours, of the trailing moving averages.

    Returns
    -------
    pandas.DataFrame
        Features indexed by hourly timestamps. The shape is also printed.

    Raises
    ------
    KeyError
        If the file contains no row for ``site_id``.
    """
    # Local import: keeps the function usable without touching the import cell.
    import os

    weather_features = ['air_temperature', 'dew_temperature']

    # Only the needed columns are read, instead of loading everything and
    # dropping a hard-coded list of unwanted columns afterwards.
    raw_df_weather = pd.read_csv(
        os.path.join(data_folder_path, 'weather_train.csv'),
        usecols=['site_id', 'timestamp'] + weather_features,
        parse_dates=['timestamp'],
    )

    site_rows = raw_df_weather['site_id'] == site_id
    if not site_rows.any():
        raise KeyError(site_id)

    # New frame (no in-place edit of a slice of the raw data).
    b_df_weather = (
        raw_df_weather.loc[site_rows, ['timestamp'] + weather_features]
        .set_index('timestamp')
        .sort_index()
    )

    # Clean timestamps index: one row per hour, missing hours become NaN rows.
    # A Timedelta is used as frequency because the 'H' string alias is
    # deprecated in recent pandas releases.
    clean_index = pd.date_range(
        start=b_df_weather.index.min(),
        end=b_df_weather.index.max(),
        freq=pd.Timedelta(hours=1),
    )
    b_df_weather = b_df_weather.reindex(clean_index)

    # Interpolate missing values; longer gaps keep NaNs and are dropped below.
    b_df_weather = b_df_weather.interpolate(method='linear', limit=3)

    # Build time features
    b_df_weather['day_hour'] = b_df_weather.index.hour
    b_df_weather['day_of_week'] = b_df_weather.index.dayofweek

    # Builds averaged weather features (trailing windows, not centered).
    for feature in weather_features:
        for timeframe in timeframes:
            new_col_name = '{}_ma_{}H'.format(feature, timeframe)
            b_df_weather[new_col_name] = (
                b_df_weather[feature].rolling(timeframe, center=False).mean()
            )

    # Drops rows with NaNs (start of the rolling windows, long gaps).
    b_df_weather = b_df_weather.dropna(axis=0, how='any')

    print('shape={}'.format(b_df_weather.shape))

    return b_df_weather

In [8]:
# Loads meter_reading data
def load_meter_data(building_id, meter_id, data_folder_path):
    """Return the readings of one (building, meter) pair, indexed by time.

    Parameters
    ----------
    building_id : int
        Value to match in the ``building_id`` column of ``train.csv``.
    meter_id : int
        Value to match in the ``meter`` column of ``train.csv``.
    data_folder_path : str
        Folder containing ``train.csv`` (with or without a trailing path
        separator).

    Returns
    -------
    pandas.DataFrame
        The matching rows without the ``building_id`` and ``meter`` columns,
        indexed by ``timestamp`` and sorted chronologically.

    Notes
    -----
    The whole ``train.csv`` file is read on every call.
    """
    # Local import: keeps the function usable without touching the import cell.
    import os

    raw_df_meters = pd.read_csv(
        os.path.join(data_folder_path, 'train.csv'),
        parse_dates=['timestamp'],
    )

    to_keep = (
        (raw_df_meters['building_id'] == building_id)
        & (raw_df_meters['meter'] == meter_id)
    )

    # Pure method chain: each step returns a new frame, nothing is edited in place.
    return (
        raw_df_meters.loc[to_keep]
        .drop(columns=['building_id', 'meter'])
        .set_index('timestamp')
        .sort_index()
    )

In [73]:
def prepare_meter_train_set(site_weather_df, building_meter_df):
    """Align weather features and meter readings on their shared timestamps.

    Rows whose index value is not present in both dataframes are discarded.

    Returns a ``(X, Y)`` tuple: ``X`` is a copy of the weather rows and ``Y`` is
    the ``meter_reading`` column of the meter rows, as a pandas Series.
    """
    shared_timestamps = site_weather_df.index.intersection(building_meter_df.index)

    features = site_weather_df.loc[shared_timestamps].copy()
    target = building_meter_df.loc[shared_timestamps].copy()['meter_reading']

    return (features, target)

<b>Model functions</b>

In [30]:
class MeanByMultiCatEstimator(BaseEstimator):
    """Regressor predicting the mean target of each category combination.

    ``fit`` groups the training samples by the tuple of values found in the
    columns listed in ``cat_column_indexes`` and stores the mean of ``y`` for
    every distinct tuple, together with the global mean of ``y``. ``predict``
    returns the stored group mean of each sample and falls back to the global
    mean for tuples that were not seen during ``fit``.

    Parameters
    ----------
    cat_column_indexes : list of int, default=[0]
        Positional indexes of the columns of ``X`` that hold the categories.
        The default list is only read, never modified.
    verbose : bool, default=False
        If True, ``fit`` prints the categories found and their mean target.

    Attributes
    ----------
    mean : float
        Mean of ``y`` over all training samples.
    means : dict
        Maps each tuple of category values to the mean of ``y`` for that tuple.
    is_fitted_ : bool
        Set to True by ``fit``.
    """
    def __init__(self, cat_column_indexes=[0], verbose=False):
        # Constructor arguments are stored untouched; validation happens in fit.
        self.verbose = verbose
        self.cat_column_indexes = cat_column_indexes

    def _category_keys(self, X):
        """Return, for each row of ``X``, the tuple of its category values.

        Raises ValueError when a category column index does not exist in ``X``.
        """
        n_features = X.shape[1]
        cat_indexes = list(self.cat_column_indexes)

        for col_idx in cat_indexes:
            if not -n_features <= col_idx < n_features:
                raise ValueError("category column indexes should be < X.shape[1]")

        return [tuple(row) for row in X[:, cat_indexes]]

    def fit(self, X, y):
        """Compute the mean of ``y`` for every category combination.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples (dense).
        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If a category column index does not exist in ``X``.
        """
        # Dense input only: rows are turned into tuples below, which the
        # previous accept_sparse=True setting could not support.
        X, y = check_X_y(X, y)

        # Single pass over the samples: collect the targets of each category.
        targets_by_category = {}
        for sample_bin, target in zip(self._category_keys(X), y):
            targets_by_category.setdefault(sample_bin, []).append(target)

        if self.verbose:
            print('categories : {}'.format(targets_by_category.keys()))

        # Global mean, used by predict for categories unseen during fit.
        self.mean = y.mean()
        self.means = {
            sample_bin: np.array(targets).mean()
            for sample_bin, targets in targets_by_category.items()
        }

        self.is_fitted_ = True

        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))

        # `fit` should always return `self`
        return self

    def predict(self, X):
        """Predict the stored category mean for each sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples (dense).

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Mean target of each sample's category combination, or the global
            training mean when the combination was not seen during ``fit``.

        Raises
        ------
        ValueError
            If a category column index does not exist in ``X``.
        """
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X)

        # dict.get with a default replaces the former `== None` comparison.
        return np.array([
            self.means.get(sample_cat, self.mean)
            for sample_cat in self._category_keys(X)
        ])

<b>Main</b>

In [10]:
# Reminder of the building_id -> site_id lookup table.
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [11]:
# Distinct site ids found in the building metadata.
# NOTE(review): not referenced again in the cells visible here; the training loop
# iterates over subsample_site_ids instead.
site_ids = bdata['site_id'].unique().tolist()

In [12]:
# Load the full training set; the 'timestamp' column is parsed into datetimes.
train_df = pd.read_csv(
    '../data/raw/csvs/train.csv',
    parse_dates=['timestamp'],
)

In [13]:
# Number of readings available for each (building, meter) pair,
# sorted from the most to the least observed meter.
train_df_grouped = (
    train_df
    .groupby(['building_id', 'meter'])
    .count()
    .drop('timestamp', axis=1)
    .rename({'meter_reading' : 'n_meter_readings'}, axis=1)
    .sort_values(by='n_meter_readings', axis=0, ascending=False)
)

In [14]:
# Attach the site_id of each building (left join on the building_id index level).
# A site_id column left over from a previous execution of this cell is dropped
# first: joining a second time would otherwise raise because the column
# already exists on both sides, which made the cell impossible to re-run.
train_df_grouped = (
    train_df_grouped
    .drop(columns='site_id', errors='ignore')
    .join(bdata, on='building_id', how='left')
)

In [15]:
# Meters with the most readings, with their site.
train_df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,8784,0
685,0,8784,5
672,0,8784,5
673,0,8784,5
674,0,8784,5


In [16]:
# main
# Pseudo-code outline of the training loop (not executable code).
# It is written as a bare string expression, so Jupyter echoes it back as the
# output of this cell.
"""

for site in site_id:

    load_and_prepare_site_data()

    for building on this site:
        
        for meter in building_meter:
        
            load_meter_data()
            prepare_train_set()
            
            cross-validate()
            fit()
            
            save()



"""




'\n\nfor site in site_id:\n\n    load_and_prepare_site_data()\n\n    for building on this site:\n        \n        for meter in building_meter:\n        \n            load_meter_data()\n            prepare_train_set()\n            \n            cross-validate()\n            fit()\n            \n            save()\n\n\n\n'

In [17]:
# Number of (building, meter) pairs and column dtypes.
train_df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2380 entries, (0, 0) to (403, 0)
Data columns (total 2 columns):
n_meter_readings    2380 non-null int64
site_id             2380 non-null int64
dtypes: int64(2)
memory usage: 55.6 KB


In [18]:
# The frame is sorted by decreasing n_meter_readings, so the tail of the first
# 2000 rows shows the smallest reading counts among those 2000 meters.
train_df_grouped.iloc[:2000].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
1003,3,8306,10
815,0,8255,8
810,0,8255,8
853,0,8208,8
1273,1,8194,14


In [93]:
# Draw a reproducible sub-sample of (building, meter) pairs.
np.random.seed(102)

# Positions 0..1999 of train_df_grouped are the 2000 meters with the most
# observations; 20 distinct positions are picked among them.
subsample_indexes = np.random.choice(2000, size=20, replace=False)
subsample_building_meters = train_df_grouped.iloc[subsample_indexes]

In [94]:
# Display the sampled (building, meter) pairs.
subsample_building_meters

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
723,0,8784,5
969,1,8778,9
1242,3,8784,14
682,0,8784,5
346,0,8782,3
189,0,8781,2
154,0,8784,1
1237,3,8784,14
250,1,8782,2
41,0,8784,0


In [95]:
# Distinct building ids present in the sub-sample, in order of appearance.
subsample_building_meters.index.get_level_values('building_id').unique().tolist()

[723,
 969,
 1242,
 682,
 346,
 189,
 154,
 1237,
 250,
 41,
 268,
 1254,
 263,
 454,
 1308,
 942,
 402,
 901,
 202,
 920]

In [96]:
# Distinct sites covered by the sub-sample; the training loop iterates over them
# so that the weather of each site is loaded once.
subsample_site_ids = subsample_building_meters['site_id'].unique()
subsample_site_ids

array([ 5,  9, 14,  3,  2,  1,  0])

In [101]:
data_folder = '../data/raw/csvs/'

# Seed of the random forests, so that repeated runs give the same scores.
rfr_random_state = 102

# The objects below do not depend on the site or the meter: they are built once
# instead of being rebuilt at every iteration.

# GapKFold
# gap ~ two weeks, train = 1 month (12 folds)
# NOTE(review): with 12 splits each fold covers about one month of a year of data;
# confirm in the tscv documentation whether that month is the train or the test part.
gap = 24*7*2
gap_kf = GapKFold(n_splits=12, gap_before=gap, gap_after=gap)

# Hyper-parameter grid of the random forest.
rfr_grid_params = {
    'n_estimators' : [40, 50, 60, 75, 100, 125],
    'max_features' : ['sqrt'],
    'max_depth' : [8, 10, 12, 14, 16]
}


for site_id in subsample_site_ids:

    print('.site {}'.format(site_id))

    # Weather features are shared by every meter of the site.
    site_weather_data = load_and_prepare_site_data(site_id, data_folder)

    site_building_meters = subsample_building_meters[subsample_building_meters['site_id']==site_id]
    site_buildings = site_building_meters.index.get_level_values('building_id').unique().tolist()

    print(site_buildings)

    for building in site_buildings:

        # Index of the per-building slice holds the sampled meter ids.
        building_meters = site_building_meters.loc[building].index.tolist()

        for building_meter in building_meters:

            print('\t.(building, meter)=({}, {})'.format(building, building_meter))

            # NOTE(review): load_meter_data reads train.csv again on every call.
            meter_df = load_meter_data(building, building_meter, data_folder)

            print('\t\tmeter_df.shape={}'.format(meter_df.shape))

            x_train, y_train = prepare_meter_train_set(site_weather_data, meter_df)

            # Dummy estimator cross-validation score (baseline: global mean).

            dummy_score = cross_val_score(
                estimator=DummyRegressor(strategy="mean"),
                X=x_train,
                y=y_train,
                scoring='neg_mean_squared_log_error',
                cv=gap_kf
            ).mean()

            print('\t\tdummy_score={}'.format(dummy_score))

            # Time-only model cross-validation score
            # (mean reading per (hour of day, day of week) combination).

            train_columns = x_train.columns.to_list()
            day_hour_col_idx = train_columns.index('day_hour')
            day_of_week_col_idx = train_columns.index('day_of_week')
            time_col_indexes = [day_hour_col_idx, day_of_week_col_idx]

            time_avg_score = cross_val_score(
                estimator=MeanByMultiCatEstimator(time_col_indexes),
                X=x_train,
                y=y_train,
                scoring='neg_mean_squared_log_error',
                cv=gap_kf
            ).mean()

            print('\t\ttime_avg_score={}'.format(time_avg_score))

            # Time + weather random forest model

            # ------
            gcv_start_time = time()

            # `iid` is no longer passed: the parameter was deprecated in
            # scikit-learn 0.22 and removed in 0.24, where it raises a TypeError.
            rfr_CV = GridSearchCV(
                estimator=RandomForestRegressor(random_state=rfr_random_state),
                param_grid=rfr_grid_params,
                scoring='neg_mean_squared_log_error',
                n_jobs=6,
                cv=gap_kf
            )

            rfr_CV.fit(x_train, y_train)

            gcv_end_time = time()
            print('\t\tgcv time : %s seconds' % (gcv_end_time-gcv_start_time))
            # ------

            print('\t\tbest_params : {}'.format(rfr_CV.best_params_))
            print('\t\tbest_score : {}'.format(rfr_CV.best_score_))

    print('--')

.site 5
shape=(8707, 6)
[723, 682]
	.(building, meter)=(723, 0)
		meter_df.shape=(8784, 1)
		dummy_score=-3.262917301171566
		time_avg_score=-3.1095751678833508




		gcv time : 30.463061571121216 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -3.226092070376177
	.(building, meter)=(682, 0)
		meter_df.shape=(8784, 1)
		dummy_score=-0.22292376616670864
		time_avg_score=-0.11042779775559657
		gcv time : 30.069775104522705 seconds
		best_params : {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 125}
		best_score : -0.07037308030768386
--
.site 9
shape=(8760, 6)
[969, 942, 901, 920]
	.(building, meter)=(969, 1)
		meter_df.shape=(8778, 1)
		dummy_score=-3.9614097843377856
		time_avg_score=-3.644978635189375




		gcv time : 34.71291184425354 seconds
		best_params : {'max_depth': 16, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -1.9620302944918044
	.(building, meter)=(942, 2)
		meter_df.shape=(8768, 1)
		dummy_score=-0.5793690125218269
		time_avg_score=-0.5790689575714222
		gcv time : 34.35040354728699 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -0.5724150900211142
	.(building, meter)=(901, 1)
		meter_df.shape=(8767, 1)
		dummy_score=-0.8578883008585598
		time_avg_score=-0.8120597543149586
		gcv time : 34.87980937957764 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 75}
		best_score : -0.5470349482951529
	.(building, meter)=(920, 0)
		meter_df.shape=(8765, 1)
		dummy_score=-0.6538050063701258
		time_avg_score=-0.5545614286235979
		gcv time : 34.66594386100769 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -0.6029747514558846
--
.site 14
sh



		gcv time : 35.36256670951843 seconds
		best_params : {'max_depth': 16, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -3.7253373989810563
	.(building, meter)=(1237, 3)
		meter_df.shape=(8784, 1)
		dummy_score=-11.830446797945838
		time_avg_score=-10.132007470509368
		gcv time : 32.30099821090698 seconds
		best_params : {'max_depth': 16, 'max_features': 'sqrt', 'n_estimators': 50}
		best_score : -7.786312548992728
	.(building, meter)=(1254, 1)
		meter_df.shape=(8784, 1)
		dummy_score=-12.778938414360304
		time_avg_score=-12.586264473912394




		gcv time : 34.12360715866089 seconds
		best_params : {'max_depth': 16, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -3.837733798689243
	.(building, meter)=(1308, 0)
		meter_df.shape=(8784, 1)
		dummy_score=-0.4636730099719755
		time_avg_score=-0.4590813207063489
		gcv time : 35.79371786117554 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 50}
		best_score : -0.4788193110907275
--
.site 3
shape=(8761, 6)
[346, 454, 402]
	.(building, meter)=(346, 0)
		meter_df.shape=(8782, 1)
		dummy_score=-0.1776283988439805
		time_avg_score=-0.17107960043657341
		gcv time : 35.889785289764404 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 100}
		best_score : -0.1814918127118547
	.(building, meter)=(454, 0)
		meter_df.shape=(8782, 1)
		dummy_score=-0.15851857919132983
		time_avg_score=-0.12702487041458357
		gcv time : 38.2906277179718 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 75}
		



		gcv time : 36.30686163902283 seconds
		best_params : {'max_depth': 12, 'max_features': 'sqrt', 'n_estimators': 60}
		best_score : -0.3145358015221529
	.(building, meter)=(268, 0)
		meter_df.shape=(8783, 1)
		dummy_score=-0.10714931545194424
		time_avg_score=-0.041458903172194216
		gcv time : 35.71755266189575 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 40}
		best_score : -0.05270635995172521
	.(building, meter)=(263, 0)
		meter_df.shape=(8783, 1)
		dummy_score=-0.19990102438884041
		time_avg_score=-0.1114296746552914
		gcv time : 35.632256269454956 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 60}
		best_score : -0.09438404988375547
	.(building, meter)=(202, 3)
		meter_df.shape=(8334, 1)
		dummy_score=-0.48782044215078996
		time_avg_score=-0.48823056660684155
		gcv time : 33.594964265823364 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 125}
		best_score : -0.4010776181365536
--




		gcv time : 32.03970289230347 seconds
		best_params : {'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 125}
		best_score : -10.404574745596424
--
