In [25]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score

from tscv import GapKFold

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

The goal here is to build and evalutate simple models based on mean meter readings (per day of week, per hour).

An important and tricky point is how are we going to split the data between train and validation.<br>
For time-serie data, usually we cannot do the usual random split, because of correlations.<br>
Usually, some kind of walk-forward approach is used.<br>
Here, we use hv-block cross validation : we keep a gap of unused data between train and validation, to avoid using correlated data between train and validation.<br>
This method has been studied and described by Racine (2000) : http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.30.6748&rep=rep1&type=pdf<br>
We used an open-source implementation that extends scikit-learn : http://www.zhengwenjie.net/tscv/<br>
See notebook 'test_tscv_lib.ipynb'

In [13]:
filepath = '../../data/intermediate/experimentation_train_sets/'
filename = 'train_b_1176_m_0_t_20200111_195028.csv'

df_features = pd.read_csv(filepath + filename, parse_dates=['timestamp'], index_col=['timestamp'])
df_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8524 entries, 2016-01-02 00:00:00 to 2016-12-31 23:00:00
Data columns (total 8 columns):
day_of_week                     8524 non-null int64
day_hour                        8524 non-null int64
dew_temperature_ma_24H          8524 non-null float64
air_temperature                 8524 non-null float64
wind_speed_ma_24H               8524 non-null float64
precip_depth_1_hr_ma_24H        8524 non-null float64
sea_level_pressure_shift_10H    8524 non-null float64
meter_reading                   8524 non-null float64
dtypes: float64(6), int64(2)
memory usage: 599.3 KB


In [12]:
df_features.head()

Unnamed: 0_level_0,day_of_week,day_hour,dew_temperature_ma_24H,air_temperature,wind_speed_ma_24H,precip_depth_1_hr_ma_24H,sea_level_pressure_shift_10H,meter_reading
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-02 00:00:00,5,0,-10.120833,-3.3,4.7375,-0.208333,1019.4,73.866
2016-01-02 01:00:00,5,1,-9.979167,-3.9,4.779167,-0.166667,1018.9,69.788
2016-01-02 02:00:00,5,2,-9.8875,-5.0,4.8625,-0.125,1017.9,68.563
2016-01-02 03:00:00,5,3,-9.816667,-5.6,4.883333,-0.083333,1017.4,69.133
2016-01-02 04:00:00,5,4,-9.766667,-6.1,4.8625,-0.083333,1017.1,69.085


In [23]:
y_meter = df_features['meter_reading']
X_meter = df_features.loc[:, df_features.columns != 'meter_reading']
print('X_meter.shape = {}, y_meter.shape = {}'.format(X_meter.shape, y_meter.shape))

X_meter.shape = (8524, 7), y_meter.shape = (8524,)


In [26]:
# GapKFold
# gap ~ two weeks, train = 1 month (12 folds)

gap = 24*7*2
gap_kf = GapKFold(n_splits=12, gap_before=gap, gap_after=gap)

Let's build a mean value estimator.

From https://github.com/scikit-learn-contrib/project-template/blob/master/skltemplate/_template.py

check_array and check_X_y

https://github.com/scikit-learn/scikit-learn/blob/e5698bde9/sklearn/utils/validation.py#L904

In [24]:
# Actually we don't need that because it already exists in sklearn, under the name 'dummyRegressor'
# https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html

In [28]:
cross_val_score(
    estimator=DummyRegressor(strategy="mean"),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.10966881921925091

In [31]:
cross_val_score(
    estimator=DummyRegressor(strategy="median"),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.10827571289521588

In [44]:
# note : this is not the RMSLE metric defined for the competition, it is not in sklearn, we must implement it.

In [45]:
# Now let's take the mean by hour

In [54]:
class MeanByCatEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, cat_column_idx=0, verbose=False):
        self.verbose = verbose
        self.cat_column_idx= cat_column_idx

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        
        X, y = check_X_y(X, y, accept_sparse=True)
        """Input validation for standard estimators.
        Checks X and y for consistent length, enforces X to be 2D and y 1D. By
        default, X is checked to be non-empty and containing only finite values.
        Standard input checks are also applied to y, such as checking that y
        does not have np.nan or np.inf targets. For multi-label y, set
        multi_output=True to allow 2D and sparse y. If the dtype of X is
        object, attempt converting to float, raising on failure.
        """
        
        if(self.cat_column_idx >= X.shape[1]):
             raise ValueError("category column index should be < X.shape[1]")
        
        categories = {}
        self.means = {}
        
        self.mean = y.mean()
        
        for x_bin in np.unique(X[:, self.cat_column_idx]):
            categories[x_bin] = []
            
        if self.verbose:    
            print('categories : {}'.format(categories.keys()))
            
        for k in range(X.shape[0]):
            categories[X[k, self.cat_column_idx]].append(y[k])
        
        for k, v in categories.items():
            self.means[k] = np.array(v).mean()
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        
        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))
        
        return self

    
    
    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        
        X = check_array(X, accept_sparse=True)
        """Input validation on an array, list, sparse matrix or similar.
        By default, the input is checked to be a non-empty 2D array containing
        only finite values. If the dtype of the array is object, attempt
        converting to float, raising on failure."""
        
        check_is_fitted(self, 'is_fitted_')
        
        predictions = []
        
        for sample_cat in X[:, self.cat_column_idx]:
            cat_mean = self.means.get(sample_cat)
            if(cat_mean == None):
                predictions.append(self.mean)
            else:
                predictions.append(cat_mean)
            
        
        
        return np.array(predictions)

In [55]:
day_hour_col_idx = X_meter.columns.to_list().index('day_hour')

myEst = MeanByCatEstimator(cat_column_idx=day_hour_col_idx)

In [56]:
myEst.fit(X_meter, y_meter)

MeanByCatEstimator(cat_column_idx=1, verbose=False)

In [45]:

X_meter.shape

(8524, 7)

In [48]:
t_6 = np.array([0,3,5,6,8,10,15,2,5,7,22,27])

t = np.transpose(np.array([
    np.ones(12, dtype=np.int64),
    t_6,
    np.ones(12, dtype=np.int64), 
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64)
]))


print(t)

myEst.predict(t)

[[ 1  0  1  1  1  1  1  1]
 [ 1  3  1  1  1  1  1  1]
 [ 1  5  1  1  1  1  1  1]
 [ 1  6  1  1  1  1  1  1]
 [ 1  8  1  1  1  1  1  1]
 [ 1 10  1  1  1  1  1  1]
 [ 1 15  1  1  1  1  1  1]
 [ 1  2  1  1  1  1  1  1]
 [ 1  5  1  1  1  1  1  1]
 [ 1  7  1  1  1  1  1  1]
 [ 1 22  1  1  1  1  1  1]
 [ 1 27  1  1  1  1  1  1]]


array([ 85.67500568,  76.00959687,  75.88157423,  76.47674302,
       101.06289014, 134.37695014, 147.80155211,  77.40017938,
        75.88157423,  89.0337563 , 108.73477222, 112.84224061])

In [58]:
day_hour_col_idx = X_meter.columns.to_list().index('day_hour')

cross_val_score(
    estimator=MeanByCatEstimator(cat_column_idx=day_hour_col_idx),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.04666112104598815

In [135]:
class MeanByMultiCatEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, cat_column_indexes=[0], verbose=False):
        self.verbose = verbose
        self.cat_column_indexes = cat_column_indexes

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        
        X, y = check_X_y(X, y, accept_sparse=True)
        """Input validation for standard estimators.
        Checks X and y for consistent length, enforces X to be 2D and y 1D. By
        default, X is checked to be non-empty and containing only finite values.
        Standard input checks are also applied to y, such as checking that y
        does not have np.nan or np.inf targets. For multi-label y, set
        multi_output=True to allow 2D and sparse y. If the dtype of X is
        object, attempt converting to float, raising on failure.
        """
        
        
        cat_columns = []
        
        for col_idx in self.cat_column_indexes:
            if(col_idx >= X.shape[1]):
                raise ValueError("category column indexes should be < X.shape[1]")
            cat_columns.append(X[:, col_idx])
            
        cat_tuples = set(zip(*cat_columns))
        
        categories = {}
        self.means = {}
        
        self.mean = y.mean()
        
        for x_bin in cat_tuples:
            categories[x_bin] = []
            
        if self.verbose:    
            print('categories : {}'.format(categories.keys()))
            
        for k in range(X.shape[0]):
            sample_bin = tuple(X[k, self.cat_column_indexes])
            categories[sample_bin].append(y[k])
        
        for k, v in categories.items():
            self.means[k] = np.array(v).mean()
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        
        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))
        
        return self

    
    
    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        
        X = check_array(X, accept_sparse=True)
        """Input validation on an array, list, sparse matrix or similar.
        By default, the input is checked to be a non-empty 2D array containing
        only finite values. If the dtype of the array is object, attempt
        converting to float, raising on failure."""
        
        check_is_fitted(self, 'is_fitted_')
        
        predictions = []
        
        
        cat_columns=[]
        for col in self.cat_column_indexes:
            cat_columns.append(X[:, col])
            
        cat_tuples = list(zip(*cat_columns))
        
        
        
        for sample_cat in cat_tuples:
            cat_mean = self.means.get(sample_cat)
            if(cat_mean == None):
                predictions.append(self.mean)
            else:
                predictions.append(cat_mean)
            
        
        
        return np.array(predictions)

In [136]:
X_meter.columns

Index(['day_of_week', 'day_hour', 'dew_temperature_ma_24H', 'air_temperature',
       'wind_speed_ma_24H', 'precip_depth_1_hr_ma_24H',
       'sea_level_pressure_shift_10H'],
      dtype='object')

In [139]:
cross_val_score(
    estimator=MeanByMultiCatEstimator([0,1]),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.026519821080049997