In [2]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score

from tscv import GapKFold

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

The goal here is to build and evalutate simple models based on mean meter readings (per day of week, per hour).

An important and tricky point is how are we going to split the data between train and validation.<br>
For time-serie data, usually we cannot do the usual random split, because of correlations.<br>
Usually, some kind of walk-forward approach is used.<br>
Here, we use hv-block cross validation : we keep a gap of unused data between train and validation, to avoid using correlated data between train and validation.<br>
This method has been studied and described by Racine (2000) : http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.30.6748&rep=rep1&type=pdf<br>
We used an open-source implementation that extends scikit-learn : http://www.zhengwenjie.net/tscv/<br>
See notebook 'test_tscv_lib.ipynb'

In [3]:
filepath = '../../data/intermediate/experimentation_train_sets/'

filename = 'train_b_1176_m_0_t_20200118_214925.csv'

df_features = pd.read_csv(filepath + filename, parse_dates=['timestamp'], index_col=['timestamp'])
df_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8512 entries, 2016-01-02 00:00:00 to 2016-12-31 11:00:00
Data columns (total 11 columns):
day_of_week                     8512 non-null int64
day_hour                        8512 non-null int64
dew_temperature_ma_24H          8512 non-null float64
air_temperature                 8512 non-null float64
wind_speed_ma_24H               8512 non-null float64
precip_depth_1_hr_ma_24H        8512 non-null float64
sea_level_pressure_shift_10H    8512 non-null float64
meter_reading                   8512 non-null float64
meter_reading_trend             8512 non-null float64
meter_reading_seasonal          8512 non-null float64
meter_reading_deseasoned        8512 non-null float64
dtypes: float64(9), int64(2)
memory usage: 798.0 KB


In [4]:
df_features.head()

Unnamed: 0_level_0,day_of_week,day_hour,dew_temperature_ma_24H,air_temperature,wind_speed_ma_24H,precip_depth_1_hr_ma_24H,sea_level_pressure_shift_10H,meter_reading,meter_reading_trend,meter_reading_seasonal,meter_reading_deseasoned
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-02 00:00:00,5,0,-10.120833,-3.3,4.7375,-0.208333,1019.4,73.866,82.076,-26.759415,100.625415
2016-01-02 01:00:00,5,1,-9.979167,-3.9,4.779167,-0.166667,1018.9,69.788,82.069396,-32.469222,102.257222
2016-01-02 02:00:00,5,2,-9.8875,-5.0,4.8625,-0.125,1017.9,68.563,82.132312,-35.705572,104.268572
2016-01-02 03:00:00,5,3,-9.816667,-5.6,4.883333,-0.083333,1017.4,69.133,81.996854,-36.882202,106.015202
2016-01-02 04:00:00,5,4,-9.766667,-6.1,4.8625,-0.083333,1017.1,69.085,81.552562,-37.362799,106.447799


In [23]:
target_variables = set(['meter_reading', 'meter_reading_trend', 'meter_reading_seasonal', 'meter_reading_deseasoned'])
features_variables = set(df_features.columns.to_list())-target_variables-set([''])
meteorological_variables = features_variables - set(['day_hour', 'day_of_week'])
print(meteorological_variables)

{'air_temperature', 'precip_depth_1_hr_ma_24H', 'dew_temperature_ma_24H', 'sea_level_pressure_shift_10H', 'wind_speed_ma_24H'}


In [27]:
#y_meter_trend = df_features['meter_reading']
#y_meter_seasonal = df_features['meter_reading_seasonal']
#y_meter_deseasoned = df_features['meter_reading_deseasoned']
Ys_meter = df_features[target_variables]

X_meter = df_features.loc[:, set(df_features.columns.to_list())-set(target_variables)]
print('X_meter.shape = {}, Ys_meter.shape = {}'.format(X_meter.shape, Ys_meter.shape))

X_meter.shape = (8512, 7), Ys_meter.shape = (8512, 4)


In [28]:
# GapKFold
# gap ~ two weeks, train = 1 month (12 folds)

gap = 24*7*2
gap_kf = GapKFold(n_splits=12, gap_before=gap, gap_after=gap)

Let's build a mean value estimator.

From https://github.com/scikit-learn-contrib/project-template/blob/master/skltemplate/_template.py

check_array and check_X_y

https://github.com/scikit-learn/scikit-learn/blob/e5698bde9/sklearn/utils/validation.py#L904

In [29]:
# Actually we don't need that because it already exists in sklearn, under the name 'dummyRegressor'
# https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html

In [30]:
cross_val_score(
    estimator=DummyRegressor(strategy="mean"),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.10972135653366528

In [31]:
cross_val_score(
    estimator=DummyRegressor(strategy="median"),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.10826417581945569

In [32]:
# note : this is not the RMSLE metric defined for the competition, it is not in sklearn, we must implement it.

In [33]:
# Now let's take the mean by hour

In [34]:
class MeanByCatEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, cat_column_idx=0, verbose=False):
        self.verbose = verbose
        self.cat_column_idx= cat_column_idx

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        
        X, y = check_X_y(X, y, accept_sparse=True)
        """Input validation for standard estimators.
        Checks X and y for consistent length, enforces X to be 2D and y 1D. By
        default, X is checked to be non-empty and containing only finite values.
        Standard input checks are also applied to y, such as checking that y
        does not have np.nan or np.inf targets. For multi-label y, set
        multi_output=True to allow 2D and sparse y. If the dtype of X is
        object, attempt converting to float, raising on failure.
        """
        
        if(self.cat_column_idx >= X.shape[1]):
             raise ValueError("category column index should be < X.shape[1]")
        
        categories = {}
        self.means = {}
        
        self.mean = y.mean()
        
        for x_bin in np.unique(X[:, self.cat_column_idx]):
            categories[x_bin] = []
            
        if self.verbose:    
            print('categories : {}'.format(categories.keys()))
            
        for k in range(X.shape[0]):
            categories[X[k, self.cat_column_idx]].append(y[k])
        
        for k, v in categories.items():
            self.means[k] = np.array(v).mean()
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        
        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))
        
        return self

    
    
    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        
        X = check_array(X, accept_sparse=True)
        """Input validation on an array, list, sparse matrix or similar.
        By default, the input is checked to be a non-empty 2D array containing
        only finite values. If the dtype of the array is object, attempt
        converting to float, raising on failure."""
        
        check_is_fitted(self, 'is_fitted_')
        
        predictions = []
        
        for sample_cat in X[:, self.cat_column_idx]:
            cat_mean = self.means.get(sample_cat)
            if(cat_mean == None):
                predictions.append(self.mean)
            else:
                predictions.append(cat_mean)
            
        
        
        return np.array(predictions)

In [35]:
day_hour_col_idx = X_meter.columns.to_list().index('day_hour')

myEst = MeanByCatEstimator(cat_column_idx=day_hour_col_idx)

In [36]:
myEst.fit(X_meter, y_meter)

MeanByCatEstimator(cat_column_idx=5, verbose=False)

In [37]:

X_meter.shape

(8512, 7)

In [38]:
t_6 = np.array([0,3,5,6,8,10,15,2,5,7,22,27])

t = np.transpose(np.array([
    np.ones(12, dtype=np.int64),
    t_6,
    np.ones(12, dtype=np.int64), 
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64),
    np.ones(12, dtype=np.int64)
]))


print(t)

myEst.predict(t)

[[ 1  0  1  1  1  1  1  1]
 [ 1  3  1  1  1  1  1  1]
 [ 1  5  1  1  1  1  1  1]
 [ 1  6  1  1  1  1  1  1]
 [ 1  8  1  1  1  1  1  1]
 [ 1 10  1  1  1  1  1  1]
 [ 1 15  1  1  1  1  1  1]
 [ 1  2  1  1  1  1  1  1]
 [ 1  5  1  1  1  1  1  1]
 [ 1  7  1  1  1  1  1  1]
 [ 1 22  1  1  1  1  1  1]
 [ 1 27  1  1  1  1  1  1]]


array([80.58139886, 80.58139886, 80.58139886, 80.58139886, 80.58139886,
       80.58139886, 80.58139886, 80.58139886, 80.58139886, 80.58139886,
       80.58139886, 80.58139886])

In [39]:
day_hour_col_idx = X_meter.columns.to_list().index('day_hour')

cross_val_score(
    estimator=MeanByCatEstimator(cat_column_idx=day_hour_col_idx),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.04645937314320131

In [40]:
class MeanByMultiCatEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, cat_column_indexes=[0], verbose=False):
        self.verbose = verbose
        self.cat_column_indexes = cat_column_indexes

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        
        X, y = check_X_y(X, y, accept_sparse=True)
        """Input validation for standard estimators.
        Checks X and y for consistent length, enforces X to be 2D and y 1D. By
        default, X is checked to be non-empty and containing only finite values.
        Standard input checks are also applied to y, such as checking that y
        does not have np.nan or np.inf targets. For multi-label y, set
        multi_output=True to allow 2D and sparse y. If the dtype of X is
        object, attempt converting to float, raising on failure.
        """
        
        
        cat_columns = []
        
        for col_idx in self.cat_column_indexes:
            if(col_idx >= X.shape[1]):
                raise ValueError("category column indexes should be < X.shape[1]")
            cat_columns.append(X[:, col_idx])
            
        cat_tuples = set(zip(*cat_columns))
        
        categories = {}
        self.means = {}
        
        self.mean = y.mean()
        
        for x_bin in cat_tuples:
            categories[x_bin] = []
            
        if self.verbose:    
            print('categories : {}'.format(categories.keys()))
            
        for k in range(X.shape[0]):
            sample_bin = tuple(X[k, self.cat_column_indexes])
            categories[sample_bin].append(y[k])
        
        for k, v in categories.items():
            self.means[k] = np.array(v).mean()
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        
        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))
        
        return self

    
    
    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        
        X = check_array(X, accept_sparse=True)
        """Input validation on an array, list, sparse matrix or similar.
        By default, the input is checked to be a non-empty 2D array containing
        only finite values. If the dtype of the array is object, attempt
        converting to float, raising on failure."""
        
        check_is_fitted(self, 'is_fitted_')
        
        predictions = []
        
        
        cat_columns=[]
        for col in self.cat_column_indexes:
            cat_columns.append(X[:, col])
            
        cat_tuples = list(zip(*cat_columns))
        
        
        
        for sample_cat in cat_tuples:
            cat_mean = self.means.get(sample_cat)
            if(cat_mean == None):
                predictions.append(self.mean)
            else:
                predictions.append(cat_mean)
            
        
        
        return np.array(predictions)

In [41]:
X_meter.columns

Index(['air_temperature', 'day_of_week', 'precip_depth_1_hr_ma_24H',
       'dew_temperature_ma_24H', 'sea_level_pressure_shift_10H', 'day_hour',
       'wind_speed_ma_24H'],
      dtype='object')

In [42]:
cross_val_score(
    estimator=MeanByMultiCatEstimator([0,1]),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.10820027661686792

In [43]:
# Let's try a linear regression

In [44]:
from sklearn.linear_model import LinearRegression

In [45]:
linReg = LinearRegression()

linReg.fit(X_meter, y_meter)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
cross_val_score(
    estimator=LinearRegression(),
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.07293833763629023

In [47]:
# Try a k nearest-neighbours.
# We need to scale, so to use a scaler with cross-validation, we need to set a pipeline.

In [48]:
X_meter.head()

Unnamed: 0_level_0,air_temperature,day_of_week,precip_depth_1_hr_ma_24H,dew_temperature_ma_24H,sea_level_pressure_shift_10H,day_hour,wind_speed_ma_24H
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-02 00:00:00,-3.3,5,-0.208333,-10.120833,1019.4,0,4.7375
2016-01-02 01:00:00,-3.9,5,-0.166667,-9.979167,1018.9,1,4.779167
2016-01-02 02:00:00,-5.0,5,-0.125,-9.8875,1017.9,2,4.8625
2016-01-02 03:00:00,-5.6,5,-0.083333,-9.816667,1017.4,3,4.883333
2016-01-02 04:00:00,-6.1,5,-0.083333,-9.766667,1017.1,4,4.8625


In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [50]:

transformers = [('scaler', StandardScaler()), ('estimator', LinearRegression())]

my_first_pipe = Pipeline(transformers)




In [51]:
cross_val_score(my_first_pipe, X_meter, y_meter,  scoring='neg_mean_squared_log_error',
    cv=gap_kf)

array([-0.09796662, -0.06398672, -0.06379792, -0.09599391, -0.07103132,
       -0.0845807 , -0.07904463, -0.08699917, -0.06445765, -0.04719854,
       -0.05903246, -0.0611704 ])

In [52]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [53]:
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
cross_val_score(pipe, X_meter, y_meter, scoring='neg_mean_squared_log_error', cv=gap_kf).mean()

-0.05704436300275637

In [54]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'kneighborsregressor', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'kneighborsregressor__algorithm', 'kneighborsregressor__leaf_size', 'kneighborsregressor__metric', 'kneighborsregressor__metric_params', 'kneighborsregressor__n_jobs', 'kneighborsregressor__n_neighbors', 'kneighborsregressor__p', 'kneighborsregressor__weights'])

In [55]:
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

knn_grid_params = {
    'kneighborsregressor__n_neighbors' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70],
    'kneighborsregressor__p' : [1, 2, 3, 4]
}

knn_CV = GridSearchCV(
    estimator=pipe, 
    param_grid=knn_grid_params,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf
)

knn_CV.fit(X_meter, y_meter)

GridSearchCV(cv=GapKFold(gap_after=336, gap_before=336, n_splits=12),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('kneighborsregressor',
                                        KNeighborsRegressor(algorithm='auto',
                                                            leaf_size=30,
                                                            metric='minkowski',
                                                            metric_params=None,
                                                            n_jobs=None,
                                                            n_neighbors=5, p=2,
                                            

In [56]:
knn_CV.cv_results_

{'mean_fit_time': array([0.00907431, 0.00906595, 0.00956583, 0.00924706, 0.00905933,
        0.00915203, 0.00945429, 0.00925905, 0.00909917, 0.0090907 ,
        0.00926975, 0.00931327, 0.00906517, 0.01098557, 0.00953527,
        0.00953162, 0.0091147 , 0.00917468, 0.01005393, 0.00987609,
        0.01055231, 0.01071147, 0.00969889, 0.00999039, 0.00943601,
        0.00949981, 0.00934782, 0.00969885, 0.00941432, 0.00914168,
        0.0096099 , 0.00943118, 0.00939141, 0.00953019, 0.00935215,
        0.00933045, 0.0091451 , 0.00928698, 0.00930512, 0.00938596,
        0.00920514, 0.00919368, 0.00947704, 0.00951183, 0.00929187,
        0.00930796, 0.00944126, 0.00944253]),
 'std_fit_time': array([0.00082405, 0.00075596, 0.00114457, 0.00075913, 0.00077006,
        0.00081727, 0.00075853, 0.00075708, 0.00075254, 0.00076851,
        0.00075292, 0.00082973, 0.00072943, 0.00183227, 0.00107672,
        0.00097065, 0.0007684 , 0.00079237, 0.00121796, 0.00110467,
        0.0016609 , 0.00189433, 0.000

In [57]:
print('best_params : {}'.format(knn_CV.best_params_))
print('best_score : {}'.format(knn_CV.best_score_))

best_params : {'kneighborsregressor__n_neighbors': 35, 'kneighborsregressor__p': 1}
best_score : -0.04450676899920729


In [None]:
pipe_MLPRegressor = Pipeline([('scaler',  StandardScaler()),
            ('MLPRegressor', MLPRegressor(random_state = 42))])


grid_params_MLPRegressor = [{
    'MLPRegressor__solver': ['lbfgs'],
    'MLPRegressor__max_iter': [100,200,300,500],
    'MLPRegressor__activation' : ['relu','logistic','tanh'],
    'MLPRegressor__hidden_layer_sizes':[(2,), (4,),(2,2),(4,4),(4,2),(10,10),(2,2,2)],
}]


CV_mlpregressor = GridSearchCV (estimator = pipe_MLPRegressor,
                               param_grid = grid_params_MLPRegressor,
                               cv = 5,return_train_score=True, verbose=0)

CV_mlpregressor.fit(x_train, y_train)

CV_mlpregressor.predict(x_test)

In [58]:
Ys_meter.columns

Index(['meter_reading_deseasoned', 'meter_reading', 'meter_reading_seasonal',
       'meter_reading_trend'],
      dtype='object')

In [None]:
# Build an estimator which is the sum of an estimator + the seasonal component.






In [158]:
# TODO plus simple : faire  def __init__(self, trend_estimator, seasonal_component_col_idx)
# et mettre y_seasonal dans X

class TrendEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, trend_estimator, seasonal_component_function, seasonal_column_idxs):
        self.trend_estimator = trend_estimator
        self.seasonal_component_function = seasonal_component_function
        self.seasonal_column_idxs = seasonal_column_idxs

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y, accept_sparse=True)
        
        print('X.shape = {}'.format(X.shape))
        
        self.features_idxs = list(set(range(X.shape[1])) - set(seasonal_column_idxs))
        print('self.features_idxs = {}'.format(self.features_idxs))
        
        # fit only on trend features
        
        #X_trend = np.delete(np.copy(X), seasonal_column_idxs, axis=1)
        X_trend = X[:, self.features_idxs]
        print('X_trend.shape = {}'.format(X_trend.shape))
        
        self.trend_estimator.fit(X_trend, y)
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        return self

    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')
        
        X_trend = X[self.features_idxs]
        predicted_trend = self.trend_estimator.predict(X_trend)
        
        X_season = X[:, seasonal_column_idxs]
        
        predictions = []
        for k in range(X.shape[0]):
            season_args_k = X_season[k, :]
            predictions.append(predicted_trend[k] + self.seasonal_component_function(season_args_k))
        
      
        return np.array(predictions)

In [159]:
def seasonal_function(t_list):
    day_of_week = t_list[0]
    day_hour = t_list[1]
    t = datetime.datetime(2016, 1, 8+day_of_week, day_hour)
    return Ys_meter.loc[t, 'meter_reading_trend']

In [160]:
# time cols = [1, 5]
X_meter.columns

Index(['air_temperature', 'day_of_week', 'precip_depth_1_hr_ma_24H',
       'dew_temperature_ma_24H', 'sea_level_pressure_shift_10H', 'day_hour',
       'wind_speed_ma_24H'],
      dtype='object')

In [161]:
trendLinReg = LinearRegression()


full_est = TrendEstimator(trendLinReg, seasonal_function, [1,5])

full_est.fit(X=X_meter, y=Ys_meter['meter_reading_trend'])

X.shape = (8512, 7)
self.features_idxs = [0, 1, 4, 5, 6]
X_trend.shape = (8512, 5)


TrendEstimator(seasonal_column_idxs=[1, 5],
               seasonal_component_function=<function seasonal_function at 0x7f651c22a158>,
               trend_estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                                n_jobs=None, normalize=False))

In [46]:
cross_val_score(
    estimator=full_est,
    X=X_meter,
    y=y_meter,
    scoring='neg_mean_squared_log_error',
    cv=gap_kf).mean()

-0.07293833763629023

In [74]:
seasonal_column_idxs = [2,3]

In [77]:
features_idxs = list(set(range(X_meter.shape[1])) - set(seasonal_column_idxs))

In [78]:
features_idxs

[0, 1, 4, 5, 6]

In [68]:
a = X_meter.to_numpy()

In [71]:
b = np.delete(a, [2,3], axis=1)

In [70]:
a.shape

(8512, 7)

In [72]:
b.shape

(8512, 5)

In [130]:
np.unique(X_meter.day_of_week.values)

array([0, 1, 2, 3, 4, 5, 6])

In [123]:

day_of_week = 0
day_hour = 2



'2016-0-2:00:00'

In [108]:
from datetime import datetime

In [134]:
t = datetime.datetime(2016, 1, 8+day_of_week, day_hour)

In [135]:
Ys_meter.index.min()

Timestamp('2016-01-02 00:00:00')

In [136]:
Ys_meter.loc[t, 'meter_reading_trend']

105.38227083333334

In [111]:
fromisocalendar(2016,0,day_of_week)

NameError: name 'fromisocalendar' is not defined

In [112]:
type(X_meter.index[0])

pandas._libs.tslibs.timestamps.Timestamp

In [101]:
(X_meter['day_of_week'] == 5) & (X_meter['day_hour'] == 1)

timestamp
2016-01-02 00:00:00    False
2016-01-02 01:00:00     True
2016-01-02 02:00:00    False
2016-01-02 03:00:00    False
2016-01-02 04:00:00    False
2016-01-02 05:00:00    False
2016-01-02 06:00:00    False
2016-01-02 07:00:00    False
2016-01-02 08:00:00    False
2016-01-02 09:00:00    False
2016-01-02 10:00:00    False
2016-01-02 11:00:00    False
2016-01-02 12:00:00    False
2016-01-02 13:00:00    False
2016-01-02 14:00:00    False
2016-01-02 15:00:00    False
2016-01-02 16:00:00    False
2016-01-02 17:00:00    False
2016-01-02 18:00:00    False
2016-01-02 19:00:00    False
2016-01-02 20:00:00    False
2016-01-02 21:00:00    False
2016-01-02 22:00:00    False
2016-01-02 23:00:00    False
2016-01-03 00:00:00    False
2016-01-03 01:00:00    False
2016-01-03 02:00:00    False
2016-01-03 03:00:00    False
2016-01-03 04:00:00    False
2016-01-03 05:00:00    False
                       ...  
2016-12-30 06:00:00    False
2016-12-30 07:00:00    False
2016-12-30 08:00:00    False
2016

In [84]:
Ys_meter.head()

Unnamed: 0_level_0,meter_reading_deseasoned,meter_reading,meter_reading_seasonal,meter_reading_trend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-02 00:00:00,100.625415,73.866,-26.759415,82.076
2016-01-02 01:00:00,102.257222,69.788,-32.469222,82.069396
2016-01-02 02:00:00,104.268572,68.563,-35.705572,82.132312
2016-01-02 03:00:00,106.015202,69.133,-36.882202,81.996854
2016-01-02 04:00:00,106.447799,69.085,-37.362799,81.552562


In [83]:
Ys_meter.loc['2016-01-02 02:00:00', 'meter_reading_trend']

82.1323125