In [1]:
import numpy as np
import pandas as pd

import random
import joblib

from os import path

from sklearn.base import BaseEstimator
from sklearn.dummy import DummyRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict

from tscv import GapKFold

from time import time, localtime, strftime



%matplotlib inline

In [2]:
training_time_id = '20200328_173456'

In [3]:
test_df = pd.read_csv('../data/raw/csvs/test.csv', parse_dates=['timestamp'])
test_df.set_index('row_id', inplace=True) # (in two steps to avoid a warning)

In [4]:
test_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,2017-01-01
1,1,0,2017-01-01
2,2,0,2017-01-01
3,3,0,2017-01-01
4,4,0,2017-01-01


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 3 columns):
building_id    int64
meter          int64
timestamp      datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.2 GB


In [6]:
# retrieve training directory path

base_directory_path = '../models/test/'
timed_base_folder_name = 'trained_models_' + training_time_id
training_folder_path = path.join(base_directory_path, timed_base_folder_name)

In [7]:
"""

- prepare each site weather data

- get each building site

- (for tests only) clean building list, keep only buildings for which we saved a model (using training_infos.csv)

- for all meters
    load and predict




"""

'\n\n- prepare each site weather data\n\n- get each building site\n\n- (for tests only) clean building list, keep only buildings for which we saved a model (using training_infos.csv)\n\n- for all meters\n    load and predict\n\n\n\n\n'

<b>Prepare and cache each site weather data</b>

In [8]:
def load_and_prepare_site_data(data_folder_path):
    
    # Loads weather data
    raw_df_weather = pd.read_csv(path.join(data_folder_path, 'weather_test.csv'), 
                     parse_dates=['timestamp'], index_col=['site_id', 'timestamp'])
    
    # Get site list
    site_list = raw_df_weather.index.get_level_values('site_id').unique().tolist()
    prepared_site_data = {}
    
    for site in site_list:
        prepared_site_data[site] = prepare_site_data(raw_df_weather, site)
        
        
    return prepared_site_data
    

In [9]:
"""
For test set we also perform linear extrapolation (contrary to train).
"""
def prepare_site_data(weather_df, site_id):
    
    b_df_weather = weather_df.loc[(site_id,)]

    # keep only air_temperature and dew_temperature
    b_df_weather.drop(
        ['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', 'cloud_coverage'],
        axis=1,
        inplace=True
    )

    # Clean timestamps index.
    clean_index = pd.date_range(start=b_df_weather.index.min(), end=b_df_weather.index.max(), freq='H')
    b_df_weather = b_df_weather.reindex(index=clean_index, copy=True)


    # Interpolate missing values.
    b_df_weather.interpolate(method='linear', limit=3, inplace=True)
    
    # Build time features
    b_df_weather['day_hour'] = b_df_weather.index.to_series().dt.hour
    b_df_weather['day_of_week'] = b_df_weather.index.to_series().dt.dayofweek

    # Builds averaged weather features.

    timeframes = [24]
    features_to_avg = ['air_temperature', 'dew_temperature']
    do_center = False

    for c in features_to_avg:
        ts = b_df_weather[c]
        for timeframe in timeframes:
            shifted_ts = ts.rolling(timeframe, center=do_center).mean()
            new_col_name = '' + c + '_ma_' + str(timeframe) + 'H'
            # Extrapolate missing values (specific to test set preparation)
            extrapolated_shifted_ts = shifted_ts.interpolate(
                method='linear',
                limit_direction='backward', 
                limit_area='outside', 
                inplace=False
            )
            b_df_weather[new_col_name] = extrapolated_shifted_ts
            
            
    # Drops rows with NaNs.
    b_df_weather.dropna(axis=0, how='any', inplace=True)
            
    print('shape={}'.format(b_df_weather.shape))
        
    return b_df_weather

In [10]:
data_folder = '../data/raw/csvs/'

site_data = load_and_prepare_site_data(data_folder)

shape=(17520, 6)
shape=(17183, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17519, 6)
shape=(17124, 6)
shape=(17494, 6)
shape=(16368, 6)
shape=(17520, 6)
shape=(17191, 6)
shape=(17465, 6)
shape=(16368, 6)
shape=(17184, 6)
shape=(17520, 6)
shape=(17519, 6)
shape=(16734, 6)


<b>Retrieve each building site</b>

In [11]:
test_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,2017-01-01
1,1,0,2017-01-01
2,2,0,2017-01-01
3,3,0,2017-01-01
4,4,0,2017-01-01


In [12]:
bdata = pd.read_csv(
    '../data/raw/csvs/building_metadata.csv', 
    index_col='building_id', 
    usecols=['building_id', 'site_id']
)
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [13]:
test_df_grouped = test_df.groupby(['building_id', 'meter']).count()
test_df_grouped = test_df.groupby(['building_id', 'meter']).count()
test_df_grouped.drop('timestamp', axis=1, inplace=True)
test_df_grouped.head()

building_id,meter
0,0
1,0
2,0
3,0
4,0


In [14]:
test_df_grouped = test_df_grouped.join(bdata, on='building_id', how='left')

In [15]:
test_df_grouped.shape

(2380, 1)

In [16]:
test_df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id
building_id,meter,Unnamed: 2_level_1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


<b>Keep only (building, meter) for which we trained a model</b>

In [17]:
# Load training info
training_info_path = path.join(training_folder_path, 'training_info.csv')

training_info = pd.read_csv(training_info_path)

In [18]:
training_info.head()

Unnamed: 0,building,meter_id,rfr_improvement,saved_model
0,723,0,-9.119602,time_avg
1,682,0,32.3926,rfr
2,969,1,46.323085,rfr
3,1242,3,63.526895,rfr
4,346,0,-19.987248,time_avg


In [19]:
trained_meter_index = pd.MultiIndex.from_frame(training_info[['building', 'meter_id']])
sub_test_df_grouped = test_df_grouped.loc[trained_meter_index]
sub_test_df_grouped.head()                                

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id
building,meter_id,Unnamed: 2_level_1
723,0,5
682,0,5
969,1,9
1242,3,14
346,0,3


<b>Model functions</b>

In [20]:
class MeanByMultiCatEstimator(BaseEstimator):
    """ A template estimator to be used as a reference implementation.
    For more information regarding how to build your own estimator, read more
    in the :ref:`User Guide <user_guide>`.
    Parameters
    ----------
    demo_param : str, default='demo_param'
        A parameter used for demonstation of how to pass and store paramters.
    """
    def __init__(self, cat_column_indexes=[0], verbose=False):
        self.verbose = verbose
        self.cat_column_indexes = cat_column_indexes

    def fit(self, X, y):
        """A reference implementation of a fitting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        
        X, y = check_X_y(X, y, accept_sparse=True)
        """Input validation for standard estimators.
        Checks X and y for consistent length, enforces X to be 2D and y 1D. By
        default, X is checked to be non-empty and containing only finite values.
        Standard input checks are also applied to y, such as checking that y
        does not have np.nan or np.inf targets. For multi-label y, set
        multi_output=True to allow 2D and sparse y. If the dtype of X is
        object, attempt converting to float, raising on failure.
        """
        
        
        cat_columns = []
        
        for col_idx in self.cat_column_indexes:
            if(col_idx >= X.shape[1]):
                raise ValueError("category column indexes should be < X.shape[1]")
            cat_columns.append(X[:, col_idx])
            
        cat_tuples = set(zip(*cat_columns))
        
        categories = {}
        self.means = {}
        
        self.mean = y.mean()
        
        for x_bin in cat_tuples:
            categories[x_bin] = []
            
        if self.verbose:    
            print('categories : {}'.format(categories.keys()))
            
        for k in range(X.shape[0]):
            sample_bin = tuple(X[k, self.cat_column_indexes])
            categories[sample_bin].append(y[k])
        
        for k, v in categories.items():
            self.means[k] = np.array(v).mean()
        
        self.is_fitted_ = True
        # `fit` should always return `self`
        
        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))
        
        return self

    
    
    def predict(self, X):
        """ A reference implementation of a predicting function.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        Returns
        -------
        y : ndarray, shape (n_samples,)
            Returns an array of ones.
        """
        
        
        X = check_array(X, accept_sparse=True)
        """Input validation on an array, list, sparse matrix or similar.
        By default, the input is checked to be a non-empty 2D array containing
        only finite values. If the dtype of the array is object, attempt
        converting to float, raising on failure."""
        
        check_is_fitted(self, 'is_fitted_')
        
        predictions = []
        
        
        cat_columns=[]
        for col in self.cat_column_indexes:
            cat_columns.append(X[:, col])
            
        cat_tuples = list(zip(*cat_columns))
        
        
        
        for sample_cat in cat_tuples:
            cat_mean = self.means.get(sample_cat)
            if(cat_mean == None):
                predictions.append(self.mean)
            else:
                predictions.append(cat_mean)
            
        
        
        return np.array(predictions)

<b>Load models and predict</b>

In [21]:
"""
for (building, meter) in sub_test_df_grouped.index:
    - get site
    - get timestamps to predict
    - load model
    - predict

"""

'\nfor (building, meter) in sub_test_df_grouped.index:\n    - get site\n    - get timestamps to predict\n    - load model\n    - predict\n\n'

In [40]:
for (building, meter) in sub_test_df_grouped.index:
    
    site = sub_test_df_grouped.loc[(building, meter), 'site_id']
    
    this_b_and_m = (test_df['building_id']==building) & (test_df['meter']==meter)
    timestamps_to_predict = test_df[this_b_and_m]['timestamp']
    
    # Load model
    b_folder = 'building_' + str(building)
    m_folder = 'meter_' + str(meter)
    model_path = path.join(training_folder_path, b_folder, m_folder, 'best_model.joblib')
    meter_model = joblib.load(model_path)
    
    print('site : {}'.format(site))
    
    x_test = site_data[site].loc[pd.Index(timestamps_to_predict)]
    
    meter_model.predict(x_test)

site : 5


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  app.launch_new_instance()


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [37]:
x_test.head(30)

Unnamed: 0_level_0,air_temperature,dew_temperature,day_hour,day_of_week,air_temperature_ma_24H,dew_temperature_ma_24H
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-01 01:00:00,9.0,7.0,1.0,6.0,6.125,4.666667
2017-01-01 02:00:00,9.0,8.0,2.0,6.0,6.125,4.666667
2017-01-01 03:00:00,8.0,8.0,3.0,6.0,6.125,4.666667
2017-01-01 04:00:00,8.0,8.0,4.0,6.0,6.125,4.666667
2017-01-01 05:00:00,8.0,8.0,5.0,6.0,6.125,4.666667
2017-01-01 06:00:00,9.0,8.0,6.0,6.0,6.125,4.666667
2017-01-01 07:00:00,9.0,8.0,7.0,6.0,6.125,4.666667
2017-01-01 08:00:00,9.0,8.0,8.0,6.0,6.125,4.666667
2017-01-01 09:00:00,9.0,8.0,9.0,6.0,6.125,4.666667
2017-01-01 10:00:00,9.0,8.0,10.0,6.0,6.125,4.666667


In [None]:
timestamps_to_predict.head()

In [36]:
x_test.head().isna().sum()

air_temperature           0
dew_temperature           0
day_hour                  0
day_of_week               0
air_temperature_ma_24H    0
dew_temperature_ma_24H    0
dtype: int64

In [41]:
for site, data in site_data.items():
    print('-- {} --'.format(site))
    print(data.shape)
    #print(data.isna().sum())

-- 0 --
(17520, 6)
-- 1 --
(17183, 6)
-- 2 --
(17520, 6)
-- 3 --
(17520, 6)
-- 4 --
(17519, 6)
-- 5 --
(17124, 6)
-- 6 --
(17494, 6)
-- 7 --
(16368, 6)
-- 8 --
(17520, 6)
-- 9 --
(17191, 6)
-- 10 --
(17465, 6)
-- 11 --
(16368, 6)
-- 12 --
(17184, 6)
-- 13 --
(17520, 6)
-- 14 --
(17519, 6)
-- 15 --
(16734, 6)


In [48]:
a = set(site_data[5].index)
b = set(timestamps_to_predict)

In [49]:
a-b

set()

In [51]:
len(b-a)

396

In [52]:
b-a

{Timestamp('2017-01-01 00:00:00'),
 Timestamp('2017-08-08 04:00:00'),
 Timestamp('2017-08-08 05:00:00'),
 Timestamp('2017-08-08 06:00:00'),
 Timestamp('2017-08-08 07:00:00'),
 Timestamp('2017-08-08 08:00:00'),
 Timestamp('2017-08-08 09:00:00'),
 Timestamp('2017-08-08 10:00:00'),
 Timestamp('2017-08-08 11:00:00'),
 Timestamp('2017-08-08 12:00:00'),
 Timestamp('2017-08-08 13:00:00'),
 Timestamp('2017-08-08 14:00:00'),
 Timestamp('2017-08-08 15:00:00'),
 Timestamp('2017-08-08 16:00:00'),
 Timestamp('2017-08-08 17:00:00'),
 Timestamp('2017-08-08 18:00:00'),
 Timestamp('2017-08-08 19:00:00'),
 Timestamp('2017-08-08 20:00:00'),
 Timestamp('2017-08-08 21:00:00'),
 Timestamp('2017-08-08 22:00:00'),
 Timestamp('2017-08-08 23:00:00'),
 Timestamp('2017-08-09 00:00:00'),
 Timestamp('2017-08-09 01:00:00'),
 Timestamp('2017-08-09 02:00:00'),
 Timestamp('2017-08-09 03:00:00'),
 Timestamp('2018-04-11 14:00:00'),
 Timestamp('2018-04-11 15:00:00'),
 Timestamp('2018-04-11 16:00:00'),
 Timestamp('2018-04-