In [1]:
import numpy as np
import pandas as pd

import random
import joblib

from os import path

from sklearn.base import BaseEstimator
from sklearn.dummy import DummyRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict

from tscv import GapKFold

from time import time, localtime, strftime



%matplotlib inline

In [2]:
training_time_id = '20200405_153303'

In [3]:
# Read the raw test rows, then index them by `row_id`
# (indexing is done as a second step, after the read, to avoid a warning).
test_csv_path = '../data/raw/csvs/test.csv'
test_df = pd.read_csv(test_csv_path, parse_dates=['timestamp']).set_index('row_id')

In [4]:
test_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,2017-01-01
1,1,0,2017-01-01
2,2,0,2017-01-01
3,3,0,2017-01-01
4,4,0,2017-01-01


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 3 columns):
building_id    int64
meter          int64
timestamp      datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.2 GB


In [6]:
# Time span covered by the test rows; passed further down to the weather
# preparation so every site frame is built on the same hourly range.
test_timestamps = test_df['timestamp']
min_tps, max_tps = test_timestamps.min(), test_timestamps.max()
print('min_tps: {}'.format(min_tps))
print('max_tps: {}'.format(max_tps))

min_tps: 2017-01-01 00:00:00
max_tps: 2018-12-31 23:00:00


In [7]:
# Retrieve the training directory path.
# The folder name is the fixed prefix 'trained_models_' followed by `training_time_id`,
# so changing `training_time_id` selects a different training run.

base_directory_path = '../models/test/'
timed_base_folder_name = 'trained_models_' + training_time_id
training_folder_path = path.join(base_directory_path, timed_base_folder_name)

In [8]:
"""

- prepare each site weather data

- get each building site

- (for tests only) clean building list, keep only buildings for which we saved a model (using training_infos.csv)

- for all meters
    load and predict




"""

'\n\n- prepare each site weather data\n\n- get each building site\n\n- (for tests only) clean building list, keep only buildings for which we saved a model (using training_infos.csv)\n\n- for all meters\n    load and predict\n\n\n\n\n'

<b>Prepare and cache each site weather data</b>

In [9]:
def load_and_prepare_site_data(data_folder_path, min_timestamp, max_timestamp):
    """Load the test weather file and build one prepared frame per site.

    Parameters
    ----------
    data_folder_path : str
        Folder that contains ``weather_test.csv``.
    min_timestamp, max_timestamp :
        Bounds forwarded unchanged to ``prepare_site_data``.

    Returns
    -------
    dict
        Maps every ``site_id`` found in the weather file to the value
        returned by ``prepare_site_data`` for that site.
    """
    weather_csv = path.join(data_folder_path, 'weather_test.csv')
    weather_by_site = pd.read_csv(
        weather_csv,
        index_col=['site_id', 'timestamp'],
        parse_dates=['timestamp'],
    )

    # Distinct sites, in order of first appearance in the file.
    site_ids = weather_by_site.index.get_level_values('site_id').unique()

    return {
        site_id: prepare_site_data(weather_by_site, site_id, min_timestamp, max_timestamp)
        for site_id in site_ids.tolist()
    }

In [10]:
"""
For test set we also perform linear extrapolation (contrary to train).
We extrapolate between min_tps and max_tps.
Also we do not drop rows with nan(s).
"""
def prepare_site_data(weather_df, site_id, min_tps, max_tps):
    """Build the hourly feature frame of one site for test-set prediction.

    Steps: keep the two temperature columns, reindex on a complete hourly
    range, interpolate short gaps, add time features, then add 24-hour moving
    averages whose leading gap is filled. Rows that still contain NaNs are
    kept (nothing is dropped).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Weather readings indexed by ``(site_id, timestamp)``. Must contain the
        ``air_temperature`` and ``dew_temperature`` columns; any other column
        is ignored. The frame is not modified.
    site_id :
        Value of the first index level selecting the site.
    min_tps, max_tps :
        First and last timestamp (inclusive) of the hourly index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per hour between ``min_tps`` and ``max_tps`` with columns
        ``air_temperature``, ``dew_temperature``, ``day_hour``,
        ``day_of_week``, ``air_temperature_ma_24H``, ``dew_temperature_ma_24H``.

    Raises
    ------
    KeyError
        If ``site_id`` or one of the two temperature columns is missing.
    """
    weather_features = ['air_temperature', 'dew_temperature']
    ma_windows = [24]
    ma_backfill_limit = 24

    # Select the wanted columns instead of dropping a hard-coded list in place:
    # this works whatever other columns are present and yields a new frame, so
    # the caller's data is never touched.
    site_weather = weather_df.loc[(site_id,)][weather_features]

    # Clean timestamps index: one row per hour, missing hours become NaN rows.
    # A Timedelta is used as frequency so no version-specific alias is needed.
    hourly_index = pd.date_range(start=min_tps, end=max_tps, freq=pd.Timedelta(hours=1))
    site_weather = site_weather.reindex(hourly_index)

    # Interpolate missing values (gaps of at most 3 consecutive hours).
    site_weather = site_weather.interpolate(method='linear', limit=3)

    # Build time features.
    site_weather['day_hour'] = site_weather.index.to_series().dt.hour
    site_weather['day_of_week'] = site_weather.index.to_series().dt.dayofweek

    # Build averaged weather features.
    for feature in weather_features:
        for window in ma_windows:
            moving_avg = site_weather[feature].rolling(window, center=False).mean()
            # Fill the NaNs located before the first valid average (specific to
            # test set preparation); NaNs inside or after the valid range are kept.
            # NOTE(review): pandas' 'linear' method does not seem to extend a slope
            # outside the valid range, so these rows presumably receive the first
            # valid average - confirm if true extrapolation is expected.
            site_weather['{}_ma_{}H'.format(feature, window)] = moving_avg.interpolate(
                method='linear',
                limit=ma_backfill_limit,
                limit_direction='backward',
                limit_area='outside'
            )

    # Rows with NaNs are intentionally not dropped.

    print('shape={}'.format(site_weather.shape))

    return site_weather

In [11]:
# Folder holding the raw competition CSV files.
data_folder = '../data/raw/csvs/'

# Dictionary {site_id: prepared weather frame}, built once here and looked up
# by site in the prediction loop further down.
site_data = load_and_prepare_site_data(data_folder, min_tps, max_tps)

shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)
shape=(17520, 6)


<b>Retrieve each building site</b>

In [12]:
test_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,2017-01-01
1,1,0,2017-01-01
2,2,0,2017-01-01
3,3,0,2017-01-01
4,4,0,2017-01-01


In [13]:
# Building -> site lookup table: only the two columns needed are read,
# with `building_id` as the index.
building_metadata_csv = '../data/raw/csvs/building_metadata.csv'
bdata = pd.read_csv(
    building_metadata_csv,
    usecols=['building_id', 'site_id'],
    index_col='building_id',
)
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [14]:
# One row per (building_id, meter) pair present in the test set.
# `count()` leaves a single `timestamp` count column; it is dropped so that only
# the (building_id, meter) index remains. The aggregation is run once only
# (it scans the whole test frame).
test_df_grouped = (
    test_df
    .groupby(['building_id', 'meter'])
    .count()
    .drop('timestamp', axis=1)
)
test_df_grouped.head()

building_id,meter
0,0
1,0
2,0
3,0
4,0


In [15]:
# Attach the site of every building. Any `site_id` column left by a previous
# run of this cell is removed first, so re-running the cell does not fail on
# overlapping column names.
test_df_grouped = (
    test_df_grouped
    .drop(columns='site_id', errors='ignore')
    .join(bdata, on='building_id', how='left')
)

In [16]:
test_df_grouped.shape

(2380, 1)

In [17]:
test_df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id
building_id,meter,Unnamed: 2_level_1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


<b>Keep only (building, meter) for which we trained a model</b>

In [18]:
# Load training info: the CSV written in the training folder, read here to know
# which (building, meter_id) pairs have a saved model.
training_info_path = path.join(training_folder_path, 'training_info.csv')

training_info = pd.read_csv(training_info_path)

In [19]:
training_info.head()

Unnamed: 0,building,meter_id,rfr_improvement,best_model,helper_model
0,723,0,-8.48103,MeanByMultiCatEstimator,NoneType
1,682,0,32.027189,RandomForestRegressor,MeanByMultiCatEstimator
2,969,1,46.80529,RandomForestRegressor,MeanByMultiCatEstimator
3,1242,3,63.414382,RandomForestRegressor,MeanByMultiCatEstimator
4,346,0,-20.479436,MeanByMultiCatEstimator,NoneType


In [20]:
# Restrict the (building, meter) table to the pairs listed in `training_info`.
# The resulting index levels are named after the training_info columns.
trained_meter_index = pd.MultiIndex.from_arrays(
    [training_info['building'], training_info['meter_id']],
    names=['building', 'meter_id'],
)
sub_test_df_grouped = test_df_grouped.loc[trained_meter_index]
sub_test_df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id
building,meter_id,Unnamed: 2_level_1
723,0,5
682,0,5
969,1,9
1242,3,14
346,0,3


In [21]:
# Scratch example: a small 2x8 integer array, used by the next cells to try out
# selecting several columns at once with a list of column positions.
a = np.array([[1,2,3, 12, 6, 89, 69, -3], [4, 5, 6, 13, 4, 27, 31, 19]])
a

array([[ 1,  2,  3, 12,  6, 89, 69, -3],
       [ 4,  5,  6, 13,  4, 27, 31, 19]])

In [22]:
a[:, [2, 4, 7]]

array([[ 3,  6, -3],
       [ 6,  4, 19]])

In [23]:
a.shape

(2, 8)

<b>Model functions</b>

In [24]:
class MeanByMultiCatEstimator(BaseEstimator):
    """Predict the mean training target of each sample's category.

    The category of a sample is the tuple of its values in the columns listed
    in ``cat_column_indexes``. ``fit`` stores the mean target of every category
    seen during training together with the overall mean; ``predict`` returns
    the stored category mean and falls back to the overall mean for categories
    that were not seen.

    Parameters
    ----------
    cat_column_indexes : list of int, default=[0]
        Positional indexes of the columns of ``X`` that define the category.
    verbose : bool, default=False
        If True, ``fit`` prints the categories found and their means.

    Attributes
    ----------
    means : dict
        Maps each category tuple seen in ``fit`` to its mean target.
    mean : float
        Mean of all training targets (fallback prediction).
    is_fitted_ : bool
        Set to True at the end of ``fit``.
    """
    def __init__(self, cat_column_indexes=[0], verbose=False):
        # The list default is only read, never mutated, by this class.
        self.verbose = verbose
        self.cat_column_indexes = cat_column_indexes

    def _category_keys(self, X):
        """Return one category tuple per row of the validated 2D array ``X``."""
        category_values = X[:, list(self.cat_column_indexes)]
        return [tuple(row) for row in category_values]

    def fit(self, X, y):
        """Compute the mean target of every category and the overall mean.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense training samples; must not contain NaN or infinite values.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            The fitted estimator.

        Raises
        ------
        ValueError
            If a category column index is not smaller than ``X.shape[1]``, or
            if ``X`` / ``y`` fail the standard scikit-learn input checks.
        """
        # Sparse input is rejected here: the column slicing below needs a dense array.
        X, y = check_X_y(X, y)

        n_features = X.shape[1]
        for col_idx in self.cat_column_indexes:
            if col_idx >= n_features:
                raise ValueError("category column indexes should be < X.shape[1]")

        # Single pass: collect the targets of each category.
        targets_by_category = {}
        for category, target in zip(self._category_keys(X), y):
            targets_by_category.setdefault(category, []).append(target)

        if self.verbose:
            print('categories : {}'.format(targets_by_category.keys()))

        self.mean = y.mean()
        self.means = {
            category: np.array(targets).mean()
            for category, targets in targets_by_category.items()
        }

        self.is_fitted_ = True

        if self.verbose:
            for k, v in self.means.items():
                print('({}, {})'.format(k, v))

        # `fit` should always return `self`
        return self

    def predict(self, X):
        """Return, for every sample, the mean target of its category.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense samples. NaNs are allowed, except in the category columns.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Category mean of each sample, or the overall training mean when
            the sample's category was not seen in ``fit``.
        """
        check_is_fitted(self, 'is_fitted_')

        # NaNs are tolerated in the columns this estimator does not use.
        # NOTE(review): recent scikit-learn releases rename `force_all_finite`
        # to `ensure_all_finite` - confirm against the installed version.
        X = check_array(X, force_all_finite='allow-nan')

        # Do not allow nans in the category (time-feature) columns.
        check_array(X[:, self.cat_column_indexes])

        return np.array([
            self.means.get(category, self.mean)
            for category in self._category_keys(X)
        ])

In [47]:
"""
A container to save a pair of model : the best performing model,
and if the best performing model cannot predict rows containing nans, a less-performant helperModel.
This could be modified to extend BaseEstimator.
"""
class ModelContainer:
    """Pair a best-performing model with an optional helper model.

    The helper model is used for the rows the best model cannot handle, i.e.
    rows containing at least one NaN. When no helper model is given, the best
    model predicts every row.

    Parameters
    ----------
    best_model_arg : object with a ``predict(X)`` method
        Stored as ``best_model``.
    helper_model_arg : object with a ``predict(X)`` method, or None
        Stored as ``helper_model``.
    """

    def __init__(self, best_model_arg, helper_model_arg):
        self.best_model = best_model_arg
        self.helper_model = helper_model_arg

    def predict(self, X):
        """Predict every row of ``X`` with the appropriate model.

        Rows without NaN go to ``best_model``; rows with at least one NaN go to
        ``helper_model``. A model is only called when it has rows to predict.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature rows; its index labels the returned predictions.

        Returns
        -------
        pandas.Series
            Predictions indexed like ``X``. When a helper model is set, the
            result is sorted by index label.
        """
        print("predicting...")

        # Use the models stored on this instance, not same-named globals.
        if self.helper_model is None:
            return pd.Series(data=self.best_model.predict(X), index=X.index)

        clean_rows = X.notna().all(axis=1)
        dirty_rows = ~clean_rows

        print('{} clean rows'.format(clean_rows.sum()))
        print('{} dirty rows'.format(dirty_rows.sum()))

        predicted_parts = []
        for model, row_mask in ((self.best_model, clean_rows), (self.helper_model, dirty_rows)):
            # Skip empty subsets: estimators typically reject zero-row input.
            if row_mask.any():
                subset = X[row_mask]
                predicted_parts.append(pd.Series(data=model.predict(subset), index=subset.index))

        if not predicted_parts:
            # X has no rows at all.
            return pd.Series(index=X.index, dtype=float)

        return pd.concat(predicted_parts, axis=0).sort_index()

<b>Load models and predict</b>

In [26]:
"""
for (building, meter) in sub_test_df_grouped.index:
    - get site
    - get timestamps to predict
    - load model
    - predict

"""

'\nfor (building, meter) in sub_test_df_grouped.index:\n    - get site\n    - get timestamps to predict\n    - load model\n    - predict\n\n'

In [98]:
# Group the test timestamps by (building_id, meter) once, instead of scanning the
# whole test frame with two boolean masks for every model.
timestamps_by_meter = test_df.groupby(['building_id', 'meter'], sort=False)['timestamp']

prediction_dfs = []

for (building, meter) in sub_test_df_grouped.index:

    site = sub_test_df_grouped.loc[(building, meter), 'site_id']

    # Timestamps of this meter, in the order they appear in the test set.
    timestamps_to_predict = timestamps_by_meter.get_group((building, meter))

    # Load model.
    # NOTE: joblib.load unpickles arbitrary objects; only load files written by
    # our own training run.
    model_path = path.join(
        training_folder_path,
        'building_' + str(building),
        'meter_' + str(meter),
        'model_container.joblib'
    )
    model_container = joblib.load(model_path)

    # Kept as module-level names so that any code reading `best_model` /
    # `helper_model` sees the models of the meter being processed.
    best_model = model_container.best_model
    helper_model = model_container.helper_model

    print('site : {}'.format(site))

    # Weather/time features of the site, one row per timestamp to predict.
    x_test = site_data[site].loc[pd.Index(timestamps_to_predict)]

    print('best_model type: {}'.format(type(best_model).__name__))

    meter_preds = model_container.predict(x_test)

    prediction_dfs.append(pd.DataFrame({
        'building_id' : building,
        'meter' : meter,
        'timestamp' : meter_preds.index,
        'meter_reading' : meter_preds.reset_index(drop=True)
    }))

# Stack the per-meter frames under a fresh 0..n-1 index.
predictions_df = pd.concat(prediction_dfs, axis=0, ignore_index=True)

site : 5
best_model type: MeanByMultiCatEstimator
predicting...
site : 5
best_model type: RandomForestRegressor
predicting...
17124 clean rows
396 dirty rows
site : 9
best_model type: RandomForestRegressor
predicting...
17191 clean rows
329 dirty rows
site : 14
best_model type: RandomForestRegressor
predicting...
17519 clean rows
1 dirty rows
site : 3
best_model type: MeanByMultiCatEstimator
predicting...
site : 2
best_model type: MeanByMultiCatEstimator
predicting...


In [99]:
predictions_df.shape

(105120, 4)

In [100]:
predictions_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,723,0,2017-01-01 01:00:00,64.727823
1,723,0,2017-01-01 02:00:00,66.04254
2,723,0,2017-01-01 03:00:00,65.656002
3,723,0,2017-01-01 04:00:00,65.47331
4,723,0,2017-01-01 05:00:00,65.831002


In [101]:
predictions_df.tail()

Unnamed: 0,building_id,meter,timestamp,meter_reading
105115,189,0,2018-12-31 19:00:00,12.29
105116,189,0,2018-12-31 20:00:00,11.926923
105117,189,0,2018-12-31 21:00:00,10.973077
105118,189,0,2018-12-31 22:00:00,10.688462
105119,189,0,2018-12-31 23:00:00,10.808462


<b>Prepare submission csv</b>

In [102]:
submission_df = pd.read_csv('../data/raw/csvs/sample_submission.csv')

In [103]:
submission_df.head()

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [104]:
test_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,2017-01-01
1,1,0,2017-01-01
2,2,0,2017-01-01
3,3,0,2017-01-01
4,4,0,2017-01-01


In [105]:
# Index the predictions by their join keys. Guarded so that re-running the cell
# (when the keys already form the index) is a no-op instead of a KeyError.
prediction_keys = ['building_id', 'meter', 'timestamp']
if list(predictions_df.index.names) != prediction_keys:
    predictions_df.set_index(prediction_keys, inplace=True)

In [106]:
# Left join on the three key columns: every test row is kept, and rows whose
# (building_id, meter, timestamp) has no prediction get a NaN `meter_reading`.
my_submission_df = test_df.join(predictions_df, on=['building_id', 'meter', 'timestamp'], how='left')

In [107]:
my_submission_df.head()

Unnamed: 0_level_0,building_id,meter,timestamp,meter_reading
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,2017-01-01,
1,1,0,2017-01-01,
2,2,0,2017-01-01,
3,3,0,2017-01-01,
4,4,0,2017-01-01,


In [108]:
my_submission_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
building_id      int64
meter            int64
timestamp        datetime64[ns]
meter_reading    float64
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 1.6 GB


In [109]:
my_subm_nas = my_submission_df.isna().sum()
my_subm_nas

building_id             0
meter                   0
timestamp               0
meter_reading    41592480
dtype: int64

In [113]:
# Share of test rows that received a prediction (non-NaN `meter_reading`).
n_rows = len(my_submission_df)
n_missing = my_subm_nas['meter_reading']
n_predictions = n_rows - n_missing
coverage_pct = round(n_predictions / n_rows * 100, 2)

print('n predictions: {}/{} ({}%)'.format(n_predictions, n_rows, coverage_pct))

n predictions: 105120/41697600 (0.25%)
