In [24]:
import numpy as np
import math
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

from time import localtime, strftime

%matplotlib inline

Reference links on custom scikit-learn transformers and pipelines:<br>
- https://cloud.google.com/ml-engine/docs/scikit/custom-pipeline?hl=fr<br>
- https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65<br>
- https://www.kdnuggets.com/2019/10/extend-scikit-learn-bring-sanity-machine-learning-workflow.html<br>
- https://stackoverflow.com/questions/25539311/custom-transformer-for-sklearn-pipeline-that-alters-both-x-and-y

In [25]:
# Building to analyse; passed to load_data() as its first argument.
this_building = 493

# Meter type to analyse; passed to load_data() as its second argument.
# {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}
this_meter = 0

# Folder holding the raw CSV files. load_data() concatenates file names onto
# this string, so the trailing '/' matters.
data_folder = '../data/raw/csvs/'



In [43]:
# a: meter readings, b: weather readings -- both indexed by timestamp.
# NOTE(review): load_data is defined in a cell that appears later in this
# notebook; that cell has to be executed before this one.
a, b = load_data(this_building, this_meter, data_folder)

building is on site 3


In [26]:
def load_data(building_id, meter_id, data_folder_path):
    """Load one building's meter readings and the weather of its site.

    Parameters
    ----------
    building_id : int
        Value matched against the ``building_id`` column of
        ``building_metadata.csv`` and ``train.csv``.
    meter_id : int
        Value matched against the ``meter`` column of ``train.csv``.
    data_folder_path : str or path-like
        Folder containing ``building_metadata.csv``, ``train.csv`` and
        ``weather_train.csv``. A trailing path separator is optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(meters, weather)``. ``meters`` holds the ``train.csv`` rows of the
        requested building and meter, without the ``building_id`` and
        ``meter`` columns, indexed by ``timestamp`` and sorted on it.
        ``weather`` holds the ``air_temperature`` and ``dew_temperature``
        columns of ``weather_train.csv`` for the building's site, indexed by
        ``timestamp``.

    Raises
    ------
    KeyError
        If ``building_id`` is missing from the metadata file, or if the
        building's site has no rows in the weather file.
    """
    import os  # function-scope import so this cell stays self-contained

    # Look up the site that hosts the requested building.
    bdata = pd.read_csv(os.path.join(data_folder_path, 'building_metadata.csv'),
                        index_col='building_id',
                        usecols=['building_id', 'site_id'])

    # Use the function arguments here, not the notebook-level globals,
    # otherwise the function always returns the same building and meter.
    site_id = bdata.loc[building_id, 'site_id']
    print('building is on site {}'.format(site_id))

    # Load meter readings and keep only the requested building and meter.
    raw_df_meters = pd.read_csv(os.path.join(data_folder_path, 'train.csv'),
                                parse_dates=['timestamp'])

    to_keep = ((raw_df_meters['building_id'] == building_id)
               & (raw_df_meters['meter'] == meter_id))

    # The two filter columns are constant after filtering, so drop them.
    b_df_meters = (
        raw_df_meters[to_keep]
        .drop(columns=['building_id', 'meter'])
        .set_index('timestamp')
        .sort_index()
    )

    # Load weather data.
    weather_features_to_load = ['air_temperature', 'dew_temperature']  # TODO as funct arg ?

    raw_df_weather = pd.read_csv(
        os.path.join(data_folder_path, 'weather_train.csv'),
        parse_dates=['timestamp'],
        index_col=['site_id', 'timestamp'],
        usecols=['site_id', 'timestamp'] + weather_features_to_load
    )

    # Selecting on the first index level drops it, leaving a timestamp index.
    b_df_weather = raw_df_weather.loc[(site_id,)]

    # The two frames are returned separately; joining them on timestamp is
    # left to the caller.
    return (b_df_meters, b_df_weather)
    

In [39]:
# Transformer that builds time features.

from sklearn.base import BaseEstimator, TransformerMixin


class InterpolatorTransformer(BaseEstimator, TransformerMixin):
    """Put a frame on a regular time grid and interpolate short gaps.

    Parameters
    ----------
    limit : int, default 3
        Passed to ``DataFrame.interpolate`` as ``limit``: the maximum number
        of consecutive missing values that get filled. Values beyond that
        in a longer gap stay NaN.
    freq : str, default 'H'
        Frequency of the regular grid, passed to ``pandas.date_range``.
        The default reproduces the previous hard-coded hourly grid.
    """

    def __init__(self, limit=3, freq='H'):
        self.limit = limit
        self.freq = freq

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a reindexed, interpolated copy of ``X``.

        ``X`` is expected to be indexed by timestamps (its index min and max
        are used as the bounds of the regular grid). ``X`` itself is not
        modified.
        """
        print('-- before --')
        # DataFrame.info() writes its report to stdout and returns None, so
        # it must not be wrapped in print() (that printed a stray "None").
        X.info()

        # Build the gap-free grid spanning the observed time range; rows
        # that were missing in X appear as all-NaN rows after reindexing.
        clean_index = pd.date_range(start=X.index.min(),
                                    end=X.index.max(),
                                    freq=self.freq)
        res = X.reindex(index=clean_index)

        # Fill missing values linearly, at most `limit` in a row.
        res = res.interpolate(method='linear', limit=self.limit)

        print('-- after --')
        res.info()

        return res
        


class TimeFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append calendar features derived from the frame's datetime index.

    Two columns are added: ``day_hour`` (hour of the day) and
    ``day_of_week`` (pandas ``dayofweek``, where Monday is 0).
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the time feature columns added."""
        # The index as a Series gives access to the .dt accessor.
        stamps = X.index.to_series()

        # assign() works on a copy, so the caller's frame is left untouched.
        enriched = X.assign(
            day_hour=stamps.dt.hour,
            day_of_week=stamps.dt.dayofweek,
        )

        print('--')
        print(enriched.head())
        return enriched

In [40]:
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, column_or_1d
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [45]:
# Preview the first rows of the meter readings returned by load_data().
a.head()

Unnamed: 0_level_0,meter_reading
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,284.52
2016-01-01 01:00:00,290.52
2016-01-01 02:00:00,292.8
2016-01-01 03:00:00,289.89
2016-01-01 04:00:00,290.01


In [41]:
# Baseline model: ignores the features and always predicts the mean of the
# training targets.
dr = DummyRegressor(strategy="mean")

In [42]:
# Pipeline: regularise/interpolate the time index, add hour and weekday
# features, then fit the baseline regressor.
ppln = make_pipeline(InterpolatorTransformer(limit=3), TimeFeatureTransformer(), dr)

In [44]:
# NOTE(review): fit() is called with features only (no target y), while the
# final step is a regressor; this is presumably what triggers the ValueError
# shown below -- confirm. Passing the meter readings as y would also require
# them to share the reindexed hourly index produced by the first step.
ppln.fit(b)

-- before --
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8780 entries, 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Data columns (total 2 columns):
air_temperature    8776 non-null float64
dew_temperature    8774 non-null float64
dtypes: float64(2)
memory usage: 205.8 KB
None
-- after --
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8784 entries, 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Freq: H
Data columns (total 2 columns):
air_temperature    8784 non-null float64
dew_temperature    8784 non-null float64
dtypes: float64(2)
memory usage: 205.9 KB
None
--
                     air_temperature  dew_temperature  day_hour  day_of_week
2016-01-01 00:00:00             10.0              2.2         0            4
2016-01-01 01:00:00              9.4              2.8         1            4
2016-01-01 02:00:00              8.9              2.2         2            4
2016-01-01 03:00:00              7.8              1.1         3            4
2016-01-01 04:00:00              7.8 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [16]:
# Preview the first rows of the meter readings (same display as earlier).
a.head()

Unnamed: 0_level_0,meter_reading
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,284.52
2016-01-01 01:00:00,290.52
2016-01-01 02:00:00,292.8
2016-01-01 03:00:00,289.89
2016-01-01 04:00:00,290.01


In [17]:
# Preview the first rows of the weather readings returned by load_data().
b.head()

Unnamed: 0_level_0,air_temperature,dew_temperature
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 00:00:00,10.0,2.2
2016-01-01 01:00:00,9.4,2.8
2016-01-01 02:00:00,8.9,2.2
2016-01-01 03:00:00,7.8,1.1
2016-01-01 04:00:00,7.8,0.6
