In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and cross validating
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

In [2]:
# Load the reading-types CSV (not referenced again within this cell).
reading_types = pd.read_csv('reading_types.csv')

# Load the readings and convert the `date` column to datetimes in one chained step.
samples = pd.read_csv('unmerged_buildings.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Print column names, dtypes and non-null counts as a quick sanity check.
samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957826 entries, 0 to 957825
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Unnamed: 0     957826 non-null  int64         
 1   date           957826 non-null  datetime64[ns]
 2   value_type_id  957826 non-null  int64         
 3   value          957826 non-null  float64       
 4   building_id    957826 non-null  int64         
 5   work_hours     957826 non-null  bool          
 6   day type       957826 non-null  int64         
 7   Fall           957826 non-null  bool          
 8   Spring         957826 non-null  bool          
 9   Summer         957826 non-null  bool          
 10  Winter         957826 non-null  bool          
 11  trimester_day  957826 non-null  int64         
 12  working_hour   957826 non-null  int64         
dtypes: bool(5), datetime64[ns](1), float64(1), int64(6)
memory usage: 63.0 MB


In [10]:
# Hyperparameter search space for the random forest. It does not depend on the
# group being tuned, so it is built once rather than on every loop iteration.
# NOTE(review): scikit-learn documents 'absolute_error' as significantly slower
# to train than 'squared_error'; consider dropping it if the search is too slow.
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 100)),
    'max_depth': hp.choice('max_depth', [1, 5, 10, 20, 50, 75, 100, 150, 200]),
    'min_samples_split': hp.choice('min_samples_split', [2, 3, 4, 5, 10, 20]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'criterion': hp.choice('criterion', ['squared_error', 'absolute_error', 'friedman_mse']),
    'max_features': hp.choice('max_features', ['sqrt', None, 'log2'])
}

# Time-ordered cross-validation: 5 splits with a gap of 48 rows between the
# training and validation part of each split. Shared by every objective call.
tst_cv = TimeSeriesSplit(n_splits=5, gap = 48)

# Best hyperparameters found for each (building_id, value_type_id) group, so the
# result of one group is not lost when the next iteration starts.
best_params_by_group = {}

for k, df in samples.groupby(by = ['building_id', 'value_type_id']):
    # value_type_id is constant inside a group; 'Unnamed: 0' is presumably a
    # leftover index column from the CSV export (TODO confirm).
    df = df.drop(columns=['value_type_id', 'Unnamed: 0'])
    df = df.sort_values(by='date')

    # Use the neighbouring readings (in date order) as features.
    df['prev_reading'] = df['value'].shift(1)
    df['after_reading'] = df['value'].shift(-1)

    # shift() leaves a NaN in the first/last row; fill it from the adjacent row.
    df = df.ffill()
    df = df.bfill()

    df = df.drop(columns = ['date'])

    df.info()

    # shuffle=False keeps the rows in date order: the training set is the
    # earliest 75% and the test set the latest 25%. With the default shuffle the
    # TimeSeriesSplit below would operate on randomly ordered rows, which
    # defeats the time-aware validation and its gap.
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=['value']), df['value'], shuffle=False
    )

    def objective(params):
        """Score one hyperparameter set for the current group.

        Builds a RandomForestRegressor from ``params`` and cross-validates it on
        the current group's ``x_train``/``y_train`` with ``tst_cv``.

        Returns the dict hyperopt expects; ``loss`` is the negated mean R^2
        across the folds, because ``fmin`` minimises.
        """
        clf = RandomForestRegressor(**params)
        print(params)
        scores = cross_val_score(clf, x_train, y_train, cv = tst_cv, scoring = 'r2', n_jobs = -1)
        loss = -float(scores.mean())
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    num_trials = Trials()
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 40, trials = num_trials)

    # fmin returns hp.choice *indices*; space_eval maps them back to the actual
    # values. Resolve once and reuse, instead of once per constructor argument.
    best_params = space_eval(space, best)
    best_params_by_group[k] = best_params

    # Unfitted model configured with the best parameters found for this group.
    randomForest = RandomForestRegressor(**best_params)

<class 'pandas.core.frame.DataFrame'>
Index: 6203 entries, 66071 to 45057
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   value          6203 non-null   float64
 1   building_id    6203 non-null   int64  
 2   work_hours     6203 non-null   bool   
 3   day type       6203 non-null   int64  
 4   Fall           6203 non-null   bool   
 5   Spring         6203 non-null   bool   
 6   Summer         6203 non-null   bool   
 7   Winter         6203 non-null   bool   
 8   trimester_day  6203 non-null   int64  
 9   working_hour   6203 non-null   int64  
 10  prev_reading   6203 non-null   float64
 11  after_reading  6203 non-null   float64
dtypes: bool(5), float64(3), int64(4)
memory usage: 418.0 KB
{'bootstrap': False, 'criterion': 'friedman_mse', 'max_depth': 200, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 61}
{'bootstrap': False, 'criterion': 'friedman_mse', 'max_dept

KeyboardInterrupt: 