In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and cross-validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

In [2]:
# Lookup table of reading types, loaded alongside the measurements.
reading_types = pd.read_csv('reading_types.csv')

# Measurements: load the preprocessed CSV and convert the 'date' column to
# datetime in the same expression.
samples = pd.read_csv('preprocessed.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

samples.info()

# One sub-frame per distinct value_type_id, in groupby order.
df_lst = [readings for _type_id, readings in samples.groupby('value_type_id')]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957826 entries, 0 to 957825
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Unnamed: 0     957826 non-null  int64         
 1   date           957826 non-null  datetime64[ns]
 2   value_type_id  957826 non-null  int64         
 3   value          957826 non-null  float64       
 4   work_hours     957826 non-null  bool          
 5   day type       957826 non-null  int64         
 6   Fall           957826 non-null  bool          
 7   Spring         957826 non-null  bool          
 8   Summer         957826 non-null  bool          
 9   Winter         957826 non-null  bool          
 10  trimester_day  957826 non-null  int64         
 11  working_hour   957826 non-null  int64         
 12  building_1     957826 non-null  bool          
 13  building_2     957826 non-null  bool          
 14  building_3     957826 non-null  bool          
 15  

In [3]:
# Hyperparameter search space for the random forest. It does not depend on the
# data, so it is built once instead of on every loop iteration. Every key is a
# RandomForestRegressor keyword argument, which lets the sampled dict be
# unpacked straight into the constructor.
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 100)),
    'max_depth': hp.choice('max_depth', [1, 5, 10, 20, 50, 75, 100, 150, 200]),
    'min_samples_split': hp.choice('min_samples_split', [2, 3, 4, 5, 10, 20]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'criterion': hp.choice('criterion', ['squared_error', 'absolute_error', 'friedman_mse']),
    'max_features': hp.choice('max_features', ['sqrt', None, 'log2'])
}

# Untrained estimators configured with the best parameters found, one per
# frame in df_lst and in the same order. `randomForest` alone is rebound on
# every iteration and would only keep the last one.
tuned_forests = []

for df in df_lst:
    # Put the rows in chronological order, then drop the identifier columns and
    # the timestamp itself so that only model features and 'value' remain.
    df = (
        df.sort_values(by='date')
          .drop(columns=['value_type_id', 'Unnamed: 0', 'date'])
    )

    # shuffle=False is essential: the default shuffling would scramble the
    # chronological order established above, so TimeSeriesSplit would no longer
    # split on time and later observations would leak into the training data.
    # Without shuffling, the hold-out set is the most recent slice of the data.
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=['value']), df['value'], shuffle=False
    )

    # Forward-chaining CV: 5 splits with a 48-row gap between the end of each
    # training fold and the start of its validation fold.
    tst_cv = TimeSeriesSplit(n_splits=5, gap=48)

    def objective(params):
        """Score one hyperparameter sample for hyperopt.

        Builds a RandomForestRegressor from `params`, cross-validates it on the
        current iteration's x_train / y_train with `tst_cv`, and returns the
        negated mean R^2 as the loss (hyperopt minimizes, higher R^2 is better).
        """
        clf = RandomForestRegressor(**params)
        print(params)
        scores = cross_val_score(clf, x_train, y_train, cv=tst_cv, scoring='r2', n_jobs=-1)
        return {'loss': -float(scores.mean()), 'params': params, 'status': STATUS_OK}

    num_trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=40, trials=num_trials)

    # fmin returns hp.choice *indices*; space_eval maps them back to the actual
    # parameter values. Resolve once and reuse the resulting dict.
    best_params = space_eval(space, best)
    randomForest = RandomForestRegressor(**best_params)
    tuned_forests.append(randomForest)

{'bootstrap': False, 'criterion': 'absolute_error', 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 94}
{'bootstrap': True, 'criterion': 'squared_error', 'max_depth': 75, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 4, 'n_estimators': 52}
{'bootstrap': False, 'criterion': 'friedman_mse', 'max_depth': 1, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 96}
{'bootstrap': False, 'criterion': 'friedman_mse', 'max_depth': 75, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 20, 'n_estimators': 81}
{'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': 75, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 20, 'n_estimators': 94}
{'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': 200, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 63}
{'bootstrap': True, 'criterion': 'absolute_error', 