In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and crass validating 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

In [2]:
samples = pd.read_csv('preprocessed.csv')
samples['date'] = pd.to_datetime(samples['date'])

reading_types = pd.read_csv('reading_types.csv')

samples.info()

df_lst = [v for k, v in samples.groupby('value_type_id')]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2289876 entries, 0 to 2289875
Data columns (total 38 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Unnamed: 0     int64         
 1   value_type_id  float64       
 2   value          float64       
 3   date           datetime64[ns]
 4   Fall           bool          
 5   Spring         bool          
 6   Summer         bool          
 7   Winter         bool          
 8   trimester_day  int64         
 9   building_1.0   bool          
 10  building_2.0   bool          
 11  building_3.0   bool          
 12  building_6.0   bool          
 13  building_8.0   bool          
 14  building_10.0  bool          
 15  building_11.0  bool          
 16  building_12.0  bool          
 17  building_13.0  bool          
 18  building_16.0  bool          
 19  building_17.0  bool          
 20  building_18.0  bool          
 21  building_19.0  bool          
 22  building_20.0  bool          
 23  buildin

In [3]:
for df in df_lst:
    df = df.drop(columns=['value_type_id', 'Unnamed: 0'], axis = 1)
    df = df.sort_values(by='date') 
    df = df.drop(columns = ['date'], axis = 1)
    x_train, x_test, y_train, y_test = train_test_split(df.drop('value', axis = 1), df['value'])
    tst_cv = TimeSeriesSplit(n_splits=5, gap = 48)

    space = {
    'n_estimators': hp.choice('n_estimators', range(50, 100)),
    'max_depth': hp.choice('max_depth', [1, 5, 10, 20, 50, 75, 100, 150, 200]),
    'min_samples_split': hp.choice('min_samples_split', [2, 3, 4, 5, 10, 20]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'criterion': hp.choice('criterion', ['squared_error', 'absolute_error', 'friedman_mse']),
    'max_features': hp.choice('max_features', ['sqrt', None, 'log2'])
    }

    # kfold = KFold(n_splits = 5, shuffle = True, random_state = 0)

    def objective(params):
        clf = RandomForestRegressor(**params)
        print(params)
        tst_cv = TimeSeriesSplit(n_splits=5, gap = 48)
        scores = cross_val_score(clf, x_train, y_train, cv = tst_cv, scoring = 'r2', n_jobs = -1)
        best_score = mean(scores) 
        loss = -best_score
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    num_trials = Trials()
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 300, trials = num_trials)

    randomForest = RandomForestRegressor(n_estimators = space_eval(space, best)['n_estimators'], max_depth = space_eval(space, best)['max_depth'],
                                        min_samples_split = space_eval(space, best)['min_samples_split'], min_samples_leaf = space_eval(space, best)['min_samples_leaf'],
                                        bootstrap = space_eval(space, best)['bootstrap'], criterion = space_eval(space, best)['criterion'], 
                                        max_features = space_eval(space, best)['max_features'])

{'bootstrap': False, 'criterion': 'friedman_mse', 'max_depth': 150, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 58}
{'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 66}
{'bootstrap': True, 'criterion': 'absolute_error', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 94}
  1%|          | 2/300 [10:58<27:15:14, 329.24s/trial, best loss: -0.8698871108763022]


KeyboardInterrupt: 