In [1]:
# Path setup
import sys
import os

# Make the project root importable so the `src_rf` imports further down resolve.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine
# where this directory exists; consider a configurable project root.
sys.path.append("/home/dchen/Random_Forest_Weights/")

# Basics:
import pandas as pd
import numpy as np

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Helpful:
from sklearn.model_selection import train_test_split

# Pipeline and ColumnTransformer:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
# models:
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor

# my functions:
# NOTE(review): wildcard imports hide which names each module contributes and can
# shadow the names imported above — prefer explicit imports once the used names are known.
from src_rf.methods.calc_mean import *
from src_rf.methods.calc_weights import *
from src_rf.methods.calc_dist import *

### 1. Load Data

In [2]:
# Read the hourly energy CSV; the 'datetime' column is parsed as dates and
# becomes the frame's index.
df = pd.read_csv(
    "/home/dchen/Random_Forest_Weights/src_rf/data/energy_data_hourly.csv",
    index_col="datetime",
    parse_dates=True,
)

### 2. Train Test Split

In [3]:
# Target series: the column to be predicted.
y = df['total_energy_usage']
# Feature matrix: every remaining column of the frame.
X = df.drop(columns=['total_energy_usage'])

In [4]:
# Chronological hold-out: shuffle=False keeps the row order, so the first 70% of
# rows become the training set and the last 30% the test set.
# No random_state is passed: it only seeds shuffling and is ignored when
# shuffle=False, so supplying one wrongly suggests the split is randomised.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

### 3. Hyperparameter Tuning

In [28]:
# Search space for the randomized search: each key is a RandomForestRegressor
# hyperparameter, each value the pool of candidates it is sampled from.
# NOTE(review): with bootstrap=False and max_features left at the regressor's
# default, every tree may be fitted on the same rows and features — consider
# adding max_features to the space; confirm against the installed scikit-learn.
param_dist = dict(
    n_estimators=np.arange(100, 1000, 100),  # 100, 200, ..., 900 trees
    max_depth=[None, 2, 5, 10, 25, 50, 70, 100],
    min_samples_split=[2, 5, 10, 20, 35, 50],
    min_samples_leaf=[1, 3, 5, 10, 15, 25, 35, 50],
    bootstrap=[True, False],
)


In [29]:
# Echo the search space so the candidate values can be checked before the search is launched.
param_dist

{'n_estimators': array([100, 200, 300, 400, 500, 600, 700, 800, 900]),
 'max_depth': [None, 2, 5, 10, 25, 50, 70, 100],
 'min_samples_split': [2, 5, 10, 20, 35, 50],
 'min_samples_leaf': [1, 3, 5, 10, 15, 25, 35, 50],
 'bootstrap': [True, False]}

In [30]:
# Cross-validation scheme: five time-ordered splits.
tscv = TimeSeriesSplit(n_splits=5)
# Base estimator handed to the search; seeded so repeated fits are reproducible.
# n_jobs=-2 parallelises each forest fit (all CPUs but one, per scikit-learn's n_jobs convention).
rf = RandomForestRegressor(n_jobs=-2, random_state=42)

In [31]:
# Randomized hyperparameter search: 150 sampled configurations of `rf`, each
# evaluated on the `tscv` folds and ranked by negated mean squared error
# (higher, i.e. closer to zero, is better).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=150,
    scoring='neg_mean_squared_error',
    cv=tscv,
    random_state=42,
    verbose=1,
)

In [32]:
# Run the search on the training portion only.
# This is the expensive step: 5 folds x 150 candidates = 750 forest fits.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# cells that read results from `random_search` need a completed run first.
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


KeyboardInterrupt: 

In [None]:
# Report the winning configuration, then its mean cross-validated score
# (negated MSE, because the search is scored with 'neg_mean_squared_error').
print(random_search.best_params_, random_search.best_score_, sep="\n")

In [15]:
# Display the configured search object.
# NOTE(review): the saved output of this cell shows n_iter=3 and a search space with
# 'criterion' and different value ranges than the cell that builds `random_search`,
# so it appears to come from an earlier run — re-run top to bottom to refresh it.
random_search

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=RandomForestRegressor(n_jobs=-2, random_state=42),
                   n_iter=3,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['mse', 'mae'],
                                        'max_depth': [None, 2, 3, 4, 5, 6, 7, 8,
                                                      9, 10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190])},
                   random_state=42, scoring

In [13]:
# Tabulate the search's cv_results_ as a DataFrame: timings, sampled parameters
# and per-split / mean test scores per candidate.
rf_cv_results = pd.DataFrame(data=random_search.cv_results_)

In [14]:
# Preview the first five candidates; scores are negated MSE, so rank 1 is the
# value closest to zero.
rf_cv_results.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_criterion,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,177.439952,148.717553,0.076879,0.007369,170,4,7,15.0,mae,False,"{'n_estimators': 170, 'min_samples_split': 4, ...",-14.45703,-13.212728,-14.931418,-19.794066,-25.28798,-17.536645,4.475414,3
1,0.29374,0.099343,0.03214,0.011489,70,8,5,10.0,mse,True,"{'n_estimators': 70, 'min_samples_split': 8, '...",-12.340073,-9.134249,-13.764008,-15.957592,-18.210413,-13.881267,3.097607,2
2,0.295871,0.127593,0.028679,0.009951,60,2,11,,mse,True,"{'n_estimators': 60, 'min_samples_split': 2, '...",-12.179471,-8.60169,-13.876031,-15.550214,-17.519837,-13.545449,3.040749,1
