In [1]:
# Path setup
import sys
import os

sys.path.append("/home/dchen/Random_Forest_Weights/")

# Basics:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Helpful:
from sklearn.model_selection import train_test_split

# Pipeline and ColumnTransformer:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
# models:
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor

# my functions:
from src_rf.methods.calc_mean import *
from src_rf.methods.calc_weights import *
from src_rf.methods.calc_dist import *

### 1. Load Data

In [2]:
# Read the hourly energy CSV; the 'datetime' column is parsed and used as the index.
df = pd.read_csv(
    "/home/dchen/Random_Forest_Weights/src_rf/data/energy_data_hourly.csv",
    index_col="datetime",
    parse_dates=True,
)

### 2. Train Test Split

In [3]:
# Target series: the column to be predicted.
y = df['total_energy_usage']
# Feature frame: every column except the target (drop returns a new frame).
X = df.drop(columns=['total_energy_usage'])

In [4]:
# Order-preserving 70/30 split: with shuffle=False the leading rows become the
# training set and the trailing 30% the test set, so no rows are interleaved.
# random_state is intentionally omitted -- it has no effect when shuffle=False
# and only suggested randomness that is not there.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

### 3. Hyperparameter Tuning

In [28]:
# Candidate values for each random-forest hyperparameter explored by the search.
param_dist = dict(
    n_estimators=np.arange(100, 1000, 100),  # 100, 200, ..., 900
    max_depth=[None, 2, 5, 10, 25, 50, 70, 100],  # None: no explicit depth cap
    min_samples_split=[2, 5, 10, 20, 35, 50],
    min_samples_leaf=[1, 3, 5, 10, 15, 25, 35, 50],
    bootstrap=[True, False],
)


In [29]:
# Display the search space defined above as the cell output.
param_dist

{'n_estimators': array([100, 200, 300, 400, 500, 600, 700, 800, 900]),
 'max_depth': [None, 2, 5, 10, 25, 50, 70, 100],
 'min_samples_split': [2, 5, 10, 20, 35, 50],
 'min_samples_leaf': [1, 3, 5, 10, 15, 25, 35, 50],
 'bootstrap': [True, False]}

In [30]:
# Cross-validation splitter with 5 time-ordered folds.
tscv = TimeSeriesSplit(n_splits=5)
# Base estimator with a fixed seed; n_jobs=-2 uses all CPU cores but one.
rf = RandomForestRegressor(n_jobs=-2, random_state=42)

In [31]:
# Randomized search over param_dist: 150 sampled configurations, each evaluated
# on the time-series folds and ranked by negative mean squared error
# (any other appropriate scoring metric could be substituted).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=150,
    scoring='neg_mean_squared_error',
    cv=tscv,
    random_state=42,
    verbose=1,
)

In [5]:
# The search fit is left disabled here; the next cell reads CV results from CSV
# files instead (presumably saved from an earlier run of this search -- TODO confirm).
# random_search.fit(X_train, y_train)

In [28]:
# Cross-validation results table of the first search, read from CSV.
rf_cv_results = pd.read_csv(
    filepath_or_buffer="/Data/Delong_BA_Data/rf_weights/rf_cv_results.csv"
)
# Cross-validation results table of the second search, read from CSV.
rf_cv_results_2 = pd.read_csv(
    filepath_or_buffer="/Data/Delong_BA_Data/rf_weights/rf_cv_results_2.csv"
)

In [41]:
# Seven best configurations of the first search, highest mean test score first
# (presumably negative MSE, so values closer to zero are better -- confirm).
(
    rf_cv_results
    .sort_values(by=["mean_test_score"], ascending=False)
    .loc[:, [
        "mean_test_score",
        "std_test_score",
        "param_n_estimators",
        "param_min_samples_split",
        "param_min_samples_leaf",
        "param_max_depth",
        "param_bootstrap",
    ]]
    .head(7)
)

Unnamed: 0,mean_test_score,std_test_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_bootstrap
189,-13.448995,3.019934,800,2,10,,True
89,-13.45313,3.014812,700,5,10,50.0,True
169,-13.478841,3.02889,400,2,10,25.0,True
130,-13.501498,3.039728,200,2,10,25.0,True
58,-13.513956,2.887897,900,5,15,25.0,True
10,-13.522711,2.876794,600,5,15,,True
174,-13.522711,2.876794,600,10,15,,True


In [36]:
# Fifteen best configurations of the second search, highest mean test score first.
# NOTE(review): this table has a 'param_max_samples' column that the search space
# defined in this notebook does not include -- presumably it comes from a
# different search configuration; confirm where that grid is defined.
(
    rf_cv_results_2
    .sort_values(by=["mean_test_score"], ascending=False)
    .loc[:, [
        "mean_test_score",
        "std_test_score",
        "param_max_samples",
        "param_n_estimators",
        "param_min_samples_split",
        "param_min_samples_leaf",
        "param_max_depth",
        "param_bootstrap",
    ]]
    .head(15)
)

Unnamed: 0,mean_test_score,std_test_score,param_max_samples,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_bootstrap
76,-12.982787,2.979846,0.5,700,5,5,40.0,True
42,-12.994283,2.97062,0.5,600,2,5,60.0,True
7,-13.093425,2.847754,0.5,700,5,10,,True
47,-13.103472,2.844889,0.5,600,7,10,,True
50,-13.130063,3.028278,0.6,800,7,5,,True
6,-13.138624,3.02827,0.6,700,5,5,,True
59,-13.188964,2.897788,0.6,800,10,10,60.0,True
46,-13.198894,2.895032,0.6,700,10,10,50.0,True
55,-13.198894,2.895032,0.6,700,2,10,50.0,True
56,-13.198894,2.895032,0.6,700,3,10,50.0,True
