In [15]:
# Path setup
import sys
import os

sys.path.append("/home/dchen/Random_Forest_Weights/")

# Basics:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Helpful:
from sklearn.model_selection import train_test_split

# Pipeline and ColumnTransformer:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
# models:
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor

# my functions:
from src_rf.methods.calc_mean import *
from src_rf.methods.calc_weights import *
from src_rf.methods.calc_dist import *

### 1. Load Data

In [3]:
# Read the hourly energy CSV. The 'datetime' column is used as the index and
# parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(
    "/home/dchen/Random_Forest_Weights/src_rf/data/energy_data_hourly.csv",
    index_col="datetime",
    parse_dates=True,
)

### 2. Train Test Split

In [8]:
# Target: the 'total_energy_usage' column. Features: every other column of df.
y = df["total_energy_usage"]
X = df.drop(columns=["total_energy_usage"])

In [9]:
# Hold-out split without shuffling: row order is preserved, so the last 20% of
# rows become the test set. random_state has no effect while shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

### 3. Hyperparameter tuning

In [17]:
# Search space sampled by the randomized hyperparameter search.
# Each key is a RandomForestRegressor constructor argument; each value is the
# pool of candidates to draw from.
param_dist = {
    'n_estimators': np.arange(10, 200, 10),        # 10, 20, ..., 190 trees
    'max_depth': [None] + list(np.arange(2, 20)),  # None = grow until leaves are pure/min-size
    'min_samples_split': np.arange(2, 20, 2),      # 2, 4, ..., 18
    'min_samples_leaf': np.arange(1, 20, 2),       # 1, 3, ..., 19
    'bootstrap': [True, False],
    # 'mse' / 'mae' were deprecated in scikit-learn 1.0 and removed in 1.2;
    # the current names are 'squared_error' / 'absolute_error'.
    # NOTE: 'absolute_error' trains significantly slower than 'squared_error'.
    'criterion': ['squared_error', 'absolute_error']
}


In [18]:
# Base estimator for the hyperparameter search. The forest is seeded so that
# its bootstrap and feature sampling are repeatable; otherwise CV scores (and
# possibly the selected parameters) change on every Restart & Run All.
rf = RandomForestRegressor(random_state=42)

# Time-series cross-validation with 5 splits: each split validates on
# observations that come after its training window, so order is respected.
tscv = TimeSeriesSplit(n_splits=5)

In [20]:
# Randomized search over param_dist: 3 sampled candidates, each evaluated with
# the time-series CV splitter, scored by negative MSE (closer to 0 is better).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=3,
    scoring='neg_mean_squared_error',  # or any other appropriate scoring metric
    cv=tscv,
    n_jobs=-1,        # use all available cores
    verbose=1,
    random_state=42,  # fixes which candidates are drawn from param_dist
)

In [None]:
# Run the search on the training portion only; the test set stays untouched.
random_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
# Report the winning parameter set and its mean cross-validated score
# (negative MSE, given scoring='neg_mean_squared_error'), one per line.
print(random_search.best_params_, random_search.best_score_, sep="\n")