

> authored by Ena Macahiya



# Random Forest Overview
- supervised learning, handles regression and classification
- ensemble learning method - combine predictions from other models. each smaller model in random forest ensemble is a **decision tree**
- how it works: multiple trees are created using random subsets of the data and features. each DT acts as an expert giving its own opinion on the prediction. the forest's prediction is made by computing the prediction of each DT and returning the most popular class (classification) or the average result (regression)

In [None]:
import math
import os

import joblib
import pandas as pd
from scipy.stats import randint

# model imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# import preprocessed data
path = 'D:\\AI4ALL-Group-8C\\data\\processed\\processed_AAPL'

# This directory is an input, so it must already exist. Creating it here would
# only leave an empty folder behind and defer the failure to the first read_csv.
if not os.path.isdir(path):
    raise FileNotFoundError(f"processed data directory not found: {path}")

# feature matrices for the train / test / validation splits
X_train = pd.read_csv(os.path.join(path, 'X_train.csv'))
X_test = pd.read_csv(os.path.join(path, 'X_test.csv'))
X_val = pd.read_csv(os.path.join(path, 'X_val.csv'))

# targets: keep only the first column of each file so y_* are Series, not DataFrames
y_train = pd.read_csv(os.path.join(path, 'y_train.csv')).iloc[:, 0]
y_test = pd.read_csv(os.path.join(path, 'y_test.csv')).iloc[:, 0]
y_val = pd.read_csv(os.path.join(path, 'y_val.csv')).iloc[:, 0]

In [27]:
# sanity check: dimensions of the training features/target and a peek at the target
train_summary = [
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_train sample data:\n{y_train.head()}",
]
print("\n".join(train_summary))

X_train shape: (371411, 515)
y_train shape: (371411,)
y_train sample data:
0    97.25
1    16.89
2     7.70
3    47.23
4    37.47
Name: target, dtype: float64


In [28]:
# --- randomized hyperparameter search for the random forest ---

# base estimator; a fixed random_state keeps the fitted forest reproducible
rf = RandomForestRegressor(random_state=42)

# integer ranges the search samples from (scipy randint: low inclusive, high exclusive)
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
}

# splitter object handed to the search as its cross-validation strategy (5 splits)
tscv = TimeSeriesSplit(n_splits=5)

search_options = dict(
    param_distributions=param_dist,  # dict of hyperparameter ranges to sample from
    n_iter=5,                        # number of random combinations to try
    cv=tscv,                         # cross-validation splitter defined above
    random_state=42,                 # seed for the sampling
    n_jobs=4,                        # run up to 4 jobs in parallel
    verbose=2,                       # show progress
)
rand_search = RandomizedSearchCV(rf, **search_options)

rand_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [29]:
# pull the winning configuration and its fitted estimator out of the finished search
best_params = rand_search.best_params_
best_rf = rand_search.best_estimator_
print('Best hyperparameters:', best_params)

Best hyperparameters: {'max_depth': 7, 'n_estimators': 485}


In [32]:
# score the tuned forest on the validation split
y_pred = best_rf.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
val_rmse = math.sqrt(mse)  # square root of the MSE
val_r2 = r2_score(y_val, y_pred)

print("MSE:", mse)
print("Root Mean Squared Error:", val_rmse)
print("R²:", val_r2)

MSE: 10739.799200930855
Root Mean Squared Error: 103.63300246992198
R²: -0.013786213535183833


In [33]:
# score the tuned forest on the test split
y_pred = best_rf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
test_rmse = math.sqrt(mse)  # square root of the MSE
test_r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("Root Mean Squared Error:", test_rmse)
print("R²:", test_r2)

MSE: 16883.586560291325
Root Mean Squared Error: 129.9368560505114
R²: -0.0376554132484479


# Rundown of Accuracy Calculations
MSE = the closer to zero, the better. lower = better fit of the model to the data. it measures the average of the squared errors (deviations): for each data point compute (actual value - predicted value)^2, add them all together, then divide by the number of data points, i.e. $\text{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$. <br>
R^2 = the higher, the better. if it's 1, the model explains all the variability of the dependent variable around its mean. it represents the proportion of the variance in the dependent variable that can be explained by the independent variables in a regression model (how well do the independent variables explain the variability of the dependent variable). the formula is a little more complicated, but in essence you divide the sum of squares of residuals by the total sum of squares and subtract the result from 1: $R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$. note that R^2 can be negative, as it is in the results above: that means the model's predictions are worse than always predicting the mean of the actual values.

# Saving the Model

In [34]:
# Persist the tuned forest to disk with joblib.
# `joblib` and `os` are imported in the notebook's import cell at the top,
# so this cell no longer carries its own imports.
model_dir = 'D:\\AI4ALL-Group-8C\\models'
model_filename = 'best_random_forest_regressor.joblib'
model_path = os.path.join(model_dir, model_filename)

# the models directory is an output location, so create it if it is missing
os.makedirs(model_dir, exist_ok=True)

# save the model
joblib.dump(best_rf, model_path)
print(f"model saved to {model_path}")

model saved to D:\AI4ALL-Group-8C\models\best_random_forest_regressor.joblib


In [None]:
# model loading (for use in a new session or after training is done)
# to reuse the saved model instead of re-training, uncomment the lines below.
# they need `joblib` imported and `model_path`, `X_test`, `y_test` already defined.

# loaded_model = joblib.load(model_path)
# print("model loaded successfully!")

# # example: Test predictions and accuracy using the loaded model
# y_pred_loaded_test = loaded_model.predict(X_test)
# print("\nMetrics from Loaded Model (Test Set)")
# print("MSE (Test):", mean_squared_error(y_test, y_pred_loaded_test))
# print("R² (Test):", r2_score(y_test, y_pred_loaded_test))