# MDS-thesis/random_forest

## 0.Setup

### Install packages

In [1]:
# Install required packages
#!pip install seaborn
#%pip install -q pandas matplotlib numpy scikit-learn
#%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

### Load packages

In [2]:
# Import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
%matplotlib inline
#to display matplotlib plots directly in the notebook

from sklearn import linear_model
#from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
#from sklearn.model_selection import TimeSeriesSplit
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

## 1. Load Data

### VSCode

In [3]:
# Import train
#train = pd.read_csv('../Data/train_test/train_df.csv')
#train = pd.read_csv('Data/train_test/train_df.csv')

# Import test
#test = pd.read_csv('../Data/train_test/test_df.csv')
#test = pd.read_csv('Data/train_test/test_df.csv')

### Colab

In [4]:
import pandas as pd
#file_path = "/content/train_df.csv"  # Adjust this path
#train = pd.read_csv(file_path)
#display(train)

#file_path = "/content/test_df.csv"  # Adjust this path
#test = pd.read_csv(file_path)
#display(test)

### Kaggle

In [5]:
import os
import pandas as pd

# Define dataset path
ds_path = "/kaggle/input/final-ds"

# Create a list with the files in the dataset (dataframes)
ds_files = os.listdir(ds_path) #list available files in the dataset

# Load each file into a dictionary (assuming all files are csv)
ds = {file: pd.read_csv(f"{ds_path}/{file}") for file in ds_files}

# Create the specific dataframe
final = ds[ds_files[0]] #first csv file
#final.head()
test = ds[ds_files[1]]  
#test.head()
train = ds[ds_files[2]]  
#train.head()


## 2. Preprocess

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# y train and test
y_train = train['tc_loss_area']
y_test = test['tc_loss_area']

# Normalize output
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler.transform(y_test.values.reshape(-1, 1))


# Get rid of columns that start with 'subnational1_' and 'cluster_' in train and test   
train1 = train.loc[:,~train.columns.str.startswith('subnational1_')]
train1 = train.loc[:,~train.columns.str.startswith('cluster_')]
test1 = test.loc[:,~test.columns.str.startswith('subnational1_')]
test1 = test.loc[:,~test.columns.str.startswith('cluster_')]

# X train and test
X_train1 = train1.drop(columns=['tc_loss_area', 'codmpio'])
X_test1 = test1.drop(columns=['tc_loss_area', 'codmpio'])

# Normalize features
X_train1 = scaler.fit_transform(X_train1)
X_test1 = scaler.transform(X_test1)

## 3. Model

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=17, n_jobs=-1)

# Train the model
rf_model.fit(X_train1, y_train.ravel())  # `ravel()` converts y_train to a 1D array if needed

# Make predictions
y_pred_train = rf_model.predict(X_train1)
y_pred_test = rf_model.predict(X_test1)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# Print results
print(f"Train MSE: {train_mse:.4f}, Train R²: {train_r2:.4f}")
print(f"Test MSE: {test_mse:.4f}, Test R²: {test_r2:.4f}")


Train MSE: 0.0520, Train R²: 0.9480
Test MSE: 0.8313, Test R²: -0.0444


## 4. Hyperparameter tunning

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Define the parameter grid
param_dist = {
    "n_estimators": [100, 200, 300, 500],  # Number of trees
    "max_depth": [10, 20, 30, None],  # Max depth of trees
    "min_samples_split": [2, 5, 10],  # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4],  # Minimum samples in a leaf node
    "max_features": ["sqrt", "log2", None]  # Number of features considered per split
}

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Randomized Search Cross Validation
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist,
    n_iter=20, cv=5, scoring="neg_mean_squared_error",
    verbose=2, n_jobs=-1, random_state=42
)

# Fit the model
random_search.fit(X_train1, y_train.ravel())

# Best parameters found
print("Best Parameters:", random_search.best_params_)

# Best model
best_rf_model = random_search.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}


## 5. Evaluate model

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on test data
y_pred_test = best_rf_model.predict(X_test1)
y_pred_train = best_rf_model.predict(X_train1)

# Evaluate performance
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Optimized Train MSE: {train_mse:.4f}, Train R²: {train_r2:.4f}")
print(f"Optimized Test MSE: {test_mse:.4f}, Test R²: {test_r2:.4f}")


Optimized Train MSE: 0.1706, Train R²: 0.8294
Optimized Test MSE: 0.7624, Test R²: 0.0421


## z. Save models

In [10]:
import joblib

# Save the trained Random Forest model
joblib.dump(best_rf_model, "random_forest_model.pkl")

['random_forest_model.pkl']