<a href="https://colab.research.google.com/github/dellavecchiaemiliano/LGD_prediction_solution_assignment/blob/main/LGD_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Mounting Drive**

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#**Importing libraries**

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

#**Full Dataset**

Importing Dataset

In [29]:
data_cleaned = pd.read_csv('/content/drive/MyDrive/RM_lectures/data/Data preparation/data_cleaned')

Splitting the dataset in predictors and target variable

In [30]:
X = data_cleaned.drop('LGD', axis=1)
y = data_cleaned['LGD']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Fitting Random Forest

In [32]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [33]:
rf_regressor.fit(X_train, y_train)

Estimating the evaluation metrics

In [34]:
y_pred = rf_regressor.predict(X_test)

In [35]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [36]:
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 0.012037243172951882
MSE: 0.00025860007152145645
RMSE: 0.016081046965961403
R2: 0.9935793255457749


#**Reduced Features Dataset**

Importing Dataset

In [37]:
X_red = pd.read_csv('/content/drive/MyDrive/RM_lectures/data/Data preparation/X_reduced')

Splitting the dataset in predictors and target variable

In [38]:
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_red, y, test_size=0.2)

Fitting Random Forest

In [39]:
rf_regressor.fit(X_train_red, y_train_red)

Estimating the evaluation metrics

In [40]:
y_pred_red = rf_regressor.predict(X_test_red)

In [41]:
mae_red = mean_absolute_error(y_test_red, y_pred_red)
mse_red = mean_squared_error(y_test_red, y_pred_red)
rmse_red = np.sqrt(mse)
r2_red = r2_score(y_test_red, y_pred_red)

In [42]:
print("MAE:", mae_red)
print("MSE:", mse_red)
print("RMSE:", rmse_red)
print("R2:", r2_red)

MAE: 0.011835052015604681
MSE: 0.00025780628153446037
RMSE: 0.016081046965961403
R2: 0.9934576538372517


##**Validation Set**

Defining the Hyperparameter space for Grid Search

In [43]:
param_grid = {
    "n_estimators": [260, 270],
    'max_depth': [13, 14],
    'min_samples_split': [15, 16],
    'min_samples_leaf': [2, 3]
}

In [44]:
rf = RandomForestRegressor()

Fitting Grid Search

In [45]:
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, verbose=3, n_jobs = -1)

In [47]:
rf_grid.fit(X_train_red, y_train_red)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [48]:
best_params = rf_grid.best_params_
print(best_params)

{'max_depth': 14, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 270}


##**Training with Optimal Hyperparameters**

Fitting Random Forest with optimal Hyperparameters

In [49]:
rf_bestP = RandomForestRegressor(n_estimators=260, max_depth=14, min_samples_leaf=2, min_samples_split=15, random_state=42)

In [50]:
rf_bestP.fit(X_train_red, y_train_red)

In [51]:
y_pred_bestP = rf_bestP.predict(X_test_red)

Estimating the evaluation metrics

In [52]:
mae_red_bestP = mean_absolute_error(y_test, y_pred_bestP)
mse_red_bestP = mean_squared_error(y_test, y_pred_bestP)
rmse_red_bestP = np.sqrt(mse_red_bestP)
r2_red_bestP = r2_score(y_test, y_pred_bestP)

In [53]:
print("MAE:", mae_red_bestP)
print("MSE:", mse_red_bestP)
print("RMSE:", rmse_red_bestP)
print("R2:", r2_red_bestP)

MAE: 0.2244929411788534
MSE: 0.07899400735009048
RMSE: 0.2810587258031504
R2: -0.9613096084836688
