#**Mounting Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#**Importing Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

#**Full Dataset**

Importing Dataset

In [4]:
data_cleaned = pd.read_csv('/content/drive/MyDrive/RM_lectures/data/Data preparation/data_cleaned')

Splitting the dataset in predictors and target variable

In [5]:
X = data_cleaned.drop('LGD', axis=1)
y = data_cleaned['LGD']

In [6]:
tree_model = DecisionTreeRegressor()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Fitting Decision Tree

In [8]:
tree_model.fit(X_train, y_train)

In [9]:
y_pred = tree_model.predict(X_test)

Estimating the evaluation metrics

In [10]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [11]:
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 0.02022041612483745
MSE: 0.0007568335500650195
RMSE: 0.02751060795520556
R2: 0.9810246526166229


#**Reduced Features Dataset**

Importing Dataset

In [12]:
X_red = pd.read_csv('/content/drive/MyDrive/RM_lectures/data/Data preparation/X_reduced')

Splitting the dataset in predictors and target variable

In [13]:
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_red, y, test_size=0.2)

Fitting Decision Tree

In [14]:
tree_model.fit(X_train_red, y_train_red)

In [15]:
y_pred_red = tree_model.predict(X_test_red)

Estimating the evaluation metrics

In [16]:
mae_red = mean_absolute_error(y_test_red, y_pred_red)
mse_red = mean_squared_error(y_test_red, y_pred_red)
rmse_red = np.sqrt(mse)
r2_red = r2_score(y_test_red, y_pred_red)

In [17]:
print("MAE:", mae_red)
print("MSE:", mse_red)
print("RMSE:", rmse_red)
print("R2:", r2_red)

MAE: 0.019786736020806245
MSE: 0.0007280234070221067
RMSE: 0.02751060795520556
R2: 0.9815475491587982


#**Validation Set**

Defining the Hyperparameter space for Grid Search

In [18]:
param_grid = {
    'max_depth': [6, 7, 8],
    'min_samples_split': [1, 2, 3],
    'min_samples_leaf': [2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

Fitting Grid Search

In [19]:
grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

In [20]:
grid_search.fit(X_train_red, y_train_red)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklear

In [21]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 8, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2}


#**Training with Optimal Hyperparameters**

Fitting decision tree with optimal hyperparameters

In [22]:
tree_bestP = DecisionTreeRegressor(max_depth=7, max_features="auto", min_samples_leaf=3, min_samples_split=2)

In [23]:
tree_bestP.fit(X_train_red, y_train_red)



In [24]:
y_pred_bestP_red = tree_bestP.predict(X_test_red)

Estimating the evaluation metrics

In [25]:
mae_red_bestP = mean_absolute_error(y_test_red, y_pred_red)
mse_red_bestP = mean_squared_error(y_test_red, y_pred_red)
rmse_red_bestP = np.sqrt(mse)
r2_red_bestP = r2_score(y_test_red, y_pred_red)

In [26]:
print("MAE:", mae_red_bestP)
print("MSE:", mse_red_bestP)
print("RMSE:", rmse_red_bestP)
print("R2:", r2_red_bestP)

MAE: 0.019786736020806245
MSE: 0.0007280234070221067
RMSE: 0.02751060795520556
R2: 0.9815475491587982
