<a href="https://colab.research.google.com/github/dellavecchiaemiliano/Thesis_Project/blob/main/Forecasting_LGD_by_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mounting Drive**

In [None]:
# Colab-only: mount Google Drive at /content/drive; the CSV files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# **Reduced Features dataset with 12 Predictors**

In [None]:
# Read the reduced-feature dataset from Drive and index it by the
# "Pratica_Sequential" column, then preview the first rows.
data_int = (
    pd.read_csv("/content/drive/MyDrive/TESI NPL's/data_int_scaled")
    .set_index("Pratica_Sequential")
)
data_int.head()

Unnamed: 0_level_0,IMPORTO IPOTECA,default_quarter,Unemployment_rate_Italy,Inflation_Italy,GBV_Ingresso xRapporto,Real_residential_property_Italy,Euribor 6m,Codice RAE,GDP_Italy,Regione debitore_Lombardia,Provincia debitore_Bergamo,DescrizioneRapporto_c/c ordinario e assimilati,LGD
Pratica_Sequential,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.186109,-0.306989,1.048767,-0.23777,1.210156,-0.488803,-0.307201,0.048799,0.241793,0,0,0,0.93
1,-0.066819,-0.306989,1.048767,-0.23777,1.210156,-0.488803,-0.307201,0.048799,0.241793,0,0,0,0.93
2,-0.229215,-0.440584,1.091323,-0.764089,-0.292565,-0.283475,-0.25768,0.432357,0.03467,1,0,1,0.32
3,1.338747,0.29419,-0.16627,-0.482999,0.553256,0.400644,-0.457345,0.432357,0.421529,1,0,0,0.47
4,1.975732,-0.039798,0.091206,-0.27326,1.068911,-0.368705,-0.314258,0.432357,0.390871,1,0,0,0.8


In [None]:
# Target is the "LGD" column; every other column is a predictor.
# `y` is also reused as the target for the full-dataset split further down.
y = data_int["LGD"]
X_data_int = data_int.drop(columns=["LGD"])

In [None]:
# Hold out 20% of the rows as a test set. The seed is fixed (same value as the
# full-dataset split) so the split, the metrics and the grid-search result are
# reproducible after a kernel restart.
X_train_data_int, X_test_data_int, y_train_data_int, y_test_data_int = train_test_split(
    X_data_int, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline model: SVR constructed with no arguments, i.e. scikit-learn's defaults.
svr = SVR()

In [None]:
# Fit the baseline on the 12-predictor training split.
# NOTE: this same `svr` object is fitted again on the other datasets in later cells.
svr.fit(X_train_data_int, y_train_data_int)

In [None]:
# Predict the target for the held-out rows.
y_pred_data_int = svr.predict(X_test_data_int)

Estimating the evaluation metrics

In [None]:
mae_data_int = mean_absolute_error(y_test_data_int, y_pred_data_int)

In [None]:
mse_data_int = mean_squared_error(y_test_data_int, y_pred_data_int)

In [None]:
rmse_data_int = np.sqrt(mse_data_int)

In [None]:
r2_data_int = r2_score(y_test_data_int, y_pred_data_int)

In [None]:
print("MAE:", mae_data_int)
print("MSE:", mse_data_int)
print("RMSE:", rmse_data_int)
print("R2:", r2_data_int)

MAE: 0.26020349397507025
MSE: 0.1448724042909079
RMSE: 0.38062107704501585
R2: 0.021424990269295896


## **Validation set**

Defining the Hyperparameter space for Grid Search

In [None]:
# Candidate SVR hyperparameters explored by the grid search (3 x 2 x 3 = 18 combinations).
param_grid_data_int = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01],
    kernel=["rbf", "poly", "sigmoid"],
)

Fitting Grid Search

In [None]:
grid_search_data_int = GridSearchCV(svr, param_grid_data_int, cv=3, scoring="neg_mean_squared_error",
                           verbose=3, n_jobs = -1)

In [None]:
grid_search_data_int.fit(X_train_data_int, y_train_data_int)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Hyperparameter combination selected by the grid search (its `best_params_` attribute).
best_params_data_int = grid_search_data_int.best_params_
print(best_params_data_int)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


## **Training with Optimal Hyperparameters**

In [None]:
svr_bestP_data_int = SVR(C=1, gamma=0.1, kernel="rbf")

In [None]:
svr_bestP_data_int.fit(X_train_data_int, y_train_data_int)

In [None]:
y_pred_bestP_data_int = svr_bestP_data_int.predict(X_test_data_int)

Estimating the evaluation metrics

In [None]:
mae_bestP_data_int = mean_absolute_error(y_test_data_int, y_pred_bestP_data_int)

In [None]:
mse_bestP_data_int = mean_squared_error(y_test_data_int, y_pred_bestP_data_int)

In [None]:
rmse_bestP_data_int = np.sqrt(mse_bestP_data_int)

In [None]:
r2_bestP_data_int = r2_score(y_test_data_int, y_pred_bestP_data_int)

In [None]:
print(mae_bestP_data_int)
print(mse_bestP_data_int)
print(rmse_bestP_data_int)
print(r2_bestP_data_int)

0.2606562952093712
0.14535077450089526
0.3812489665571505
0.036476392865150586


# **Reduced Scaled Features dataset with 42 Predictors**

In [None]:
# Read the second reduced-feature dataset from Drive and index it by the
# "Pratica_Sequential" column, then preview the first rows.
data_U = (
    pd.read_csv("/content/drive/MyDrive/TESI NPL's/data_U_scaled")
    .set_index("Pratica_Sequential")
)
data_U.head()

Unnamed: 0_level_0,Regione debitore_Puglia,TitolareCredito,Regione debitore_Toscana,Security_ingresso,Regione debitore_Sardegna,GDP_Italy,Codice SAE,Regione debitore_Lombardia,Provincia debitore_Milano,Provincia debitore_Viterbo,...,DescrizioneRapporto_spese,Provincia debitore_Latina,Provincia debitore_Brindisi,Codice RAE,Regione debitore_Lazio,Provincia debitore_Bergamo,Provincia debitore_L'Aquila,Provincia debitore_Cosenza,DescrizioneRapporto_mutuo ipotecario,LGD
Pratica_Sequential,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.00524,0,1,0,0.241793,0.259422,0,0,0,...,0,0,0,0.048799,0,0,0,0,1,0.93
1,0,0.00524,0,1,0,0.241793,0.259422,0,0,0,...,0,0,0,0.048799,0,0,0,0,1,0.93
2,0,0.001551,0,0,0,0.03467,0.423427,1,0,0,...,0,0,0,0.432357,0,0,0,0,0,0.32
3,0,0.001551,0,1,0,0.421529,0.423427,1,0,0,...,0,0,0,0.432357,0,0,0,0,1,0.47
4,0,0.001551,0,1,0,0.390871,0.423427,1,0,0,...,0,0,0,0.432357,0,0,0,0,1,0.8


In [None]:
# Target is the "LGD" column; every other column is a predictor.
y_data_U = data_U["LGD"]
X_data_U = data_U.drop(columns=["LGD"])

In [None]:
# Hold out 20% of the rows as a test set. The seed is fixed (same value as the
# full-dataset split) so the split, the metrics and the grid-search result are
# reproducible after a kernel restart.
X_train_data_U, X_test_data_U, y_train_data_U, y_test_data_U = train_test_split(
    X_data_U, y_data_U, test_size=0.2, random_state=42
)

In [None]:
# Fit the baseline on this dataset's training split.
# NOTE: `svr` is the estimator object created earlier in the notebook, reused here.
svr.fit(X_train_data_U, y_train_data_U)

In [None]:
# Predict the target for the held-out rows.
y_pred_data_U = svr.predict(X_test_data_U)

Estimating the evaluation metrics

In [None]:
mae_data_U = mean_absolute_error(y_test_data_U, y_pred_data_U)

In [None]:
mse_data_U = mean_squared_error(y_test_data_U, y_pred_data_U)

In [None]:
rmse_data_U = np.sqrt(mse_data_U)

In [None]:
r2_data_U = r2_score(y_test_data_U, y_pred_data_U)

In [None]:
print("MAE:", mae_data_U)
print("MSE:", mse_data_U)
print("RMSE:", rmse_data_U)
print("R2:", r2_data_U)

MAE: 0.24994575266731914
MSE: 0.13561899200972913
RMSE: 0.3682648394969701
R2: 0.10494741744325153


## **Validation set**

Defining the Hyperparameter space for Grid Search

In [None]:
# Candidate SVR hyperparameters explored by the grid search (3 x 2 x 3 = 18 combinations).
param_grid_data_U = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01],
    kernel=["rbf", "poly", "sigmoid"],
)

Fitting Grid Search

In [None]:
grid_search_data_U = GridSearchCV(svr, param_grid_data_U, cv=3, scoring="neg_mean_squared_error",
                           verbose=3, n_jobs = -1)

In [None]:
grid_search_data_U.fit(X_train_data_U, y_train_data_U)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Hyperparameter combination selected by the grid search (its `best_params_` attribute).
best_params_data_U = grid_search_data_U.best_params_
print(best_params_data_U)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


## **Training with Optimal Hyperparameters**

In [None]:
svr_bestP_data_U = SVR(C=1, gamma=0.1, kernel="rbf")

In [None]:
svr_bestP_data_U.fit(X_train_data_U, y_train_data_U)

In [None]:
y_pred_bestP_data_U = svr_bestP_data_U.predict(X_test_data_U)

Estimating the evaluation metrics

In [None]:
mae_bestP_data_U = mean_absolute_error(y_test_data_U, y_pred_bestP_data_U)

In [None]:
mse_bestP_data_U = mean_squared_error(y_test_data_U, y_pred_bestP_data_U)

In [None]:
rmse_bestP_data_U = np.sqrt(mse_bestP_data_U)

In [None]:
r2_bestP_data_U = r2_score(y_test_data_U, y_pred_bestP_data_U)

In [None]:
print(mae_bestP_data_U)
print(mse_bestP_data_U)
print(rmse_bestP_data_U)
print(r2_bestP_data_U)

0.24993944533108825
0.13562906855800197
0.3682785203592547
0.10488091465908245


# **Full scaled dataset with 198 predictors**

In [None]:
# Read the full predictor matrix from Drive and index it by the
# "Pratica_Sequential" column, then preview the first rows.
df = (
    pd.read_csv("/content/drive/MyDrive/TESI NPL's/X_scaled")
    .set_index("Pratica_Sequential")
)
df.head()

Unnamed: 0_level_0,GRADO IPOTECA FORMALE (contrattuale),IMPORTO IPOTECA,TitolareCredito,Codice SAE,Codice RAE,Security_ingresso,GBV_Ingresso xRapporto,default_quarter,Euribor 6m,GDP_Italy,...,DescrizioneRapporto_portafoglio finanziario,DescrizioneRapporto_portafoglio sbf,DescrizioneRapporto_prestiti personali,DescrizioneRapporto_rapporto passato a perdite,DescrizioneRapporto_sopravvenienze,DescrizioneRapporto_sovvenzione bancaria,DescrizioneRapporto_spese,DescrizioneRapporto_spese extracontabili,DescrizioneRapporto_spese extracontabili soff.,DescrizioneRapporto_spese legali
Pratica_Sequential,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.138431,0.186109,0.00524,0.259422,0.048799,1,1.210156,-0.306989,-0.307201,0.241793,...,0,0,0,0,0,0,0,0,0,0
1,0.031401,-0.066819,0.00524,0.259422,0.048799,1,1.210156,-0.306989,-0.307201,0.241793,...,0,0,0,0,0,0,0,0,0,0
2,0.817214,-0.229215,0.001551,0.423427,0.432357,0,-0.292565,-0.440584,-0.25768,0.03467,...,0,0,0,0,0,0,0,0,0,0
3,0.138431,1.338747,0.001551,0.423427,0.432357,1,0.553256,0.29419,-0.457345,0.421529,...,0,0,0,0,0,0,0,0,0,0
4,0.138431,1.975732,0.001551,0.423427,0.432357,1,1.068911,-0.039798,-0.314258,0.390871,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Seeded 80/20 split of the full predictor matrix.
# NOTE(review): the target `y` was taken from `data_int`, not from `df`; this
# presumably relies on both files holding the same rows in the same order -- confirm.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the baseline on the full-dataset training split.
# NOTE: `svr` is the estimator object created earlier in the notebook, reused here.
svr.fit(X_train_df, y_train_df)

In [None]:
# Predict the target for the held-out rows.
y_pred_df = svr.predict(X_test_df)

Estimating the evaluation metrics

In [None]:
mae_df = mean_absolute_error(y_test_df, y_pred_df)

In [None]:
mse_df = mean_squared_error(y_test_df, y_pred_df)

In [None]:
rmse_df = np.sqrt(mse_df)

In [None]:
r2_df = r2_score(y_test_df, y_pred_df)

In [None]:
print("MAE:", mae_df)
print("MSE:", mse_df)
print("RMSE:", rmse_df)
print("R2:", r2_df)

MAE: 0.2623007151852336
MSE: 0.14668636297399287
RMSE: 0.3829965573918294
R2: 0.08283260018021843


## **Validation set**

Defining the Hyperparameter space for Grid Search

In [None]:
# Candidate SVR hyperparameters for the full dataset (2 x 2 x 2 = 8 combinations;
# a smaller grid than the one used for the reduced datasets).
param_grid_df = dict(
    C=[1, 10],
    gamma=[1, 0.1],
    kernel=["rbf", "sigmoid"],
)

Fitting Grid Search

In [None]:
grid_search_df = GridSearchCV(svr, param_grid_df, cv=3, scoring="neg_mean_squared_error",
                           verbose=3, n_jobs = -1)

In [None]:
grid_search_df.fit(X_train_df, y_train_df)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Hyperparameter combination selected by the full-dataset grid search
# (its `best_params_` attribute).
best_params = grid_search_df.best_params_
print(best_params)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


## **Training with optimal Hyperparameters**

In [None]:
svr_bestP_df = SVR(C=1, gamma=0.1, kernel="rbf")

In [None]:
svr_bestP_df.fit(X_train_df, y_train_df)

In [None]:
y_pred_bestP_df = svr_bestP_df.predict(X_test_df)

Estimating the evaluation metrics

In [None]:
mae_bestP_df = mean_absolute_error(y_test_df, y_pred_bestP_df)

In [None]:
mse_bestP_df = mean_squared_error(y_test_df, y_pred_bestP_df)

In [None]:
rmse_bestP_df = np.sqrt(mse_bestP_df)

In [None]:
r2_bestP_df = r2_score(y_test_df, y_pred_bestP_df)

In [None]:
print(mae_bestP_df)
print(mse_bestP_df)
print(rmse_bestP_df)
print(r2_bestP_df)

0.2627014361863382
0.1468845253507143
0.3832551700247686
0.08159357517397481
