# Carga de librerías

In [1]:
import pandas as pd
import numpy as np

# Carga y limpieza de datos

In [101]:
# Load the train / validation / test splits
train_csv = pd.read_csv('data/train.csv')
validation_csv = pd.read_csv('data/validation.csv')
test_csv = pd.read_csv('data/test.csv')

# Split features from labels.
# Features are selected by name (every column except the label) instead of by
# position, so the split does not silently depend on the label being the first
# column of the CSV.
label_col = 'RiskPerformance'
feature_cols = [col for col in train_csv.columns if col != label_col]
X_train = train_csv[feature_cols]
y_train = train_csv[label_col]
# Validation and test are indexed with the training feature list, so a missing
# column in either file raises a KeyError here.
X_validation = validation_csv[feature_cols]
y_validation = validation_csv[label_col]
X_test = test_csv[feature_cols]
y_test = test_csv[label_col]

In [98]:
# Count missing values per column, added up across the three splits
null_counts_per_split = [split.isnull().sum() for split in (train_csv, validation_csv, test_csv)]
total_null_counts = null_counts_per_split[0] + null_counts_per_split[1] + null_counts_per_split[2]
print(total_null_counts)

RiskPerformance                       0
ExternalRiskEstimate                  0
MSinceOldestTradeOpen                 0
MSinceMostRecentTradeOpen             0
AverageMInFile                        0
NumSatisfactoryTrades                 0
NumTrades60Ever2DerogPubRec           0
NumTrades90Ever2DerogPubRec           0
PercentTradesNeverDelq                0
MSinceMostRecentDelq                  0
MaxDelq2PublicRecLast12M              0
MaxDelqEver                           0
NumTotalTrades                        0
NumTradesOpeninLast12M                0
PercentInstallTrades                  0
MSinceMostRecentInqexcl7days          0
NumInqLast6M                          0
NumInqLast6Mexcl7days                 0
NetFractionRevolvingBurden            0
NetFractionInstallBurden              0
NumRevolvingTradesWBalance            0
NumInstallTradesWBalance              0
NumBank2NatlTradesWHighUtilization    0
PercentTradesWBalance                 0
dtype: int64


In [102]:
# Inspect the value ranges of the training features
# NOTE(review): the describe() output reports minimums of -9 on columns such as
# MSinceOldestTradeOpen, which looks like a sentinel for missing data rather than
# a real measurement - confirm, and consider handling it before scaling.
print(X_train.describe())

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to validation and test.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler output is wrapped back into DataFrames that keep the original column
# names and row index; without them the frames get integer column labels and the
# models / explanations downstream lose the real feature names.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_validation = pd.DataFrame(scaler.transform(X_validation), columns=X_validation.columns, index=X_validation.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


       ExternalRiskEstimate  MSinceOldestTradeOpen  MSinceMostRecentTradeOpen  \
count           6459.000000            6459.000000                6459.000000   
mean              67.563090             183.783558                   8.548847   
std               21.139229             109.572157                  12.708462   
min               -9.000000              -9.000000                  -9.000000   
25%               63.000000             117.000000                   2.500000   
50%               71.000000             177.000000                   5.000000   
75%               80.000000             249.000000                  11.000000   
max               94.000000             603.000000                 227.000000   

       AverageMInFile  NumSatisfactoryTrades  NumTrades60Ever2DerogPubRec  \
count     6459.000000            6459.000000                  6459.000000   
mean        74.134231              19.403158                     0.053414   
std         38.893707              13.0

# Entrenamiento del modelo

In [103]:
# Baseline classifier: logistic regression fitted on X_train / y_train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

model = LogisticRegression()
model.fit(X_train, y_train)

# Accuracy and confusion matrix on the training split
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_confusion = confusion_matrix(y_train, y_pred_train)
print('Train accuracy:', train_accuracy)
print('Confusion matrix:\n', train_confusion)

print('='*50)

# Same report on the validation split
y_pred_validation = model.predict(X_validation)
validation_accuracy = accuracy_score(y_validation, y_pred_validation)
validation_confusion = confusion_matrix(y_validation, y_pred_validation)
print('Validation accuracy:', validation_accuracy)
print('Confusion matrix:\n', validation_confusion)

Train accuracy: 0.7199256850905713
Confusion matrix:
 [[2519  831]
 [ 978 2131]]
Validation accuracy: 0.7005
Confusion matrix:
 [[775 258]
 [341 626]]


In [95]:
# Experiment: expand the features with degree-2 polynomial terms to see whether
# the results improve.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_validation_poly = poly.transform(X_validation)

# Logistic regression on the expanded features.
# The experimental model and its predictions get their own `_poly` names so they
# do not overwrite the baseline `model` / `y_pred_*` variables: this model only
# accepts polynomial-expanded inputs, so rebinding `model` would break any later
# `model.predict(...)` call on the plain feature matrices.
model_poly = LogisticRegression(max_iter=10000)
model_poly.fit(X_train_poly, y_train)

y_pred_train_poly = model_poly.predict(X_train_poly)
print('Train accuracy:', accuracy_score(y_train, y_pred_train_poly))
print('Confusion matrix:\n', confusion_matrix(y_train, y_pred_train_poly))
print('='*50)
y_pred_validation_poly = model_poly.predict(X_validation_poly)
print('Validation accuracy:', accuracy_score(y_validation, y_pred_validation_poly))
print('Confusion matrix:\n', confusion_matrix(y_validation, y_pred_validation_poly))


Train accuracy: 0.7491871806781235
Confusion matrix:
 [[2646  704]
 [ 916 2193]]
Validation accuracy: 0.6945
Confusion matrix:
 [[770 263]
 [348 619]]


# Explicabilidad del modelo

## Explicabilidad global

In [104]:
from interpret import show
from interpret.data import ClassHistogram
from interpret.glassbox import ExplainableBoostingClassifier

# Build a class histogram of the training data and display it
class_histogram = ClassHistogram()
hist = class_histogram.explain_data(X_train, y_train, name='Train Data')
show(hist)

# Fit the interpretable model (Explainable Boosting Classifier) on the training split
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

# Display the model's global explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

## Explicabilidad local

In [105]:
# Build the local explanations of the fitted EBM for the validation split,
# passing the true labels alongside, and display them
ebm_local = ebm.explain_local(X_validation, y=y_validation, name='EBM')
show(ebm_local)