# 75.06 Organización de datos: Trabajo Práctico 3

Integrantes del grupo
- Avecilla, Ignacio - 105067
- Balmaceda, Fernando - 105525
- Singer, Joaquín - 105854
- Villegas, Tomás - 106456

In [40]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.svm import SVC
from keras.models import Sequential
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from keras.layers import Dense
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [41]:
# Turn the Google Drive sharing link into a direct-download link by pulling out
# the file id (the second-to-last path segment) and load the ';'-separated CSV.
share_link = 'https://drive.google.com/file/d/1-uJwth2nalEDQPqJZzbADhqUt-tfiypD/view?usp=sharing'
file_id = share_link.split('/')[-2]
url = f'https://drive.google.com/uc?id={file_id}'
df_pca_transform = pd.read_csv(url, sep=";")

In [42]:
# Display the loaded frame as the cell output to inspect its columns.
df_pca_transform

In [43]:
# Remove the "Unnamed: 0" column (presumably the index saved with the CSV -- confirm).
# Reassigning instead of mutating in place, together with errors="ignore", keeps
# this cell safe to re-run: the in-place version raised KeyError on a second run.
df_pca_transform = df_pca_transform.drop(columns=["Unnamed: 0"], errors="ignore")

# Entrenamiento de modelos

Dividimos el dataframe en train y test, utilizando el 30% del dataframe como test.

In [44]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition reproducible.
train, test = train_test_split(df_pca_transform, random_state=42, test_size=0.3)

# Separate the feature columns from the "target" column in each partition.
x_train, y_train = train.drop(columns="target"), train["target"]
x_test, y_test = test.drop(columns="target"), test["target"]

# 1. Random Forest

In [45]:
%%time

rf_params = {
    'bootstrap': [True, False],
    'criterion': ['gini','entropy'],
    'max_depth': [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators':[100,200,300,400,500]
}

rf_cv = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions = rf_params,
    cv = 5, 
    n_iter = 5,
    scoring = 'f1',
    verbose = 3,
    n_jobs = 1,
    random_state=42
).fit(x_train, y_train)

In [46]:
# Keep the best hyperparameter combination found by the search and display it.
best_params = rf_cv.best_params_
best_params

In [47]:
# Rebuild the forest with the hyperparameters selected by the randomized search.
# They are read straight from `rf_cv` instead of the shared `best_params` variable,
# which later cells overwrite with the XGBoost / SVM / network results; this keeps
# the cell correct when it is re-run after those cells.
rf_best_params = rf_cv.best_params_

rf = RandomForestClassifier(
    random_state=42,
    bootstrap=rf_best_params["bootstrap"],
    criterion=rf_best_params["criterion"],
    max_depth=rf_best_params["max_depth"],
    min_samples_leaf=rf_best_params["min_samples_leaf"],
    min_samples_split=rf_best_params["min_samples_split"],
    n_estimators=rf_best_params["n_estimators"],
)

rf.fit(x_train, y_train)

In [48]:
# Hard class predictions feed the per-class report (and the confusion-matrix cell).
y_pred = rf.predict(x_test)
# ROC-AUC is a ranking metric, so it must be computed from a continuous score
# (probability of the positive class); feeding it 0/1 labels collapses the curve
# to a single operating point and misreports the value.
y_score = rf.predict_proba(x_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [49]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# 2. XGBoost Classifier

In [50]:
%%time

clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
xgboost_params = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.2, 0.3, 0.4, 0.5],
    'max_depth': [5, 6, 7, 8, 9],
    'min_child_weight': [1, 2, 3, 4]
}

xgboost_cv = RandomizedSearchCV(
    clf_xgb,
    param_distributions = xgboost_params,
    cv = 5,
    n_iter = 5,
    scoring = 'f1',
    verbose = 3,
    n_jobs = 1,
    random_state=42
).fit(x_train, y_train)

In [51]:
# Keep the best hyperparameter combination found by the search and display it.
best_params = xgboost_cv.best_params_
best_params

In [52]:
# Rebuild the booster with the hyperparameters selected by the randomized search.
# They are read straight from `xgboost_cv` instead of the shared `best_params`
# variable, which other cells overwrite with the results of different models; this
# keeps the cell correct when it is re-run out of order.
xgb_best_params = xgboost_cv.best_params_

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=xgb_best_params["learning_rate"],
    max_depth=xgb_best_params["max_depth"],
    min_child_weight=xgb_best_params["min_child_weight"],
    n_estimators=xgb_best_params["n_estimators"],
)

xgb_model.fit(x_train, y_train)

In [53]:
# Hard class predictions feed the per-class report (and the confusion-matrix cell).
y_pred = xgb_model.predict(x_test)
# ROC-AUC needs a continuous score (probability of the positive class); computing
# it from 0/1 labels reduces the curve to a single point and misreports the value.
y_score = xgb_model.predict_proba(x_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [54]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# 3. SVM

Primero buscamos la mejor combinación de **kernel**, **C** y **degree**, manteniendo **max_iter** fijo en 10000.

In [55]:
%%time

model = SVC()
params = {'kernel': ['poly', 'rbf'], 'C':[0.1, 1, 10],'degree': [2,3], 'max_iter' : [10000]}
grid_search = GridSearchCV(model, params, scoring='f1', n_jobs=1, cv=5, verbose=3).fit(x_train, y_train)

In [56]:
# Keep the best combination from the first grid search and display it.
best_params = grid_search.best_params_
best_params

Luego, con base en los hiperparámetros obtenidos, buscamos el mejor **gamma**.

In [57]:
%%time

params = {'kernel': [best_params["kernel"]],
          'C' : [best_params["C"]],
          'degree' : [best_params["degree"]],
          'gamma': [0.0001, 0.01, 0.1, 'scale'],
          'max_iter' : [best_params["max_iter"]]}
grid_search = GridSearchCV(model, params, scoring='f1', n_jobs=1, cv=5, verbose=3).fit(x_train, y_train)

In [58]:
# Keep the best combination from the gamma search and display it.
best_params = grid_search.best_params_
best_params

In [59]:
# Rebuild the SVM with the hyperparameters selected by the gamma grid search.
# They are read straight from `grid_search` instead of the shared `best_params`
# variable, which the neural-network section later overwrites; this keeps the cell
# correct when it is re-run after that section.
svm_best_params = grid_search.best_params_

svm = SVC(
    C=svm_best_params["C"],
    degree=svm_best_params["degree"],
    gamma=svm_best_params["gamma"],
    kernel=svm_best_params["kernel"],
    max_iter=svm_best_params["max_iter"],
    # Probability estimates are required by the soft-voting and cascade ensembles.
    probability=True,
)

svm.fit(x_train, y_train)

In [60]:
# Hard class predictions feed the per-class report (and the confusion-matrix cell).
y_pred = svm.predict(x_test)
# ROC-AUC needs a continuous score; the signed distance to the separating
# hyperplane ranks the samples consistently with `predict`. Computing the metric
# from 0/1 labels reduces the curve to a single point and misreports the value.
y_score = svm.decision_function(x_test)

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [61]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# 4. VotingClassifier ensemble

In [62]:
# Combine the three tuned classifiers. With soft voting the ensemble predicts the
# class with the highest average predicted probability across its members.
base_models = [
    ('Random Forest', rf),
    ('XGBoost', xgb_model),
    ('SVM', svm),
]
voting = VotingClassifier(estimators=base_models, voting='soft', n_jobs=-1)

voting.fit(x_train, y_train)

In [63]:
# Hard class predictions feed the per-class report (and the confusion-matrix cell).
y_pred = voting.predict(x_test)
# ROC-AUC needs a continuous score (averaged probability of the positive class);
# computing it from 0/1 labels reduces the curve to a single point.
y_score = voting.predict_proba(x_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [64]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# 5. Red Neuronal

In [65]:
def crear_red_neuronal(optimizer="adam"):
    """Build and compile the dense network used as a binary classifier.

    The input width is taken from the notebook-level ``x_train`` frame, so that
    frame must exist before this function is called. Every layer applies an L2
    kernel penalty of 0.001.

    Args:
        optimizer: Keras optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, AUC metric).
    """
    n_features = len(x_train.columns)

    # Layer stack: 128 (relu) -> 5 (tanh) -> 2 (softmax) -> 1 (sigmoid output).
    red = Sequential([
        Dense(128, activation='relu', input_dim=n_features, kernel_regularizer=l2(0.001)),
        Dense(5, activation='tanh', kernel_regularizer=l2(0.001)),
        Dense(2, activation='softmax', kernel_regularizer=l2(0.001)),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(0.001)),
    ])

    red.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC()],
    )

    return red

In [66]:
%%time

modelo = KerasClassifier(build_fn=crear_red_neuronal, verbose=0)

optimizadores = [
    keras.optimizers.Adam(learning_rate=0.00001),
    keras.optimizers.Adam(learning_rate=0.0001),
    keras.optimizers.SGD(learning_rate=0.00001),
    keras.optimizers.SGD(learning_rate=0.0001)
]

params = {
    'epochs' : [10, 50, 100],
    'optimizer' : optimizadores
}

random_search = RandomizedSearchCV(modelo, params, scoring='roc_auc', n_jobs=1, cv=5, n_iter=5, verbose=3).fit(x_train, y_train)

In [67]:
best_params = random_search.best_params_

# Unpack the winning configuration; the optimizer's name and learning rate are
# read from its config dictionary.
epochs = best_params["epochs"]
optimizer = best_params["optimizer"]
config_optimizer = optimizer.get_config()
nombre_optimizer = config_optimizer["name"]
learning_rate = config_optimizer["learning_rate"]

print(f"mejor optimizador: {nombre_optimizer} con learning rate {learning_rate}")
print(f"mejor cantidad de epochs: {epochs}")

In [68]:
# Build the final network through the same helper used during the hyperparameter
# search, so the architecture evaluated in the search and the one trained here
# cannot drift apart (the layers were previously duplicated line for line).
red = crear_red_neuronal(optimizer=optimizer)

In [69]:
# Train the final network with the selected number of epochs. The test split is
# passed as validation data, so its metrics are reported after every epoch.
hist = red.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=200,
    validation_data=(x_test, y_test),
)

In [70]:
# The network outputs one sigmoid probability per row, shaped (n, 1); flatten it.
y_score = red.predict(x_test).ravel()
# Threshold at 0.5 to obtain the 0/1 labels used by the report and confusion matrix.
y_pred = (y_score > 0.5).astype(int)

print(classification_report(y_test, y_pred))
# ROC-AUC is computed from the probabilities: using the thresholded labels reduces
# the curve to a single operating point and misreports the value.
print("ROC-AUC Score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [71]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# 6. Ensamble en cascada

In [72]:
# Cascade: the network scores every test row first. Rows whose score is above 0.1
# are treated as "not yet decided" and handed to the next model in the chain, whose
# positive-class probability replaces the current score.
proba_red = red.predict(x_test)
y_pred = proba_red.copy()

modelos = [svm, rf, xgb_model]

for modelo in modelos:

    no_predichas = pd.Series(y_pred[:, 0] > 0.1, name='bools')
    print(no_predichas.value_counts())
    # Skip the model when nothing is pending: the estimators reject a 0-row input.
    if no_predichas.any():
        pendientes = no_predichas.values
        y_pred[pendientes] = modelo.predict_proba(x_test[pendientes])[:, 1].reshape(-1, 1)


# Rows still undecided after the whole chain fall back to the network's own score.
# The stored predictions are reused instead of running the network again, which
# also avoids calling it on an empty selection.
no_predichas = pd.Series(y_pred[:, 0] > 0.1, name='bools')
print(no_predichas.value_counts())
y_pred[no_predichas.values] = proba_red[no_predichas.values]

In [73]:
# Flatten the (n, 1) cascade scores; they are needed both for ROC-AUC and for the
# 0.5 threshold that produces the 0/1 labels used by the report and the matrix.
y_score = np.asarray(y_pred, dtype=float).ravel()
y_pred = (y_score > 0.5).astype(int)

print(classification_report(y_test, y_pred))
# ROC-AUC is computed from the scores: using the thresholded labels reduces the
# curve to a single operating point and misreports the value.
# NOTE: if this cell is re-run without re-running the cascade cell, `y_pred` already
# holds labels, so the labels stay correct but the AUC is then computed from them.
print("ROC-AUC Score:", round(metrics.roc_auc_score(y_test, y_score), 4))

In [74]:
labels = ['No Default', 'Default']
mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts: rows are the true classes, columns the predictions.
ax = sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')