<h2 style="text-align: center;"><strong>Differential Privacy Strategies for Data Analytics in the Banking Sector</strong></h2>

<div style="text-align: center;">
  Universidad de los Andes<br>
  Author: Daniela Espinosa 202022615<br>
</div>


### **Modelling**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve,precision_score, recall_score, confusion_matrix, f1_score

from imblearn.combine import SMOTEENN
from boruta import BorutaPy

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


In [2]:
data = pd.read_csv('C:/Users/danie/OneDrive/Documentos/1 UNIANDES/10 semestre/Tesis/differential-privacy-banking-sector/data/processed/bank-processed.csv', sep=',')
data.shape

(45211, 33)

In [3]:
X = data.drop(columns=['y'])
Y = data['y']

numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

scale = StandardScaler()
X[numeric_cols] = scale.fit_transform(X[numeric_cols])

X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, Y, test_size=0.2, random_state=42)

#### Identify Unbalanced Clases

In [8]:
stat = pd.DataFrame(Y.value_counts())
stat['percent'] = stat/Y.shape[0]*100
stat

Unnamed: 0,y,percent
0,39922,88.30152
1,5289,11.69848


#### Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
smote_enn = SMOTEENN(random_state=42)
X_resample, Y_resample = smote_enn.fit_resample(X, Y)


In [23]:
stat2 = pd.DataFrame(Y_resample.value_counts())
stat2['percent'] = stat2/Y_resample.shape[0]*100
stat2

Unnamed: 0,y,percent
1,39280,54.51013
0,32780,45.48987


### **Feature Selection**


#### Boruta Algorithm (wrapper)

In [26]:
# Asegurar que X_resample sea un DataFrame con nombres de columnas
if not isinstance(X_resample, pd.DataFrame):
    X_resample = pd.DataFrame(X_resample, columns=X.columns)

# Asegurar que Y_resample sea un array 1D
if not isinstance(Y_resample, pd.Series):
    Y_resample = pd.Series(Y_resample)

# Inicializar el modelo de XGBoost asegurando compatibilidad
rf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Configurar BorutaPy
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)

# Ajustar BorutaPy asegurando que Y_resample sea 1D
feat_selector.fit(X_resample.values, Y_resample.values.ravel())

# Obtener las características seleccionadas
X_filtered = X.columns[feat_selector.support_].tolist()




Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	32
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	28
Tentative: 	4
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	28
Tentative: 	4
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	28
Tentative: 	4
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	28
Tentative: 	4
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	0
Iteration: 	13 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	0
Iteration: 	14 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	0
Iteration: 	15 / 100
Confirmed: 	29
Tentative: 	3
Rejected: 	0
Iteration: 	16 / 100
Confirmed: 	31
Tentative: 	1
Rejected: 	0
I

In [27]:
len(X_filtered)

32

### **Hyperparameter Tuning**

### VERSIÓN INICIAL

In [None]:

# Definir modelos
models = { 
    'Logit': LogisticRegression(solver='saga', max_iter=10000, penalty='l2'),
    'Lasso': LogisticRegression(solver='saga', max_iter=10000),
    'Ridge': LogisticRegression(solver='saga', max_iter=10000),
    'ElasticNet': LogisticRegression(solver='saga', max_iter=10000),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, random_state=42),  # <-- Cambio aquí
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Definir hiperparámetros
param_grids = {
    'Logit': {'C': [0.01, 0.1, 1, 10, 100]},
    'Lasso': {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1']},
    'Ridge': {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2']},
    'ElasticNet': {'C': [0.01, 0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9], 'penalty': ['elasticnet']},
    'XGBoost': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2]},
    'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
}

# Validación cruzada
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_models = {}
best_scores = {}

for model_name, model in models.items():
    print(f'Training {model_name}...')
    
    grid_search = GridSearchCV(model, param_grids[model_name], cv=kf, scoring='roc_auc', n_jobs=-1)
    
    # Si es XGBoost, pasamos `eval_metric` dentro de `fit()`
    if model_name == "XGBoost":
        grid_search.fit(X_resample[X_filtered], Y_resample.ravel(), eval_metric='logloss')
    else:
        grid_search.fit(X_resample[X_filtered], Y_resample.ravel())
    
    best_models[model_name] = grid_search.best_estimator_
    best_scores[model_name] = grid_search.best_score_
    
    print(f'Best params for {model_name}: {grid_search.best_params_}')
    print(f'Best ROC AUC score for {model_name}: {grid_search.best_score_:.6f}')

# Seleccionar el mejor modelo basado en la métrica ROC AUC
best_model_name = max(best_scores, key=best_scores.get)
best_model = best_models[best_model_name]

print(f'\nBest overall model: {best_model_name} with ROC AUC: {best_scores[best_model_name]:.6f}')




Training Logit...
Best params for Logit: {'C': 100}
Best ROC AUC score for Logit: 0.967235
Training Lasso...
Best params for Lasso: {'C': 100, 'penalty': 'l1'}
Best ROC AUC score for Lasso: 0.967235
Training Ridge...
Best params for Ridge: {'C': 100, 'penalty': 'l2'}
Best ROC AUC score for Ridge: 0.967235
Training ElasticNet...
Best params for ElasticNet: {'C': 100, 'l1_ratio': 0.1, 'penalty': 'elasticnet'}
Best ROC AUC score for ElasticNet: 0.967235
Training XGBoost...




Best params for XGBoost: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
Best ROC AUC score for XGBoost: 0.997685
Training RandomForest...
Best params for RandomForest: {'max_depth': None, 'n_estimators': 200}
Best ROC AUC score for RandomForest: 0.997219

Best overall model: XGBoost with ROC AUC: 0.997685


In [20]:
# Evaluación del mejor modelo

# Predicciones en el conjunto de prueba
y_pred = best_model.predict(X_test_split[X_filtered])

# Calcular métricas adicionales
accuracy = accuracy_score(y_test_split, y_pred)
precision = precision_score(y_test_split, y_pred)
recall = recall_score(y_test_split, y_pred)
f1 = f1_score(y_test_split, y_pred)
conf_matrix = confusion_matrix(y_test_split, y_pred)
false_positive = conf_matrix[0][1]  # Error Tipo 1
false_negative = conf_matrix[1][0]  # Error Tipo 2

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Error Tipo 1 (Falsos Positivos): {false_positive}')
print(f'Error Tipo 2 (Falsos Negativos): {false_negative}')

Accuracy: 0.9661616720115006
Precision: 0.8897715988083416
Recall: 0.8212648945921174
F1 Score: 0.8541468064823642
Confusion Matrix:
[[7841  111]
 [ 195  896]]
Error Tipo 1 (Falsos Positivos): 111
Error Tipo 2 (Falsos Negativos): 195


### **VERSIÓN MEJORADA**

### Modelos clásicos

In [None]:
# Definir modelos clásicos
models = { 
    'Logit': LogisticRegression(solver='saga', max_iter=10000, penalty='l2'),
    'Lasso': LogisticRegression(solver='saga', max_iter=10000, penalty='l1'),
    'Ridge': LogisticRegression(solver='saga', max_iter=10000, penalty='l2'),
    'ElasticNet': LogisticRegression(solver='saga', max_iter=10000, penalty='elasticnet', l1_ratio=0.5),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Definir hiperparámetros
param_grids = {
    'Logit': {'C': [0.01, 0.1, 1, 10, 100]},
    'Lasso': {'C': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'C': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {'C': [0.01, 0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2]},
    'RandomForest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
}

### Entrenamiento + cross validation + metricas

In [None]:
# Diccionario para almacenar resultados
results = {}

plt.figure(figsize=(10, 8))  # Inicializar la figura para las curvas ROC

for model_name, model in models.items():
    print(f'Training {model_name}...')
    
    # Hacer búsqueda de hiperparámetros
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='roc_auc', n_jobs=-1)
    
    if model_name == "XGBoost":
        grid_search.fit(X_resample[X_filtered], Y_resample.ravel(), eval_metric='logloss')
    else:
        grid_search.fit(X_resample[X_filtered], Y_resample.ravel())

    best_model = grid_search.best_estimator_

    # Obtener predicciones finales con el mejor modelo
    y_pred = cross_val_predict(best_model, X_resample[X_filtered], Y_resample.ravel(), cv=5, n_jobs=-1)
    y_prob = cross_val_predict(best_model, X_resample[X_filtered], Y_resample.ravel(), cv=5, method="predict_proba")[:, 1]
    
    # Calcular matriz de confusión y errores tipo I y II
    conf_matrix = confusion_matrix(Y_resample, y_pred)
    actual_negatives = conf_matrix[0].sum()
    actual_positives = conf_matrix[1].sum()

    false_positive = conf_matrix[0][1]  
    false_negative = conf_matrix[1][0]  

    false_positive_pct = (false_positive / actual_negatives) * 100
    false_negative_pct = (false_negative / actual_positives) * 100

    # Guardar métricas
    results[model_name] = {
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy_score(Y_resample, y_pred),
        'Precision': precision_score(Y_resample, y_pred),
        'Recall': recall_score(Y_resample, y_pred),
        'F1 Score': f1_score(Y_resample, y_pred),
        'Type I Error': false_positive_pct,
        'Type II Error': false_negative_pct,
        'ROC AUC': roc_auc_score(Y_resample, y_prob)
    }

    # Curva ROC
    fpr, tpr, _ = roc_curve(Y_resample, y_prob)
    plt.plot(fpr, tpr, label=f"{model_name} (AUC = {results[model_name]['ROC AUC']:.2f})")

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Models ROC curves')
plt.legend()
plt.show()

# Mostrar tabla de métricas
df_results = pd.DataFrame(results).T
print(df_results)


### Red Neuronal

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np

def build_model(input_shape, learning_rate=0.001, dropout_rate=0.2, units=64, activation='relu', hidden_layers=2):
    model = Sequential()
    model.add(Dense(units, activation=activation, input_shape=(input_shape,)))
    model.add(Dropout(dropout_rate))
    
    for _ in range(hidden_layers - 1):  
        model.add(Dense(units, activation=activation))
        model.add(Dropout(dropout_rate))
    
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['AUC'])
    return model

# Parámetros de la red neuronal
params = {'learning_rate': 0.001, 'dropout_rate': 0.2, 'units': 128, 'activation': 'relu', 'hidden_layers': 3}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

X_resample_np = X_resample[X_filtered].to_numpy()
Y_resample_np = Y_resample.to_numpy()

train_losses, val_losses, train_auc, val_auc = [], [], [], []
y_true_all, y_pred_all, y_prob_all = [], [], []

for train_index, val_index in kf.split(X_resample_np):
    X_train, X_val = X_resample_np[train_index], X_resample_np[val_index]
    Y_train, Y_val = Y_resample_np[train_index], Y_resample_np[val_index]
    
    model = build_model(input_shape=X_train.shape[1], **params)

    history = model.fit(
        X_train, Y_train,
        epochs=50, batch_size=32,
        validation_data=(X_val, Y_val),
        callbacks=[early_stopping],
        verbose=1
    )

    train_losses.append(history.history['loss'])
    val_losses.append(history.history['val_loss'])
    train_auc.append(history.history['auc'])
    val_auc.append(history.history['val_auc'])

    # Guardar predicciones en el set de validación
    y_true_all.extend(Y_val)
    y_pred_all.extend((model.predict(X_val) > 0.5).astype(int).ravel())
    y_prob_all.extend(model.predict(X_val).ravel())

# Convertir listas a arrays numpy
y_true_all = np.array(y_true_all)
y_pred_all = np.array(y_pred_all)
y_prob_all = np.array(y_prob_all)

# Calcular métricas finales en validación
conf_matrix = confusion_matrix(y_true_all, y_pred_all)
actual_negatives = conf_matrix[0].sum()
actual_positives = conf_matrix[1].sum()

false_positive = conf_matrix[0][1]  
false_negative = conf_matrix[1][0]  

false_positive_pct = (false_positive / actual_negatives) * 100
false_negative_pct = (false_negative / actual_positives) * 100

roc_auc = roc_auc_score(y_true_all, y_prob_all)
accuracy = accuracy_score(y_true_all, y_pred_all)
precision = precision_score(y_true_all, y_pred_all)
recall = recall_score(y_true_all, y_pred_all)
f1 = f1_score(y_true_all, y_pred_all)

# Guardar métricas en el diccionario de resultados
results['Neural Network'] = {
    'ROC AUC': roc_auc,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'Type I Error': false_positive_pct,
    'Type II Error': false_negative_pct
}

# Graficar curva ROC
fpr, tpr, _ = roc_curve(y_true_all, y_prob_all)
plt.plot(fpr, tpr, label=f"Neural Network (AUC = {roc_auc:.2f})")

# Graficar evolución de loss y AUC
epochs = range(1, len(train_losses[0]) + 1)
plt.figure(figsize=(12, 5))

# Pérdida (Loss)
plt.subplot(1, 2, 1)
plt.plot(epochs, np.mean(train_losses, axis=0), 'b-', label='Training Loss')
plt.plot(epochs, np.mean(val_losses, axis=0), 'r-', label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Evolución de la Pérdida')
plt.legend()

# AUC
plt.subplot(1, 2, 2)
plt.plot(epochs, np.mean(train_auc, axis=0), 'b-', label='Training AUC')
plt.plot(epochs, np.mean(val_auc, axis=0), 'r-', label='Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.title('Evolución del AUC')
plt.legend()

plt.show()

# Mostrar tabla de métricas
df_results = pd.DataFrame(results).T
print(df_results)
