In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from pure_ldp.frequency_oracles import direct_encoding as DirectEncoding

In [None]:
# Load the data and separate the target column 'y' from the features
data = pd.read_csv('../data/processed/bank-filtered.csv')
Y = data['y']
X = data.drop(columns=['y'])

# Hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

# Hyperparameters of the best model
best_params = dict(learning_rate=0.2, max_depth=7, n_estimators=200)

# Apply LDP with Direct Encoding
def apply_ldp(X, epsilon):
    """Return a privatised copy of ``X``, perturbing every column under LDP.

    One ``DirectEncoding`` instance is created per column, with its domain
    size ``d`` set to the number of distinct values found in that column of
    ``X``. Every cell is then replaced by the result of calling ``privatise``
    on it.

    Args:
        X: Feature ``DataFrame`` to perturb. It is not modified.
        epsilon: Privacy budget handed to each per-column encoder.

    Returns:
        A new ``DataFrame`` with the same shape, column labels and row index
        as ``X`` that holds the privatised values.
    """
    # NOTE(review): ``DirectEncoding`` is bound by
    # ``from pure_ldp.frequency_oracles import direct_encoding``; confirm that
    # this name is a callable client class and not a module.
    # NOTE(review): ``d`` is derived from the frame passed in, so separate
    # calls (e.g. train vs. test) may use different domain sizes, and raw
    # cell values are passed to ``privatise`` unchanged -- verify that this
    # matches the input domain the encoder expects.

    # Buffer dtype is whatever numpy derives from X; privatised values are
    # cast to it on assignment.
    ldp_X = np.zeros_like(X)
    for col_pos, (_, column) in enumerate(X.items()):
        de = DirectEncoding(epsilon=epsilon, d=column.nunique())
        ldp_X[:, col_pos] = [de.privatise(value) for value in column]

    # Keep the caller's row index so the result stays aligned with the
    # matching target Series (a shuffled split does not have a 0..n-1 index).
    return pd.DataFrame(ldp_X, columns=X.columns, index=X.index)

# Metrics for the original data and for each LDP variant, keyed by label
results = {}


def evaluate_model(X_train, X_test, Y_train, Y_test, label):
    """Train an XGBoost classifier and store its test metrics under ``label``.

    The classifier is built from the module-level ``best_params``, fitted on
    the training data and scored on the test data. The metrics are written to
    the module-level ``results`` dict; nothing is returned.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: Test labels.
        label: Key under which the metrics are stored in ``results``.
    """
    classifier = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        **best_params,
    )
    classifier.fit(X_train, Y_train)
    predicted_labels = classifier.predict(X_test)
    positive_scores = classifier.predict_proba(X_test)[:, 1]

    # Rows of the confusion matrix are the true classes; row 0 is treated as
    # the negative class and row 1 as the positive class.
    matrix = confusion_matrix(Y_test, predicted_labels)
    negative_row = matrix[0]
    positive_row = matrix[1]

    # Error rates as a percentage of the actual negatives / positives
    type_i_pct = (negative_row[1] / negative_row.sum()) * 100
    type_ii_pct = (positive_row[0] / positive_row.sum()) * 100

    metrics = {
        'ROC AUC': roc_auc_score(Y_test, positive_scores),
        'Accuracy': accuracy_score(Y_test, predicted_labels),
        'Precision': precision_score(Y_test, predicted_labels),
        'Recall': recall_score(Y_test, predicted_labels),
        'F1 Score': f1_score(Y_test, predicted_labels),
        'Type I Error': type_i_pct,
        'Type II Error': type_ii_pct,
    }
    results[label] = metrics

# Baseline: evaluate the model on the original data
evaluate_model(X_train, X_test, Y_train, Y_test, 'Original')

# Evaluate the model on LDP-perturbed data, once per epsilon value
for epsilon in (0.1, 1, 5):
    X_train_ldp = apply_ldp(X_train, epsilon)
    X_test_ldp = apply_ldp(X_test, epsilon)
    evaluate_model(
        X_train_ldp,
        X_test_ldp,
        Y_train,
        Y_test,
        f'LDP (epsilon={epsilon})',
    )

# Turn the results into a DataFrame (one row per label) and show it
results_df = pd.DataFrame(results).transpose()
print(results_df)
