In [5]:
import yaml
# Read the project configuration (data and output directories) from the
# YAML parameter file and echo the parsed dictionary as the cell output.
with open("/Users/zoe/Documents/Bank-account-fraud/params.yaml") as params_file:
    params = yaml.safe_load(params_file)
params

{'data_location': '/Users/zoe/Documents/Bank-account-fraud/data',
 'output_location': '/Users/zoe/Documents/Bank-account-fraud/output'}

In [3]:
import warnings as wr

import numpy as np
import pandas as pd

# Allow up to 500 columns to be rendered when a DataFrame is displayed.
pd.options.display.max_columns = 500

# Silence every warning for the remainder of the session.
wr.filterwarnings("ignore")

In [13]:
# Load the pre-split feature and label tables from the configured data
# directory; the first CSV column is used as the index in every file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(f"{params['data_location']}/{stem}.csv", index_col=0)
    for stem in ("x_train_data", "y_train_data", "x_test_data", "y_test_data")
)

# Baseline model

In [19]:
from xgboost import XGBClassifier

In [84]:
# Imbalanced dataset with a low percentage of positive (fraud) cases.
# Use AUC as the evaluation metric.
# Assume a false negative (a fraud case misclassified as not fraud) is more costly than a false positive (a false alarm: a non-fraud case flagged as fraud).

In [27]:
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve

def baseline_model_score(X, y, model, threshold=0.5):
    """
    Fit ``model`` on (X, y) and report its AUC, recall and false positive rate.

    The model is fitted and scored on the same data, so every reported
    number is an in-sample (training) metric, not a hold-out estimate.

    Parameters:
        X: DataFrame with the training features. Columns of dtype
           "category" or "object" are integer-encoded on an internal copy;
           the caller's DataFrame is left unchanged.
        y: True binary labels (fraud_bool), encoded as 0/1.
        model: The model to be evaluated (must have 'fit' and
               'predict_proba' methods).
        threshold: Probability cut-off at or above which a row is predicted
                   as the positive class. Defaults to 0.5.

    Returns:
        auc_score: ROC AUC of the fitted model on (X, y)
        fpr: False positive rates returned by roc_curve
        tpr: True positive rates returned by roc_curve
    """
    # Encode on a copy: factorizing in place would silently rewrite the
    # caller's DataFrame and make re-running the notebook cell non-idempotent.
    X_encoded = X.copy()
    for colname in X_encoded.select_dtypes(["category", "object"]):
        X_encoded[colname], _ = X_encoded[colname].factorize()

    model.fit(X_encoded, y)

    # Predict probabilities of the positive class (fraud) using the model
    probs = model.predict_proba(X_encoded)[:, 1]

    # AUC score
    auc_score = roc_auc_score(y, probs)

    # Predicted classes at the requested probability threshold
    y_pred = (probs >= threshold).astype(int)

    # With labels pinned to [0, 1] the matrix is always 2x2 and ravel()
    # yields, in order: true negatives, false positives, false negatives,
    # true positives.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    fpr, tpr, _ = roc_curve(y, probs)

    print(f"AUC: {auc_score:.4f}")
    # tp / (tp + fn) is the true positive rate (recall), i.e. 1 - FNR.
    print(f"True Positive Rate (TPR/recall): {tp/(tp+fn)}")
    print(f"False Positives Rate (FPR): {fp/(tn+fp)}")

    return auc_score, fpr, tpr

In [28]:
# Baseline: an XGBoost classifier with default hyper-parameters, fitted and
# scored on the training split.
model = XGBClassifier()
auc_score, fpr, tpr = baseline_model_score(X=X_train, y=y_train, model=model)

AUC: 0.9537
True Positive Rate (FNR): 0.10112107623318385
False Positives Rate (FPR): 0.00023385751125044242
