In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


## Load Data
Loads the fraud and credit card datasets from CSV files. Handles missing file errors gracefully by printing an error message.

In [4]:
# Load the cleaned fraud and credit-card datasets from the processed-data folder.
try:
    fraud_df = pd.read_csv('../data/processed/cleaned_fraud_data.csv')
    credit_df = pd.read_csv('../data/processed/cleaned_credit_data.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Please check the file paths.")
    # Re-raise after printing the hint: if execution continued, fraud_df / credit_df
    # would be undefined and the next cell would fail with an unrelated NameError.
    raise

## Fraud Data: Train/Test Split, SMOTE, and Scaling
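Prepares the fraud dataset for modeling: separates the `class` target from the features, one-hot encodes categorical variables, makes a stratified 70/30 train/test split, balances the training split with SMOTE (the test split is left untouched), and standardizes the 'Amount' feature with a scaler fitted on training data only.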

In [None]:
# Prepare fraud detection data
# Assuming 'class' is the target variable in fraud_df
Xf = fraud_df.drop(columns=['class'])
yf = fraud_df['class']

# One-hot encode before splitting/sampling so train and test share one column layout
Xf = pd.get_dummies(Xf)

# Stratified 70/30 split; random_state pins the split for reproducibility
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, stratify=yf, test_size=0.3, random_state=42)

# Work on explicit copies: both split frames are derived from Xf, and the column
# assignments below must not write through to (or warn about) that parent frame.
# Xf_train itself is left untouched.
Xf_train_scaled = Xf_train.copy()
Xf_test = Xf_test.copy()

scaler = StandardScaler()
# Only scale 'Amount'. The scaler is fitted on the real training rows, before any
# synthetic rows exist, and the same fitted transform is reused for the test split.
# NOTE(review): assumes fraud_df has an 'Amount' column — confirm against the cleaned CSV.
Xf_train_scaled['Amount'] = scaler.fit_transform(Xf_train_scaled[['Amount']])
Xf_test['Amount'] = scaler.transform(Xf_test[['Amount']])

# Oversample the training split only, and only after scaling, so SMOTE works on the
# scaled 'Amount' rather than its raw magnitude. The test split keeps its class mix.
# NOTE(review): every other column still reaches SMOTE unscaled — confirm that is intended.
smote = SMOTE(random_state=42)
Xf_train_resampled, yf_train_resampled = smote.fit_resample(Xf_train_scaled, yf_train)


## Credit Card Data: Train/Test Split, SMOTE, and Scaling
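Prepares the credit card dataset for modeling: separates the `Class` target from the features, makes a stratified 70/30 train/test split, balances the training split with SMOTE (the test split is left untouched), and standardizes the 'Amount' feature with a scaler fitted on training data only.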

In [None]:
# Prepare credit card fraud data
# Assuming 'Class' is the target variable in credit_df
Xc = credit_df.drop(columns='Class')
yc = credit_df['Class']

# Stratified 70/30 split; random_state pins the split for reproducibility
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, stratify=yc, test_size=0.3, random_state=42)

# Work on explicit copies: both split frames are derived from Xc, and the column
# assignments below must not write through to (or warn about) that parent frame.
# Xc_train itself is left untouched.
Xc_train_scaled = Xc_train.copy()
Xc_test = Xc_test.copy()

scaler = StandardScaler()
# Only scale 'Amount'. The scaler is fitted on the real training rows, before any
# synthetic rows exist, and the same fitted transform is reused for the test split.
Xc_train_scaled['Amount'] = scaler.fit_transform(Xc_train_scaled[['Amount']])
Xc_test['Amount'] = scaler.transform(Xc_test[['Amount']])

# smote for credit card fraud data: oversample the training split only, and only
# after scaling, so SMOTE works on the scaled 'Amount' rather than its raw magnitude.
# NOTE(review): every other column still reaches SMOTE unscaled — confirm that is intended.
smote = SMOTE(random_state=42)
Xc_train_resampled, yc_train_resampled = smote.fit_resample(Xc_train_scaled, yc_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit one logistic-regression baseline per dataset on its SMOTE-resampled training split.
# Both models share the same settings: an iteration cap of 1000 (to give the solver
# room to converge) and a fixed random_state.
LOGREG_PARAMS = {"max_iter": 1000, "random_state": 42}

log_fraud = LogisticRegression(**LOGREG_PARAMS)
log_fraud.fit(Xf_train_resampled, yf_train_resampled)

log_credit = LogisticRegression(**LOGREG_PARAMS)
log_credit.fit(Xc_train_resampled, yc_train_resampled)

In [None]:
from xgboost import XGBClassifier

# Train XGBoost models with a scale_pos_weight derived from the training labels.
# The weight is (negatives / positives) of the labels each model is actually fitted
# on, not a hard-coded guess about the raw data. Those labels come out of SMOTE, so
# fixed weights such as 10 or 50 would up-weight the positive class a second time.
# Labels are assumed to be 0 (negative) / 1 (positive) — TODO confirm.

def _negative_to_positive_ratio(labels):
    """Return (# labels equal to 0) / (# labels equal to 1) as a float.

    Parameters:
        labels: Array-like supporting element-wise ``==`` and ``.sum()``
            (e.g. a pandas Series or NumPy array).

    Raises:
        ValueError: If no label equals 1, since the ratio is then undefined.
    """
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        raise ValueError("Cannot derive scale_pos_weight: no positive (1) labels found.")
    return n_negative / n_positive


# random_state is pinned to match the other estimators in this notebook.
# NOTE(review): use_label_encoder is kept as in the original call; confirm it is
# still accepted by the installed XGBoost version.
xgb_fraud = XGBClassifier(
    scale_pos_weight=_negative_to_positive_ratio(yf_train_resampled),
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_fraud.fit(Xf_train_resampled, yf_train_resampled)

# Same recipe for the credit card data: the weight follows the fitted labels
xgb_credit = XGBClassifier(
    scale_pos_weight=_negative_to_positive_ratio(yc_train_resampled),
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_credit.fit(Xc_train_resampled, yc_train_resampled)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, f1_score

def evaluate_model(model, X_test, y_test):
    """
    Print a set of classification metrics for a fitted model on held-out data.

    The report consists of, in order: the confusion matrix, the classification
    report, the F1 score, and the area under the precision-recall curve (AUC-PR).
    Nothing is returned; all results go to standard output.

    Parameters:
        model: Trained classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True labels for the test data.
    """
    predicted_labels = model.predict(X_test)
    # The second column of predict_proba is used as the score for the PR curve.
    positive_scores = model.predict_proba(X_test)[:, 1]

    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_labels))
    print("Classification Report:\n", classification_report(y_test, predicted_labels))

    # Area under the PR curve: recall on the x-axis, precision on the y-axis.
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_test, positive_scores)
    pr_area = auc(pr_recall, pr_precision)
    f1_value = f1_score(y_test, predicted_labels)

    for metric_name, metric_value in (("F1 Score", f1_value), ("AUC-PR", pr_area)):
        print(f"{metric_name}: {metric_value:.4f}")

# Evaluate models on test data
# Each report is preceded by a header naming the model and dataset, so the four
# otherwise identical-looking outputs can be told apart when reading the cell.
evaluation_runs = [
    ("Logistic Regression - fraud data", log_fraud, Xf_test, yf_test),
    ("XGBoost - fraud data", xgb_fraud, Xf_test, yf_test),
    ("Logistic Regression - credit card data", log_credit, Xc_test, yc_test),
    ("XGBoost - credit card data", xgb_credit, Xc_test, yc_test),
]

for run_label, fitted_model, X_eval, y_eval in evaluation_runs:
    print(f"\n=== {run_label} ===")
    evaluate_model(fitted_model, X_eval, y_eval)


In [None]:
import shap
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Build a tree explainer for the fraud XGBoost model, then compute SHAP values
# for every row of the fraud test split.
explainer = shap.TreeExplainer(model=xgb_fraud)
shap_values = explainer.shap_values(X=Xf_test)

In [None]:
# Bar-style summary plot over the fraud test split, limited to 10 features.
shap.summary_plot(
    shap_values,
    features=Xf_test,
    max_display=10,
    plot_type="bar",
)

In [None]:
# Initialise SHAP's notebook JavaScript, then draw a force plot for the first
# row of the fraud test split (SHAP values and feature values of row 0).
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0],
    features=Xf_test.iloc[0],
)

In [None]:
# Decision plot for the same single observation: row 0 of the fraud test split.
shap.decision_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0],
    features=Xf_test.iloc[0],
)