In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss, ClusterCentroids, TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Read the train and test CSV files (relative to the working directory)
# into DataFrames, in that order.
train_encoded, test_encoded = (
    pd.read_csv(csv_name)
    for csv_name in ('train_encoded.csv', 'test_encoded.csv')
)

In [9]:
# Drop the 'ReimbursementDeductibleRatio' column from both train and test datasets.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column has already been
# removed is a no-op rather than a KeyError.
train_encoded = train_encoded.drop(columns='ReimbursementDeductibleRatio', errors='ignore')
test_encoded = test_encoded.drop(columns='ReimbursementDeductibleRatio', errors='ignore')

In [10]:
# Separate the label column from the predictors:
#   y -> the 'PotentialFraud' column
#   X -> every remaining column of train_encoded
y = train_encoded['PotentialFraud']
X = train_encoded.drop(columns='PotentialFraud')

In [11]:
# Resampling strategies to compare, keyed by display name.
# Insertion order is the order in which the experiment loop visits them.
# Samplers that accept a random_state are seeded with 42.
sampling_methods = dict(
    # over-samplers
    SMOTE=SMOTE(random_state=42),
    RandomOverSampler=RandomOverSampler(random_state=42),
    ADASYN=ADASYN(random_state=42),
    BorderlineSMOTE=BorderlineSMOTE(random_state=42),
    # under-samplers
    RandomUnderSampler=RandomUnderSampler(random_state=42),
    NearMiss=NearMiss(),
    ClusterCentroids=ClusterCentroids(random_state=42),
    TomekLinks=TomekLinks(),
)

In [12]:
# Classifiers to benchmark, keyed by display name.
# Every estimator with a stochastic component is seeded with 42 (the same seed
# used for the samplers and the train/validation split) so that a
# Restart & Run All reproduces the same scores.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    # probability=True enables predict_proba, which the ROC AUC computation needs.
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "NaiveBayes": GaussianNB(),
    # NOTE(review): use_label_encoder is kept as in the original; newer xgboost
    # releases may warn that it is unused - confirm against the installed version.
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "CatBoost": cb.CatBoostClassifier(verbose=0, random_seed=42)
}

In [13]:
# evaluation
def evaluate_model(X, y, sampler, model, model_name, sampler_name):
    """Fit ``model`` on a resampled training fold and score it on held-out data.

    The data is split into train/validation parts first and ``sampler`` is then
    applied to the training part only. Resampling before the split would let
    synthetic or duplicated samples (derived from what becomes training data)
    end up in the validation set and would change the validation class ratio,
    producing over-optimistic metrics.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``sampler.fit_resample``.
    y : target labels aligned with ``X``.
    sampler : object exposing ``fit_resample(X, y)``.
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` is
        used when available.
    model_name : label stored under ``"Model"`` in the result.
    sampler_name : label stored under ``"Sampler"`` in the result.

    Returns
    -------
    dict
        Keys ``"Model"``, ``"Sampler"``, ``"Accuracy"``, ``"ROC_AUC"`` (``None``
        when the model has no ``predict_proba``) and ``"Report"`` (the
        ``classification_report`` as a dict).
    """
    # Hold out 20% of the ORIGINAL data; stratify so both folds keep the
    # original class proportions.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Rebalance the training fold only - the validation fold stays untouched.
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_val)
    # Column 1 of predict_proba is taken as the positive-class score.
    y_prob = model.predict_proba(X_val)[:, 1] if hasattr(model, 'predict_proba') else None

    result = {
        "Model": model_name,
        "Sampler": sampler_name,
        "Accuracy": accuracy_score(y_val, y_pred),
        "ROC_AUC": roc_auc_score(y_val, y_prob) if y_prob is not None else None,
        "Report": classification_report(y_val, y_pred, output_dict=True)
    }
    return result

In [None]:
# Evaluate every (sampler, model) pair: samplers in the outer loop,
# models in the inner loop, one result dict collected per pair.
results = []
for sampler_name in sampling_methods:
    sampler = sampling_methods[sampler_name]
    for model_name in models:
        model = models[model_name]
        print(f"Training {model_name} with {sampler_name}")
        results.append(
            evaluate_model(X, y, sampler, model, model_name, sampler_name)
        )

# Build the summary table and display only the headline columns.
summary = pd.DataFrame(results)
summary.loc[:, ['Model', 'Sampler', 'Accuracy', 'ROC_AUC']]

Training LogisticRegression with SMOTE
