In [None]:
# 📦 Basic libraries for data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 🔄 Progress bar and system tools
from tqdm import tqdm  # For tracking progress in loops
import sys             # For system-level operations (e.g., flushing output)

# ⚖️ Resampling techniques from imbalanced-learn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, BorderlineSMOTE  # Over-sampling methods
from imblearn.under_sampling import RandomUnderSampler, NearMiss, ClusterCentroids, TomekLinks  # Under-sampling methods

# 🧪 Model preparation
from sklearn.model_selection import train_test_split       # For splitting dataset
from sklearn.preprocessing import StandardScaler           # For feature scaling
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For evaluation

# 🤖 Classification models
from sklearn.linear_model import LogisticRegression        # Logistic Regression
from sklearn.tree import DecisionTreeClassifier            # Decision Tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Tree ensembles
from sklearn.neighbors import KNeighborsClassifier         # K-Nearest Neighbors
from sklearn.naive_bayes import GaussianNB                 # Naive Bayes

# ⚡ Advanced boosting models
from xgboost import XGBClassifier                          # XGBoost
from lightgbm import LGBMClassifier                        # LightGBM
from catboost import CatBoostClassifier                    # CatBoost

In [None]:
# 📄 Read the preprocessed, already-encoded train and test tables.
# Both paths are relative, so they resolve against the notebook's working directory.
train_encoded, test_encoded = map(pd.read_csv, ('train_encoded.csv', 'test_encoded.csv'))

In [3]:
# Remove the 'ReimbursementDeductibleRatio' column from both the train and test frames.
# The result is reassigned (no inplace=True) and errors='ignore' is passed so the cell
# is idempotent: re-running it on the same kernel no longer raises KeyError once the
# column has already been removed.
train_encoded = train_encoded.drop(columns='ReimbursementDeductibleRatio', errors='ignore')
test_encoded = test_encoded.drop(columns='ReimbursementDeductibleRatio', errors='ignore')

In [None]:
# Feature / target definition: everything except 'PotentialFraud' is a predictor.
X = train_encoded.drop('PotentialFraud', axis=1)
y = train_encoded['PotentialFraud']

# Stratified 70/30 hold-out split (stratify=y keeps the class ratio in both partitions).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Fit the scaler on the training partition only, then apply that same fitted
# transform to the hold-out partition, so test statistics never leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames so feature names are retained.
# The original row index is carried over as well: y_train / y_test still hold the
# row labels produced by the split, and without index= the scaled frames would get
# a fresh 0..n-1 index that silently misaligns on any label-based join/concat.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [5]:
# One seed for every seedable resampler / estimator in this cell, so a full
# re-run reproduces the same resampled sets and fitted models.
SEED = 42

# Resampling techniques, keyed by the name used in progress and report output.
resamplers = {
    'SMOTE': SMOTE(random_state=SEED),
    'ADASYN': ADASYN(random_state=SEED),
    'RandomOverSampler': RandomOverSampler(random_state=SEED),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=SEED),
    'RandomUnderSampler': RandomUnderSampler(random_state=SEED),
    'TomekLinks': TomekLinks(),  # takes no random_state
}

# Candidate classifiers, keyed by the name used in report output.
models = {
    'LogisticRegression': LogisticRegression(solver='saga', max_iter=2000, random_state=SEED),
    'DecisionTree': DecisionTreeClassifier(random_state=SEED),
    'RandomForest': RandomForestClassifier(random_state=SEED),
    'GradientBoosting': GradientBoostingClassifier(random_state=SEED),
    'AdaBoost': AdaBoostClassifier(random_state=SEED),
    'KNN': KNeighborsClassifier(),   # takes no random_state
    'NaiveBayes': GaussianNB(),      # takes no random_state
    # random_state added so XGBoost is seeded explicitly like every other
    # seedable estimator in this dict (it was the only one left unseeded).
    'XGBoost': XGBClassifier(eval_metric='logloss', verbosity=0, random_state=SEED),
    'LightGBM': LGBMClassifier(random_state=SEED, verbose=-1),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=SEED),
}

In [6]:
# Build one resampled copy of the scaled training split per balancing technique.
# Results are stored as {technique name: (X_resampled, y_resampled)}.
balanced_data = {}
print("⚙️ Preparing balanced datasets...\n")

# The tqdm bar tracks how many techniques have finished; the status line is
# printed without a newline and flushed immediately so it shows up before the
# (potentially slow) fit_resample call starts.
for name, sampler in tqdm(
    resamplers.items(),
    desc="Balancing Techniques",
    unit="technique",
    total=len(resamplers),
):
    print(f"\r🔄 Applying: {name}...", end="", flush=True)
    X_res, y_res = sampler.fit_resample(X_train_scaled_df, y_train)
    balanced_data[name] = (X_res, y_res)

print("\n✅ Balanced datasets ready.")

⚙️ Preparing balanced datasets...



Balancing Techniques:   0%|          | 0/6 [00:00<?, ?technique/s]

🔄 Applying: SMOTE...

Balancing Techniques:  17%|█▋        | 1/6 [00:57<04:49, 57.94s/technique]

🔄 Applying: ADASYN...

Balancing Techniques:  33%|███▎      | 2/6 [05:40<12:40, 190.06s/technique]

🔄 Applying: RandomOverSampler...

Balancing Techniques:  50%|█████     | 3/6 [05:40<05:10, 103.45s/technique]

🔄 Applying: BorderlineSMOTE...

Balancing Techniques:  67%|██████▋   | 4/6 [09:41<05:15, 157.69s/technique]

🔄 Applying: RandomUnderSampler...

Balancing Techniques:  83%|████████▎ | 5/6 [09:41<01:40, 100.92s/technique]

🔄 Applying: TomekLinks...

Balancing Techniques: 100%|██████████| 6/6 [18:28<00:00, 184.67s/technique]


✅ Balanced datasets ready.





In [7]:
# Evaluation: fit every classifier on every resampled training set and score it
# on the scaled hold-out split (X_test_scaled_df / y_test), which is never resampled.
print("🔍 Starting model evaluation on all balanced datasets...\n")

# Outer loop: one pass per resampling technique; inner loop: one fit per model.
for technique_name, (X_bal, y_bal) in balanced_data.items():
    print(f"\n📊 Evaluating Models for: {technique_name}\n{'-'*50}")
    for model_name, model in models.items():
        print(f"\n➡️ Training model: {model_name}")
        # NOTE(review): the same estimator object is re-fitted for each technique;
        # this relies on fit() retraining from scratch (standard scikit-learn
        # behaviour — confirm for the third-party boosters).
        model.fit(X_bal, y_bal)
        y_pred = model.predict(X_test_scaled_df)

        # Accuracy score
        acc = accuracy_score(y_test, y_pred)
        print(f"✅ Accuracy: {acc:.4f}")

        # Per-class precision / recall / F1
        print(f"\n📝 Classification Report for {model_name}:\n")
        print(classification_report(y_test, y_pred))

        # Confusion matrix as a labelled table. confusion_matrix puts the actual
        # classes on the rows and the predicted classes on the columns, so the rows
        # are labelled "Actual ..." — "True Negative"/"True Positive" are names of
        # individual cells of the matrix, not of its rows.
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(
            cm,
            index=['Actual Negative', 'Actual Positive'],
            columns=['Predicted Negative', 'Predicted Positive'],
        )
        print(f"\n🔢 Confusion Matrix for {model_name} ({technique_name}):")
        print(cm_df)

print("\n✅ All model evaluations complete.")

🔍 Starting model evaluation on all balanced datasets...


📊 Evaluating Models for: SMOTE
--------------------------------------------------

➡️ Training model: LogisticRegression
✅ Accuracy: 0.7949

📝 Classification Report for LogisticRegression:

              precision    recall  f1-score   support

           0       0.83      0.84      0.83    103507
           1       0.73      0.73      0.73     63687

    accuracy                           0.79    167194
   macro avg       0.78      0.78      0.78    167194
weighted avg       0.79      0.79      0.79    167194


🔢 Confusion Matrix for LogisticRegression (SMOTE):
               Predicted Negative  Predicted Positive
True Negative               86568               16939
True Positive               17348               46339

➡️ Training model: DecisionTree
✅ Accuracy: 0.9862

📝 Classification Report for DecisionTree:

              precision    recall  f1-score   support

           0       0.99      0.98      0.99    103507
      