# 0. Set-Up

In [None]:
from google.colab import drive

import pandas as pd
import numpy as np
import re

from matplotlib import pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc
import warnings
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score, # AUC-PR
    classification_report
)
import time
from joblib import parallel_backend
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive so the project folder is reachable from this Colab runtime.
drive.mount("/content/drive")

# Project folder on Drive. Keep the trailing slash: file names are appended
# directly to this prefix when the CSVs are read.
FILE_PATH = "/content/drive/MyDrive/BT4012 Group 16!!/"  # change the file path accordingly

Import the stratified 80/20 train and test datasets that have already undergone data cleaning, feature engineering and feature selection. Right-skewed numerical features have been log-transformed and categorical features have been one-hot encoded.

In [None]:
# Load the pre-processed training split from the project folder.
train_csv_path = f"{FILE_PATH}ohe_selected_train_df.csv"
train_df = pd.read_csv(train_csv_path, keep_default_na=True)

# Sanity check: report (rows, columns), then preview the first rows.
print("Dataset Size:", train_df.shape, sep="\n")

train_df.head(5)

In [None]:
# Load the pre-processed test split from the project folder.
test_csv_path = f"{FILE_PATH}ohe_selected_test_df.csv"
test_df = pd.read_csv(test_csv_path, keep_default_na=True)

# Sanity check: report (rows, columns), then preview the first rows.
print("Dataset Size:", test_df.shape, sep="\n")

test_df.head(5)

# 2. Preparing Numerical and Categorical Columns for Model Training

In [None]:
# Final categorical columns (after one-hot encoding).
# The first twelve entries are stand-alone columns; the remainder are the
# one-hot groups industry_*, country_*, function_*, required_education_*,
# required_experience_* and employment_type_*.
# NOTE(review): the variable name is misspelled ("catergorical"); it is kept
# as-is because other cells reference it under this name.
catergorical_cols = ['company_profile_missing',
 'has_questions',
 'description_contains_remote_keywords',
 'department_missing',
 'title_contains_action_word',
 'title_seniority_level',
 'US_listing_without_state_info',
 'benefits_contains_remote_keywords',
 'title_contains_remote_keywords',
 'telecommuting',
 'company_profile_contains_remote_keywords',
 'department_invalid_dept',
 'industry_Accounting',
 'industry_Airlines/Aviation',
 'industry_Apparel & Fashion',
 'industry_Automotive',
 'industry_Banking',
 'industry_Building Materials',
 'industry_Civic & Social Organization',
 'industry_Computer Games',
 'industry_Computer Software',
 'industry_Construction',
 'industry_Consumer Electronics',
 'industry_Consumer Goods',
 'industry_Consumer Services',
 'industry_Cosmetics',
 'industry_Design',
 'industry_E-Learning',
 'industry_Education Management',
 'industry_Electrical/Electronic Manufacturing',
 'industry_Entertainment',
 'industry_Environmental Services',
 'industry_Events Services',
 'industry_Facilities Services',
 'industry_Financial Services',
 'industry_Food & Beverages',
 'industry_Health, Wellness and Fitness',
 'industry_Hospital & Health Care',
 'industry_Hospitality',
 'industry_Human Resources',
 'industry_Information Technology and Services',
 'industry_Insurance',
 'industry_Internet',
 'industry_Legal Services',
 'industry_Leisure, Travel & Tourism',
 'industry_Logistics and Supply Chain',
 'industry_Management Consulting',
 'industry_Marketing and Advertising',
 'industry_Media Production',
 'industry_Medical Practice',
 'industry_Nonprofit Organization Management',
 'industry_Oil & Energy',
 'industry_Online Media',
 'industry_Public Relations and Communications',
 'industry_RARE CATEGORY',
 'industry_Real Estate',
 'industry_Restaurants',
 'industry_Retail',
 'industry_Staffing and Recruiting',
 'industry_Telecommunications',
 'industry_UNKNOWN',
 'industry_Warehousing',
 'country_AU',
 'country_CA',
 'country_DE',
 'country_GB',
 'country_GR',
 'country_IN',
 'country_NL',
 'country_NZ',
 'country_PH',
 'country_RARE CATEGORY',
 'country_UNKNOWN',
 'country_US',
 'function_Accounting/Auditing',
 'function_Administrative',
 'function_Art/Creative',
 'function_Business Development',
 'function_Consulting',
 'function_Customer Service',
 'function_Design',
 'function_Education',
 'function_Engineering',
 'function_Finance',
 'function_Health Care Provider',
 'function_Human Resources',
 'function_Information Technology',
 'function_Management',
 'function_Marketing',
 'function_Other',
 'function_Project Management',
 'function_RARE CATEGORY',
 'function_Sales',
 'function_UNKNOWN',
 'function_Writing/Editing',
 'required_education_Associate Degree',
 "required_education_Bachelor's Degree",
 'required_education_Certification',
 'required_education_Doctorate',
 'required_education_High School or equivalent',
 "required_education_Master's Degree",
 'required_education_Professional',
 'required_education_Some College Coursework Completed',
 'required_education_Some High School Coursework',
 'required_education_UNKNOWN',
 'required_education_Unspecified',
 'required_education_Vocational',
 'required_education_Vocational - Degree',
 'required_education_Vocational - HS Diploma',
 'required_experience_Associate',
 'required_experience_Director',
 'required_experience_Entry level',
 'required_experience_Executive',
 'required_experience_Internship',
 'required_experience_Mid-Senior level',
 'required_experience_Not Applicable',
 'required_experience_UNKNOWN',
 'employment_type_Contract',
 'employment_type_Full-time',
 'employment_type_Other',
 'employment_type_Part-time',
 'employment_type_Temporary',
 'employment_type_UNKNOWN']

# Extract the categorical columns as NumPy arrays (no scaling is applied to
# them here) and print the train shape as a sanity check.
X_train_categorical = train_df[catergorical_cols].values
X_test_categorical  = test_df[catergorical_cols].values
print("Categorical (OHE) shape:", X_train_categorical.shape)


# Final numerical columns (after log transformation of the right-skewed ones;
# those columns carry a `_log` suffix).
numerical_cols = ['requirements_special_char_ratio',
 'department_avg_word_length_log',
 'company_profile_POS_CONJ_normalised',
 'requirements_all_caps_words_log',
 'description_POS_ADJ_normalised',
 'requirements_unique_char_normalised',
 'company_profile_POS_ADJ_normalised',
 'requirements_longest_repeated_chars_length_log',
 'requirements_char_count_log',
 'description_POS_PRON_normalised',
 'description_flesch_score',
 'requirements_exclamation_normalised',
 'requirements_POS_NUM_normalised',
 'requirements_POS_PRON_normalised',
 'company_profile_POS_NOUN_normalised',
 'location_segment_count_log',
 'benefits_POS_PRON_normalised',
 'description_sentence_count_log',
 'description_POS_NOUN_normalised',
 'benefits_has_email',
 'company_profile_POS_DET_normalised',
 'description_POS_DET_normalised',
 'company_profile_avg_word_length_log',
 'requirements_POS_CONJ_normalised',
 'department_special_char_ratio',
 'description_exclamation_normalised',
 'company_profile_duplicated',
 'benefits_unique_char_normalised',
 'requirements_gunning_fog_log',
 'requirements_sentiment_compound',
 'requirements_avg_word_length',
 'description_unique_char_normalised',
 'benefits_POS_PRT_normalised',
 'description_uppercase_ratio',
 'requirements_has_url',
 'company_profile_POS_PRT_normalised',
 'description_has_email',
 'requirements_unique_word_normalised',
 'benefits_avg_sentence_length_log',
 'company_profile_sentiment_compound',
 'description_POS_VERB_normalised',
 'title_avg_word_length_log',
 'description_POS_ADP_normalised',
 'benefits_spelling_error_rate',
 'requirements_duplicated',
 'requirements_has_email',
 'benefits_POS_X_normalised',
 'benefits_POS_ADJ_normalised',
 'company_profile_unique_char_normalised',
 'title_unique_word_normalised',
 'title_unique_char_normalised',
 'description_sentiment_neg',
 'benefits_gunning_fog_log',
 'benefits_question_normalised',
 'requirements_POS_NOUN_normalised',
 'description_special_char_ratio',
 'description_sentiment_pos',
 'description_question_normalised',
 'benefits_POS_NOUN_normalised',
 'company_profile_question_count_log',
 'company_profile_avg_sentence_length',
 'description_avg_sentence_length_log',
 'benefits_POS_CONJ_normalised',
 'company_profile_POS_PRON_normalised',
 'description_sentiment_compound',
 'benefits_POS_ADP_normalised',
 'benefits_POS_DET_normalised',
 'company_profile_char_count_log',
 'requirements_spelling_error_rate',
 'title_avg_sentence_length_log',
 'company_profile_flesch_score',
 'company_profile_sentiment_pos',
 'benefits_exclamation_normalised',
 'description_objective_score',
 'description_digit_ratio',
 'description_POS_ADV_normalised',
 'requirements_avg_sentence_length_log',
 'company_profile_has_phone',
 'benefits_longest_repeated_chars_length_log',
 'benefits_has_url',
 'company_profile_unique_word_normalised',
 'company_profile_sentence_count_log',
 'requirements_POS_ADJ_normalised',
 'requirements_POS_VERB_normalised',
 'title_char_count_log',
 'benefits_sentiment_compound',
 'company_profile_exclamation_normalised',
 'benefits_sentiment_pos',
 'benefits_char_count_log',
 'benefits_avg_word_length_log',
 'description_all_caps_words_normalised',
 'requirements_flesch_score',
 'department_unique_char_normalised',
 'company_profile_gunning_fog',
 'title_digit_ratio',
 'benefits_unique_word_normalised',
 'company_profile_POS_NUM_normalised',
 'company_profile_uppercase_ratio',
 'benefits_special_char_ratio',
 'benefits_digit_ratio',
 'requirements_sentiment_pos',
 'company_profile_special_char_ratio',
 'title_special_char_ratio',
 'description_duplicated',
 'company_profile_objective_score',
 'company_profile_digit_ratio',
 'description_spelling_error_rate',
 'benefits_objective_score',
 'company_profile_POS_ADP_normalised',
 'benefits_POS_NUM_normalised',
 'description_all_caps_words_log',
 'benefits_uppercase_ratio',
 'requirements_uppercase_ratio',
 'requirements_sentiment_neg',
 'requirements_POS_X_normalised',
 'benefits_all_caps_words_normalised',
 'description_unique_word_normalised',
 'requirements_digit_ratio',
 'benefits_sentiment_neg',
 'benefits_flesch_score',
 'description_POS_NUM_normalised',
 'description_gunning_fog_log',
 'company_profile_spelling_error_rate',
 'description_avg_word_length_log',
 'requirements_question_count_log',
 'benefits_POS_VERB_normalised',
 'requirements_POS_ADP_normalised',
 'company_profile_sentiment_neg',
 'requirements_objective_score',
 'requirements_POS_PRT_normalised',
 'benefits_POS_ADV_normalised',
 'benefits_sentence_count_log',
 'description_POS_CONJ_normalised',
 'description_POS_PRT_normalised',
 'requirements_sentence_count_log',
 'description_has_phone',
 'description_POS_X_normalised',
 'company_profile_POS_X_normalised',
 'company_profile_all_caps_words_normalised',
 'company_profile_longest_repeated_chars_length_log',
 'company_profile_POS_ADV_normalised',
 'title_longest_repeated_chars_length',
 'requirements_POS_ADV_normalised',
 'requirements_POS_DET_normalised',
 'requirements_all_caps_words_normalised',
 'company_profile_POS_VERB_normalised',
 'description_longest_repeated_chars_length_log']

# Standardise only the numerical columns; the categorical block is left untouched.
num_scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split so no test statistics leak into training.
X_train_numerical = num_scaler.fit_transform(train_df[numerical_cols])
X_test_numerical  = num_scaler.transform(test_df[numerical_cols])

# Sanity check on the scaled training matrix.
print("Scaled numeric shape:", X_train_numerical.shape)

# In total there are 276 columns: 123 categorical columns, 147 numerical columns,
# 4 text columns, 1 id column ('job_id') and 1 target column ('fraudulent').

# 3. Creating Text Embeddings with BERT

In [None]:
# Raw (non-encoded) free-text columns that are merged into one document per row.
TEXT_COLS = [
    "company_profile_noencoded",
    "description_noencoded",
    "requirements_noencoded",
    "benefits_noencoded",
]


def concat_text_columns(df, text_cols=TEXT_COLS):
    """Join the given text columns of each row into a single string.

    Missing values become empty strings first, so every row yields exactly
    ``len(text_cols)`` segments separated by ``" [SEP] "``.

    Args:
        df: DataFrame containing every column named in ``text_cols``.
        text_cols: Column names to join, in output order.

    Returns:
        A Series (same index as ``df``) holding one joined string per row.
    """
    separator = " [SEP] "
    filled = df[text_cols].fillna("")
    return filled.agg(separator.join, axis=1)

# Add a "full_text" column (all text fields joined) to both splits.
for split_df in (train_df, test_df):
    split_df["full_text"] = concat_text_columns(split_df)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)

# Put the encoder in evaluation mode; kept as the last expression so the
# notebook echoes the module.
model.eval()

In [None]:
def compute_bert_embeddings(texts, tokenizer, model, device, batch_size, max_length):
    """Embed each text as the attention-masked mean of the last hidden states.

    Args:
        texts: Sliceable sequence of strings; processed in consecutive batches.
        tokenizer: Tokenizer called with truncation and padding enabled.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the encoded batch is moved to before the forward pass.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        NumPy array with one pooled vector per text, stacked in input order.
    """
    started_at = time.time()
    pooled_batches = []

    # tqdm wraps the batch offsets to display a progress bar.
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Embedding Batches"):
        chunk = texts[start:start + batch_size]

        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state

            # Broadcast the 0/1 attention mask over the hidden dimension so
            # padded positions contribute nothing to the sum.
            mask = encoded["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()

            masked_sum = (hidden * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled = masked_sum / token_counts

        pooled_batches.append(pooled.cpu().numpy())

    elapsed = time.time() - started_at
    print(f"\nTotal embedding time: {elapsed:.2f} seconds "
          f"({elapsed/60:.2f} minutes)")

    return np.vstack(pooled_batches)

# Embed the joined text of both splits with identical settings.
embed_kwargs = {"device": device, "batch_size": 16, "max_length": 512}

texts_train = train_df["full_text"].fillna("").tolist()
train_embeddings = compute_bert_embeddings(texts_train, tokenizer, model, **embed_kwargs)

texts_test = test_df["full_text"].fillna("").tolist()
test_embeddings = compute_bert_embeddings(texts_test, tokenizer, model, **embed_kwargs)

print("Train embeddings shape:", train_embeddings.shape)
print("Test embeddings shape:",  test_embeddings.shape)

PCA to reduce dimensionality

In [None]:
# Keep as many components as needed to explain 90% of the variance.
# The projection is learned from the TRAIN embeddings only.
pca = PCA(n_components=0.90, random_state=42).fit(train_embeddings)

# Project both splits with the fitted components.
X_train_embeddings, X_test_embeddings = (
    pca.transform(matrix) for matrix in (train_embeddings, test_embeddings)
)

print("Num components:", pca.n_components_)
print("PCA train shape:", X_train_embeddings.shape)
print("PCA test shape :", X_test_embeddings.shape)
print("Explained variance ratio sum:", pca.explained_variance_ratio_.sum())

# 4. Concatenating the final features with text embeddings

In [None]:
# Column order of the final matrices: scaled numerics, categoricals, PCA text embeddings.
train_parts = (X_train_numerical, X_train_categorical, X_train_embeddings)
test_parts = (X_test_numerical, X_test_categorical, X_test_embeddings)

X_train_final = np.hstack(train_parts)
X_test_final = np.hstack(test_parts)

# Targets come straight from the 'fraudulent' column of each split.
y_train = train_df["fraudulent"].to_numpy()
y_test = test_df["fraudulent"].to_numpy()

# Shape report for all four arrays.
for label, array in (
    ("X_train_final:", X_train_final),
    ("y_train:", y_train),
    ("X_test_final:", X_test_final),
    ("y_test:", y_test),
):
    print(label, array.shape)

# 5. Preparing for Model Training

Stratified Cross Validation Helper Function

In [None]:
def run_cv_grid(
    estimator,
    X,
    y,
    param_grid,
    *,
    model_name=None,
    n_splits=5,
    scoring="average_precision",
    n_jobs=-1,
    refit=True, # Fit best model at the end
    verbose=2,
):
    """Run a stratified, shuffled K-fold grid search and report the outcome.

    Args:
        estimator: Unfitted estimator handed to ``GridSearchCV``.
        X: Training features.
        y: Training labels (used for stratification).
        param_grid: Grid specification accepted by ``ParameterGrid``.
        model_name: Label used in the printed messages; defaults to the
            estimator's class name.
        n_splits: Number of stratified folds.
        scoring: Scoring passed to ``GridSearchCV``.
        n_jobs: Parallel jobs passed to ``GridSearchCV``.
        refit: Whether the best configuration is refit at the end.
        verbose: Verbosity passed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    label = model_name or estimator.__class__.__name__

    # Announce the amount of work before starting.
    n_candidates = len(ParameterGrid(param_grid))
    print(f"\n {label}: {n_candidates} candidates × {n_splits} folds = {n_candidates * n_splits} fits ")
    print(f"Scoring = {scoring}\n")

    # Fixed seed so every model is compared on the same fold assignment.
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=folds,
        scoring=scoring,
        n_jobs=n_jobs,
        refit=refit,
        verbose=verbose,
        return_train_score=False,
        error_score="raise",  # surface failing fits instead of scoring them as NaN
    )

    started_at = time.time()
    # Fits run on the joblib threading backend.
    with parallel_backend("threading"):
        search.fit(X, y)
    elapsed_min = (time.time() - started_at) / 60.0

    print(f"\n[{label}] total CV time: {elapsed_min:.2f} min")
    print(f"[{label}] best {scoring}: {search.best_score_:.5f}")
    print(f"[{label}] best params: {search.best_params_}")

    return search

Weights to Handle Class Imbalance

In [None]:
# Class-imbalance weight, computed once from the full training labels
# (not per fold) and reused by every base model.
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()

# max(..., 1.0) avoids a division by zero when there are no positives.
scale_pos = float(neg) / max(float(pos), 1.0)
print(f"scale_pos_weight (neg/pos) = {scale_pos:.3f}")

# 6. Base ML Models

## a. Logistic Regression

In [None]:
# L2-regularised logistic regression; the positive class is up-weighted by neg/pos.
lr_settings = {
    "penalty": "l2",
    "max_iter": 5000,
    "solver": "saga",
    "n_jobs": 1,
    "class_weight": {0: 1.0, 1: scale_pos},
    "random_state": 42,
}
base_lr = LogisticRegression(**lr_settings)

# Only the inverse regularisation strength is tuned.
param_grid_lr = {"C": [0.5, 1.0, 2.0]}

grid_lr = run_cv_grid(
    base_lr,
    X_train_final,
    y_train,
    param_grid_lr,
    model_name="LogisticRegression",
    scoring="average_precision",
)

best_lr = grid_lr.best_estimator_

In [None]:
# Score the test set with the tuned logistic regression.
y_test_proba = best_lr.predict_proba(X_test_final)[:, 1]  # second predict_proba column
y_test_pred = (y_test_proba >= 0.5).astype(int)           # hard labels at the 0.5 cut-off

lr_report = [
    "\n Logistic Regression: Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test, y_test_proba):.4f}",
    f"AUC-PR  : {average_precision_score(y_test, y_test_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}",
    f"Precision: {precision_score(y_test, y_test_pred):.4f}",
    f"Recall  : {recall_score(y_test, y_test_pred):.4f}",
    f"F1      : {f1_score(y_test, y_test_pred):.4f}",
]
print("\n".join(lr_report))


## b. Support Vector Machine

In [None]:
# Linear-kernel SVM with probability estimates enabled so predict_proba is
# available; the positive class is up-weighted by neg/pos.
svm_settings = {
    "kernel": "linear",
    "probability": True,
    "class_weight": {0: 1.0, 1: scale_pos},
    "random_state": 42,
}
base_svm = SVC(**svm_settings)

# Only the regularisation parameter C is tuned.
param_grid_svm = {"C": [1.0, 2.0]}

grid_svm = run_cv_grid(
    base_svm,
    X_train_final,
    y_train,
    param_grid_svm,
    model_name="LinearSVM",
    scoring="average_precision",
)

best_svm = grid_svm.best_estimator_

In [None]:
# Score the test set with the tuned SVM.
y_test_proba = best_svm.predict_proba(X_test_final)[:, 1]  # second predict_proba column
y_test_pred = (y_test_proba >= 0.5).astype(int)            # hard labels at the 0.5 cut-off

svm_report = [
    "\n SVM: Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test, y_test_proba):.4f}",
    f"AUC-PR  : {average_precision_score(y_test, y_test_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}",
    f"Precision: {precision_score(y_test, y_test_pred):.4f}",
    f"Recall  : {recall_score(y_test, y_test_pred):.4f}",
    f"F1      : {f1_score(y_test, y_test_pred):.4f}",
]
print("\n".join(svm_report))


## c. Random Forest (Bagging)

In [None]:
# Random forest baseline; the positive class is up-weighted by neg/pos.
rf_settings = {
    "n_estimators": 300,
    "n_jobs": -1,
    "class_weight": {0: 1.0, 1: scale_pos},
    "random_state": 42,
}
base_rf = RandomForestClassifier(**rf_settings)

# 2 x 2 x 2 grid over forest size, depth limit and minimum leaf size.
param_grid_rf = {
    "n_estimators": [300, 500],
    "max_depth": [None, 10],
    "min_samples_leaf": [1, 2],
}

grid_rf = run_cv_grid(
    base_rf,
    X_train_final,
    y_train,
    param_grid_rf,
    model_name="RandomForest",
    scoring="average_precision",
)

best_rf = grid_rf.best_estimator_

In [None]:
# Score the test set with the tuned random forest.
y_test_proba = best_rf.predict_proba(X_test_final)[:, 1]  # second predict_proba column
y_test_pred = (y_test_proba >= 0.5).astype(int)           # hard labels at the 0.5 cut-off

rf_report = [
    "\n Random Forest: Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test, y_test_proba):.4f}",
    f"AUC-PR  : {average_precision_score(y_test, y_test_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}",
    f"Precision: {precision_score(y_test, y_test_pred):.4f}",
    f"Recall  : {recall_score(y_test, y_test_pred):.4f}",
    f"F1      : {f1_score(y_test, y_test_pred):.4f}",
]
print("\n".join(rf_report))


## d. LightGBM (Boosting)

In [None]:
# Gradient-boosted trees on the GPU; imbalance is handled via scale_pos_weight.
lgb_settings = {
    "device": "gpu",
    "scale_pos_weight": scale_pos,
    "n_jobs": -1,
    "random_state": 42,
}
base_lgb = LGBMClassifier(**lgb_settings)

# 2 x 2 x 2 grid over boosting rounds, learning rate and leaves per tree.
param_grid_lgb = {
    "n_estimators": [200, 500],
    "learning_rate": [0.05, 0.1],
    "num_leaves": [31, 64],
}

grid_lgb = run_cv_grid(
    base_lgb,
    X_train_final,
    y_train,
    param_grid_lgb,
    model_name="LightGBM",
    scoring="average_precision",
)

best_lgb = grid_lgb.best_estimator_


In [None]:
# Score the test set with the tuned LightGBM model.
y_test_proba = best_lgb.predict_proba(X_test_final)[:, 1]  # second predict_proba column
y_test_pred = (y_test_proba >= 0.5).astype(int)            # hard labels at the 0.5 cut-off

lgb_report = [
    "\n LightGBM: Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test, y_test_proba):.4f}",
    f"AUC-PR  : {average_precision_score(y_test, y_test_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}",
    f"Precision: {precision_score(y_test, y_test_pred):.4f}",
    f"Recall  : {recall_score(y_test, y_test_pred):.4f}",
    f"F1      : {f1_score(y_test, y_test_pred):.4f}",
]
print("\n".join(lgb_report))

## e. PyTorch MLP

In [None]:
# Prefer the GPU when torch can see one.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Training tensors are not moved here; train_one_epoch moves each batch itself.
X_train_t = torch.tensor(X_train_final, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)

# The test features are evaluated in one pass, so they go to the device up front.
X_test_t = torch.tensor(X_test_final, dtype=torch.float32).to(device)
y_test_np = y_test  # keep numpy for metrics

# Same neg/pos ratio used by the other models, as a 1-element tensor for BCEWithLogitsLoss.
pos_weight = torch.tensor([scale_pos], dtype=torch.float32).to(device)


In [None]:
class FraudMLP(nn.Module):
    """Feed-forward binary classifier that emits one raw logit per input row.

    Each hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    maps to a single output. No sigmoid is applied here.

    Args:
        input_dim: Number of input features.
        hidden_sizes: Width of each hidden layer, in order.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_sizes=(256, 128), dropout=0.3):
        super().__init__()
        # Consecutive pairs of this list give (in_features, out_features) per hidden layer.
        widths = [input_dim, *hidden_sizes]

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        stack.append(nn.Linear(widths[-1], 1))  # binary output (logits)

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits with the trailing size-1 dimension removed."""
        logits = self.net(x)
        return logits.squeeze(1)


In [None]:
def train_one_epoch(model, dataloader, optimizer, loss_fn):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the module-level ``device`` before the forward pass.

    Args:
        model: Module being trained; switched to training mode.
        dataloader: Iterable yielding ``(features, targets)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        loss_fn: Callable mapping ``(logits, targets)`` to a scalar loss.

    Returns:
        The per-batch losses averaged over the number of batches.
    """
    model.train()
    batch_losses = []

    for features, targets in dataloader:
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)


def eval_average_precision(model, X_val, y_val):
    """Return the average precision (AUC-PR) of ``model`` on a validation set.

    Args:
        model: Module producing logits; switched to evaluation mode.
        X_val: Validation features as a tensor (moved to ``device`` here).
        y_val: Validation labels in a form accepted by
            ``average_precision_score``.
    """
    model.eval()
    features = X_val.to(device)
    with torch.no_grad():
        # Sigmoid turns the logits into scores in (0, 1).
        scores = torch.sigmoid(model(features))
    return average_precision_score(y_val, scores.cpu().numpy())


In [None]:
def grid_search_pytorch_mlp(
    X_train_np,
    y_train_np,
    param_grid,
    n_splits=5,
    max_epochs=20,
    batch_size=128,
    patience=3,
):
    """Grid-search ``FraudMLP`` hyperparameters with stratified K-fold CV.

    For every candidate a fresh model is trained on each fold, with early
    stopping on the training loss, and scored by average precision on the
    held-out fold. The candidate with the highest mean score wins.

    Args:
        X_train_np: Training features as a numpy array.
        y_train_np: Training labels as a numpy array.
        param_grid: dict like {"hidden_sizes":[...], "dropout":[...], "lr":[...]}
        n_splits: Number of stratified folds.
        max_epochs: Upper bound on epochs per fold.
        batch_size: Mini-batch size of the training loader.
        patience: Epochs without a training-loss improvement of more than
            1e-4 before training stops.

    Returns:
        Tuple ``(best_params, best_score)``: the winning parameter dict and
        its mean CV average precision.
    """
    input_dim = X_train_np.shape[1]
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    all_params = list(ParameterGrid(param_grid))
    print(f"\nPyTorch MLP: {len(all_params)} candidates × {n_splits} folds = {len(all_params)*n_splits} fits")
    print("Scoring = average_precision (AUC-PR)\n")

    best_score = -np.inf
    best_params = None

    for i, params in enumerate(all_params, start=1):
        print(f"\n Candidate {i}/{len(all_params)}: {params}")
        fold_scores = []

        for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_np, y_train_np), start=1):
            # Split fold data
            X_tr = torch.tensor(X_train_np[tr_idx], dtype=torch.float32)
            y_tr = torch.tensor(y_train_np[tr_idx], dtype=torch.float32)
            X_val = torch.tensor(X_train_np[val_idx], dtype=torch.float32)
            y_val = y_train_np[val_idx]  # keep as numpy for metrics

            # Dataloader
            train_ds = TensorDataset(X_tr, y_tr)
            train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

            # Fresh model for each fold
            model = FraudMLP(
                input_dim=input_dim,
                hidden_sizes=params["hidden_sizes"],
                dropout=params["dropout"],
            ).to(device)

            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=params["lr"],
            )
            loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

            best_fold_loss = float("inf")
            # Reset per fold so a checkpoint from a previous fold or candidate
            # (possibly a different architecture) can never be loaded here.
            best_state = None
            patience_counter = 0

            for _ in range(max_epochs):
                train_loss = train_one_epoch(model, train_dl, optimizer, loss_fn)

                # Simple early stopping on training loss
                if train_loss < best_fold_loss - 1e-4:
                    best_fold_loss = train_loss
                    patience_counter = 0
                    # state_dict() returns references to the live parameter
                    # tensors, which later optimizer steps overwrite in place.
                    # Clone them to obtain a real snapshot of the best epoch.
                    best_state = {
                        key: tensor.detach().clone()
                        for key, tensor in model.state_dict().items()
                    }
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        break

            # Load best weights for evaluation (skipped if no epoch ever
            # improved, e.g. a non-finite loss or max_epochs == 0).
            if best_state is not None:
                model.load_state_dict(best_state)

            ap = eval_average_precision(model, X_val, y_val)
            fold_scores.append(ap)
            print(f"  Fold {fold}: AUC-PR = {ap:.4f}")

        mean_ap = float(np.mean(fold_scores))
        print(f"--> Mean AUC-PR for {params}: {mean_ap:.4f}")

        if mean_ap > best_score:
            best_score = mean_ap
            best_params = params

    print("\n Best PyTorch MLP hyperparameters")
    print("Best params:", best_params)
    print(f"Best mean CV AUC-PR: {best_score:.4f}")

    return best_params, best_score


In [None]:
# 2 architectures x 2 dropout rates x 2 learning rates = 8 candidates.
mlp_param_grid = dict(
    hidden_sizes=[(256, 128), (256, 128, 64)],
    dropout=[0.3, 0.5],
    lr=[1e-3, 5e-4],
)

# Cross-validation and training settings for the search.
mlp_search_settings = dict(n_splits=5, max_epochs=20, batch_size=128, patience=3)

mlp_search_result = grid_search_pytorch_mlp(
    X_train_final,
    y_train,
    param_grid=mlp_param_grid,
    **mlp_search_settings,
)
best_params_mlp, best_cv_ap = mlp_search_result


In [None]:
# Final model using best hyperparameters
input_dim = X_train_final.shape[1]

final_mlp = FraudMLP(
    input_dim=input_dim,
    hidden_sizes=best_params_mlp["hidden_sizes"],
    dropout=best_params_mlp["dropout"],
).to(device)

optimizer = torch.optim.Adam(
    final_mlp.parameters(),
    lr=best_params_mlp["lr"],
)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Full-train dataset and loader
full_train_ds = TensorDataset(X_train_t, y_train_t)
full_train_dl = DataLoader(full_train_ds, batch_size=128, shuffle=True)

epochs = 30
best_loss = float("inf")
best_state = None  # snapshot of the weights at the lowest training loss so far
patience = 5
patience_counter = 0

for epoch in range(epochs):
    train_loss = train_one_epoch(final_mlp, full_train_dl, optimizer, loss_fn)
    print(f"Epoch {epoch+1}/{epochs} — loss = {train_loss:.4f}")

    if train_loss < best_loss - 1e-4:
        best_loss = train_loss
        patience_counter = 0
        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # really holds the best epoch's weights.
        best_state = {
            key: tensor.detach().clone()
            for key, tensor in final_mlp.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping on full train.")
            break

# Restore the best snapshot (skipped if no epoch ever improved the loss).
if best_state is not None:
    final_mlp.load_state_dict(best_state)

In [None]:
# Score the test set with the final MLP (evaluation mode, no gradient tracking).
final_mlp.eval()
with torch.no_grad():
    logits_test = final_mlp(X_test_t)
    proba_test = torch.sigmoid(logits_test).cpu().numpy()

# Hard labels at the 0.5 cut-off.
y_test_pred = (proba_test >= 0.5).astype(int)

mlp_report = [
    "\nPyTorch MLP: Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test_np, proba_test):.4f}",
    f"AUC-PR  : {average_precision_score(y_test_np, proba_test):.4f}",
    f"Accuracy: {accuracy_score(y_test_np, y_test_pred):.4f}",
    f"Precision: {precision_score(y_test_np, y_test_pred):.4f}",
    f"Recall  : {recall_score(y_test_np, y_test_pred):.4f}",
    f"F1      : {f1_score(y_test_np, y_test_pred):.4f}",
]
print("\n".join(mlp_report))


# 7. Ensemble

In [None]:
# Hyperparameters used for the MLP inside the stacked ensemble, hard-coded so
# this section can run without repeating the MLP grid search.
# NOTE(review): this is a manual copy of a grid-search result; if the search is
# re-run and picks a different candidate (`best_params_mlp`), update this dict.
best_mlp_params = {'dropout': 0.5, 'hidden_sizes': (256, 128, 64), 'lr': 0.0005} # Taken from MLP grid search

In [None]:
def get_cv_pred_probas_sklearn(estimator, X, y, n_splits=5):
    """Collect out-of-fold positive-class probabilities for every training row.

    The data is split with a shuffled, stratified ``n_splits``-fold scheme.
    Each row's probability comes from a clone of ``estimator`` fitted on the
    folds that do not contain that row.

    Args:
        estimator: Estimator supporting ``fit`` and ``predict_proba``; it is
            cloned per fold and never fitted itself.
        X: Feature matrix (converted with ``np.asarray``).
        y: Labels (converted with ``np.asarray``).
        n_splits: Number of folds.

    Returns:
        1-D float array of length ``len(y)`` with the out-of-fold probabilities.
    """
    features = np.asarray(X)
    labels = np.asarray(y)
    tag = estimator.__class__.__name__

    oof_proba = np.zeros(len(labels), dtype=float)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold_no, (fit_rows, held_out_rows) in enumerate(splitter.split(features, labels), start=1):
        print(f"[{tag}] Fold {fold_no}")

        fold_model = clone(estimator)  # same hyperparams, fresh fit
        fold_model.fit(features[fit_rows], labels[fit_rows])

        held_out_proba = fold_model.predict_proba(features[held_out_rows])
        oof_proba[held_out_rows] = held_out_proba[:, 1]

    return oof_proba


In [None]:
def get_cv_pred_probas_mlp(
    X,
    y,
    n_splits=5,
    hidden_sizes=(256, 128, 64),
    dropout=0.3,
    lr=1e-3,
    batch_size=128,
    max_epochs=20,
    patience=3,
):
    """Collect out-of-fold ``FraudMLP`` probabilities for every training row.

    The data is split with a shuffled, stratified ``n_splits``-fold scheme.
    For each fold a fresh ``FraudMLP`` is trained on the remaining folds, with
    early stopping on the training loss, and used to score the held-out rows.

    Args:
        X: Feature matrix (converted with ``np.asarray``).
        y: Labels (converted with ``np.asarray``).
        n_splits: Number of folds.
        hidden_sizes: Hidden-layer widths passed to ``FraudMLP``.
        dropout: Dropout probability passed to ``FraudMLP``.
        lr: Adam learning rate.
        batch_size: Mini-batch size of the training loader.
        max_epochs: Upper bound on epochs per fold.
        patience: Epochs without a training-loss improvement of more than
            1e-4 before training stops.

    Returns:
        1-D float array of length ``len(y)`` with the out-of-fold probabilities.
    """
    X_np = np.asarray(X)
    y_np = np.asarray(y)
    input_dim = X_np.shape[1]

    preds = np.zeros(len(y_np), dtype=float)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_np, y_np), start=1):
        print(f"[MLP] Fold {fold}")

        X_tr = torch.tensor(X_np[tr_idx], dtype=torch.float32)
        y_tr = torch.tensor(y_np[tr_idx], dtype=torch.float32)
        X_val = torch.tensor(X_np[val_idx], dtype=torch.float32)

        train_ds = TensorDataset(X_tr, y_tr)
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

        model = FraudMLP(
            input_dim=input_dim,
            hidden_sizes=hidden_sizes,
            dropout=dropout,
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        best_loss = float("inf")
        best_state = None
        patience_counter = 0

        for _ in range(max_epochs):
            train_loss = train_one_epoch(model, train_dl, optimizer, loss_fn)

            if train_loss < best_loss - 1e-4:
                best_loss = train_loss
                # state_dict() returns references to the live parameter
                # tensors, which later optimizer steps overwrite in place.
                # Clone them to obtain a real snapshot of the best epoch.
                best_state = {
                    key: tensor.detach().clone()
                    for key, tensor in model.state_dict().items()
                }
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break

        # Restore the best snapshot before scoring the held-out fold.
        if best_state is not None:
            model.load_state_dict(best_state)

        model.eval()
        with torch.no_grad():
            logits_val = model(X_val.to(device))
            proba_val = torch.sigmoid(logits_val).cpu().numpy().ravel()

        preds[val_idx] = proba_val

    return preds


In [None]:
# Out-of-fold predictions on the training set for each base model.
best_lgb.set_params(verbose=-1) # Silence LightGBM warnings

cv_pred_lgb, cv_pred_rf = (
    get_cv_pred_probas_sklearn(base_model, X_train_final, y_train, n_splits=5)
    for base_model in (best_lgb, best_rf)
)

# The MLP reuses the hyperparameters stored in best_mlp_params.
mlp_cv_kwargs = {key: best_mlp_params[key] for key in ("hidden_sizes", "dropout", "lr")}
cv_pred_mlp = get_cv_pred_probas_mlp(X_train_final, y_train, n_splits=5, **mlp_cv_kwargs)

# One column per base model: these are the meta-model's input features.
Z_train = np.column_stack([cv_pred_lgb, cv_pred_rf, cv_pred_mlp])
print("Z_train shape:", Z_train.shape)


In [None]:
# Meta-model: logistic regression fitted on the stacked out-of-fold probabilities.
meta_lr_settings = {
    "penalty": "l2",
    "solver": "lbfgs",
    "max_iter": 5000,
    "class_weight": {0: 1.0, 1: scale_pos},  # reuse the imbalance ratio
    "random_state": 42,
}
meta_lr = LogisticRegression(**meta_lr_settings)

meta_lr.fit(Z_train, y_train)

# Refit fresh copies of the tuned base models on the FULL training set.
# LightGBM
lgb_full = clone(best_lgb)
lgb_full.fit(X_train_final, y_train)

# Random Forest
rf_full = clone(best_rf)
rf_full.fit(X_train_final, y_train)

# MLP: retrain on the full training set with the stored hyperparameters.
X_train_t = torch.tensor(X_train_final, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)

train_ds_full = TensorDataset(X_train_t, y_train_t)
train_dl_full = DataLoader(train_ds_full, batch_size=128, shuffle=True)

mlp_full = FraudMLP(
    input_dim=X_train_final.shape[1],
    hidden_sizes=best_mlp_params["hidden_sizes"],
    dropout=best_mlp_params["dropout"],
).to(device)

optimizer = torch.optim.Adam(mlp_full.parameters(), lr=best_mlp_params["lr"])
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

best_loss = float("inf")
best_state = None  # snapshot of the weights at the lowest training loss so far
patience = 5
patience_counter = 0

for epoch in range(30):
    train_loss = train_one_epoch(mlp_full, train_dl_full, optimizer, loss_fn)
    print(f"[MLP full] Epoch {epoch+1}/30 — loss = {train_loss:.4f}")

    if train_loss < best_loss - 1e-4:
        best_loss = train_loss
        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # really holds the best epoch's weights.
        best_state = {
            key: tensor.detach().clone()
            for key, tensor in mlp_full.state_dict().items()
        }
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping on full train.")
            break

# Restore the best snapshot (skipped if no epoch ever improved the loss).
if best_state is not None:
    mlp_full.load_state_dict(best_state)

# Base-model probabilities on the test set, in the same column order as Z_train.
proba_test_lgb = lgb_full.predict_proba(X_test_final)[:, 1]
proba_test_rf = rf_full.predict_proba(X_test_final)[:, 1]

# The MLP is scored in evaluation mode without gradient tracking.
X_test_t = torch.tensor(X_test_final, dtype=torch.float32).to(device)
mlp_full.eval()
with torch.no_grad():
    logits_test_mlp = mlp_full(X_test_t)
    proba_test_mlp = torch.sigmoid(logits_test_mlp).cpu().numpy().ravel()

test_columns = [proba_test_lgb, proba_test_rf, proba_test_mlp]
Z_test = np.column_stack(test_columns)
print("Z_test shape:", Z_test.shape)

# Final ensemble prediction: the meta-model scores the stacked base probabilities.
meta_proba = meta_lr.predict_proba(Z_test)[:, 1]
meta_pred = (meta_proba >= 0.5).astype(int)  # hard labels at the 0.5 cut-off

stack_report = [
    "\n Stacked Ensemble (LGB + RF + MLP) — Test Metrics",
    f"ROC-AUC : {roc_auc_score(y_test, meta_proba):.4f}",
    f"AUC-PR  : {average_precision_score(y_test, meta_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, meta_pred):.4f}",
    f"Precision: {precision_score(y_test, meta_pred):.4f}",
    f"Recall  : {recall_score(y_test, meta_pred):.4f}",
    f"F1      : {f1_score(y_test, meta_pred):.4f}",
]
print("\n".join(stack_report))


# 8. ROC and Precision–Recall Curves

In [None]:
# Test-set probabilities for every model.
# The sklearn-style models are scored here; the MLP and the stacked ensemble
# reuse the probabilities computed in their own evaluation cells.
proba_lr = best_lr.predict_proba(X_test_final)[:, 1]
proba_svm = best_svm.predict_proba(X_test_final)[:, 1]
proba_rf = best_rf.predict_proba(X_test_final)[:, 1]
proba_lgb = best_lgb.predict_proba(X_test_final)[:, 1]
proba_mlp = proba_test
proba_stack = meta_proba

# Display name -> probability vector; insertion order is the plotting order.
model_probas = dict(
    [
        ("Logistic Regression", proba_lr),
        ("SVM", proba_svm),
        ("Random Forest", proba_rf),
        ("LightGBM", proba_lgb),
        ("MLP", proba_mlp),
        ("Stacked Ensemble", proba_stack),
    ]
)

# ROC curves of all models on one set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

for name, proba in model_probas.items():
    fpr, tpr, _ = roc_curve(y_test, proba)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")

# Diagonal reference line (chance level).
roc_ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1)

roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curves for All Models")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
roc_fig.tight_layout()
plt.show()

# Precision–Recall curves of all models on one set of axes.
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))

for name, proba in model_probas.items():
    precision, recall, _ = precision_recall_curve(y_test, proba)
    ap = average_precision_score(y_test, proba)
    pr_ax.plot(recall, precision, label=f"{name} (AP = {ap:.3f})")

# Baseline: share of positives in the test labels, drawn as a horizontal line.
pos_rate = (y_test == 1).mean()
pr_ax.hlines(
    pos_rate,
    0, 1,
    linestyles="--",
    linewidth=1,
    label=f"Baseline (pos rate = {pos_rate:.3f})"
)

pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision–Recall Curves for All Models")
pr_ax.legend(loc="lower left", fontsize=8)

pr_ax.grid(True)
pr_fig.tight_layout()
plt.show()
