In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# CIC2018 Dataset


In [9]:
!pip install catboost xgboost lightgbm imbalanced-learn

import pandas as pd
import os
import joblib
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

# Install necessary packages (run only once per session)

import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/csecicids2018")

print("Path to dataset files:", path)

# Read the Parquet file
file_path = '/kaggle/input/csecicids2018/DoS1-Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet'
cic = pd.read_parquet(path + "/DoS1-Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet")

print(cic['Label'].value_counts())

# Define which labels are considered DoS attacks
dos_labels = ['DoS attacks-GoldenEye', 'DoS attacks-Slowloris']

# Replace them with a unified label "DoS"
cic['Label'] = cic['Label'].replace(dos_labels, 'DoS')
print(cic['Label'].value_counts())

benign_df = cic[cic['Label'] == 'Benign'].sample(n=50000, random_state=42)
dos_df = cic[cic['Label'] == 'DoS'].sample(n=50000, random_state=42)

cic = pd.concat([benign_df, dos_df], ignore_index=True)


feature_mapping = {
    'Total Fwd Packet': 'Total Fwd Packets',
    'Total Bwd packets': 'Total Backward Packets',
    'Total Length of Fwd Packet': 'Fwd Packets Length Total',
    'CWR Flag Count': 'CWE Flag Count',
    'Fwd Packet/Bulk Avg': 'Fwd Avg Packets/Bulk',  # Note: Duplicate key issue below
    'Fwd Bytes/Bulk Avg': 'Fwd Avg Bytes/Bulk',
    'Fwd Bulk Rate Avg': 'Fwd Avg Bulk Rate',
    'Bwd Bytes/Bulk Avg': 'Bwd Avg Bytes/Bulk',
    'Bwd Bulk Rate Avg': 'Bwd Avg Bulk Rate',
    'Bwd Init Win Bytes': 'Init Bwd Win Bytes',
    'Bwd Packet/Bulk Avg':'Bwd Avg Packets/Bulk',
    'FWD Init Win Bytes': 'Init Fwd Win Bytes',
    'Fwd Act Data Pkts': 'Fwd Act Data Packets'
}
reversed_feature_mapping = {v: k for k, v in feature_mapping.items()}

cic = cic.rename(columns=reversed_feature_mapping)
cic['Label'].value_counts()

labels = cic['Label']
numeric_df = cic.select_dtypes(include=[np.number])

# Encode labels to binary
le = LabelEncoder()
y = le.fit_transform(labels)

# Copy numeric features
X = numeric_df.copy()

# remove highly correlated features
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [col for col in upper.columns if any(upper[col] > 0.95)]
X.drop(columns=to_drop_corr, inplace=True)

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)
y = y[X.index]  # align y

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

fast_svm = CalibratedClassifierCV(
    estimator=LinearSVC(C=0.1, max_iter=10000, random_state=RANDOM_STATE),
    method='sigmoid',
    cv=5
)

models = {
    "Random Forest":RandomForestClassifier(
        n_estimators=500, max_depth=None, min_samples_split=5,
        min_samples_leaf=2, max_features='sqrt', n_jobs=-1,
        random_state=RANDOM_STATE
    ),

    "Gradient Boosting":GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=3,
        subsample=0.8, random_state=RANDOM_STATE
    ),

    "XGBoost":XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss', random_state=RANDOM_STATE,
        n_estimators=500, max_depth=6, learning_rate=0.03,
        subsample=0.85, colsample_bytree=0.85,
        reg_alpha=0.5, reg_lambda=2.0, n_jobs=-1,
        tree_method='gpu_hist' if USE_GPU else 'hist',
        verbosity=0
    ),

    "LightGBM":LGBMClassifier(
        objective='binary',
        random_state=RANDOM_STATE, n_estimators=500, learning_rate=0.03,
        num_leaves=64, max_depth=-1, min_child_samples=20,
        subsample=0.9, colsample_bytree=0.9,
        reg_alpha=0.5, reg_lambda=1.0, boosting_type='gbdt',
        n_jobs=-1, verbose=-1, device='gpu' if USE_GPU else 'cpu'
    ),

    "CatBoost":CatBoostClassifier(
        iterations=700, depth=8, learning_rate=0.03,
        task_type='GPU' if USE_GPU else 'CPU',
        thread_count=-1, random_seed=RANDOM_STATE,
        l2_leaf_reg=3.0, verbose=100,
        od_type='Iter', od_wait=50,
        loss_function='Logloss'
    ),

    "Logistic Regression":LogisticRegression(
        C=0.01, penalty='l2', random_state=RANDOM_STATE,
        max_iter=1000, n_jobs=-1
    ),

    "KNN":KNeighborsClassifier(
        n_neighbors=10, n_jobs=-1
    ),

    "Naïve Bayes":GaussianNB(var_smoothing=1e-9),

    "SVM":fast_svm,
}

for name, model in models.items():
    # Fit on train set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)

    print(f"🎯 {name} Test Accuracy: {acc:.4f}")
    print(f"\n🔍 Classification Report for {name}:")
    print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1']))


! rm -rf ./saved_models/
# Save models and preprocessing objects
save_dir = "saved_models"
os.makedirs(save_dir, exist_ok=True)
feature_columns = X.columns.tolist()  # features after dropping correlated and unimportant

for name, model in models.items():
    model_path = os.path.join(save_dir, f"{name.replace(' ', '_').lower()}.joblib")
    joblib.dump(model, model_path)

joblib.dump(le, os.path.join(save_dir, "label_encoder.joblib"))
joblib.dump(scaler, os.path.join(save_dir, "scaler.joblib"))
joblib.dump(feature_columns, os.path.join(save_dir, "feature_columns.joblib"))

Path to dataset files: /kaggle/input/csecicids2018
Label
Benign                   743498
DoS attacks-GoldenEye     41406
DoS attacks-Slowloris      9908
Name: count, dtype: int64
Label
Benign    743498
DoS        51314
Name: count, dtype: int64


  cic['Label'] = cic['Label'].replace(dos_labels, 'DoS')


🎯 Random Forest Test Accuracy: 0.9998

🔍 Classification Report for Random Forest:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     10000
     Class 1       1.00      1.00      1.00     10000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

🎯 Gradient Boosting Test Accuracy: 0.9990

🔍 Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     10000
     Class 1       1.00      1.00      1.00     10000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

🎯 XGBoost Test Accuracy: 0.9996

🔍 Classification Report for XGBoost:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     1



🎯 LightGBM Test Accuracy: 0.9999

🔍 Classification Report for LightGBM:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     10000
     Class 1       1.00      1.00      1.00     10000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

0:	learn: 0.5895582	total: 13.9ms	remaining: 9.69s
100:	learn: 0.0009199	total: 3.13s	remaining: 18.6s
200:	learn: 0.0005905	total: 4.62s	remaining: 11.5s
300:	learn: 0.0004748	total: 7.87s	remaining: 10.4s
400:	learn: 0.0004356	total: 9.1s	remaining: 6.79s
500:	learn: 0.0004039	total: 9.98s	remaining: 3.97s
600:	learn: 0.0003745	total: 10.9s	remaining: 1.79s
699:	learn: 0.0003629	total: 11.7s	remaining: 0us
🎯 CatBoost Test Accuracy: 0.9999

🔍 Classification Report for CatBoost:
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     10000
     Cla

['saved_models/feature_columns.joblib']

## Evaluate CICIDS2018 Trained Models On CIC-UNSWB-NB15 Dataset


In [10]:
import pandas as pd

unsw = pd.read_csv("/content/drive/MyDrive/Datasets/CICFlowMeter_out.csv")


print(unsw['Label'].value_counts())

# Keep only rows where the 'Label' is 'BENIGN' or 'DoS'
filtered_df = unsw[unsw['Label'].isin(['Benign', 'DoS'])]

# Undersample BENIGN to 2000 samples
benign_df = filtered_df[filtered_df['Label'] == 'Benign'].sample(n=5000, random_state=42)
dos_df = filtered_df[filtered_df['Label'] == 'DoS']  # keep all DoS for now

# Combine and prepare for SMOTE
df_bal = pd.concat([benign_df, dos_df], ignore_index=True)

# Separate features and labels
X_c = df_bal.drop(columns=['Label'])
X_c = df_bal.select_dtypes(include=[np.number])
y_c = df_bal['Label']

y_encoded = le.fit_transform(y_c)  # BENIGN = 0, DoS = 1

# Apply SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_c, y_encoded)

# Convert back to DataFrame
X_res_df = pd.DataFrame(X_res, columns=X.columns)
y_res_df = pd.DataFrame(le.inverse_transform(y_res), columns=['Label'])

# Combine features and label
final_df = pd.concat([X_res_df, y_res_df], axis=1)

# 1. Separate features and labels
final_df_X = final_df.drop(columns=['Label'])
final_df_Y = final_df['Label']

final_df_X = final_df_X[X.columns]

cic2019_y_encoded = le.transform(final_df_Y)
final_df_X_scaled = scaler.fit_transform(final_df_X)

for name, model in models.items():
    print(f"\n🔹 Evaluating on CIC UNSW-NB15 Augmented Dataset {name}")

    try:
        y_pred = model.predict(final_df_X_scaled)
        acc = accuracy_score(cic2019_y_encoded, y_pred)

        print(f"✅ {name} Accuracy: {acc:.4f}")
        print(f"\n🔍 {name} Classification Report:")
        print(classification_report(cic2019_y_encoded, y_pred, target_names=le.classes_))
        print("-" * 50)

    except Exception as e:
        print(f"❌ Error evaluating {name}: {e}")


Label
Benign            3450658
Exploits            30951
Fuzzers             29613
Reconnaissance      16735
Generic              4632
DoS                  4467
Shellcode            2102
Backdoor              452
Analysis              385
Worms                 246
Name: count, dtype: int64

🔹 Evaluating on CIC UNSW-NB15 Augmented Dataset Random Forest
✅ Random Forest Accuracy: 0.5000

🔍 Random Forest Classification Report:
              precision    recall  f1-score   support

      Benign       0.50      1.00      0.67      5000
         DoS       0.00      0.00      0.00      5000

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000

--------------------------------------------------

🔹 Evaluating on CIC UNSW-NB15 Augmented Dataset Gradient Boosting


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ Gradient Boosting Accuracy: 0.3665

🔍 Gradient Boosting Classification Report:
              precision    recall  f1-score   support

      Benign       0.42      0.73      0.54      5000
         DoS       0.00      0.00      0.00      5000

    accuracy                           0.37     10000
   macro avg       0.21      0.37      0.27     10000
weighted avg       0.21      0.37      0.27     10000

--------------------------------------------------

🔹 Evaluating on CIC UNSW-NB15 Augmented Dataset XGBoost
✅ XGBoost Accuracy: 0.3499

🔍 XGBoost Classification Report:
              precision    recall  f1-score   support

      Benign       0.41      0.70      0.52      5000
         DoS       0.00      0.00      0.00      5000

    accuracy                           0.35     10000
   macro avg       0.21      0.35      0.26     10000
weighted avg       0.21      0.35      0.26     10000

--------------------------------------------------

🔹 Evaluating on CIC UNSW-NB15 Augmented Data

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Practical Test Of CICIDS2018 Dataset On Recorded CSVs


In [11]:
saved_models_dir = "saved_models"
vuln1_csv = "/content/drive/MyDrive/Datasets/goldeneye_ISCX.csv"
vuln2_csv = "/content/drive/MyDrive/Datasets/slowloris_ISCX.csv"
normal_csv = "/content/drive/MyDrive/Datasets/benign_ISCX.csv"

le = joblib.load(os.path.join(saved_models_dir, "label_encoder.joblib"))
scaler = joblib.load(os.path.join(saved_models_dir, "scaler.joblib"))
feature_columns = joblib.load(os.path.join(saved_models_dir, "feature_columns.joblib"))

def testDataset(input_csv):
  df = pd.read_csv(input_csv)

  df = df[feature_columns]

  df = df.apply(pd.to_numeric, errors='coerce')
  df = df.dropna()

  X_new = df.select_dtypes(include=[np.number])

  X_scaled = scaler.transform(X_new)

  for file in os.listdir(saved_models_dir):
      if file.endswith(".joblib") and not file.startswith(("label_encoder", "scaler", "feature_columns")):
          model_path = os.path.join(saved_models_dir, file)
          model_name = file.replace(".joblib", "").replace("_", " ").title()

          try:
              model = joblib.load(model_path)
              y_pred = model.predict(X_scaled)
              decoded = le.inverse_transform(y_pred)

              print(f"🔹 {model_name} Predictions:")
              for i, pred in enumerate(decoded):
                  print(f"  Sample {i+1}: {pred}")
                  if i == 5 :
                    break
              print("-" * 40)

          except Exception as e:
              print(f"❌ Failed to predict with {model_name}: {e}")

print("vuln dataset 1: ")
testDataset(vuln1_csv)
print("vuln dataset 2: ")
testDataset(vuln2_csv)
print("normal dataset: ")
testDataset(normal_csv)

vuln dataset 1: 
🔹 Random Forest Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Knn Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: DoS
----------------------------------------
🔹 Xgboost Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: Benign
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: Benign
----------------------------------------
🔹 Catboost Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Svm Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Logistic Regression Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 



🔹 Random Forest Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Knn Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Xgboost Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Catboost Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Svm Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: DoS
----------------------------------------
🔹 Logistic Regression Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: Do



🔹 Random Forest Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Knn Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: DoS
----------------------------------------
🔹 Xgboost Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Catboost Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Svm Predictions:
  Sample 1: Benign
  Sample 2: Benign
  Sample 3: Benign
  Sample 4: Benign
  Sample 5: Benign
  Sample 6: Benign
----------------------------------------
🔹 Logistic Regression Predictions:
  Sample 1: DoS
  Sample 2: DoS
  Sample 3: DoS
  Sample 4: DoS
  Sample 5: DoS
  Sample 6: Do



In [None]:
! pip install catboost
import os
import warnings
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, roc_curve, roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.special import expit, softmax

# -------------------- Configuration --------------------
DATA_PATH = "/content/drive/MyDrive/Datasets/final.csv"
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)

RANDOM_STATE = 42
TEST_SIZE = 0.3
USE_GPU = True

# -------------------- Utility Functions --------------------

def scale_pipeline(model):
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_features)
    ], remainder='passthrough')
    return make_pipeline(preprocessor, model)

def feature_enhancement(X, enhance_level=0.01):
    X_noisy = X.copy()
    for col in X_noisy.select_dtypes(include=[np.number]).columns:
        noise = np.random.normal(0, enhance_level, size=X_noisy[col].shape)
        X_noisy[col] += noise
    return X_noisy

def corrupt_labels(y, corruption_rate=0.03):
    y_corrupt = y.copy()
    num_to_corrupt = int(len(y) * corruption_rate)
    indices = np.random.choice(y.index, size=num_to_corrupt, replace=False)
    unique_classes = y.unique()
    for idx in indices:
        current_label = y[idx]
        new_label = np.random.choice([c for c in unique_classes if c != current_label])
        y_corrupt.at[idx] = new_label
    return y_corrupt

def dropout_features(X, dropout_rate=0.05):
    X_dropout = X.copy()
    mask = np.random.rand(*X_dropout.shape) < dropout_rate
    X_dropout[mask] = 0
    return X_dropout

def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    print(f"\n---- {name} ----")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred, average='weighted'),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted')
    }

    if metrics['Accuracy'] > 0.995:
        warnings.warn("⚠️ Suspiciously high accuracy. Check for data leakage or overly simple task.")

    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    classes = np.unique(y_test)
    cm = confusion_matrix(y_test, y_pred, labels=classes)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(cm, index=classes, columns=classes))

    try:
        y_score = model.predict_proba(X_test)
    except AttributeError:
        try:
            scores = model.decision_function(X_test)
            y_score = (
                np.vstack([1 - expit(scores), expit(scores)]).T
                if scores.ndim == 1 else softmax(scores)
            )
        except AttributeError:
            print("⚠️ Skipping ROC metrics: model has no probability or decision function.")
            y_score = None

    if y_score is not None and len(classes) == 2:
        positive_class = sorted(classes)[-1]
        pos_idx = list(classes).index(positive_class)
        fpr, tpr, thresholds = roc_curve(y_test, y_score[:, pos_idx], pos_label=positive_class)
        idx = np.abs(thresholds - 0.5).argmin()
        print(f"At threshold ~0.5: FPR={fpr[idx]:.4f}, TPR={tpr[idx]:.4f}")

    joblib.dump(model, os.path.join(MODEL_DIR, f"{name.replace(' ', '_').lower()}.joblib"))
    print(f"✅ Model saved to {MODEL_DIR}/{name.replace(' ', '_').lower()}.joblib")

# -------------------- Load and Prepare Data --------------------
df = pd.read_csv(DATA_PATH)
X = df.drop(columns=['Label'])
y = df['Label']

# Overfitting prevention.
X_noisy = feature_enhancement(X, enhance_level=0.02)
X_dropout = dropout_features(X_noisy, dropout_rate=0.05)
y_corrupted = corrupt_labels(y, corruption_rate=0.03)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_dropout, y_corrupted, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_corrupted
)

# -------------------- Models Dictionary --------------------

fast_svm = CalibratedClassifierCV(
    estimator=LinearSVC(C=0.1, max_iter=5000, random_state=RANDOM_STATE),
    method='sigmoid',
    cv=5
)

models = {
    "Random Forest": scale_pipeline(RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=5,
        min_samples_leaf=2, max_features='sqrt', n_jobs=-1,
        random_state=RANDOM_STATE
    )),

    "Gradient Boosting": scale_pipeline(GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=3,
        subsample=0.8, random_state=RANDOM_STATE
    )),

    "XGBoost": scale_pipeline(XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss', random_state=RANDOM_STATE,
        n_estimators=250, max_depth=6, learning_rate=0.03,
        subsample=0.85, colsample_bytree=0.85,
        reg_alpha=0.5, reg_lambda=2.0, n_jobs=-1,
        tree_method='gpu_hist' if USE_GPU else 'hist',
        verbosity=0
    )),

    "LightGBM": scale_pipeline(LGBMClassifier(
        objective='binary',
        random_state=RANDOM_STATE, n_estimators=250, learning_rate=0.03,
        num_leaves=64, max_depth=-1, min_child_samples=20,
        subsample=0.9, colsample_bytree=0.9,
        reg_alpha=0.5, reg_lambda=1.0, boosting_type='gbdt',
        n_jobs=-1, verbose=-1, device='gpu' if USE_GPU else 'cpu'
    )),

    "CatBoost": scale_pipeline(CatBoostClassifier(
        iterations=350, depth=8, learning_rate=0.03,
        task_type='GPU' if USE_GPU else 'CPU',
        thread_count=-1, random_seed=RANDOM_STATE,
        l2_leaf_reg=3.0, verbose=100,
        od_type='Iter', od_wait=50,
        loss_function='Logloss'
    )),

    "Logistic Regression": scale_pipeline(LogisticRegression(
        C=0.01, penalty='l2', random_state=RANDOM_STATE,
        max_iter=1000, n_jobs=-1
    )),

    "KNN": scale_pipeline(KNeighborsClassifier(
        n_neighbors=10, n_jobs=-1
    )),

    "Naïve Bayes": scale_pipeline(GaussianNB(var_smoothing=1e-9)),

    "SVM": scale_pipeline(fast_svm)
}

# -------------------- Evaluate All Models --------------------

for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)



---- Random Forest ----
Accuracy: 0.9693
F1: 0.9693
Precision: 0.9694
Recall: 0.9693

Confusion Matrix:
       0      1
0  13267    454
1    388  13362
At threshold ~0.5: FPR=0.0331, TPR=0.9719
✅ Model saved to ./models/random_forest.joblib

---- Gradient Boosting ----
Accuracy: 0.9665
F1: 0.9665
Precision: 0.9666
Recall: 0.9665

Confusion Matrix:
       0      1
0  13216    505
1    414  13336
At threshold ~0.5: FPR=0.0368, TPR=0.9699
✅ Model saved to ./models/gradient_boosting.joblib

---- XGBoost ----
Accuracy: 0.9685
F1: 0.9685
Precision: 0.9686
Recall: 0.9685

Confusion Matrix:
       0      1
0  13258    463
1    401  13349
At threshold ~0.5: FPR=0.0337, TPR=0.9708
✅ Model saved to ./models/xgboost.joblib

---- LightGBM ----




Accuracy: 0.9686
F1: 0.9686
Precision: 0.9686
Recall: 0.9686

Confusion Matrix:
       0      1
0  13255    466
1    397  13353




At threshold ~0.5: FPR=0.0340, TPR=0.9709
✅ Model saved to ./models/lightgbm.joblib

---- CatBoost ----
0:	learn: 0.6487529	total: 17.9ms	remaining: 6.25s
100:	learn: 0.1409864	total: 1.01s	remaining: 2.5s
200:	learn: 0.1347293	total: 1.87s	remaining: 1.39s
300:	learn: 0.1300559	total: 2.75s	remaining: 447ms
349:	learn: 0.1278385	total: 3.18s	remaining: 0us
Accuracy: 0.9687
F1: 0.9687
Precision: 0.9687
Recall: 0.9687

Confusion Matrix:
       0      1
0  13261    460
1    401  13349
At threshold ~0.5: FPR=0.0335, TPR=0.9711
✅ Model saved to ./models/catboost.joblib

---- Logistic Regression ----
Accuracy: 0.9233
F1: 0.9233
Precision: 0.9237
Recall: 0.9233

Confusion Matrix:
       0      1
0  12471   1250
1    856  12894
At threshold ~0.5: FPR=0.0911, TPR=0.9377
✅ Model saved to ./models/logistic_regression.joblib

---- KNN ----
Accuracy: 0.9609
F1: 0.9609
Precision: 0.9609
Recall: 0.9609

Confusion Matrix:
       0      1
0  13159    562
1    511  13239
At threshold ~0.5: FPR=0.0429, 

In [15]:
# download ./models
! zip -r ./models.zip ./models


  adding: models/ (stored 0%)
  adding: models/random_forest.joblib (deflated 76%)
  adding: models/knn.joblib (deflated 13%)
  adding: models/xgboost.joblib (deflated 66%)
  adding: models/catboost.joblib (deflated 75%)
  adding: models/svm.joblib (deflated 51%)
  adding: models/logistic_regression.joblib (deflated 52%)
  adding: models/lightgbm.joblib (deflated 58%)
  adding: models/gradient_boosting.joblib (deflated 67%)
  adding: models/naïve_bayes.joblib (deflated 45%)
