This notebook is divided into two main sections: the Feature Engineering pipeline and the Model Training pipeline. It can be run end to end by clicking "Run All". However, the feature engineering pipeline has extremely high RAM usage, so to test the code without triggering a session restart we recommend running only the "Package Installations and Imports" and "Utility Functions" sections, skipping the "Feature Engineering" blocks, and then running the rest of the notebook starting from the checkpoint section ("Load Processed Dataset [CHECKPOINT]"), which downloads a preprocessed dataset that has already gone through the feature engineering process.

Link to trained models: https://drive.google.com/file/d/1MK8rpBfely9tTmZp7J-Nt19a9fw4zt6R/view?usp=sharing

Link to results files: https://docs.google.com/spreadsheets/d/1R4kK20zrJOvTbMsxL0o_qCtdeatWIQt4iu7pLta-9j4/edit?usp=sharing

## Package Installations and Imports

In [None]:
!pip install -q gdown

In [None]:
from google.colab import files
import gdown, zipfile, os, io, glob, re, shutil, csv, joblib

import numpy as np
import pandas as pd

from typing import List, Dict, Any, Iterable

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    matthews_corrcoef,
    classification_report,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Utility Functions

In [None]:
# Downloads a zip archive from Google Drive, extracts it and returns the CSV paths inside it
def gdrive_snapshot(gdrive_url):
    """Download a zipped snapshot from Google Drive and list the CSV files it contains.

    The archive is saved as ``snap_load.zip`` in the current working directory
    and extracted into ``/content/snap_load``. No CSV is read here; the caller
    decides how to load the returned paths.

    Args:
        gdrive_url: Google Drive sharing URL of the zip archive (resolved by
            gdown with ``fuzzy=True``).

    Returns:
        list[str]: Sorted paths of every ``*.csv`` found (recursively) under the
        extraction directory, minus one explicitly excluded file. Empty list if
        the archive contains no CSV.

    Raises:
        RuntimeError: If gdown reports a failed download by returning ``None``.
    """
    # Use one absolute path for both download and extraction, so the function
    # does not silently depend on the working directory being /content.
    archive_path = os.path.abspath("snap_load.zip")
    downloaded = gdown.download(gdrive_url, archive_path, quiet=True, fuzzy=True)
    if downloaded is None:
        # NOTE(review): some gdown versions return None on failure instead of raising.
        raise RuntimeError(f"Could not download archive from: {gdrive_url}")

    extract_to = "/content/snap_load"
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(archive_path, "r") as z:
        z.extractall(extract_to)
    print("Extracted to:", extract_to)

    csv_search_path = os.path.join(extract_to, "**", "*.csv")
    # glob order is filesystem-dependent; sorting keeps the downstream
    # concatenation (and seeded sampling) reproducible across sessions.
    csv_files = sorted(glob.glob(csv_search_path, recursive=True))

    # This one file is skipped on purpose.
    # NOTE(review): presumably a feature-description table rather than flow records - confirm.
    excluded_csv = '/content/snap_load/f7546561558c07c5_NFV3DATA-A11964_A11964/data/NetFlow_v3_Features.csv'
    csv_files = [csv_path for csv_path in csv_files if csv_path != excluded_csv]

    print(f"Found {len(csv_files)} CSV files in '{extract_to}'.")
    if not csv_files:
        print("No CSV files found in zip")
        return []
    return csv_files


# df_snap = gdrive_snapshot('https://drive.google.com/file/d/11gdDpNaUPrV3PKzpfLyvyH0P-pc2KeHX/view?usp=sharing')


In [None]:
def save_df_zip(df, zip_filename, chunk_size=100000000):
    """Write ``df`` as one or more CSV parts inside ``<zip_filename>.zip`` and download it.

    Every part carries its own header row, so each ``data_part_N.csv`` can be
    read on its own with ``pd.read_csv`` and the parts concatenated afterwards.

    Args:
        df: DataFrame to export.
        zip_filename: Output name without the ``.zip`` extension.
        chunk_size: Maximum number of rows per CSV part. Lower it when RAM is
            tight; each part is serialized in memory before being compressed.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = len(df)
    # Ceiling division: exactly as many parts as the loop below will write.
    total_chunks = -(-n_rows // chunk_size)
    zip_filename = f"{zip_filename}.zip"
    print(f"Processing DataFrame in {total_chunks} chunks and writing to zip archive...")

    # Write straight to disk instead of buffering the whole archive in memory.
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for part_num, start in enumerate(range(0, n_rows, chunk_size), start=1):
            df_chunk = df.iloc[start:start + chunk_size]
            chunk_filename = f'data_part_{part_num}.csv'
            # header=True on every part: a header-less part would lose its first
            # row to the header when loaded independently.
            csv_data = df_chunk.to_csv(index=False, header=True).encode('utf-8')
            archive.writestr(chunk_filename, csv_data)
            print(f"  - Added '{chunk_filename}' to the archive.")

    files.download(zip_filename)
    print(f"DataFrame zipped and ready for download as '{zip_filename}'")


#### SAMPLE SIZE ADJUSTMENT

In [None]:
def adjust_size(df, max_rows):
    """Drop incomplete rows and cap every ``Attack`` class at ``max_rows`` rows.

    Args:
        df: DataFrame containing an ``Attack`` column.
        max_rows: Maximum number of rows to keep per ``Attack`` value. Classes
            with fewer rows are kept in full.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index, grouped by ``Attack`` in
        sorted label order. Rows containing any NaN are removed first. Sampling
        uses a fixed seed (``random_state=42``), so the result is reproducible.
    """
    df = df.dropna()
    if df.empty:
        # Nothing to sample; pd.concat would reject an empty list of groups.
        return df.reset_index(drop=True)

    # Iterate the groups explicitly instead of using groupby(...).apply(...):
    # apply() operating on the grouping column is deprecated since pandas 2.2,
    # and newer versions exclude it, which would drop 'Attack' from the result.
    sampled_groups = [
        group.sample(n=min(len(group), max_rows), random_state=42)
        for _, group in df.groupby('Attack')
    ]
    return pd.concat(sampled_groups, ignore_index=True)

## Feature Engineering



#### Downloading data to Gather the base dataframe

In [None]:
# Google Drive link to the zipped raw dataset.
url_ = "https://drive.google.com/file/d/1Jc1jTq8_nWNAIlLxQpep6VCY3fpJEDa8/view?usp=sharing"
# Download + extract the archive; the helper returns the paths of the CSV files it found.
csv_files_sys = gdrive_snapshot(url_)
# Read every CSV and stack them into one frame with a fresh 0..n-1 index.
# All files are held in memory at once, which is the RAM-heavy step of this section.
all_df = pd.concat([pd.read_csv(f) for f in csv_files_sys], ignore_index=True)
# all_df = df.copy()


In [None]:
# Drop every row whose Attack label is 'Generic' or 'Analysis'.
all_df = all_df[~all_df['Attack'].isin(['Generic', 'Analysis'])]

In [None]:
# Consolidate dataset-specific attack names into shared class labels.
# Each entry is (labels to rewrite, label they are rewritten to); the
# replacements are applied one after another in the order listed.
label_merges = [
    (['ddos', 'DDOS_attack-HOIC', 'DDoS_attacks-LOIC-HTTP', 'DDOS_attack-LOIC-UDP'], 'DDoS'),
    (['dos', 'DoS_attacks-SlowHTTPTest', 'DoS_attacks-Hulk', 'DoS_attacks-GoldenEye', 'DoS_attacks-Slowloris'], 'DoS'),
    (['FTP-BruteForce', 'SSH-Bruteforce', 'Brute_Force_-Web', 'Brute_Force_-XSS', 'password'], 'Brute_Force'),
    (['injection'], 'SQL_Injection'),
    (['scanning'], 'Reconnaissance'),
]
attack_labels = all_df['Attack']
for source_labels, merged_label in label_merges:
    attack_labels = attack_labels.replace(source_labels, merged_label)
# NOTE(review): all_df is a boolean-mask selection at this point, so this
# column assignment may emit a SettingWithCopyWarning - confirm it is benign.
all_df['Attack'] = attack_labels

# Remove the attack classes that are excluded from the study.
attacks_to_remove = ["mitm","ransomware","Shellcode","Theft","Worms"]
keep_mask = ~all_df["Attack"].isin(attacks_to_remove)
all_df = all_df[keep_mask]


# Show the resulting class distribution.
print("Values in 'Attack' column after renaming and duplicating overlapping attacks:")
print(all_df['Attack'].value_counts())

In [None]:
# Re-display the per-class row counts of the Attack column (same statement as
# at the end of the relabelling cell; useful when only this cell is re-run).
print("Values in 'Attack' column after renaming and duplicating overlapping attacks:")
print(all_df['Attack'].value_counts())

In [None]:
# Drop non-numeric columns before calculating correlation
numeric_df = all_df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack'], errors='ignore')

# Pairwise correlation matrix. Rows/columns that are entirely NaN are removed;
# corr() yields those for features it cannot correlate (e.g. zero variance).
X_cor = numeric_df.corr().dropna(axis=1, how='all').dropna(axis=0, how='all')

# Fail early with a clear message instead of decomposing a matrix with gaps.
if not np.isfinite(X_cor.to_numpy()).all():
    raise ValueError("Correlation matrix still contains NaN/inf values; cannot compute eigenvalues.")

# Set the size of the figure for better readability
plt.figure(figsize=(10, 8))

# Create the heatmap
sns.heatmap(X_cor, annot=False, cmap='coolwarm', linewidths=.5)

# Add title and show the plot
plt.title('Correlation Matrix Heatmap')
plt.show()

# A correlation matrix is symmetric, so use eigh: it returns real eigenvalues
# (eig can return complex values from round-off) in ascending order. Reorder
# to descending so the cumulative curve reads "first k components explain ...".
cor_eigenvalue, cor_eigenvector = np.linalg.eigh(X_cor.to_numpy())
descending = np.argsort(cor_eigenvalue)[::-1]
cor_eigenvalue = cor_eigenvalue[descending]
cor_eigenvector = cor_eigenvector[:, descending]

cor_total_variance = np.sum(cor_eigenvalue)
print(f"total variance = {cor_total_variance:.2f}")
# Percentage of total variance per component, and its running total.
cor_variance_ratios = cor_eigenvalue * 100 / cor_total_variance
cor_variance_ratios_cum = np.cumsum(cor_variance_ratios)

fig = plt.figure(figsize=(8,5))
plt.plot(cor_variance_ratios_cum, linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Index')
# The plotted series is the cumulative percentage, not a per-component proportion.
plt.ylabel('Cumulative variance explained (%)')
plt.show()
plt.close('all')

In [None]:
# Count missing cells across the whole frame: the first sum() is per column,
# the second adds the column totals together.
total_nan_count = all_df.isna().sum().sum()
print(f"\nTotal NaN count in the DataFrame: {total_nan_count}")

### Feature Engineering Pipeline

#### Removing Leaky Attributes

In [None]:
def remove_leaky_columns(df,col_list):
    """Return a copy of ``df`` without the columns named in ``col_list``.

    Names in ``col_list`` that are not columns of ``df`` are silently skipped.
    """
    trimmed = df.drop(columns=col_list, errors='ignore')
    return trimmed

#### Dealing with missing and infinite values

In [None]:
def handle_values(df):
    """Turn infinities into NaN and drop columns that carry no information.

    Args:
        df: Feature DataFrame.

    Returns:
        A new DataFrame in which ``+inf``/``-inf`` are replaced by NaN and every
        column with at most one distinct non-NaN value is removed. That covers
        constant columns and columns that are entirely NaN (including columns
        that were entirely infinite). NaN cells themselves are left in place.
    """
    df = df.replace([np.inf, -np.inf], np.nan)
    if len(df) == 0:
        # With no rows every column looks "empty"; leave the schema untouched.
        return df

    # nunique() ignores NaN, so an all-NaN column counts 0 distinct values.
    # Such a column must go too: a later row-wise dropna() would otherwise
    # discard every single row because of it.
    distinct_counts = df.nunique()
    uninformative_columns = df.columns[distinct_counts <= 1].tolist()
    if uninformative_columns:
        df = df.drop(columns=uninformative_columns, errors="ignore")
    return df

#### Dropping highly correlated columns

In [None]:
# percent: correlation threshold between 0 and 1
def drop_similar_col(df, percent):
    """Drop one column from every pair of highly correlated columns.

    Args:
        df: DataFrame of numeric features.
        percent: Threshold in ``[0, 1]``. A column is dropped when the absolute
            value of its correlation with any column to its left exceeds it.

    Returns:
        A new DataFrame without the redundant columns. For each correlated
        pair, the column that appears earlier in ``df`` is kept.
    """
    # Absolute value: a correlation of -0.99 is as redundant as +0.99.
    corr = df.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper = corr.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] > percent).any()]
    return df.drop(columns=to_drop, errors="ignore")


#### Simplified Function

In [None]:
def feature_engineering_with_labels(df, percent, leaky_cols, label_cols, max_rows=3000000):
    """Run the feature-engineering pipeline on the features and re-attach the labels.

    Label columns are set aside first so that none of the feature steps can
    drop or alter them, then joined back by index before the final sampling.

    Args:
        df: Raw DataFrame with features and label columns.
        percent: Correlation threshold passed to ``drop_similar_col``.
        leaky_cols: Columns removed up front by ``remove_leaky_columns``.
        label_cols: Label column names; those missing from ``df`` are ignored.
            ``adjust_size`` groups by ``'Attack'``, so it must be among them.
        max_rows: Per-class row cap passed to ``adjust_size``.

    Returns:
        The processed DataFrame (features followed by label columns), with NaN
        rows removed and each ``Attack`` class capped by ``adjust_size``.
    """
    # Separate labels (only those that actually exist in df)
    label_cols_present = [c for c in label_cols if c in df.columns]
    labels = df[label_cols_present].copy() if label_cols_present else pd.DataFrame(index=df.index)

    # Work only on features for the pipeline
    features = df.drop(columns=label_cols_present, errors="ignore")
    features = remove_leaky_columns(features, leaky_cols)
    features = handle_values(features)
    features = drop_similar_col(features, percent)

    # Reattach labels (aligned on the shared index), then cap the class sizes
    return adjust_size(pd.concat([features, labels], axis=1), max_rows)

In [None]:
# Target columns: kept out of the feature steps and re-attached afterwards.
label_cols = ['Label', 'Attack']
# Columns dropped before any feature step runs (addresses and ports).
# NOTE(review): the processed dataset preview still contains
# FLOW_START_MILLISECONDS / FLOW_END_MILLISECONDS; absolute timestamps may act
# as a proxy for the label - confirm whether they belong in this list.
leaky_cols = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT']

# Run the full pipeline; features correlated above 0.95 are treated as redundant.
processed_df = feature_engineering_with_labels(all_df, percent=0.95, leaky_cols=leaky_cols, label_cols=label_cols)

In [None]:
# Persist the engineered dataset to the working directory (index not written).
processed_df.to_csv('processed_dataset.csv', index=False)
print("DataFrame saved as 'processed_dataset.csv'")

In [None]:
# Send the saved CSV to the user's browser via the Colab `files` helper.
files.download('processed_dataset.csv')

## MODEL TRAINING

#### Load Processed Dataset [CHECKPOINT]

In [None]:
# CHECKPOINT: start here to skip the feature engineering pipeline.
# The link below points to a dataset that has already been through feature
# engineering. If `processed_df` was just built by the cells above, this block
# can be skipped (running it replaces `processed_df` with the downloaded copy).

drive_url = "https://drive.google.com/file/d/1Uswxrf1oSEpdsfspLeXXUFtuAhSyo_aw/view?usp=sharing"
output_path = "processed_dataset.csv"

# Download into the working directory; this overwrites any existing file of that name.
gdown.download(drive_url, output_path, quiet=False, fuzzy=True)
print(f"Download complete: {output_path}")

processed_df = pd.read_csv(output_path)
print("DataFrame 'processed_df' loaded successfully.")
print(f"Shape of processed_df: {processed_df.shape}")
# Last expression of the cell: renders a preview of the first rows.
processed_df.head()

Downloading...
From (original): https://drive.google.com/uc?id=1Uswxrf1oSEpdsfspLeXXUFtuAhSyo_aw
From (redirected): https://drive.google.com/uc?id=1Uswxrf1oSEpdsfspLeXXUFtuAhSyo_aw&confirm=t&uuid=b3d701f2-e73d-48ec-afbc-c54b800140bd
To: /content/processed_dataset.csv
100%|██████████| 3.09G/3.09G [00:32<00:00, 95.2MB/s]


Download complete: processed_dataset.csv
DataFrame 'processed_df' loaded successfully.
Shape of processed_df: (17423002, 51)


Unnamed: 0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label,Attack
0,1556436599094,1556436599119,17,9.0,76,1,76,1,0,0,...,0,0,0,0,0,0,0,0,1,Backdoor
1,1556436599094,1556436599119,17,9.0,76,1,76,1,0,0,...,0,0,0,0,0,0,0,0,1,Backdoor
2,1556436603451,1556436603509,6,0.0,208,2,200,1,24,24,...,58,58,58,0,0,0,0,0,1,Backdoor
3,1556436603451,1556436603509,6,0.0,208,2,200,1,24,24,...,58,58,58,0,0,0,0,0,1,Backdoor
4,1556436610261,1556436610826,6,0.0,104000,80,2144,26,24,24,...,0,124,7,23,0,135,17,33,1,Backdoor


#### Train Test Split Logic

In [None]:
def save_model(model, model_name, tag, base_path="content/models"):
    """Serialize ``model`` with joblib to ``<base_path>/<model_name>_<tag>.pkl``.

    ``base_path`` is created if it does not exist. The default is a relative
    path, so the file lands under the current working directory.
    """
    os.makedirs(base_path, exist_ok=True)
    target_file = os.path.join(base_path, f"{model_name}_{tag}.pkl")
    joblib.dump(model, target_file)
    print(f"[+] Saved {model_name} model for '{tag}' at: {target_file}")

In [None]:
def _aggregate_cv_metrics(fold_metrics: list[dict]) -> dict:
    keys_to_avg = ["accuracy", "precision", "recall", "f1_score", "mcc", "dr_macro", "far_macro"]

    agg = {}
    for k in keys_to_avg:
        agg[k] = float(np.mean([m[k] for m in fold_metrics]))

    # Sum confusion matrices
    agg["confusion_matrix"] = sum((m["confusion_matrix"] for m in fold_metrics))

    # Keep all per-fold reports for debugging/inspection
    agg["classification_report"] = "\n\n".join(
        [f"Fold {i+1}:\n{m['classification_report']}" for i, m in enumerate(fold_metrics)]
    )
    return agg


def kfold_evaluate_model(model_fn, tag, X, y, n_splits: int = 5):
    """Evaluate ``model_fn`` with stratified K-fold cross-validation.

    Args:
        model_fn: Trainer called once per fold with the keyword arguments
            ``tag``, ``X_train``, ``X_test``, ``y_train``, ``y_test`` and
            ``save``; it must return a metrics dict.
        tag: Label for the run; each fold is tagged ``<tag>_fold<k>``.
        X, y: Features and target, sliced positionally with ``.iloc``.
        n_splits: Number of folds.

    Returns:
        The fold metrics combined by ``_aggregate_cv_metrics``.
    """
    # Shuffled, seeded splitter: the same folds are produced on every run.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    per_fold = [
        model_fn(
            tag=f"{tag}_fold{fold_no}",
            X_train=X.iloc[train_rows],
            X_test=X.iloc[test_rows],
            y_train=y.iloc[train_rows],
            y_test=y.iloc[test_rows],
            save=False,  # fold models are throwaway; nothing is written to disk
        )
        for fold_no, (train_rows, test_rows) in enumerate(splitter.split(X, y), start=1)
    ]

    return _aggregate_cv_metrics(per_fold)


#### EVALUATION FUNCTION

In [None]:
def evaluate_model(X_test, y_test, y_pred):
    """Compute classification metrics for one set of predictions.

    Args:
        X_test: Accepted for call-site symmetry; not used in the computation.
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        dict with the confusion matrix, accuracy, support-weighted precision /
        recall / F1, Matthews correlation coefficient, ``dr_macro`` (unweighted
        mean of per-class TP / (TP + FN)), ``far_macro`` (unweighted mean of
        per-class FP / (FP + TN), each class taken one-vs-rest) and the text
        classification report.
    """
    cm = confusion_matrix(y_test, y_pred)

    # Per-class one-vs-rest counts derived from the confusion matrix.
    true_pos = np.diag(cm).astype(float)
    false_neg = cm.sum(axis=1) - true_pos
    false_pos = cm.sum(axis=0) - true_pos
    true_neg = cm.sum() - (true_pos + false_pos + false_neg)

    # Classes with an empty denominator score 0.0 instead of NaN/inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        actual_pos = true_pos + false_neg
        detection_rate = np.where(actual_pos > 0, true_pos / actual_pos, 0.0)
    with np.errstate(divide='ignore', invalid='ignore'):
        actual_neg = false_pos + true_neg
        false_alarm_rate = np.where(actual_neg > 0, false_pos / actual_neg, 0.0)

    return {
        "confusion_matrix": cm,
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, average='weighted', zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, average='weighted', zero_division=0)),
        "f1_score": float(f1_score(y_test, y_pred, average='weighted', zero_division=0)),
        "mcc": float(matthews_corrcoef(y_test, y_pred)),
        "dr_macro": float(detection_rate.mean()),
        "far_macro": float(false_alarm_rate.mean()),
        "classification_report": classification_report(y_test, y_pred, zero_division=0),
    }

In [None]:
# ---- to reduce train time ----
# N_SPLITS: default fold count of evaluate_models / evaluate_multiclass_models.
# It is bound as a default argument when those functions are defined, so after
# changing it here their definition cells must be re-run.
N_SPLITS = 3
# Switches read by the evaluation functions: a model is skipped when its flag is False.
USE_LOGREG = True
USE_RF = True
USE_KNN = True
# Per-class row cap applied to processed_df before training; None disables the downsampling.
ROW_MAX = 300000


In [None]:
# Downsample so the experiments run faster: each Attack class is capped at
# ROW_MAX rows. adjust_size also removes every row that contains a NaN.
if ROW_MAX is not None:
    processed_df = adjust_size(processed_df, ROW_MAX)

#### Logistic Regression

In [None]:
def logistic_regression(tag, X_train, X_test, y_train, y_test, save=True):
    """Train a logistic-regression classifier on standardized features and evaluate it.

    The estimator is a pipeline of ``StandardScaler`` followed by
    ``LogisticRegression``. The dataset mixes columns of very different
    magnitude (millisecond timestamps next to packet counts), and the lbfgs
    solver converges poorly on unscaled inputs. Keeping the scaler inside the
    pipeline means its statistics are learned from ``X_train`` only and then
    re-applied to ``X_test``.

    Args:
        tag: Run label used in log messages and in the saved file name.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        save: When True, the fitted pipeline is written with ``save_model``.

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    print(f"[+] Training Logistic Regression for: {tag} ...")

    lr = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            penalty="l2",
            C=1.0,
            solver="lbfgs",
            max_iter=200,          # tuned for speed
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        ),
    )
    lr.fit(X_train, y_train)

    if save:
        # The scaler travels with the model, so the saved file accepts raw features.
        save_model(lr, "logistic_regression", tag)

    print(f"[✓] Finished Logistic Regression for: {tag}")
    y_pred = lr.predict(X_test)
    return evaluate_model(X_test, y_test, y_pred)

#### Random Forest

In [None]:
def random_forest(tag, X_train, X_test, y_train, y_test, save=True):
    """Train a random-forest classifier and evaluate it on the test split.

    Args:
        tag: Run label used in log messages and in the saved file name.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        save: When True, the fitted model is written with ``save_model``.

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    print(f"[+] Training Random Forest for: {tag} ...")

    forest_params = dict(
        n_estimators=80,       # tuned for speed
        max_depth=12,
        min_samples_split=5,
        min_samples_leaf=2,
        bootstrap=True,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42,
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_train, y_train)

    if save:
        save_model(forest, "random_forest", tag)

    print(f"[✓] Finished Random Forest for: {tag}")
    predictions = forest.predict(X_test)
    return evaluate_model(X_test, y_test, predictions)

#### K-Nearest Neighbor

In [None]:
def knn_model(tag, X_train, X_test, y_train, y_test, n_neighbors=10, save=True):
    """Train a k-nearest-neighbours classifier on standardized features and evaluate it.

    KNN ranks neighbours by distance, so a feature with a large numeric range
    (e.g. a millisecond timestamp) would otherwise dominate every comparison.
    ``StandardScaler`` runs inside the pipeline, so its statistics are learned
    from ``X_train`` only and then re-applied to ``X_test``.

    Args:
        tag: Run label used in log messages and in the saved file name.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        n_neighbors: Number of neighbours that vote on each prediction.
        save: When True, the fitted pipeline is written with ``save_model``.

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    print(f"[+] Training KNN for: {tag} ...")

    knn = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(
            n_neighbors=n_neighbors,
            weights="uniform",     # tuned for speed
            n_jobs=-1
        ),
    )
    knn.fit(X_train, y_train)

    if save:
        # The scaler travels with the model, so the saved file accepts raw features.
        save_model(knn, "knn_model", tag)

    print(f"[✓] Finished KNN for: {tag}")
    y_pred = knn.predict(X_test)
    return evaluate_model(X_test, y_test, y_pred)

## MAGIC FUNCTION

In [None]:
def evaluate_models(df, combinations, n_splits: int = N_SPLITS):
    """Cross-validate the enabled models on each binary label combination.

    Args:
        df: Processed DataFrame with ``Label`` and ``Attack`` columns.
        combinations: Iterable of label lists; rows whose ``Attack`` value is in
            the list are selected, and the second entry names the run.
        n_splits: Number of cross-validation folds.

    Returns:
        One dict per combination, holding the combination itself under
        ``"combination"`` plus the aggregated metrics of each enabled model
        under ``"random_forest"``, ``"logistic_regression"`` and ``"knn_model"``.
    """
    all_records = []

    for combo in combinations:
        attack_name = combo[1]

        # Restrict to the classes of this combination; the target is the binary Label.
        selected = df[df["Attack"].isin(combo)]
        X = selected.drop(columns=["Label", "Attack"])
        y = selected["Label"]

        def run_cv(model_fn):
            # Shared cross-validation call for whichever model is enabled.
            return kfold_evaluate_model(model_fn, attack_name, X, y, n_splits)

        record: Dict[str, Any] = {"combination": combo}

        if USE_RF:
            record["random_forest"] = run_cv(random_forest)
        if USE_LOGREG:
            record["logistic_regression"] = run_cv(logistic_regression)
        if USE_KNN:
            record["knn_model"] = run_cv(knn_model)

        all_records.append(record)

    return all_records

In [None]:
def evaluate_multiclass_models(
    df,
    attacks_needed=None,
    include_benign=True,
    tag: str = "multiclass",
    n_splits: int = N_SPLITS,
):
    """Cross-validate the enabled models on the multiclass ``Attack`` target.

    Args:
        df: Processed DataFrame with ``Label`` and ``Attack`` columns.
        attacks_needed: Attack labels to include; ``None`` means every label
            present in ``df``. The caller's list is copied, not modified.
        include_benign: When True, ``"Benign"`` is added to the selection if missing.
        tag: Run label passed through to the fold tags.
        n_splits: Number of cross-validation folds.

    Returns:
        dict with the sorted class labels under ``"classes"`` plus the
        aggregated metrics of each enabled model under ``"random_forest"``,
        ``"logistic_regression"`` and ``"knn_model"``.
    """
    attacks = df["Attack"].unique().tolist() if attacks_needed is None else list(attacks_needed)
    if include_benign and "Benign" not in attacks:
        attacks.append("Benign")

    # Keep only the selected classes; the target is the Attack name itself.
    selected = df[df["Attack"].isin(attacks)]
    X = selected.drop(columns=["Label", "Attack"])
    y = selected["Attack"]

    def run_cv(model_fn):
        # Shared cross-validation call for whichever model is enabled.
        return kfold_evaluate_model(model_fn, tag, X, y, n_splits)

    results: Dict[str, Any] = {"classes": sorted(y.unique().tolist())}

    if USE_RF:
        results["random_forest"] = run_cv(random_forest)
    if USE_LOGREG:
        results["logistic_regression"] = run_cv(logistic_regression)
    if USE_KNN:
        results["knn_model"] = run_cv(knn_model)

    return results

## Save Final Results

In [None]:
# Binary experiments: each entry is [negative class, attack class]. Rows whose
# Attack value is in the pair are selected, and the second entry names the run.
# NOTE(review): spellings such as "Infilteration" and lowercase "xss" presumably
# mirror the values in the Attack column - verify against the dataset.
combinations = [
                 ["Benign" , "DDoS"],
                 ["Benign" , "DoS"],
                 ["Benign" , "xss"],
                 ["Benign" , "Reconnaissance"],
                 ["Benign" , "Brute_Force"],
                 ["Benign" , "SQL_Injection"],
                 ["Benign" , "Bot"],
                 ["Benign" , "Backdoor"],
                 ["Benign" , "Infilteration"],
                 ["Benign" , "Exploits"],
                 ["Benign" , "Fuzzers"],
]

# Attack classes used for the multiclass run and as the row order of the results CSV.
ATTACKS_ORDER = [
    "Backdoor","Bot","Brute_Force","DDoS","DoS",
    "Exploits","Fuzzers","Infilteration","Reconnaissance",
    "SQL_Injection","xss"
]

# Column titles of each per-model table in the results CSV; the order matches
# the values emitted by _row_from_metrics.
TABLE_HEADERS = [
    "ATTACKS",  # for binary rows this is the specific attack; for the multiclass row it is 'MULTICLASS'
    "Accuracy",
    "Precision",
    "Recall",
    "F1 Score",
    "matthews_corrcoef",
    "Detection Rate",
    "False Alarm Rate",
]

# (result-dict key, display title) per model. The keys are the ones the
# evaluation functions store their metrics under.
MODEL_KEYS = [
    ("random_forest", "Random Forest"),
    ("logistic_regression", "Logistic Regression"),
    ("knn_model", "K-Nearest Neighbor"),
]

def _safe_get(d: dict, k: str, default=None):
    return d.get(k, default) if isinstance(d, dict) else default

def _row_from_metrics(attack: str, metrics: dict) -> list:
    """Build one CSV row: the attack name followed by its metric values.

    Metrics that are missing (or a non-dict ``metrics``) produce empty strings,
    so the row always has the same number of cells.
    """
    metric_keys = ("accuracy", "precision", "recall", "f1_score", "mcc", "dr_macro", "far_macro")
    row = [attack]
    row.extend(_safe_get(metrics, key, "") for key in metric_keys)
    return row

def _dedupe_preserve_order(items: Iterable[str]) -> list:
    seen, out = set(), []
    for x in items:
        if x not in seen:
            out.append(x); seen.add(x)
    return out

def write_training_results_csv(
    evaluations: List[Dict[str, Any]],
    multiclass_results: Dict[str, Any] | None = None,
    out_path: str = "training_results.csv",
    attacks_order: list | None = None,
) -> str:
    """Write the cross-validation metrics to a CSV with one table per model.

    Each table has a title row, the header row, one row per attack (binary
    experiments) and, when available, a final ``MULTICLASS`` row. Tables are
    separated by an empty row. Attacks without metrics get a row of blanks.

    Args:
        evaluations: Records as returned by ``evaluate_models``.
        multiclass_results: Result of ``evaluate_multiclass_models``, or None.
        out_path: Destination file; missing parent directories are created.
        attacks_order: Row order for the attacks. When None, the order in
            which attacks first appear in ``evaluations`` is used.

    Returns:
        Absolute path of the written file.
    """
    # Which attacks get a row, and in what order.
    if attacks_order is not None:
        attacks = list(attacks_order)
    else:
        named = []
        for rec in evaluations:
            combo = rec.get("combination")
            if isinstance(combo, (list, tuple)) and len(combo) >= 2:
                named.append(combo[1])
        attacks = _dedupe_preserve_order(named)

    # (attack, model key) -> metrics of the binary experiment.
    binary_metrics = {}
    for rec in evaluations:
        combo = rec.get("combination", ["", ""])
        if not (isinstance(combo, (list, tuple)) and len(combo) >= 2):
            continue
        attack = combo[1]
        if not attack:
            continue
        for model_key, _model_title in MODEL_KEYS:
            candidate = rec.get(model_key)
            if isinstance(candidate, dict):
                binary_metrics[(attack, model_key)] = candidate

    # model key -> metrics of the multiclass experiment.
    multiclass_metrics = {}
    if isinstance(multiclass_results, dict):
        for model_key, _model_title in MODEL_KEYS:
            candidate = multiclass_results.get(model_key)
            if isinstance(candidate, dict):
                multiclass_metrics[model_key] = candidate

    resolved_path = os.path.abspath(out_path)
    os.makedirs(os.path.dirname(resolved_path), exist_ok=True)

    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        first_table = True

        for model_key, model_title in MODEL_KEYS:
            # Blank separator row between consecutive tables.
            if not first_table:
                writer.writerow([])
            first_table = False

            # Indicate these are K-fold results
            writer.writerow([f"Model: {model_title} (K-Fold CV)"])
            writer.writerow(TABLE_HEADERS)

            # Binary (attack-specific) rows
            for attack in attacks:
                writer.writerow(_row_from_metrics(attack, binary_metrics.get((attack, model_key), {})))

            # Multi-class row
            if model_key in multiclass_metrics:
                writer.writerow(_row_from_metrics("MULTICLASS", multiclass_metrics[model_key]))

    print("Saved:", resolved_path)
    return resolved_path


In [None]:
# Binary experiments: one cross-validated run per [Benign, attack] pair,
# using the default fold count of evaluate_models.
evaluations = evaluate_models(processed_df, combinations)

# Multiclass experiment over all attacks plus Benign.
# Note: this call asks for 5 folds explicitly, regardless of N_SPLITS.
multiclass_results = evaluate_multiclass_models(
    processed_df,
    attacks_needed=ATTACKS_ORDER,
    include_benign=True,
    tag="all_attacks_multiclass",
    n_splits=5,
)


[+] Training Random Forest for: DDoS_fold1 ...
[✓] Finished Random Forest for: DDoS_fold1
[+] Training Random Forest for: DDoS_fold2 ...
[✓] Finished Random Forest for: DDoS_fold2
[+] Training Random Forest for: DDoS_fold3 ...
[✓] Finished Random Forest for: DDoS_fold3
[+] Training Logistic Regression for: DDoS_fold1 ...
[✓] Finished Logistic Regression for: DDoS_fold1
[+] Training Logistic Regression for: DDoS_fold2 ...
[✓] Finished Logistic Regression for: DDoS_fold2
[+] Training Logistic Regression for: DDoS_fold3 ...
[✓] Finished Logistic Regression for: DDoS_fold3
[+] Training KNN for: DDoS_fold1 ...
[✓] Finished KNN for: DDoS_fold1
[+] Training KNN for: DDoS_fold2 ...
[✓] Finished KNN for: DDoS_fold2
[+] Training KNN for: DDoS_fold3 ...
[✓] Finished KNN for: DDoS_fold3
[+] Training Random Forest for: DoS_fold1 ...
[✓] Finished Random Forest for: DoS_fold1
[+] Training Random Forest for: DoS_fold2 ...
[✓] Finished Random Forest for: DoS_fold2
[+] Training Random Forest for: DoS_fo

In [None]:
# Write one metrics table per model to training_results.csv, with the attack
# rows in ATTACKS_ORDER followed by the MULTICLASS row. As the last expression
# of the cell, the returned absolute path is displayed.
write_training_results_csv(
    evaluations,
    multiclass_results=multiclass_results,
    out_path="training_results.csv",
    attacks_order=ATTACKS_ORDER,
)

Saved: /content/training_results.csv


'/content/training_results.csv'

In [None]:
# Send the results CSV to the user's browser via the Colab `files` helper.
files.download("training_results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def _fit_and_save_final_models(X, y, tag, base_path="content/models"):
    """Fit the three final models on all of (X, y), save each, and return the saved file paths."""
    final_models = [
        (
            "random_forest",
            RandomForestClassifier(
                n_estimators=200,
                max_depth=20,
                min_samples_split=5,
                min_samples_leaf=2,
                bootstrap=True,
                class_weight="balanced_subsample",
                n_jobs=-1,
                random_state=42
            ),
        ),
        (
            "logistic_regression",
            # Scale-sensitive model: standardize inside the pipeline so the
            # saved file accepts raw features.
            make_pipeline(
                StandardScaler(),
                LogisticRegression(
                    penalty="l2",
                    C=1.0,
                    solver="lbfgs",
                    max_iter=1000,
                    class_weight="balanced",
                    n_jobs=-1,
                    random_state=42
                ),
            ),
        ),
        (
            "knn_model",
            # Distance-based model: same reasoning as above.
            make_pipeline(
                StandardScaler(),
                KNeighborsClassifier(
                    n_neighbors=15,
                    weights="distance",
                    n_jobs=-1
                ),
            ),
        ),
    ]

    written = []
    for model_name, model in final_models:
        model.fit(X, y)
        save_model(model, model_name, tag, base_path)
        # Mirrors the file naming used by save_model, so the reported path is
        # the one that was actually written.
        written.append(os.path.join(base_path, f"{model_name}_{tag}.pkl"))
    return written


def train_and_save_final_models(df, combinations, multiclass_results=None):
    """Train the final models on the full data (no cross-validation) and save them.

    For every binary combination, and optionally for the multiclass problem,
    a random forest, a logistic regression and a KNN classifier are fitted and
    written to disk. The two scale-sensitive models are saved as pipelines that
    standardize their input first.

    Args:
        df: Processed DataFrame with ``Label`` and ``Attack`` columns.
        combinations: Iterable of label lists; rows whose ``Attack`` value is in
            the list are used, and the second entry names the saved models.
        multiclass_results: Output of ``evaluate_multiclass_models``. When not
            None, its ``"classes"`` entry selects the rows for the multiclass
            models; when None, no multiclass models are trained.

    Returns:
        list[str]: Paths of the saved model files, in the order they were written.
    """
    saved_paths = []

    # ----- Save final binary models -----
    for combo in combinations:
        attack_name = combo[1]
        df_sub = df[df["Attack"].isin(combo)]

        X = df_sub.drop(columns=["Label", "Attack"])
        y = df_sub["Label"]

        saved_paths.extend(_fit_and_save_final_models(X, y, attack_name))

    # ----- Save final multiclass models -----
    # Multiclass uses Attack as label
    if multiclass_results is not None:
        attacks = multiclass_results["classes"]
        df_sub = df[df["Attack"].isin(attacks)]

        X = df_sub.drop(columns=["Label", "Attack"])
        y = df_sub["Attack"]

        saved_paths.extend(_fit_and_save_final_models(X, y, "all_attacks_multiclass"))

    return saved_paths


In [None]:
# Fit and save the final models on the full (downsampled) dataset. Passing
# multiclass_results also trains the multiclass models on its "classes" entry.
saved_paths = train_and_save_final_models(
    processed_df,
    combinations=combinations,
    multiclass_results=multiclass_results
)

# List what train_and_save_final_models reported as saved.
print("Saved model files:")
for p in saved_paths:
    print(" -", p)

[+] Saved random_forest model for 'DDoS' at: content/models/random_forest_DDoS.pkl
[+] Saved logistic_regression model for 'DDoS' at: content/models/logistic_regression_DDoS.pkl
[+] Saved knn_model model for 'DDoS' at: content/models/knn_model_DDoS.pkl
[+] Saved random_forest model for 'DoS' at: content/models/random_forest_DoS.pkl
[+] Saved logistic_regression model for 'DoS' at: content/models/logistic_regression_DoS.pkl
[+] Saved knn_model model for 'DoS' at: content/models/knn_model_DoS.pkl
[+] Saved random_forest model for 'xss' at: content/models/random_forest_xss.pkl
[+] Saved logistic_regression model for 'xss' at: content/models/logistic_regression_xss.pkl
[+] Saved knn_model model for 'xss' at: content/models/knn_model_xss.pkl
[+] Saved random_forest model for 'Reconnaissance' at: content/models/random_forest_Reconnaissance.pkl
[+] Saved logistic_regression model for 'Reconnaissance' at: content/models/logistic_regression_Reconnaissance.pkl
[+] Saved knn_model model for 'Reco

In [None]:
# Bundle the saved models into models.zip in the current working directory.
# shutil is used instead of `%cd content/` + `!zip`: %cd permanently changes
# the kernel's working directory, so the cell could not be re-run and later
# relative paths would resolve somewhere else.
print(sorted(os.listdir("content")))
# Archive entries are stored as "models/<file>", relative to the "content" directory.
models_zip = shutil.make_archive("models", "zip", root_dir="content", base_dir="models")
print("Created:", models_zip)

/content/content
models
  adding: models/ (stored 0%)
  adding: models/random_forest_Fuzzers.pkl (deflated 68%)
  adding: models/logistic_regression_xss.pkl (deflated 39%)
  adding: models/knn_model_all_attacks_multiclass.pkl (deflated 89%)
  adding: models/random_forest_Exploits.pkl (deflated 67%)
  adding: models/knn_model_Backdoor.pkl (deflated 91%)
  adding: models/logistic_regression_Backdoor.pkl (deflated 40%)
  adding: models/knn_model_Brute_Force.pkl (deflated 90%)
  adding: models/logistic_regression_Exploits.pkl (deflated 40%)
  adding: models/knn_model_DDoS.pkl (deflated 89%)
  adding: models/random_forest_DoS.pkl (deflated 66%)
  adding: models/logistic_regression_Fuzzers.pkl (deflated 40%)
  adding: models/random_forest_Bot.pkl (deflated 67%)
  adding: models/random_forest_SQL_Injection.pkl (deflated 66%)
  adding: models/knn_model_DoS.pkl (deflated 88%)
  adding: models/knn_model_SQL_Injection.pkl (deflated 87%)
  adding: models/logistic_regression_Brute_Force.pkl (deflat

In [None]:
# Send the zipped models to the user's browser via the Colab `files` helper.
# The path is relative, so models.zip must be in the current working directory.
files.download('models.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>