# Modeling Notebook

## Library Imports

In [11]:
## Import necessary libraries here
import os
import random
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from pathlib import Path

In [12]:
# Single seed shared by every stochastic step in the notebook
RANDOM_SEED = 2025

# Actually seed the global RNGs (defining the constant alone does not make
# anything reproducible). Estimators/splitters that accept `random_state`
# should additionally be given RANDOM_SEED explicitly.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Establish output directories for images and tables developed
IMAGE_DIR = Path("../Images/")
TABLE_DIR = Path("../Tables/")

In [13]:
# create helper functions to save images and tables to respective directories
def save_figure(fig, filename: str, dpi: int = 300) -> None:
    """
    Save a matplotlib/seaborn figure to the Images directory.

    Parameters
    ----------
    fig : object exposing ``savefig`` (e.g. a matplotlib ``Figure``).
    filename : name of the file to create inside ``IMAGE_DIR``.
    dpi : resolution forwarded to ``savefig``.

    The destination directory is created if it does not exist yet, and the
    final path is printed once the figure has been written.
    """
    filepath = IMAGE_DIR / filename
    # A fresh checkout may not have the output directory; create it (and any
    # missing parents) instead of failing with FileNotFoundError.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(filepath, dpi=dpi, bbox_inches="tight")
    print(f"Saved figure to {filepath}")

def save_table(df, filename: str, index: bool = False) -> None:
    """
    Save a pandas or polars DataFrame to the Tables directory as CSV.

    Parameters
    ----------
    df : polars ``DataFrame``/``LazyFrame`` or a pandas-like object exposing
        ``to_csv``.
    filename : name of the CSV file to create inside ``TABLE_DIR``.
    index : forwarded to ``to_csv`` for pandas-like inputs only; it is not
        used on the polars path.

    The destination directory is created if it does not exist yet, and the
    final path is printed once the table has been written.
    """
    filepath = TABLE_DIR / filename
    # A fresh checkout may not have the output directory; create it (and any
    # missing parents) instead of failing with FileNotFoundError.
    filepath.parent.mkdir(parents=True, exist_ok=True)

    # A lazy polars query has no CSV writer of the eager kind used below, so
    # materialise it first rather than letting it fall into the pandas branch.
    table = df.collect() if isinstance(df, pl.LazyFrame) else df

    # Handle polars vs pandas
    if isinstance(table, pl.DataFrame):
        table.write_csv(str(filepath))
    else:
        # assume pandas-like
        table.to_csv(str(filepath), index=index)

    print(f"Saved table to {filepath}")

## 1. Data Access and Structure

### 1.1 Loading Cleaned Dataset

In [14]:
# Relative path to the cleaned dataset in parquet format
# (a relative path, so it depends on the kernel's working directory)
PATH = "../Data/Cleaned/Cleaned_Dataset.parquet"

# Open the parquet file as a polars scan; `scan` is the handle the
# class-count queries further down are run against
scan = pl.scan_parquet(PATH)

# Resolve the dataset schema from the scan; it is used below to enumerate
# column names and dtypes when building the feature/target column lists
schema = scan.collect_schema()

In [15]:
# Target columns, one per label granularity
TARGET_LABEL = 'Label'
TARGET_FAMILY = 'Label_Family'
TARGET_BINARY = 'Label_Binary'

all_columns = schema.names()
target_columns = [TARGET_LABEL, TARGET_FAMILY, TARGET_BINARY]
# Every non-target column is a candidate model feature
feature_columns = [col for col in all_columns if col not in target_columns]

# categorical columns
CATEGORICAL_COLUMNS = ["Protocol_Type"]
# numerical columns: Int32/Float32 *features* only. Built from feature_columns
# so a numerically encoded target can never leak into the numeric feature
# list, and categorical columns are skipped so no column is handled twice.
NUMERICAL_COLUMNS = [
    col
    for col in feature_columns
    if col not in CATEGORICAL_COLUMNS and schema[col] in (pl.Int32, pl.Float32)
]

## 2. Linear Model Assumptions & Preprocessing Design

### 2.1 Multicollinearity and Redundancy

### 2.2 Scaling Strategy for Numeric Features

### 2.3 Skewness and Transformations

### 2.4 Class Balance Profiling on Target Granularities

## 3. Stratified Equal-Sized Subsampling Strategy

### 3.1 Global Subsampling Parameters

In [16]:
def get_class_counts(lazy_scan: pl.LazyFrame, target_col: str) -> pl.DataFrame:
    """
    Return a Polars DataFrame with class counts for the given target column.
    Uses lazy scan -> group_by -> aggregate -> collect.

    Parameters
    ----------
    lazy_scan : lazy frame containing ``target_col``.
    target_col : name of the column whose distinct values are counted.

    Returns
    -------
    A two-column frame (``target_col``, ``count``) ordered by ascending
    count, with ties ordered by the class value.
    """
    counts = (
        lazy_scan
        .group_by(target_col)
        .agg(pl.len().alias("count"))
        .collect()
        # group_by returns groups in arbitrary order, so sorting on "count"
        # alone leaves equally sized classes in a run-dependent order; the
        # class column is added as a tie-breaker to keep output reproducible.
        .sort(["count", target_col])
    )
    return counts

# Compute the class distribution once per target granularity
label_counts, label_family_counts, label_binary_counts = (
    get_class_counts(scan, target)
    for target in (TARGET_LABEL, TARGET_FAMILY, TARGET_BINARY)
)

# Print each distribution under its own heading
for heading, counts in (
    ("Label (34-class) counts (sorted by smallest class):", label_counts),
    ("\nLabel_Family (7/8-class) counts:", label_family_counts),
    ("\nLabel_Binary (2-class) counts:", label_binary_counts),
):
    print(heading)
    print(counts)

Label (34-class) counts (sorted by smallest class):
shape: (34, 2)
┌───────────────────┬─────────┐
│ Label             ┆ count   │
│ ---               ┆ ---     │
│ str               ┆ u32     │
╞═══════════════════╪═════════╡
│ UPLOADING_ATTACK  ┆ 1196    │
│ RECON-PINGSWEEP   ┆ 2161    │
│ BACKDOOR_MALWARE  ┆ 3075    │
│ XSS               ┆ 3705    │
│ SQLINJECTION      ┆ 5021    │
│ …                 ┆ …       │
│ DDOS-PSHACK_FLOOD ┆ 1641898 │
│ DDOS-SYN_FLOOD    ┆ 1764599 │
│ DOS-UDP_FLOOD     ┆ 1851682 │
│ DDOS-ICMP_FLOOD   ┆ 1907561 │
│ DDOS-UDP_FLOOD    ┆ 1964164 │
└───────────────────┴─────────┘

Label_Family (7/8-class) counts:
shape: (8, 2)
┌──────────────┬──────────┐
│ Label_Family ┆ count    │
│ ---          ┆ ---      │
│ str          ┆ u32      │
╞══════════════╪══════════╡
│ BRUTE_FORCE  ┆ 12520    │
│ WEB          ┆ 23707    │
│ SPOOFING     ┆ 436061   │
│ RECON        ┆ 655464   │
│ BENIGN       ┆ 1047308  │
│ MIRAI        ┆ 2359183  │
│ DOS          ┆ 4178919  │
│ DDO

### 3.2 Subsampling for Label (34 Classes)

### 3.3 Subsampling for Label_Family (8 Classes)

### 3.4 Subsampling for Label_Binary (2 Classes)

## 4. Train/Test Split & Preprocessing Pipelines

### 4.1 Common Train/Test Split Configuration

### 4.2 Shared Preprocessing Pipeline Definition

### 4.3 Train/Test Split & Pipeline for 'Label' (34 Classes)

### 4.4 Train/Test Split & Pipeline for 'Label_Family' (8 Classes)

### 4.5 Train/Test Split & Pipeline for 'Label_Binary' (2 Classes)

## 5. Multiclass Logistic Regression Models

### 5.1 Model Configuration and Hyperparameters

### 5.2 Logistic Regression for 'Label' (34 Classes)

#### 5.2.1 Model Training

#### 5.2.2 Model Evaluation

### 5.3 Logistic Regression for 'Label_Family' (8 Classes)

#### 5.3.1 Model Training

#### 5.3.2 Model Evaluation

### 5.4 Logistic Regression for 'Label_Binary' (2 Classes)

#### 5.4.1 Model Training

#### 5.4.2 Model Evaluation

## 6. Linear Support Vector Machine Models

### 6.1 Model Choice and Configuration

### 6.2 Linear SVM for 'Label' (34 Classes)

#### 6.2.1 Model Training

#### 6.2.2 Model Evaluation

### 6.3 Linear SVM for 'Label_Family' (8 Classes)

#### 6.3.1 Model Training

#### 6.3.2 Model Evaluation

### 6.4 Linear SVM for 'Label_Binary' (2 Classes)

#### 6.4.1 Model Training

#### 6.4.2 Model Evaluation

## 7. Cross-Model and Cross-Granularity Comparisons

### 7.1 Summary Table of Metrics

### 7.2 Discussion of Results