In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from src.paths import ProjectPaths
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# CONFIGURATION

In [None]:
# Master switch for notebook diagnostics; set to False to silence log().
LOGGING_ENABLED = True


def log(message):
    """Print ``message`` prefixed with ``[LOG] `` when logging is enabled.

    Nothing is printed while ``LOGGING_ENABLED`` is falsy.
    """
    if not LOGGING_ENABLED:
        return
    print(f"[LOG] {message}")

# STEP 1: Load and Prepare Raw Data

In [None]:
# Locate the CSV and derive a plot title from its file name:
# everything before ".pcap", with dashes turned into spaces.
df_path = ProjectPaths.DATA_ML_CSV_FOLDER / "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
df_title = df_path.stem.partition(".pcap")[0].replace("-", " ").strip()

# Read the file, strip stray whitespace from the header names and the label
# values, then keep only the two traffic classes analysed in this notebook.
df = pd.read_csv(df_path).rename(columns=str.strip)
df = df.assign(Label=df['Label'].str.strip())
df = df.loc[df['Label'].isin(['BENIGN', 'DDoS'])]

log(f"Loaded dataset with {len(df)} entries.")
log(f"Label distribution:\n{df['Label'].value_counts()}")

# STEP 2: Visualize Label Distribution

In [None]:
# Count rows per label and pair each label with its own "Set2" colour.
label_counts = df['Label'].value_counts()
palette = sns.color_palette("Set2", n_colors=len(label_counts))
color_map = dict(zip(label_counts.index, palette))
colors = [color_map[label] for label in label_counts.index]


def _whole_percent(pct):
    """Format a wedge's percentage as a rounded whole number, e.g. ``57%``."""
    return f'{int(round(pct))}%'


# Pie chart of the class balance; the title carries the dataset name and row count.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    label_counts.to_numpy().astype(int),
    labels=label_counts.index,
    colors=colors,
    autopct=_whole_percent,
    startangle=140,
)
ax.set_title(f'{df_title}\nTraffic Label Distribution\n({len(df)} entries)')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
fig.tight_layout()
plt.show()

# STEP 3: Clean Numeric Features

In [None]:
# Split the frame into numeric feature columns and everything else.
# NOTE(review): the mean and percentile below are computed on the full frame,
# i.e. before the train/test sampling done in the next step - confirm that is intended.
numeric_cols = df.select_dtypes(include='number').columns
df_numeric = df[numeric_cols].copy()
df_non_numeric = df.drop(columns=numeric_cols)

# Turn +/-inf into NaN so they are imputed together with the other gaps.
infinite_values = [np.inf, -np.inf]
inf_mask = df_numeric.isin(infinite_values)
inf_cols = list(df_numeric.columns[inf_mask.any()])
if inf_cols:
    log(f"Replacing infinities in columns: {inf_cols}")
    df_numeric = df_numeric.replace(infinite_values, np.nan)

# Fill every remaining NaN with the mean of its own column.
nan_cols = df_numeric.columns[df_numeric.isna().any()]
if not nan_cols.empty:
    log(f"Imputing NaNs with column means in: {list(nan_cols)}")
    column_means = df_numeric[nan_cols].mean()
    df_numeric[nan_cols] = df_numeric[nan_cols].fillna(column_means)

# Cap 'Flow Bytes/s' at its 99.9th percentile to tame extreme values.
if 'Flow Bytes/s' in df_numeric.columns:
    flow_rate = df_numeric['Flow Bytes/s']
    clip_threshold = flow_rate.quantile(0.999)
    df_numeric['Flow Bytes/s'] = flow_rate.clip(upper=clip_threshold)
    log(f"Clipped 'Flow Bytes/s' at 99.9th percentile: {clip_threshold:.2f}")

# Reassemble: numeric columns first, followed by the non-numeric ones.
df_cleaned = pd.concat([df_numeric, df_non_numeric], axis=1)
log("Completed simplified numeric cleaning.")

# STEP 4: Sample Training and Testing Sets

In [None]:
# Requested sample sizes and class mix for the two sets.
train_size = 10_000
test_size = 20_000

train_benign_ratio = 0.8
train_ddos_ratio = 1 - train_benign_ratio

test_benign_ratio = 0.57
test_ddos_ratio = 1 - test_benign_ratio

# Fixed seed for every .sample() call so that Restart & Run All reproduces the
# same split (and therefore the same downstream plots and metrics).
SAMPLE_RANDOM_STATE = 42

# The DDoS count is the remainder of the requested size, so the two class
# counts always add up to exactly train_size / test_size. Rounding both
# classes independently can miss the total by one row for some size/ratio pairs.
train_benign_size = round(train_size * train_benign_ratio)
train_ddos_size = train_size - train_benign_size

test_benign_size = round(test_size * test_benign_ratio)
test_ddos_size = test_size - test_benign_size

df_benign = df_cleaned[df_cleaned['Label'] == 'BENIGN']
df_ddos = df_cleaned[df_cleaned['Label'] == 'DDoS']

# Training set: sample each class separately (without replacement), then stack.
train_benign = df_benign.sample(n=train_benign_size, random_state=SAMPLE_RANDOM_STATE)
train_ddos = df_ddos.sample(n=train_ddos_size, random_state=SAMPLE_RANDOM_STATE)
df_train = pd.concat([train_benign, train_ddos])

# Drop the training rows by index label so the test set is drawn only from
# rows the model has not seen.
df_remaining = df_cleaned.drop(df_train.index)

test_benign = df_remaining[df_remaining['Label'] == 'BENIGN'].sample(
    n=test_benign_size, random_state=SAMPLE_RANDOM_STATE
)
test_ddos = df_remaining[df_remaining['Label'] == 'DDoS'].sample(
    n=test_ddos_size, random_state=SAMPLE_RANDOM_STATE
)
df_test = pd.concat([test_benign, test_ddos])

# Original index labels are no longer needed once the two sets are disjoint.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

log(f"Training set: {df_train.shape}, Testing set: {df_test.shape}")

In [None]:
# Preview the sampled training set (shown because it is the cell's last expression).
df_train

In [None]:
# Preview the sampled testing set (shown because it is the cell's last expression).
df_test

# STEP 5: Apply SVD to Training Set

In [None]:
# Thin SVD of the numeric training columns: X_train = U @ diag(S) @ VT.
# No centring or scaling is applied in this cell.
X_train = df_train.select_dtypes(include='number')
U, S, VT = np.linalg.svd(X_train, full_matrices=False)

# Keep one singular direction per label below.
svd_labels = ['SVD1', 'SVD2']
n_kept = len(svd_labels)

svd_train = U[:, :n_kept] * S[:n_kept]  # training rows expressed in the kept directions
components = VT[:n_kept, :]             # per-feature loadings of each kept direction

components_df = pd.DataFrame(components, index=svd_labels, columns=X_train.columns)

# Projected training rows plus their class label (aligned on the row index).
train_svd_df = pd.DataFrame(svd_train, columns=svd_labels).assign(Label=df_train['Label'])

# STEP 6: Project Test Set onto Superfeatures

In [None]:
# Project the numeric test columns onto the directions learned from the
# training set, name the two resulting columns, and attach the class label.
X_test = df_test.select_dtypes(include='number')

svd_test = (
    (X_test @ components.T)
    .set_axis(['SVD1', 'SVD2'], axis=1)
    .assign(Label=df_test['Label'])
)

# Alias used by the later cells; refers to the same object as svd_test.
test_svd_df = svd_test

# STEP 7: Visualize SVD Projection

In [None]:
# Means used to pick a consistent orientation for the picture.
ddos_rows = train_svd_df['Label'] == 'DDoS'
ddos_mean_y = train_svd_df.loc[ddos_rows, 'SVD2'].mean()
global_mean_x = train_svd_df['SVD1'].mean()
global_mean_y = train_svd_df['SVD2'].mean()

flip_y = ddos_mean_y > global_mean_y  # mirror vertically when DDoS sits above the overall mean
flip_x = global_mean_x < 0  # mirror horizontally when the overall SVD1 mean is negative

# Only this plotting copy is mirrored; train_svd_df itself is left untouched.
train_svd_df_plot = train_svd_df.copy()
if flip_y:
    train_svd_df_plot['SVD2'] = -train_svd_df_plot['SVD2']
if flip_x:
    train_svd_df_plot['SVD1'] = -train_svd_df_plot['SVD1']

# One scatter layer per label, drawn from the mirrored copy.
fig, ax = plt.subplots(figsize=(10, 6))
for label in train_svd_df_plot['Label'].unique():
    subset = train_svd_df_plot.loc[train_svd_df_plot['Label'] == label]
    ax.scatter(subset['SVD1'], subset['SVD2'], label=label, alpha=0.25, s=8)

# Ticks, axis labels, frame and grid are all hidden.
ax.set_title('SVD Projection: Behavioral Clustering')
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax.legend(frameon=False)
ax.set_frame_on(False)
ax.grid(False)
fig.tight_layout()
plt.show()

# STEP 8: Train and Evaluate Classifier

In [None]:
# Features are the two SVD columns; the target is the traffic label.
svd_feature_cols = ['SVD1', 'SVD2']

X_train_svd = train_svd_df[svd_feature_cols]
y_train_svd = train_svd_df['Label']

X_test_svd = test_svd_df[svd_feature_cols]
y_test_svd = test_svd_df['Label']

# Fit a seeded random forest on the training projection and predict the test projection.
clf = RandomForestClassifier(random_state=42).fit(X_train_svd, y_train_svd)
y_pred = clf.predict(X_test_svd)

# Precision, recall and F1 treat 'DDoS' as the positive class.
metric_values = {
    'Accuracy': accuracy_score(y_test_svd, y_pred),
    'Precision': precision_score(y_test_svd, y_pred, pos_label='DDoS'),
    'Recall': recall_score(y_test_svd, y_pred, pos_label='DDoS'),
    'F1-Score': f1_score(y_test_svd, y_pred, pos_label='DDoS'),
}
results_df = pd.DataFrame(list(metric_values.items()), columns=['Metric', 'Value'])

log("Model evaluation complete.")
log(f"\n{results_df.round(4)}")