In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
data_path = Path(r"C:\Users\conne\Documents\Csci-Capstone\Data\CIC-IDS2017")
csv_files = list(data_path.glob("*.csv"))

df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
print(f"Loaded {len(csv_files)} files with shape {df.shape}")


df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.strip()

df.replace([np.inf, -np.inf], np.nan, inplace=True)

df = df.loc[:, df.isna().mean() < 0.3]

numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

df.drop_duplicates(inplace=True)

drop_cols = [
    "fwd_header_length.1",
    "fwd_avg_bytes/bulk","fwd_avg_packets/bulk","fwd_avg_bulk_rate",
    "bwd_avg_bytes/bulk","bwd_avg_packets/bulk","bwd_avg_bulk_rate"
]
df = df.drop(columns=drop_cols, errors="ignore")

Loaded 8 files with shape (2830743, 79)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for col in df.select_dtypes(include="object").columns:


In [3]:
# -------------------------------
# Layered IDS: K-Means (Behavioral Deviation Scoring)
# -------------------------------

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import joblib

# -------------------------------
# 1Ô∏è‚É£ Extract numeric features
# -------------------------------
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if 'label' in numeric_features:
    numeric_features.remove('label')

X_unsup = df[numeric_features]

# -------------------------------
# 2Ô∏è‚É£ Remove highly correlated features
# (MUST match Isolation Forest)
# -------------------------------
corr_matrix = X_unsup.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [col for col in upper.columns if any(upper[col] > 0.9)]
X_unsup_clean = X_unsup.drop(columns=to_drop)

# -------------------------------
# 3Ô∏è‚É£ Scale features
# -------------------------------
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X_unsup_clean)

# -------------------------------
# 4Ô∏è‚É£ Train / validation split
# (labels ONLY for evaluation)
# -------------------------------
y_eval = df['label'].apply(lambda x: 0 if x.lower() == 'benign' else 1)

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y_eval,
    test_size=0.2,
    stratify=y_eval,
    random_state=42
)

# -------------------------------
# 5Ô∏è‚É£ Train K-Means
# -------------------------------
# k chosen to represent common traffic behaviors
kmeans = KMeans(
    n_clusters=8,
    init="k-means++",
    n_init=20,
    random_state=42
)

kmeans.fit(X_train)

# -------------------------------
# 6Ô∏è‚É£ Distance-based anomaly scoring
# -------------------------------
# Distance to closest centroid = behavioral deviation
train_distances = np.min(
    kmeans.transform(X_train), axis=1
)

val_distances = np.min(
    kmeans.transform(X_val), axis=1
)

# -------------------------------
# 7Ô∏è‚É£ Normalize scores for ensemble fusion
# -------------------------------
score_scaler = MinMaxScaler()

train_scores_norm = score_scaler.fit_transform(
    train_distances.reshape(-1, 1)
).ravel()

val_scores_norm = score_scaler.transform(
    val_distances.reshape(-1, 1)
).ravel()

# -------------------------------
# 8Ô∏è‚É£ Define risk tiers (NO hard classification)
# -------------------------------
LOW_RISK = np.percentile(train_scores_norm, 75)
MEDIUM_RISK = np.percentile(train_scores_norm, 90)
HIGH_RISK = np.percentile(train_scores_norm, 97)

def kmeans_risk_label(score):
    if score >= HIGH_RISK:
        return "high"
    elif score >= MEDIUM_RISK:
        return "medium"
    elif score >= LOW_RISK:
        return "low"
    else:
        return "normal"

risk_labels_val = [kmeans_risk_label(s) for s in val_scores_norm]

# -------------------------------
# 9Ô∏è‚É£ Offline evaluation only
# -------------------------------
print("K-Means ROC-AUC:",
      roc_auc_score(y_val, val_scores_norm))

# -------------------------------
# üîü Save artifacts for downstream layers
# -------------------------------
joblib.dump(kmeans, "kmeans_model.pkl")
joblib.dump(feature_scaler, "kmeans_feature_scaler.pkl")
joblib.dump(score_scaler, "kmeans_score_scaler.pkl")
joblib.dump(to_drop, "kmeans_dropped_features.pkl")



K-Means ROC-AUC: 0.7155701784030799


['kmeans_dropped_features.pkl']