In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load every CIC-IDS2017 CSV export into one frame and apply basic cleaning:
# normalise column names, strip text cells, turn +/-inf into NaN, drop mostly
# empty columns, median-impute numeric gaps, de-duplicate rows and remove a
# fixed list of unwanted columns.

# Machine-specific location of the dataset; edit this line when running elsewhere.
data_path = Path(r"C:\Users\conne\Documents\Csci-Capstone\Data\CIC-IDS2017")

# Sort the matches so the concatenated row order does not depend on the order
# in which the filesystem happens to list the files.
csv_files = sorted(data_path.glob("*.csv"))
if not csv_files:
    # pd.concat would otherwise fail with an error that does not mention the path.
    raise FileNotFoundError(f"No CSV files found in {data_path}")

df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
print(f"Loaded {len(csv_files)} files with shape {df.shape}")

# Column names: trim, lower-case, spaces -> underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Strip surrounding whitespace from every text column. Asking for both
# "object" and "string" follows the pandas string-migration guide so the
# selection works on pandas 2 and pandas 3 (include="object" alone emits the
# migration warning on pandas 3).
text_cols = df.select_dtypes(include=["object", "string"]).columns
for col in text_cols:
    df[col] = df[col].str.strip()

# Treat infinities as missing values so they take part in the steps below.
df = df.replace([np.inf, -np.inf], np.nan)

# Keep only columns whose share of missing values is below this threshold.
MAX_MISSING_FRACTION = 0.3
df = df.loc[:, df.isna().mean() < MAX_MISSING_FRACTION]

# Fill remaining numeric gaps with each column's median.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Remove exact duplicate rows (the index is intentionally not reset).
df = df.drop_duplicates()

# Columns removed unconditionally; errors="ignore" tolerates any that are
# already gone (for example dropped by the missing-value filter above).
drop_cols = [
    "fwd_header_length.1",
    "fwd_avg_bytes/bulk", "fwd_avg_packets/bulk", "fwd_avg_bulk_rate",
    "bwd_avg_bytes/bulk", "bwd_avg_packets/bulk", "bwd_avg_bulk_rate",
]
df = df.drop(columns=drop_cols, errors="ignore")

Loaded 8 files with shape (2830743, 79)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for col in df.select_dtypes(include="object").columns:


In [3]:
# Show the distinct values found in the label column.
df["label"].unique()

<StringArray>
[                    'BENIGN',                       'DDoS',
                   'PortScan',                        'Bot',
               'Infiltration',   'Web Attack � Brute Force',
           'Web Attack � XSS', 'Web Attack � Sql Injection',
                'FTP-Patator',                'SSH-Patator',
              'DoS slowloris',           'DoS Slowhttptest',
                   'DoS Hulk',              'DoS GoldenEye',
                 'Heartbleed']
Length: 15, dtype: str

In [4]:
# -------------------------------
# Supervised Layer Training: Multi-Class Attack Classification
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib

# -------------------------------
# 0️⃣ Clean labels
# -------------------------------
# NOTE: deleting the replacement character leaves a double space inside the
# "Web Attack" labels (e.g. "Web Attack  Brute Force"). The names are kept
# that way because the saved label encoder stores them verbatim.
df['label'] = df['label'].str.strip()  # remove whitespace
df['label'] = df['label'].str.replace('�', '', regex=False)  # remove weird chars

# -------------------------------
# 1️⃣ Prepare features and labels
# -------------------------------
y = df['label']  # keep exact label names

# Encode labels to integers (LabelEncoder assigns codes in sorted class order)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Numeric features only
X = df.select_dtypes(include=['int64', 'float64'])

# -------------------------------
# 2️⃣ Train / validation split
# -------------------------------
# Split row *positions* before any statistic is computed from the features, so
# the correlation filter and the scaler below never see validation rows.
# The stratify and random_state arguments are unchanged.
train_idx, val_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, stratify=y_encoded, random_state=42
)
y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

# -------------------------------
# 3️⃣ Drop highly correlated features (decided on training rows only)
# -------------------------------
corr_matrix = X.iloc[train_idx].corr().abs()
# Keep only the strict upper triangle so each feature pair is examined once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
X = X.drop(columns=to_drop)

# -------------------------------
# 4️⃣ Scale features (fit on training rows only)
# -------------------------------
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
# X_scaled is still produced for the whole frame so the name stays available,
# but its mean/std now come from the training rows alone.
X_scaled = scaler.transform(X)
X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]

# -------------------------------
# 5️⃣ Train Random Forest
# -------------------------------
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# -------------------------------
# 6️⃣ Train SVM (currently disabled)
# -------------------------------
# svm_model = SVC(
#     kernel='rbf',
#     probability=True,
#     random_state=42
# )
# svm_model.fit(X_train, y_train)

# -------------------------------
# 7️⃣ Evaluate models
# -------------------------------
rf_pred = rf_model.predict(X_val)
# svm_pred = svm_model.predict(X_val)

print("Random Forest Classification Report:")
print(classification_report(y_val, rf_pred, target_names=le.classes_))

# print("\nSVM Classification Report:")
# print(classification_report(y_val, svm_pred, target_names=le.classes_))

# -------------------------------
# 8️⃣ Save models and artifacts
# -------------------------------
# Create the output folder first; joblib.dump does not create missing directories.
Path("RandomForestDumps").mkdir(parents=True, exist_ok=True)

joblib.dump(rf_model, "RandomForestDumps/rf_model_multiclass.pkl")
# joblib.dump(svm_model, "svm_model_multiclass.pkl")
joblib.dump(scaler, "RandomForestDumps/supervised_scaler_multiclass.pkl")
joblib.dump(to_drop, "RandomForestDumps/supervised_dropped_features_multiclass.pkl")
joblib.dump(le, "RandomForestDumps/label_encoder.pkl")  # Needed to decode predictions later

print("\n✅ Multi-class supervised models saved for inference!")


Random Forest Classification Report:
                           precision    recall  f1-score   support

                   BENIGN       1.00      1.00      1.00    419297
                      Bot       0.91      0.64      0.75       391
                     DDoS       1.00      1.00      1.00     25603
            DoS GoldenEye       1.00      0.99      0.99      2057
                 DoS Hulk       1.00      1.00      1.00     34570
         DoS Slowhttptest       0.99      0.99      0.99      1046
            DoS slowloris       1.00      0.99      1.00      1077
              FTP-Patator       1.00      1.00      1.00      1187
               Heartbleed       1.00      1.00      1.00         2
             Infiltration       1.00      0.57      0.73         7
                 PortScan       0.99      1.00      0.99     18164
              SSH-Patator       1.00      1.00      1.00       644
  Web Attack  Brute Force       0.71      0.97      0.82       294
Web Attack  Sql Injectio