In [1]:
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import pickle as pkl

In [None]:
def dataset_file_to_df(file, base_dir="datasets/Other"):
    """Load a CSV dataset and replace non-finite / missing values with 0.

    Parameters
    ----------
    file : str
        Name of the CSV file, relative to ``base_dir``.
    base_dir : str, default "datasets/Other"
        Directory containing the dataset files. The default reproduces the
        previously hard-coded location, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV in which every ``inf``, ``-inf`` and ``NaN`` has been
        replaced by ``0``.
    """
    read_file = f"{base_dir}/{file}"

    # `replace` returns a new frame; no in-place mutation is needed because
    # the freshly read frame is not referenced anywhere else.
    return pd.read_csv(read_file).replace([np.inf, -np.inf, np.nan], 0)

# Read the CIC-ToN-IoT CSV through the loader defined in this cell.
df = dataset_file_to_df(file="CIC-ToN-IoT.csv")


In [None]:
def normalize(df):
    """Return a copy of ``df`` with skewed numeric columns transformed.

    For every numeric column the sample skewness decides the transform:

    * skew > 0: ``log1p`` of the values, clipped below at -0.99 so the
      argument of the logarithm stays positive;
    * skew < 0: the values squared;
    * skew == 0 or undefined (NaN): the column is left unchanged.

    Non-numeric columns are copied through untouched and the input frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``df``.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    df_norm = df.copy()
    for col in numeric_columns:
        # Compute the skew once per column instead of once per branch.
        skew = df[col].skew()
        if skew > 0:
            # Clip keeps 1 + x > 0 so log1p never sees an invalid input.
            df_norm[col] = np.log1p(df[col].clip(lower=-0.99))
        elif skew < 0:
            # Square in float64: squaring an int64 column wraps around
            # silently once |x| exceeds ~3.04e9, producing negative garbage.
            df_norm[col] = df[col].astype("float64") ** 2
    return df_norm

# Apply the skew-based column transform; normalize() works on a copy of `df`.
df_norm = normalize(df=df)


In [None]:
# Feature matrix: everything except the flow identifiers, the timestamp and
# the "Label" / "Attack" columns.
X = df_norm.drop(columns=["Flow ID", "Src IP", "Timestamp", "Dst IP", "Label", "Attack"])

# Target: the "Attack" column, exposed under the name "Label".
# Series.rename takes the new name as its first argument; the `columns=`
# keyword only exists on DataFrame.rename and raises TypeError on a Series.
y = df_norm["Attack"].rename("Label")

In [None]:
def load_pkl(file):
    """Read the pickle stored at ``file`` and return the deserialized object.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.
    """
    with open(file, mode='rb') as handle:
        loaded = pkl.load(handle)
    return loaded

### SVM

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Polynomial-kernel SVM with fixed hyperparameters. fit() returns the
# estimator itself, so construction and training are chained.
# NOTE(review): X_train_pca / X_test_pca / y_train / y_test are not built in
# any cell visible here — confirm they exist on a fresh kernel.
svc = SVC(kernel='poly', C=10, gamma=0.1).fit(X_train_pca, y_train)

# Per-class report on the test arrays.
y_pred = svc.predict(X_test_pca)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=np.unique(y_test),
    )
)


              precision    recall  f1-score   support

      BENIGN       1.00      0.99      1.00    158976
         Bot       0.77      0.99      0.87       590
        DDoS       1.00      1.00      1.00     38408
         DoS       0.99      1.00      1.00     75799
  FTPPatator       0.99      1.00      1.00      2381
    PortScan       1.00      1.00      1.00     47679
  SSHPatator       0.95      1.00      0.97      1769
   WebAttack       0.82      0.99      0.90       654

    accuracy                           1.00    326256
   macro avg       0.94      1.00      0.97    326256
weighted avg       1.00      1.00      1.00    326256



### DecisionTreeClassifier

In [13]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

# Fixed hyperparameters for the tree.
# random_state is pinned because scikit-learn permutes the features at every
# split even with splitter='best', so an unseeded tree can differ between
# runs. 42 is the seed the SGD cell in this notebook already uses.
decision_tree = DecisionTreeClassifier(
    criterion='log_loss',
    max_depth=14,
    max_features=None,
    min_samples_leaf=8,
    min_samples_split=16,
    splitter='best',
    random_state=42,
)

# NOTE(review): X_train_pca / X_test_pca / y_train / y_test are not built in
# any cell visible here — confirm they exist on a fresh kernel.
decision_tree.fit(X_train_pca, y_train)

# Per-class report on the test arrays.
y_pred = decision_tree.predict(X_test_pca)
print(classification_report(y_test, y_pred, target_names=np.unique(y_test)))


              precision    recall  f1-score   support

      BENIGN       1.00      0.99      0.99    158976
         Bot       0.77      0.99      0.86       590
        DDoS       0.99      1.00      1.00     38408
         DoS       0.98      0.99      0.99     75799
  FTPPatator       0.97      1.00      0.98      2381
    PortScan       1.00      1.00      1.00     47679
  SSHPatator       0.92      0.99      0.95      1769
   WebAttack       0.78      0.97      0.86       654

    accuracy                           0.99    326256
   macro avg       0.93      0.99      0.95    326256
weighted avg       0.99      0.99      0.99    326256



### KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as integer codes; the encoder is kept so the
# predictions can be mapped back to the original labels below.
# NOTE(review): the encoder is not pickled together with `knn` later in the
# notebook — confirm the consumer of multi_knn.pkl can decode integer output.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# NOTE(review): `p` is the Minkowski power parameter; with
# metric='euclidean' it presumably has no effect — confirm and drop if so.
knn = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance',
    metric='euclidean',
    p=4,
).fit(X_train_pca, y_train_encoded)

# Predict integer codes, then translate them back before reporting.
y_pred = knn.predict(X_test_pca)
y_pred_labels = label_encoder.inverse_transform(y_pred)
print(
    classification_report(
        y_test,
        y_pred_labels,
        target_names=np.unique(y_test),
    )
)



              precision    recall  f1-score   support

      BENIGN       1.00      0.99      1.00    158976
         Bot       0.61      0.99      0.75       590
        DDoS       1.00      1.00      1.00     38408
         DoS       0.99      1.00      1.00     75799
  FTPPatator       0.99      1.00      1.00      2381
    PortScan       1.00      1.00      1.00     47679
  SSHPatator       0.96      1.00      0.98      1769
   WebAttack       0.88      0.99      0.93       654

    accuracy                           1.00    326256
   macro avg       0.93      1.00      0.96    326256
weighted avg       1.00      1.00      1.00    326256



### SGDClassifier

In [15]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# SGD-trained linear classifier with library defaults and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
sgd = SGDClassifier(random_state=42).fit(X_train_pca, y_train)

# Per-class report on the test arrays.
y_pred = sgd.predict(X_test_pca)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=np.unique(y_test),
    )
)


              precision    recall  f1-score   support

      BENIGN       0.99      0.92      0.95    158976
         Bot       0.09      0.85      0.16       590
        DDoS       1.00      1.00      1.00     38408
         DoS       0.92      0.98      0.95     75799
  FTPPatator       0.94      0.99      0.96      2381
    PortScan       1.00      0.99      1.00     47679
  SSHPatator       0.88      0.98      0.93      1769
   WebAttack       0.36      0.85      0.51       654

    accuracy                           0.95    326256
   macro avg       0.77      0.95      0.81    326256
weighted avg       0.97      0.95      0.96    326256



In [16]:
import pickle as pkl


def save_pkl(obj, file):
    """Serialize ``obj`` to ``file`` with pickle (write-side counterpart of ``load_pkl``)."""
    with open(file, 'wb') as f:
        pkl.dump(obj, f)


# Directory the pickled estimators are written to.
MODEL_DIR = "../app/ml_model"

# Resolve every model up front: if one of the names is undefined the cell
# fails here, before any existing pickle has been opened and truncated.
# NOTE(review): `linear_svc` is not defined in any cell visible here —
# confirm it is still trained somewhere on a fresh kernel.
trained_models = {
    "multi_linear_svc": linear_svc,
    "multi_svc": svc,
    "multi_decision_tree": decision_tree,
    "multi_knn": knn,
    "multi_sgd": sgd,
}

for model_name, model in trained_models.items():
    save_pkl(model, f"{MODEL_DIR}/{model_name}.pkl")