In [1]:
# Importing necessary modules
import pandas as pd
import numpy as np
import os
import pickle

# Importing Visualization Package
import matplotlib.pyplot as plt
import seaborn as sns

# Importing Scikit learn package
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix,
    accuracy_score
)
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import NearMiss
from datetime import datetime


In [2]:
# Load the combined dataset from notebook s
df = pd.read_csv('rawdata/combined_iot_data.csv')

print(f"Dataset loaded: {df.shape}")
print(f"\nClass distribution:")
print(df['label'].value_counts())
print(f"\nFirst rows:")
print(df.head())


Dataset loaded: (701648, 117)

Class distribution:
label
Mirai     652100
Benign     49548
Name: count, dtype: int64

First rows:
   MI_dir_L5_weight  MI_dir_L5_mean  MI_dir_L5_variance  MI_dir_L3_weight  \
0          1.000000       60.000000            0.000000          1.000000   
1          1.000000      354.000000            0.000000          1.000000   
2          1.857879      360.458980           35.789338          1.912127   
3          1.000000      337.000000            0.000000          1.000000   
4          1.680223      172.140917        18487.448750          1.793580   

   MI_dir_L3_mean  MI_dir_L3_variance  MI_dir_L1_weight  MI_dir_L1_mean  \
0       60.000000            0.000000          1.000000       60.000000   
1      354.000000            0.000000          1.000000      354.000000   
2      360.275733           35.923972          1.969807      360.091968   
3      337.000000            0.000000          1.000000      337.000000   
4      182.560279        18928.1

In [3]:
def knn_classifier_unbalanced(data, scaling=False):
    """
    KNN Classifier without balance
    """
    X = data.drop(['label', 'device'], axis=1).values
    y = data['label'].values
    
    print(f"Original Shape: {X.shape}, {y.shape}")
    
    # Split data
    if scaling == False:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=47)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=47)
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    
    # Train KNN
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    # Predictions
    y_pred = knn.predict(X_test)
    
    # Metrics
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"\nAccuracy on Training set: {knn.score(X_train, y_train)}")
    print(f"Accuracy on Test set: {knn.score(X_test, y_test)}")
    print("---------------------- Done ---------------------")


In [4]:
print("=" * 60)
print("KNN - WITHOUT BALANCING - WITHOUT SCALING")
print("=" * 60)
knn_classifier_unbalanced(df, scaling=False)


KNN - WITHOUT BALANCING - WITHOUT SCALING
Original Shape: (701648, 115), (701648,)

Confusion Matrix:
[[ 14790     77]
 [   140 195488]]

Classification Report:
              precision    recall  f1-score   support

      Benign       0.99      0.99      0.99     14867
       Mirai       1.00      1.00      1.00    195628

    accuracy                           1.00    210495
   macro avg       1.00      1.00      1.00    210495
weighted avg       1.00      1.00      1.00    210495


Accuracy on Training set: 0.9993382917339403
Accuracy on Test set: 0.9989690966531272
---------------------- Done ---------------------


In [5]:
print("\n" + "=" * 60)
print("KNN - WITHOUT BALANCING - WITH SCALING")
print("=" * 60)
knn_classifier_unbalanced(df, scaling=True)



KNN - WITHOUT BALANCING - WITH SCALING
Original Shape: (701648, 115), (701648,)

Confusion Matrix:
[[ 14866      1]
 [     3 195625]]

Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     14867
       Mirai       1.00      1.00      1.00    195628

    accuracy                           1.00    210495
   macro avg       1.00      1.00      1.00    210495
weighted avg       1.00      1.00      1.00    210495


Accuracy on Training set: 0.9999796397456597
Accuracy on Test set: 0.9999809971733296
---------------------- Done ---------------------


In [6]:
def knn_classifier_balanced(data, scaling=False):
    """
    KNN Classifier with balance using NearMiss
    """
    X = data.drop(['label', 'device'], axis=1).values
    y = data['label'].values
    
    print(f"Original Shape: {X.shape}, {y.shape}")
    
    # Balance data with NearMiss
    nm = NearMiss()
    X_res, y_res = nm.fit_resample(X, y)
    print(f"Balanced Shape: {X_res.shape}, {y_res.shape}")
    
    # Split data
    if scaling == False:
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.30, random_state=47)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.30, random_state=47)
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    
    # Train KNN
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    # Predictions
    y_pred = knn.predict(X_test)
    
    # Metrics
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"\nAccuracy on Training set: {knn.score(X_train, y_train)}")
    print(f"Accuracy on Test set: {knn.score(X_test, y_test)}")
    print("---------------------- Done ---------------------")


In [7]:
print("\n" + "=" * 60)
print("KNN - WITH BALANCING (NearMiss) - WITHOUT SCALING")
print("=" * 60)
knn_classifier_balanced(df, scaling=False)



KNN - WITH BALANCING (NearMiss) - WITHOUT SCALING
Original Shape: (701648, 115), (701648,)
Balanced Shape: (99096, 115), (99096,)

Confusion Matrix:
[[14843     0]
 [    0 14886]]

Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     14843
       Mirai       1.00      1.00      1.00     14886

    accuracy                           1.00     29729
   macro avg       1.00      1.00      1.00     29729
weighted avg       1.00      1.00      1.00     29729


Accuracy on Training set: 0.9999423356927646
Accuracy on Test set: 1.0
---------------------- Done ---------------------


In [8]:
print("\n" + "=" * 60)
print("KNN - WITH BALANCING (NearMiss) - WITH SCALING")
print("=" * 60)
knn_classifier_balanced(df, scaling=True)



KNN - WITH BALANCING (NearMiss) - WITH SCALING
Original Shape: (701648, 115), (701648,)
Balanced Shape: (99096, 115), (99096,)

Confusion Matrix:
[[14843     0]
 [    0 14886]]

Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     14843
       Mirai       1.00      1.00      1.00     14886

    accuracy                           1.00     29729
   macro avg       1.00      1.00      1.00     29729
weighted avg       1.00      1.00      1.00     29729


Accuracy on Training set: 0.9999855839231911
Accuracy on Test set: 1.0
---------------------- Done ---------------------


In [9]:
# Train the best model (e.g., with scaling and balancing)
X = df.drop(['label', 'device'], axis=1).values
y = df['label'].values

# Balance
nm = NearMiss()
X_res, y_res = nm.fit_resample(X, y)

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.30, random_state=47)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train
best_model = KNeighborsClassifier(n_neighbors=3)
best_model.fit(X_train, y_train)

# Save model
model_path = 'models'
if not os.path.exists(model_path):
    os.makedirs(model_path)

model_name = 'DanminiDoorbell_KNN_balanced_scaled.pkl'
model_full_path = os.path.join(model_path, model_name)

with open(model_full_path, 'wb') as f:
    pickle.dump(best_model, f)

print(f"Model saved at: {model_full_path}")
print(f"Accuracy in test: {best_model.score(X_test, y_test):.4f}")


Model saved at: models\DanminiDoorbell_KNN_balanced_scaled.pkl
Accuracy in test: 1.0000
