<a href="https://colab.research.google.com/github/durrapearl/mqtt/blob/main/LR_KNN_DT_RF_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from imblearn.over_sampling import ADASYN
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
# Mount Google Drive (if needed)
from google.colab import drive
drive.mount('/content/drive')

# Extract zipfile
import zipfile

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define the path to the ZIP file and the extraction directory
zip_file_path = '/content/drive/MyDrive/uniflow_features.zip'
extraction_path = '/content/uniflow_features/'

# Create the extraction directory if it doesn't exist
import os
os.makedirs(extraction_path, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# List the extracted files
extracted_files = os.listdir(extraction_path)
print(f'Extracted files: {extracted_files}')

# Import the os module if not already imported
import os

# Define file paths for the CSV files
normal_csv_path = os.path.join(extraction_path, 'uniflow_normal.csv')
scan_su_csv_path = os.path.join(extraction_path, 'uniflow_scan_sU.csv')
sparta_csv_path = os.path.join(extraction_path, 'uniflow_sparta.csv')
mqtt_bruteforce_csv_path = os.path.join(extraction_path, 'uniflow_mqtt_bruteforce.csv')

# Load CSV files into DataFrames
normal_df = pd.read_csv(normal_csv_path)
scan_su_df = pd.read_csv(scan_su_csv_path)
sparta_df = pd.read_csv(sparta_csv_path)
mqtt_bruteforce_df = pd.read_csv(mqtt_bruteforce_csv_path)

# Add a 'class' column to each DataFrame
normal_df['class'] = '0'
scan_su_df['class'] = '1'
sparta_df['class'] = '2'
mqtt_bruteforce_df['class'] = '3'
# Concatenate the DataFrames into one combined_df
df = pd.concat([normal_df, scan_su_df, sparta_df, mqtt_bruteforce_df], ignore_index=True)

# Check the distribution of labels
label_counts = df['class'].value_counts()
print(label_counts)

# Define your feature matrix X and target variable y
y = df['class']  # 'class' column is your label
X = df.drop(columns=['class'])  # Exclude the 'class' column from features

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

Extracted files: ['uniflow_normal.csv', 'uniflow_mqtt_bruteforce.csv', 'uniflow_scan_sU.csv', 'uniflow_sparta.csv', 'uniflow_scan_A.csv']
2    182407
0    171836
1     56845
3     33079
Name: class, dtype: int64


In [4]:
# Assuming 'ip_src' and 'ip_dst' are categorical columns in your original DataFrame
categorical_cols = ['ip_src', 'ip_dst']  # Add other categorical columns if needed
encoder = OneHotEncoder(sparse=False, drop='first')

# Apply one-hot encoding to categorical columns
X_encoded = encoder.fit_transform(X[categorical_cols])

# Normalize the numerical columns
numeric_cols = X.columns.difference(categorical_cols)
scaler = StandardScaler()
X_scaled_numeric = scaler.fit_transform(X[numeric_cols])

# Combine the one-hot encoded and scaled numeric features
X_final = np.hstack((X_encoded, X_scaled_numeric))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_encoded, random_state=42, stratify=y_encoded)

In [5]:
from collections import Counter
class_distribution = Counter(y_train)
print(class_distribution)

Counter({2: 136805, 0: 128877, 1: 42634, 3: 24809})


In [6]:
# Apply ADASYN for oversampling
adasyn = ADASYN(n_jobs=-1, sampling_strategy={2: 136805, 0: 128877, 1: 42634, 3: 24809})
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

In [7]:
# Initialize and train the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_resampled, y_train_resampled)
# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.58


In [8]:
# Initialize and train the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = dt_classifier.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.91


In [13]:
# Initialize classifiers
logistic_classifier = LogisticRegression(n_jobs=-1, random_state=0)
knn_classifier = KNeighborsClassifier(n_jobs=-1)
decision_tree_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
random_forest_classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0)
naive_bayes_classifier = GaussianNB()

In [14]:
# Train each classifier individually
logistic_classifier.fit(X_train_resampled, y_train_resampled)
knn_classifier.fit(X_train_resampled, y_train_resampled)
decision_tree_classifier.fit(X_train_resampled, y_train_resampled)
random_forest_classifier.fit(X_train_resampled, y_train_resampled)
naive_bayes_classifier.fit(X_train_resampled, y_train_resampled)


In [15]:
# Predict on the test set for each classifier
y_pred_logistic = logistic_classifier.predict(X_test)
y_pred_knn = knn_classifier.predict(X_test)
y_pred_decision_tree = decision_tree_classifier.predict(X_test)
y_pred_random_forest = random_forest_classifier.predict(X_test)
y_pred_naive_bayes = naive_bayes_classifier.predict(X_test)

In [16]:

# Calculate accuracy for each classifier
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)

print("Logistic Regression Accuracy:", accuracy_logistic)
print("K-Nearest Neighbors Accuracy:", accuracy_knn)
print("Decision Tree Accuracy:", accuracy_decision_tree)
print("Random Forest Accuracy:", accuracy_random_forest)
print("Naive Bayes Accuracy:", accuracy_naive_bayes)

Logistic Regression Accuracy: 0.5874714072152879
K-Nearest Neighbors Accuracy: 0.8642855856342645
Decision Tree Accuracy: 0.9136542929702275
Random Forest Accuracy: 0.915158228418076
Naive Bayes Accuracy: 0.5753408620161741


In [18]:
from sklearn.metrics import confusion_matrix

# Print classification report and confusion matrix for each classifier
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))

print("K-Nearest Neighbors Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("K-Nearest Neighbors Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_random_forest))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_random_forest))

print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_naive_bayes))
print("Naive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_naive_bayes))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.64      0.56     42959
           1       1.00      0.40      0.57     14211
           2       0.57      0.52      0.55     45602
           3       1.00      1.00      1.00      8270

    accuracy                           0.59    111042
   macro avg       0.77      0.64      0.67    111042
weighted avg       0.63      0.59      0.59    111042

Logistic Regression Confusion Matrix:
[[27429     0 15530     0]
 [ 6408  5657  2141     5]
 [21722     0 23879     1]
 [    0     0     1  8269]]
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83     42959
           1       0.77      0.74      0.76     14211
           2       0.91      0.90      0.91     45602
           3       1.00      1.00      1.00      8270

    accuracy                           0.86    111042
   mac

In [21]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_logistic)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_logistic, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_logistic, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_logistic, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_logistic, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)


Accuracy of IDS: 0.5874714072152879
Precision of IDS: 0.6294065914543014
Recall of IDS: 0.5874714072152879
Average F1 of IDS: 0.5877862672493627
F1 based IDS for each type of attack: [0.55683225 0.56945843 0.54797884 0.99957691]


In [23]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_knn)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_knn, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_knn, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_knn, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_knn, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.8642855856342645
Precision of IDS: 0.864223619774099
Recall of IDS: 0.8642855856342645
Average F1 of IDS: 0.8641918801071837
F1 based IDS for each type of attack: [0.8290049  0.75788115 0.90597182 0.99927414]


In [24]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_decision_tree)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_decision_tree, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_decision_tree, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_decision_tree, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_decision_tree, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.9136542929702275
Precision of IDS: 0.9136470347697362
Recall of IDS: 0.9136542929702275
Average F1 of IDS: 0.9136490879219853
F1 based IDS for each type of attack: [0.89286797 0.84685887 0.9383798  1.        ]


In [25]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_random_forest)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_random_forest, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_random_forest, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_random_forest, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_random_forest, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.915158228418076
Precision of IDS: 0.916422850090562
Recall of IDS: 0.915158228418076
Average F1 of IDS: 0.9137912395621053
F1 based IDS for each type of attack: [0.89673516 0.82446809 0.94206055 1.        ]


In [26]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_naive_bayes)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_naive_bayes, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_naive_bayes, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_naive_bayes, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_naive_bayes, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.5753408620161741
Precision of IDS: 0.7909281137240582
Recall of IDS: 0.5753408620161741
Average F1 of IDS: 0.5074467117059621
F1 based IDS for each type of attack: [0.64533454 0.56376038 0.27067612 1.        ]
