<a href="https://colab.research.google.com/github/durrapearl/mqtt/blob/main/LR_KNN_DT_RF_NB%20biflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from imblearn.over_sampling import ADASYN
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
# Mount Google Drive (if needed)
from google.colab import drive
drive.mount('/content/drive')

# Extract zipfile
import zipfile

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Define the path to the ZIP file and the extraction directory
zip_file_path = '/content/drive/MyDrive/biflow_features.zip'
extraction_path = '/content/biflow_features/'

# Create the extraction directory if it doesn't exist
import os
os.makedirs(extraction_path, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# List the extracted files
extracted_files = os.listdir(extraction_path)
print(f'Extracted files: {extracted_files}')

# Import the os module if not already imported
import os

# Define file paths for the CSV files
normal_csv_path = os.path.join(extraction_path, 'biflow_normal.csv')
scan_su_csv_path = os.path.join(extraction_path, 'biflow_scan_sU.csv')
sparta_csv_path = os.path.join(extraction_path, 'biflow_sparta.csv')
mqtt_bruteforce_csv_path = os.path.join(extraction_path, 'biflow_mqtt_bruteforce.csv')

# Load CSV files into DataFrames
normal_df = pd.read_csv(normal_csv_path)
scan_su_df = pd.read_csv(scan_su_csv_path)
sparta_df = pd.read_csv(sparta_csv_path)
mqtt_bruteforce_df = pd.read_csv(mqtt_bruteforce_csv_path)

# Add a 'class' column to each DataFrame
normal_df['class'] = '0'
scan_su_df['class'] = '1'
sparta_df['class'] = '2'
mqtt_bruteforce_df['class'] = '3'
# Concatenate the DataFrames into one combined_df
df = pd.concat([normal_df, scan_su_df, sparta_df, mqtt_bruteforce_df], ignore_index=True)

# Check the distribution of labels
label_counts = df['class'].value_counts()
print(label_counts)

# Define your feature matrix X and target variable y
y = df['class']  # 'class' column is your label
X = df.drop(columns=['class'])  # Exclude the 'class' column from features

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

Extracted files: ['biflow_normal.csv', 'biflow_scan_A.csv', 'biflow_scan_sU.csv', 'biflow_sparta.csv', 'biflow_mqtt_bruteforce.csv']
2    91318
0    86008
1    39664
3    16696
Name: class, dtype: int64


In [29]:
# Assuming 'ip_src' and 'ip_dst' are categorical columns in your original DataFrame
categorical_cols = ['ip_src', 'ip_dst']  # Add other categorical columns if needed
encoder = OneHotEncoder(sparse=False, drop='first')

# Apply one-hot encoding to categorical columns
X_encoded = encoder.fit_transform(X[categorical_cols])

# Normalize the numerical columns
numeric_cols = X.columns.difference(categorical_cols)
scaler = StandardScaler()
X_scaled_numeric = scaler.fit_transform(X[numeric_cols])

# Combine the one-hot encoded and scaled numeric features
X_final = np.hstack((X_encoded, X_scaled_numeric))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_encoded, random_state=42, stratify=y_encoded)

In [30]:
from collections import Counter
class_distribution = Counter(y_train)
print(class_distribution)

Counter({2: 68488, 0: 64506, 1: 29748, 3: 12522})


In [31]:
# Apply ADASYN for oversampling
adasyn = ADASYN(n_jobs=-1, sampling_strategy={2: 68488, 0: 64506, 1: 29748, 3: 12522})
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

In [32]:
# Initialize and train the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_resampled, y_train_resampled)
# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.57


In [33]:
# Initialize and train the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_classifier.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = dt_classifier.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.96


In [34]:
# Initialize classifiers
logistic_classifier = LogisticRegression(n_jobs=-1, random_state=0)
knn_classifier = KNeighborsClassifier(n_jobs=-1)
decision_tree_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
random_forest_classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0)
naive_bayes_classifier = GaussianNB()

In [35]:
# Train each classifier individually
logistic_classifier.fit(X_train_resampled, y_train_resampled)
knn_classifier.fit(X_train_resampled, y_train_resampled)
decision_tree_classifier.fit(X_train_resampled, y_train_resampled)
random_forest_classifier.fit(X_train_resampled, y_train_resampled)
naive_bayes_classifier.fit(X_train_resampled, y_train_resampled)


In [36]:
# Predict on the test set for each classifier
y_pred_logistic = logistic_classifier.predict(X_test)
y_pred_knn = knn_classifier.predict(X_test)
y_pred_decision_tree = decision_tree_classifier.predict(X_test)
y_pred_random_forest = random_forest_classifier.predict(X_test)
y_pred_naive_bayes = naive_bayes_classifier.predict(X_test)

In [37]:

# Calculate accuracy for each classifier
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)

print("Logistic Regression Accuracy:", accuracy_logistic)
print("K-Nearest Neighbors Accuracy:", accuracy_knn)
print("Decision Tree Accuracy:", accuracy_decision_tree)
print("Random Forest Accuracy:", accuracy_random_forest)
print("Naive Bayes Accuracy:", accuracy_naive_bayes)

Logistic Regression Accuracy: 0.6059532367943583
K-Nearest Neighbors Accuracy: 0.9452089966108658
Decision Tree Accuracy: 0.9571394337749478
Random Forest Accuracy: 0.9565745780699052
Naive Bayes Accuracy: 0.5669097257882305


In [38]:
from sklearn.metrics import confusion_matrix

# Print classification report and confusion matrix for each classifier
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))

print("K-Nearest Neighbors Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("K-Nearest Neighbors Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_random_forest))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_random_forest))

print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_naive_bayes))
print("Naive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_naive_bayes))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.65      0.56     21502
           1       1.00      0.57      0.72      9916
           2       0.57      0.51      0.54     22830
           3       1.00      1.00      1.00      4174

    accuracy                           0.61     58422
   macro avg       0.77      0.68      0.71     58422
weighted avg       0.65      0.61      0.61     58422

Logistic Regression Confusion Matrix:
[[14038     0  7464     0]
 [ 3181  5604  1130     1]
 [11243     0 11587     0]
 [    1     0     1  4172]]
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     21502
           1       0.95      0.93      0.94      9916
           2       0.95      0.95      0.95     22830
           3       1.00      1.00      1.00      4174

    accuracy                           0.95     58422
   mac

In [39]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_logistic)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_logistic, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_logistic, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_logistic, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_logistic, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)


Accuracy of IDS: 0.6059532367943583
Precision of IDS: 0.6470354821328458
Recall of IDS: 0.6059532367943583
Average F1 of IDS: 0.6113466267263069
F1 based IDS for each type of attack: [0.56191334 0.72216495 0.53877988 0.99964059]


In [40]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_knn)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_knn, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_knn, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_knn, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_knn, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.9452089966108658
Precision of IDS: 0.9454120114734772
Recall of IDS: 0.9452089966108658
Average F1 of IDS: 0.945264295633423
F1 based IDS for each type of attack: [0.93017037 0.94348269 0.95040014 0.99916097]


In [41]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_decision_tree)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_decision_tree, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_decision_tree, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_decision_tree, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_decision_tree, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.9571394337749478
Precision of IDS: 0.9571366784621756
Recall of IDS: 0.9571394337749478
Average F1 of IDS: 0.9571358048462923
F1 based IDS for each type of attack: [0.94744444 0.94823683 0.96233559 0.99976036]


In [42]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_random_forest)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_random_forest, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_random_forest, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_random_forest, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_random_forest, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.9565745780699052
Precision of IDS: 0.9567085798449381
Recall of IDS: 0.9565745780699052
Average F1 of IDS: 0.9565550897757712
F1 based IDS for each type of attack: [0.94702523 0.94723295 0.96165848 0.9998802 ]


In [43]:
# Calculate and print Accuracy
accuracy_nn = accuracy_score(y_test, y_pred_naive_bayes)
print("Accuracy of IDS:", accuracy_nn)

# Calculate and print Precision
precision_nn = precision_score(y_test, y_pred_naive_bayes, average='weighted')
print("Precision of IDS:", precision_nn)

# Calculate and print Recall
recall_nn = recall_score(y_test, y_pred_naive_bayes, average='weighted')
print("Recall of IDS:", recall_nn)

# Calculate and print Average F1-Score
f1_weighted_nn = f1_score(y_test, y_pred_naive_bayes, average='weighted')
print("Average F1 of IDS:", f1_weighted_nn)

# Calculate and print F1-Score for each type of attack
f1_per_class_nn = f1_score(y_test,y_pred_naive_bayes, average=None)
print("F1 based IDS for each type of attack:", f1_per_class_nn)

Accuracy of IDS: 0.5669097257882305
Precision of IDS: 0.7654726180992943
Recall of IDS: 0.5669097257882305
Average F1 of IDS: 0.5089293369873986
F1 based IDS for each type of attack: [0.62947285 0.71496145 0.27004793 0.7050773 ]
