In [27]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import time
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix

In [39]:
import os
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline


In [40]:
data = pd.read_csv('data/modified_utc_dataset.csv')

# Initial data exploration
print(data.head())

   Duration Protocol Direction State  Source_Type_of_Service  \
0  1.026539      tcp        ->  S_RA                     0.0   
1  1.009595      tcp        ->  S_RA                     0.0   
2  3.056586      tcp        ->  SR_A                     0.0   
3  3.111769      tcp        ->  SR_A                     0.0   
4  3.083411      tcp        ->  SR_A                     0.0   

   Destination_Type_of_Service  Total_Packets  Total_Bytes  Source_Bytes  \
0                          0.0              4          276           156   
1                          0.0              4          276           156   
2                          0.0              3          182           122   
3                          0.0              3          182           122   
4                          0.0              3          182           122   

                                    Label  
0  flow=Background-Established-cmpgw-CVUT  
1  flow=Background-Established-cmpgw-CVUT  
2             flow=Backgro

In [41]:
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna()

**Feature selection**
- we have seen the correlation between 'Total_Packets' and 'Total_Bytes' is approximately one.
- The 'label's purpose is to help us identify whether the observation is from a botnet or not.

In [42]:
columns_to_drop = ['Label', 'Total_Bytes']
columns_to_drop = [col for col in columns_to_drop if col in data.columns]

data['botnet'] = data['Label'].apply(lambda x: 1 if 'flow=From-Botnet' in x else 0)

data = data.drop(columns_to_drop, axis =1)

In [43]:
# Encoding categorical features into numerical
data['Protocol'] = data['Protocol'].astype('category').cat.codes
data['Protocol'] = data['Protocol'].astype(np.int32)
data['Direction'] = data['Direction'].astype('category').cat.codes
data['Direction'] = data['Direction'].astype(np.int32)
data['State'] = data['State'].astype('category').cat.codes
data['State'] = data['State'].astype(np.int32)

In [44]:
X = data.drop('botnet', axis =1)
y = data['botnet']

## Feature importance 
**Some features doesn't even have a single observation representing the other class**

In [45]:
# Check for zero variance features and remove them
variance = X.var()
zero_variance_features = variance[variance == 0].index
if len(zero_variance_features) > 0:
    X.drop(columns=zero_variance_features, inplace=True)


In [None]:
## Class Imbalance
The dataset exhibits a significant class imbalance between two classes: 'not-botnet' (0) and 'botnet' (1). Here is the distribution:

'not-botnet' (0): 1,587,187 instances
'botnet' (1): 33,986 instances
This class distribution indicates a severe imbalance, where the majority class ('not-botnet') heavily outweighs the minority class ('botnet').

In [46]:
# Apply chi-squared test and create a dictionary that holds the p-value and the features.
chi_scores, p_values = chi2(X, y)
chi2_results = {feature: p_value for feature, p_value in zip(X.columns, p_values)}

print(chi2_results)

{'Duration': 0.0, 'Protocol': 5.052265651175802e-59, 'Direction': 0.0, 'State': 0.0, 'Source_Type_of_Service': 7.057826559746228e-25, 'Destination_Type_of_Service': 0.00019255073119190503, 'Total_Packets': 0.0, 'Source_Bytes': 0.0}


In [47]:
y = to_categorical(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def create_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Custom KerasClassifier wrapper
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, build_fn=None, epochs=1, batch_size=32, verbose=0, **kwargs):
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.kwargs = kwargs
        self.model_ = None

    def fit(self, X, y, **kwargs):
        # Ensure y is correctly shaped
        if len(y.shape) == 3:
            y = np.squeeze(y)
        self.model_ = self.build_fn(**self.kwargs)
        self.history_ = self.model_.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        return self

    def predict(self, X, **kwargs):
        return np.argmax(self.model_.predict(X), axis=1)
# Wrap the model using the custom wrapper
model = KerasClassifierWrapper(build_fn=create_model, epochs=50, batch_size=32, verbose=1)

# Define hyperparameters for GridSearchCV
params = {
    'epochs': [50, 100],
    'batch_size': [32, 64]
}

# Hyperparameter tuning and model evaluation
grid = GridSearchCV(model, param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Plot training history
history = grid.best_estimator_.model_.history.history


Epoch 1/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9788 - loss: 0.0959
Epoch 2/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1ms/step - accuracy: 0.9819 - loss: 0.0811
Epoch 3/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9817 - loss: 0.0828
Epoch 4/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9820 - loss: 0.0815
Epoch 5/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9819 - loss: 0.0855
Epoch 6/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9820 - loss: 0.0814
Epoch 7/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9820 - loss: 0.0821
Epoch 8/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9821 - loss: 0.0821


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9824 - loss: 0.0808
Epoch 37/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9820 - loss: 0.0865
Epoch 40/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9820 - loss: 0.1148
Epoch 41/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9821 - loss: 0.0833
Epoch 42/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9820 - loss: 0.3118
Epoch 43/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9823 - loss: 0.1060
Epoch 44/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9823 - loss: 0.1094
Epoch 45/50
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - accuracy: 0.9824 - loss: 0.0891
Epoc

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9819 - loss: 0.0881
Epoch 16/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 2ms/step - accuracy: 0.9819 - loss: 0.0864
Epoch 17/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2ms/step - accuracy: 0.9823 - loss: 0.0965
Epoch 18/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9821 - loss: 0.1058
Epoch 19/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - accuracy: 0.9818 - loss: 0.0904
Epoch 21/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9821 - loss: 0.0893
Epoch 22/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9819 - loss: 0.1139
Epoch 23/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9819 - loss: 0.15

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9822 - loss: 0.0972
Epoch 95/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9821 - loss: 0.1042
Epoch 98/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 2ms/step - accuracy: 0.9820 - loss: 0.1772
Epoch 99/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 2ms/step - accuracy: 0.9820 - loss: 0.0947
Epoch 100/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - accuracy: 0.9819 - loss: 0.0882
Epoch 2/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - accuracy: 0.9824 - loss: 0.0804
Epoch 3/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - accuracy: 0.9821 - loss: 0.0802
Epoch 4/100
[1m30505/30505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2ms/step - accuracy: 0.9822 - loss: 0.0847

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot accuracy
ax1.plot(history['accuracy'], label='Train Accuracy')
ax1.plot(history['val_accuracy'], label='Validation Accuracy')
ax1.set_title('Model Accuracy')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy')
ax1.legend()

# Plot loss
ax2.plot(history['loss'], label='Train Loss')
ax2.plot(history['val_loss'], label='Validation Loss')
ax2.set_title('Model Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.legend()

plt.show()

# Evaluate on test set
y_pred = grid.best_estimator_.predict(X_test)

# Convert predictions and true labels from categorical to binary
y_pred_binary = np.argmax(to_categorical(y_pred), axis=1)
y_test_binary = np.argmax(y_test, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='weighted')
recall = recall_score(y_test_binary, y_pred_binary, average='weighted')
f1 = f1_score(y_test_binary, y_pred_binary, average='weighted')
roc_auc = roc_auc_score(y_test_binary, to_categorical(y_pred_binary), multi_class='ovr')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC-AUC: {roc_auc}")
