## Imports

In [1]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd  # For data handling
import numpy as np  # For numerical operations
import seaborn as sns
import matplotlib.pyplot as plt
import os
import requests

# Import libraries for machine learning and deep learning
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting data into train and test sets
from sklearn.ensemble import RandomForestClassifier  # For Random Forest model
import tensorflow as tf  # For building deep learning models
from tensorflow.keras.models import Sequential  # For building sequential models
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, Bidirectional, BatchNormalization
)  # For defining layers in neural networks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint  # Callbacks for training
from tensorflow.keras.optimizers import Adam  # Optimizer for neural networks

# Import libraries for evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)  # For model evaluation

# Import custom utility functions from a script
from scripts.utils import create_sequences, preprocess, perform_anova

## Datasets:

Botnet Class Labels
- **clear**
- **neris**
- **rbot**
- **fast flux**
- **donbot**
- **qvod**

ToN IoT Class Labels
- **clear**
- **backdoor**
- **dos**
- **injection**
- **mitm**
- **password**
- **ransomware**
- **scanning**
- **xss**

### Download Datasets


In [2]:
urls = {
    'botnet_multiclass': 'https://zenodo.org/records/8035724/files/botnet_multiclass.csv?download=1',
    'ton_iot_multiclass': 'https://zenodo.org/records/8035724/files/ton_iot_multiclass.csv?download=1',
}

save_dir = 'datasets/'

if not os.path.exists(save_dir):
    for filename, url in urls.items():
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(os.path.join(save_dir, filename + '.csv'), 'wb') as file:
                for chunk in response.iter_content(chunk_size=128):
                    file.write(chunk)
        else:
            print(f"Failed to download {filename}")

df_primary = pd.read_csv('datasets/botnet_multiclass.csv')
# df_secondary = pd.read_csv('datasets/ton_iot_multiclass.csv')

### ANOVA
- Low P Value: statistical significance
- High V Value: seperation from group mean

In [3]:
anova_df = perform_anova(df = df_primary, target_column = 'LABEL')
anova_df.T

Unnamed: 0,Unnamed: 0.1,PEARSON_SK1_SKEWNESS,PEARSON_SK2_SKEWNESS,FISHER_MI_3_SKEWNESS,ENTROPY,SCALED_ENTROPY,HURST_EXPONENT,P_BENFORD,TIME_DISTRIBUTION,AREA_VALUES_DISTRIBUTION,...,SPECTRAL_ENERGY,POWER_MEAN,SPECTRAL_FLUX,POWER_STD,MAX_POWER,CNT_ZEROS,SPECTRAL_CREST,SPECTRAL_ENTROPY,SPECTRAL_CENTROID,GALTON_SKEWNESS
F-Value,1502.211269,8743.866331,2608.909682,2664.922306,4036.56304,12086.719545,6088.641145,4249.979261,2151.918931,9860.27935,...,1.644773,1.644709,1.339223,1.226269,1.207071,1.04967,0.797632,0.777401,0.194402,0.175223
P-Value,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.144348,0.144364,0.244262,0.293668,0.302814,0.386333,0.551129,0.565806,0.96479,0.971926


### Label Values for Datasets

In [4]:
primary_first = df_primary.copy()
primary_first['TIME_FIRST'] = pd.to_datetime(df_primary['TIME_FIRST'], unit='s')

primary_second = primary_first.copy()

date_one = pd.to_datetime('2011-08-16').date()
date_two = pd.to_datetime('2011-08-10').date()

primary_first = primary_first[primary_first['TIME_FIRST'].dt.date == date_one]
primary_second = primary_second[primary_second['TIME_FIRST'].dt.date == date_two]

primary_first['LABEL'].value_counts(), primary_second['LABEL'].value_counts()

(clear        37983
 qvod           277
 fast_flux      166
 donbot          27
 Name: LABEL, dtype: int64,
 clear    15354
 neris     6176
 Name: LABEL, dtype: int64)

In [5]:
columns_to_keep = [
    'PACKETS',
    'PACKETS_REV',
    'BYTES',
    'BYTES_REV',
    'DURATION',
    'BURSTINESS',
    'ENTROPY',
    'SCALED_ENTROPY',
    'HURST_EXPONENT',
    'CNT_ZEROS',
    'CNT_NZ_DISTRIBUTION',
    'TIME_DISTRIBUTION',
    'PEARSON_SK1_SKEWNESS',
    'LABEL'
]

df_primary_processed = preprocess(df = df_primary, columns_to_keep = columns_to_keep, IP = '147.32.84.165', days = ['2011-08-16'])
df_secondary_processed = preprocess(df = df_primary, columns_to_keep = columns_to_keep, IP = '147.32.84.165', days = ['2011-08-10'])
# df_secondary_processed = preprocess(df = df_secondary, columns_to_keep = columns_to_keep, IP = '192.168.1.195', days = ['2019-04-03', '2019-04-26'], sample_size = df_primary_processed.shape[0])

## Data Vizualization
- **Boxen**
- **KDE**
- **Histogram**
- **Pairplot**

In [6]:
df_features = df_primary_processed.drop('LABEL', axis=1)

# Iterating through each column to create Boxen, KDE, and Histogram plots and save them
for col in df_features.columns:
    plt.figure(figsize=(20, 5))

    # Boxen plot
    plt.subplot(1, 3, 1)
    sns.boxenplot(data=df_features, x=col)
    plt.title(f'Boxenplot of {col}')

    # KDE plot
    plt.subplot(1, 3, 2)
    sns.kdeplot(data=df_features, x=col, fill=True)
    plt.title(f'KDE Plot of {col}')

    # Histogram of 'LABEL'
    plt.subplot(1, 3, 3)
    sns.histplot(data=df_primary_processed, x='LABEL', bins=10, kde=False)
    plt.title('Histogram of LABEL')

    # Save the figure
    plt.tight_layout()
    plt.savefig(f'viz/{col}.png')
    plt.close()

plt.figure()
sns.pairplot(df_features)
plt.tight_layout()
plt.savefig('viz/pairplot.png')
plt.close()

<Figure size 432x288 with 0 Axes>

## Random Forest Model

### Data Preparation
- Data is prepared for the Random Forest model by separating the target variable 'LABEL' from the features.
- The dataset is split into training and testing sets using the `train_test_split` function.

In [7]:
# Preparing data for Random Forest
X_rf = df_primary_processed.drop('LABEL', axis=1)
y_rf = df_primary_processed['LABEL']

# Splitting the dataset into training and testing sets for Random Forest
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)

### Model Creation
- A Random Forest classifier with 100 decision trees is created using the `RandomForestClassifier` from scikit-learn.
- The classifier is trained on the training data using the `fit` method.

In [8]:

# Parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 150, 200],  # Different number of trees in the forest
    'max_depth': [5, 10, None],       # Maximum number of levels in each decision tree
    'min_samples_split': [2, 4, 6],   # Minimum number of data points placed in a node before the node is split
    'min_samples_leaf': [1, 2, 4]     # Minimum number of data points allowed in a leaf node
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_rf, y_train_rf)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:", best_params)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found by GridSearchCV: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 100}


In [9]:
# Random Forest Model
rf_classifier = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42  
)

rf_classifier.fit(X_train_rf, y_train_rf)

RandomForestClassifier(max_depth=10, min_samples_split=6, random_state=42)

### Predictions
- Predictions are made on the test data using the trained Random Forest model. These predictions are stored in the variable `y_pred_rf`.

In [10]:
# Predictions with Random Forest
y_pred_rf = rf_classifier.predict(X_test_rf)

### Evaluation Metrics
- To assess the model's performance, several evaluation metrics are calculated:
  - **Accuracy**: Measures the proportion of correctly classified instances.
  - **Precision**: Measures the ability to correctly identify positive cases.
  - **Recall**: Measures the ability to find all positive cases.
  - **F1 Score**: Combines precision and recall into a single metric.
  - **ROC-AUC**: Measures the area under the Receiver Operating Characteristic curve, indicating the model's ability to distinguish between classes.

In [11]:
# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
precision_rf = precision_score(y_test_rf, y_pred_rf)
recall_rf = recall_score(y_test_rf, y_pred_rf)
f1_rf = f1_score(y_test_rf, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test_rf, y_pred_rf)

print("Random Forest Model:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_rf}")
print(f"ROC-AUC: {roc_auc_rf}")

Random Forest Model:
Accuracy: 0.9995787700084247
Precision: 0.9896907216494846
Recall: 0.9896907216494846
F1 Score: 0.9896907216494846
ROC-AUC: 0.9947378570621106


## Basic LSTM Model

### Data Preparation
- Sequences are created from the primary dataset with a time step of 1.
- The dataset is then split into training and testing sets using the `train_test_split` function.

In [12]:
# Creating sequences for Basic LSTM
n_steps_basic = 1
X_lstm_basic, y_lstm_basic = create_sequences(df_primary_processed, n_steps_basic)

# Splitting the dataset into training and testing sets for Basic LSTM
X_train_basic, X_test_basic, y_train_basic, y_test_basic = train_test_split(X_lstm_basic, y_lstm_basic, test_size=0.2, random_state=42)

### Model Architecture
- A Basic LSTM model is constructed using Keras with the following architecture:
  - A single LSTM layer with 10 units and a linear activation function.
  - A Dense layer with a sigmoid activation function for binary classification.
- The Adam optimizer with a learning rate of 0.1 is used.

### Model Training
- The Basic LSTM model is trained on the training data using the `fit` method with specified epochs and batch size.

In [13]:
# Building the Basic LSTM Model
model_basic = Sequential()
model_basic.add(LSTM(10, activation='linear', input_shape=(n_steps_basic, X_train_basic.shape[2])))
model_basic.add(Dense(1, activation='sigmoid'))

# Large learning rate for Basic LSTM
adam_basic = Adam(learning_rate=0.1)

model_basic.compile(optimizer=adam_basic, loss='mse', metrics=['accuracy'])

# Fit Basic LSTM model
model_basic.fit(X_train_basic, y_train_basic, epochs=10, batch_size=32, validation_data=(X_test_basic, y_test_basic))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1eb025b0f40>

### Predictions
- After training, predictions are made on the test data using the trained Basic LSTM model. Predictions are rounded to obtain binary classification results, which are stored in `y_pred_basic`.


In [14]:
# Predictions with Basic LSTM
y_pred_basic = np.round(model_basic.predict(X_test_basic))



### Evaluation Metrics
- To assess the performance of the model, several evaluation metrics are calculated:
  - **Accuracy**: Measures the proportion of correctly classified instances.
  - **Precision**: Measures the ability to correctly identify positive cases.
  - **Recall**: Measures the ability to find all positive cases.
  - **F1 Score**: Combines precision and recall into a single metric.
  - **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values.

In [15]:
# Evaluate the Basic LSTM model
accuracy_basic = accuracy_score(y_test_basic, y_pred_basic)
precision_basic = precision_score(y_test_basic, y_pred_basic,  zero_division=0)
recall_basic = recall_score(y_test_basic, y_pred_basic,  zero_division=0)
f1_basic = f1_score(y_test_basic, y_pred_basic)
roc_auc_basic = roc_auc_score(y_test_basic, y_pred_basic)

print("Basic LSTM Model:")
print(f"Accuracy: {accuracy_basic}")
print(f"Precision: {precision_basic}")
print(f"Recall: {recall_basic}")
print(f"F1 Score: {f1_basic}")
print(f"ROC-AUC: {roc_auc_basic}")

Basic LSTM Model:
Accuracy: 0.9799915754001685
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC-AUC: 0.49978517722878624


## Differences Between Basic and Enhanced LSTM Models

- **Time Steps**:
  - In the basic model, the time step was set to 1.
  - In the enhanced model, the time step was increased to 5 for more complex sequence learning.

- **LSTM Units**: 
  - Basic model uses only 10 units with linear activation.
  - Enhanced model uses 100 units in a Bidirectional LSTM with `tanh` activation, followed by another LSTM layer with 50 units and `relu` activation.

- **Learning Rate**: 
  - Basic model uses a high learning rate (0.1).
  - Enhanced model uses the default learning rate of the Adam optimizer.

- **Loss Function**: 
  - Basic model uses mean squared error (MSE), not typical for binary classification.
  - Enhanced model uses binary crossentropy.

- **Regularization and Normalization**: 
  - Basic model lacks additional layers for regularization.
  - Enhanced model includes Dropout and Batch Normalization layers.

- **Model Architecture**: 
  - Basic model is simpler with fewer layers.
  - Enhanced model is more complex with Bidirectional and stacked LSTM layers.

- **Callbacks**: 
  - Basic model does not use any callbacks.
  - Enhanced model employs Early Stopping and Model Checkpointing.

In [16]:
# Creating sequences for Enhanced LSTM
n_steps_enhanced = 5
X_lstm_enhanced, y_lstm_enhanced = create_sequences(df_primary_processed, n_steps_enhanced)

# Splitting the dataset into training and testing sets for Enhanced LSTM
X_train_enhanced, X_test_enhanced, y_train_enhanced, y_test_enhanced = train_test_split(X_lstm_enhanced, y_lstm_enhanced, test_size=0.2, random_state=42)

### Model Creation

In [17]:
# Building the Enhanced LSTM Model
model_enhanced = Sequential()
model_enhanced.add(Bidirectional(LSTM(100, return_sequences=True, activation='tanh'), input_shape=(n_steps_enhanced, X_train_enhanced.shape[2])))
model_enhanced.add(Dropout(0.2))
model_enhanced.add(BatchNormalization())
model_enhanced.add(LSTM(50, activation='relu'))
model_enhanced.add(Dense(1, activation='sigmoid'))
model_enhanced.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks for Enhanced LSTM
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True)

# Fit Enhanced LSTM model
model_enhanced.fit(X_train_enhanced, y_train_enhanced, epochs=10, batch_size=32, validation_data=(X_test_enhanced, y_test_enhanced))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1eb064ec2e0>

### Predictions

In [18]:
# Predictions with Enhanced LSTM
y_pred_enhanced = np.round(model_enhanced.predict(X_test_enhanced))



### Evaluation Metrics

In [19]:
# Evaluate the Enhanced LSTM model
accuracy_enhanced = accuracy_score(y_test_enhanced, y_pred_enhanced)
precision_enhanced = precision_score(y_test_enhanced, y_pred_enhanced)
recall_enhanced = recall_score(y_test_enhanced, y_pred_enhanced)
f1_enhanced = f1_score(y_test_enhanced, y_pred_enhanced)
roc_auc_enhanced = roc_auc_score(y_test_enhanced, y_pred_enhanced)

print("Enhanced LSTM Model:")
print(f"Accuracy: {accuracy_enhanced}")
print(f"Precision: {precision_enhanced}")
print(f"Recall: {recall_enhanced}")
print(f"F1 Score: {f1_enhanced}")
print(f"ROC-AUC: {roc_auc_enhanced}")

Enhanced LSTM Model:
Accuracy: 0.9898883505371814
Precision: 0.7901234567901234
Recall: 0.6736842105263158
F1 Score: 0.7272727272727273
ROC-AUC: 0.8350149341539574


### Secondary Dataset Enhanced Model

In [20]:
# Creating sequences for Enhanced Secondary LSTM
n_steps_enhanced_secondary = 5
X_lstm_enhanced_secondary, y_lstm_enhanced_secondary = create_sequences(df_secondary_processed, n_steps_enhanced_secondary)

# Splitting the dataset into training and testing sets for Enhanced Secondary LSTM
X_train_enhanced_secondary, X_test_enhanced_secondary, y_train_enhanced_secondary, y_test_enhanced_secondary = train_test_split(X_lstm_enhanced_secondary, y_lstm_enhanced_secondary, test_size=0.2, random_state=42)

# Predictions using the Enhanced Secondary LSTM model
y_pred_enhanced_secondary = np.round(model_enhanced.predict(X_test_enhanced_secondary))

# Evaluate the Enhanced Secondary LSTM model
accuracy_enhanced_secondary = accuracy_score(y_test_enhanced_secondary, y_pred_enhanced_secondary)
precision_enhanced_secondary = precision_score(y_test_enhanced_secondary, y_pred_enhanced_secondary, zero_division=0)
recall_enhanced_secondary = recall_score(y_test_enhanced_secondary, y_pred_enhanced_secondary, zero_division=0)
f1_enhanced_secondary = f1_score(y_test_enhanced_secondary, y_pred_enhanced_secondary)
roc_auc_enhanced_secondary = roc_auc_score(y_test_enhanced_secondary, y_pred_enhanced_secondary)

print("Enhanced Secondary LSTM Model Evaluation:")
print(f"Accuracy: {accuracy_enhanced_secondary}")
print(f"Precision: {precision_enhanced_secondary}")
print(f"Recall: {recall_enhanced_secondary}")
print(f"F1 Score: {f1_enhanced_secondary}")
print(f"ROC-AUC: {roc_auc_enhanced_secondary}")

Enhanced Secondary LSTM Model Evaluation:
Accuracy: 0.7310946589106293
Precision: 0.8865546218487395
Recall: 0.17568692756036636
F1 Score: 0.29325920778318276
ROC-AUC: 0.5826129329781684


### Secondary Dataset Random Forest Model

In [21]:
# Preparing data for Random Forest
X_rf_secondary = df_secondary_processed.drop('LABEL', axis=1)
y_rf_secondary = df_secondary_processed['LABEL']

# Splitting the dataset into training and testing sets for Random Forest
X_train_rf_secondary , X_test_rf_secondary , y_train_rf_secondary , y_test_rf_secondary = train_test_split(X_rf_secondary, y_rf_secondary, test_size=0.2, random_state=42)

# Predictions using the Random Forest model for the secondary dataset
y_pred_rf_secondary = rf_classifier.predict(X_test_rf_secondary)

# Evaluate the Random Forest model for the secondary dataset
accuracy_rf_secondary = accuracy_score(y_test_rf_secondary, y_pred_rf_secondary)
precision_rf_secondary = precision_score(y_test_rf_secondary, y_pred_rf_secondary, zero_division=0)
recall_rf_secondary = recall_score(y_test_rf_secondary, y_pred_rf_secondary, zero_division=0)
f1_rf_secondary = f1_score(y_test_rf_secondary, y_pred_rf_secondary)
roc_auc_rf_secondary = roc_auc_score(y_test_rf_secondary, y_pred_rf_secondary)

print("Random Forest Model Evaluation for the Secondary Dataset:")
print(f"Accuracy: {accuracy_rf_secondary}")
print(f"Precision: {precision_rf_secondary}")
print(f"Recall: {recall_rf_secondary}")
print(f"F1 Score: {f1_rf_secondary}")
print(f"ROC-AUC: {roc_auc_rf_secondary}")

Random Forest Model Evaluation for the Secondary Dataset:
Accuracy: 0.8567274649748876
Precision: 1.0
Recall: 0.5564648117839607
F1 Score: 0.7150368033648791
ROC-AUC: 0.7782324058919803
