<a href="https://colab.research.google.com/github/chibuezedev/ddos-detector/blob/main/ddos_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dask[dataframe]
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
import os
import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [2]:

class NetworkDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        features: Array-like of per-sample feature rows. Stored as a
            float32 tensor.
        labels: Array-like with one integer class label per sample. Stored
            as an int64 tensor.

    Raises:
        ValueError: If ``features`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, features, labels):
        # torch.as_tensor with an explicit dtype replaces the legacy
        # FloatTensor/LongTensor constructors. Note that it skips the copy
        # when the input already has the requested dtype.
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.int64)

        # __len__ is driven by the features, so a mismatch would otherwise
        # either ignore surplus labels or fail with an IndexError mid-epoch.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, got "
                f"{len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]

In [3]:
class CNN1D(nn.Module):
    """Two-layer 1-D convolutional classifier over a flat feature vector.

    Each ``(batch, input_size)`` input is treated as a single-channel
    sequence. Both convolutions use ``kernel_size=3, padding=1`` and so keep
    the sequence length; each ``MaxPool1d(2)`` halves it (rounding down).
    After two pools the 64-channel feature map therefore flattens to
    ``64 * (input_size // 4)`` values.

    Args:
        input_size: Number of features per sample. Must be at least 4 so
            that something is left after the two pooling stages.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after the first fully
            connected layer. Defaults to 0.5.

    Raises:
        ValueError: If ``input_size`` is smaller than 4 or ``num_classes``
            is smaller than 1.
    """

    def __init__(self, input_size, num_classes, dropout=0.5):
        # Fail fast: with input_size < 4 the flattened size would be zero
        # and the network could only fail later, inside pooling.
        if input_size < 4:
            raise ValueError(
                f"input_size must be >= 4 to survive two MaxPool1d(2) stages, got {input_size}"
            )
        if num_classes < 1:
            raise ValueError(f"num_classes must be >= 1, got {num_classes}")

        super().__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)

        # 64 channels times the sequence length left after two halvings.
        self.flatten_size = 64 * (input_size // 4)

        self.fc1 = nn.Linear(self.flatten_size, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Float tensor of shape ``(batch, input_size)``.

        Raises:
            ValueError: If ``x`` is not 2-dimensional.
        """
        if x.dim() != 2:
            raise ValueError(
                f"expected input of shape (batch, features), got {tuple(x.shape)}"
            )

        # Add the channel axis Conv1d expects: (batch, 1, features).
        x = x.unsqueeze(1)
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))

        # Collapse channels and positions into one vector per sample.
        x = x.flatten(1)
        x = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(x)

In [4]:
def preprocess_data(data_df, feature_columns=None):
    """Select the model features, encode the labels and standardize.

    Args:
        data_df: DataFrame holding the feature columns and a ``'Label'``
            column.
        feature_columns: Optional list of column names to use as features.
            Defaults to the ten traffic features used by this notebook.

    Returns:
        tuple: ``(features_scaled, labels, scaler)`` where ``features_scaled``
        is the standardized feature matrix, ``labels`` are the integer
        category codes of ``'Label'``, and ``scaler`` is the fitted
        ``StandardScaler``.

    Raises:
        KeyError: If any required column is absent from ``data_df``.
        ValueError: If ``'Label'`` contains missing values.

    Note:
        The scaler is fit on every row passed in. ``main`` splits into
        train/test only afterwards, so test-set statistics influence the
        scaling. Fit on the training split alone for an unbiased evaluation.
    """
    if feature_columns is None:
        feature_columns = [
            'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes',
            'Rx Packets', 'Rx Bytes', 'tcp.srcport', 'tcp.dstport',
            'ip.proto', 'frame.len'
        ]
    feature_columns = list(feature_columns)

    # Report every missing column at once instead of failing on the first.
    missing = [col for col in feature_columns + ['Label'] if col not in data_df.columns]
    if missing:
        raise KeyError(f"Input data is missing required columns: {missing}")

    labels = pd.Categorical(data_df['Label']).codes
    # pd.Categorical encodes missing values as -1, which would otherwise be
    # passed downstream as if it were a real class.
    if (labels < 0).any():
        raise ValueError("'Label' column contains missing values; drop or impute them first")

    features = data_df[feature_columns].values

    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, labels, scaler

In [5]:
def train_random_forest(X_train, X_test, y_train, y_test):
    """Tune a random forest with a randomized search and score it.

    Runs a 10-iteration, 3-fold ``RandomizedSearchCV`` over tree count,
    depth and split/leaf sizes, then evaluates the best estimator on the
    held-out split and prints a classification report.

    Returns:
        dict: ``model`` (best estimator), ``accuracy`` on the test split,
        ``feature_importance`` (DataFrame sorted by descending importance,
        with positional ``Feature_<i>`` names), ``predictions`` for
        ``X_test`` and the winning ``best_params``.
    """
    print("\nTraining Random Forest...")

    # Candidate values sampled by the randomized search.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        search_space,
        n_iter=10,
        cv=3,
        random_state=42,
    )
    search.fit(X_train, y_train)

    tuned_forest = search.best_estimator_
    test_predictions = tuned_forest.predict(X_test)

    print("\nRandom Forest Results:")
    print(classification_report(y_test, test_predictions))

    # Rank features by importance; names are positional placeholders.
    n_features = X_train.shape[1]
    importance_table = pd.DataFrame({
        'feature': [f'Feature_{i}' for i in range(n_features)],
        'importance': tuned_forest.feature_importances_
    })
    importance_table = importance_table.sort_values('importance', ascending=False)

    test_accuracy = accuracy_score(y_test, test_predictions)
    return {
        'model': tuned_forest,
        'accuracy': test_accuracy,
        'feature_importance': importance_table,
        'predictions': test_predictions,
        'best_params': search.best_params_
    }

In [6]:
def train_lightgbm(X_train, X_test, y_train, y_test):
    """Train a multiclass LightGBM booster and score it on the test split.

    The number of classes is taken from the distinct values in ``y_train``.
    The booster is trained for 100 rounds, then the most probable class is
    picked for every test row and a classification report is printed.

    Returns:
        dict: ``model`` (trained booster), ``accuracy`` on the test split
        and ``predictions`` (predicted class index per test row).
    """
    print("\nTraining LightGBM...")

    class_count = len(np.unique(y_train))

    booster_params = {
        'objective': 'multiclass',
        'num_class': class_count,
        'metric': 'multi_logloss',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5
    }

    booster = lgb.train(
        booster_params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=100,
    )

    # predict() yields one probability per class; keep the most likely one.
    class_probabilities = booster.predict(X_test)
    predicted_labels = np.argmax(class_probabilities, axis=1)

    print("\nLightGBM Results:")
    print(classification_report(y_test, predicted_labels))

    test_accuracy = accuracy_score(y_test, predicted_labels)
    return {
        'model': booster,
        'accuracy': test_accuracy,
        'predictions': predicted_labels
    }

In [7]:
def train_cnn(X_train, X_test, y_train, y_test, num_classes,
              num_epochs=10, batch_size=32, learning_rate=0.001,
              eval_batch_size=1024):
    """Train the 1-D CNN classifier and score it on the test split.

    Args:
        X_train, X_test: 2-D feature arrays (samples x features).
        y_train, y_test: Integer class labels for the two splits.
        num_classes: Number of output classes.
        num_epochs: Training epochs. Defaults to 10.
        batch_size: Mini-batch size used for training. Defaults to 32.
        learning_rate: Adam learning rate. Defaults to 0.001.
        eval_batch_size: Rows per forward pass during evaluation, so large
            test sets do not have to fit on the device at once.

    Returns:
        dict: ``model`` (trained ``CNN1D``), ``accuracy`` on the test split
        and ``predictions`` (predicted class index per test row).
    """
    print("\nTraining CNN...")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_loader = DataLoader(
        NetworkDataset(X_train, y_train), batch_size=batch_size, shuffle=True
    )

    # Move the model to its device before the optimizer captures its parameters.
    model = CNN1D(X_train.shape[1], num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for batch_features, batch_labels in train_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the mean.
            batch_count = batch_labels.size(0)
            running_loss += loss.item() * batch_count
            samples_seen += batch_count

        if (epoch + 1) % 2 == 0:
            # Report the epoch mean rather than the last mini-batch's loss,
            # which is noisy and not representative of the epoch.
            epoch_loss = running_loss / max(samples_seen, 1)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Evaluation, run in chunks to bound device memory.
    model.eval()
    test_features = torch.as_tensor(X_test, dtype=torch.float32)
    predicted_chunks = []
    with torch.no_grad():
        for start in range(0, len(test_features), eval_batch_size):
            chunk = test_features[start:start + eval_batch_size].to(device)
            predicted_chunks.append(model(chunk).argmax(dim=1).cpu())

    if predicted_chunks:
        y_pred = torch.cat(predicted_chunks).numpy()
    else:
        y_pred = torch.empty(0, dtype=torch.long).numpy()

    print("\nCNN Results:")
    print(classification_report(y_test, y_pred))

    return {
        'model': model,
        'accuracy': accuracy_score(y_test, y_pred),
        'predictions': y_pred
    }

In [8]:
def compare_models(results):
    """Print each model's accuracy and return the name of the best one.

    Args:
        results: Mapping of model name to a result dict that contains an
            ``'accuracy'`` entry.

    Returns:
        The name with the highest accuracy; on a tie, the first such name
        in iteration order.
    """
    print("\nModel Comparison:")
    print("=" * 50)

    # Collect every score before printing anything per model.
    accuracies = {}
    for name in results:
        accuracies[name] = results[name]['accuracy']

    for name in accuracies:
        score = accuracies[name]
        print(f"{name} Accuracy: {score:.4f}")

    winner = max(accuracies, key=accuracies.get)
    print(f"\nBest performing model: {winner} with accuracy: {accuracies[winner]:.4f}")

    return winner

In [9]:
def save_best_model(results, best_model_name, output_dir='./best_model'):
    """Persist the winning model and a small JSON summary.

    Args:
        results: Mapping of model name to a result dict with ``'model'``
            and ``'accuracy'`` entries.
        best_model_name: Key in ``results`` to save. Must be one of
            ``'Random Forest'``, ``'LightGBM'`` or ``'CNN'``.
        output_dir: Directory to write into; created if missing.
            Defaults to ``'./best_model'``.

    Raises:
        KeyError: If ``best_model_name`` is not a key of ``results``.
        ValueError: If ``best_model_name`` has no known serializer.
    """
    best_entry = results[best_model_name]
    model = best_entry['model']

    # Reject unknown names before touching disk. Otherwise model_info.json
    # would claim a saved model while no model file was written.
    supported_models = ('Random Forest', 'LightGBM', 'CNN')
    if best_model_name not in supported_models:
        raise ValueError(
            f"Don't know how to save model '{best_model_name}'; "
            f"expected one of {supported_models}"
        )

    os.makedirs(output_dir, exist_ok=True)

    if best_model_name == 'Random Forest':
        joblib.dump(model, os.path.join(output_dir, 'random_forest.joblib'))
    elif best_model_name == 'LightGBM':
        model.save_model(os.path.join(output_dir, 'lightgbm_model.txt'))
    else:  # 'CNN'
        # Only the weights are stored; loading needs the same CNN1D arguments.
        torch.save(model.state_dict(), os.path.join(output_dir, 'cnn_model.pt'))

    # Save model info
    model_info = {
        'best_model': best_model_name,
        # Cast so NumPy scalar types never break JSON serialization.
        'accuracy': float(best_entry['accuracy']),
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(os.path.join(output_dir, 'model_info.json'), 'w', encoding='utf-8') as f:
        json.dump(model_info, f, indent=4)

In [10]:
def main(data_path='./sample_data/train.csv'):
    """Run the full pipeline: load, preprocess, train, compare and save.

    Args:
        data_path: CSV file with the feature columns and a ``'Label'``
            column. Defaults to ``'./sample_data/train.csv'``.
    """
    print("Loading data...")
    data = pd.read_csv(data_path)

    # Preprocess data
    # NOTE(review): the scaler is fit here on all rows, before the split
    # below, so test-set statistics leak into the scaling.
    features, labels, scaler = preprocess_data(data)

    # Split data, keeping class proportions equal in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    num_classes = len(np.unique(labels))

    # Train all models
    results = {
        'Random Forest': train_random_forest(X_train, X_test, y_train, y_test),
        'LightGBM': train_lightgbm(X_train, X_test, y_train, y_test),
        'CNN': train_cnn(X_train, X_test, y_train, y_test, num_classes)
    }

    best_model = compare_models(results)

    save_best_model(results, best_model)

    # Every model was trained on standardized features, so the saved model
    # is unusable for inference without the fitted scaler. Keep it alongside.
    output_dir = './best_model'
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(scaler, os.path.join(output_dir, 'scaler.joblib'))

    print("\nBest model saved in './best_model' directory")

if __name__ == "__main__":
    main()

Loading data...

Training Random Forest...

Random Forest Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13608
           1       0.61      0.55      0.58      6804
           2       0.59      0.65      0.62      6804

    accuracy                           0.80     27216
   macro avg       0.74      0.73      0.73     27216
weighted avg       0.80      0.80      0.80     27216


Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 108864, number of used features: 8
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294

LightGBM Result