<a href="https://colab.research.google.com/github/chibuezedev/ddos-detector/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Uncomment on a fresh runtime (e.g. Colab) to install the ONNX export dependencies.
# %pip install -q onnx onnxruntime

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
import onnx
import torch.onnx
import json
import os
import shutil
from tqdm import tqdm
import gc

In [10]:
class NetworkTrafficDataset(Dataset):
    """Dataset that renders numeric feature rows as text for a BERT-style tokenizer.

    Each row of ``features`` is converted to a space-separated string (floats
    with two decimals, everything else via ``str``), tokenized, and returned
    together with its label.

    Args:
        features: Indexable collection of rows; ``features[i]`` must be iterable.
        labels: Indexable collection of integer class ids, one per row.
        tokenizer: Callable invoked as in ``__getitem__``; it must return a
            mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sample is padded / truncated to.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels, tokenizer, max_length=32):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length "
                f"(got {len(features)} and {len(labels)})"
            )
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.features)

    @staticmethod
    def _format_feature(value):
        """Render one feature value; any float type is fixed to two decimals."""
        # np.float32 / np.float16 do not subclass Python's float, so they must
        # be matched explicitly or they would be printed at full precision.
        if isinstance(value, (float, np.floating)):
            return f"{float(value):.2f}"
        return str(value)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        feature_str = " ".join(self._format_feature(x) for x in self.features[idx])

        # NOTE(review): with truncation=True, a row that tokenizes to more than
        # max_length tokens silently loses its trailing features - confirm that
        # the default of 32 is large enough for the feature set in use.
        encoding = self.tokenizer(
            feature_str,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output carries a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [11]:
def preprocess_data(data_df, max_samples=5000): # increase to 100k
    """Turn a raw traffic DataFrame into scaled features and encoded labels.

    Args:
        data_df: DataFrame containing the columns listed in
            ``essential_features`` below plus a ``'Label'`` column.
        max_samples: If the frame has more rows than this, a random sample of
            this size (fixed ``random_state=42``) is used instead.

    Returns:
        Tuple ``(features, labels, label_mapping)`` where ``features`` is the
        standardized feature array, ``labels`` the integer-encoded labels, and
        ``label_mapping`` a dict from class id to the original label value.

    Raises:
        KeyError: If a required column is missing from ``data_df``.
    """
    # add all features
    essential_features = [
        'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes',
        'Rx Packets', 'Rx Bytes', 'tcp.srcport', 'tcp.dstport',
        'ip.proto', 'frame.len'
    ]

    missing = [col for col in essential_features + ['Label']
               if col not in data_df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")

    if len(data_df) > max_samples:
        data_df = data_df.sample(n=max_samples, random_state=42)

    features = data_df[essential_features].values

    # NOTE(review): the scaler is fit on every row passed in (i.e. before any
    # split the caller performs) and is then discarded, so the identical
    # standardization cannot be re-applied at inference time.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(data_df['Label'])

    # .tolist() converts NumPy scalars (e.g. np.int64 for numeric labels) into
    # native Python values; json.dump cannot serialize the NumPy versions.
    label_mapping = dict(enumerate(label_encoder.classes_.tolist()))

    return features, labels, label_mapping

In [12]:
def train_model(train_loader, model, device, num_epochs=2, patience=2,
                accumulation_steps=4, lr=1e-4):
    """Fine-tune ``model`` with AdamW, gradient accumulation and early stopping.

    Args:
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        model: Module called with those three keyword arguments; its output
            must expose a ``.loss`` tensor.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of passes over ``train_loader``.
        patience: Number of non-improving epochs (by average training loss)
            tolerated before training stops early.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer step.
        lr: Learning rate passed to the optimizer.

    Returns:
        List with the average training loss of every completed epoch.

    Raises:
        ValueError: If ``accumulation_steps`` is below 1 or ``train_loader``
            yields no batches.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    # torch.optim.AdamW replaces the deprecated transformers.AdamW (the
    # file-level import of that name is no longer used here). eps and
    # weight_decay are set to mirror the deprecated optimizer's defaults.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
    )
    best_loss = float('inf')
    patience_counter = 0
    epoch_losses = []

    optimizer.zero_grad()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        batch_count = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Scale so the accumulated gradient is an average over the group.
            loss = outputs.loss / accumulation_steps
            loss.backward()

            total_loss += loss.item() * accumulation_steps
            batch_count += 1

            if batch_count % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # memory mgt
            del outputs, loss, input_ids, attention_mask, labels
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            else:
                gc.collect()

            if batch_count % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_count}, Loss: {total_loss/batch_count:.4f}")

        if batch_count == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")

        # Flush a trailing partial accumulation group. Without this its
        # gradients would leak into the next epoch's first step, or be
        # dropped entirely after the final epoch.
        if batch_count % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = total_loss / batch_count
        epoch_losses.append(avg_loss)

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                break

    return epoch_losses

In [13]:
def export_model(model, tokenizer, label_mapping, output_path, max_length=32):
    """
    Export the model and all necessary files for Node.js server consumption.

    Writes ``config.json``, the tokenizer files, a PyTorch ``state_dict``
    (``model.pt``) and, when possible, an ONNX graph (``model.onnx``) into
    ``output_path``, then zips that directory to ``<output_path>.zip``.

    Args:
        model: Trained module; called positionally as
            ``model(input_ids, attention_mask)`` during ONNX export.
        tokenizer: Tokenizer providing ``vocab_size``, ``pad_token_id``,
            ``save_pretrained`` and the call interface used below.
        label_mapping: Dict from class id to label; NumPy scalar keys/values
            are converted to native Python types before being written.
        output_path: Directory to create and fill.
        max_length: Sequence length recorded in the config and used for the
            ONNX dummy input.

    Returns:
        Dict with the keys ``model_dir``, ``zip_path``, ``onnx_path``
        (``None`` if the ONNX export or its validation failed),
        ``torch_path``, ``config_path`` and ``tokenizer_path``.
    """
    def _to_native(value):
        # json.dump rejects NumPy scalars such as np.int64 class labels.
        return value.item() if isinstance(value, np.generic) else value

    os.makedirs(output_path, exist_ok=True)

    # model configuration
    config = {
        'max_length': max_length,
        'num_labels': len(label_mapping),
        'model_type': 'bert',
        'vocab_size': tokenizer.vocab_size,
        'pad_token_id': tokenizer.pad_token_id,
        'version': '1.0',
        'label_mapping': {
            _to_native(key): _to_native(label)
            for key, label in label_mapping.items()
        }
    }

    config_path = os.path.join(output_path, 'config.json')
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)

    # tokenizer files
    tokenizer_path = os.path.join(output_path, 'tokenizer')
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)

    # PyTorch model. This is the fallback artifact, so a failure here is
    # allowed to propagate instead of being reported as an ONNX problem.
    torch_path = os.path.join(output_path, 'model.pt')
    torch.save(model.state_dict(), torch_path)

    onnx_path = os.path.join(output_path, 'model.onnx')
    try:
        dummy_input = tokenizer(
            "dummy input",
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Trace on whatever device the model lives on; CPU dummy tensors
        # would otherwise make the export fail for a GPU-resident model.
        first_param = next(model.parameters(), None)
        export_device = first_param.device if first_param is not None else torch.device('cpu')
        input_ids = dummy_input['input_ids'].to(export_device)
        attention_mask = dummy_input['attention_mask'].to(export_device)

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        model.eval()
        with torch.no_grad():
            torch.onnx.export(
                model,
                (input_ids, attention_mask),
                onnx_path,
                input_names=['input_ids', 'attention_mask'],
                output_names=['logits'],
                # Only the batch axis is dynamic; sequence length stays max_length.
                dynamic_axes={
                    'input_ids': {0: 'batch_size'},
                    'attention_mask': {0: 'batch_size'},
                    'logits': {0: 'batch_size'}
                },
                opset_version=14
            )

        # Verify ONNX model for nodejs server
        onnx_model = onnx.load(onnx_path)
        onnx.checker.check_model(onnx_model)

    except Exception as e:
        print(f"ONNX export failed: {str(e)}")
        print("Falling back to PyTorch model export only")
        onnx_path = None

    shutil.make_archive(output_path, 'zip', output_path)

    return {
        'model_dir': output_path,
        'zip_path': f"{output_path}.zip",
        'onnx_path': onnx_path,
        'torch_path': torch_path,
        'config_path': config_path,
        'tokenizer_path': tokenizer_path
    }

In [6]:

def main():
    """Run the full pipeline: load CSV, preprocess, fine-tune BERT, export.

    Reads ``./sample_data/train.csv``, trains on a 90% split on CPU and writes
    the exported artifacts under ``./ddos_model/output_model``.
    """
    # The cuDNN flags below only make sense together with fixed seeds: the
    # classifier head is freshly initialized and the DataLoader shuffles.
    torch.manual_seed(42)
    np.random.seed(42)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print("Loading data...")
    data = pd.read_csv('./sample_data/train.csv', nrows=5000) #increase to 10k

    features, labels, label_mapping = preprocess_data(data)

    # NOTE(review): the held-out 10% (X_test / y_test) is never evaluated;
    # only the training loss is reported.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels
    )

    # Initialize BERT model
    print("Initializing BERT model...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_mapping),
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1
    )

    train_dataset = NetworkTrafficDataset(X_train, y_train, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=0
    )

    device = torch.device('cpu')
    model.to(device)

    print("Starting training...")
    train_model(train_loader, model, device)

    print("Exporting model...")
    export_paths = None
    try:
        output_path = os.path.join('./ddos_model', 'output_model')
        export_paths = export_model(model, tokenizer, label_mapping, output_path)
        print("\nModel export successful! Files location:")
        print(f"1. Model directory: {export_paths['model_dir']}")
        print(f"2. ZIP archive: {export_paths['zip_path']}")
    except Exception as e:
        print(f"Error during model export: {str(e)}")

    # Report success only when the export actually produced its artifacts.
    if export_paths is not None:
        print("Training and export completed!")
    else:
        print("Training completed, but model export failed.")

if __name__ == "__main__":
    main()

Loading data...
Initializing BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch 1, Batch 10, Loss: 1.0702
Epoch 1, Batch 20, Loss: 0.9106
Epoch 1, Batch 30, Loss: 0.7821
Epoch 1, Batch 40, Loss: 0.7013
Epoch 1, Batch 50, Loss: 0.6416
Epoch 1, Batch 60, Loss: 0.5946
Epoch 1, Batch 70, Loss: 0.5627
Epoch 1, Batch 80, Loss: 0.5365
Epoch 1, Batch 90, Loss: 0.5241
Epoch 1, Batch 100, Loss: 0.5158
Epoch 1, Batch 110, Loss: 0.5011
Epoch 1, Batch 120, Loss: 0.4912
Epoch 1, Batch 130, Loss: 0.4830
Epoch 1, Batch 140, Loss: 0.4767
Epoch 1, Batch 150, Loss: 0.4681
Epoch 1, Batch 160, Loss: 0.4603
Epoch 1, Batch 170, Loss: 0.4532
Epoch 1, Batch 180, Loss: 0.4485
Epoch 1, Batch 190, Loss: 0.4458
Epoch 1, Batch 200, Loss: 0.4779
Epoch 1, Batch 210, Loss: 0.4747
Epoch 1, Batch 220, Loss: 0.4724
Epoch 1, Batch 230, Loss: 0.4653
Epoch 1, Batch 240, Loss: 0.4606
Epoch 1, Batch 250, Loss: 0.4585
Epoch 1, Batch 260, Loss: 0.4549
Epoch 1, Batch 270, Loss: 0.4501
Epoch 1, Batch 280, Loss: 0.4461
Epoch 2, Batch 10, Loss: 0.3605
Epoch 2, Batch 20, Loss: 0.3632
