In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nf-ton-iotv1/NetFlow_v1_Features.csv
/kaggle/input/nf-ton-iotv1/NF-ToN-IoT.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from lightgbm import LGBMClassifier

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('/kaggle/input/nf-ton-iotv1/NF-ToN-IoT.csv')

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Drop unnamed index columns if present
df.drop(columns=[col for col in df.columns if 'unnamed' in col.lower()], inplace=True)

# Drop completely null columns
df.dropna(axis=1, how='all', inplace=True)

# Drop rows with missing values (can replace with imputation if needed)
df.dropna(inplace=True)

# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

In [5]:
# Encode attack types (multiclass) from 'attack' column
le = LabelEncoder()
df['encoded_attack'] = le.fit_transform(df['Attack'])
attack_class_names = le.classes_
print("Attack classes:", attack_class_names)

# Drop non-feature columns 
non_features = ['Label', 'Flow_ID', 'Source_IP', 'Destination_IP', 'Timestamp', 'attack', 'encoded_attack']
features = df.drop(columns=non_features, errors='ignore')

# Select numeric features only
features = features.select_dtypes(include=[np.number])

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Multiclass target for attack classification
y_multiclass = df['encoded_attack'].values

# Deduplicate dataset based on features and target
X_unique, unique_indices = np.unique(X, axis=0, return_index=True)
y_unique = y_multiclass[unique_indices]

print(f"Removed {len(X) - len(X_unique)} duplicate samples from original data")

# Stratified train-test split on deduplicated data with multiclass labels
X_train, X_test, y_train, y_test = train_test_split(
    X_unique, y_unique,
    test_size=0.2,
    random_state=42,
    stratify=y_unique
)

# Verify class distributions after split
print("\nAfter deduplication and stratified split:")
print("Train class distribution:", np.bincount(y_train))
print("Test class distribution:", np.bincount(y_test))

# Final leakage check: ensure no overlapping samples between train and test
train_tuples = {tuple(row) for row in X_train}
test_tuples = {tuple(row) for row in X_test}
common_samples = train_tuples.intersection(test_tuples)
print(f"\nNumber of overlapping samples after cleaning: {len(common_samples)}")
assert len(common_samples) == 0, "Data leakage detected!"

Attack classes: ['Benign' 'backdoor' 'ddos' 'dos' 'injection' 'mitm' 'password'
 'ransomware' 'scanning' 'xss']
Removed 576759 duplicate samples from original data

After deduplication and stratified split:
Train class distribution: [158759  13775  84916  12016 356509    635      0    114  15288]
Test class distribution: [39690  3444 21229  3004 89127   159     0    28  3822]

Number of overlapping samples after cleaning: 0


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Dataset with Gaussian noise as light augmentation
class TabularDataset(Dataset):
    def __init__(self, X):
        self.X = X.astype(np.float32)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        x = self.X[idx]
        # Light augmentation: small Gaussian noise
        noise_std = 0.01
        x1 = x + noise_std * np.random.randn(*x.shape).astype(np.float32)
        x2 = x + noise_std * np.random.randn(*x.shape).astype(np.float32)
        return x1, x2

# Encoder with normalization and dropout to prevent feature domination
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),  # Prevent overfitting + domination
            nn.Linear(128, embedding_dim),
            nn.LayerNorm(embedding_dim)  # Normalize embeddings
        )
    
    def forward(self, x):
        return F.normalize(self.net(x), dim=1)  # L2 normalize embeddings

# NT-Xent loss
def nt_xent_loss(z1, z2, temperature=0.5):
    batch_size = z1.size(0)
    z = torch.cat([z1, z2], dim=0)
    sim = torch.matmul(z, z.T) / temperature
    sim_max, _ = sim.max(dim=1, keepdim=True)
    sim = sim - sim_max.detach()  # stability

    mask = torch.eye(2 * batch_size, dtype=torch.bool).to(z.device)
    labels = torch.cat([torch.arange(batch_size) for _ in range(2)], dim=0).to(z.device)
    labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
    
    sim = sim[~mask].view(2 * batch_size, -1)
    labels = labels[~mask].view(2 * batch_size, -1)
    
    positives = sim[labels.bool()].view(2 * batch_size, -1)
    negatives = sim[~labels.bool()].view(2 * batch_size, -1)
    
    logits = torch.cat([positives, negatives], dim=1)
    targets = torch.zeros(logits.shape[0], dtype=torch.long).to(z.device)
    
    return F.cross_entropy(logits, targets)

# Training loop
def train_contrastive(model, dataloader, optimizer, epochs=10, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x1, x2 in dataloader:
            x1, x2 = x1.to(device), x2.to(device)
            z1 = model(x1)
            z2 = model(x2)
            loss = nt_xent_loss(z1, z2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} - Loss: {total_loss/len(dataloader):.4f}")

In [7]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Use cleaned and deduplicated feature dimensions
input_dim = X_train.shape[1]
embedding_dim = 64  # You can experiment with 32, 128 etc.

# Create contrastive training dataset with light augmentations
train_dataset = TabularDataset(X_train)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, drop_last=True)

# Initialize encoder model and optimizer
model = Encoder(input_dim=input_dim, embedding_dim=embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train contrastive encoder
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_contrastive(model, train_loader, optimizer, epochs=20, device=device)


Epoch 1 - Loss: 4.5593
Epoch 2 - Loss: 4.4825
Epoch 3 - Loss: 4.4661
Epoch 4 - Loss: 4.4542
Epoch 5 - Loss: 4.4495
Epoch 6 - Loss: 4.4451
Epoch 7 - Loss: 4.4434
Epoch 8 - Loss: 4.4425
Epoch 9 - Loss: 4.4411
Epoch 10 - Loss: 4.4396
Epoch 11 - Loss: 4.4390
Epoch 12 - Loss: 4.4387
Epoch 13 - Loss: 4.4383
Epoch 14 - Loss: 4.4375
Epoch 15 - Loss: 4.4371
Epoch 16 - Loss: 4.4366
Epoch 17 - Loss: 4.4362
Epoch 18 - Loss: 4.4357
Epoch 19 - Loss: 4.4358
Epoch 20 - Loss: 4.4354


In [8]:
from torch.utils.data import DataLoader, TensorDataset

def get_embeddings(model, X, device='cpu', batch_size=1024):
    model.eval()
    embeddings = []

    dataset = TensorDataset(torch.tensor(X.astype(np.float32)))
    loader = DataLoader(dataset, batch_size=batch_size)

    with torch.no_grad():
        for (batch,) in loader:
            batch = batch.to(device)
            emb = model(batch).cpu().numpy()
            embeddings.append(emb)

    return np.vstack(embeddings)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Generate embeddings
train_embeddings = get_embeddings(model, X_train, device)
test_embeddings = get_embeddings(model, X_test, device)


In [9]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# class_names from your label encoder (10 classes originally)
class_names = le.classes_

# Get unique classes actually present in y_train/y_test (after deduplication)
unique_classes = np.unique(np.concatenate([y_train, y_test]))
print(f"Classes present after deduplication: {unique_classes}")
print(f"Corresponding class names: {class_names[unique_classes]}")

# Initialize LightGBM for multiclass classification
lgb_clf = LGBMClassifier(
    objective='multiclass',
    num_class=len(unique_classes),
    n_estimators=1000,
    boosting_type='gbdt',
    random_state=42,
    verbosity=-1
)

# Train with early stopping
lgb_clf.fit(
    train_embeddings, y_train,
    eval_set=[(test_embeddings, y_test)],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

# Predict on test set
y_pred = lgb_clf.predict(test_embeddings)

# Evaluation - specify labels and target_names matching classes present
print("Classification Report:\n",
      classification_report(y_test, y_pred,
                            labels=unique_classes,
                            target_names=class_names[unique_classes]))
print("Accuracy:", accuracy_score(y_test, y_pred))


Classes present after deduplication: [0 1 2 3 4 5 7 8]
Corresponding class names: ['Benign' 'backdoor' 'ddos' 'dos' 'injection' 'mitm' 'ransomware'
 'scanning']
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	valid_0's multi_logloss: 0.324292
Classification Report:
               precision    recall  f1-score   support

      Benign       0.94      0.96      0.95     39690
    backdoor       0.98      0.99      0.98      3444
        ddos       0.88      0.79      0.83     21229
         dos       0.41      0.68      0.51      3004
   injection       0.93      0.96      0.95     89127
        mitm       0.17      0.17      0.17       159
  ransomware       0.00      0.00      0.00        28
    scanning       0.63      0.14      0.23      3822

    accuracy                           0.91    160503
   macro avg       0.62      0.59      0.58    160503
weighted avg       0.91      0.91      0.91    160503

Accuracy: 0.911727506650966
