In [67]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

# Data

In [68]:
# data_path = "/kaggle/input/traffic/data.csv"
data_path = "data.csv"

df = pd.read_csv(data_path)


In [69]:
df = df[df['Dst Port'] != 'Dst Port']

df['Dst Port'] = df['Dst Port'].astype(int)
df['Protocol'] = df['Protocol'].astype(int)
df['Flow Duration'] = df['Flow Duration'].astype(int)
df['Tot Fwd Pkts'] = df['Tot Fwd Pkts'].astype(int)
df['Tot Bwd Pkts'] = df['Tot Bwd Pkts'].astype(int)
df['TotLen Fwd Pkts'] = df['TotLen Fwd Pkts'].astype(int)
df['TotLen Bwd Pkts'] = df['TotLen Bwd Pkts'].astype(int)
df['Fwd Pkt Len Max'] = df['Fwd Pkt Len Max'].astype(int)
df['Fwd Pkt Len Min'] = df['Fwd Pkt Len Min'].astype(int)
df['Fwd Pkt Len Mean'] = df['Fwd Pkt Len Mean'].astype(float)
df['Fwd Pkt Len Std'] = df['Fwd Pkt Len Std'].astype(float)
df['Bwd Pkt Len Max'] = df['Bwd Pkt Len Max'].astype(int)
df['Bwd Pkt Len Min'] = df['Bwd Pkt Len Min'].astype(int)
df['Bwd Pkt Len Mean'] = df['Bwd Pkt Len Mean'].astype(float)
df['Bwd Pkt Len Std'] = df['Bwd Pkt Len Std'].astype(float)
df['Flow Byts/s'] = df['Flow Byts/s'].astype(float)
df['Flow Pkts/s'] = df['Flow Pkts/s'].astype(float)
df['Flow IAT Mean'] = df['Flow IAT Mean'].astype(float)
df['Flow IAT Std'] = df['Flow IAT Std'].astype(float)
df['Flow IAT Max'] = df['Flow IAT Max'].astype(int)
df['Flow IAT Min'] = df['Flow IAT Min'].astype(int)
df['Fwd IAT Tot'] = df['Fwd IAT Tot'].astype(int)
df['Fwd IAT Mean'] = df['Fwd IAT Mean'].astype(float)
df['Fwd IAT Std'] = df['Fwd IAT Std'].astype(float)
df['Fwd IAT Max'] = df['Fwd IAT Max'].astype(int)
df['Fwd IAT Min'] = df['Fwd IAT Min'].astype(int)
df['Bwd IAT Tot'] = df['Bwd IAT Tot'].astype(int)
df['Bwd IAT Mean'] = df['Bwd IAT Mean'].astype(float)
df['Bwd IAT Std'] = df['Bwd IAT Std'].astype(float)
df['Bwd IAT Max'] = df['Bwd IAT Max'].astype(int)
df['Bwd IAT Min'] = df['Bwd IAT Min'].astype(int)
df['Fwd PSH Flags'] = df['Fwd PSH Flags'].astype(int)
df['Bwd PSH Flags'] = df['Bwd PSH Flags'].astype(int)
df['Fwd URG Flags'] = df['Fwd URG Flags'].astype(int)
df['Bwd URG Flags'] = df['Bwd URG Flags'].astype(int)
df['Fwd Header Len'] = df['Fwd Header Len'].astype(int)
df['Bwd Header Len'] = df['Bwd Header Len'].astype(int)
df['Fwd Pkts/s'] = df['Fwd Pkts/s'].astype(float)
df['Bwd Pkts/s'] = df['Bwd Pkts/s'].astype(float)
df['Pkt Len Min'] = df['Pkt Len Min'].astype(int)
df['Pkt Len Max'] = df['Pkt Len Max'].astype(int)
df['Pkt Len Mean'] = df['Pkt Len Mean'].astype(float)
df['Pkt Len Std'] = df['Pkt Len Std'].astype(float)
df['Pkt Len Var'] = df['Pkt Len Var'].astype(float)
df['FIN Flag Cnt'] = df['FIN Flag Cnt'].astype(int)
df['SYN Flag Cnt'] = df['SYN Flag Cnt'].astype(int)
df['RST Flag Cnt'] = df['RST Flag Cnt'].astype(int)
df['PSH Flag Cnt'] = df['PSH Flag Cnt'].astype(int)
df['ACK Flag Cnt'] = df['ACK Flag Cnt'].astype(int)
df['URG Flag Cnt'] = df['URG Flag Cnt'].astype(int)
df['CWE Flag Count'] = df['CWE Flag Count'].astype(int)
df['ECE Flag Cnt'] = df['ECE Flag Cnt'].astype(int)
df['Down/Up Ratio'] = df['Down/Up Ratio'].astype(int)
df['Pkt Size Avg'] = df['Pkt Size Avg'].astype(float)
df['Fwd Seg Size Avg'] = df['Fwd Seg Size Avg'].astype(float)
df['Bwd Seg Size Avg'] = df['Bwd Seg Size Avg'].astype(float)
df['Fwd Byts/b Avg'] = df['Fwd Byts/b Avg'].astype(int)
df['Fwd Pkts/b Avg'] = df['Fwd Pkts/b Avg'].astype(int)
df['Fwd Blk Rate Avg'] = df['Fwd Blk Rate Avg'].astype(int)
df['Bwd Byts/b Avg'] = df['Bwd Byts/b Avg'].astype(int)
df['Bwd Pkts/b Avg'] = df['Bwd Pkts/b Avg'].astype(int)
df['Bwd Blk Rate Avg'] = df['Bwd Blk Rate Avg'].astype(int)
df['Subflow Fwd Pkts'] = df['Subflow Fwd Pkts'].astype(int)
df['Subflow Fwd Byts'] = df['Subflow Fwd Byts'].astype(int)
df['Subflow Bwd Pkts'] = df['Subflow Bwd Pkts'].astype(int)
df['Subflow Bwd Byts'] = df['Subflow Bwd Byts'].astype(int)
df['Init Fwd Win Byts'] = df['Init Fwd Win Byts'].astype(int)
df['Init Bwd Win Byts'] = df['Init Bwd Win Byts'].astype(int)
df['Fwd Act Data Pkts'] = df['Fwd Act Data Pkts'].astype(int)
df['Fwd Seg Size Min'] = df['Fwd Seg Size Min'].astype(int)
df['Active Mean'] = df['Active Mean'].astype(float)
df['Active Std'] = df['Active Std'].astype(float)
df['Active Max'] = df['Active Max'].astype(int)
df['Active Min'] = df['Active Min'].astype(int)
df['Idle Mean'] = df['Idle Mean'].astype(float)
df['Idle Std'] = df['Idle Std'].astype(float)
df['Idle Max'] = df['Idle Max'].astype(int)
df['Idle Min'] = df['Idle Min'].astype(int)

In [70]:
df.drop(["Timestamp"], axis=1, inplace=True)
df.drop(["Dst Port"], axis=1, inplace=True)

df["Flow Byts/s"] = df["Flow Byts/s"].replace(np.inf, np.nan)
df["Flow Pkts/s"] = df["Flow Pkts/s"].replace(np.inf, np.nan)

df["Flow Pkts/s"] = df["Flow Pkts/s"].replace(np.nan, df["Flow Pkts/s"].max())

In [71]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

df.dropna(inplace=True)


## Label encode

In [72]:
df['Label'].values

array(['Benign', 'Benign', 'Benign', ..., 'SSH-Bruteforce',
       'SSH-Bruteforce', 'SSH-Bruteforce'], dtype=object)

In [73]:
vec_len = len(df['Label'].unique())

In [74]:
ohe = OneHotEncoder(sparse_output=False)
labels_onehot = ohe.fit_transform(df['Label'].values.reshape(-1, 1))

onehot_df = pd.DataFrame(labels_onehot, columns=ohe.categories_[0], index=df.index)

df = pd.concat([df.drop(columns=['Label']), onehot_df], axis=1)

In [75]:
df.head()

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,DDOS attack-HOIC,DDOS attack-LOIC-UDP,DoS attacks-GoldenEye,DoS attacks-Hulk,DoS attacks-SlowHTTPTest,DoS attacks-Slowloris,FTP-BruteForce,Infilteration,SQL Injection,SSH-Bruteforce
0,6,165,2,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,5727100,4,4,97,231,97,0,24.25,48.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,4053796,14,8,1456,1731,741,0,104.0,195.013609,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,833502,7,5,364,582,103,0,52.0,49.217206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,601,1,1,53,85,53,53,53.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# print(labels_onehot)

## Drop low corr with Label

In [77]:
corr_matrix = df.corr(numeric_only=True)
corr_pairs = corr_matrix.unstack()
threshold = 0.9
strong_corr = corr_pairs[
    (abs(corr_pairs) > threshold)
    & (corr_pairs != 1)
    & (corr_pairs.index.get_level_values(0) != corr_pairs.index.get_level_values(1))
]
strong_corr = strong_corr.sort_values(ascending=False)
pd.set_option("display.max_rows", None)

In [78]:
cols_to_drop = set()

for col1, col2 in strong_corr.index:
    if col1 not in cols_to_drop:
        cols_to_drop.add(col2)

# remove 'DDOS attack-LOIC-UDP' from the list of columns to drop

In [79]:
df_final = df.drop(columns=cols_to_drop, axis=1)

In [80]:
df_final.to_csv("model/train_data.csv", index=False)
df_final.shape

(9986, 65)

## Train test split

In [81]:
X = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1:]

print(X.columns)
print(y.columns)
print(X.shape)
print(y.shape)

Index(['Protocol', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow IAT Std', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Bwd IAT Tot', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Mean',
       'Pkt Len Std', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count',
       'ECE Flag Cnt', 'Down/Up Ratio', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
       'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
       'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Init Fwd Win Byts',
       'Init Bwd Win Byts', 'Fwd Seg Size Min', 'Active Std', 'Active Max',
       'Active Min', 'Idle Std', 'Idle Max', 'Benign', 'Bot',
       'Bru

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = y_train.to_numpy()  # Convert to numpy
y_test = y_test.to_numpy()    # Convert to numpy

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)

torch.Size([7988, 64])
torch.Size([7988, 1])


# Model

In [83]:
class DNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.output = nn.Linear(16, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.softmax(self.output(x), dim=1)
        return x

In [84]:
input_dim = X.shape[1]
output_dim = y.shape[1]
model = DNN(input_dim, output_dim)
print(model)

DNN(
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (output): Linear(in_features=16, out_features=1, bias=True)
)


In [85]:
loss_fn = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [86]:
epochs = 50
batch_size = 32

train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)



for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    
    for batch_X, batch_y in train_loader:
        # print("batch_y", batch_y)
        optimizer.zero_grad()
        predictions = model(batch_X)
        print("predictions", predictions.shape)
        loss = loss_fn(predictions, batch_y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
        # print(predictions)
        # print(batch_y)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")


predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predictions torch.Size([32, 1])
predicti

In [87]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    print(test_predictions)
    test_predictions = (test_predictions >= 0.5).float()

    correct_per_sample = test_predictions.eq(y_test_tensor).sum(dim=1) 
    total_labels_per_sample = y_test_tensor.size(1)
    accuracy_per_sample = correct_per_sample / total_labels_per_sample
    accuracy = accuracy_per_sample.mean().item()
    
    # print(f"Test Accuracy: {accuracy:.4f}")
    
# print(probabilities)
class_idx = torch.argmax(test_predictions, dim=1)
print(class_idx)

unique_classes, counts = torch.unique(class_idx, return_counts=True)
value_counts = {int(cls): int(count) for cls, count in zip(unique_classes, counts)}
print(value_counts)



tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])
tensor([0, 0, 0,  ..., 0, 0, 0])
{0: 1998}


In [88]:
class_labels = [
    'Benign', 'Bot', 'Brute Force -Web', 'Brute Force -XSS',
    'DDOS attack-HOIC', 'DoS attacks-GoldenEye', 'DoS attacks-Hulk',
    'DoS attacks-SlowHTTPTest', 'DoS attacks-Slowloris', 'FTP-BruteForce',
    'Infilteration', 'SQL Injection', 'SSH-Bruteforce'
]

# Create mapper
mapper = {index + 1: label.lower() for index, label in enumerate(class_labels)}
print(mapper)


{1: 'benign', 2: 'bot', 3: 'brute force -web', 4: 'brute force -xss', 5: 'ddos attack-hoic', 6: 'dos attacks-goldeneye', 7: 'dos attacks-hulk', 8: 'dos attacks-slowhttptest', 9: 'dos attacks-slowloris', 10: 'ftp-bruteforce', 11: 'infilteration', 12: 'sql injection', 13: 'ssh-bruteforce'}


In [89]:
class_name = [mapper[index.item()+1] for index in class_idx]
print(class_name)

['benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign', 'benign',