In [66]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from torch import optim
import numpy as np
import pandas as pd
import os
import io
import re
import sys
import time
%matplotlib inline

In [67]:
IDS_df = pd.read_csv("data/ids_small.csv")

# To display the top 5 rows
# IDS_df.head(5)

In [68]:
# print shape before dropping NaN rows
print(IDS_df.shape)

(18000, 80)


In [69]:
# Finding the null values.
print(IDS_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

118


In [70]:
#  first replace infs to NaN:
IDS_df = IDS_df.replace([np.inf, -np.inf], np.nan)

In [71]:
# print shape after dropping NaN rows
IDS_df = IDS_df.dropna()
print(IDS_df.shape)
IDS_df = IDS_df.reset_index(drop=True)

(17941, 80)


In [72]:
# Finding the null values.
print(IDS_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

0


Examine the proportion of types of traffic:

In [73]:
IDS_df['Label'].value_counts()

Benign                      7963
Infilteration               1978
DoS attacks-Hulk            1000
Bot                         1000
DoS attacks-Slowloris       1000
SSH-Bruteforce              1000
FTP-BruteForce              1000
DoS attacks-GoldenEye       1000
DDOS attack-HOIC            1000
DoS attacks-SlowHTTPTest    1000
Name: Label, dtype: int64

Convert all non-normal observations into a single class:

In [74]:
def get_label(text):
    """Binarize target labels into normal or anomalous."""
    if text == "Benign":
        return 0
    elif text == 'Infilteration':
        return 1
    elif text == 'DoS attacks-Slowloris':
        return 2
    elif text == 'SSH-Bruteforce':
        return 3
    elif text == 'DDOS attack-HOIC':
        return 4
    elif text == 'FTP-BruteForce':
        return 5
    elif text == 'DoS attacks-SlowHTTPTest':
        return 6
    elif text == 'Bot':
        return 7
    elif text == 'DoS attacks-Hulk':
        return 8
    elif text == 'DoS attacks-GoldenEye':
        return 9

IDS_df["Label"] = IDS_df["Label"].apply(get_label)

In [75]:
y = IDS_df["Label"].values
print(IDS_df["Label"].value_counts())

0    7963
1    1978
9    1000
8    1000
7    1000
6    1000
5    1000
4    1000
3    1000
2    1000
Name: Label, dtype: int64


Convert all categorical features into numerical form:

In [76]:
from sklearn.preprocessing import LabelEncoder

encodings_dictionary = dict()
for c in IDS_df.columns:
    if IDS_df[c].dtype == "object":
        encodings_dictionary[c] = LabelEncoder()
        IDS_df[c] = encodings_dictionary[c].fit_transform(IDS_df[c])

Split the dataset into normal and abnormal observations:

In [77]:
IDS_df_normal = IDS_df[IDS_df["Label"] == 0].sample(1000)  # Taking only 1000 samples to balance. Change this as required.
IDS_df_abnormal = IDS_df[IDS_df["Label"] != 0]
y_normal = IDS_df_normal.pop("Label").values
X_normal = IDS_df_normal.values
y_anomaly = IDS_df_abnormal.pop("Label").values
X_anomaly = IDS_df_abnormal.values

Train-test split the dataset:

In [78]:
from sklearn.model_selection import train_test_split

X_normal_train, X_normal_test, y_normal_train, y_normal_test = train_test_split(
    X_normal, y_normal, test_size=0.2, random_state=11
)

X_anomaly_train, X_anomaly_test, y_anomaly_train, y_anomaly_test = train_test_split(
    X_anomaly, y_anomaly, test_size=0.2, random_state=11
)

import numpy as np

X_train = np.concatenate((X_normal_train, X_anomaly_train))
y_train = np.concatenate((y_normal_train, y_anomaly_train))
X_test = np.concatenate((X_normal_test, X_anomaly_test))
y_test = np.concatenate((y_normal_test, y_anomaly_test))

# X_train = np.concatenate((X_normal_train[:10000], X_anomaly_train[:10000]))
# y_train = np.concatenate((y_normal_train[:10000], y_anomaly_train[:10000]))
# X_test = np.concatenate((X_normal_test[:1000], X_anomaly_test[:1000]))
# y_test = np.concatenate((y_normal_test[:1000], y_anomaly_test[:1000]))

In [79]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8782, 79)
(2196, 79)
(8782,)
(2196,)


In [80]:
batch_size = 100

# Pytorch
X_train  = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train)

X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test)

# Pytorch train and test sets
train = torch.utils.data.TensorDataset(X_train, y_train)
valid = torch.utils.data.TensorDataset(X_test, y_test)

# data loader
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
valid_loader = torch.utils.data.DataLoader(valid, batch_size = batch_size, shuffle = False)

print('Completed loading data')

Completed loading data


In [81]:
train_loader.dataset.tensors[0].shape

torch.Size([8782, 79])

In [86]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
        
# Defining the DNN model
input_size = train_loader.dataset.tensors[0].shape[1]
hidden_layers = [512, 512, 512]
output_size = 10

# model definition
class MLP(nn.Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(n_inputs, hidden_layers[0])
        self.activ1 = nn.ReLU()
        
        self.layer2 = nn.Linear(hidden_layers[0], hidden_layers[1])
        self.activ2 = nn.ReLU()
        
        self.layer3 = nn.Linear(hidden_layers[1], hidden_layers[2])
        self.activ3 = nn.ReLU()
        
        self.layer4 = nn.Linear(hidden_layers[2], output_size)
        
    # forward propagate input
    def forward(self, x):
        x = self.layer1(x)
        x = self.activ1(x)
        
        x = self.layer2(x)
        x = self.activ2(x)
        
        x = self.layer3(x)
        x = self.activ3(x)
        
        x = self.layer4(x)
        return x

model = MLP(input_size)
print(model)
model.to(device)

 # Cross Entropy Loss 
error = nn.CrossEntropyLoss().to(device)
# SGD Optimizer
learning_rate = 0.001
# TODO: Try SGD
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

MLP(
  (layer1): Linear(in_features=79, out_features=512, bias=True)
  (activ1): ReLU()
  (layer2): Linear(in_features=512, out_features=512, bias=True)
  (activ2): ReLU()
  (layer3): Linear(in_features=512, out_features=512, bias=True)
  (activ3): ReLU()
  (layer4): Linear(in_features=512, out_features=10, bias=True)
)


In [87]:
print('Start training...')
start_time = time.time()
    
epochs = 500
for e in range(epochs):
    count = 0
    loss_list = []
    iteration_list = []
    accuracy_list = []
   
    for i, (data, labels) in enumerate(train_loader):
        train = data.to(device)
        #print(labels)
        labels = labels.to(device)

        # Clear gradients
        optimizer.zero_grad()
        # Forward propagation
        outputs = model(train)
        
        # Calculate softmax and cross entropy loss
        loss = error(outputs, labels)
        # Calculating gradients
        loss.backward()
        # Update parameters
        optimizer.step()

        if count % 100 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for data, labels in valid_loader:
                valid = data.to(device)
                #print('Lables:', labels)
                
                labels = labels.to(device)

                # Forward propagation
                outputs = model(valid)
                # Get predictions from the maximum value
                predicted = torch.max(outputs.data, 1)[1]

                #print('Predicted: ', predicted)
                
                # Total number of labels
                total += len(labels)
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / float(total)

            # store loss and iteration
            loss_list.append(loss.data)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
        if count % 100 == 0:
            # Print Loss
            print('Epoch: {} Iteration: {}  Loss: {}  Accuracy: {} %'.format(e + 1, count, loss.data, accuracy))

        count += 1

end_time = time.time()
print('Epochs completed. Time taken (seconds): ', str(end_time - start_time))
    

Start training...
Epoch: 1 Iteration: 0  Loss: 314954.5  Accuracy: 9 %
Epoch: 2 Iteration: 0  Loss: 1269992.625  Accuracy: 46 %
Epoch: 3 Iteration: 0  Loss: 196460.984375  Accuracy: 41 %
Epoch: 4 Iteration: 0  Loss: 2604.7255859375  Accuracy: 33 %
Epoch: 5 Iteration: 0  Loss: 2.8290276527404785  Accuracy: 28 %
Epoch: 6 Iteration: 0  Loss: 2.5379722118377686  Accuracy: 37 %
Epoch: 7 Iteration: 0  Loss: 2.3835365772247314  Accuracy: 37 %
Epoch: 8 Iteration: 0  Loss: 2.3223280906677246  Accuracy: 38 %
Epoch: 9 Iteration: 0  Loss: 2.270143508911133  Accuracy: 37 %
Epoch: 10 Iteration: 0  Loss: 2.2479453086853027  Accuracy: 37 %
Epoch: 11 Iteration: 0  Loss: 2.2372217178344727  Accuracy: 37 %
Epoch: 12 Iteration: 0  Loss: 2.224053382873535  Accuracy: 38 %
Epoch: 13 Iteration: 0  Loss: 2.2303388118743896  Accuracy: 38 %
Epoch: 14 Iteration: 0  Loss: 2.2177274227142334  Accuracy: 38 %
Epoch: 15 Iteration: 0  Loss: 2.2331197261810303  Accuracy: 38 %
Epoch: 16 Iteration: 0  Loss: 2.205936670303

KeyboardInterrupt: 