In [66]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from torch import optim
import numpy as np
import pandas as pd
import os
import io
import re
import sys
import time
%matplotlib inline

In [16]:
IDS_df = pd.read_csv("./Data/CSE-CIC-IDS2018/03-02-2018.csv")

# To display the top 5 rows
IDS_df.head(5)

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [17]:
# print shape before dropping NaN rows
print(IDS_df.shape)

(1048575, 80)


In [18]:
# Finding the null values.
print(IDS_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

8100


In [19]:
#  first replace infs to NaN:
IDS_df = IDS_df.replace([np.inf, -np.inf], np.nan)

In [20]:
# print shape after dropping NaN rows
IDS_df = IDS_df.dropna()
print(IDS_df.shape)
IDS_df = IDS_df.reset_index(drop=True)

(1044525, 80)


In [21]:
# Finding the null values.
print(IDS_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

0


Examine the proportion of types of traffic:

In [22]:
y = IDS_df["Label"].values
from collections import Counter

Counter(y).most_common()

[('Benign', 758334), ('Bot', 286191)]

Convert all non-normal observations into a single class:

In [23]:
def label_anomalous(text):
    """Binarize target labels into normal or anomalous."""
    if text == "Benign":
        return 0
    else:
        return 1

IDS_df["Label"] = IDS_df["Label"].apply(label_anomalous)

In [24]:
y = IDS_df["Label"].values
Counter(y).most_common()

[(0, 758334), (1, 286191)]

Convert all categorical features into numerical form:

In [25]:
from sklearn.preprocessing import LabelEncoder

encodings_dictionary = dict()
for c in IDS_df.columns:
    if IDS_df[c].dtype == "object":
        encodings_dictionary[c] = LabelEncoder()
        IDS_df[c] = encodings_dictionary[c].fit_transform(IDS_df[c])

Split the dataset into normal and abnormal observations:

In [26]:
IDS_df_normal = IDS_df[IDS_df["Label"] == 0]
IDS_df_abnormal = IDS_df[IDS_df["Label"] == 1]
y_normal = IDS_df_normal.pop("Label").values
X_normal = IDS_df_normal.values
y_anomaly = IDS_df_abnormal.pop("Label").values
X_anomaly = IDS_df_abnormal.values

Train-test split the dataset:

In [57]:
from sklearn.model_selection import train_test_split

X_normal_train, X_normal_test, y_normal_train, y_normal_test = train_test_split(
    X_normal, y_normal, test_size=0.3, random_state=11
)
X_anomaly_train, X_anomaly_test, y_anomaly_train, y_anomaly_test = train_test_split(
    X_anomaly, y_anomaly, test_size=0.3, random_state=11
)

import numpy as np

X_train = np.concatenate((X_normal_train, X_anomaly_train))
y_train = np.concatenate((y_normal_train, y_anomaly_train))
X_test = np.concatenate((X_normal_test, X_anomaly_test))
y_test = np.concatenate((y_normal_test, y_anomaly_test))

In [58]:
batch_size = 256

# Pytorch
X_train  = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train)

X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test)

# Pytorch train and test sets
train = torch.utils.data.TensorDataset(X_train, y_train)
valid = torch.utils.data.TensorDataset(X_test, y_test)

# data loader
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
valid_loader = torch.utils.data.DataLoader(valid, batch_size = batch_size, shuffle = False)

print('Completed loading data')

Completed loading data


In [59]:
train_loader.dataset.tensors[0].shape[1]

79

In [70]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
        
# Defining the DNN model
input_size = train_loader.dataset.tensors[0].shape[1]
hidden_layers = [128,64]
output_size = 2

model = nn.Sequential(
    nn.Linear(input_size, hidden_layers[0]),
    nn.ReLU(),
    nn.Linear(hidden_layers[0], hidden_layers[1]),
    nn.ReLU(),
    nn.Linear(hidden_layers[1], output_size),
    nn.Softmax(dim=1)
)
print(model)
model.to(device)

 # Cross Entropy Loss 
error = nn.CrossEntropyLoss().to(device)
# SGD Optimizer
learning_rate = 0.001
# TODO: Try SGD
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Sequential(
  (0): Linear(in_features=79, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=2, bias=True)
  (5): Softmax(dim=1)
)


In [None]:
print('Start training...')
start_time = time.time()
    
epochs = 5
for e in range(epochs):
    count = 0
    loss_list = []
    iteration_list = []
    accuracy_list = []
   
    for i, (data, labels) in enumerate(train_loader):
        train = data.to(device)
        labels = labels.to(device)

        # Clear gradients
        optimizer.zero_grad()
        # Forward propagation
        outputs = model(train)
        # Calculate softmax and cross entropy loss
        loss = error(outputs, labels)
        # Calculating gradients
        loss.backward()
        # Update parameters
        optimizer.step()

        if count % 100 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for data, labels in valid_loader:
                valid = data.to(device)
                labels = labels.to(device)

                # Forward propagation
                outputs = model(valid)
                # Get predictions from the maximum value
                predicted = torch.max(outputs.data, 1)[1]

                # Total number of labels
                total += len(labels)
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / float(total)

            # store loss and iteration
            loss_list.append(loss.data)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
        if count % 100 == 0:
            # Print Loss
            print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss.data, accuracy))

        count += 1

end_time = time.time()
print('Epochs completed. Time taken (seconds): ', str(end_time - start_time))
    

Start training...
Iteration: 0  Loss: 0.316585510969162  Accuracy: 72.60075378417969 %
Iteration: 100  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 200  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 300  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 400  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 500  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 600  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 700  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 800  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 900  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 1000  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 1100  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 1200  Loss: 0.31326231360435486  Accuracy: 72.60075378417969 %
Iteration: 1300  Loss: 0.313262313