# Packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

import requests


# Data

## import

Download and unzip the data

In [None]:
dataset_url = "http://205.174.165.80/CICDataset/NSL-KDD/Dataset/NSL-KDD.zip"
r = requests.get(dataset_url, allow_redirects=True)
open('data.zip', 'wb').write(r.content)

!unzip data.zip -d data

NameError: ignored

Read the data

In [None]:
train_data = pd.read_csv("data/KDDTrain+.txt")
test_data = pd.read_csv("data/KDDTest+.txt")
train_data.shape, test_data.shape

((125972, 43), (22543, 43))

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:

# Dataset = pd.read_csv("/content/drive/MyDrive/data/NSL_train.csv")

Separate the data from labels

In [None]:
X_train = train_data.iloc[:, :-2]
y_train = train_data.iloc[:, -2]

X_test = test_data.iloc[:, :-2]
y_test = test_data.iloc[:, -2]


X_train.shape, y_train.shape, X_test.shape, y_test.shape

((125972, 41), (125972,), (22543, 41), (22543,))

Explore the data

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 41 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   0         125972 non-null  int64  
 1   tcp       125972 non-null  object 
 2   ftp_data  125972 non-null  object 
 3   SF        125972 non-null  object 
 4   491       125972 non-null  int64  
 5   0.1       125972 non-null  int64  
 6   0.2       125972 non-null  int64  
 7   0.3       125972 non-null  int64  
 8   0.4       125972 non-null  int64  
 9   0.5       125972 non-null  int64  
 10  0.6       125972 non-null  int64  
 11  0.7       125972 non-null  int64  
 12  0.8       125972 non-null  int64  
 13  0.9       125972 non-null  int64  
 14  0.10      125972 non-null  int64  
 15  0.11      125972 non-null  int64  
 16  0.12      125972 non-null  int64  
 17  0.13      125972 non-null  int64  
 18  0.14      125972 non-null  int64  
 19  0.15      125972 non-null  int64  
 20  0.16

In [None]:
v = X_train.iloc[:,-1].unique()
v.sort()
v

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [None]:
X_train.describe()

Unnamed: 0,0,491,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,150,25,0.17.1,0.03,0.17.2,0.00.6,0.00.7,0.00.8,0.05,0.00.9
count,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,...,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0
mean,287.146929,45567.1,19779.27,0.000198,0.022688,0.000111,0.204411,0.001222,0.395739,0.279253,...,182.1492,115.653725,0.521244,0.082952,0.148379,0.032543,0.284455,0.278487,0.118832,0.120241
std,2604.525522,5870354.0,4021285.0,0.014086,0.253531,0.014366,2.149977,0.045239,0.489011,23.942137,...,99.206565,110.702886,0.44895,0.188922,0.308998,0.112564,0.444785,0.44567,0.306559,0.31946
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Convert the problem from multi-class classification to binary classification
* We assume that all the other anomaly categories are under the category `abnormal`
* In result, we have only two categories.
  * `normal`
  * `abnormal`

In [None]:
mask = (y_train != "normal")
y_train.loc[mask] = "abnromal"
y_train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0           normal
1         abnromal
2           normal
3           normal
4         abnromal
            ...   
125967    abnromal
125968      normal
125969      normal
125970    abnromal
125971      normal
Name: normal, Length: 125972, dtype: object

In [None]:
assert y_train.unique().size==2, "y should have only 2 categories"
y_train.unique()

array(['normal', 'abnromal'], dtype=object)

## preprocessing





The whole pipeline

In [None]:
def fit_transformers(X_train = X_train, y_train = y_train):
  ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), slice(1,4))],remainder= 'passthrough')
  ct.fit(X_train)

  X_encoded = ct.transform(X_train)

  le = LabelEncoder()
  le.fit(y_train)

  sc = RobustScaler()
  sc.fit(X_encoded)

  return ct, le, sc

ct, le, sc = fit_transformers()

def preprocess(x, y, ct = ct, le = le, sc = sc):

  # It is better to pass the transformers and all other preprocessing methods


  # ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), slice(1,4))],remainder= 'passthrough')
  x_encoded = ct.fit_transform(x)

  print("input values of categorical features")
  print([f for f in ct.named_transformers_['encoder'].categories_])

  # print(np.sum([len(f) for f in ct.named_transformers_['encoder1'].categories_]))


  # le = LabelEncoder()
  y_encoded = le.fit_transform(y)

  # sc = RobustScaler()
  x_scaled = sc.fit_transform(x_encoded)

  # We do not need scaling for labels' column
  y_scaled = y_encoded

  return x_encoded, x_scaled, y_scaled


Encoding X_train categroical features

In [None]:
X_train_encoded, X_train_scaled, y_train_encoded = preprocess(X_train, y_train)

input values of categorical features
[array(['icmp', 'tcp', 'udp'], dtype=object), array(['IRC', 'X11', 'Z39_50', 'aol', 'auth', 'bgp', 'courier',
       'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u',
       'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp',
       'ftp_data', 'gopher', 'harvest', 'hostnames', 'http', 'http_2784',
       'http_443', 'http_8001', 'imap4', 'iso_tsap', 'klogin', 'kshell',
       'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm',
       'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u',
       'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private',
       'red_i', 'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh',
       'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time',
       'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois'],
      dtype=object), array(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
       'SF', 'SH'], dtype=object)]


In [None]:
X_train.shape, X_train_encoded.shape

((125972, 41), (125972, 122))

Encoding the labels 

In [None]:
y_train_encoded, np.array(y_train)

(array([1, 0, 1, ..., 1, 0, 1]),
 array(['normal', 'abnromal', 'normal', ..., 'normal', 'abnromal',
        'normal'], dtype=object))

Scaling the data

In [None]:
X_train_scaled

array([[ 0.  , -1.  ,  1.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.01,  0.  ,  0.01],
       ...,
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.01,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  1.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ]])

# Model

## Settings

## Logistic Regression

## Decision Trees

# Prediction

In [None]:
X_test_encoded, X_test_scaled, y_test_encoded = preprocess(X_test, y_test)

input values of categorical features
[array(['icmp', 'tcp', 'udp'], dtype=object), array(['IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns',
       'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i',
       'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher',
       'hostnames', 'http', 'http_443', 'imap4', 'iso_tsap', 'klogin',
       'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm',
       'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u',
       'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private',
       'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc',
       'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urp_i',
       'uucp', 'uucp_path', 'vmnet', 'whois'], dtype=object), array(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
       'SF', 'SH'], dtype=object)]


In [None]:
X_test_scaled

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         2.77777778,  5.88235294],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.19444444,  0.41176471],
       [ 0.        , -1.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.22222222,  5.88235294]])