In [1]:
# libraries import
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Remote locations of the KDD Cup 10% data file, its column-name list and the
# attack-type list. Each URL is split in two only to keep the lines short.
DATASET_PATH = (
    "https://github.com/divyanshu1610/network-intrusion-detection-project"
    "/blob/main/kddcup.data_10_percent.gz?raw=true"
)
COL_NAMES_PATH = (
    "https://raw.githubusercontent.com/divyanshu1610/network-intrusion-detection-project"
    "/main/kddcup.names"
)
ATTACK_TYPE_NAME_PATH = (
    "https://raw.githubusercontent.com/divyanshu1610/network-intrusion-detection-project"
    "/main/training_attack_types"
)

In [3]:
# Load the feature names: the first line of the names file is skipped and every
# remaining line is split on ':' so that column 0 holds the feature name.
column_names_df = pd.read_csv(COL_NAMES_PATH, header=None, sep=':', skiprows=[0])[0]
# Add the label column name as the final entry.
# pd.concat is used because Series.append was removed in pandas 2.0.
column_names_df = pd.concat(
    [column_names_df, pd.Series(['attack_or_normal'])],
    ignore_index=True,
)
column_names_df

0                        duration
1                   protocol_type
2                         service
3                            flag
4                       src_bytes
5                       dst_bytes
6                            land
7                  wrong_fragment
8                          urgent
9                             hot
10              num_failed_logins
11                      logged_in
12                num_compromised
13                     root_shell
14                   su_attempted
15                       num_root
16             num_file_creations
17                     num_shells
18               num_access_files
19              num_outbound_cmds
20                  is_host_login
21                 is_guest_login
22                          count
23                      srv_count
24                    serror_rate
25                srv_serror_rate
26                    rerror_rate
27                srv_rerror_rate
28                  same_srv_rate
29            

In [4]:
# read dataset
# The file is read without a header row; column names come from column_names_df.
columns = column_names_df.values
dataset = pd.read_csv(
    DATASET_PATH,
    compression='gzip',
    header=None,
    quotechar='"',
    # error_bad_lines=False was removed in pandas 2.0. on_bad_lines='warn' keeps
    # the same behaviour: malformed rows are skipped and reported as warnings.
    on_bad_lines='warn',
    names=columns,
)

dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
0,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


TRANSFORMATIONS

In [5]:
# Single lookup table used to turn categorical strings into integer codes.
# Codes from different groups may coincide (e.g. 'tcp' and 'RSTO' are both 3);
# that is harmless because each group is applied to its own column.
t_map = dict(
    # protocol_type values
    tcp=3,
    udp=7,
    icmp=9,
    # connection flag values
    OTH=1,
    REJ=2,
    RSTO=3,
    RSTOS0=4,
    RSTR=5,
    S0=6,
    S1=7,
    S2=8,
    S3=9,
    SF=10,
    SH=11,
    # binary label values
    normal=0,
    attack=1,
)

In [6]:
# attack type transform [ attack or normal ]: 0 for normal traffic, 1 for anything else.
# Raw labels end with a '.' (e.g. 'normal.'). Strip that dot explicitly instead of
# blindly dropping the last character, so a label without the dot is still
# recognised as normal rather than silently counted as an attack.
dataset['attack_or_normal'] = dataset['attack_or_normal'].map(
    lambda label: 0 if label.rstrip('.') == 'normal' else 1
)

# protocol_type transform
# t_map is indexed directly so an unexpected value raises KeyError instead of
# quietly becoming NaN.
dataset['protocol_type'] = dataset['protocol_type'].map(lambda proto: t_map[proto])

# flag transform (same strict lookup as above)
dataset['flag'] = dataset['flag'].map(lambda flag: t_map[flag])

dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
0,0,3,http,10,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,3,http,10,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,3,http,10,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,3,http,10,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,3,http,10,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [7]:
# Remove the 'service' column from the frame in place (it is not encoded above),
# then show the resulting dimensions.
dataset.drop(columns=['service'], inplace=True)

dataset.shape

(494021, 41)

In [8]:
# Drop every column that contains at least one NaN.
# The axis is passed by keyword: positional arguments to dropna were removed in pandas 2.0.
dataset = dataset.dropna(axis='columns')

In [18]:
# features after rst selection
# NOTE(review): "rst" presumably stands for rough set theory — confirm.
# These are positional column indices into `dataset` as it stands after the
# 'service' column has been removed; index 40 (the last entry) is the
# 'attack_or_normal' label. They are NOT indices into column_names_df, which
# still lists 'service'.
features_col = [
    0, 1, 3, 4, 6, 9, 10, 11, 12, 14,
    15, 16, 17, 21, 23, 25, 27, 28, 29, 30,
    31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
]

In [19]:
# Reduced frame: keep only the columns at the selected positions (all rows),
# then preview the first five rows.
r_dataset = dataset.iloc[:, features_col]
r_dataset.head(n=5)

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,wrong_fragment,num_failed_logins,logged_in,num_compromised,root_shell,num_root,num_file_creations,num_shells,num_access_files,count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
0,0,3,181,5450,0,0,1,0,0,0,0,0,0,8,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,3,239,486,0,0,1,0,0,0,0,0,0,8,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,3,235,1337,0,0,1,0,0,0,0,0,0,8,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,3,219,1337,0,0,1,0,0,0,0,0,0,6,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,3,217,2032,0,0,1,0,0,0,0,0,0,6,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [20]:
# Dimensions (rows, columns) of the reduced frame
r_dataset.shape

(494021, 30)

In [21]:
# Work on a random subset of at most 50,000 rows (presumably to keep model
# training fast). A fixed random_state makes the subset, and therefore every
# score reported below, reproducible across runs. min() avoids the ValueError
# that sample() raises when n exceeds the number of available rows.
r_dataset = r_dataset.sample(n=min(50000, len(r_dataset)), random_state=42)
r_dataset

Unnamed: 0,duration,protocol_type,src_bytes,dst_bytes,wrong_fragment,num_failed_logins,logged_in,num_compromised,root_shell,num_root,num_file_creations,num_shells,num_access_files,count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
373509,0,3,0,0,0,0,0,0,0,0,0,0,0,253,1.0,0.0,0.08,0.05,0.0,255,21,0.08,0.05,0.00,0.0,1.0,1.0,0.0,0.0,1
357860,0,3,0,0,0,0,0,0,0,0,0,0,0,117,1.0,0.0,0.17,0.05,0.0,255,18,0.07,0.07,0.00,0.0,1.0,1.0,0.0,0.0,1
303034,0,9,1032,0,0,0,0,0,0,0,0,0,0,511,0.0,0.0,1.00,0.00,0.0,255,255,1.00,0.00,1.00,0.0,0.0,0.0,0.0,0.0,1
454036,2404,7,146,105,0,0,0,0,0,0,0,0,0,1,0.0,0.0,1.00,0.00,0.0,255,1,0.00,0.66,0.98,0.0,0.0,0.0,0.0,0.0,0
433034,0,9,520,0,0,0,0,0,0,0,0,0,0,511,0.0,0.0,1.00,0.00,0.0,255,255,1.00,0.00,1.00,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330729,0,9,1032,0,0,0,0,0,0,0,0,0,0,511,0.0,0.0,1.00,0.00,0.0,255,255,1.00,0.00,1.00,0.0,0.0,0.0,0.0,0.0,1
360572,0,3,0,0,0,0,0,0,0,0,0,0,0,121,1.0,0.0,0.13,0.07,0.0,255,17,0.07,0.07,0.00,0.0,1.0,1.0,0.0,0.0,1
266439,0,9,1032,0,0,0,0,0,0,0,0,0,0,511,0.0,0.0,1.00,0.00,0.0,255,255,1.00,0.00,1.00,0.0,0.0,0.0,0.0,0.0,1
375131,0,3,0,0,0,0,0,0,0,0,0,0,0,238,1.0,0.0,0.03,0.05,0.0,255,6,0.02,0.06,0.00,0.0,1.0,1.0,0.0,0.0,1


In [27]:
# Feature matrix: every column except the label. The label is removed by name
# rather than by position, so a change in column order can never leak the
# target into the features.
X = r_dataset.drop(columns=['attack_or_normal'])
# Target, kept as a single-column DataFrame (later cells flatten it with .values.ravel()).
Y = r_dataset[['attack_or_normal']]
Y

Unnamed: 0,attack_or_normal
373509,1
357860,1
303034,1
454036,0
433034,1
...,...
330729,1
360572,1
266439,1
375131,1


SCALING

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [28]:

# Min-max scale the features; X is rebound to the scaled array returned by the
# scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the scaling — consider fitting on the
# training split only.
sc = MinMaxScaler()
sc.fit(X)
X = sc.transform(X)

In [29]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)
# Shapes of the feature splits, then of the label splits
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(33500, 29) (16500, 29)
(33500, 1) (16500, 1)


SUPPORT VECTOR MACHINE

In [30]:
from sklearn.svm import SVC
import time

In [31]:
# Support vector classifier with gamma set to 'scale'; every other
# hyper-parameter is left at the library default.
model = SVC(gamma='scale')

In [32]:
# Time the training step. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() can jump if the system clock is
# adjusted. Only the difference end_time - start_time is meaningful.
start_time = time.perf_counter()
# The labels are a single-column DataFrame; flatten to 1-D as fit() expects.
model.fit(X_train, Y_train.values.ravel())
end_time = time.perf_counter()

In [33]:
# Elapsed time of the fit call, in seconds
print("Training time: ", end_time - start_time)

Training time:  2.420142889022827


In [34]:
# Score of the fitted model on the training split
print(f"Train score is: {model.score(X_train, Y_train)}")

Train score is: 0.9936716417910447


In [35]:
# Time prediction on the held-out split. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() can jump if
# the system clock is adjusted.
start_time = time.perf_counter()
Y_test_pred = model.predict(X_test)
end_time = time.perf_counter()

In [36]:
# Elapsed time of the predict call, in seconds
print("Testing time: ", end_time - start_time)

Testing time:  0.6576790809631348


In [37]:
# Accuracy on the held-out split. The predictions already stored in
# Y_test_pred are reused; model.score would run predict on X_test a second
# time and return the same value.
print("Test score is:", accuracy_score(Y_test, Y_test_pred))

Test score is: 0.9933333333333333
