In [27]:
# libraries import
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [28]:
# Remote locations of the input files used by this notebook.

# File with the attack type names.
ATTACK_TYPE_NAME_PATH = "https://raw.githubusercontent.com/divyanshu1610/network-intrusion-detection-project/main/training_attack_types"

# File with the column names.
COL_NAMES_PATH = "https://raw.githubusercontent.com/divyanshu1610/network-intrusion-detection-project/main/kddcup.names"

# Gzip-compressed dataset file.
DATASET_PATH = "https://github.com/divyanshu1610/network-intrusion-detection-project/blob/main/kddcup.data_10_percent.gz?raw=true"

In [35]:
# Build the list of column labels for the dataset.
# Each line of the names file is split on ':' and only the first field (the
# column name) is kept; the first line of the file is skipped.
column_names_df = pd.read_csv(COL_NAMES_PATH, header=None, sep=':', skiprows=[0])[0]

# Add the label column name at the end.
# Series.append was removed in pandas 2.0, so pd.concat is used instead.
column_names_df = pd.concat(
    [column_names_df, pd.Series(['attack_or_normal'])],
    ignore_index=True,
)
column_names_df

0                        duration
1                   protocol_type
2                         service
3                            flag
4                       src_bytes
5                       dst_bytes
6                            land
7                  wrong_fragment
8                          urgent
9                             hot
10              num_failed_logins
11                      logged_in
12                num_compromised
13                     root_shell
14                   su_attempted
15                       num_root
16             num_file_creations
17                     num_shells
18               num_access_files
19              num_outbound_cmds
20                  is_host_login
21                 is_guest_login
22                          count
23                      srv_count
24                    serror_rate
25                srv_serror_rate
26                    rerror_rate
27                srv_rerror_rate
28                  same_srv_rate
29            

In [44]:
# Read the gzip-compressed dataset. The file is read without a header row
# (header=None), so the column labels are supplied from column_names_df.
columns = column_names_df.values
dataset = pd.read_csv(
    DATASET_PATH,
    compression='gzip',
    header=None,
    names=columns,
    quotechar='"',
    # error_bad_lines=False was deprecated in pandas 1.3 and removed in 2.0.
    # on_bad_lines='warn' (pandas >= 1.3) keeps the same behaviour: malformed
    # rows are skipped and a warning is emitted for each of them.
    on_bad_lines='warn',
)

dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
0,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


TRANSFORMATIONS

In [37]:
# Lookup table turning categorical string values into integer codes.
# All categories share this one dict, so codes are only unique within a group
# (for example 'tcp' and 'RSTO' are both 3).
t_map = {
    # protocol names
    **{'tcp': 3, 'udp': 7, 'icmp': 9},
    # connection flags, numbered consecutively from 1 in the order listed
    **{
        flag: code
        for code, flag in enumerate(
            ('OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH'),
            start=1,
        )
    },
    # label values
    **{'normal': 0, 'attack': 1},
}

In [45]:
# NOTE: this cell overwrites the raw string columns with their integer codes,
# so it must run exactly once after the dataset has been loaded.

# attack type transform [ attack or normal ]
# The last character of each label is dropped before comparing with 'normal'
# (e.g. 'normal.' -> 'normal'): 0 for normal, 1 for anything else.
dataset['attack_or_normal'] = (
    dataset['attack_or_normal'].str[:-1].ne('normal').astype('int64')
)

# protocol_type and flag transform
# Vectorized lookup through t_map instead of a per-row Python lambda.
# Series.map yields NaN for values missing from the mapping, so check for that
# explicitly and raise KeyError, as the original t_map[r] lookup did.
for _col in ('protocol_type', 'flag'):
    _encoded = dataset[_col].map(t_map)
    _unmapped = _encoded.isna()
    if _unmapped.any():
        raise KeyError(
            f"{_col}: values not present in t_map: "
            f"{dataset.loc[_unmapped, _col].unique().tolist()}"
        )
    dataset[_col] = _encoded.astype('int64')

dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_or_normal
0,0,3,http,10,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,3,http,10,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,3,http,10,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,3,http,10,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,3,http,10,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [46]:
# Show the (rows, columns) dimensions of the dataset at this point.
dataset.shape

(494021, 42)

In [57]:
# Remove the 'service' column from the dataset.
# The frame is rebound instead of using inplace=True, and errors='ignore' is
# passed, so re-running this cell after the column is already gone does not
# raise KeyError.
dataset = dataset.drop(columns='service', errors='ignore')

dataset.shape

(494021, 41)