In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA



# Locations of the feature-name list and of the raw connection records.
feature_names_path = 'datasets/feature_names.csv'
kdd_data_path = 'datasets/kddcup.data_10_percent_corrected'

# The feature-name file is read without a header row; its first column
# supplies the column labels for the data file.
kdd_names_read = pd.read_csv(feature_names_path, header=None)
kdd_names = kdd_names_read.iloc[:, 0].values

# Passing `names=` makes pandas label the columns explicitly instead of
# taking the labels from the first line of the data file.
kdd = pd.read_csv(kdd_data_path, names=kdd_names)
kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [5]:
# Print the distinct values of every object-dtype (string) column of `kdd`.
# Iterating over (label, dtype) pairs replaces the positional lookup
# `kdd.dtypes[i]`: `kdd.dtypes` is indexed by column label, and treating an
# integer key as a position on such a Series is deprecated in recent pandas.
for column_name, column_dtype in kdd.dtypes.items():
    if column_dtype == object:
        print(column_name, end=": ")
        print(kdd[column_name].unique())

protocol_type: ['tcp' 'udp' 'icmp']
service: ['http' 'smtp' 'finger' 'domain_u' 'auth' 'telnet' 'ftp' 'eco_i' 'ntp_u'
 'ecr_i' 'other' 'private' 'pop_3' 'ftp_data' 'rje' 'time' 'mtp' 'link'
 'remote_job' 'gopher' 'ssh' 'name' 'whois' 'domain' 'login' 'imap4'
 'daytime' 'ctf' 'nntp' 'shell' 'IRC' 'nnsp' 'http_443' 'exec' 'printer'
 'efs' 'courier' 'uucp' 'klogin' 'kshell' 'echo' 'discard' 'systat'
 'supdup' 'iso_tsap' 'hostnames' 'csnet_ns' 'pop_2' 'sunrpc' 'uucp_path'
 'netbios_ns' 'netbios_ssn' 'netbios_dgm' 'sql_net' 'vmnet' 'bgp' 'Z39_50'
 'ldap' 'netstat' 'urh_i' 'X11' 'urp_i' 'pm_dump' 'tftp_u' 'tim_i' 'red_i']
flag: ['SF' 'S1' 'REJ' 'S2' 'S0' 'S3' 'RSTO' 'RSTR' 'RSTOS0' 'OTH' 'SH']
attack_type: ['normal.' 'buffer_overflow.' 'loadmodule.' 'perl.' 'neptune.' 'smurf.'
 'guess_passwd.' 'pod.' 'teardrop.' 'portsweep.' 'ipsweep.' 'land.'
 'ftp_write.' 'back.' 'imap.' 'satan.' 'phf.' 'nmap.' 'multihop.'
 'warezmaster.' 'warezclient.' 'spy.' 'rootkit.']


In [6]:
# Work on a copy so the raw `kdd` frame keeps its original attack labels.
kdd_multiclass = kdd.copy()

# Raw `attack_type` labels grouped by the coarse category each one is mapped to.
dos = ['neptune.', 'land.', 'pod.', 'smurf.', 'teardrop.', 'back.', 'worm.', 'udpstorm.', 'processtable.', 'apache2.']
probe = ['ipsweep.','satan.','nmap.','portsweep.','mscan.','saint.']
R2L = ['ftp_write.','guess_passwd.', 'imap.','multihop.','phf.','spy.','warezclient.','warezmaster.','snmpguess.','named.','xlock.','snmpgetattack.','httptunnel.','sendmail.']
U2R = ['buffer_overflow.','loadmodule.','perl.','rootkit.','ps.','xterm.','sqlattack.']

# Flatten the four groups into one raw-label -> category lookup.
attack_categories = {
    raw_label: category
    for category, raw_labels in (('dos', dos), ('probe', probe), ('R2L', R2L), ('U2R', U2R))
    for raw_label in raw_labels
}

# Relabel through a regular column assignment. The previous code wrote into
# `kdd_multiclass['attack_type'].values[mask]`, which only changes the frame
# when `.values` is a writable view of its data; with pandas Copy-on-Write the
# array is read-only and the write fails. Labels absent from the lookup
# (e.g. 'normal.') are left unchanged.
kdd_multiclass['attack_type'] = kdd_multiclass['attack_type'].replace(attack_categories)

In [8]:
# Print the distinct values of every object-dtype (string) column of
# `kdd_multiclass`, to check the relabelled `attack_type` categories.
# Iterating over (label, dtype) pairs replaces the positional lookup
# `kdd_multiclass.dtypes[i]`: the dtypes Series is indexed by column label, and
# treating an integer key as a position on it is deprecated in recent pandas.
for column_name, column_dtype in kdd_multiclass.dtypes.items():
    if column_dtype == object:
        print(column_name, end=": ")
        print(kdd_multiclass[column_name].unique())

protocol_type: ['tcp' 'udp' 'icmp']
service: ['http' 'smtp' 'finger' 'domain_u' 'auth' 'telnet' 'ftp' 'eco_i' 'ntp_u'
 'ecr_i' 'other' 'private' 'pop_3' 'ftp_data' 'rje' 'time' 'mtp' 'link'
 'remote_job' 'gopher' 'ssh' 'name' 'whois' 'domain' 'login' 'imap4'
 'daytime' 'ctf' 'nntp' 'shell' 'IRC' 'nnsp' 'http_443' 'exec' 'printer'
 'efs' 'courier' 'uucp' 'klogin' 'kshell' 'echo' 'discard' 'systat'
 'supdup' 'iso_tsap' 'hostnames' 'csnet_ns' 'pop_2' 'sunrpc' 'uucp_path'
 'netbios_ns' 'netbios_ssn' 'netbios_dgm' 'sql_net' 'vmnet' 'bgp' 'Z39_50'
 'ldap' 'netstat' 'urh_i' 'X11' 'urp_i' 'pm_dump' 'tftp_u' 'tim_i' 'red_i']
flag: ['SF' 'S1' 'REJ' 'S2' 'S0' 'S3' 'RSTO' 'RSTR' 'RSTOS0' 'OTH' 'SH']
attack_type: ['normal.' 'U2R' 'dos' 'R2L' 'probe']


In [35]:
# One-hot encode the four categorical columns; each new indicator column is
# named "<source column>_<value>".
categorical_columns = ['protocol_type', 'service', 'flag', 'attack_type']
kdd_multi_encoded = pd.get_dummies(kdd_multiclass, columns=categorical_columns)
# Bare expression in the middle of the cell: evaluated but not displayed.
kdd_multi_encoded.columns

# Remove the five attack-category indicator columns from the frame ...
kdd_normal = kdd_multi_encoded.pop('attack_type_normal.')
kdd_dos = kdd_multi_encoded.pop('attack_type_dos')
kdd_probe = kdd_multi_encoded.pop('attack_type_probe')
kdd_u2r = kdd_multi_encoded.pop('attack_type_U2R')
kdd_r2l = kdd_multi_encoded.pop('attack_type_R2L')

# ... and append them again under short names, so the targets end up as the
# last five columns in this order.
for short_name, indicator in (
    ('normal', kdd_normal),
    ('dos', kdd_dos),
    ('probe', kdd_probe),
    ('u2r', kdd_u2r),
    ('r2l', kdd_r2l),
):
    kdd_multi_encoded[short_name] = indicator


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,normal,dos,probe,u2r,r2l
duration,1.000000,0.004258,0.005440,-0.000452,-0.003235,0.003786,0.013213,0.005239,-0.017265,0.058095,...,-0.000728,-0.000456,0.002990,0.021548,-0.000998,0.118014,-0.132439,0.056540,0.000478,0.034561
src_bytes,0.004258,1.000000,-0.000002,-0.000020,-0.000139,-0.000005,0.004483,-0.000027,0.001701,0.000119,...,0.000116,0.000064,0.002325,-0.001725,-0.000045,-0.000936,-0.004050,0.015362,-0.000022,0.013008
dst_bytes,0.005440,-0.000002,1.000000,-0.000175,-0.001254,0.016288,0.004365,0.049330,0.047814,0.023298,...,0.008182,0.006520,0.000474,0.014036,-0.000387,0.037709,-0.048617,-0.002406,0.001327,0.103341
land,-0.000452,-0.000020,-0.000175,1.000000,-0.000318,-0.000017,-0.000295,-0.000065,-0.002784,-0.000038,...,-0.000072,-0.000047,-0.000030,-0.012075,-0.000098,-0.002542,0.002668,-0.000611,-0.000068,-0.000319
wrong_fragment,-0.003235,-0.000139,-0.001254,-0.000318,1.000000,-0.000123,-0.002106,-0.000467,-0.019908,-0.000271,...,-0.000513,-0.000333,-0.000215,0.026372,-0.000702,-0.023630,0.024426,-0.004369,-0.000490,-0.002281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
normal,0.118014,-0.000936,0.037709,-0.002542,-0.023630,0.001498,0.006327,0.001060,0.795282,0.005046,...,0.020272,0.008964,0.005692,0.206691,-0.007288,1.000000,-0.967386,-0.045337,-0.005080,-0.023667
dos,-0.132439,-0.004050,-0.048617,0.002668,0.024426,-0.005024,-0.058628,-0.019111,-0.784107,-0.005186,...,-0.020057,-0.010037,-0.008790,-0.185490,-0.028755,-0.967386,1.000000,-0.178875,-0.020045,-0.093377
probe,0.056540,0.015362,-0.002406,-0.000611,-0.004369,-0.000235,-0.003927,-0.000896,-0.037570,-0.000520,...,-0.000984,0.002560,-0.000412,-0.087896,0.154695,-0.045337,-0.178875,1.000000,-0.000939,-0.004376
u2r,0.000478,-0.000022,0.001327,-0.000068,-0.000490,0.035781,0.017964,0.012613,0.021262,0.006854,...,-0.000110,-0.000072,-0.000046,0.005204,-0.000151,-0.005080,-0.020045,-0.000939,1.000000,-0.000490


In [38]:
# Pairwise correlation matrix over all encoded columns, including the five
# target indicators. Pearson is the pandas default, spelled out here.
kdd_corr = kdd_multi_encoded.corr(method='pearson')


In [47]:
# Target indicator columns the features are screened against.
target_columns = ['normal', 'probe', 'u2r', 'r2l', 'dos']

# A row is weakly correlated when its absolute correlation with every target
# is below 0.1. NaN compares as False, so rows with NaN correlations are not
# flagged here.
weakly_correlated = (kdd_corr[target_columns].abs() < 0.1).all(axis=1)
kdd_corr_select = kdd_corr.drop(kdd_corr.loc[weakly_correlated].index)

# These two rows are removed by name.
# NOTE(review): presumably they survive the threshold filter because their
# correlations are NaN (constant columns) - confirm against the data.
kdd_corr_select = kdd_corr_select.drop(index=['num_outbound_cmds', 'is_host_login'])
kdd_corr_select


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,normal,dos,probe,u2r,r2l
duration,1.0,0.004258,0.00544,-0.000452,-0.003235,0.003786,0.013213,0.005239,-0.017265,0.058095,...,-0.000728,-0.000456,0.00299,0.021548,-0.000998,0.118014,-0.132439,0.05654,0.000478,0.034561
dst_bytes,0.00544,-2e-06,1.0,-0.000175,-0.001254,0.016288,0.004365,0.04933,0.047814,0.023298,...,0.008182,0.00652,0.000474,0.014036,-0.000387,0.037709,-0.048617,-0.002406,0.001327,0.103341
hot,0.013213,0.004483,0.004365,-0.000295,-0.002106,0.000356,1.0,0.00874,0.105305,0.007348,...,0.00049,0.006376,0.003828,0.023071,-0.00065,0.006327,-0.058628,-0.003927,0.017964,0.449503
num_failed_logins,0.005239,-2.7e-05,0.04933,-6.5e-05,-0.000467,0.141996,0.00874,1.0,-0.001145,0.006907,...,-0.000105,-6.8e-05,0.057936,-0.010307,-0.000144,0.00106,-0.019111,-0.000896,0.012613,0.152698
logged_in,-0.017265,0.001701,0.047814,-0.002784,-0.019908,0.006164,0.105305,-0.001145,1.0,0.013612,...,0.023097,0.01589,0.006986,0.227275,-0.00614,0.795282,-0.784107,-0.03757,0.021262,0.104174
root_shell,0.02134,-2.2e-05,0.03168,-7e-05,-0.000504,0.03479,0.024065,0.036983,0.025293,0.255557,...,-0.000113,-7.4e-05,-4.7e-05,0.005831,-0.000155,0.005871,-0.020615,-0.000966,0.486117,0.023635
num_shells,-0.000169,5e-06,0.000144,-6.6e-05,-0.000473,-2.6e-05,0.006373,-9.7e-05,0.023776,0.009341,...,-0.000107,-6.9e-05,-4.5e-05,0.005482,-0.000146,0.014951,-0.019378,-0.000908,0.125231,0.014933
is_guest_login,0.023424,-8.2e-05,0.001289,-0.000249,-0.001778,-9.6e-05,0.843572,-0.000365,0.089318,-0.000212,...,0.004664,-0.00026,-0.000168,0.020336,-0.000548,0.032299,-0.072798,-0.003412,-0.000382,0.356408
count,-0.105153,-0.003098,-0.040373,-0.01026,-0.061934,-0.003997,-0.068451,-0.015184,-0.634643,-0.008792,...,-0.016511,-0.010696,-0.006955,0.392237,-0.022875,-0.752978,0.762798,-0.06888,-0.015715,-0.074217
srv_count,-0.08025,-0.002501,-0.030544,-0.007886,-0.047789,-0.003047,-0.052164,-0.011578,-0.478122,-0.006704,...,-0.012459,-0.008101,-0.005277,0.634065,-0.017129,-0.566829,0.586434,-0.106098,-0.012148,-0.056381


In [None]:
# One view of the selected correlation rows per target, each sorted in
# ascending order of its correlation with that target.
corr_normal, corr_probe, corr_u2r, corr_r2l, corr_dos = (
    kdd_corr_select.sort_values(by=target)
    for target in ('normal', 'probe', 'u2r', 'r2l', 'dos')
)
