<h1>XGBoost</h1>

In [1]:
import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, f_classif, RFE, RFECV
from sklearn.metrics import auc, accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler  

import pickle
from joblib import dump, load


%matplotlib inline
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

  from numpy.core.umath_tests import inner1d


<h2> Understand Data </h2>

In [2]:
#features used to differentiate normal connections from attacks
field_names = pd.read_csv("./Data/Field Names.csv", names=["header", "type"])

added_field_names = pd.DataFrame({"header":["attack", "cnt_correct"],"type":["Nominal", "Count"]})
field_names = field_names.append(added_field_names,sort=False).reset_index().drop("index", axis=1) 

#print data header horizontally for easier viewing
field_names1 = field_names.transpose()
field_names1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
header,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,cnt_correct
type,Numeric,Nominal,Nominal,Nominal,Numeric,Numeric,Binary,Numeric,Numeric,Numeric,Numeric,Binary,Numeric,Binary,Binary,Numeric,Numeric,Numeric,Numeric,Numeric,Binary,Binary,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Numeric,Nominal,Count


In [3]:
#Combine data file "KDDTrain+.txt" and  header file "Field Names.csv"
data = pd.read_csv("./Data/KDDTrain+.txt", names=field_names["header"])
data.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,cnt_correct
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


<h2> Prepare Training and Test Data </h2>

In [4]:
#Training Data#
X_train_raw = data.drop(['attack', 'cnt_correct'], axis=1)
Y_train_raw = data[['attack']]

In [5]:
#X_train_raw.head()
print("Train Set shape", X_train_raw.shape)

Train Set shape (125973, 41)


In [6]:
#X_train_raw.head()
print("Train Set shape", Y_train_raw.shape)

Train Set shape (125973, 1)


In [7]:
# Process Training set and test set, converting normal connection to 0, attack to 1. 
for i, row in Y_train_raw.iterrows():
    #print(row['category'])
    if row['attack'] == 'normal':
        row['attack'] = 0
    else:
        row['attack'] = 1

Y_train_raw.head()

Unnamed: 0,attack
0,0
1,0
2,1
3,0
4,0


In [8]:
#Process Test Data 
data_test = pd.read_csv("./Data/KDDTest+.txt", names=field_names["header"])

X_test_raw = data_test.drop(['attack', 'cnt_correct'], axis=1)
Y_test_raw = data_test[['attack']]

for i, row in Y_test_raw.iterrows():
    #print(row['category'])
    if row['attack'] == 'normal':
        row['attack'] = 0
    else:
        row['attack'] = 1
        
Y_test_raw.head()

Unnamed: 0,attack
0,1
1,1
2,0
3,1
4,1


In [9]:
#print the shape of the datasets
print("XTrain Set shape", X_train_raw.shape)
print("YTrain Set shape", Y_train_raw.shape)
print("XTest Set shape", X_test_raw.shape)
print("YTest Set shape", Y_test_raw.shape)

XTrain Set shape (125973, 41)
YTrain Set shape (125973, 1)
XTest Set shape (22544, 41)
YTest Set shape (22544, 1)


<h2> Apply One Hot Encoding </h2>

In [10]:
#print columns with categorical data 
field_names.loc[field_names['type'] =='Nominal'] ##only 3 symbolic protocol type, service, flag

Unnamed: 0,header,type
1,protocol_type,Nominal
2,service,Nominal
3,flag,Nominal
41,attack,Nominal


In [11]:
cat_columns = ["protocol_type", "service","flag"]

X_train = pd.get_dummies(X_train_raw, prefix_sep="__",
                              columns=cat_columns)

X_test = pd.get_dummies(X_test_raw, prefix_sep="__",
                              columns=cat_columns)

In [12]:
print("XTrain Set shape:", X_train.shape)
print("XTest Set shape:", X_test.shape)

XTrain Set shape: (125973, 122)
XTest Set shape: (22544, 116)


In [13]:
# Get missing columns in the training test
missing_cols = set(X_train.columns ) - set(X_test.columns )

# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0
    
# Ensure the order of column in the test set is in the same order than in train set
X_test = X_test[X_train.columns]

In [14]:
print("XTrain Set shape:", X_train.shape)
print("XTest Set shape:", X_test.shape)

XTrain Set shape: (125973, 122)
XTest Set shape: (22544, 122)


In [15]:
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type__icmp,protocol_type__tcp,protocol_type__udp,service__IRC,service__X11,service__Z39_50,service__aol,service__auth,service__bgp,service__courier,service__csnet_ns,service__ctf,service__daytime,service__discard,service__domain,service__domain_u,service__echo,service__eco_i,service__ecr_i,service__efs,service__exec,service__finger,service__ftp,service__ftp_data,service__gopher,service__harvest,service__hostnames,service__http,service__http_2784,service__http_443,service__http_8001,service__imap4,service__iso_tsap,service__klogin,service__kshell,service__ldap,service__link,service__login,service__mtp,service__name,service__netbios_dgm,service__netbios_ns,service__netbios_ssn,service__netstat,service__nnsp,service__nntp,service__ntp_u,service__other,service__pm_dump,service__pop_2,service__pop_3,service__printer,service__private,service__red_i,service__remote_job,service__rje,service__shell,service__smtp,service__sql_net,service__ssh,service__sunrpc,service__supdup,service__systat,service__telnet,service__tftp_u,service__tim_i,service__time,service__urh_i,service__urp_i,service__uucp,service__uucp_path,service__vmnet,service__whois,flag__OTH,flag__REJ,flag__RSTO,flag__RSTOS0,flag__RSTR,flag__S0,flag__S1,flag__S2,flag__S3,flag__SF,flag__SH
0,0,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [16]:
X_test.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type__icmp,protocol_type__tcp,protocol_type__udp,service__IRC,service__X11,service__Z39_50,service__aol,service__auth,service__bgp,service__courier,service__csnet_ns,service__ctf,service__daytime,service__discard,service__domain,service__domain_u,service__echo,service__eco_i,service__ecr_i,service__efs,service__exec,service__finger,service__ftp,service__ftp_data,service__gopher,service__harvest,service__hostnames,service__http,service__http_2784,service__http_443,service__http_8001,service__imap4,service__iso_tsap,service__klogin,service__kshell,service__ldap,service__link,service__login,service__mtp,service__name,service__netbios_dgm,service__netbios_ns,service__netbios_ssn,service__netstat,service__nnsp,service__nntp,service__ntp_u,service__other,service__pm_dump,service__pop_2,service__pop_3,service__printer,service__private,service__red_i,service__remote_job,service__rje,service__shell,service__smtp,service__sql_net,service__ssh,service__sunrpc,service__supdup,service__systat,service__telnet,service__tftp_u,service__tim_i,service__time,service__urh_i,service__urp_i,service__uucp,service__uucp_path,service__vmnet,service__whois,flag__OTH,flag__REJ,flag__RSTO,flag__RSTOS0,flag__RSTR,flag__S0,flag__S1,flag__S2,flag__S3,flag__SF,flag__SH
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229,10,0.0,0.0,1.0,1.0,0.04,0.06,0.0,255,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,1,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,2,12983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0.0,0.12,1.0,0.5,1.0,0.0,0.75,29,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [17]:
y_train = Y_train_raw.astype('int')
y_test = Y_test_raw.astype('int')

<h2> Normalization </h2>

In [18]:
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type__icmp,protocol_type__tcp,protocol_type__udp,service__IRC,service__X11,service__Z39_50,service__aol,service__auth,service__bgp,service__courier,service__csnet_ns,service__ctf,service__daytime,service__discard,service__domain,service__domain_u,service__echo,service__eco_i,service__ecr_i,service__efs,service__exec,service__finger,service__ftp,service__ftp_data,service__gopher,service__harvest,service__hostnames,service__http,service__http_2784,service__http_443,service__http_8001,service__imap4,service__iso_tsap,service__klogin,service__kshell,service__ldap,service__link,service__login,service__mtp,service__name,service__netbios_dgm,service__netbios_ns,service__netbios_ssn,service__netstat,service__nnsp,service__nntp,service__ntp_u,service__other,service__pm_dump,service__pop_2,service__pop_3,service__printer,service__private,service__red_i,service__remote_job,service__rje,service__shell,service__smtp,service__sql_net,service__ssh,service__sunrpc,service__supdup,service__systat,service__telnet,service__tftp_u,service__tim_i,service__time,service__urh_i,service__urp_i,service__uucp,service__uucp_path,service__vmnet,service__whois,flag__OTH,flag__REJ,flag__RSTO,flag__RSTOS0,flag__RSTR,flag__S0,flag__S1,flag__S2,flag__S3,flag__SF,flag__SH
0,0,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [19]:
# Replace the following columns with the log of themselves 
'''If you have variables that always get positive numbers, such as lenght, weight, etc., and that showes 
much more variation with higher values (heteroscedasticity), a log-normal distribution (i.e., normal after 
log-transformation) might be a clearly better description of the data than a normal distribution.
Log-transforming that kind of variables makes the distributions more normally distributed, stabilizes the variances, 
but also makes your model multiplative on the raw scale instead of additive. '''

log_cols = ["duration", "src_bytes", "dst_bytes", "hot", "num_failed_logins", "num_compromised", "num_root", "num_file_creations", 
            "num_shells", "num_access_files"]

dataframe = X_train

for log_col in log_cols:
    if log_col in dataframe.columns:
        print("Processing column: %s" %(log_col))
        dataframe[log_col] = dataframe[log_col].map(lambda x: max(np.log(x), 0))
    else:
        pass

Processing column: duration


  app.launch_new_instance()


Processing column: src_bytes
Processing column: dst_bytes
Processing column: hot
Processing column: num_failed_logins
Processing column: num_compromised
Processing column: num_root
Processing column: num_file_creations
Processing column: num_shells
Processing column: num_access_files


In [20]:
#Normalize columns with numeric data (binary data is not normalized)
numeric_cols = list(field_names["header"][field_names["type"] == "Numeric"])
numeric_cols.remove("num_outbound_cmds")
print(numeric_cols)

['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [21]:
#normalize these columns
#https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame
X_train_norm = dataframe

norm_type = "mean-std"

for numeric_col in numeric_cols:
    print("Processing column: %s" %(numeric_col))
    tr_values = X_train_norm[numeric_col]
    
    if norm_type == "min-max":
        X_train_norm[numeric_col] = (tr_values - np.min(tr_values))/(np.max(tr_values) - np.min(tr_values))
        X_train_norm[numeric_col] = X_train_norm[numeric_col].astype("float64")
    elif norm_type == "mean-std":
        X_train_norm[numeric_col] = (tr_values - np.min(tr_values))/np.std(tr_values)
        X_train_norm[numeric_col] = X_train_norm[numeric_col].astype("float64")
    else:
        pass

Processing column: duration
Processing column: src_bytes
Processing column: dst_bytes
Processing column: wrong_fragment
Processing column: urgent
Processing column: hot
Processing column: num_failed_logins
Processing column: num_compromised
Processing column: num_root
Processing column: num_file_creations
Processing column: num_shells
Processing column: num_access_files
Processing column: count
Processing column: srv_count
Processing column: serror_rate
Processing column: srv_serror_rate
Processing column: rerror_rate
Processing column: srv_rerror_rate
Processing column: same_srv_rate
Processing column: diff_srv_rate
Processing column: srv_diff_host_rate
Processing column: dst_host_count
Processing column: dst_host_srv_count
Processing column: dst_host_same_srv_rate
Processing column: dst_host_diff_srv_rate
Processing column: dst_host_same_src_port_rate
Processing column: dst_host_srv_diff_host_rate
Processing column: dst_host_serror_rate
Processing column: dst_host_srv_serror_rate
Pro

In [22]:
X_train_norm.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type__icmp,protocol_type__tcp,protocol_type__udp,service__IRC,service__X11,service__Z39_50,service__aol,service__auth,service__bgp,service__courier,service__csnet_ns,service__ctf,service__daytime,service__discard,service__domain,service__domain_u,service__echo,service__eco_i,service__ecr_i,service__efs,service__exec,service__finger,service__ftp,service__ftp_data,service__gopher,service__harvest,service__hostnames,service__http,service__http_2784,service__http_443,service__http_8001,service__imap4,service__iso_tsap,service__klogin,service__kshell,service__ldap,service__link,service__login,service__mtp,service__name,service__netbios_dgm,service__netbios_ns,service__netbios_ssn,service__netstat,service__nnsp,service__nntp,service__ntp_u,service__other,service__pm_dump,service__pop_2,service__pop_3,service__printer,service__private,service__red_i,service__remote_job,service__rje,service__shell,service__smtp,service__sql_net,service__ssh,service__sunrpc,service__supdup,service__systat,service__telnet,service__tftp_u,service__tim_i,service__time,service__urh_i,service__urp_i,service__uucp,service__uucp_path,service__vmnet,service__whois,flag__OTH,flag__REJ,flag__RSTO,flag__RSTOS0,flag__RSTR,flag__S0,flag__S1,flag__S2,flag__S3,flag__SF,flag__SH
0,0.0,2.068858,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.017466,0.027535,0.0,0.0,0.0,0.0,2.274686,0.0,0.0,1.512008,0.225831,0.378663,0.158796,0.550169,0.0,0.0,0.0,0.163102,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0.0,1.663918,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.113529,0.013767,0.0,0.0,0.0,0.0,0.181975,0.831884,0.0,2.570414,0.009033,0.0,3.17593,2.847934,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,1.074159,0.082604,2.239873,2.237033,0.0,0.0,0.113734,0.388212,0.0,2.570414,0.234864,0.222743,0.264661,0.0,0.0,2.248291,2.243826,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.0,1.818547,2.537965,0,0.0,0.0,0.0,0.0,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.043665,0.068837,0.447975,0.447407,0.0,0.0,2.274686,0.0,0.0,0.302402,2.303475,2.227432,0.0,0.097089,0.355355,0.067449,0.022438,0.0,0.031303,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0.0,1.767319,1.702167,0,0.0,0.0,0.0,0.0,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.26199,0.440556,0.0,0.0,0.0,0.0,2.274686,0.0,0.346381,2.570414,2.303475,2.227432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [23]:
# Replace the following columns with the log of themselves because they have very big range
dataframe_test = X_test

for log_col in log_cols:
    if log_col in dataframe_test.columns:
        print("Processing column: %s" %(log_col))
        dataframe_test[log_col] = dataframe_test[log_col].map(lambda x: max(np.log(x), 0))
    else:
        pass

Processing column: duration
Processing column: src_bytes
Processing column: dst_bytes


  import sys


Processing column: hot
Processing column: num_failed_logins
Processing column: num_compromised
Processing column: num_root
Processing column: num_file_creations
Processing column: num_shells
Processing column: num_access_files


In [24]:
#normalize these columns
#https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame
X_test_norm = dataframe_test

#normalisation
for numeric_col in numeric_cols:
    print("Processing column: %s" %(numeric_col))
    values_test = X_test_norm[numeric_col]
    
    if norm_type == "min-max":
        X_test_norm[numeric_col] = (values_test - np.min(values_test))/(np.max(values_test) - np.min(values_test))
        X_test_norm[numeric_col] = X_test_norm[numeric_col].astype("float64")
    elif norm_type == "mean-std":
        X_test_norm[numeric_col] = (values_test - np.min(values_test))/np.std(values_test)
        X_test_norm[numeric_col] = X_test_norm[numeric_col].astype("float64")
    else:
        pass

Processing column: duration
Processing column: src_bytes
Processing column: dst_bytes
Processing column: wrong_fragment
Processing column: urgent
Processing column: hot
Processing column: num_failed_logins
Processing column: num_compromised
Processing column: num_root
Processing column: num_file_creations
Processing column: num_shells
Processing column: num_access_files
Processing column: count
Processing column: srv_count
Processing column: serror_rate
Processing column: srv_serror_rate
Processing column: rerror_rate
Processing column: srv_rerror_rate
Processing column: same_srv_rate
Processing column: diff_srv_rate
Processing column: srv_diff_host_rate
Processing column: dst_host_count
Processing column: dst_host_srv_count
Processing column: dst_host_same_srv_rate
Processing column: dst_host_diff_srv_rate
Processing column: dst_host_same_src_port_rate
Processing column: dst_host_srv_diff_host_rate
Processing column: dst_host_serror_rate
Processing column: dst_host_srv_serror_rate
Pro

In [25]:
X_test_norm.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type__icmp,protocol_type__tcp,protocol_type__udp,service__IRC,service__X11,service__Z39_50,service__aol,service__auth,service__bgp,service__courier,service__csnet_ns,service__ctf,service__daytime,service__discard,service__domain,service__domain_u,service__echo,service__eco_i,service__ecr_i,service__efs,service__exec,service__finger,service__ftp,service__ftp_data,service__gopher,service__harvest,service__hostnames,service__http,service__http_2784,service__http_443,service__http_8001,service__imap4,service__iso_tsap,service__klogin,service__kshell,service__ldap,service__link,service__login,service__mtp,service__name,service__netbios_dgm,service__netbios_ns,service__netbios_ssn,service__netstat,service__nnsp,service__nntp,service__ntp_u,service__other,service__pm_dump,service__pop_2,service__pop_3,service__printer,service__private,service__red_i,service__remote_job,service__rje,service__shell,service__smtp,service__sql_net,service__ssh,service__sunrpc,service__supdup,service__systat,service__telnet,service__tftp_u,service__tim_i,service__time,service__urh_i,service__urp_i,service__uucp,service__uucp_path,service__vmnet,service__whois,flag__OTH,flag__REJ,flag__RSTO,flag__RSTOS0,flag__RSTR,flag__S0,flag__S1,flag__S2,flag__S3,flag__SF,flag__SH
0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,1.781597,0.112283,0.0,0.0,2.40322,2.402658,0.096973,0.231542,0.0,2.711797,0.08946,0.091811,0.271848,0.0,0.0,0.0,0.0,2.58251,2.494599,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,1.058066,0.011228,0.0,0.0,2.40322,2.402658,0.024243,0.231542,0.0,2.711797,0.008946,0.0,0.271848,0.0,0.0,0.0,0.0,2.58251,2.494599,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0.358104,2.982939,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.00778,0.011228,0.0,0.0,0.0,0.0,2.42432,0.0,0.0,1.425023,0.769358,1.400115,0.181232,1.991762,0.234214,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0.0,0.943481,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.00778,0.72984,0.0,0.0,0.0,0.0,2.42432,0.0,3.944156,0.031903,0.509923,2.295271,0.0,3.265184,3.278996,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.788089,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.00778,0.089827,0.0,0.402246,2.40322,1.201329,2.42432,0.0,2.958117,0.3084,0.769358,0.711534,0.770235,0.097956,0.234214,0.0,0.0,2.143483,1.771165,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [26]:
print("XTrain Set shape:",X_train_norm.shape)
print("XTest Set shape:", X_test_norm.shape)

XTrain Set shape: (125973, 122)
XTest Set shape: (22544, 122)


<h2> Train GradientBoostingClassifier</h2>

In [30]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, 
                                        max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train_norm, y_train.values.ravel())

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_norm, y_train)))
    print("Accuracy score (test): {0:.3f}".format(gb_clf.score(X_test_norm, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.931
Accuracy score (test): 0.745
Learning rate:  0.075
Accuracy score (training): 0.917
Accuracy score (test): 0.754
Learning rate:  0.1
Accuracy score (training): 0.923
Accuracy score (test): 0.753
Learning rate:  0.25
Accuracy score (training): 0.928
Accuracy score (test): 0.762
Learning rate:  0.5
Accuracy score (training): 0.968
Accuracy score (test): 0.823
Learning rate:  0.75
Accuracy score (training): 0.971
Accuracy score (test): 0.821
Learning rate:  1
Accuracy score (training): 0.962
Accuracy score (test): 0.761


In [31]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, 
                                     max_features=2, max_depth=2, random_state=0)\

gb_clf2.fit(X_train_norm, y_train.values.ravel())
predictions = gb_clf2.predict(X_test_norm)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[9336  375]
 [3615 9218]]
Classification Report
             precision    recall  f1-score   support

          0       0.72      0.96      0.82      9711
          1       0.96      0.72      0.82     12833

avg / total       0.86      0.82      0.82     22544



In [None]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.5],
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.8, 1.0],
    "n_estimators":[10,20]
    }

grid = GridSearchCV(GradientBoostingClassifier(), parameters, cv=5, n_jobs=-1)

# fit the grid with data
grid.fit(X_train_norm, y_train)

In [None]:
# examine the best model
# Single best score achieved across all params
print(grid.best_score_)

# Dictionary containing the parameters used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

In [None]:
gb_grid = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=20,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

gb_grid.fit(X_train_norm[rfe_features], y_train)

print("Accuracy on training set: {:.3f}".format(gb_grid.score(X_train_norm, y_train)))
print("Accuracy on test set: {:.3f}".format(gb_grid.score(X_test_norm, y_test)))

In [None]:
#predict the data
y_pred_grid= gb_grid.predict(X_test_norm)

In [None]:
#print report
print(confusion_matrix(y_test, y_pred_grid))  
print(classification_report(y_test, y_pred_grid))

<h2> Train GradientBoostingClassifier with rfe </h2>

In [27]:
estimator=GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, 
                                     max_features=2, max_depth=2, random_state=0)

gb_rfe = RFE(estimator, n_features_to_select=15, step=1)
gb_rfe.fit(X_train_norm, y_train.values.ravel())

RFE(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=2,
              max_features=2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=20,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
  n_features_to_select=15, step=1, verbose=0)

In [28]:
print("Accuracy on normalized training set: {:.3f}".format(gb_rfe.score(X_train_norm, y_train)))
print("Accuracy on normalized test set: {:.3f}".format(gb_rfe.score(X_test_norm, y_test)))

Accuracy on normalized training set: 0.966
Accuracy on normalized test set: 0.726


In [29]:
#predict the data
y_pred_rfe = gb_rfe.predict(X_test_norm)

In [30]:
#print report
print(confusion_matrix(y_test, y_pred_rfe))  
print(classification_report(y_test, y_pred_rfe))

[[8964  747]
 [5428 7405]]
             precision    recall  f1-score   support

          0       0.62      0.92      0.74      9711
          1       0.91      0.58      0.71     12833

avg / total       0.79      0.73      0.72     22544

