## Import Libraries

In [5]:
# Import packages
import os 
import warnings
import gc

import pickle
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px 
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report
import seaborn as sns

# Configuration
# Silence library warnings so they do not clutter the cell outputs.
warnings.simplefilter('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous in recent pandas releases (it also matches
# 'styler.render.max_columns') and raises an OptionError there.
pd.set_option('display.max_columns', 50)

## Downloading the dataset from the UCI KDD archive

In [1]:
# Fetch the gzip-compressed "10 percent" KDD Cup 1999 data file from the UCI KDD archive.
!wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz

--2021-10-30 18:48:02--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2144903 (2.0M) [application/x-gzip]
Saving to: ‘kddcup.data_10_percent.gz’


2021-10-30 18:48:04 (2.18 MB/s) - ‘kddcup.data_10_percent.gz’ saved [2144903/2144903]



In [2]:
# Decompress the downloaded archive; gzip -d replaces the .gz file with the uncompressed file.
!gzip -d /content/kddcup.data_10_percent.gz

In [3]:
# Column names for the headerless KDD Cup 99 file, in file order.
# The last entry is the label column.
features = [
    # basic connection attributes
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    # content-related attributes
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    # connection-count and rate attributes
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    # destination-host attributes
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # target label
    'intrusion_type',
]

In [6]:
# Load the headerless CSV and label its columns with the names in `features`.
data = pd.read_csv(
    '/content/kddcup.data_10_percent',
    header=None,
    names=features,
)
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,intrusion_type
0,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


## Scaling the numerical attributes

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One scaler for the numeric columns and one label encoder for the string columns.
scaler = StandardScaler()
encoder = LabelEncoder()

In [8]:
# Standardise every numeric column and remember the column labels so the
# scaled array can be turned back into a dataframe afterwards.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split further down, so test-set statistics influence the scaling.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
cols_names = numeric_data.columns
scale_data = scaler.fit_transform(numeric_data)

**Creating a dataframe from the scaled values**

In [9]:
# Wrap the scaled array in a dataframe again, restoring the numeric column names.
sc_df = pd.DataFrame(data=scale_data, columns=cols_names)

In [10]:
# Preview the first rows of the scaled numeric features.
sc_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,-0.067792,-0.002879,0.138664,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.451536,-1.694315,0.599396,-0.282867,-1.022077,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
1,-0.067792,-0.00282,-0.011578,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.297085,-1.600011,0.599396,-0.282867,-1.146737,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
2,-0.067792,-0.002824,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.142633,-1.505707,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
3,-0.067792,-0.00284,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.5308,-1.16476,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-2.988182,-1.411403,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
4,-0.067792,-0.002842,0.035214,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.5308,-1.16476,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-2.833731,-1.3171,0.599396,-0.282867,-1.209067,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464


In [64]:
# Persist the fitted scaler so the same scaling can be re-applied at prediction time.
# The context manager guarantees the file handle is flushed and closed.
with open('/content/theScaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Encoding categorical attributes in the datasets

In [11]:
# Keep an independent copy of the object-dtype (string) columns for label encoding.
cat_data = data.select_dtypes(include='object').copy()

In [12]:
# Fit a dedicated LabelEncoder per categorical column instead of re-fitting one
# shared encoder column after column: with a shared encoder, the classes it
# ends up holding depend on which column happened to be processed last.
encoders = {col: LabelEncoder().fit(cat_data[col]) for col in cat_data.columns}
data_cat = cat_data.apply(lambda column: encoders[column.name].transform(column))

# `encoder` is used later to map predicted class indices back to labels, so
# bind it explicitly to the encoder of the target column.
encoder = encoders['intrusion_type']
encoder.classes_

array(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.',
       'imap.', 'ipsweep.', 'land.', 'loadmodule.', 'multihop.',
       'neptune.', 'nmap.', 'normal.', 'perl.', 'phf.', 'pod.',
       'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.',
       'warezclient.', 'warezmaster.'], dtype=object)

In [13]:
# Number of distinct labels currently held by the encoder.
len(encoder.classes_)

23

In [14]:
# Split the encoded frame into the label column and the remaining categorical features.
cat_Y = data_cat[['intrusion_type']].copy()
encdata = data_cat.drop(columns=['intrusion_type'])

In [17]:
# Place the scaled numeric features and the encoded categorical features side by side.
final_df = pd.concat([sc_df, encdata], axis='columns')
final_df.shape

(494021, 41)

In [18]:
# Preview the combined feature matrix (scaled numeric columns followed by encoded categorical ones).
final_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
0,-0.067792,-0.002879,0.138664,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.451536,-1.694315,0.599396,-0.282867,-1.022077,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9
1,-0.067792,-0.00282,-0.011578,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.297085,-1.600011,0.599396,-0.282867,-1.146737,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9
2,-0.067792,-0.002824,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.521417,-1.15664,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-3.142633,-1.505707,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9
3,-0.067792,-0.00284,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.5308,-1.16476,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-2.988182,-1.411403,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9
4,-0.067792,-0.002842,0.035214,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.5308,-1.16476,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,-2.833731,-1.3171,0.599396,-0.282867,-1.209067,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9


## Splitting the Dataset

In [72]:
from sklearn.model_selection import train_test_split

# Use 70% of the rows for training and the rest for evaluation; the fixed
# seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    final_df,
    cat_Y,
    train_size=0.70,
    random_state=2,
)

In [73]:
# Preview the training features; the index shows the shuffled original row numbers.
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
429638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,0.838455,0.885398,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,0.347967,0.625558,0.599396,-0.282867,0.827048,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0,14,9
477539,-0.067792,-0.003062,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-0.292219,-1.15258,-0.46409,-0.46352,4.069396,4.058989,-1.961795,0.474636,-0.203633,0.347967,-1.694315,-1.737618,0.357813,-1.250621,-0.158629,-0.464418,-0.463202,4.084676,4.095715,1,45,1
151900,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,0.838455,0.885398,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,0.347967,0.625558,0.599396,-0.282867,0.827048,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0,14,9
247082,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,0.838455,0.885398,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,0.347967,0.625558,0.599396,-0.282867,0.827048,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,0,14,9
82340,-0.067792,-0.002773,0.013937,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,-0.010552,-0.004676,-0.00564,-0.011232,-0.009919,-0.027632,0.0,0.0,-0.037263,-1.404127,-1.055147,-0.46409,-0.46352,-0.24796,-0.248631,0.536987,-0.255243,-0.203633,0.317077,0.625558,0.599396,-0.282867,-1.250621,0.078715,-0.464418,-0.463202,-0.25204,-0.249464,1,22,9


In [74]:
# Preview the first two encoded training labels.
Y_train.head(2)

Unnamed: 0,intrusion_type
429638,18
477539,9


## Defining the NN model using keras

In [75]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [123]:
# Feed-forward network with three hidden ReLU layers of 60 units each.
# No input shape is declared here; the model is built when it first sees data.
multi_classifier = Sequential()
multi_classifier.add(Dense(units=60, activation='relu'))
multi_classifier.add(Dense(units=60, activation='relu'))
multi_classifier.add(Dense(units=60, activation='relu'))

# Output layer: one unit per intrusion class (23 encoded labels).
# softmax turns the 23 outputs into a probability distribution over mutually
# exclusive classes, which is what sparse_categorical_crossentropy expects by
# default (from_logits=False). sigmoid would score each class independently.
multi_classifier.add(Dense(units=23, activation='softmax'))

# Targets are integer class ids, hence the sparse variant of categorical cross-entropy.
multi_classifier.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

## Start training 

In [124]:
# Train for three epochs over the training split in mini-batches of ten rows.
multi_classifier.fit(
    x=X_train,
    y=Y_train,
    batch_size=10,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f420f0c3150>

In [125]:
# Score the held-out rows, take the highest-scoring class per row, and
# translate the class indices back into intrusion-type labels.
pred = multi_classifier.predict(X_test)
pred_class = pred.argmax(axis=-1)
prediction_ = encoder.inverse_transform(pred_class)

In [126]:
from sklearn.metrics import accuracy_score

# The score is computed on the held-out test split (Y_test), so report it as
# test accuracy; the message previously mislabelled it as training accuracy.
accuracy = accuracy_score(Y_test, pred_class)
print('Test accuracy is {}'.format(round(accuracy*100, 2)))

Training accuracy is 99.78


## Let's do a test here, shall we

In [127]:
def Pre_To_Predict(To_predict, scaler_path='/content/theScaler.pkl'):
  """Convert raw connection records into the feature layout used for training.

  Numeric columns are scaled with the pickled scaler and the three categorical
  columns are label-encoded; the result has the scaled numeric columns first,
  followed by 'protocol_type', 'service' and 'flag'.

  Args:
    To_predict: DataFrame of one or more raw records without the label
      column. Must contain 'protocol_type', 'service' and 'flag' plus the
      numeric columns the scaler was fitted on.
    scaler_path: Location of the pickled, already fitted scaler.

  Returns:
    DataFrame with a fresh 0..n-1 index, one row per input record.

  Raises:
    ValueError: from LabelEncoder.transform when a categorical value is not
      part of the vocabularies listed below.
  """
  # NOTE: pickle.load can execute arbitrary code; only load scaler files
  # that were produced by this notebook.
  with open(scaler_path, 'rb') as scaler_file:
    scaler1 = pickle.load(scaler_file)

  # Scale the numeric columns and rebuild a dataframe with their names.
  numeric_part = To_predict.select_dtypes(include=['float64', 'int64'])
  To_predict1 = pd.DataFrame(scaler1.transform(numeric_part), columns=numeric_part.columns)

  # LabelEncoder sorts its classes, so fitting on the full vocabulary yields a
  # fixed value -> index mapping.
  # NOTE(review): this only matches the training encoding if the training data
  # contained exactly these values - confirm when the dataset changes.
  encoder_service = LabelEncoder()
  encoder_service.fit(['IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns',
        'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i',
        'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher',
        'hostnames', 'http', 'http_443', 'http_8001', 'imap5', 'iso_tsap',
        'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name',
        'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp',
        'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer',
        'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp',
        'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tim_i',
        'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois'])
  encoder_protocol_type = LabelEncoder()
  encoder_protocol_type.fit(['icmp', 'tcp', 'udp'])
  encoder_flag = LabelEncoder()
  encoder_flag.fit(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
       'SF', 'SH'])

  # Encode every row (not just the first) and name the columns explicitly, in
  # the order the training frame uses, independent of the input column order.
  To_predict2 = pd.DataFrame({
      'protocol_type': encoder_protocol_type.transform(To_predict['protocol_type'].values),
      'service': encoder_service.transform(To_predict['service'].values),
      'flag': encoder_flag.transform(To_predict['flag'].values),
  })

  # Both parts carry a fresh 0..n-1 index, so they align row by row.
  Final_to_predict = pd.concat([To_predict1, To_predict2], axis=1)

  return Final_to_predict

In [128]:
# Draw one random record to use as a smoke test for the prediction pipeline.
small_test_df = data.sample(1)
# Format the scalar label rather than the whole Series, whose repr would also
# print the row index, the Series name and the dtype.
print('And its Intrusion Type is {}'.format(small_test_df['intrusion_type'].iloc[0]))

And it's Itrusion Type is 188116    smurf.
Name: intrusion_type, dtype: object


In [129]:
# Remove the label so the record looks like unseen input, then display it.
small_test_df = small_test_df.drop(columns=['intrusion_type'])
small_test_df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
188116,0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Preprocess the sampled record like the training data, then score it.
# The trained network is bound to `multi_classifier`; the name `classifier`
# is never defined in this notebook and fails on a fresh kernel.
train_d = Pre_To_Predict(To_predict=small_test_df)
OnePredict = multi_classifier.predict(train_d)

In [131]:
# Pick the highest-scoring class and decode it back to its intrusion-type label.
OnePredict_ = OnePredict.argmax(axis=-1)
OneAnswer = encoder.inverse_transform(OnePredict_)
print('Neural Network Predicted as {}'.format(OneAnswer[0]))

Neural Network Predicted as smurf.
