## Import Libraries

In [1]:
# Import packages
import os 
import warnings
import gc

import pickle
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px 
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report
import seaborn as sns

# Configuration
# Silence library warnings so the cell outputs stay readable.
warnings.simplefilter('ignore')
# Show up to 50 columns when a DataFrame is displayed. The fully qualified
# option name is used because the bare 'max_columns' pattern is ambiguous on
# newer pandas releases (it also matches 'styler.render.max_columns') and
# makes set_option raise OptionError.
pd.set_option('display.max_columns', 50)

## Downloading the dataset from Kaggle

In [2]:
# Install the Kaggle API token and fetch the dataset into /content.
# The cell is written to be safely re-runnable:
#   * `mkdir -p` does not fail when ~/.kaggle already exists,
#   * `unzip -o` overwrites previously extracted files instead of prompting,
#   * every path is absolute, so the result does not depend on the working directory.
!chmod 600 /content/kaggle.json
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d sampadab17/network-intrusion-detection -p /content
!unzip -o /content/network-intrusion-detection.zip -d /content

Downloading network-intrusion-detection.zip to /content
  0% 0.00/818k [00:00<?, ?B/s]
100% 818k/818k [00:00<00:00, 13.1MB/s]
Archive:  /content/network-intrusion-detection.zip
  inflating: Test_data.csv           
  inflating: Train_data.csv          


## Read the dataset

In [55]:
# Load the two CSV files that were extracted into /content.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Train_data.csv', '/content/Test_data.csv')
)

In [56]:
# Preview the first rows of the training data (it carries the `class` label column).
train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [57]:
# Preview the first rows of the test data (this file has no `class` column).
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229,10,0.0,0.0,1.0,1.0,0.04,0.06,0.0,255,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0
1,0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,1,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0
3,0,icmp,eco_i,SF,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0.0,0.12,1.0,0.5,1.0,0.0,0.75,29,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71


In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Preprocessing objects used by the following cells: `encoder` is applied to
# the categorical (object) columns, `scaler` to the numeric columns.
encoder = LabelEncoder()
scaler = StandardScaler()

In [59]:
# Numeric feature columns; the object-typed columns are encoded separately.
cols_names = train.select_dtypes(include=['float64', 'int64']).columns
# Learn the scaling statistics from the training data only ...
scale_train = scaler.fit_transform(train[cols_names])
# ... and reuse them unchanged for the test data. Calling fit_transform here
# would re-fit the scaler on the test set and put the train and test features
# on different scales than the ones the model is trained with.
scale_test = scaler.transform(test[cols_names])

**Creating training and testing DataFrames with the scaled values**

In [60]:
# Wrap the scaled arrays in DataFrames again, restoring the numeric column labels.
sc_traindf, sc_testdf = (
    pd.DataFrame(scaled_values, columns=cols_names)
    for scaled_values in (scale_train, scale_test)
)

## Encoding categorical attributes in the datasets

In [61]:
# Independent copies of the object-dtype (categorical) columns of each split.
cat_train = train.select_dtypes(include='object').copy()
cat_test = test.select_dtypes(include='object').copy()

In [62]:
# Fit one LabelEncoder per categorical column on the TRAINING data and reuse
# it for the test data, so a given category maps to the same integer code in
# both splits. Re-fitting a single shared encoder on every train and test
# column lets the codes drift apart whenever the two splits contain different
# sets of categories.
encoders = {col: LabelEncoder().fit(cat_train[col]) for col in cat_train.columns}

train_cat = pd.DataFrame(
    {col: encoders[col].transform(cat_train[col]) for col in cat_train.columns},
    index=cat_train.index,
)

# Encode the test columns against the categories learned from training.
# `classes_` is sorted, so the categorical codes equal the LabelEncoder codes;
# categories that never occurred in training get the code -1 instead of
# raising an error.
test_cat = pd.DataFrame(
    {
        col: pd.Categorical(
            cat_test[col], categories=encoders[col].classes_
        ).codes.astype('int64')
        for col in cat_test.columns
    },
    index=cat_test.index,
)

# Keep `encoder` bound to a fitted encoder (the one for the last categorical
# feature column) and display its categories.
encoder = encoders[cat_test.columns[-1]]
encoder.classes_

array(['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3',
       'SF', 'SH'], dtype=object)

In [63]:
# Split the encoded training frame into the encoded `class` target (kept as a
# one-column DataFrame) and the remaining encoded feature columns.
cat_Ytrain = train_cat.loc[:, ['class']].copy()
enctrain = train_cat.drop(columns=['class'])

In [64]:
# Model inputs: scaled numeric columns side by side with the encoded categoricals.
train_x = pd.concat([sc_traindf, enctrain], axis='columns')
# Binary target: 1 where the label is 'anomaly', 0 for every other value.
train_y = (train['class'] == 'anomaly').astype('int64')
train_x.shape

(25192, 41)

In [65]:
# Check the binary target (1 = 'anomaly', 0 = otherwise).
train_y.head()

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

In [67]:
# Check the assembled feature matrix: scaled numeric columns followed by the encoded categorical columns.
train_x.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
0,-0.113551,-0.009889,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.720244,-0.354628,-0.640142,-0.633978,-0.372186,-0.373098,0.772109,-0.349282,-0.373886,-0.328634,-0.813985,-0.779157,-0.280673,0.07312,-0.287993,-0.641804,-0.627365,-0.221668,-0.374281,1,19,9
1,-0.113551,-0.010032,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.624317,-0.368427,-0.640142,-0.633978,-0.372186,-0.373098,-1.320567,0.490836,-0.373886,0.732059,-1.030895,-1.157831,2.764403,2.37562,-0.287993,-0.641804,-0.627365,-0.38514,-0.374281,2,41,9
2,-0.113551,-0.010093,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,0.334947,-0.29943,1.595477,1.600209,-0.372186,-0.373098,-1.388806,0.042773,-0.373886,0.732059,-0.804947,-0.935081,-0.173828,-0.478183,-0.287993,1.603834,1.614454,-0.38514,-0.374281,1,46,5
3,-0.113551,-0.009996,0.052473,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,1.238197,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.694082,-0.31323,-0.193018,-0.187141,-0.372186,-0.373098,0.772109,-0.349282,-0.373886,-1.540854,1.264742,1.069663,-0.44094,-0.380894,0.073759,-0.574435,-0.604947,-0.38514,-0.342768,1,22,9
4,-0.113551,-0.01001,-0.034582,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,1.238197,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.476067,0.059355,-0.640142,-0.633978,-0.372186,-0.373098,0.772109,-0.349282,-0.023115,0.732059,1.264742,1.069663,-0.44094,-0.478183,-0.287993,-0.641804,-0.627365,-0.38514,-0.374281,1,22,9


## Splitting the Dataset

In [68]:
from sklearn.model_selection import train_test_split

# Keep 70% of the labelled rows for fitting and hold out the rest for
# evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_x,
    train_y,
    train_size=0.70,
    random_state=2,
)

In [69]:
# Inspect the training features after the split (the original row index is kept).
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
1312,-0.113551,-0.010093,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,1.70408,-0.133837,1.595477,1.600209,-0.372186,-0.373098,-1.36606,-0.125251,-0.373886,-0.429652,-0.87725,-0.868257,-0.066984,-0.445754,-0.287993,1.581378,1.614454,-0.38514,-0.374281,1,46,5
536,-0.113551,-0.010093,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.092361,-0.106238,1.595477,1.600209,-0.372186,-0.373098,-0.888384,-0.069243,-0.373886,0.732059,-0.515732,-0.645507,-0.227251,-0.478183,-0.287993,1.603834,1.614454,-0.38514,-0.374281,1,17,5
12385,-0.113551,-0.010093,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,1.250609,-0.120038,-0.640142,-0.633978,2.765176,2.729322,-1.320567,-0.013235,-0.373886,0.732059,-0.868212,-1.001906,-0.066984,-0.478183,-0.287993,-0.641804,-0.627365,2.884296,2.777041,1,6,1
24516,-0.113551,-0.010092,-0.03931,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,-0.807626,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.266773,-0.340829,-0.640142,-0.633978,-0.372186,-0.373098,-1.36606,0.266804,-0.373886,0.732059,-1.012819,-1.135556,0.039861,0.527134,-0.287993,-0.641804,-0.627365,1.609216,-0.374281,2,41,9
19896,-0.113551,-0.009973,-0.032623,-0.00891,-0.091223,-0.006301,-0.091933,-0.02622,1.238197,-0.021873,-0.039377,-0.027665,-0.021724,-0.027808,-0.018905,-0.043917,0.0,0.0,-0.09599,-0.720244,-0.31323,-0.640142,-0.633978,-0.372186,-0.373098,0.772109,-0.349282,1.185096,-1.803501,1.264742,1.069663,-0.44094,0.332556,0.164197,-0.641804,-0.627365,-0.38514,-0.374281,1,22,9


In [70]:
# Inspect the matching training labels; their index lines up with X_train.
Y_train.head()

1312     1
536      1
12385    1
24516    1
19896    0
Name: class, dtype: int64

## Defining the NN model using keras

In [71]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [80]:
# Build the ANN in a single expression: three hidden layers of 60 ReLU units
# each, followed by one sigmoid unit for the binary output. No input shape is
# declared here.
hidden_layers = [Dense(units=60, activation='relu') for _ in range(3)]
output_layer = Dense(units=1, activation='sigmoid')
classifier = Sequential(hidden_layers + [output_layer])

# Compile with the Adam optimizer and binary cross-entropy loss, tracking accuracy.
classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Start training 

In [81]:
# Train on the training split for 20 epochs with mini-batches of 10 samples.
classifier.fit(X_train, Y_train, batch_size=10, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd5943216d0>

In [82]:
# Predict on the held-out rows and threshold the sigmoid outputs at 0.5 to get boolean labels.
y_pred = classifier.predict(X_test) > 0.5

In [91]:
from sklearn.metrics import accuracy_score
# Y_test and y_pred both come from the held-out split, so this is the test
# accuracy; the message previously mislabelled it as training accuracy.
accuracy = accuracy_score(Y_test, y_pred)
print('Test accuracy is {}'.format(accuracy))

Training accuracy is 0.9923260121725325


## Making the Confusion Matrix

In [95]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the held-out split. Per the scikit-learn convention,
# rows are the true labels and columns the predicted labels (0 = normal, 1 = anomaly).
confusion_matrix(Y_test, y_pred)

array([[4032,   28],
       [  30, 3468]])