In [None]:
'''
Documentation: DA4 - PART1
CSE4020-ML-LAB-L29+L30, 20BCE0417 - CHIRAG DILIP GOENKA

Topic: Multi-Layer Perceptron (MLP) with NSL-KDD dataset.
Dataset: https://www.unb.ca/cic/datasets/nsl.html (NSL KDD – Intrusion Detection Dataset)
         https://www.kaggle.com/hassan06/nslkdd
Reference: https://scikit-learn.org/stable/modules/neural_networks_supervised.html

METHODOLOGY
Part 1: 
Read and parse the initial dataset
Load it into our pandas dataframe
Display the dataset with feature information
Part 2: 
Split our dataset into its attributes and labels
Perform feature scaling
Create Multilayer perceptron (MLP) model
Part 3:
Training and Predictions
Train and evaluate a Multilayer perceptron (MLP) model
Print out the Confusion matrix, Precision, Recall and Accuracy
'''

In [1]:
#importing all necessary libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
#adding the column names
col_names=['duration','protocol_type','service','flag','src_bytes','dst_bytes',
           'land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
           'num_compromised','root_shell','su_attempted','num_root','num_file_creations',
           'num_shells','num_access_files','num_outbound_cmds','is_host_login',
           'is_guest_login','count','srv_count','serror_rate','srv_serror_rate',
           'rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate',
           'srv_diff_host_rate','dst_host_count','dst_host_srv_count',
           'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
           'dst_host_rerror_rate','dst_host_srv_rerror_rate','attack','level']

In [3]:
# Read training dataset to pandas dataframe
read_file = pd.read_csv('KDDTrain+.txt') #reading the txt file
read_file.to_csv('train.csv', index=None) #renaming it to train.csv
# Display the dataset with feature information
df=pd.read_csv('train.csv',names=col_names,low_memory=False)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  float64
 6   land                         125973 non-null  float64
 7   wrong_fragment               125973 non-null  float64
 8   urgent                       125973 non-null  float64
 9   hot                          125973 non-null  float64
 10  num_failed_logins            125973 non-null  float64
 11  logged_in                    125973 non-null  float64
 12  num_compromised              125973 non-null  float64
 13 

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,ftp_data,SF,491,0.1,0.2,0.3,0.4,0.5,...,0.17.1,0.03,0.17.2,0.00.6,0.00.7,0.00.8,0.05,0.00.9,normal,20
1,0,udp,other,SF,146,0.0,0.0,0.0,0.0,0.0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
# Assign data from 4-21 columns to X variable
X = df.iloc[:,4:22]
# Assign data from the second last column to y variable ('attack column')
y = df.iloc[:,-2:-1]
# unique values we have in y 
y.attack.unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back',
       'guess_passwd', 'ftp_write', 'multihop', 'rootkit',
       'buffer_overflow', 'imap', 'warezmaster', 'phf', 'land',
       'loadmodule', 'spy', 'perl'], dtype=object)

In [5]:
# Let's convert these categorical values to numerical values. To do so we will use Scikit-Learn's LabelEncoder class
le = preprocessing.LabelEncoder()
y = y.apply(le.fit_transform)

In [6]:
# create training and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# perform feature scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Training and Predictions
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000) 
mlp.fit(X_train, y_train.values.ravel())
predictions = mlp.predict(X_test)

In [9]:
# Evaluating the Algorithm
predictions_train = mlp.predict(X_train)
print('Training predictions accuracy:',accuracy_score(predictions_train,y_train))
predictions_test = predictions
print('\nTesting predictions accuracy:',accuracy_score(predictions_test, y_test))
print('\nConfusion matrix:\n',confusion_matrix(y_test,predictions)) 
print('\nClassification report:\n',classification_report(y_test,predictions,zero_division=0))
print('\nAccuracy Score on Confusion Matrix:',accuracy_score(y_test,predictions)) 
print('Error Score on Confusion Matrix:',1-accuracy_score(y_test,predictions))

Training predictions accuracy: 0.7278374248347854

Testing predictions accuracy: 0.7247469736058741

Confusion matrix:
 [[ 159    0    0    0    0    0    0    0    0    0    0   20    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0   10    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    3    0    0
     0    0    0    0    0    0    0]
 [   0    0    0   13    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    1    0    1    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0  702    0    4    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    4    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    1    0    0
     0    0    0    0    0    0    0]


In [10]:
# the above cell gives a red box due to division of 0 by 0. we have used zero_division=0 to remove the warning