In [3]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random
import warnings
# Suppress warnings
warnings.filterwarnings('ignore')

## Loading the Data 

In [13]:
# Loading the Data
train_df = pd.read_csv("NSL_train.csv")
test_df  = pd.read_csv("NSL_test.csv")

train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


## Data Analysis 

In [14]:
len(train_df.columns), train_df.columns

(42,
 Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'label'],
       dtype='object'))

In [15]:
print('Label distribution Training set:')
print(train_df['label'].nunique())
print(train_df['label'].value_counts())

Label distribution Training set:
23
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64


In [50]:
print('Label distribution Test set:')
print(test_df['label'].value_counts())

Label distribution Test set:
label
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
loadmodule            2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: count, dtype: int64


In [16]:

print('Training set:')
categorical_columns = train_df.select_dtypes(['object']).columns.tolist()
for col_name in categorical_columns:
    unique_cat = len(train_df[col_name].unique())
    print(f"Column '{col_name}' has {unique_cat} categories")

Training set:
Column 'protocol_type' has 3 categories
Column 'service' has 70 categories
Column 'flag' has 11 categories
Column 'label' has 23 categories


In [17]:
categorical_columns.remove('label')
categorical_columns

['protocol_type', 'service', 'flag']

In [18]:

from sklearn.preprocessing import LabelEncoder

combined_data = pd.concat([train_df, test_df], axis=0)
# List of categorical columns
print(categorical_columns) 


label_encoders = {} 

for column in categorical_columns:
    lb = LabelEncoder()
    lb.fit(combined_data[column])
    label_encoders[column] = lb
    combined_data[column] = lb.transform(combined_data[column])

train_data = combined_data[:len(train_df)]
test_data = combined_data[len(train_df):]

['protocol_type', 'service', 'flag']


#### Labels Processing : 


In [19]:
label_categories = { 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4}

In [20]:
label_classes = {"DoS" : [0,1], "Probe" : [0,2], "R2L" : [0,3], "U2R" : [0,4]}
train_data['label'] = train_data['label'].replace(label_categories)
test_data['label'] = test_data['label'].replace(label_categories)

In [21]:
categorical_dfs = {
    "DoS" : (train_data[train_data['label'].isin(label_classes['DoS'])], test_data[test_data['label'].isin(label_classes['DoS'])]),
    "Probe" : (train_data[train_data['label'].isin(label_classes['Probe'])], test_data[test_data['label'].isin(label_classes['Probe'])]),
    "R2L" : (train_data[train_data['label'].isin(label_classes['R2L'])], test_data[test_data['label'].isin(label_classes['R2L'])]),
    "U2R" : (train_data[train_data['label'].isin(label_classes['U2R'])], test_data[test_data['label'].isin(label_classes['U2R'])]),
}

### Generate Train, Test from Categorical labels  

In [22]:
def return_train_test_data(categorical_dfs, category : str) : 
    df = categorical_dfs[category][0]
    X_train = df.drop('label',axis=1)
    Y_train = df.label
    
    df_test = categorical_dfs[category][1]
    X_test = df_test.drop('label',axis=1)
    Y_test = df_test.label

    return X_train, Y_train, X_test, Y_test

### Transform Data using StandardScaler to fit the model 

In [23]:

from sklearn import preprocessing

def train_test_transformed(categorical_dfs, category):

    X_train, Y_train, X_test, Y_test = return_train_test_data(categorical_dfs, category)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    scaler_test = preprocessing.StandardScaler().fit(X_test)
    X_test = scaler_test.transform(X_test)

    return X_train, Y_train.astype(int), X_test, Y_test.astype(int)

In [24]:

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# We are using three classifiers here and we will select the best features from the three classifiers.
classifiers = [SVC(kernel='linear', C=1.0, random_state=0), RandomForestClassifier(n_estimators=10,n_jobs=2), KNeighborsClassifier()]
rfe = RFE(estimator=classifiers[1], n_features_to_select=13, step=1)

In [27]:
X_train, Y_train, X_test, Y_test = train_test_transformed(categorical_dfs, "DoS")
#Fit the feature selector on the training data
rfe.fit(X_train, Y_train.astype(int))
# Transform the training data to get the selected features
X_rfe = rfe.transform(X_train)

#Get the list of columns from the categorical_dfs : 
column_Names =list(categorical_dfs["DoS"][0])

#Get the list of selected columns
column_indexes =[i for i, x in enumerate(rfe.support_) if x]
selected_column_names = [column_Names[i] for i in column_indexes]

In [28]:
X_train.shape , X_rfe.shape

((113270, 41), (113270, 13))

Fitting the data on feature selector reveals only 13 columns are used to make the prediction from the 41 present

In [35]:
selected_column_names

['protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'num_compromised',
 'count',
 'srv_count',
 'same_srv_rate',
 'diff_srv_rate',
 'dst_host_same_srv_rate',
 'dst_host_srv_serror_rate']

In [34]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

def run_training(categorical_dfs, category : str , classifier : int ) : 
    
    X_train, Y_train, X_test, Y_test = train_test_transformed(categorical_dfs, category)

    rfe.fit(X_train, Y_train.astype(int))
    # Transform the training, test data to get the selected features
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    model = classifiers[classifier]
    model.fit(X_train_rfe, Y_train)

    Y_pred = model.predict(X_test_rfe)
    
    print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
    print("Precision:",metrics.precision_score(Y_test, Y_pred, average='weighted'))
    print("Recall:",metrics.recall_score(Y_test, Y_pred, average='weighted'))
    print("F1:",metrics.f1_score(Y_test, Y_pred, average='weighted'))

    return pd.crosstab(Y_test, Y_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

In [33]:
confusion_matrix = run_training(categorical_dfs, "DoS", 0)
print(confusion_matrix)

Accuracy: 0.8716440510162483
Precision: 0.8758730627421681
Recall: 0.8716440510162483
F1: 0.8699758230838476
Predicted attacks     0     1
Actual attacks               
0                  9159   552
1                  1652  5808


Now that we have modularized the codes, we can run this experiment for across the list of models vs category of attacks 
Model Catalog : 

0 = SVC 

1 = RandomForestClassifier

2 = KNN

Attack Categories : 

[ "DoS" , "Probe" , "R2L" , "U2R" ] 

In [36]:
confusion_matrix = run_training(categorical_dfs, "R2L", 0)
print(confusion_matrix)

Accuracy: 0.7702445220704985
Precision: 0.6641921603505512
Recall: 0.7702445220704985
F1: 0.6719380564354002
Predicted attacks     0   3
Actual attacks             
0                  9695  16
3                  2878   7


In [37]:
confusion_matrix= run_training(categorical_dfs, "Probe", 0)
print(confusion_matrix)

Accuracy: 0.878503132212331
Precision: 0.8775712514915698
Recall: 0.878503132212331
F1: 0.8631641784393244
Predicted attacks     0     2
Actual attacks               
0                  9544   167
2                  1307  1114


In [38]:
confusion_matrix= run_training(categorical_dfs, "U2R", 0)
print(confusion_matrix)

Accuracy: 0.9941705870321129
Precision: 0.9932214782492162
Recall: 0.9941705870321129
F1: 0.9923872564552023
Predicted attacks     0   4
Actual attacks             
0                  9708   3
4                    54  13


In [39]:
confusion_matrix = run_training(categorical_dfs, "DoS", 1)
print(confusion_matrix)

Accuracy: 0.8634325315939666
Precision: 0.8869095419811458
Recall: 0.8634325315939666
F1: 0.8584623391126935
Predicted attacks     0     1
Actual attacks               
0                  9654    57
1                  2288  5172


In [40]:
confusion_matrix= run_training(categorical_dfs, "Probe", 1)
print(confusion_matrix)

Accuracy: 0.8778437190900099
Precision: 0.8717842472879904
Recall: 0.8778437190900099
F1: 0.8668931685374486
Predicted attacks     0     2
Actual attacks               
0                  9411   300
2                  1182  1239


In [41]:
confusion_matrix = run_training(categorical_dfs, "R2L", 1)
print(confusion_matrix)

Accuracy: 0.7709590346141633
Precision: 0.7089314900062045
Recall: 0.7709590346141633
F1: 0.6713993992034822
Predicted attacks     0  3
Actual attacks            
0                  9710  1
3                  2884  1


In [42]:
confusion_matrix= run_training(categorical_dfs, "U2R", 1)
print(confusion_matrix)

Accuracy: 0.9941705870321129
Precision: 0.9942046038768273
Recall: 0.9941705870321129
F1: 0.9920214762638171
Predicted attacks     0   4
Actual attacks             
0                  9711   0
4                    57  10


In [43]:
confusion_matrix = run_training(categorical_dfs, "DoS", 2)
print(confusion_matrix)

Accuracy: 0.9184089453147749
Precision: 0.9268460924267232
Recall: 0.9184089453147749
F1: 0.91709588218927
Predicted attacks     0     1
Actual attacks               
0                  9653    58
1                  1343  6117


In [44]:
confusion_matrix= run_training(categorical_dfs, "Probe", 2)
print(confusion_matrix)

Accuracy: 0.9135344543356413
Precision: 0.912193267053767
Recall: 0.9135344543356413
F1: 0.9079833689634926
Predicted attacks     0     2
Actual attacks               
0                  9524   187
2                   862  1559


In [45]:
confusion_matrix = run_training(categorical_dfs, "R2L", 2)
print(confusion_matrix)

Accuracy: 0.7705620832010162
Precision: 0.6568729894146405
Recall: 0.7705620832010162
F1: 0.6715021169635647
Predicted attacks     0  3
Actual attacks            
0                  9703  8
3                  2882  3


In [46]:
confusion_matrix= run_training(categorical_dfs, "U2R", 2)
print(confusion_matrix)

Accuracy: 0.9945796686438945
Precision: 0.993766025422206
Recall: 0.9945796686438945
F1: 0.9932156249855
Predicted attacks     0   4
Actual attacks             
0                  9707   4
4                    49  18
