In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [3]:
df = pd.read_csv('Dataset/NSL_new.csv')

In [4]:
X = df.drop(['label'], axis=1)
y = df['label']

In [5]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [6]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal


In [7]:
X.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
y.head()

0     normal
1     normal
2    neptune
3     normal
4     normal
Name: label, dtype: object

In [9]:
y = y.replace( to_replace =  'normal' , value = 0 )

y = y.replace( to_replace =  'neptune' , value = 1 )
y = y.replace( to_replace =  'back' , value = 1 )
y = y.replace( to_replace =  'land' , value = 1 )
y = y.replace( to_replace =  'pod' , value = 1 )
y = y.replace( to_replace =  'smurf' , value = 1 )
y = y.replace( to_replace =  'teardrop' , value = 1 )
y = y.replace( to_replace =  'mailbomb' , value = 1 )
y = y.replace( to_replace =  'apache2' , value = 1 )
y = y.replace( to_replace =  'processtable' , value = 1 )
y = y.replace( to_replace =  'udpstorm' , value = 1 )
y = y.replace( to_replace =  'worm' , value = 1 )

y = y.replace( to_replace =  'ipsweep' , value = 2 )
y = y.replace( to_replace =  'nmap' , value = 2 )
y = y.replace( to_replace =  'portsweep' , value = 2 )
y = y.replace( to_replace =  'satan' , value = 2 )
y = y.replace( to_replace =  'mscan' , value = 2 )
y = y.replace( to_replace =  'saint' , value = 2 )

y = y.replace( to_replace =  'ftp_write' , value = 3 )
y = y.replace( to_replace =  'guess_passwd' , value = 3 )
y = y.replace( to_replace =  'imap' , value = 3)
y = y.replace( to_replace =  'multihop' , value = 3 )
y = y.replace( to_replace =  'phf' , value = 3 )
y = y.replace( to_replace =  'spy' , value = 3 )
y = y.replace( to_replace =  'warezclient' , value = 3 )
y = y.replace( to_replace =  'warezmaster' , value = 3 )
y = y.replace( to_replace =  'sendmail' , value = 3 )
y = y.replace( to_replace =  'snmpgetattack' , value = 3 )
y = y.replace( to_replace =  'snmpguess' , value = 3 )
y = y.replace( to_replace =  'xlock' , value = 3 )
y = y.replace( to_replace =  'xsnoop' , value = 3 )
y = y.replace( to_replace =  'httptunnel' , value = 3 )

y = y.replace( to_replace =  'buffer_overflow' , value = 4 )
y = y.replace( to_replace =  'loadmodule' , value = 4 )
y = y.replace( to_replace =  'perl' , value = 4 )
y = y.replace( to_replace =  'rootkit' , value = 4 )
y = y.replace( to_replace =  'ps' , value = 4 )
y = y.replace( to_replace =  'sqlattack' , value = 4  )
y = y.replace( to_replace =  'xterm' , value = 4 )

y = y.replace( to_replace =  'unknown' , value = 5 )

In [10]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

# Feature Selection

In [11]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif

In [12]:
# Univariate feature selection with F-test for feature scoring
# We use the default selection function to select the four
# most significant features
selector = SelectKBest(f_score=chi2, k=15)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
        edgecolor='black')

  f = msb / msw
  


NameError: name 'X_indices' is not defined

In [13]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

In [14]:
y.drop(y.columns[0], axis=1)

AttributeError: 'Series' object has no attribute 'columns'

In [15]:
# define feature selection
fs = SelectKBest(score_func=f_classif, k=15)

In [16]:
#Fit the function for ranking the features by score

fit = fs.fit(X, y)


  f = msb / msw


In [17]:
#Summarize scores numpy.set_printoptions(precision=3) print(fit.scores_)
#Apply the transformation on to dataset
fit = fs.fit(X, y)
features = fit.transform(X)

  f = msb / msw


In [18]:
print("before transform:",X)
selector=SelectKBest(score_func=f_classif,k=3)
selector.fit(X,y)
print("scores_:",selector.scores_)
print("pvalues_:",selector.pvalues_)
print("selected index:",selector.get_support(True))
print("after transform:",selector.transform(X)) 

before transform:         duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0              0        491          0     0               0       0    0   
1              0        146          0     0               0       0    0   
2              0          0          0     0               0       0    0   
3              0        232       8153     0               0       0    0   
4              0        199        420     0               0       0    0   
...          ...        ...        ...   ...             ...     ...  ...   
125968         0          0          0     0               0       0    0   
125969         8        105        145     0               0       0    0   
125970         0       2231        384     0               0       0    0   
125971         0          0          0     0               0       0    0   
125972         0        151          0     0               0       0    0   

        num_failed_logins  logged_in  num_compromised  ..

  f = msb / msw


scores_: [1.63216657e+03 1.14527406e+01 5.26988211e+00 3.55455220e+00
 4.45748761e+02 3.42019637e+01 4.13719711e+03 3.84845525e+02
 3.19592485e+04 3.31110190e+00 2.64747538e+03 1.59991802e+01
 4.14961849e+00 5.00184623e+01 4.96449419e+02 4.54620524e+01
            nan 2.17648989e-01 2.88233829e+03 2.14284531e+04
 2.42274507e+02 5.14262570e+04 5.11639306e+04 4.48742744e+03
 4.54649054e+03 6.80193230e+04 4.56927417e+03 3.86846091e+03
 9.43292409e+03 3.46238796e+04 3.41042427e+04 1.31026399e+04
 1.37786510e+04 8.17589987e+03 5.21218091e+04 5.30773440e+04
 3.81965256e+03 4.58499256e+03 5.34120694e+03 3.07927309e+03
 1.93979328e+03 3.98910411e+01 1.18508349e+01 3.67700176e+02
 4.90443035e+00 1.45875439e+02 2.99333382e+02 3.14045579e+02
 2.24451089e+02 2.17230993e+02 2.06620544e+02 2.14119490e+02
 1.74185480e+02 2.26306980e+03 1.68413546e+02 1.27169511e+04
 1.10376662e+03 2.04156200e+02 1.96718139e+02 1.74116758e+02
 1.75690317e+03 2.05179985e+03 1.89469229e+02 4.90443035e+00
 1.85602776e+02

# SVM

In [19]:
from sklearn import svm

In [20]:
svmModel = svm.SVC(gamma='scale')

In [None]:
svmScore = cross_validate(svmModel, X, y, cv=4, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
ss = pd.DataFrame(svmScore)

In [None]:
ss.head()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [None]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=4, scoring=scoring_metrics, verbose=3 ,n_jobs=-1)

In [None]:
randomForestScore = pd.DataFrame(randomForestScore)

In [None]:
randomForestScore.to_csv('NSL_RandomForest_scores.csv')

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [None]:
rf2 = RandomForestClassifier()

In [None]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)

In [None]:
dlGrid.fit(X, y)

In [None]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

In [None]:
dt3=dlGrid.best_estimator_

In [None]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
pd.DataFrame(dtFinalScore)

In [None]:
dtFinalScore['test_accuracy'].mean()

# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlpModel = MLPClassifier()

In [None]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
mlpModel

In [None]:
mlpScore = pd.DataFrame(mlpScore)

In [None]:
mlpScore.to_csv('NSL_MLP_scores.csv')

In [None]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [None]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [None]:
X_sample=X.sample(frac=0.2, random_state=1)
y_sample=y.sample(frac=0.2, random_state=1)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [None]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)


In [None]:
mlp_grid.fit(X_sample, y_sample)

In [None]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('NSL_ANN_gridsearch.csv')

In [None]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

In [None]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)
mlpFinalScore.to_csv('NSL_MLP_Scores')

In [None]:
mlpFinalScore['test_accuracy'].mean()

# Decision Trees

In [None]:
from sklearn.tree import tree

In [None]:
dt=tree.DecisionTreeClassifier()

In [None]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
dtScore = pd.DataFrame(dtScore)


In [None]:
dtScore.to_csv('NSL_dtScore_scores.csv')

In [None]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [None]:
dt2 = DecisionTreeClassifier()

In [None]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)

In [None]:
dlGrid.fit(X, y)

In [None]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()
dtGridScores.to_csv('NSL_DT_GridSearch')

In [None]:
dt3=dlGrid.best_estimator_
dlGrid.best_params_

In [None]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore.to_csv('NSL_DT_Scores')
dtFinalScore['test_accuracy'].mean()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
k_range = list(range(1, 101, 5))

In [None]:
param_dict = dict(n_neighbors=k_range)

In [None]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [None]:
grid.fit(X, y)

In [None]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore.to_csv('NSL_KNN_GridSearch.csv')

In [None]:
knn2=grid.best_estimator_
grid.best_params_

In [None]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
pd.DataFrame(knnFinalScore)

In [None]:
knnFinalScore.to_csv('NSL_KNN_scores')

In [None]:
pd.DataFrame(linearScore).mean()

In [None]:
pd.DataFrame(mlpScore).mean()

In [None]:
knnFinalScore = pd.DataFrame(knnFinalScore)
knnFinalScore.to_csv('NSL_KNN_Scores')

# All Results

In [None]:
allResults=pd.concat([
    pd.DataFrame(mlpScore).mean(),
    pd.DataFrame(dtFinalScore).mean(),
    pd.DataFrame(knnFinalScore).mean(),
    pd.DataFrame(ss).mean()
], axis=1)
allResults

In [None]:
allResults.rename(columns={0:'Multi-level Perceptron', 1:'Decision Tree', 2:'KNN' , 3:'SVM' },  inplace=True)
allResults

# All Results