In [40]:
import pandas as pd
import numpy as np

In [41]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [42]:
df = pd.read_csv('Dataset/NSL_new.csv')

In [43]:
df = df.sample(n=4000)

In [44]:
X = df.drop(['label'], axis=1)
y = df['label']

In [45]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [46]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
89676,3,52,1463,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
58360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
104901,0,46,46,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
35862,5044,5131424,0,0,0,0,3,0,1,0,...,0,0,0,0,0,0,0,1,0,warezclient
54316,0,54540,8314,0,0,0,2,0,1,1,...,0,0,0,0,0,0,0,1,0,back


In [47]:
X.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
89676,3,52,1463,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
58360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
104901,0,46,46,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
35862,5044,5131424,0,0,0,0,3,0,1,0,...,0,0,0,0,0,0,0,0,1,0
54316,0,54540,8314,0,0,0,2,0,1,1,...,0,0,0,0,0,0,0,0,1,0


In [48]:
y.head()

89676          normal
58360         neptune
104901         normal
35862     warezclient
54316            back
Name: label, dtype: object

In [49]:
y = y.replace( to_replace =  'normal' , value = 0 )

y = y.replace( to_replace =  'neptune' , value = 1 )
y = y.replace( to_replace =  'back' , value = 1 )
y = y.replace( to_replace =  'land' , value = 1 )
y = y.replace( to_replace =  'pod' , value = 1 )
y = y.replace( to_replace =  'smurf' , value = 1 )
y = y.replace( to_replace =  'teardrop' , value = 1 )
y = y.replace( to_replace =  'mailbomb' , value = 1 )
y = y.replace( to_replace =  'apache2' , value = 1 )
y = y.replace( to_replace =  'processtable' , value = 1 )
y = y.replace( to_replace =  'udpstorm' , value = 1 )
y = y.replace( to_replace =  'worm' , value = 1 )

y = y.replace( to_replace =  'ipsweep' , value = 2 )
y = y.replace( to_replace =  'nmap' , value = 2 )
y = y.replace( to_replace =  'portsweep' , value = 2 )
y = y.replace( to_replace =  'satan' , value = 2 )
y = y.replace( to_replace =  'mscan' , value = 2 )
y = y.replace( to_replace =  'saint' , value = 2 )

y = y.replace( to_replace =  'ftp_write' , value = 3 )
y = y.replace( to_replace =  'guess_passwd' , value = 3 )
y = y.replace( to_replace =  'imap' , value = 3)
y = y.replace( to_replace =  'multihop' , value = 3 )
y = y.replace( to_replace =  'phf' , value = 3 )
y = y.replace( to_replace =  'spy' , value = 3 )
y = y.replace( to_replace =  'warezclient' , value = 3 )
y = y.replace( to_replace =  'warezmaster' , value = 3 )
y = y.replace( to_replace =  'sendmail' , value = 3 )
y = y.replace( to_replace =  'snmpgetattack' , value = 3 )
y = y.replace( to_replace =  'snmpguess' , value = 3 )
y = y.replace( to_replace =  'xlock' , value = 3 )
y = y.replace( to_replace =  'xsnoop' , value = 3 )
y = y.replace( to_replace =  'httptunnel' , value = 3 )

y = y.replace( to_replace =  'buffer_overflow' , value = 4 )
y = y.replace( to_replace =  'loadmodule' , value = 4 )
y = y.replace( to_replace =  'perl' , value = 4 )
y = y.replace( to_replace =  'rootkit' , value = 4 )
y = y.replace( to_replace =  'ps' , value = 4 )
y = y.replace( to_replace =  'sqlattack' , value = 4  )
y = y.replace( to_replace =  'xterm' , value = 4 )

y = y.replace( to_replace =  'unknown' , value = 5 )

In [50]:
y.head()

89676     0
58360     1
104901    0
35862     3
54316     1
Name: label, dtype: int64

# Feature Selection

In [51]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [52]:
# Univariate feature selection with F-test for feature scoring
# We use the default selection function to select the four
# most significant features
selector = SelectKBest(score_func=chi2, k=15)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
        edgecolor='black')

  


NameError: name 'X_indices' is not defined

In [53]:
y.head()

89676     0
58360     1
104901    0
35862     3
54316     1
Name: label, dtype: int64

In [54]:
y.drop(y.columns[0], axis=1)

AttributeError: 'Series' object has no attribute 'columns'

In [55]:
# define feature selection
fs = SelectKBest(score_func=chi2, k=15)

In [56]:
#Fit the function for ranking the features by score

fit = fs.fit(X, y)


In [57]:
#Summarize scores numpy.set_printoptions(precision=3) print(fit.scores_)
#Apply the transformation on to dataset
fit = fs.fit(X, y)
features = fit.transform(X)

In [58]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=3)
selector.fit(X,y)
print("scores_:",selector.scores_)
print("pvalues_:",selector.pvalues_)
print("selected index:",selector.get_support(True))
print("after transform:",selector.transform(X)) 

before transform:         duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
89676          3         52       1463     0               0       0    0   
58360          0          0          0     0               0       0    0   
104901         0         46         46     0               0       0    0   
35862       5044    5131424          0     0               0       0    3   
54316          0      54540       8314     0               0       0    2   
...          ...        ...        ...   ...             ...     ...  ...   
26734          0         18          0     0               0       0    0   
43381          0      54540       8314     0               0       0    2   
67184          0          1          0     0               0       0    0   
118519         0        218        264     0               0       0    0   
64050          0          0          0     0               0       0    0   

        num_failed_logins  logged_in  num_compromised  ..

# SVM

In [59]:
from sklearn import svm

In [60]:
svmModel = svm.SVC(gamma='scale')

In [61]:
svmScore = cross_validate(svmModel, X, y, cv=4, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:   53.1s remaining:   53.1s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.1s finished


In [62]:
ss = pd.DataFrame(svmScore)

In [63]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,12.466094,11.1709,0.534466,0.534466,0.534466,0.534466
1,14.595318,12.163416,0.536464,0.536464,0.536464,0.536464
2,12.632207,7.467297,0.535536,0.535536,0.535536,0.535536
3,13.331208,12.549727,0.535536,0.535536,0.535536,0.535536


# Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [66]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=4, scoring=scoring_metrics, verbose=3 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.8s finished


In [67]:
randomForestScore = pd.DataFrame(randomForestScore)

In [68]:
randomForestScore.to_csv('NSL_RandomForest_scores_ChiSquared.csv')

In [69]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [70]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [71]:
rf2 = RandomForestClassifier()

In [72]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)

In [73]:
dlGrid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1134 out of 1134 | elapsed:  5.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [74]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.847261,0.178225,0.491494,0.242463,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.854042,0.858965,...,0.854245,0.85575,0.002274,116,0.854042,0.858965,0.854245,0.85575,0.002274,116
1,1.176332,0.184932,0.356039,0.1667,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.853293,0.858965,...,0.853494,0.85525,0.002627,189,0.853293,0.858965,0.853494,0.85525,0.002627,189
2,0.970507,0.454119,0.605512,0.29235,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.853293,0.858215,...,0.854245,0.85525,0.002132,189,0.853293,0.858215,0.854245,0.85525,0.002132,189
3,1.249456,0.592748,0.87099,0.468795,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.854042,0.857464,...,0.854996,0.8555,0.001442,141,0.854042,0.857464,0.854996,0.8555,0.001442,141
4,1.348267,0.633221,0.817804,0.384856,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.854042,0.858215,...,0.854245,0.8555,0.001921,141,0.854042,0.858215,0.854245,0.8555,0.001921,141


In [75]:
dt3=dlGrid.best_estimator_

In [76]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    3.5s finished


In [77]:
pd.DataFrame(dtFinalScore)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,2.709305,0.7547,0.966034,0.966034,0.966034,0.966034
1,2.702218,0.696197,0.95005,0.95005,0.95005,0.95005
2,2.474291,0.667295,0.963964,0.963964,0.963964,0.963964
3,2.853664,0.65288,0.932933,0.932933,0.932933,0.932933


In [78]:
dtFinalScore['test_accuracy'].mean()

0.9532452032452032

# Neural Network

In [79]:
from sklearn.neural_network import MLPClassifier

In [80]:
mlpModel = MLPClassifier()

In [81]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   31.9s finished


In [82]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [83]:
mlpScore = pd.DataFrame(mlpScore)

In [84]:
mlpScore.to_csv('NSL_MLP_scores_ChiSquared.csv')

In [85]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [86]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [87]:
X_sample=X.sample(frac=0.2, random_state=1)
y_sample=y.sample(frac=0.2, random_state=1)

In [88]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [89]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [90]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)


In [91]:
mlp_grid.fit(X_sample, y_sample)

Fitting 3 folds for each of 312 candidates, totalling 936 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 936 out of 936 | elapsed:  5.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_sta...
                                                (50, 50, 50), (80, 80, 80)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accu

In [92]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()
mlpGridScores.to_csv('NSL_ANN_GridSearch_ChiSquared.csv')

In [93]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

{'alpha': 0.001,
 'hidden_layer_sizes': (150, 150),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [94]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.4min finished


In [95]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)
mlpFinalScore.to_csv('NSL_MLP_Scores_ChiSquared')

In [96]:
mlpFinalScore['test_accuracy'].mean()

0.9537337037337038

# Decision Trees

In [97]:
from sklearn.tree import tree

In [98]:
dt=tree.DecisionTreeClassifier()

In [99]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.2s finished


In [100]:
dtScore = pd.DataFrame(dtScore)


In [101]:
dtScore.to_csv('NSL_dtScore_scores_ChiSquared.csv')

In [102]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [103]:
dt2 = DecisionTreeClassifier()

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)

In [None]:
dlGrid.fit(X, y)

In [None]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()
dtGridScores.to_csv('NSL_DT_GridSearch_ChiSquared')

In [None]:
dt3=dlGrid.best_estimator_
dlGrid.best_params_

In [None]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore.to_csv('NSL_DT_Scores_ChiSquared')
dtFinalScore['test_accuracy'].mean()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
k_range = list(range(1, 101, 5))

In [None]:
param_dict = dict(n_neighbors=k_range)

In [None]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [None]:
grid.fit(X, y)

In [None]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore.to_csv('NSL_KNN_GridSearch_ChiSquared.csv')

In [None]:
knn2=grid.best_estimator_
grid.best_params_

In [None]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

In [None]:
pd.DataFrame(knnFinalScore)

In [None]:
knnFinalScore.to_csv('NSL_KNN_scores_ChiSquared')

In [None]:
pd.DataFrame(linearScore).mean()

In [None]:
pd.DataFrame(mlpScore).mean()

In [None]:
knnFinalScore = pd.DataFrame(knnFinalScore)
knnFinalScore.to_csv('NSL_KNN_Scores_ChiSquared')

# All Results

In [None]:
allResults=pd.concat([
    pd.DataFrame(mlpScore).mean(),
    pd.DataFrame(dtFinalScore).mean(),
    pd.DataFrame(knnFinalScore).mean(),
    pd.DataFrame(ss).mean()
], axis=1)
allResults

In [None]:
allResults.rename(columns={0:'Multi-level Perceptron', 1:'Decision Tree', 2:'KNN' , 3:'SVM' },  inplace=True)
allResults

# All Results