In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [3]:
df = pd.read_csv('../Dataset/NSL_new.csv')

In [4]:
df = df.sample(n=4000)

In [5]:
X = df.drop(['label'], axis=1)
y = df['label']

In [6]:
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average="micro"),
           'recall' : make_scorer(recall_score, average="micro"), 
           'f1_score' : make_scorer(f1_score, average="micro")}

In [7]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
10046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
37748,0,45,72,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
29825,1,817,330,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
77183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune


In [8]:
X.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
10046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
37748,0,45,72,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29825,1,817,330,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
77183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
y.head()

10046    neptune
37748     normal
29825     normal
77183    neptune
287      neptune
Name: label, dtype: object

In [10]:
y = y.replace( to_replace =  'normal' , value = 0 )

y = y.replace( to_replace =  'neptune' , value = 1 )
y = y.replace( to_replace =  'back' , value = 1 )
y = y.replace( to_replace =  'land' , value = 1 )
y = y.replace( to_replace =  'pod' , value = 1 )
y = y.replace( to_replace =  'smurf' , value = 1 )
y = y.replace( to_replace =  'teardrop' , value = 1 )
y = y.replace( to_replace =  'mailbomb' , value = 1 )
y = y.replace( to_replace =  'apache2' , value = 1 )
y = y.replace( to_replace =  'processtable' , value = 1 )
y = y.replace( to_replace =  'udpstorm' , value = 1 )
y = y.replace( to_replace =  'worm' , value = 1 )

y = y.replace( to_replace =  'ipsweep' , value = 2 )
y = y.replace( to_replace =  'nmap' , value = 2 )
y = y.replace( to_replace =  'portsweep' , value = 2 )
y = y.replace( to_replace =  'satan' , value = 2 )
y = y.replace( to_replace =  'mscan' , value = 2 )
y = y.replace( to_replace =  'saint' , value = 2 )

y = y.replace( to_replace =  'ftp_write' , value = 3 )
y = y.replace( to_replace =  'guess_passwd' , value = 3 )
y = y.replace( to_replace =  'imap' , value = 3)
y = y.replace( to_replace =  'multihop' , value = 3 )
y = y.replace( to_replace =  'phf' , value = 3 )
y = y.replace( to_replace =  'spy' , value = 3 )
y = y.replace( to_replace =  'warezclient' , value = 3 )
y = y.replace( to_replace =  'warezmaster' , value = 3 )
y = y.replace( to_replace =  'sendmail' , value = 3 )
y = y.replace( to_replace =  'snmpgetattack' , value = 3 )
y = y.replace( to_replace =  'snmpguess' , value = 3 )
y = y.replace( to_replace =  'xlock' , value = 3 )
y = y.replace( to_replace =  'xsnoop' , value = 3 )
y = y.replace( to_replace =  'httptunnel' , value = 3 )

y = y.replace( to_replace =  'buffer_overflow' , value = 4 )
y = y.replace( to_replace =  'loadmodule' , value = 4 )
y = y.replace( to_replace =  'perl' , value = 4 )
y = y.replace( to_replace =  'rootkit' , value = 4 )
y = y.replace( to_replace =  'ps' , value = 4 )
y = y.replace( to_replace =  'sqlattack' , value = 4  )
y = y.replace( to_replace =  'xterm' , value = 4 )

y = y.replace( to_replace =  'unknown' , value = 5 )

In [11]:
y.head()

10046    1
37748    0
29825    0
77183    1
287      1
Name: label, dtype: int64

# ChiSquared

In [12]:
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [13]:
print("before transform:",X)
selector=SelectKBest(score_func=chi2,k=12)
fit = selector.fit(X,y)
features = fit.transform(X)
print("scores_:",fit.scores_)
print("pvalues_:",fit.pvalues_)
print("selected index:",fit.get_support(True))
print("after transform:",fit.transform(X)) 
X = fit.transform(X)

before transform:         duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
10046          0          0          0     0               0       0    0   
37748          0         45         72     0               0       0    0   
29825          1        817        330     0               0       0    0   
77183          0          0          0     0               0       0    0   
287            0          0          0     0               0       0    0   
...          ...        ...        ...   ...             ...     ...  ...   
101713         0        319       6902     0               0       0    0   
11423          0        245          0     0               0       0    0   
10315          0        236       2507     0               0       0    0   
16445          0          0          0     0               0       0    0   
108396         0      54540       7300     0               0       0    1   

        num_failed_logins  logged_in  num_compromised  ..

# SVM

In [14]:
from sklearn import svm

In [15]:
svmModel = svm.SVC(gamma='scale')

In [16]:
svmScore = cross_validate(svmModel, X, y, cv=10, scoring=scoring_metrics, verbose=15 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    5.7s remaining:   23.2s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    5.7s remaining:   13.5s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    5.8s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    5.9s remaining:    5.9s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    5.9s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    5.9s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    5.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.1s finished


In [17]:
ss = pd.DataFrame(svmScore)

In [18]:
ss.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,1.28085,0.392971,0.545906,0.545906,0.545906,0.545906
1,1.076914,0.423999,0.545906,0.545906,0.545906,0.545906
2,1.219366,0.402811,0.548628,0.548628,0.548628,0.548628
3,1.083895,0.432604,0.55,0.55,0.55,0.55
4,1.110726,0.431455,0.5525,0.5525,0.5525,0.5525


In [19]:
ss['test_accuracy'].mean()

0.5497584621093321

In [20]:
ss.to_csv('NSL_SVM_ChiSquared_bestEstimator.csv')

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
randomForestModel = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)

In [23]:
randomForestScore = cross_validate(randomForestModel, X, y, cv=4, scoring=scoring_metrics, verbose=3 ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.4s finished


In [24]:
randomForestScore = pd.DataFrame(randomForestScore)

In [25]:
randomForestScore.to_csv('NSL_RandomForest_ChiSquared_scores.csv')

In [26]:
randomForestScore

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.30766,0.13124,0.869261,0.869261,0.869261,0.869261
1,0.267118,0.119079,0.874126,0.874126,0.874126,0.874126
2,0.293518,0.141381,0.867868,0.867868,0.867868,0.867868
3,0.296502,0.13124,0.875752,0.875752,0.875752,0.875752


In [27]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [28]:
rf_parameters={'n_estimators' : range(80,200,20),'max_depth': range(1,20,3), 'min_samples_split': np.arange(0.1,1,0.1)}

In [29]:
rf2 = RandomForestClassifier()

In [30]:
dlGrid = GridSearchCV(rf2, rf_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)

In [31]:
dlGrid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 378 candidates, totalling 1134 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1134 out of 1134 | elapsed:  1.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                         'n_estimators': range(80, 200, 20

In [32]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,0.205125,0.038861,0.166572,0.016919,1,0.1,80,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.86057,0.862069,...,0.867117,0.86325,0.0028,116,0.86057,0.862069,0.867117,0.86325,0.0028,116
1,0.239166,0.037012,0.183937,0.033075,1,0.1,100,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.858321,0.856822,...,0.866366,0.8605,0.00419,189,0.858321,0.856822,0.866366,0.8605,0.00419,189
2,0.344107,0.07256,0.244597,0.006393,1,0.1,120,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.858321,0.85907,...,0.864865,0.86075,0.002924,184,0.858321,0.85907,0.864865,0.86075,0.002924,184
3,0.485345,0.003734,0.280817,0.003505,1,0.1,140,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.861319,0.862819,...,0.864865,0.863,0.001453,123,0.861319,0.862819,0.864865,0.863,0.001453,123
4,0.551861,0.007098,0.311794,0.006121,1,0.1,160,"{'max_depth': 1, 'min_samples_split': 0.1, 'n_...",0.86057,0.862069,...,0.863363,0.862,0.001141,159,0.86057,0.862069,0.863363,0.862,0.001141,159


In [33]:
dt3=dlGrid.best_estimator_

In [34]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.7s finished


In [35]:
dtFinalScore = pd.DataFrame(dtFinalScore)

In [36]:
dtFinalScore['test_accuracy'].mean()

0.9450073595130175

In [37]:
randomForestScore.to_csv('NSL_RandomForest_ChiSquared_bestEstimator.csv')

# Neural Network

In [38]:
from sklearn.neural_network import MLPClassifier

In [39]:
mlpModel = MLPClassifier()

In [40]:
mlpScore=cross_validate(mlpModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.8s finished


In [41]:
mlpModel

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [42]:
mlpScore = pd.DataFrame(mlpScore)

In [43]:
mlpScore.to_csv('NSL_MLP_ChiSquared_scores.csv')

In [44]:
nLayers=[
    (4),
    (7),
    (10), #one layer of 10 nodes
    (13),
    (17),
    (20),
    (30),
    (50),
    (80),
    (100),
    (120),
    (140),
    (180),
    (220),
    (10, 10), #two layers, 10 nodes each
    (20, 20), #two layers, 20 nodes each
    (30, 30),
    (50, 50),
    (80, 80),
    (100, 100),
    (150, 150),
    (10, 10, 10), #three layers, 10 nodes each
    (20, 20, 20),
    (30, 30, 30),
    (50, 50, 50),
    (80, 80, 80),
]

In [45]:
mlp_parameters = {
    'hidden_layer_sizes': nLayers,
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [47]:
from sklearn.neural_network import MLPClassifier
mlpModel2 = MLPClassifier()

In [48]:
mlp_grid = GridSearchCV(mlpModel2, mlp_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=3)


In [49]:
mlp_grid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 312 candidates, totalling 936 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 936 out of 936 | elapsed:  8.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_sta...
                                                (50, 50, 50), (80, 80, 80)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accu

In [50]:
mlpGridScores=pd.DataFrame(mlp_grid.cv_results_)
mlpGridScores.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_solver,params,split0_test_accuracy,...,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_f1_score,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score
0,1.128676,0.714481,0.012407,0.000288,0.001,4,constant,sgd,"{'alpha': 0.001, 'hidden_layer_sizes': 4, 'lea...",0.858321,...,0.352102,0.6515,0.21667,237,0.858321,0.743628,0.352102,0.6515,0.21667,237
1,1.826513,0.50407,0.012265,0.00047,0.001,4,constant,adam,"{'alpha': 0.001, 'hidden_layer_sizes': 4, 'lea...",0.362069,...,0.774775,0.56175,0.168717,251,0.362069,0.548726,0.774775,0.56175,0.168717,251
2,2.233154,0.257163,0.010776,0.000729,0.001,4,adaptive,sgd,"{'alpha': 0.001, 'hidden_layer_sizes': 4, 'lea...",0.584708,...,0.56982,0.56775,0.014766,249,0.584708,0.548726,0.56982,0.56775,0.014766,249
3,2.332059,0.199981,0.015741,0.003668,0.001,4,adaptive,adam,"{'alpha': 0.001, 'hidden_layer_sizes': 4, 'lea...",0.706147,...,0.742492,0.66825,0.080622,230,0.706147,0.556222,0.742492,0.66825,0.080622,230
4,1.755259,0.359454,0.010974,0.001304,0.001,7,constant,sgd,"{'alpha': 0.001, 'hidden_layer_sizes': 7, 'lea...",0.856822,...,0.54955,0.7565,0.146247,218,0.856822,0.862819,0.54955,0.7565,0.146247,218


In [51]:
mlpFinalModel = mlp_grid.best_estimator_
mlp_grid.best_params_

{'alpha': 0.01,
 'hidden_layer_sizes': (150, 150),
 'learning_rate': 'adaptive',
 'solver': 'adam'}

In [52]:
mlpFinalScore=cross_validate(mlpFinalModel, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.6s finished


In [53]:
mlpFinalScore = pd.DataFrame(mlpFinalScore)
mlpFinalScore.to_csv('NSL_ANN_ChiSquared_bestEstimator.csv')

In [54]:
mlpFinalScore['test_accuracy'].mean()

0.9517316206772964

# Decision Trees

In [55]:
from sklearn.tree import tree

In [56]:
dt=tree.DecisionTreeClassifier()

In [57]:
dtScore = cross_validate(dt, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [58]:
dtScore = pd.DataFrame(dtScore)
dtScore.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1_score
0,0.017162,0.010268,0.981038,0.981038,0.981038,0.981038
1,0.017733,0.005053,0.987013,0.987013,0.987013,0.987013
2,0.011063,0.003046,0.97998,0.97998,0.97998,0.97998
3,0.013278,0.003995,0.980962,0.980962,0.980962,0.980962


In [59]:
dtScore.to_csv('NSL_dtScore_ChiSquared_scores.csv')

In [60]:
dt_parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}

In [61]:
dt2 = tree.DecisionTreeClassifier()

In [62]:
dlGrid = GridSearchCV(dt2, dt_parameters, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1, cv=4)

In [63]:
dlGrid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    4.4s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n...
             param_grid={'max_depth': range(1, 20, 2),
                         'min_samples_split': range(10, 500, 20)},
             pre_dispa

In [64]:
dtGridScores = pd.DataFrame(dlGrid.cv_results_)
dtGridScores.head()
dtGridScores.to_csv('NSL_DT_ChiSquared_GridSearch.csv')

In [65]:
dt3=dlGrid.best_estimator_
dlGrid.best_params_

{'max_depth': 11, 'min_samples_split': 10}

In [66]:
dtFinalScore = cross_validate(dt3, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished


In [67]:
dtFinalScore = pd.DataFrame(dtFinalScore)
dtFinalScore.to_csv('NSL_DT_ChiSquared_bestEstimator.csv')
dtFinalScore['test_precision'].mean()

0.9822501992530672

# KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier

In [69]:
knn = KNeighborsClassifier()

In [70]:
k_range = list(range(1, 101, 5))

In [71]:
param_dict = dict(n_neighbors=k_range)

In [72]:
grid = GridSearchCV(knn, param_dict, cv=4, scoring=scoring_metrics, refit='accuracy', verbose=3, n_jobs=-1)

In [73]:
grid.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    7.4s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                         51, 56, 61, 66, 71, 76, 81, 86, 91,
                                         96]},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False,
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1_score': make_scorer(f1_score, average=micro),
                      'precision': make_scorer(precision_score, average=micro),
                      'recall': make_scorer(recall_sco

In [74]:
knnScore = pd.DataFrame(grid.cv_results_)
knnScore.to_csv('NSL_KNN_ChiSquared_GridSearch.csv')

In [75]:
knn2=grid.best_estimator_
grid.best_params_

{'n_neighbors': 1}

In [76]:
knnFinalScore = cross_validate(knn2, X, y, cv=4, scoring=scoring_metrics, verbose=3, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.2s finished


In [77]:
knnFinalScore = pd.DataFrame(knnFinalScore)

In [78]:
knnFinalScore.to_csv('NSL_KNN_PCA_bestEstimator.csv')

In [79]:
pd.DataFrame(mlpScore).mean()

fit_time          4.607745
score_time        0.016519
test_accuracy     0.949004
test_precision    0.949004
test_recall       0.949004
test_f1_score     0.949004
dtype: float64

# All Results

In [80]:
allResults=pd.concat([
    mlpScore.mean(),
    dtFinalScore.mean(),
    knnFinalScore.mean(),
    randomForestScore.mean()
], axis=1)
allResults

Unnamed: 0,0,1,2,3
fit_time,4.607745,0.021506,0.011485,0.2912
score_time,0.016519,0.009725,0.273168,0.130735
test_accuracy,0.949004,0.98225,0.960993,0.871752
test_precision,0.949004,0.98225,0.960993,0.871752
test_recall,0.949004,0.98225,0.960993,0.871752
test_f1_score,0.949004,0.98225,0.960993,0.871752


In [81]:
allResults.rename(columns={0:'Multi-level Perceptron', 1:'Decision Tree', 2:'KNN' , 3:'Random Forest' },  inplace=True)
allResults

Unnamed: 0,Multi-level Perceptron,Decision Tree,KNN,Random Forest
fit_time,4.607745,0.021506,0.011485,0.2912
score_time,0.016519,0.009725,0.273168,0.130735
test_accuracy,0.949004,0.98225,0.960993,0.871752
test_precision,0.949004,0.98225,0.960993,0.871752
test_recall,0.949004,0.98225,0.960993,0.871752
test_f1_score,0.949004,0.98225,0.960993,0.871752


# All Results

In [82]:
allResults.to_csv('NSL_ChiSquared_Final.csv')