In [28]:
import pandas as pd
import numpy as np

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


# Reading Data

In [30]:
# Load the NSL dataset CSV; the path is relative to the notebook's working directory.
DATASET_CSV = '../../Dataset/NSL_new.csv'
df = pd.read_csv(DATASET_CSV)

In [31]:
# Keep only normal traffic plus the attack types that are relabelled as class 1
# further down; rows carrying any other label are discarded.
# NOTE(review): these look like the DoS-category attacks of NSL-KDD - confirm.
KEPT_LABELS = [
    "normal",
    "neptune", "back", "land", "pod", "smurf", "teardrop",
    "mailbomb", "apache2", "processtable", "udpstorm", "worm",
]

# A single membership test selects the rows to keep; rows whose label is
# missing (NaN) are not in the list and are therefore dropped too.
df = df.loc[df["label"].isin(KEPT_LABELS)]
print(df.shape)

(113270, 123)


In [32]:
# Down-sample the filtered frame (presumably to keep the stacking run fast).
# random_state pins the subset so the split, fold scores and final accuracies
# are reproducible after a kernel restart, consistent with the seed used by
# the other stochastic steps in this notebook. min() avoids a ValueError when
# fewer rows than requested are available.
SAMPLE_SIZE = 20000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=0)

In [33]:
# Preview the first five rows of the sampled frame.
df.head(n=5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
2419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
71263,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
13863,0,751,325,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
3737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune
4747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,neptune


In [34]:
# Separate the feature matrix from the target column.
# "Unnamed: 0" (visible in the preview above) is the name pandas gives to an
# unnamed leading CSV column; its values appear to be saved row numbers rather
# than a traffic feature, so it is excluded from the model inputs.
# errors='ignore' keeps the cell working if the CSV is re-exported without it.
X = df.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y = df['label']

In [35]:
# Scorer objects keyed by metric name. Accuracy takes no averaging argument;
# precision, recall and F1 are all wrapped with average="micro".
# NOTE(review): for single-label targets micro-averaged precision/recall/F1
# coincide with accuracy - confirm micro averaging is what is wanted here.
scoring_metrics = {'accuracy': make_scorer(accuracy_score)}
scoring_metrics.update(
    (metric_name, make_scorer(metric_fn, average="micro"))
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
)

In [36]:
# Binarize the target: 0 = normal traffic, 1 = any of the retained attack types.
# One explicit mapping replaces the chain of y.replace(...) calls, which relied
# on replace() silently downcasting an object column to integers (deprecated
# in pandas 2.2). Mapping from df['label'] instead of the current y also keeps
# the cell safe to re-run.
ATTACK_LABELS = [
    'neptune', 'back', 'land', 'pod', 'smurf', 'teardrop',
    'mailbomb', 'apache2', 'processtable', 'udpstorm', 'worm',
]
label_to_class = dict.fromkeys(ATTACK_LABELS, 1)
label_to_class['normal'] = 0

y = df['label'].map(label_to_class)
if y.isna().any():
    # A label with no mapping would otherwise slip through as NaN.
    unmapped = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError('Unmapped labels in target: %s' % unmapped)
# Explicit integer dtype so the classifiers receive a numeric target.
y = y.astype('int64')

In [37]:
# Display the binarized target as a sanity check (pandas truncates long Series).
y

2419     1
71263    1
13863    0
3737     1
4747     1
        ..
81512    1
59404    0
54256    1
45517    1
91832    0
Name: label, Length: 20000, dtype: int64

# First Level Stacking

# KNN, RandomForest, AdaBoost

In [38]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from vecstack import stacking

In [39]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [40]:
# Base Learners
# First-level estimators, listed in the order they are handed to the stacker.
knn_base = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

forest_base = RandomForestClassifier(
    n_estimators=90,
    max_depth=19,
    min_samples_split=0.1,  # a float is interpreted by scikit-learn as a fraction of the samples
    n_jobs=-1,
    random_state=0,
)

adaboost_base = AdaBoostClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    random_state=0,
)

models = [knn_base, forest_base, adaboost_base]

In [41]:
# First-level stacking: each base learner is evaluated over 4 stratified,
# shuffled folds with accuracy as the reported metric. The returned S_train and
# S_test are the inputs the second-level (meta) classifiers are fitted and
# evaluated on below.
stacking_options = dict(
    regression=False,
    mode='oof_pred_bag',
    needs_proba=False,  # class predictions rather than probabilities
    save_dir=None,
    metric=accuracy_score,
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=2,
)
S_train, S_test = stacking(models, X_train, y_train, X_test, **stacking_options)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.99575000]
    fold  1:  [0.99375000]
    fold  2:  [0.99300000]
    fold  3:  [0.99350000]
    ----
    MEAN:     [0.99400000] + [0.00104583]
    FULL:     [0.99400000]

model  1:     [RandomForestClassifier]
    fold  0:  [0.98900000]
    fold  1:  [0.98525000]
    fold  2:  [0.98325000]
    fold  3:  [0.97700000]
    ----
    MEAN:     [0.98362500] + [0.00434633]
    FULL:     [0.98362500]

model  2:     [AdaBoostClassifier]
    fold  0:  [0.99900000]
    fold  1:  [0.99875000]
    fold  2:  [0.99900000]
    fold  3:  [0.99925000]
    ----
    MEAN:     [0.99900000] + [0.00017678]
    FULL:     [0.99900000]



# Second Level Stacking

In [43]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision Tree - Meta Classifier

In [52]:
# Decision-tree meta classifier trained on the first-level predictions.
# random_state pins the tree's internal randomness so the reported score is
# reproducible, in line with the seeded split and base learners.
DT_Meta = DecisionTreeClassifier(max_depth=13, min_samples_split=10, random_state=0)

In [53]:
# Train the tree on the stacked training features, then predict the hold-out set.
# fit() returns the estimator itself, so re-assigning its result is unnecessary.
DT_Meta.fit(S_train, y_train)
y_pred = DT_Meta.predict(S_test)

In [54]:
# Hold-out accuracy of the decision-tree meta classifier.
dt_meta_accuracy = accuracy_score(y_test, y_pred)
print('Final prediction score: [%.8f]' % dt_meta_accuracy)

Final prediction score: [0.99900000]


# MLP Meta Classifier

In [55]:
# MLP meta classifier with a single hidden layer of 20 units, trained on the
# first-level predictions. random_state fixes weight initialisation and batch
# shuffling so the reported score is reproducible, in line with the seeded
# split and base learners. hidden_layer_sizes uses the documented tuple form.
MLP_Meta = MLPClassifier(solver='adam', alpha=0.001, hidden_layer_sizes=(20,), random_state=0)

In [56]:
# Train the MLP on the stacked training features, then predict the hold-out set.
# fit() returns the estimator itself, so re-assigning its result is unnecessary.
MLP_Meta.fit(S_train, y_train)
y_pred = MLP_Meta.predict(S_test)

In [57]:
# Hold-out accuracy of the MLP meta classifier.
mlp_meta_accuracy = accuracy_score(y_test, y_pred)
print('Final prediction score: [%.8f]' % mlp_meta_accuracy)

Final prediction score: [0.99900000]
