In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import time
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [9]:
# Relative path prefix that is prepended to the dataset and pickle paths used in this notebook.
root = "../../../../"

In [10]:
# Load the CICDDoS CSV from the processed-datasets folder; the first CSV
# column is used as the row index.
dataset_path = root + "datasets/binary/processed/CICDDoS_corr.csv"
df = pd.read_csv(dataset_path, index_col=[0])

In [11]:
# Binarise the target: rows labelled 'BENIGN' are kept as they are and every
# other label is collapsed into the single class 'ATTACK'.
is_benign = df[' Label'] == 'BENIGN'
df[' Label'] = df[' Label'].where(is_benign, 'ATTACK')

In [12]:
# Integer code for each class name: the position in this tuple is the code,
# so "BENIGN" -> 0 and "ATTACK" -> 1.
encoding = {label: code for code, label in enumerate(("BENIGN", "ATTACK"))}

In [13]:
# Replace the string class names with their integer codes from `encoding`.
# Series.map turns every value that is not a key of `encoding` into NaN, so the
# result is validated *before* it is written back: re-running this cell on
# already-encoded labels (or meeting an unexpected label) now fails loudly and
# leaves `df` untouched instead of silently wiping the label column.
encoded_labels = df[' Label'].map(encoding)
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        "Label column contains values missing from `encoding`: {}".format(
            sorted(df.loc[unmapped, ' Label'].astype(str).unique())
        )
    )
df[' Label'] = encoded_labels

In [14]:
# Preview the first rows to check the loaded features and the encoded label column.
df.head()

Unnamed: 0,Source Port,Destination Port,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,...,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward,Active Std,Active Min,Idle Mean,Idle Std,Label
0,564.0,22216.0,0.0,2736.0,0.0,1368.0,0.0,0.0,0.0,2736000000.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1
1,1010.0,26305.0,0.0,2650.0,0.0,1325.0,0.0,0.0,0.0,2650000000.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,608.0,52380.0,0.0,2944.0,0.0,1472.0,0.0,0.0,0.0,1472000000.0,...,0.0,0.0,1.0,1.0,14.0,0.0,0.0,0.0,0.0,1
3,529.0,27632.0,0.0,2944.0,0.0,1472.0,0.0,0.0,0.0,64000000.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1
4,564.0,33365.0,0.0,2944.0,0.0,1472.0,0.0,0.0,0.0,2944000000.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1


In [15]:
# Sweep the number of ANOVA-selected features (k = 1 .. MAX_FEATURES) and, for
# each k, evaluate an XGBoost classifier with stratified k-fold cross-validation.
# `performance[k - 1]` ends up holding one DataFrame per k, with one row per
# fold and the columns listed in METRIC_COLUMNS.
MAX_FEATURES = 43   # largest k tried; NOTE(review): presumably the number of feature columns in X - confirm
SEED = 42           # shared by the fold shuffling and the classifier
METRIC_COLUMNS = ["Accuracy", "Precision", "Recall", "F1_Score",
                  "Fit_Time", "Predict_Time", "Score_Time", "Test_Size"]

performance = []

# Features and target do not depend on k, so they are built once instead of
# copying the frame on every sweep iteration.
X = df.drop(columns=[' Label'])
y = df[' Label']

splits = 10

for features in range(1, MAX_FEATURES + 1):
    fold_records = []

    # Same seed for every k, so all feature counts are scored on identical folds.
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit_Time deliberately covers the whole training pipeline: fitting the
        # selector on the training fold only, projecting both folds onto the
        # selected columns, and training the classifier.
        # perf_counter() is used because the measured intervals are in the
        # millisecond range.
        start = time.perf_counter()
        selector = SelectKBest(f_classif, k=features).fit(X_train, y_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)
        # `random_state` is the scikit-learn-API name for XGBoost's seed.
        clf_xgb = xgb.XGBClassifier(random_state=SEED)
        clf_xgb.fit(X_train, y_train)
        fit_time = time.perf_counter() - start

        start = time.perf_counter()
        y_pred = clf_xgb.predict(X_test)
        predict_time = time.perf_counter() - start

        start = time.perf_counter()
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        score_time = time.perf_counter() - start

        fold_records.append({
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1_Score": f1,
            "Fit_Time": fit_time,
            "Predict_Time": predict_time,
            "Score_Time": score_time,
            "Test_Size": len(y_pred),
        })

    # One row per fold. Every column (including Test_Size) is stored as float
    # so the table layout matches previously pickled results.
    pfm = pd.DataFrame(fold_records, columns=METRIC_COLUMNS).astype(float)
    performance.append(pfm)

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [16]:
# Persist the list of per-k performance tables so later analysis can reload
# them without re-running the sweep. The `with` block guarantees the file
# handle is closed even if pickling raises.
filename = root + "pickles/binary/pfm_anova.pkl"
with open(filename, 'wb') as outfile:
    pickle.dump(performance, outfile)

In [25]:
# Per-fold metrics of the last sweep entry: the sweep starts at k=1, so index 42 is the run with k=43 selected features.
performance[42]

Unnamed: 0,Accuracy,Precision,Recall,F1_Score,Fit_Time,Predict_Time,Score_Time,Test_Size
0,0.996212,1.0,0.992424,0.996198,0.341787,0.001998,0.011992,528.0
1,0.998106,0.996226,1.0,0.99811,0.335792,0.002998,0.011992,528.0
2,0.998106,1.0,0.996212,0.998102,0.337791,0.002997,0.010993,528.0
3,1.0,1.0,1.0,1.0,0.342788,0.002996,0.011993,528.0
4,1.0,1.0,1.0,1.0,0.353781,0.002998,0.011993,528.0
5,1.0,1.0,1.0,1.0,0.337791,0.002998,0.010993,528.0
6,1.0,1.0,1.0,1.0,0.339789,0.002998,0.011993,528.0
7,1.0,1.0,1.0,1.0,0.340789,0.002999,0.011992,528.0
8,1.0,1.0,1.0,1.0,0.341787,0.002998,0.010993,528.0
9,0.998106,0.996226,1.0,0.99811,0.3248,0.002997,0.010993,528.0
