In [1]:
import pandas as pd
import glob
import numpy as np
from imblearn import under_sampling, over_sampling, combine
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
#conda install -c conda-forge xgboost

In [3]:
#reading data

path = '/Users/ahmetokanarik/Desktop/MScThesis/Dataset/UNSQ-NB15'
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    df = pd.read_csv(filename,header=None,sep=';',low_memory=False)
    li.append(df)


data = pd.concat(li, axis=0, ignore_index=True)
data.columns = ["srcip","sport","dstip","dsport","proto","state","dur","sbytes","dbytes","sttl","dttl","sloss","dloss","service","Sload","Dload","Spkts","Dpkts","swin","dwin","stcpb","dtcpb","smeansz","dmeansz","trans_depth","res_bdy_len","Sjit","Djit","Stime","Ltime","Sintpkt","Dintpkt","tcprtt","synack","ackdat","is_sm_ips_ports","ct_state_ttl","ct_flw_http_mthd","is_ftp_login","ct_ftp_cmd","ct_srv_src","ct_srv_dst","ct_dst_ltm","ct_src_ ltm","ct_src_dport_ltm","ct_dst_sport_ltm","ct_dst_src_ltm","attack_cat","Label"]


In [4]:
data.proto

0          tcp
1          tcp
2          tcp
3          udp
4          udp
          ... 
2540042    tcp
2540043    tcp
2540044    tcp
2540045    tcp
2540046    tcp
Name: proto, Length: 2540047, dtype: object

In [5]:
#one-hot encoding
dummies = pd.get_dummies(data[['service','proto','state']])
data.drop(['proto','service','state'],axis=1,inplace=True)
data = pd.concat([data, dummies], axis=1)
data = data[["dtcpb","stcpb","service_-","Dload","dmeansz","service_dns","smeansz","Sload","trans_depth","sttl",
            "service_ftp-data","ct_ftp_cmd","attack_cat"]]

data['attack_cat'] = data['attack_cat'].fillna('Normal')

In [6]:
replace_dict = {np.nan: 0, ' ': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
    
replace_dict = {np.nan: 0, '0': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
        

In [7]:
x = data.drop('attack_cat',axis=1).values.astype('float32')
y = data.attack_cat.values

In [8]:
print(sorted(Counter(y).items()))

[('Analysis', 2677), ('Backdoor', 2329), ('DoS', 16353), ('Exploits', 44525), ('Fuzzers', 24246), ('Generic', 215481), ('Normal', 2218764), ('Reconnaissance', 13987), ('Shellcode', 1511), ('Worms', 174)]


In [9]:
#Target Variable Label
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
print(sorted(Counter(y).items()))

[(0, 2677), (1, 2329), (2, 16353), (3, 44525), (4, 24246), (5, 215481), (6, 2218764), (7, 13987), (8, 1511), (9, 174)]


In [11]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x = scaler.fit_transform(x)

In [12]:
x.shape

(2540047, 12)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.3,random_state=40,stratify=y)


In [14]:
print(sorted(Counter(y_train).items()))

[(0, 1874), (1, 1630), (2, 11447), (3, 31167), (4, 16972), (5, 150837), (6, 1553134), (7, 9791), (8, 1058), (9, 122)]


In [15]:
print(sorted(Counter(y_test).items()))

[(0, 803), (1, 699), (2, 4906), (3, 13358), (4, 7274), (5, 64644), (6, 665630), (7, 4196), (8, 453), (9, 52)]


In [16]:
X_train.shape, X_test.shape

((1778032, 12), (762015, 12))

In [17]:
a = 177803 #177803 
smo = SMOTE(sampling_strategy={0:a,1:a,2:a,3:a,4:a,5:a,7:a,8:a,9:a},random_state=42) 
X_train, y_train = smo.fit_resample(X_train, y_train)   
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 1553134), (7, 177803), (8, 177803), (9, 177803)]


In [18]:
from imblearn.under_sampling import NearMiss

# define the undersampling method
undersample = NearMiss(version=1, n_neighbors=3, sampling_strategy={6:a})
X_train, y_train = undersample.fit_resample(X_train, y_train)


In [19]:
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 177803), (7, 177803), (8, 177803), (9, 177803)]


In [20]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [47]:
"""#hyperparameter tuning

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope, as_apply

space={ 'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
        'n_estimators': scope.int(hp.quniform('n_estimators', 100, 6000, 200)),
        'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
      }

def hyperparameter_tuning(space):
    model=xgb.XGBRegressor(learning_rate = space['learning_rate'], n_estimators =space['n_estimators'], max_depth = int(space['max_depth']),
                         colsample_bytree=space['colsample_bytree'], min_child_weight=int(space['min_child_weight']))
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(X_train, y_train,verbose=True)

    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    #change the metric if you like
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

trials = Trials()

best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=1,
            trials=trials)

"""

SCORE:                                               
0.0009789833533460628                                  
100%|██████████| 1/1 [2:42:59<00:00, 9779.34s/trial, best loss: -0.0009789833533460628]


In [21]:
#new model with best parameters
import time

time_start = time.time()

params = {
    'n_estimators':5000,
    'subsample':0.9,
    'colsample_bytree':0.61, 
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 10,
    'max_depth':36,
    'scale_pos_weight':1,
    'min_child_weight':4,
    'learning_rate':0.5,
    'seed':27
}
bst = xgb.train(params, dtrain)

time_end = time.time()
train_time = time_end - time_start
print("Train time:",train_time)



Parameters: { n_estimators, scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Train time: 205.27044105529785


In [22]:
import time
time_start = time.time()

pred = bst.predict(dtest)

time_end = time.time()
test_time = time_end - time_start
print("Test time:",test_time)

Test time: 0.8189940452575684


In [23]:
from sklearn import metrics
from sklearn.metrics import classification_report

target_names = ['Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Normal','Reconnaissance','Shellcode','Worms']
acc = metrics.accuracy_score(y_test,pred) * 100
f1 = metrics.f1_score(y_test, pred,average='weighted')* 100
pre = metrics.precision_score(y_test, pred, labels=None, pos_label=1, average='weighted') * 100 #DR
recall = metrics.recall_score(y_test, pred, labels=None, pos_label=1, average='weighted', sample_weight=None) * 100

print(classification_report(y_test,pred,target_names=target_names))
print("acc:",acc)
print("pre:",pre)
print("DR=recall:",recall)
print("f1:",f1)


                precision    recall  f1-score   support

      Analysis       0.07      0.31      0.12       803
      Backdoor       0.08      0.26      0.12       699
           DoS       0.33      0.47      0.39      4906
      Exploits       0.75      0.54      0.63     13358
       Fuzzers       0.48      0.70      0.57      7274
       Generic       1.00      0.98      0.99     64644
        Normal       1.00      0.99      0.99    665630
Reconnaissance       0.88      0.77      0.82      4196
     Shellcode       0.25      0.53      0.34       453
         Worms       0.20      0.83      0.33        52

      accuracy                           0.97    762015
     macro avg       0.50      0.64      0.53    762015
  weighted avg       0.98      0.97      0.98    762015

acc: 97.3084519333609
pre: 98.16524921264127
DR=recall: 97.3084519333609
f1: 97.65862728457434


In [25]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

FAR = FP / (FP + TN)
print(FAR)

0.5121293800539084
