In [1]:
import pandas as pd
import glob
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, BatchNormalization, Dropout
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from imblearn import under_sampling, over_sampling, combine
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

In [2]:
#reading data

path = '/Users/ahmetokanarik/Desktop/MScThesis/Dataset/UNSQ-NB15'
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    df = pd.read_csv(filename,header=None,sep=';',low_memory=False, nrows=78000)
    li.append(df)


data = pd.concat(li, axis=0, ignore_index=True)
data.columns = ["srcip","sport","dstip","dsport","proto","state","dur","sbytes","dbytes","sttl","dttl","sloss","dloss","service","Sload","Dload","Spkts","Dpkts","swin","dwin","stcpb","dtcpb","smeansz","dmeansz","trans_depth","res_bdy_len","Sjit","Djit","Stime","Ltime","Sintpkt","Dintpkt","tcprtt","synack","ackdat","is_sm_ips_ports","ct_state_ttl","ct_flw_http_mthd","is_ftp_login","ct_ftp_cmd","ct_srv_src","ct_srv_dst","ct_dst_ltm","ct_src_ ltm","ct_src_dport_ltm","ct_dst_sport_ltm","ct_dst_src_ltm","attack_cat","Label"]


In [3]:
#one-hot encoding
dummies = pd.get_dummies(data[['service','proto','state']])
data.drop(['proto','service','state'],axis=1,inplace=True)
data = pd.concat([data, dummies], axis=1)
data = data[["dtcpb","stcpb","service_-","Dload","dmeansz","service_dns","smeansz","Sload","trans_depth","sttl",
            "service_ftp-data","ct_ftp_cmd","attack_cat"]]

data['attack_cat'] = data['attack_cat'].fillna('Normal')

In [4]:
replace_dict = {np.nan: 0, ' ': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
    
replace_dict = {np.nan: 0, '0': 0}
for cols in ['ct_ftp_cmd']:
    data[cols] = data[cols].replace(replace_dict)
        

In [5]:
x = data.drop('attack_cat',axis=1).values.astype('float32')
y = data.attack_cat.values

In [6]:
#Target Variable Label
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [7]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
x = scaler.fit_transform(x)

In [8]:
x.shape

(312000, 12)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.3,random_state=40,stratify=y)


In [10]:
print(sorted(Counter(y_train).items()))

[(0, 190), (1, 225), (2, 1731), (3, 4595), (4, 2253), (5, 20062), (6, 187784), (7, 1392), (8, 150), (9, 18)]


In [11]:
a = 177803 #y-traine gÃ¶re belirlenmeli.177803
smo = SMOTE(sampling_strategy={0:a,1:a,2:a,3:a,4:a,5:a,7:a,8:a,9:a},random_state=42) 
X_train, y_train = smo.fit_resample(X_train, y_train)   
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 187784), (7, 177803), (8, 177803), (9, 177803)]


In [12]:
from imblearn.under_sampling import NearMiss

# define the undersampling method
undersample = NearMiss(version=1, n_neighbors=3, sampling_strategy={6:a})
X_train, y_train = undersample.fit_resample(X_train, y_train)

In [13]:
print(sorted(Counter(y_train).items()))

[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 177803), (7, 177803), (8, 177803), (9, 177803)]


In [14]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
import time

time_start = time.time()

gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.005, max_depth=10, random_state=0)
gb_clf.fit(X_train, y_train)

time_end = time.time()
train_time = time_end - time_start
print("Train time:",train_time)

Train time: 4336.719126939774


In [16]:
import time

time_start = time.time()

pred = gb_clf.predict(X_test)

time_end = time.time()
test_time = time_end - time_start
print("test_time:",test_time)

test_time: 0.731665849685669


In [17]:
from sklearn import metrics
from sklearn.metrics import classification_report

target_names = ['Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Normal','Reconnaissance','Shellcode','Worms']
acc = metrics.accuracy_score(y_test,pred) * 100
f1 = metrics.f1_score(y_test, pred,average='weighted')* 100
pre = metrics.precision_score(y_test, pred, labels=None, pos_label=1, average='weighted') * 100 #DR
recall = metrics.recall_score(y_test, pred, labels=None, pos_label=1, average='weighted', sample_weight=None) * 100

print(classification_report(y_test,pred,target_names=target_names))
print("acc:",acc)
print("pre:",pre)
print("DR=recall:",recall)
print("f1:",f1)

                precision    recall  f1-score   support

      Analysis       0.05      0.67      0.09        81
      Backdoor       0.04      0.11      0.06        97
           DoS       0.25      0.13      0.17       742
      Exploits       0.78      0.48      0.59      1969
       Fuzzers       0.41      0.73      0.52       966
       Generic       1.00      0.97      0.99      8598
        Normal       1.00      0.99      0.99     80479
Reconnaissance       0.83      0.81      0.82       597
     Shellcode       0.10      0.55      0.17        64
         Worms       0.00      0.00      0.00         7

      accuracy                           0.96     93600
     macro avg       0.44      0.54      0.44     93600
  weighted avg       0.98      0.96      0.97     93600

acc: 96.2638888888889
pre: 97.9154946475697
DR=recall: 96.2638888888889
f1: 96.91050270018397


In [22]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

FAR = FP / (FP + TN)
print(FAR)

0.5925925925925926


In [18]:
"""#hyperparameter tuning
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope, as_apply

space={ 'max_depth': scope.int(hp.uniform('max_depth', 1, 11)),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
        'subsample':hp.uniform('subsample',0.5,1),
        'ccp_alpha' : hp.loguniform('ccp_alpha', np.log(0.0001), np.log(1)) - 0.0001,
        'n_estimators': scope.int(hp.quniform('n_estimators', 100, 6000, 200)),
      }

# Classifier:
def hyperparameter_tuning(space):
    model = GradientBoostingClassifier(
                              max_depth = space['max_depth'],
                              learning_rate=space['learning_rate'],
                              n_estimators =space['n_estimators'],
                              ccp_alpha = space['ccp_alpha'],
                              subsample=space['subsample'],
                              verbose=2
                              )
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    #change the metric if you like
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=1,
            trials=trials)

print(best)"""

'#hyperparameter tuning\nimport hyperopt\nfrom hyperopt import fmin, tpe, hp, STATUS_OK, Trials\nfrom hyperopt.pyll import scope, as_apply\n\nspace={ \'max_depth\': scope.int(hp.uniform(\'max_depth\', 1, 11)),\n        \'learning_rate\': hp.loguniform(\'learning_rate\', np.log(0.0001), np.log(0.5)) - 0.0001,\n        \'subsample\':hp.uniform(\'subsample\',0.5,1),\n        \'ccp_alpha\' : hp.loguniform(\'ccp_alpha\', np.log(0.0001), np.log(1)) - 0.0001,\n        \'n_estimators\': scope.int(hp.quniform(\'n_estimators\', 100, 6000, 200)),\n      }\n\n# Classifier:\ndef hyperparameter_tuning(space):\n    model = GradientBoostingClassifier(\n                              max_depth = space[\'max_depth\'],\n                              learning_rate=space[\'learning_rate\'],\n                              n_estimators =space[\'n_estimators\'],\n                              ccp_alpha = space[\'ccp_alpha\'],\n                              subsample=space[\'subsample\'],\n                     

In [19]:
#NEWMODELBESTPARAMETERS.

In [20]:
gb_clf = GradientBoostingClassifier(n_estimators=1600, learning_rate=0.005, max_depth=3.34, subsample=0.93,
                                   ccp_alpha=0.0003)

In [21]:
import time

time_start = time.time()

gb_clf.fit(X_train, y_train)

time_end = time.time()
train_time = time_end - time_start
print("Train time:",train_time)

KeyboardInterrupt: 

In [None]:
import time

time_start = time.time()

pred = gb_clf.predict(X_test)

time_end = time.time()
test_time = time_end - time_start
print("test_time:",test_time)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

target_names = ['Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Normal','Reconnaissance','Shellcode','Worms']
acc = metrics.accuracy_score(y_test,pred) * 100
f1 = metrics.f1_score(y_test, pred,average='weighted')* 100
pre = metrics.precision_score(y_test, pred, labels=None, pos_label=1, average='weighted') * 100 #DR
recall = metrics.recall_score(y_test, pred, labels=None, pos_label=1, average='weighted', sample_weight=None) * 100

print(classification_report(y_test,pred,target_names=target_names))
print("acc:",acc)
print("pre:",pre)
print("DR=recall:",recall)
print("f1:",f1)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

FAR = FP / (FP + TN)
print(FAR)