In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# The path to the dataset files

train20_nsl_kdd_dataset_path = "NSL-KDD/KDDTrain+_20Percent.txt"
train_nsl_kdd_dataset_path = "NSL-KDD/KDDTrain+.txt"
test_nsl_kdd_dataset_path = "NSL-KDD/KDDTest+.txt"

In [14]:
# pre-defined features and attack categories from KDD

col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels", "difficulty"])

col_names_true = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"])

attack_dict = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

drop_cols = list(col_names[9:22])

In [4]:
# preprocessing functions

categorical_cols = ['protocol_type', 'service', 'flag']
features_to_normalize = ["duration","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

def preprocess_train(train_df):
    ret_df = train_df.copy()
    
    # Categorize
    for x in categorical_cols:
        ret_df[x] = pd.Categorical(ret_df[x])
        df_dummies = pd.get_dummies(ret_df[x], prefix = x)
        ret_df = pd.concat([ret_df, df_dummies], axis=1)
    ret_df = ret_df.drop(columns = categorical_cols)
    
    # Normalize non-categorical columns
    for x in features_to_normalize:
        if ret_df[x].max() > 0:
            ret_df[x] = ret_df[x] / ret_df[x].max()
    
    return ret_df


def preprocess_test(train_df, test_df):
    ret_df = pd.concat([train_df, test_df])
    
    # Categorize
    for x in categorical_cols:
        ret_df[x] = pd.Categorical(ret_df[x])
        df_dummies = pd.get_dummies(ret_df[x], prefix = x)
        ret_df = pd.concat([ret_df, df_dummies], axis=1)
    ret_df = ret_df.drop(columns = categorical_cols)
    
    ret_df = ret_df[len(train_df):]
    
    # Normalize non-categorical columns
    for x in features_to_normalize:
        if ret_df[x].max() > 0:
            ret_df[x] = ret_df[x] / ret_df[x].max()
    
    return ret_df

def get_xy(data_pcd):
    X = data_pcd.drop(columns = ['labels', 'difficulty'])
    y = data_pcd["labels"].copy()
    for i in range(len(y)):
        if y[i] == "normal":
            y[i] = 0
        else:
            y[i] = 1
    y = y.astype('int')  
    
    return X, y

def get_xy_dos(data_pcd):
    X = data_pcd.drop(columns = ['labels', 'difficulty'])
    y = data_pcd["labels"].copy()
    for i in range(len(y)):
        if attack_dict[y[i]] == "DoS":
            y[i] = 1
        else:
            y[i] = 0
    y = y.astype('int')  
    
    return X, y


In [5]:
# loading and preprocessing the training set and the test set

train_full = pd.read_csv(train_nsl_kdd_dataset_path, header=None)
train_full.columns = col_names
train_full = train_full.drop(columns=drop_cols)

train_full_pcd = preprocess_train(train_full)
train_X, train_y = get_xy(train_full_pcd)
_, train_y_dos = get_xy_dos(train_full_pcd)

test_full = pd.read_csv(test_nsl_kdd_dataset_path, header=None)
test_full.columns = col_names
test_full = test_full.drop(columns=drop_cols)

test_full_pcd = preprocess_test(train_full, test_full) # Use this one for full one-hot encoding
test_X, test_y = get_xy(test_full_pcd)
_, test_y_dos = get_xy_dos(test_full_pcd)

train_X2 = train_X.drop(['count', 'srv_count'], 1)
test_X2 = test_X.drop(['count', 'srv_count'], 1)

In [None]:
col_names_true = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"])

true_dos_path = "dos-1.txt"

true_full = pd.read_csv(true_dos_path, header=None)
true_full.columns = col_names_true
true_full['labels'] = 'pod'
true_full['difficulty'] = 1

true_full_pcd = preprocess_test(train_full, true_full) # Use this one for full one-hot encoding
true_X, true_y = get_xy(true_full_pcd)
_, true_y_dos = get_xy_ddos(true_full_pcd)

true_X2 = true_X.drop(['count', 'srv_count'], 1)

In [26]:
clf = LogisticRegression(max_iter = 1000, random_state=0, solver='lbfgs')
print(clf.fit(train_X, train_y))
print("Train acc:", clf.score(train_X, train_y))
print("Test acc:", clf.score(test_X, test_y))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
Train acc: 0.9720495661768792
Test acc: 0.7663236337828248


In [28]:
rfc = RandomForestClassifier()
print(rfc.fit(train_X, train_y))
print("Train acc:", rfc.score(train_X, train_y))
print("Test acc:", rfc.score(test_X, test_y))



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Train acc: 0.9997221626856548
Test acc: 0.7670777146912704


In [29]:
abc = AdaBoostClassifier()
print(abc.fit(train_X, train_y))
print("Train acc:", abc.score(train_X, train_y))
print("Test acc:", abc.score(test_X, test_y))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Train acc: 0.9817897485969216
Test acc: 0.7720014194464159


In [30]:
gbc = GradientBoostingClassifier()
print(gbc.fit(train_X, train_y))
print("Train acc:", gbc.score(train_X, train_y))
print("Test acc:", gbc.score(test_X, test_y))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Train acc: 0.9931810784850722
Test acc: 0.778832505322924


In [31]:
mlp = MLPClassifier()
print(mlp.fit(train_X, train_y))
print("Train acc:", mlp.score(train_X, train_y))
print("Test acc:", mlp.score(test_X, test_y))

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
Train acc: 0.9959038841656546
Test acc: 0.805624556422995


In [46]:
# and we run each trained model against our captured attack data from the SDN

for i in range(1,8):
    true_dos_path = "data/dos-" + str(i) + ".txt"
    print(true_dos_path, "True Acc")

    true_full = pd.read_csv(true_dos_path, header=None)
    true_full.columns = col_names_true
    true_full['labels'] = 'pod'
    true_full['difficulty'] = 1
    true_full.head()
    true_full_pcd = preprocess_test(train_full, true_full) # Use this one for full one-hot encoding
    true_X, true_y = get_xy(true_full_pcd)
    _, true_y_dos = get_xy_dos(true_full_pcd)


    print("Logistic Regression", clf.score(true_X, true_y))
    print("Random Forest", rfc.score(true_X, true_y))
    print("Adaptive Boosting", abc.score(true_X, true_y))
    print("Gradient Boosting", gbc.score(true_X, true_y))
    print("MLP", mlp.score(true_X, true_y))
    print()

data/dos-1.txt True Acc
Logistic Regression 0.18541809351846086
Random Forest 0.0
Adaptive Boosting 0.6513293070169976
Gradient Boosting 0.9999688686881265
MLP 3.1131311873482345e-05

data/dos-2.txt True Acc
Logistic Regression 0.2012372531849383
Random Forest 1.412389480523149e-05
Adaptive Boosting 0.9812717154882631
Gradient Boosting 1.0
MLP 1.412389480523149e-05

data/dos-3.txt True Acc
Logistic Regression 0.166725272517205
Random Forest 5.023358617571708e-05
Adaptive Boosting 0.9784497915306174
Gradient Boosting 1.0
MLP 2.511679308785854e-05

data/dos-4.txt True Acc
Logistic Regression 0.03539948149893943
Random Forest 9.427292010370022e-05
Adaptive Boosting 0.9875088380862598
Gradient Boosting 1.0
MLP 4.713646005185011e-05

data/dos-5.txt True Acc
Logistic Regression 0.1369303860995249
Random Forest 0.0
Adaptive Boosting 0.6907635001003725
Gradient Boosting 0.9996877300202975
MLP 0.0781790199183637

data/dos-6.txt True Acc
Logistic Regression 0.03026207103702035
Random Forest 0.00

In [25]:
clf2 = LogisticRegression(max_iter = 1000, random_state=0, solver='lbfgs')
print(clf2.fit(train_X, train_y_dos))
print("Train acc:", clf2.score(train_X, train_y_dos))
print("Test acc:", clf2.score(test_X, test_y_dos))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
Train acc: 0.9881958832448223
Test acc: 0.9233942512420156


In [32]:
rfc2 = RandomForestClassifier()
print(rfc2.fit(train_X, train_y_dos))
print("Train acc:", rfc2.score(train_X, train_y_dos))
print("Test acc:", rfc2.score(test_X, test_y_dos))



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Train acc: 0.9999523707461122
Test acc: 0.8952271114265437


In [33]:
abc2 = AdaBoostClassifier()
print(abc2.fit(train_X, train_y_dos))
print("Train acc:", abc2.score(train_X, train_y_dos))
print("Test acc:", abc2.score(test_X, test_y_dos))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Train acc: 0.9991188588030768
Test acc: 0.9068044712562101


In [34]:
gbc2 = GradientBoostingClassifier()
print(gbc2.fit(train_X, train_y_dos))
print("Train acc:", gbc2.score(train_X, train_y_dos))
print("Test acc:", gbc2.score(test_X, test_y_dos))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Train acc: 0.9997618537305613
Test acc: 0.935193399574166


In [35]:
mlp2 = MLPClassifier()
print(mlp2.fit(train_X, train_y_dos))
print("Train acc:", mlp2.score(train_X, train_y_dos))
print("Test acc:", mlp2.score(test_X, test_y_dos))

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
Train acc: 0.9985552459654052
Test acc: 0.9238378282469837


In [45]:
# and we run each trained model against our captured attack data from the SDN

for i in range(1,8):
    true_dos_path = "data/dos-" + str(i) + ".txt"
    print(true_dos_path, "True Acc")

    true_full = pd.read_csv(true_dos_path, header=None)
    true_full.columns = col_names_true
    true_full['labels'] = 'pod'
    true_full['difficulty'] = 1
    true_full.head()
    true_full_pcd = preprocess_test(train_full, true_full) # Use this one for full one-hot encoding
    true_X, true_y = get_xy(true_full_pcd)
    _, true_y_dos = get_xy_dos(true_full_pcd)

    print("Logistic Regression", clf2.score(true_X, true_y_dos))
    print("Random Forest", rfc2.score(true_X, true_y_dos))
    print("Adaptive Boosting", abc2.score(true_X, true_y_dos))
    print("Gradient Boosting", gbc2.score(true_X, true_y_dos))
    print("MLP", mlp2.score(true_X, true_y_dos))
    print()

data/dos-1.txt True Acc
Logistic Regression 0.9997820808168856
Random Forest 0.0
Adaptive Boosting 0.0
Gradient Boosting 0.999937737376253
MLP 0.49698026274827223

data/dos-2.txt True Acc
Logistic Regression 0.8562187508827435
Random Forest 0.6987938193836333
Adaptive Boosting 0.0
Gradient Boosting 0.9999858761051947
MLP 0.0006496991610406486

data/dos-3.txt True Acc
Logistic Regression 0.8349826694127693
Random Forest 0.7206510272768373
Adaptive Boosting 0.0
Gradient Boosting 1.0
MLP 0.001105138895865776

data/dos-4.txt True Acc
Logistic Regression 0.9460287532406316
Random Forest 0.8551967947207165
Adaptive Boosting 0.0
Gradient Boosting 0.9999528635399482
MLP 0.00023568230025925054

data/dos-5.txt True Acc
Logistic Regression 0.8933151919345125
Random Forest 4.460999710035019e-05
Adaptive Boosting 0.0
Gradient Boosting 0.9999776950014498
MLP 0.5114759217540651

data/dos-6.txt True Acc
Logistic Regression 0.08778253360366449
Random Forest 0.06257665640409502
Adaptive Boosting 0.0
Gra

In [37]:
# we save the models into files for later use
# where we can simply do
# loaded_model = joblib.load(filename)

from sklearn.externals import joblib
models_path = "./models/"

_ = joblib.dump(clf, models_path + 'lg_att.pkl', compress=9)
_ = joblib.dump(rfc, models_path + 'rf_att.pkl', compress=9)
_ = joblib.dump(abc, models_path + 'abc_att.pkl', compress=9)
_ = joblib.dump(gbc, models_path + 'gbc_att.pkl', compress=9)
_ = joblib.dump(mlp, models_path + 'mlp_att.pkl', compress=9)

_ = joblib.dump(clf2, models_path + 'lr_dos.pkl', compress=9)
_ = joblib.dump(rfc2, models_path + 'rf_dos.pkl', compress=9)
_ = joblib.dump(abc2, models_path + 'abc_dos.pkl', compress=9)
_ = joblib.dump(gbc2, models_path + 'gbc_dos.pkl', compress=9)
_ = joblib.dump(mlp2, models_path + 'mlp_dos.pkl', compress=9)