In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# The path to the dataset files

train20_nsl_kdd_dataset_path = "NSL-KDD/KDDTrain+_20Percent.txt"
train_nsl_kdd_dataset_path = "NSL-KDD/KDDTrain+.txt"
test_nsl_kdd_dataset_path = "NSL-KDD/KDDTest+.txt"

In [3]:
# pre-defined features and attack categories from KDD

col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels", "difficulty"])

col_names_true = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"])

attack_dict = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

drop_cols = list(col_names[9:22])

In [4]:
# preprocessing functions

categorical_cols = ['protocol_type', 'service', 'flag']
features_to_normalize = ["duration","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

def preprocess_train(train_df):
    ret_df = train_df.copy()
    
    # Categorize
    for x in categorical_cols:
        ret_df[x] = pd.Categorical(ret_df[x])
        df_dummies = pd.get_dummies(ret_df[x], prefix = x)
        ret_df = pd.concat([ret_df, df_dummies], axis=1)
    ret_df = ret_df.drop(columns = categorical_cols)
    
    # Normalize non-categorical columns
    for x in features_to_normalize:
        if ret_df[x].max() > 0:
            ret_df[x] = ret_df[x] / ret_df[x].max()
    
    return ret_df


def preprocess_test(train_df, test_df):
    ret_df = pd.concat([train_df, test_df])
    
    # Categorize
    for x in categorical_cols:
        ret_df[x] = pd.Categorical(ret_df[x])
        df_dummies = pd.get_dummies(ret_df[x], prefix = x)
        ret_df = pd.concat([ret_df, df_dummies], axis=1)
    ret_df = ret_df.drop(columns = categorical_cols)
    
    ret_df = ret_df[len(train_df):]
    
    # Normalize non-categorical columns
    for x in features_to_normalize:
        if ret_df[x].max() > 0:
            ret_df[x] = ret_df[x] / ret_df[x].max()
    
    return ret_df

def get_xy(data_pcd):
    X = data_pcd.drop(columns = ['labels', 'difficulty'])
    y = data_pcd["labels"].copy()
    for i in range(len(y)):
        if y[i] == "normal":
            y[i] = 0
        else:
            y[i] = 1
    y = y.astype('int')  
    
    return X, y

def get_xy_dos(data_pcd):
    X = data_pcd.drop(columns = ['labels', 'difficulty'])
    y = data_pcd["labels"].copy()
    for i in range(len(y)):
        if attack_dict[y[i]] == "DoS":
            y[i] = 1
        else:
            y[i] = 0
    y = y.astype('int')  
    
    return X, y

# loading and preprocessing the training set and the test set

train_full = pd.read_csv(train_nsl_kdd_dataset_path, header=None)
train_full.columns = col_names
train_full = train_full.drop(columns=drop_cols)

train_full_pcd = preprocess_train(train_full)
train_X, train_y = get_xy(train_full_pcd)
_, train_y_dos = get_xy_dos(train_full_pcd)

test_full = pd.read_csv(test_nsl_kdd_dataset_path, header=None)
test_full.columns = col_names
test_full = test_full.drop(columns=drop_cols)

test_full_pcd = preprocess_test(train_full, test_full) # Use this one for full one-hot encoding
test_X, test_y = get_xy(test_full_pcd)
_, test_y_dos = get_xy_dos(test_full_pcd)

In [5]:
# Train a GradBoost model using attack-or-not labels

gbc = GradientBoostingClassifier()
print(gbc.fit(train_X, train_y))
print("Train acc:", gbc.score(train_X, train_y))
print("Test acc:", gbc.score(test_X, test_y))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Train acc: 0.9931810784850722
Test acc: 0.77861071682044


In [6]:
# Now we verify the model's accuracy by feeding in live-captured data, which are preprocessed in the same way as the training/test sets.

for i in range(1,8):
    true_dos_path = "data/dos-" + str(i) + ".txt"
    print(true_dos_path, "True Acc")

    true_full = pd.read_csv(true_dos_path, header=None)
    true_full.columns = col_names_true
    true_full['labels'] = 'pod'
    true_full['difficulty'] = 1
    true_full_pcd = preprocess_test(train_full, true_full) # Use this one for full one-hot encoding
    true_X, true_y = get_xy(true_full_pcd)
    _, true_y_dos = get_xy_dos(true_full_pcd)

    print("Gradient Boosting", gbc.score(true_X, true_y))
    print()

data/dos-1.txt True Acc
Gradient Boosting 0.9999688686881265

data/dos-2.txt True Acc
Gradient Boosting 1.0

data/dos-3.txt True Acc
Gradient Boosting 1.0

data/dos-4.txt True Acc
Gradient Boosting 1.0

data/dos-5.txt True Acc
Gradient Boosting 0.9996877300202975

data/dos-6.txt True Acc
Gradient Boosting 1.0

data/dos-7.txt True Acc
Gradient Boosting 0.9999822964982473



In [None]:
# Train a GradBoost model using DoS-or-not labels

gbc2 = GradientBoostingClassifier()
print(gbc2.fit(train_X, train_y_dos))
print("Train acc:", gbc2.score(train_X, train_y_dos))
print("Test acc:", gbc2.score(test_X, test_y_dos))

In [None]:
for i in range(1,8):
    true_dos_path = "data/dos-" + str(i) + ".txt"
    print(true_dos_path, "True Accuracy")

    true_full = pd.read_csv(true_dos_path, header=None)
    true_full.columns = col_names_true
    true_full['labels'] = 'pod'
    true_full['difficulty'] = 1
    true_full_pcd = preprocess_test(train_full, true_full) # Use this one for full one-hot encoding
    true_X, true_y = get_xy(true_full_pcd)
    _, true_y_dos = get_xy_dos(true_full_pcd)

    print("Gradient Boosting", gbc2.score(true_X, true_y_dos))
    print()

In [7]:
# Save the models in binary files
from sklearn.externals import joblib
models_path = "./models/"

_ = joblib.dump(gbc, models_path + 'gbc_att_demo.pkl', compress=9)
# _ = joblib.dump(gbc2, models_path + 'gbc_dos_demo.pkl', compress=9)


In [8]:
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
loaded_model = joblib.load("./models/gbc_att_demo.pkl")
print("Accuracy: ", loaded_model.score(test_X, test_y))
print("Actual outputs",loaded_model.predict(test_X[0:2]))

Accuracy:  0.77861071682044
Actual outputs [1 1]
