In [4]:
"""
In this script, we evaluate the performance of the SVM classifier over the NSL-KDD
Intrusion  Detection Dataset
"""
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from mlxtend.plotting import plot_decision_regions
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix , mean_squared_error
from sklearn import svm
from sklearn.svm import LinearSVC
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format


#Importing the data as panda dataframe
intrusion_data_df = pd.read_csv("C:/datasets/NSLKDD/Clean/KDDTrain.csv", sep=",", header=None)
intrusion_data_df_test = pd.read_csv("C:/datasets/NSLKDD/Clean/KDDTest.csv", sep=",", header=None)

intrusion_data_df.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes",
                             "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
                             "num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells",
                             "num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count",
                             "serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
                             "srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
                             "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
                             "dst_host_rerror_rate","dst_host_srv_rerror_rate","class"]
intrusion_data_df_test.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes",
                             "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
                             "num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells",
                             "num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count",
                             "serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
                             "srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
                             "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
                             "dst_host_rerror_rate","dst_host_srv_rerror_rate","class"]
attack_types_dict = {
        'back': 'dos',
        'buffer_overflow': 'u2r',
        'ftp_write': 'r2l',
        'guess_passwd': 'r2l',
        'imap': 'r2l',
        'ipsweep': 'probe',
        'land': 'dos',
        'loadmodule': 'u2r',
        'multihop': 'r2l',
        'neptune': 'dos',
        'nmap': 'probe',
        'perl': 'u2r',
        'phf':'r2l',
        'pod':'dos',
        'portsweep': 'probe',
        'rootkit': 'u2r',
        'satan': 'probe',
        'smurf': 'dos',
        'spy':'r2l',
        'teardrop': 'dos',
        'warezclient': 'r2l',
        'warezmaster':'r2l',
        'normal': 'normal',
        'unknown':'unknown'
}

binary_mapping = {
    "r2l" : 1,
    "u2r" : 1,
    "probe" : 1,
    "dos" : 1,
    "unknown" : 1,
    "normal" : 0
}

#step 1 mapping
intrusion_data_df["class"] = intrusion_data_df["class"].map(attack_types_dict)
intrusion_data_df_test["class"] = intrusion_data_df_test["class"].map(attack_types_dict)

#setp 2 mapping
intrusion_data_df["class"] = intrusion_data_df["class"].map(binary_mapping)
intrusion_data_df_test["class"] = intrusion_data_df_test["class"].map(binary_mapping)


#Random Permutations
intrusion_data_df = intrusion_data_df.reindex(np.random.permutation(intrusion_data_df.index))
intrusion_data_df_test = intrusion_data_df_test.reindex(np.random.permutation(intrusion_data_df_test.index))

###### Convert Categorical Variables #######

convert_protocol_type = {
    "tcp" : 1,
    "udp" : 2,
    "icmp" : 3
}

convert_flag = {
    "SF": 1,       
    "REJ": 2,        
    "S0": 3,       
    "RSTO" :4,       
    "RSTR": 5,       
    "S3" : 6 ,      
    "SH" : 7,         
    "S1":  8,        
    "S2": 9,         
    "OTH":10,           
    "RSTOS0": 11
}

convert_service = {
    "http":            1,
    "private":         2,
    "domain_u":        3,
    "smtp":            4,
    "ftp_data":        5,
    "eco_i":           6,
    "other":           7,
    "ecr_i":           8,
    "telnet":          9,
    "finger":          10,
    "ftp":             11,
    "auth":            12,
    "Z39_50":          13,
    "uucp":            14,
    "courier":         15,
    "bgp":             16,
    "whois":           17,
    "uucp_path":       18,
    "iso_tsap":        19,
    "time":            20,
    "imap4":           21,
    "nnsp":            22,
    "vmnet" :          23,
    "urp_i":           24,
    "domain":          25,
    "ctf" :            26,
    "csnet_ns":        27,
    "supdup":          28,
    "discard" :        29,
    "http_443":        30,
    "daytime":         31,
    "gopher":          32,
    "efs"  :           33,
    "systat" :         34,
    "link"  :          35,
    "exec":            36,
    "hostnames":       37,
    "name"  :          38,
    "mtp" :            39,
    "echo":            40,
    "klogin" :         41,
    "login":           42,
    "ldap":            43,
    "netbios_dgm":     44,
    "sunrpc" :         45,
    "netbios_ssn":     46,
    "netstat" :        47,
    "netbios_ns":      48,
    "ssh" :            49,
    "kshell" :         50,
    "nntp" :           51,
    "pop_3" :          52,
    "sql_net":         53,
    "IRC" :            54,
    "ntp_u":           55,
    "rje":             56,
    "remote_job":      57,
    "pop_2":           58,
    "X11" :            59,
    "printer":         60,
    "shell" :          61,
    "urh_i" :          62,
    "tim_i" :          63,
    "red_i" :          64,
    "pm_dump":         65,
    "tftp_u" :         66,
    "http_8001" :      67,
    "aol":             68,
    "harvest" :        69,
    "http_2784":       70
}


intrusion_data_df["protocol_type"] = intrusion_data_df["protocol_type"].map(convert_protocol_type)
intrusion_data_df_test["protocol_type"] = intrusion_data_df_test["protocol_type"].map(convert_protocol_type)

intrusion_data_df["flag"] = intrusion_data_df["flag"].map(convert_flag)
intrusion_data_df_test["flag"] = intrusion_data_df_test["flag"].map(convert_flag)

intrusion_data_df["service"] = intrusion_data_df["service"].map(convert_service)
intrusion_data_df_test["service"] = intrusion_data_df_test["service"].map(convert_service)

###########################################


#intrusion_data_df.head()

#Normalization function
def log_normalize(series):
  return series.apply(lambda x:math.log(x+1.0))


def linear_scale(series):
  min_val = series.min()
  max_val = series.max()
  scale = (max_val - min_val) / 2.0
  return series.apply(lambda x:((x - min_val) / scale) - 1.0)

#Function to Process the features / inputs
def preprocess_inputs(intrusion_data_df):
    '''
    Function to Process the features / inputs
    We do not use a for loop as we want to be  able to add or remove features 
    in the experiments
    '''
    processed_inputs = pd.DataFrame()
    
    processed_inputs["duration"] = log_normalize(intrusion_data_df["duration"])
    processed_inputs["protocol_type"] = log_normalize(intrusion_data_df["protocol_type"])
    processed_inputs["flag"] = log_normalize(intrusion_data_df["flag"])
    processed_inputs["service"] = log_normalize(intrusion_data_df["service"])
    processed_inputs["src_bytes"] = log_normalize(intrusion_data_df["src_bytes"])
    processed_inputs["dst_bytes"] = log_normalize(intrusion_data_df["dst_bytes"])
    processed_inputs["land"] = log_normalize(intrusion_data_df["land"])
    processed_inputs["wrong_fragment"] = log_normalize(intrusion_data_df["wrong_fragment"])
    processed_inputs["urgent"] = log_normalize(intrusion_data_df["urgent"])
    processed_inputs["hot"] = log_normalize(intrusion_data_df["hot"])
    processed_inputs["num_failed_logins"] = log_normalize(intrusion_data_df["num_failed_logins"])
    processed_inputs["logged_in"] = log_normalize(intrusion_data_df["logged_in"])
    processed_inputs["num_compromised"] = log_normalize(intrusion_data_df["num_compromised"])
    processed_inputs["root_shell"] = log_normalize(intrusion_data_df["root_shell"])
    processed_inputs["su_attempted"] = log_normalize(intrusion_data_df["su_attempted"])
    processed_inputs["num_root"] = log_normalize(intrusion_data_df["num_root"])
    processed_inputs["num_file_creations"] = log_normalize(intrusion_data_df["num_file_creations"])
    processed_inputs["num_shells"] = log_normalize(intrusion_data_df["num_shells"])
    processed_inputs["num_access_files"] = log_normalize(intrusion_data_df["num_access_files"])
    processed_inputs["num_outbound_cmds"] = log_normalize(intrusion_data_df["num_outbound_cmds"])
    processed_inputs["is_host_login"] = log_normalize(intrusion_data_df["is_host_login"])
    processed_inputs["is_guest_login"] = log_normalize(intrusion_data_df["is_guest_login"])
    processed_inputs["count"] = log_normalize(intrusion_data_df["count"])
    processed_inputs["srv_count"] = log_normalize(intrusion_data_df["srv_count"])
    processed_inputs["serror_rate"] = log_normalize(intrusion_data_df["serror_rate"])
    processed_inputs["srv_serror_rate"] = log_normalize(intrusion_data_df["srv_serror_rate"])
    processed_inputs["rerror_rate"] = log_normalize(intrusion_data_df["rerror_rate"])
    processed_inputs["srv_rerror_rate"] = log_normalize(intrusion_data_df["srv_rerror_rate"])
    processed_inputs["same_srv_rate"] = log_normalize(intrusion_data_df["same_srv_rate"])
    processed_inputs["diff_srv_rate"] = log_normalize(intrusion_data_df["diff_srv_rate"])
    processed_inputs["srv_diff_host_rate"] = log_normalize(intrusion_data_df["srv_diff_host_rate"])
    processed_inputs["dst_host_count"] = log_normalize(intrusion_data_df["dst_host_count"])
    processed_inputs["dst_host_srv_count"] = log_normalize(intrusion_data_df["dst_host_srv_count"])
    processed_inputs["dst_host_same_srv_rate"] = log_normalize(intrusion_data_df["dst_host_same_srv_rate"])
    processed_inputs["dst_host_diff_srv_rate"] = log_normalize(intrusion_data_df["dst_host_diff_srv_rate"])
    processed_inputs["dst_host_same_src_port_rate"] = log_normalize(intrusion_data_df["dst_host_same_src_port_rate"])
    processed_inputs["dst_host_srv_diff_host_rate"] = log_normalize(intrusion_data_df["dst_host_srv_diff_host_rate"])
    processed_inputs["dst_host_serror_rate"] = log_normalize(intrusion_data_df["dst_host_serror_rate"])
    processed_inputs["dst_host_srv_serror_rate"] = log_normalize(intrusion_data_df["dst_host_srv_serror_rate"])
    processed_inputs["dst_host_rerror_rate"] = log_normalize(intrusion_data_df["dst_host_rerror_rate"])
    processed_inputs["dst_host_srv_rerror_rate"] = log_normalize(intrusion_data_df["dst_host_srv_rerror_rate"])


    return processed_inputs
    
#Function to Process Targets
def preprocess_targets(intrusion_data_df):
    output_targets = pd.DataFrame()
    output_targets["class"] = intrusion_data_df["class"]
    return output_targets
 
#Select the first  for training
training_inputs = preprocess_inputs(intrusion_data_df.head(94480))
training_targets =preprocess_targets(intrusion_data_df.head(94480))

#Select data for validation
validation_inputs = preprocess_inputs(intrusion_data_df.tail(31493))
validation_targets =preprocess_targets(intrusion_data_df.tail(31493))


#Select the last  for testing
test_inputs = preprocess_inputs(intrusion_data_df_test)
test_targets = preprocess_targets(intrusion_data_df_test)

#Replacing att the inf and -inf to nan;  replace nan with 0s
true_test_targets_0 = test_targets.replace([np.inf, -np.inf], np.nan).fillna(0)
true_test_targets = pd.DataFrame(data = true_test_targets_0, dtype=np.int8)


#SVM Classifier Definition  - Linear SVC 
C=1.0 #SVM regularization parameter
#svm_classifier = svm.SVC(kernel='linear',gamma='scale', C=C)  #Binary Classifier
svm_classifier = LinearSVC(random_state=0, tol=1e-5,max_iter=5000)
svm_classifier.fit(training_inputs,training_targets.values.ravel()) 

#Predictions on validation data
val_predictions = svm_classifier.predict(validation_inputs)
val_accuracy = accuracy_score(validation_targets,val_predictions )

#Predictions on test data
test_predictions = svm_classifier.predict(test_inputs)
test_accuracy = accuracy_score(true_test_targets,test_predictions)

print("The accuracy of the SVM on validation data is "+ str(val_accuracy *100))
print("The accuracy of the SVM on test data is "+ str(test_accuracy *100))


#PLOTTING


The accuracy of the SVM on validation data is 97.00568380274981
The accuracy of the SVM on test data is 83.83090094486093


