### Author:  Sydney M. Kasongo

#### The Scikit-Learn GradientBoostingClassifier is used to generate the feature importances
#### for the UNSW-NB15 intrusion detection datasets
#### The dataset can be downloaded from: https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/

### Import all the necessary Libraries 

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier


### Import the UNSW-NB15 Dataset

In [None]:
# Absolute locations of the UNSW-NB15 training and testing partitions (CSV files).
_train_csv_path = 'C:/Users/sydne/Documents/Sydney/UJ/datasets/UNSW_NB15_TRAINING.csv'
_test_csv_path = 'C:/Users/sydne/Documents/Sydney/UJ/datasets/UNSW_NB15_TEST.csv'

# Load each partition into its own DataFrame; fields are comma-separated.
unsw_b15_train = pd.read_csv(_train_csv_path, sep=",")
unsw_b15_test = pd.read_csv(_test_csv_path, sep=",")

### Perform all Necessary Mappings on the UNSW-NB15 Dataset

In [None]:
# Integer code for each 'attack_cat' value: the code is the category's
# position in the tuple below (so 'Normal' -> 0 and 'Worms' -> 9).
class_mapping = {
    category: code
    for code, category in enumerate((
        'Normal',
        'Generic',
        'Exploits',
        'Fuzzers',
        'DoS',
        'Reconnaissance',
        'Analysis',
        'Backdoor',
        'Shellcode',
        'Worms',
    ))
}
# Integer codes applied to the 'state' column of the training frame.
# Codes run 1..9 in the order the state names are listed here.
state_mapping = dict(zip(
    ('INT', 'FIN', 'CON', 'REQ', 'RST', 'ECO', 'PAR', 'no', 'URN'),
    range(1, 10),
))
# Integer codes applied to the 'state' column of the testing frame.
# Names shared with state_mapping carry the same code; 'ACC' and 'CLO'
# appear only here and use codes 10 and 11.
state_test_mapping = dict([
    ('FIN', 2),
    ('INT', 1),
    ('CON', 3),
    ('REQ', 4),
    ('ACC', 10),
    ('RST', 5),
    ('CLO', 11),
])

# Integer codes for the 'service' column. Codes start at 11 and increase by
# one in the order the service names are listed ('-' -> 11, 'radius' -> 23).
service_mapping = {
    service: code
    for code, service in enumerate(
        (
            '-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh',
            'pop3', 'dhcp', 'snmp', 'ssl', 'irc', 'radius',
        ),
        start=11,
    )
}

# Integer codes for the 'proto' column. A protocol's code is its 1-based
# position in the tuple below; each row holds five names and the trailing
# comment gives the codes that row covers.
proto_mapping = {
    protocol: code
    for code, protocol in enumerate(
        (
            'tcp', 'udp', 'unas', 'arp', 'ospf',                          # 1-5
            'sctp', 'any', 'gre', 'ipv6', 'sun-nd',                       # 6-10
            'swipe', 'pim', 'mobile', 'rsvp', 'sep',                      # 11-15
            'ib', 'sprite-rpc', 'ttp', 'smp', 'visa',                     # 16-20
            'sps', 'vines', 'ipv6-frag', 'ipip', 'merit-inp',             # 21-25
            'idpr', 'xtp', 'il', 'iatp', 'scps',                          # 26-30
            'gmtp', 'pnni', 'pvp', 'mfe-nsp', 'vmtp',                     # 31-35
            'snp', 'ptp', 'vrrp', 'l2tp', 'sm',                           # 36-40
            'wsn', 'qnx', 'ipv6-opts', 'zero', 'mtp',                     # 41-45
            'tp++', 'pipe', 'secure-vmtp', 'ipcomp', 'ipx-n-ip',          # 46-50
            'uti', 'ifmp', 'sat-mon', 'sdrp', 'ippc',                     # 51-55
            'bna', 'idpr-cmtp', 'encap', 'wb-mon', 'idrp',                # 56-60
            'crudp', 'fc', 'tlsp', 'wb-expak', 'larp',                    # 61-65
            'ddx', 'dgp', 'compaq-peer', 'rvd', 'fire',                   # 66-70
            'a/n', 'ipv6-route', 'eigrp', 'iso-ip', 'mhrp',               # 71-75
            'cftp', 'pri-enc', 'micp', 'srp', 'kryptolan',                # 76-80
            'ipv6-no', 'narp', 'ipcv', 'pgm', 'isis',                     # 81-85
            'ax.25', 'cpnx', '3pc', 'tcf', 'stp',                         # 86-90
            'i-nlsp', 'aris', 'cphb', 'skip', 'etherip',                  # 91-95
            'br-sat-mon', 'ddp', 'sccopmce', 'aes-sp3-d', 'nsfnet-igp',   # 96-100
            'sat-expak', 'iplt', 'leaf-2', 'dcn', 'pup',                  # 101-105
            'nvp', 'trunk-1', 'cbt', 'trunk-2', 'crtp',                   # 106-110
            'leaf-1', 'chaos', 'igp', 'iso-tp4', 'ggp',                   # 111-115
            'emcon', 'xnet', 'ip', 'ipnip', 'st2',                        # 116-120
            'mux', 'irtp', 'prm', 'xns-idp', 'hmp',                       # 121-125
            'egp', 'rdp', 'netblt', 'bbn-rcc', 'argus',                   # 126-130
            'igmp', 'icmp', 'rtp',                                        # 131-133
        ),
        start=1,
    )
}


# Categorical columns to integer-encode, in processing order. Each row is
# (column, codes for the training frame, codes for the testing frame); only
# 'state' uses a different dictionary for the two frames.
_column_encoders = (
    ("attack_cat", class_mapping, class_mapping),
    ("state", state_mapping, state_test_mapping),
    ("service", service_mapping, service_mapping),
    ("proto", proto_mapping, proto_mapping),
)

# Replace every categorical value by its integer code via Series.map().
for _column, _train_codes, _test_codes in _column_encoders:
    unsw_b15_train[_column] = unsw_b15_train[_column].map(_train_codes)
    unsw_b15_test[_column] = unsw_b15_test[_column].map(_test_codes)

# Binary classification only needs 'label', so the multi-class
# 'attack_cat' column is removed from both frames.
unsw_b15_train = unsw_b15_train.drop(columns=['attack_cat'])
unsw_b15_test = unsw_b15_test.drop(columns=['attack_cat'])

# Training inputs are everything except the target and the row identifier;
# the training target is the 'label' column.
y_unsw_b15_train = unsw_b15_train['label']
X_unsw_b15_train = unsw_b15_train.drop(columns=['label', 'id'])

### Generate the Training and Validation sets using train_test_split()

In [None]:
# Carve a validation subset out of the training data. random_state is fixed
# so the same rows land in each subset on every run; the split proportions
# are left at train_test_split's defaults.
X_train, X_val, y_train, y_val = train_test_split(
    X_unsw_b15_train,
    y_unsw_b15_train,
    random_state=10,
)

# The testing frame is split into target and inputs the same way as the
# training frame: 'label' is the target, 'label' and 'id' are not inputs.
y_test = unsw_b15_test['label']
X_test = unsw_b15_test.drop(columns=['label', 'id'])

### Instantiate the GradientBoostingClassifier and Generate the feature importances

In [None]:
# Gradient boosting classifier (scikit-learn's GradientBoostingClassifier,
# kept under the historical variable name 'xgboost'), fitted on the
# training subset with a fixed seed.
xgboost = GradientBoostingClassifier(random_state=0)
xgboost.fit(X_train, y_train)

# Feature names and their importance scores, in matching column order.
col_names = list(X_train.columns)
importance_scores = list(xgboost.feature_importances_)

# How many of the top-ranked features to report at the end; adjust to taste.
limit = 19

# Filled later with feature names in ranked order.
selected_columns = []

# One [feature_name, score] pair per input column.
score_list = [
    [column_name, importance]
    for column_name, importance in zip(col_names, importance_scores)
]

def sortBySecond(index):
    """Sort key: return the item at position 1 of the sequence ``index``."""
    second_item = index[1]
    return second_item
# Rank the [feature_name, score] pairs from most to least important.
score_list.sort(key=sortBySecond, reverse=True)

# Walk the whole ranking rather than a fixed 42 entries: a hard-coded count
# raises IndexError as soon as the feature matrix has fewer than 42 columns
# and silently skips any columns beyond the 42nd. With exactly 42 features
# the output is the same as before.
for ranked_entry in score_list:
    selected_columns.append(ranked_entry[0])
    print(ranked_entry)

# Names of the 'limit' highest-ranked features (fewer if the data has fewer).
print(selected_columns[0:limit])