In [9]:
"""This script applies the Extra-Trees model to the UNSW-NB15 dataset
in order to extract the importance of each feature with respect to the class."""
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from mlxtend.plotting import plot_decision_regions
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix , mean_squared_error
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
pd.options.display.max_rows = 10
from sklearn import preprocessing

# Pandas display settings: show at most 10 rows when printing a frame and
# render floating-point values with three decimals.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.3f}'.format)

# Load the UNSW-NB15 training CSV into a pandas DataFrame.
# NOTE: only the training file is read here, from a machine-specific
# absolute path.
unswnb15_train = pd.read_csv(
    "C:/Users/SYDNEY/Documents/Sydney/PhD/Tutorials/Python/UNSW-NB15/data/UNSW_NB15_TRAINING.csv",
    sep=",",
)

# Attack category name -> integer class code.
# Codes follow the position in the tuple below, starting at 0 for 'Normal'.
class_mapping = {
    category: code
    for code, category in enumerate((
        'Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS',
        'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms',
    ))
}
# Value of the 'state' column -> integer code.
# Codes follow the position in the tuple below, starting at 1.
state_mapping = {
    state: code
    for code, state in enumerate(
        ('INT', 'FIN', 'CON', 'REQ', 'RST', 'ECO', 'PAR', 'no', 'URN'),
        start=1,
    )
}
# Alternative 'state' -> integer code table that adds 'ACC' (10) and 'CLO' (11).
# Presumably intended for the test split (not used in this section); the codes
# for FIN/INT/CON/REQ/RST are the same as in state_mapping.
state_test_mapping = dict([
    ('FIN', 2), ('INT', 1), ('CON', 3), ('REQ', 4),
    ('ACC', 10), ('RST', 5), ('CLO', 11),
])

# Value of the 'service' column -> integer code.
# Codes follow the position in the tuple below, starting at 11 for '-'.
service_mapping = {
    service: code
    for code, service in enumerate(
        (
            '-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh',
            'pop3', 'dhcp', 'snmp', 'ssl', 'irc', 'radius',
        ),
        start=11,
    )
}

# Value of the 'proto' column -> integer code.
# Codes follow the position in the tuple below, starting at 1 for 'tcp' and
# ending at 133 for 'rtp'. Each row holds ten names; the trailing comment gives
# the code range of that row. Do not reorder: the position IS the code.
proto_mapping = {
    protocol: code
    for code, protocol in enumerate(
        (
            'tcp', 'udp', 'unas', 'arp', 'ospf', 'sctp', 'any', 'gre', 'ipv6', 'sun-nd',  # 1-10
            'swipe', 'pim', 'mobile', 'rsvp', 'sep', 'ib', 'sprite-rpc', 'ttp', 'smp', 'visa',  # 11-20
            'sps', 'vines', 'ipv6-frag', 'ipip', 'merit-inp', 'idpr', 'xtp', 'il', 'iatp', 'scps',  # 21-30
            'gmtp', 'pnni', 'pvp', 'mfe-nsp', 'vmtp', 'snp', 'ptp', 'vrrp', 'l2tp', 'sm',  # 31-40
            'wsn', 'qnx', 'ipv6-opts', 'zero', 'mtp', 'tp++', 'pipe', 'secure-vmtp', 'ipcomp', 'ipx-n-ip',  # 41-50
            'uti', 'ifmp', 'sat-mon', 'sdrp', 'ippc', 'bna', 'idpr-cmtp', 'encap', 'wb-mon', 'idrp',  # 51-60
            'crudp', 'fc', 'tlsp', 'wb-expak', 'larp', 'ddx', 'dgp', 'compaq-peer', 'rvd', 'fire',  # 61-70
            'a/n', 'ipv6-route', 'eigrp', 'iso-ip', 'mhrp', 'cftp', 'pri-enc', 'micp', 'srp', 'kryptolan',  # 71-80
            'ipv6-no', 'narp', 'ipcv', 'pgm', 'isis', 'ax.25', 'cpnx', '3pc', 'tcf', 'stp',  # 81-90
            'i-nlsp', 'aris', 'cphb', 'skip', 'etherip', 'br-sat-mon', 'ddp', 'sccopmce', 'aes-sp3-d', 'nsfnet-igp',  # 91-100
            'sat-expak', 'iplt', 'leaf-2', 'dcn', 'pup', 'nvp', 'trunk-1', 'cbt', 'trunk-2', 'crtp',  # 101-110
            'leaf-1', 'chaos', 'igp', 'iso-tp4', 'ggp', 'emcon', 'xnet', 'ip', 'ipnip', 'st2',  # 111-120
            'mux', 'irtp', 'prm', 'xns-idp', 'hmp', 'egp', 'rdp', 'netblt', 'bbn-rcc', 'argus',  # 121-130
            'igmp', 'icmp', 'rtp',  # 131-133
        ),
        start=1,
    )
}

# Encode the categorical columns as integers: each column is replaced by the
# result of looking its values up in the matching dict with Series.map.
# Values that are not keys of the dict come back as NaN.
for _column, _codes in (
        ("attack_cat", class_mapping),
        ("state", state_mapping),
        ("service", service_mapping),
        ("proto", proto_mapping),
):
    unswnb15_train[_column] = unswnb15_train[_column].map(_codes)



#Normalization functions
def log_normalize(series):
    """Return the element-wise natural logarithm of ``series + 1``.

    The computation is vectorized with ``np.log1p``, which also stays accurate
    for values very close to zero (where ``x + 1.0`` would round to ``1.0``).

    Args:
        series: pandas Series of numeric values, each greater than -1.

    Returns:
        A Series with the same index as ``series`` holding ``log(1 + x)``.

    Raises:
        ValueError: if any value is less than or equal to -1, for which the
            logarithm is undefined.
    """
    # np.log1p would only warn and yield -inf/NaN here; raise instead so that
    # out-of-domain input still fails loudly.
    if (series <= -1).any():
        raise ValueError("math domain error")
    return np.log1p(series)

def process_features(dataset, n_features=41):
    """Retrieve and log-normalize the leading input columns of ``dataset``.

    Args:
        dataset: pandas DataFrame; its first ``n_features`` columns are taken
            as the model inputs and must be accepted by ``log_normalize``.
        n_features: number of leading columns to keep. Defaults to 41, the
            value this function previously hard-coded.

    Returns:
        A new DataFrame with the same index as ``dataset`` and one
        log-normalized column per selected input column, in the original
        column order.

    Raises:
        ValueError: if ``n_features`` is negative.
        IndexError: if ``dataset`` has fewer than ``n_features`` columns.
    """
    if n_features < 0:
        raise ValueError("n_features must be non-negative, got %d" % n_features)

    # Retrieve column names
    col_names = list(dataset)
    # Fail before doing any work rather than part-way through the loop.
    if n_features > len(col_names):
        raise IndexError(
            "dataset has %d columns but %d feature columns were requested"
            % (len(col_names), n_features)
        )

    # Start from a frame that already carries the dataset's index so the
    # result is well-formed even when no columns are selected.
    processed_features = pd.DataFrame(index=dataset.index)
    for name in col_names[:n_features]:
        processed_features[name] = log_normalize(dataset[name])

    return processed_features

def process_targets(dataset):
    """Extract the target column of ``dataset``.

    Returns a new single-column DataFrame named 'label' that carries the
    values and index of ``dataset['label']``.
    """
    return pd.DataFrame({'label': dataset['label']})

# Shuffle the rows: draw a random ordering of the index labels, then reindex
# the frame by that ordering.
shuffled_labels = np.random.permutation(unswnb15_train.index)
unswnb15_train = unswnb15_train.reindex(shuffled_labels)

# Split the shuffled frame into the normalized inputs and the target column.
training_data_features = process_features(unswnb15_train)
training_data_labels = process_targets(unswnb15_train)


# Fit an Extra-Trees ensemble of 50 trees on the normalized features.
clf = ExtraTreesClassifier(n_estimators=50).fit(
    training_data_features,
    training_data_labels.values.ravel(),  # flatten the label frame to a 1-D array
)

# Importance of every feature according to the fitted ensemble, and the
# standard deviation of that importance across the individual trees.
scores = clf.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in clf.estimators_]
)
std = per_tree_importances.std(axis=0)

# Column positions ordered from most to least important.
indices = np.argsort(scores)[::-1]

# Print the feature ranking: rank, column position, importance score.
print("Feature ranking:")

for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, scores[feature_idx]))


Feature ranking:
1. feature 31 (0.125573)
2. feature 10 (0.094520)
3. feature 9 (0.076320)
4. feature 12 (0.070620)
5. feature 8 (0.041252)
6. feature 15 (0.039846)
7. feature 11 (0.037874)
8. feature 40 (0.030439)
9. feature 30 (0.029490)
10. feature 3 (0.026663)
11. feature 27 (0.025744)
12. feature 6 (0.025600)
13. feature 7 (0.024240)
14. feature 26 (0.021015)
15. feature 35 (0.019919)
16. feature 5 (0.019777)
17. feature 19 (0.019592)
18. feature 16 (0.019439)
19. feature 34 (0.019201)
20. feature 24 (0.017525)
21. feature 23 (0.017200)
22. feature 18 (0.017169)
23. feature 17 (0.016747)
24. feature 20 (0.016511)
25. feature 39 (0.015816)
26. feature 41 (0.015645)
27. feature 32 (0.012183)
28. feature 0 (0.011640)
29. feature 25 (0.011529)
30. feature 21 (0.011115)
31. feature 13 (0.010566)
32. feature 33 (0.010469)
33. feature 22 (0.009855)
34. feature 1 (0.009437)
35. feature 14 (0.008576)
36. feature 2 (0.006838)
37. feature 4 (0.005124)
38. feature 28 (0.003523)
39. feature 38