In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
#np.seterr(divide='print', invalid='print')
from xgboost import XGBClassifier
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import IPython
from plotly.offline import init_notebook_mode
%matplotlib inline
import plotly.offline as offline
from sklearn.calibration import CalibratedClassifierCV
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import pickle

from common_functions import *

In [2]:
# Restore the pickled label encoder; later cells use its `classes_`,
# `transform` and `inverse_transform` to map between class names and indices.
# NOTE: pickle can execute arbitrary code - only load artifacts you produced.
with open("y_encoder", mode="rb") as encoder_file:
    y_encoder = pickle.load(encoder_file)

In [3]:
class OvrXgb:
    """One-vs-rest (OVR) multi-class classifier built from calibrated XGBoost models.

    One binary model is trained per class ("this class" vs. "everything else").
    At prediction time the per-class positive probabilities are multiplied by
    optional per-class weights and combined into a multi-class decision.

    Attributes:
        ovr_models: dict mapping class id -> fitted binary model exposing
            ``predict`` and ``predict_proba``.
        weight: per-class multiplier applied to the positive probability.
            Classes without an entry are treated as weight 1.
    """

    def __init__(self, n_classes=10):
        """Create an empty ensemble with unit weights for ``n_classes`` classes.

        Args:
            n_classes: number of class ids (0..n_classes-1) that receive an
                explicit default weight of 1. Defaults to 10, the value that
                was previously hard-coded.
        """
        self.ovr_models = dict()
        self.weight = {class_id: 1 for class_id in range(n_classes)}

    def set_weight(self, new_weight):
        """Replace the per-class probability weights.

        Args:
            new_weight: container indexable by class id (e.g. dict or list).
        """
        self.weight = new_weight

    def fit(self, train_x, train_y, val_x, val_y, balance=False):
        """Train one calibrated binary model per class found in ``train_y``.

        Args:
            train_x: training features, indexable by a list of row positions.
            train_y: training class ids, one per row of ``train_x``.
            val_x: validation features (used only for the printed accuracy).
            val_y: validation class ids.
            balance: if True, negatives are sub-sampled per class (see
                ``__fit_class``); if False all rows are used.
        """
        ovr_classes = np.unique(train_y)
        print('Training...')
        for ovr_class in ovr_classes:
            self.__fit_class(ovr_class, train_x, train_y, val_x, val_y, balance)

    def __fit_class(self, ovr_class, train_x, train_y, val_x, val_y, balance):
        """Fit, store and report the binary model for a single class.

        Rows of ``ovr_class`` are labelled 1, every other row 0. With
        ``balance=True`` a negative row is kept only while the number of
        positives seen so far is at least the number of negatives kept so far,
        so the selected subset depends on the row order of ``train_y``.

        The accuracy report reads the notebook-level ``y_encoder`` to print the
        class name.
        """
        val_y_ovr = np.array([1 if y == ovr_class else 0 for y in val_y])

        train_mask = []    # row positions kept for this binary problem
        train_y_ovr = []   # 0/1 targets aligned with train_mask
        ovr_class_count = 0
        non_ovr_class_count = 0
        for index, y in enumerate(train_y):
            if y == ovr_class:
                train_mask.append(index)
                train_y_ovr.append(1)
                ovr_class_count += 1
            elif (not balance) or ovr_class_count >= non_ovr_class_count:
                train_mask.append(index)
                train_y_ovr.append(0)
                non_ovr_class_count += 1

        train_x_ovr = train_x[train_mask]

        # CalibratedClassifierCV (default cv) clones the estimator and fits the
        # clones itself, so fitting the XGBClassifier beforehand was wasted work.
        model = CalibratedClassifierCV(XGBClassifier(nthread=4), method="sigmoid")
        model.fit(train_x_ovr, train_y_ovr)

        self.ovr_models[ovr_class] = model  # store OVR model

        train_ovr_detection_acc = self.__get_acc(train_y_ovr, [int(np.round(x)) for x in model.predict(train_x_ovr)])
        val_ovr_detection_acc = self.__get_acc(val_y_ovr, [int(np.round(x)) for x in model.predict(val_x)])

        print('Accuracy for class "' + str(y_encoder.classes_[ovr_class]) + '" - Train acc.: ' + str(train_ovr_detection_acc) + ' %, Val acc.: ' + str(val_ovr_detection_acc) + ' %')

    def __get_acc(self, true_y, pred_y):
        """Return the accuracy as a percentage rounded to two decimals."""
        return round(accuracy_score(true_y, pred_y) * 100, 2)

    def __get_auc(self, true_y, pred_y):
        """Return the ROC AUC as a percentage rounded to two decimals."""
        return round(roc_auc_score(true_y, pred_y) * 100, 2)

    def __model_classes(self):
        """Return the class ids that have a stored model, in ascending order."""
        if not self.ovr_models:
            raise ValueError("OvrXgb has no models; call fit() or assign ovr_models first")
        return sorted(self.ovr_models)

    def __class_weight(self, ovr_class):
        """Return the weight for ``ovr_class``, defaulting to 1 when absent."""
        try:
            return self.weight[ovr_class]
        except (KeyError, IndexError, TypeError):
            return 1

    def __collect_ovr_outputs(self, test_x):
        """Run every stored binary model on ``test_x``.

        Returns:
            (classes, ovr_pred, ovr_proba) where ``classes`` is the ordered list
            of class ids and the two arrays have one column per class:
            hard 0/1 predictions and weighted positive probabilities.
        """
        classes = self.__model_classes()
        pred_columns = []
        proba_columns = []
        for ovr_class in classes:
            pred, proba = self.__predict_using_ovr(ovr_class, test_x)
            pred_columns.append(pred)
            proba_columns.append(proba * self.__class_weight(ovr_class))
        return classes, np.hstack(pred_columns), np.hstack(proba_columns)

    def predict(self, test_x):
        """Predict a class id for every row of ``test_x``.

        If exactly one binary model votes 1 for a row, that model's class is
        returned; otherwise the class with the largest weighted probability is
        returned.

        Returns:
            1-D numpy array of class ids (the keys of ``ovr_models``).

        Raises:
            ValueError: if no models are available.
        """
        classes, ovr_pred, ovr_proba = self.__collect_ovr_outputs(test_x)

        single_vote = (ovr_pred.astype(int) == 1).sum(axis=1) == 1
        column = np.where(single_vote,
                          np.argmax(ovr_pred, axis=1),
                          np.argmax(ovr_proba, axis=1))
        # Map column positions back to class ids so non-contiguous ids work too.
        return np.asarray(classes)[column]

    def predict_proba(self, test_x):
        """Return the weighted positive probability of every class.

        Returns:
            Array of shape (n_rows, n_models); columns follow the ascending
            order of the class ids in ``ovr_models``. Rows are NOT normalised
            to sum to 1.

        Raises:
            ValueError: if no models are available.
        """
        _, _, ovr_proba = self.__collect_ovr_outputs(test_x)
        return ovr_proba

    def __predict_using_ovr(self, ovr_class, test_x):
        """Return (hard prediction, positive-class probability) as column vectors."""
        ovr_model = self.ovr_models[ovr_class]

        pred = ovr_model.predict(test_x)
        # Column 1 of predict_proba is the probability of the positive label.
        proba = ovr_model.predict_proba(test_x)[:, 1]
        return pred.reshape(-1, 1), proba.reshape(-1, 1)


In [4]:
def run_pipeline_1(X, threshold=0.980280):
    """Encode raw records and predict their attack category.

    Stages:
      1. Build a DataFrame with the 43 expected columns and drop ``id``.
      2. Encode features with the pickled encoders/standardizers via the
         ``common_functions`` helpers.
      3. Run the pickled random forest and threshold its column-1 probability
         into a 0/1 flag, which is appended as an extra feature column.
      4. Run the pickled one-vs-rest XGBoost models on the augmented features.

    Args:
        X: records as a list of rows or a DataFrame carrying the columns
            listed in ``feature_columns`` below.
        threshold: cut-off applied to the random forest's column-1
            probability; values below it become 0, everything else 1.
            Defaults to the best threshold found in the previous notebook.

    Returns:
        (y_pred, y_proba): ``y_pred`` has shape (n_rows, 1) and holds the
        argmax column of ``y_proba``; ``y_proba`` is the per-class score
        matrix returned by ``OvrXgb.predict_proba``.

    Note:
        All artifacts are read with ``pickle.load``, which can execute
        arbitrary code - only use files produced by the training notebooks.
    """
    def _load_pickle(path):
        # Small helper so every artifact is opened/closed the same way.
        with open(path, "rb") as fp:
            return pickle.load(fp)

    feature_columns = ["id", "dur", "proto", "service", "state", "spkts", "dpkts", "sbytes", "dbytes", "rate", "sttl", "dttl", "sload",
                       "dload", "sloss", "dloss", "sinpkt", "dinpkt", "sjit", "djit", "swin", "stcpb", "dtcpb", "dwin", "tcprtt", "synack",
                       "ackdat", "smean", "dmean", "trans_depth", "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm",
                       "ct_src_dport_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm", "is_ftp_login", "ct_ftp_cmd", "ct_flw_http_mthd", "ct_src_ltm",
                       "ct_srv_dst", "is_sm_ips_ports"]

    X = pd.DataFrame(X, columns=feature_columns)
    X = X.drop(['id'], axis=1)

    # Feature groups and fitted transformers saved by the training notebooks.
    binary_features = _load_pickle("binary_features")
    categorical_features = _load_pickle("categorical_features")
    numerical_features = _load_pickle("numerical_features")
    one_hot_encoders = _load_pickle("one_hot_encoders")
    one_hot_encoders_features = _load_pickle("one_hot_encoders_features")
    feature_standardizers = _load_pickle("feature_standardizers")
    # The "minmax_scalers" artifact was loaded here before but never used.

    X_encoded, _ = merge_all(X, one_hot_encoders_features,
                             get_one_hot_encoded_features(X, categorical_features, one_hot_encoders),
                             get_standardized_features(X, numerical_features, feature_standardizers),
                             binary_features)

    # Stage 1: random forest flag, thresholded on its column-1 probability.
    random_forest = _load_pickle("random_forest")
    y_1_probas = random_forest.predict_proba(X_encoded)
    # np.where keeps the original rule exactly: "< threshold" -> 0, else 1.
    y_1_pred = np.where(y_1_probas[:, 1] < threshold, 0, 1).reshape(-1, 1)

    # Append the stage-1 flag as an extra feature column. X_encoded is
    # presumably a scipy sparse matrix whose .todense() yields np.matrix;
    # np.asarray turns that into a plain ndarray for the downstream models.
    X_encoded = np.hstack((np.asarray(X_encoded.todense()), y_1_pred))

    # Stage 2: one-vs-rest XGBoost models.
    oxgb = OvrXgb()
    oxgb.ovr_models = _load_pickle("oxgb_models")

    y_proba = oxgb.predict_proba(X_encoded)
    y_pred = np.argmax(y_proba, axis=1).reshape(-1, 1)

    return y_pred, y_proba

In [5]:
# Single-record check. Reference row kept from the original cell (the two
# trailing fields are presumably attack_cat and label):
#   117295,0.000006,udp,dns,INT,2,0,114,0,166666.6608,254,0,76000000,0,0,0,0.006,0,0,0,0,0,0,0,0,0,0,57,0,0,0,32,2,16,16,16,32,0,0,0,16,32,0,
#   Generic,1
# The id passed below is 1 instead of 117295; run_pipeline_1 drops the id column.
generic_record = [1, 0.000006,"udp","dns","INT",2,0,114,0,166666.6608,254,0,76000000,0,0,0,0.006,0,0,0,0,0,0,0,0,0,0,57,0,0,0,32,2,16,16,16,32,0,0,0,16,32,0]

y_pred, y_proba = run_pipeline_1([generic_record])

predicted_names = y_encoder.inverse_transform(y_pred.ravel())
print("Predicted class is " + predicted_names)

features_encoded shape: (1, 196), features_encoded_name len: 196
['Predicted class is Generic']


In [6]:
# Single-record check. Reference row kept from the original cell (the two
# trailing fields are presumably attack_cat and label):
#   44091,1.504671,tcp,http,FIN,10,10,878,2606,12.627345,62,252,4205.570801,12473.15918,2,2,167.185667,157.208562,
#   10025.67205,257.018641,255,2630345556,2391717210,255,0.168907,0.088862,0.080045,88,261,0,0,2,1,1,1,1,1,0,0,0,2,2,0,
#   Exploits,1
exploits_record = [44091,1.504671,"tcp","http","FIN",10,10,878,2606,12.627345,62,252,4205.570801,12473.15918,2,2,167.185667,157.208562,10025.67205,257.018641,255,2630345556,2391717210,255,0.168907,0.088862,0.080045,88,261,0,0,2,1,1,1,1,1,0,0,0,2,2,0]

y_pred, y_proba = run_pipeline_1([exploits_record])

predicted_names = y_encoder.inverse_transform(y_pred.ravel())
print("Predicted class is " + predicted_names)

features_encoded shape: (1, 196), features_encoded_name len: 196
['Predicted class is Normal']


In [8]:
# Load the UNSW-NB15 testing set used for the full evaluation further down.
test = pd.read_csv(
    "data/UNSW_NB15_testing-set.csv",
    encoding="ISO-8859-1",
)

In [9]:
def run_pipeline_2(X, y):
    """Run the prediction pipeline on labelled data and print its metrics.

    Args:
        X: feature records accepted by ``run_pipeline_1``.
        y: true class names, encoded to indices with the notebook-level
            ``y_encoder``.

    Returns:
        (y_pred, y_proba) exactly as returned by ``run_pipeline_1``; the
        scores are returned un-normalised.

    Prints the multi-class log loss and the accuracy.
    """
    y_pred, y_proba = run_pipeline_1(X)

    y_true = y_encoder.transform(y)

    # The one-vs-rest scores do not sum to 1 per row. Normalise explicitly
    # instead of relying on log_loss to do it (newer scikit-learn versions
    # warn about / reject un-normalised input).
    scores = np.asarray(y_proba, dtype=float)
    row_totals = scores.sum(axis=1, keepdims=True)
    row_totals = np.where(row_totals > 0, row_totals, 1.0)  # avoid 0/0 on degenerate rows
    normalized_proba = scores / row_totals

    # Pass the label set so the metric also works when the evaluated subset
    # lacks a class. Assumes column i of y_proba is encoded class i - TODO confirm.
    class_labels = np.arange(scores.shape[1])

    print ("log loss for data : " + str(log_loss(y_true, normalized_proba, labels=class_labels)))
    print("accuracy score is " + str(accuracy_score(y_true, y_pred)))

    return y_pred, y_proba

In [10]:
# Evaluate on the full testing set: features are everything except the two
# target columns, and the ground truth is the attack category name.
test_features = test.drop(columns=["label", "attack_cat"])
test_categories = test["attack_cat"].to_numpy()
y_pred, y_proba = run_pipeline_2(test_features, test_categories)

features_encoded shape: (82332, 196), features_encoded_name len: 196
log loss for data : 1.0040835032678204
accuracy score is 0.7515182432104164
