# 1. Import libraries

In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [3]:
warnings.filterwarnings("ignore")
%matplotlib inline
plt.style.use('seaborn-white')
plt.rcParams['figure.figsize'] = 15,5

---
# 2. Import dataset

In [4]:
# Read the dataset from the CSV file in the working directory and show its
# (rows, columns) shape as the cell output.
DATA_PATH = 'clean.csv'
df = pd.read_csv(DATA_PATH)
df.shape

(4293, 13)

In [5]:
# Target: the 'Classification' column. Features: every other column of df.
y = df['Classification']
x = df.drop(columns='Classification')

# Summary statistics of the features, one row per described column.
x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
C1,4293.0,-3.095073e-16,3.443987,-6.210224,-2.117362,-0.802819,0.800228,17.341713
C2,4293.0,-3.558507e-17,2.595577,-10.943134,-1.505902,-0.111367,1.388182,22.139193
C3,4293.0,2.177413e-15,2.467559,-7.171382,-1.398834,-0.191611,1.045199,29.636138
C4,4293.0,2.651294e-16,2.061767,-6.937613,-1.451622,-0.012118,1.433494,8.781903
C5,4293.0,-6.899779e-17,1.773863,-6.028839,-1.225517,-0.086506,1.149357,6.889825
C6,4293.0,5.379138e-18,1.674063,-4.785686,-0.993242,-0.213896,0.66327,14.234338
C7,4293.0,2.059331e-16,1.620776,-7.893766,-1.022278,-0.09285,0.947502,8.584635
C8,4293.0,5.082251e-16,1.513429,-5.621405,-0.81872,0.073281,0.855956,17.09935
C9,4293.0,-2.818875e-16,1.496118,-14.08403,-0.624685,0.091726,0.753313,16.431091
C10,4293.0,-2.5570300000000003e-17,1.436209,-5.219685,-0.729003,-0.019743,0.665288,13.897605


---
# 3. Model

In [6]:
# Multi-class gradient-boosted trees; the `multi:softprob` objective yields one
# probability per class for every row.
model = xgb.XGBClassifier(
    objective='multi:softprob',
    n_estimators=100,
    max_depth=25,
    subsample=0.5,
    random_state=42,
)
model.fit(x, y)

# NOTE(review): the probabilities are computed for the same rows the model was
# just fitted on, so every metric derived from `pred_values` is in-sample.
pred_values = model.predict_proba(x)



In [7]:
# Turn the class-probability matrix into a table with one column per class
# (labelled 0..k-1), plus:
#   'max'            -- the highest class probability of the row
#   'Classification' -- the column index of that probability (first one on ties)
#
# The derived columns are dropped before recomputing so that re-running this
# cell does not fold them into the row maximum, and argmax replaces the
# hand-written five-class lookup so any number of classes is handled.
# NOTE(review): assumes the probabilities contain no NaN (argmax treats NaN as
# the maximum) -- confirm if the model output can ever be missing.
proba = pd.DataFrame(pred_values).drop(columns=['max', 'Classification'], errors='ignore')
pred_values = proba.assign(
    max=proba.max(axis=1),
    Classification=proba.to_numpy().argmax(axis=1),
)
pred_values.shape

(4293, 7)

---
# 4. AUROC, Gini and KS

In [8]:
def stat1(y, pred_values, n_buckets=10):
    """Print AUROC, Gini and a bucketed KS statistic for one binary target.

    Args:
        y: 1-D array-like of 0/1 actuals, where 1 marks the event.
        pred_values: 1-D array-like of predicted event scores, in the same row
            order as ``y``.
        n_buckets: number of equal-frequency score buckets used to build the
            KS table. Defaults to 10 (deciles), the value that was previously
            hard-coded.

    Returns:
        None. The three metrics are printed, each rounded to 3 decimals.

    Note:
        KS is read off the bucket boundaries only, so it approximates the
        exact statistic taken over every possible threshold.
    """
    # Compute AUROC once and derive Gini from it.
    auroc = roc_auc_score(y, pred_values)
    print('AUROC:',np.round(auroc, 3))

    gini = 2 * auroc - 1
    print('Gini:',np.round(gini, 3))

    # Pair actuals and scores by position (the same way roc_auc_score does),
    # so differing pandas indexes on the two inputs cannot misalign rows.
    train_calc = pd.DataFrame({
        'actuals': np.asarray(y),
        'predicted': np.asarray(pred_values),
    })

    train_calc['1-actuals'] = 1 - train_calc['actuals']
    train_calc['1-predicted'] = 1 - train_calc['predicted']
    # Bucketing on 1 - score puts the highest-scored rows in the first bucket;
    # duplicate quantile edges (heavily tied scores) are merged, not an error.
    train_calc['bucket'] = pd.qcut(train_calc['1-predicted'], n_buckets, duplicates='drop')

    # Events / non-events per bucket. observed=True only skips empty buckets,
    # which would add zero to both cumulative rates anyway.
    kstable = train_calc.groupby('bucket', as_index=False, observed=True)[['1-actuals', 'actuals']].sum()
    kstable['event_rate'] = (kstable['actuals'] / kstable['actuals'].sum()).cumsum()
    kstable['non_event_rate'] = (kstable['1-actuals'] / kstable['1-actuals'].sum()).cumsum()

    # KS = largest gap between the cumulative event and non-event shares.
    kstable['KS'] = (kstable['event_rate'] - kstable['non_event_rate']).abs()
    print('KS:',np.round(kstable['KS'].max(), 3))

In [9]:
# One-vs-rest metrics for class 0.
# The indicator gets its own name: rebinding the global `y` here would make the
# model cell (`model.fit(x, y)`) train on this binary vector if it were re-run.
print('NORMAL')
y_normal = (df['Classification'] == 0).astype(int).to_numpy()
print(y_normal.sum())  # number of rows in this class
stat1(y_normal, pred_values[0])

NORMAL
2757
AUROC: 1.0
Gini: 1.0
KS: 0.934


In [10]:
# One-vs-rest metrics for class 1.
# The indicator gets its own name: rebinding the global `y` here would make the
# model cell (`model.fit(x, y)`) train on this binary vector if it were re-run.
print('NETWORK_DELAY_KANBAN_API_GATEWAY')
y_net_delay_gateway = (df['Classification'] == 1).astype(int).to_numpy()
print(y_net_delay_gateway.sum())  # number of rows in this class
stat1(y_net_delay_gateway, pred_values[1])

NETWORK_DELAY_KANBAN_API_GATEWAY
646
AUROC: 1.0
Gini: 1.0
KS: 0.942


In [11]:
# One-vs-rest metrics for class 2.
# The indicator gets its own name: rebinding the global `y` here would make the
# model cell (`model.fit(x, y)`) train on this binary vector if it were re-run.
print('NETWORK_DELAY_KANBAN_COMMAND_SERVICE')
y_net_delay_command = (df['Classification'] == 2).astype(int).to_numpy()
print(y_net_delay_command.sum())  # number of rows in this class
stat1(y_net_delay_command, pred_values[2])

NETWORK_DELAY_KANBAN_COMMAND_SERVICE
500
AUROC: 1.0
Gini: 1.0
KS: 0.905


In [12]:
# One-vs-rest metrics for class 3.
# The indicator gets its own name: rebinding the global `y` here would make the
# model cell (`model.fit(x, y)`) train on this binary vector if it were re-run.
print('POD_KILL_KANBAN_API_GATEWAY')
y_pod_kill_gateway = (df['Classification'] == 3).astype(int).to_numpy()
print(y_pod_kill_gateway.sum())  # number of rows in this class
stat1(y_pod_kill_gateway, pred_values[3])

POD_KILL_KANBAN_API_GATEWAY
310
AUROC: 1.0
Gini: 1.0
KS: 0.97


In [13]:
# One-vs-rest metrics for class 4.
# The indicator gets its own name: rebinding the global `y` here would make the
# model cell (`model.fit(x, y)`) train on this binary vector if it were re-run.
print('CPU_BURN_KANBAN_API_GATEWAY')
y_cpu_burn_gateway = (df['Classification'] == 4).astype(int).to_numpy()
print(y_cpu_burn_gateway.sum())  # number of rows in this class
stat1(y_cpu_burn_gateway, pred_values[4])

CPU_BURN_KANBAN_API_GATEWAY
80
AUROC: 1.0
Gini: 1.0
KS: 0.917


---
# 5. Confusion matrix and classification report

In [14]:
# Scenario name for each numeric class code; the list position is the code.
_SCENARIO_NAMES = [
    'NORMAL',
    'NETWORK_DELAY_KANBAN_API_GATEWAY',
    'NETWORK_DELAY_KANBAN_COMMAND_SERVICE',
    'POD_KILL_KANBAN_API_GATEWAY',
    'CPU_BURN_KANBAN_API_GATEWAY',
]


def _to_scenario_names(codes):
    """Map an array-like of class codes to scenario names ('MISSING' if unknown)."""
    codes = np.asarray(codes)
    matches = [codes == code for code in range(len(_SCENARIO_NAMES))]
    return np.select(matches, _SCENARIO_NAMES, default='MISSING')


# Actual and predicted classes as readable strings for the reports that follow.
y = _to_scenario_names(df['Classification'])
p = _to_scenario_names(pred_values['Classification'])

In [15]:
# Show the confusion matrix with its rows (actual) and columns (predicted)
# labelled. The bare matrix follows the sorted order of the label strings, not
# the numeric class codes, which makes an unlabelled print easy to misread.
cm_labels = np.unique(np.concatenate([y, p]))
pd.DataFrame(
    confusion_matrix(y, p, labels=cm_labels),
    index=pd.Index(cm_labels, name='actual'),
    columns=pd.Index(cm_labels, name='predicted'),
)

[[  80    0    0    0    0]
 [   0  646    0    0    0]
 [   0    0  500    0    0]
 [   0    0    0 2757    0]
 [   0    0    0    0  310]]


In [16]:
# Per-class precision / recall / F1 plus overall accuracy. The report is
# printed on its own: the former 'Accuracy:' prefix mislabelled the whole table
# and pushed its header row out of alignment with the columns.
print(classification_report(y, p))

Accuracy:                                       precision    recall  f1-score   support

         CPU_BURN_KANBAN_API_GATEWAY       1.00      1.00      1.00        80
    NETWORK_DELAY_KANBAN_API_GATEWAY       1.00      1.00      1.00       646
NETWORK_DELAY_KANBAN_COMMAND_SERVICE       1.00      1.00      1.00       500
                              NORMAL       1.00      1.00      1.00      2757
         POD_KILL_KANBAN_API_GATEWAY       1.00      1.00      1.00       310

                            accuracy                           1.00      4293
                           macro avg       1.00      1.00      1.00      4293
                        weighted avg       1.00      1.00      1.00      4293

