In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are re-imported
# before each cell executes and edits to the project's packages need no kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from joblib import dump
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import *
from joblib import load
from aqosd_experiments.config import *
from aqosd_experiments.data import *
from aqosd_experiments.utils import *
from aqosd_experiments.plot import *
from aqosd_experiments.scorers import *

In [3]:
%%time
metrics, bottlenecks = import_and_prepare_data(RAW_DATASET_PATH,  HOST_LIST)
print('Shape of metrics : ',metrics.shape,'\t','Shape of bottlenecks : ',bottlenecks.shape)
print('Label cardinality = %.5f \t Label density = %.5f' % (bottlenecks.sum(axis=1).mean(),bottlenecks.mean(axis=1).mean()))

0 days 23:59:50 of Data
Shape of metrics :  (8640, 104) 	 Shape of bottlenecks :  (8640, 32)
Label cardinality = 1.96019 	 Label density = 0.06126
Wall time: 10.1 s


In [4]:
# Keep the column names of both tables as plain lists and print them, separated by a rule.
metric_names = list(metrics.columns)
bottleneck_names = list(bottlenecks.columns)
print(metric_names)
print('-' * 100)
print(bottleneck_names)

['SRV./: Free inodes in %', 'SRV./: Space utilization', 'SRV./: Used space', 'SRV./boot: Free inodes in %', 'SRV./boot: Space utilization', 'SRV./boot: Used space', 'SRV.Available memory', 'SRV.Available memory in %', 'SRV.CPU idle time', 'SRV.CPU iowait time', 'SRV.CPU softirq time', 'SRV.CPU system time', 'SRV.CPU user time', 'SRV.CPU utilization', 'SRV.Context switches per second', 'SRV.Free swap space', 'SRV.Free swap space in %', 'SRV.Interface enp0s8: Bits received', 'SRV.Interface enp0s8: Bits sent', 'SRV.Interrupts per second', 'SRV.Load average (15m avg)', 'SRV.Load average (1m avg)', 'SRV.Load average (5m avg)', 'SRV.Memory utilization', 'SRV.Number of processes', 'SRV.Number of running processes', 'GW1./: Free inodes in %', 'GW1./: Space utilization', 'GW1./: Used space', 'GW1./boot: Free inodes in %', 'GW1./boot: Space utilization', 'GW1./boot: Used space', 'GW1.Available memory', 'GW1.Available memory in %', 'GW1.CPU idle time', 'GW1.CPU iowait time', 'GW1.CPU softirq time

In [5]:
# Use only the first split yielded by CV_2 as the single train/test partition.
train_indexes, test_indexes = next(CV_2.split(metrics, bottlenecks))

In [6]:
# Positional row selection of the train and test partitions; every column is kept.
X_train = metrics.iloc[train_indexes]
y_train = bottlenecks.iloc[train_indexes]
X_test = metrics.iloc[test_indexes]
y_test = bottlenecks.iloc[test_indexes]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6480, 104), (6480, 32), (2160, 104), (2160, 32))

In [7]:
# Convert the four partitions to plain NumPy arrays.
# np.asarray accepts DataFrames and ndarrays alike, so this cell can be re-run safely;
# `.values` raised AttributeError on the second run because the names already held arrays.
X_train, y_train, X_test, y_test = (
    np.asarray(part) for part in (X_train, y_train, X_test, y_test)
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6480, 104), (6480, 32), (2160, 104), (2160, 32))

In [8]:
# Count how often each order-2 label combination occurs in the train and the test partition.
# Combinations missing from one partition show up as 0.0 after the fill.
pd.DataFrame({
    split_name: Counter(
        str(pair)
        for row in get_combination_wise_output_matrix(split_labels, order=2)
        for pair in row
    )
    for split_name, split_labels in (('train', y_train), ('test', y_test))
}).T.fillna(0.0)

Unnamed: 0,"(4, 4)","(4, 15)","(15, 15)","(30, 30)","(15, 30)","(23, 23)","(23, 30)","(27, 30)","(27, 27)","(1, 1)",...,"(16, 28)","(14, 28)","(5, 24)","(0, 23)","(5, 26)","(1, 7)","(13, 30)","(6, 30)","(7, 31)","(9, 23)"
train,412.0,16.0,291.0,332.0,21.0,257.0,11.0,46.0,528.0,350.0,...,14.0,4.0,9.0,5.0,5.0,9.0,9.0,2.0,5.0,4.0
test,138.0,5.0,101.0,120.0,7.0,86.0,4.0,15.0,179.0,124.0,...,5.0,2.0,3.0,1.0,2.0,0.0,3.0,1.0,2.0,2.0


In [9]:
def plt_roc_auc(clf_name, y_score, y_test, labels):
    """Draw the per-label ROC curves of one classifier plus the pooled and the worst curve.

    Parameters
    ----------
    clf_name : str
        Classifier name. Currently unused (the title call is commented out); kept so
        existing callers keep working.
    y_score : sparse matrix, list or array-like
        Scores per sample and label. Objects exposing ``toarray`` are densified. A list
        is stacked and the index-1 slice of its last axis is kept (presumably the
        positive-class column of per-label ``predict_proba`` outputs -- TODO confirm).
        Anything else goes through ``np.array``.
    y_test : numpy.ndarray
        Ground-truth indicator matrix, indexed as ``y_test[:, i]``.
    labels : sequence
        One entry per label; only its length is used.

    Returns
    -------
    The matplotlib figure, after ``plt.show()`` has been called on it.

    Notes
    -----
    The curve reported as "Average area" pools every (sample, label) pair through
    ``ravel()``, i.e. it is a micro-average even though it is stored under the
    ``"macro"`` key.
    """
    fig, ax = plt.subplots(figsize=(1.5, 1.5))

    # Normalise the scores to a dense 2-D array. `.toarray()` is used instead of the
    # `.A` shortcut, which SciPy deprecated and later removed from sparse matrices.
    if hasattr(y_score, 'toarray'):
        y_score = y_score.toarray()
    elif isinstance(y_score, list):
        y_score = np.array(y_score).T[1]
    else:
        y_score = np.array(y_score)

    n_classes, lw = len(labels), 1
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    fpr["macro"], tpr["macro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Worst label: searched among the per-label curves only, so the pooled curve can
    # never be reported as the "Minimum area" one.
    min_class = min(range(n_classes), key=roc_auc.get)

    for i in range(n_classes):
        # Only the first grey curve carries a legend entry, so the legend shows one
        # line for the whole family of per-label curves.
        legend_entry = {'label': 'Bottleneck curve[1-'+str(n_classes)+']'} if i == 0 else {}
        ax.plot(fpr[i], tpr[i], color='grey', alpha=.5, lw=.5, linestyle='solid', **legend_entry)

    # The original indexed tpr["samples"], a key that is never created (KeyError).
    ax.plot(fpr["macro"], tpr["macro"], label='Average area={0:0.3f}'.format(roc_auc["macro"]), color='r',
            linewidth=lw, linestyle='-')
    ax.plot(fpr[min_class], tpr[min_class], label='Minimum area={0:0.3f}'.format(roc_auc[min_class]), color='b',
            linewidth=lw, linestyle='--')
    ax.plot([0, 1], [0, 1], color='g', lw=.5, linestyle='--', label='Random Classifier')

    ax.set_xlim([-0.05, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('1 - Specificity')
    ax.set_ylabel('Sensitivity')
    ax.legend(loc='lower right', fontsize=5.5)
    text_items = [ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()
    for item in text_items:
        item.set_fontsize(5)
    #ax.set_title(clf_name)
    plt.show()
    return fig


def print_metrics(y_test, y_pred):
    """Compute, print and return the rounded evaluation scores of one prediction matrix.

    Parameters
    ----------
    y_test : ground-truth labels.
    y_pred : predicted labels, in the same layout as ``y_test``.

    Returns
    -------
    dict
        Keys 'Precision', 'Subset Accuracy', 'Coverage Error', 'Specificity' and
        'Sensitivity', each rounded to ``ROUND`` decimals. Precision and recall use
        the module-level ``AVERAGE`` setting.
    """
    subset_accuracy = accuracy_score(y_test, y_pred)
    coverage = coverage_error(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=AVERAGE)
    specificity = user_defined_specificity(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred, average=AVERAGE)

    # The order of this tuple fixes the key order of the returned dict.
    raw_scores = (
        ('Precision', precision),
        ('Subset Accuracy', subset_accuracy),
        ('Coverage Error', coverage),
        ('Specificity', specificity),
        ('Sensitivity', sensitivity),
    )
    rest = {score_name: round(value, ROUND) for score_name, value in raw_scores}
    print('\t',rest)
    return rest

In [10]:
# Module-level stores filled by train(), all keyed by classifier name.
results = {}
results_y_pred = {}
results_y_pred_proba = {}


def train(X_train, y_train, X_test, y_test):
    """Fit, time and score every classifier in ``CLASSIFIERS`` on one train/test split.

    For each classifier this prints a progress line, then records in the module-level
    dicts: the ``predict_proba`` output (``results_y_pred_proba``), the predictions as
    an object exposing ``toarray`` (``results_y_pred``) and the scores returned by
    ``print_metrics`` plus 'Fit Time' and 'Predict Time' (``results``). Only ``fit``
    and ``predict`` are timed; ``predict_proba`` runs outside the timers.

    Returns
    -------
    pandas.DataFrame
        Built from the module-level ``results`` dict: one column per classifier.
    """
    for clf_name, clf in CLASSIFIERS.items():
        # Each classifier works on its own copies of the arrays.
        fit_inputs, fit_targets = np.copy(X_train), np.copy(y_train)
        eval_inputs, eval_targets = np.copy(X_test), np.copy(y_test)
        print(80*'-')
        print('#',clf_name.ljust(16), end=' ')

        fit_started = time.time()
        clf.fit(fit_inputs, fit_targets)
        fit_time = round(time.time() - fit_started, ROUND)
        print('>','fit_time:',fit_time,'secondes', end = ' ')

        predict_started = time.time()
        y_pred = clf.predict(eval_inputs)
        predict_time = round(time.time() - predict_started, ROUND)
        print('>','predict_time:',predict_time,'secondes')

        results_y_pred_proba[clf_name] = clf.predict_proba(eval_inputs)

        # Store predictions in sparse form whatever the classifier returned.
        if not hasattr(y_pred, 'toarray'):
            y_pred = sparse.csr_matrix(y_pred)
        results_y_pred[clf_name] = y_pred

        scores = print_metrics(eval_targets, y_pred.toarray())
        scores['Fit Time'] = fit_time
        scores['Predict Time'] = predict_time
        results[clf_name] = scores
    return pd.DataFrame.from_dict(results)
def plot(results_y_pred_proba, y_test):
    """Draw and save the ROC figure of every classifier listed in ``CLASSIFIERS``.

    ``results_y_pred_proba`` maps classifier names to their stored scores. Each figure
    is produced by ``plt_roc_auc`` with the module-level ``bottleneck_names`` and is
    written to ``FIG_PATH + <classifier name> + "_roc_curve.pdf"``.
    """
    for clf_name in CLASSIFIERS.keys():
        roc_fig = plt_roc_auc(clf_name, results_y_pred_proba[clf_name], y_test, bottleneck_names)
        roc_fig.savefig(FIG_PATH + clf_name +"_roc_curve.pdf", bbox_inches='tight')

In [11]:
# Fit and score every classifier in CLASSIFIERS on the split; df gets one column per classifier.
df = train(X_train, y_train, X_test, y_test)

--------------------------------------------------------------------------------
# CC [Neural Net]  > fit_time: 132.9161 secondes > predict_time: 0.3038 secondes
	 {'Precision': 0.8671, 'Subset Accuracy': 0.5602, 'Coverage Error': 12.0782, 'Specificity': 0.992, 'Sensitivity': 0.7831}
--------------------------------------------------------------------------------
# BR [Neural Net]  > fit_time: 128.4111 secondes > predict_time: 0.2394 secondes
	 {'Precision': 0.8753, 'Subset Accuracy': 0.538, 'Coverage Error': 12.6736, 'Specificity': 0.9926, 'Sensitivity': 0.778}
--------------------------------------------------------------------------------
# LP [Neural Net]  > fit_time: 67.8486 secondes > predict_time: 0.2703 secondes
	 {'Precision': 0.8253, 'Subset Accuracy': 0.6519, 'Coverage Error': 9.394, 'Specificity': 0.9887, 'Sensitivity': 0.8307}
--------------------------------------------------------------------------------
# ML-kNN           > fit_time: 13.7581 secondes > predict_time: 4.6

In [12]:
# Show the score table: rows are the metrics and timings, columns are the classifiers.
df

Unnamed: 0,CC [Neural Net],BR [Neural Net],LP [Neural Net],ML-kNN
Precision,0.8671,0.8753,0.8253,0.8388
Subset Accuracy,0.5602,0.538,0.6519,0.5167
Coverage Error,12.0782,12.6736,9.394,13.6213
Specificity,0.992,0.9926,0.9887,0.9904
Sensitivity,0.7831,0.778,0.8307,0.7515
Fit Time,132.9161,128.4111,67.8486,13.7581
Predict Time,0.3038,0.2394,0.2703,4.6626


In [None]:
# Draw and save one ROC figure per classifier from the scores stored by train().
plot(results_y_pred_proba, y_test)

In [None]:
from itertools import cycle

# Shared styling for the box plots drawn further down.
d = 8
lw = .5
c = cycle(['g', 'b', 'r', 'k'])
mpl.rcParams['hatch.linewidth'] = 0.1

# Hatch patterns: an empty one first, then sparse (h2 repeats) and dense (h1 repeats) variants.
h1 = 8
h2 = 3
hatchs = cycle(["", h2*"x", h2*"+", h2*"/", h1*"x", h1*"+", h1*"/", h1*"*"])

boxprops = {'linestyle': '-', 'linewidth': lw}
flierprops = {'marker': 'o', 'markerfacecolor': 'w', 'markersize': 2, 'markeredgewidth': lw}
medianprops = {'linestyle': '-', 'linewidth': lw, 'color': 'r'}
meanlineprops = {'linestyle': '-', 'linewidth': lw}
meanpointprops = {'marker': ">", 'markeredgecolor': 'none', 'markerfacecolor': 'b', 'markersize': 4}
whiskerprops = {'linestyle': '-', 'linewidth': lw}

def plt_precision_box(y_score, y_test, labels):
    """Box-plot the per-label precision, grouped once by node and once by bottleneck type.

    Parameters
    ----------
    y_score : sparse matrix, list or array-like
        Predicted labels per sample and label. Objects exposing ``toarray`` are
        densified; a list is stacked and the index-1 slice of its last axis is kept;
        anything else goes through ``np.array``.
    y_test : numpy.ndarray
        Ground-truth indicator matrix, indexed as ``y_test[:, i]``.
    labels : sequence of str
        Label names of the form ``"<node>.<bottleneck>"``; the text before the first
        '.' is the node, the rest is the bottleneck type.

    Returns
    -------
    (figs, groups)
        ``figs`` holds one figure per entry of ``groups`` (``['Node', 'Bottleneck']``),
        in the same order.

    Notes
    -----
    The node and bottleneck names to plot are hard-coded below, so labels that do not
    produce exactly those groups raise ``KeyError``. Styling comes from the module-level
    ``hatchs`` cycle and the ``boxprops`` / ``flierprops`` / ``medianprops`` /
    ``meanpointprops`` / ``whiskerprops`` dicts; ``hatchs`` is advanced by one per box.
    """
    def add_hatch(bp):
        # White boxes are told apart by their hatch pattern only.
        for box in bp['boxes']:
            box.set(facecolor="w")
            box.set(hatch=next(hatchs))
        for cap in bp['caps']:
            cap.set(linewidth=.5)

    def getdf(df0, cl):
        # One float column per distinct value of `cl`, holding that group's precisions
        # in their original order (shorter groups would be padded with NaN).
        return pd.DataFrame({
            key: pd.Series(values.to_numpy(dtype=float))
            for key, values in df0.groupby(cl)['precision']
        })

    # Normalise the predictions to a dense 2-D array. `.toarray()` is used instead of
    # the `.A` shortcut, which SciPy deprecated and later removed from sparse matrices.
    if hasattr(y_score, 'toarray'):
        y_score = y_score.toarray()
    elif isinstance(y_score, list):
        y_score = np.array(y_score).T[1]
    else:
        y_score = np.array(y_score)

    rows = []
    for i, label in enumerate(labels):
        # Split on the first '.' only, so a bottleneck name may itself contain dots.
        node, bottleneck = label.split('.', 1)
        rows.append((node, bottleneck, precision_score(y_test[:, i], y_score[:, i])))
    df = pd.DataFrame(rows, columns=['Node', 'Bottleneck', 'precision'])

    figs = []
    groups = ['Node', 'Bottleneck']
    for cl in groups:
        # Lists are reversed because the horizontal box plot draws the first column at
        # the bottom; reversing puts the first listed name at the top.
        if cl == 'Node':
            cols = ["SRV", "GW1", "GW11", "GW111"][::-1]
            lab = ["NF1", "NF2", "NF3", "NF4"][::-1]
        else:
            cols = ['cpu', 'memory', 'diskspace', 'diskio', 'network delay', 'network packet duplicate',
                    'network packet loss', 'network packet corrupt'][::-1]
            lab = ['CPU', 'Memory', 'Disk space', 'Disk I/O', 'Packet delay', 'Packet duplicate', 'Packet loss',
                   'Packet corrupt'][::-1]
        df1 = getdf(df, cl)[cols]
        df1.columns = lab

        f, ax = plt.subplots(figsize=(1.6, 1.6))
        boxplot = df1.boxplot(ax=ax, rot=0, fontsize=5, patch_artist=True, return_type='dict', vert=False,
                              boxprops=boxprops, flierprops=flierprops, medianprops=medianprops, showmeans=True,
                              meanprops=meanpointprops, whiskerprops=whiskerprops)
        #ax.set_xlim(0.7,1)
        add_hatch(boxplot)
        figs.append(f)
    return figs, groups

# Precision box plots for the 'LP [Neural Net]' predictions, saved as one PDF per grouping.
figs, groups = plt_precision_box(results_y_pred['LP [Neural Net]'], y_test, bottleneck_names)
# Plain loop instead of a throwaway list bound to `_`: assigning `_` in IPython shadows
# the automatic last-output variable.
for fig, group in zip(figs, groups):
    fig.savefig(FIG_PATH +group+"_resultat.pdf", bbox_inches='tight')
# Mean of the per-label precisions, then scikit-learn's own macro average for comparison.
# The macro score is already a scalar, so no np.mean is needed around it.
print(np.mean(precision_score(y_test,results_y_pred['LP [Neural Net]'],  average=None)))
print(precision_score(y_test,results_y_pred['LP [Neural Net]'],  average='macro'))