In [1]:
#%load_ext watermark
#%watermark -a 'Ouedraogo Clovis' -u -d -v -m

## Imports

In [2]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler
from joblib import dump

from aqosd_experiments.config import *
from aqosd_experiments.data import *
from aqosd_experiments.plot import *
from aqosd_experiments.scorers import *
from osms import OverheadSensitiveMetricSelection

## Load Config

In [3]:
%whos str  int  tuple OrderedDict

Variable             Type           Data/Info
---------------------------------------------
AVERAGE              str            samples
CLASSIFIERS          OrderedDict    OrderedDict([('Random Cla<...>re_dense=[True, True]))])
CLEAN_DATASET_PATH   str            C:/Users/couedrao/Pycharm<...>ts/../data/clean_dataset/
DATE_FORMAT          str            %Y-%m-%d %H:%M:%S
FIG_PATH             str            C:/Users/couedrao/Pycharm<...>/../data/output/plotting/
HOST_LIST            tuple          n=4
K_FOLD               int            3
MODELS_PATH          str            C:/Users/couedrao/Pycharm<...>data/output/saved_models/
PATH                 str            C:/Users/couedrao/Pycharm<...>qosd_experiments/../data/
RAW_DATASET_PATH     str            C:/Users/couedrao/Pycharm<...>ents/../data/raw_dataset/
ROUND                int            5
SCORING              OrderedDict    OrderedDict([('mcc', make<...>scorer(accuracy_score))])
SEED                 int            42


In [4]:
#raw_dataset_path, host_list, models_path, fig_path= RAW_DATASET_PATH,  HOST_LIST, MODELS_PATH, FIG_PATH
#classifiers,param_grids = CLASSIFIERS, PARAM_GRIDS
#scoring, cv = SCORING, CV
# NOTE(review): `save` is not referenced in any of the visible cells; presumably it
# gates writing models/figures to disk further down — confirm or remove.
save=False

## Load and prepare data

In [5]:
# Load the monitored metrics (features) and the bottleneck indicators (labels).
metrics, bottlenecks = import_and_prepare_data(RAW_DATASET_PATH, HOST_LIST)
# NOTE(review): the original cell carried the bare number 42813 here — presumably an
# earlier row count; confirm.
print('Shape of metrics : ', metrics.shape, '\t', 'Shape of bottlenecks : ', bottlenecks.shape)

# Label cardinality: mean number of active labels per row.
label_cardinality = bottlenecks.sum(axis=1).mean()
# Label density: mean fraction of labels that are active per row.
label_density = bottlenecks.mean(axis=1).mean()
print('Label cardinality = %.5f \t Label density = %.5f' % (label_cardinality, label_density))

Shape of metrics :  (38814, 104) 	 Shape of bottlenecks :  (38814, 32)
Label cardinality = 2.02118 	 Label density = 0.06316


In [6]:
#metrics=scale_metrics(metrics, MinMaxScaler()) 
#print('Shape of metrics : ',metrics.shape,'\t','Shape of bottlenecks : ',bottlenecks.shape)

In [7]:
# Keep the column labels as plain lists for later use.
metric_names = list(metrics.columns)
bottleneck_names = list(bottlenecks.columns)

print(metric_names)
print('-' * 100)
print(bottleneck_names)

['SRV./: Free inodes in %', 'SRV./: Space utilization', 'SRV./: Used space', 'SRV./boot: Free inodes in %', 'SRV./boot: Space utilization', 'SRV./boot: Used space', 'SRV.Available memory', 'SRV.Available memory in %', 'SRV.CPU idle time', 'SRV.CPU iowait time', 'SRV.CPU softirq time', 'SRV.CPU system time', 'SRV.CPU user time', 'SRV.CPU utilization', 'SRV.Context switches per second', 'SRV.Free swap space', 'SRV.Free swap space in %', 'SRV.Interface enp0s8: Bits received', 'SRV.Interface enp0s8: Bits sent', 'SRV.Interrupts per second', 'SRV.Load average (15m avg)', 'SRV.Load average (1m avg)', 'SRV.Load average (5m avg)', 'SRV.Memory utilization', 'SRV.Number of processes', 'SRV.Number of running processes', 'GW1./: Free inodes in %', 'GW1./: Space utilization', 'GW1./: Used space', 'GW1./boot: Free inodes in %', 'GW1./boot: Space utilization', 'GW1./boot: Used space', 'GW1.Available memory', 'GW1.Available memory in %', 'GW1.CPU idle time', 'GW1.CPU iowait time', 'GW1.CPU softirq time

In [8]:
#metrics.head()

In [9]:
#bottlenecks.describe()

In [10]:
# w: window length, o: overlap between consecutive windows (both are passed to the
# feature extractor and to stride_axis0 in later cells).
w, o = 10, 0

# Only the first (train, test) index pair produced by CV.split is used.
train_indexes, test_indexes = next(CV.split(metrics, bottlenecks))

X_train, X_test = metrics.iloc[train_indexes], metrics.iloc[test_indexes]
y_train, y_test = bottlenecks.iloc[train_indexes], bottlenecks.iloc[test_indexes]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((29110, 104), (29110, 32), (9704, 104), (9704, 32))

In [None]:
import tsfel
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from skimage.util import view_as_windows
import warnings

# NOTE: this cell rebinds X_train / X_test from the raw metric frames to the tsfel
# feature frames, so it is not idempotent — re-run the split cell before running it
# again, otherwise features would be extracted from already-extracted features.
#X_train, X_test, y_train, y_test = train_test_split(metrics, bottlenecks, test_size=0.25, random_state=42)
# tsfel feature configuration restricted to the 'temporal' domain.
cfg = tsfel.get_features_by_domain(domain='temporal')
# Features are computed with window size `w` and overlap `o`.
# NOTE(review): the windows are taken over the rows of a CV split; if CV shuffles the
# samples, a window no longer holds consecutive observations — confirm how CV is defined.
X_train = tsfel.time_series_features_extractor(cfg, X_train, window_size=w, overlap=o) 
X_test = tsfel.time_series_features_extractor(cfg, X_test, window_size=w, overlap=o)

*** Feature extraction started ***


In [None]:
def stride_axis0(a, L, overlap):
    """Collapse a 2-D array into one row per sliding window along axis 0.

    Rows of ``a`` are grouped into windows of ``L`` consecutive rows, and
    successive windows start ``L - int(overlap * L)`` rows apart. Each window
    is summed column-wise and every sum greater than 1 is set to 1, so for
    0/1 inputs a column is 1 when it was set in at least one row of the window.

    Parameters
    ----------
    a : array-like
        2-D input of shape ``(n_rows, n_columns)``. It is not modified.
    L : int
        Window length, in rows.
    overlap : float
        Fraction of a window shared with the next one. ``int(overlap * L)``
        must be smaller than ``L``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, n_columns)``.

    Raises
    ------
    ValueError
        If ``a`` is not 2-D, if the overlap is not smaller than the window
        length, or if ``a`` has fewer than ``L`` rows.

    Warns
    -----
    UserWarning
        If the trailing rows of ``a`` do not fit into a complete window and are
        therefore left out.
    """
    a = np.asarray(a)
    if a.ndim != 2:
        raise ValueError("Expected a 2-D array, got %d dimension(s)" % a.ndim)

    overlap = int(overlap * L)
    # `>=` rather than `==`: an overlap larger than the window would yield a
    # negative stride.
    if overlap >= L:
        raise ValueError("Overlap arg must be smaller than length of windows")
    step = L - overlap

    n_rows = len(a)
    if n_rows < L:
        raise ValueError(
            "Window length (%d) is larger than the number of rows (%d)" % (L, n_rows)
        )

    n_windows = (n_rows - L) // step + 1
    # The last window starts at (n_windows - 1) * step; if it does not end on
    # the final row, some trailing rows are dropped.
    if (n_windows - 1) * step != n_rows - L:
        warnings.warn("Not all elements were covered")

    # row_index[i, j] is the j-th row of window i. Indexing with it gives an
    # array of shape (n_windows, L, n_columns), the same windows that
    # skimage's view_as_windows yields for full-width windows.
    row_index = step * np.arange(n_windows)[:, None] + np.arange(L)[None, :]
    windowed = a[row_index].sum(axis=1)
    windowed[windowed > 1] = 1
    return windowed

# Reduce the labels to one row per window, using the same window length `w` and
# overlap `o` that were passed to the feature extractor.
# NOTE: y_train / y_test are rebound from DataFrames to NumPy arrays here, so this
# cell cannot be re-run as-is (an ndarray has no `.values`).
y_train = stride_axis0(y_train.values, w, o)
y_test = stride_axis0(y_test.values, w, o)

In [None]:
# Sanity check: compare the shapes of the windowed features and the windowed labels.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
def fill_missing_values(df):
    """Zero out non-finite entries of ``df`` in place.

    Positive and negative infinity are first turned into NaN, then every NaN
    is replaced by ``0.0``. The frame passed in is modified and the very same
    object is returned.
    """
    infinities = [np.inf, -np.inf]
    # Fold +/-inf into NaN so that a single fill handles every bad value.
    df.replace(to_replace=infinities, value=np.nan, inplace=True)
    df.fillna(value=0.0, inplace=True)
    return df

# Feature extraction may leave NaN / inf values behind; replace them with zeros.
X_train, X_test = fill_missing_values(X_train), fill_missing_values(X_test)

# Drop the features tsfel reports as highly correlated. The list is computed on the
# training set only and the same columns are removed from the test set.
corr_features = tsfel.correlated_features(X_train)
X_train = X_train.drop(columns=corr_features)
X_test = X_test.drop(columns=corr_features)

# Variance-based selection, fitted on the training set only.
# NOTE(review): VarianceThreshold is used with its default threshold (0.0 per the
# scikit-learn docs, i.e. only constant features are removed) — confirm this is intended.
selector = VarianceThreshold()
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

# Scale the features, fitted on the training set only.
# NOTE: despite its name, `min_max_scaler` holds a StandardScaler.
min_max_scaler = preprocessing.StandardScaler()
nX_train = min_max_scaler.fit_transform(X_train)
nX_test = min_max_scaler.transform(X_test)

In [None]:
# Count the NaNs left in features and labels. This inspects X_train / X_test (the
# arrays before scaling), not nX_train / nX_test.
np.isnan(X_train).sum(),np.isnan(y_train).sum(),np.isnan(X_test).sum(),np.isnan(y_test).sum()

In [None]:
# Shapes of the scaled feature matrices and of the windowed labels.
nX_train.shape, y_train.shape, nX_test.shape , y_test.shape 

In [None]:
def print_metrics(y_test, y_pred):
    """Score predictions against the ground truth, print the scores and return them.

    Returns a dict with two entries, both rounded to ``ROUND`` decimals:
    ``'Subset Accuracy'`` (from ``accuracy_score``) and ``'MCC'`` (from
    ``user_defined_matthews_corrcoef``).
    """
    subset_accuracy = accuracy_score(y_test, y_pred)
    mcc = user_defined_matthews_corrcoef(y_test, y_pred)

    rest = {}
    rest['Subset Accuracy'] = round(subset_accuracy, ROUND)
    rest['MCC'] = round(mcc, ROUND)

    print(rest)
    return rest

In [None]:
# Module-level store of the latest scores per classifier; filled by train_and_plot.
results={}
def train_and_plot(X_train, y_train, X_test, y_test):
    """Fit every classifier in ``CLASSIFIERS`` and collect its test scores.

    For each ``(name, estimator)`` pair the estimator is fitted on copies of
    the training data, used to predict the test set, and scored with
    ``print_metrics``. Scores and the fit time (seconds, rounded to ``ROUND``
    decimals) are stored in the module-level ``results`` dict under the
    classifier's name.

    Side effects: the estimators held in ``CLASSIFIERS`` are fitted in place,
    ``results`` is updated, and progress is printed. Despite the name, no plot
    is produced at the moment (the ROC plotting calls are commented out).

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Test features and labels.

    Returns
    -------
    pandas.DataFrame
        ``results`` as a frame, one column per classifier.
    """
    for clf_name, clf in CLASSIFIERS.items():
        # Every classifier works on private copies of the data.
        train_x, train_y = np.copy(X_train), np.copy(y_train)
        test_x, test_y = np.copy(X_test), np.copy(y_test)
        print(80*'-')
        print('#',clf_name.ljust(16), end=' ')

        # perf_counter is a monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        clf.fit(train_x, train_y)
        tt = time.perf_counter() - start
        if tt > 60:
            print('>','fit_time:',tt/60,'minutes')
        else:
            print('>','time:',tt,'secondes')

        y_pred = clf.predict(test_x)
        # Some estimators return a sparse matrix; the scorers get a dense array.
        if hasattr(y_pred, 'toarray'):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        results[clf_name] = print_metrics(test_y, y_pred)
        results[clf_name]['fit_time'] = round(tt, ROUND)

        # ROC plotting is disabled; it needs class probabilities, which are only
        # worth computing when the plot is actually drawn:
        #y_pred_proba = clf.predict_proba(test_x)
        #fig = plt_roc_auc(clf_name, y_pred_proba, test_y, bottleneck_names)
        #fig.savefig(FIG_PATH + clf_name +"_roc_curve.pdf", bbox_inches='tight')
    return pd.DataFrame.from_dict(results)