In [12]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from joblib import dump

from aqosd_experiments.config import CLASSIFIERS, PARAM_GRIDS, RAW_DATASET_PATH, HOST_LIST, CV, MODELS_PATH, FIG_PATH
from aqosd_experiments.data import import_and_prepare_data, scale_metrics, over_sampling
from aqosd_experiments.plot import plot_number_of_instance, plot_osdm, plt_long_stats, plt_corr_metrics, \
    plot_multicollinear_metrics, plt_corr_bottlenecks, plt_all_data
from aqosd_experiments.scorers import process_score, SCORING,_hamming_loss_wrapper,_coverage_error_wrapper,\
    _label_ranking_loss_wrapper
from osms import OverheadSensitiveMetricSelection
from sklearn.metrics import *

import matplotlib.pyplot as plt
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import warnings
from skimage.util import view_as_windows
from tensorflow.python.keras.layers import Dense, LSTM, Conv1D
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
save=False

In [3]:
metrics, bottlenecks = import_and_prepare_data(RAW_DATASET_PATH, HOST_LIST)
print('Shape of metrics : ',metrics.shape,'\t','Shape of bottlenecks : ',bottlenecks.shape) 
print('Label cardinality = %.5f \t Label density = %.5f' % (bottlenecks.sum(axis=1).mean(),bottlenecks.mean(axis=1).mean()))

Shape of metrics :  (97343, 105) 	 Shape of bottlenecks :  (97343, 32)
Label cardinality = 1.96252 	 Label density = 0.06133


In [4]:
metric_names, bottleneck_names = list(metrics.columns), list(bottlenecks.columns)
print(metric_names)
print(100*'-')
print(bottleneck_names)

['SRV./: Free inodes in %', 'SRV./: Space utilization', 'SRV./: Used space', 'SRV./boot: Free inodes in %', 'SRV./boot: Space utilization', 'SRV./boot: Used space', 'SRV.Available memory', 'SRV.Available memory in %', 'SRV.CPU idle time', 'SRV.CPU iowait time', 'SRV.CPU softirq time', 'SRV.CPU system time', 'SRV.CPU user time', 'SRV.CPU utilization', 'SRV.Context switches per second', 'SRV.Free swap space', 'SRV.Free swap space in %', 'SRV.Interface enp0s8: Bits received', 'SRV.Interface enp0s8: Bits sent', 'SRV.Interrupts per second', 'SRV.Load average (15m avg)', 'SRV.Load average (1m avg)', 'SRV.Load average (5m avg)', 'SRV.Memory utilization', 'SRV.Number of processes', 'SRV.Number of running processes', 'GW1./: Free inodes in %', 'GW1./: Space utilization', 'GW1./: Used space', 'GW1./boot: Free inodes in %', 'GW1./boot: Space utilization', 'GW1./boot: Used space', 'GW1.Available memory', 'GW1.Available memory in %', 'GW1.CPU idle time', 'GW1.CPU iowait time', 'GW1.CPU softirq time

In [17]:
def multi_label(y):
    y = np.sum(y, axis=1)
    y[y > 1] = 1
    return y

In [31]:
from seglearn.split import TemporalKFold

# temporal splitting of data
splitter = TemporalKFold(n_splits=2)
X, y, cv = splitter.split(metrics.values, bottlenecks.values)
#X_train, X_test, y_train, y_test = temporal_split([metrics.values], [bottlenecks.values], test_size=0.25)
#print('Shape  : ',X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [32]:
nv, nc = len(metric_names), len(bottleneck_names)

def crnn_model(width,  n_vars=nv, n_classes=nc, conv_kernel_size=5, conv_filters=2, lstm_units=2):
    input_shape = (width, n_vars)
    model = Sequential()
    model.add(Conv1D(filters=conv_filters, kernel_size=conv_kernel_size,
                     padding='valid', activation='relu', input_shape=input_shape))
    model.add(LSTM(units=lstm_units, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(n_classes, activation="sigmoid"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [33]:
windows= [ 5, 10,20,30,60][::-1]

print(100*'-')
print('#',str(w).ljust(3),'|', end=' ')
steps = [('seg', Segment(order='C', width=w, y_func=multi_label)),
         ('crnn', KerasClassifier(build_fn=crnn_model, epochs=10, batch_size=256, verbose=0))]
pipe = Pype(steps)

par_grid = {'seg__width': windows, 'seg__overlap': [0.], 'crnn__width': ['seg__width']}
clf = GridSearchCV(pipe, par_grid, cv=cv, verbose=2)
clf.fit(X, y)

----------------------------------------------------------------------------------------------------
# 60  | Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=60 .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=60, total=  19.3s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=60 .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.2s remaining:    0.0s


[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=60, total=  18.4s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=60 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=60, total=  18.0s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=30 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=30, total=  10.7s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=30 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=30, total=  11.2s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=30 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=30, total=  14.5s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=20 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=20, total=  10.3s
[CV] crnn__width=seg__width, seg__overlap=0.0, seg__width=20 .........
[CV]  crnn__width=seg__width, seg__overlap=0.0, seg__width=20, total=   9.1s
[CV] crnn__width=seg__width, seg__o

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  2.7min finished


UnboundLocalError: local variable 'logs' referenced before assignment

In [None]:
scores = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
plt.plot(par_grid['seg__width'], scores, '-.')
plt.title("Grid Search Scores")
plt.xlabel("Width [s]")
plt.ylabel("CV Average Score")
plt.fill_between(par_grid['seg__width'], scores - stds, scores + stds, alpha=0.2, color='navy')
plt.show()