In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.random import set_seed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, max_error, mean_absolute_error

In [2]:
# Carregar dados do CPU
machine = 'm_103.csv'
workload = pd.read_csv(machine, 
             names = ['machine_id', 'time_stamp', 'cpu', 'mem', 'mem_gps', 
                      'mkpi', 'net_in', 'net_out', 'disk_io_percent'])
workload.time_stamp = pd.to_datetime(workload.time_stamp, unit='s')
workload.set_index('time_stamp', inplace=True)
workload = workload.resample('5min').interpolate()
workload = workload[-12*24*3:] # 5min * 12 * 24 * 3 = 3 days
workload

Unnamed: 0_level_0,machine_id,cpu,mem,mem_gps,mkpi,net_in,net_out,disk_io_percent
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1970-01-06 00:00:00,m_103,17.0,95.0,,,41.43,29.40,2.0
1970-01-06 00:05:00,m_103,15.0,95.0,,,41.43,29.40,1.0
1970-01-06 00:10:00,m_103,26.0,95.0,,,41.43,29.40,2.0
1970-01-06 00:15:00,m_103,14.0,90.0,,,41.44,29.41,4.0
1970-01-06 00:20:00,m_103,21.0,93.0,,,41.44,29.41,3.0
...,...,...,...,...,...,...,...,...
1970-01-08 23:35:00,m_103,14.0,88.0,2.02,1.0,45.88,32.63,2.0
1970-01-08 23:40:00,m_103,16.0,89.0,1.90,0.0,45.88,32.63,2.0
1970-01-08 23:45:00,m_103,12.0,88.0,1.28,0.0,45.89,32.63,2.0
1970-01-08 23:50:00,m_103,13.0,89.0,2.11,1.0,45.89,32.64,1.0


In [3]:
# grid search lstm for monthly airline passengers dataset
import datetime
from math import sqrt
from numpy import array
from numpy import mean
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from tensorflow.random import set_seed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

# transform list into supervised learning format
def series_to_supervised(data_arr, look_back):
    X, Y =[], []
    for i in range(len(data_arr)-look_back):
        d=i+look_back
        X.append(data_arr[i:d,])
        Y.append(data_arr[d,])
    return np.array(X), np.array(Y)

# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

# difference dataset
def difference(data, order):
    return [data[i] - data[i - order] for i in range(order, len(data))]

# fit a model
def model_fit(train, config):
    # unpack config
    n_input, n_nodes, n_epochs, n_batch, n_diff = config
    # prepare data
    if n_diff > 0:
        train = difference(train, n_diff)
    # transform series into supervised format
    #data = series_to_supervised(train, n_in=n_input)
    # separate inputs and outputs
    train_x, train_y = series_to_supervised(train, n_input)#data[:, :-1], data[:, -1]
    # reshape input data into [samples, timesteps, features]
    n_features = 1
    train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
    # define model
    model = Sequential()
    model.add(LSTM(n_nodes, activation='relu', input_shape=(n_input, n_features)))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    # fit model
    model.fit(train_x, train_y, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return model

# forecast with the fit model
def model_predict(model, history, config):
    # unpack config
    n_input, _, _, _, n_diff = config
    # prepare data
    correction = 0.0
    if n_diff > 0:
        correction = history[-n_diff]
        history = difference(history, n_diff)
    # reshape sample into [samples, timesteps, features]
    x_input = array(history[-n_input:]).reshape((1, n_input, 1))
    # forecast
    yhat = model.predict(x_input, verbose=0)
    return correction + yhat[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # fit model
    model = model_fit(train, cfg)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # fit model and make forecast for history
        yhat = model_predict(model, history, cfg)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
    
    # reverse scale
    predictions = scaler.inverse_transform(predictions)
    test = scaler.inverse_transform(test)
    
    # estimate prediction error
    error = measure_rmse(test, predictions)
    #print(' > %.3f' % error)
    return error

# score a model, return None on failure
def repeat_evaluate(data, config, n_test):
    # convert config to a key
    key = str(config)
    # fit and evaluate the model n times
    scores = [walk_forward_validation(data, n_test, config) for _ in range(config[2])]
    # summarize score
    result = mean(scores)
    print('> Model[%s] %.3f' % (key, result))
    return (key, result)

# grid search configs
def grid_search(data, cfg_list, n_test):
    # evaluate configs
    scores = scores = [repeat_evaluate(data, cfg, n_test) for cfg in cfg_list]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores

# create a list of configs to try
def model_configs():
    # define scope of configs
    n_input = [30,12]
    n_nodes = [5,11,15]
    n_epochs = [10,20,30]
    n_batch = [11,21,30]
    n_diff = [0]
    # create configs
    configs = list()
    for i in n_input:
        for j in n_nodes:
            for k in n_epochs:
                for l in n_batch:
                    for m in n_diff:
                        cfg = [i, j, k, l, m]
                        configs.append(cfg)
    print('Total configs: %d' % len(configs))
    return configs

# define dataset
data = workload[['cpu']].values
scaler = MinMaxScaler(feature_range=(0, 1))#LTSM is senstive to the scale of features
data = scaler.fit_transform(data)

# disable random weigths
np.random.seed(1234)
set_seed(1234)

# data split
n_test = int(len(workload)*1/3)
# model configs
cfg_list = model_configs()
# grid search
start = datetime.datetime.now()
scores = grid_search(data, cfg_list, n_test)
print('done')
lapsed = datetime.datetime.now() - start
print('Tempo: '+str(lapsed).split('.')[0])
# list top 10 configs
for cfg, error in scores[:3]:
    print(cfg, error)

Total configs: 54
> Model[[30, 5, 10, 11, 0]] 8.932
> Model[[30, 5, 10, 21, 0]] 9.307
> Model[[30, 5, 10, 30, 0]] 9.163
> Model[[30, 5, 20, 11, 0]] 9.185
> Model[[30, 5, 20, 21, 0]] 8.937
> Model[[30, 5, 20, 30, 0]] 8.852
> Model[[30, 5, 30, 11, 0]] 8.668
> Model[[30, 5, 30, 21, 0]] 8.727
> Model[[30, 5, 30, 30, 0]] 9.288
> Model[[30, 11, 10, 11, 0]] 8.403
> Model[[30, 11, 10, 21, 0]] 9.087
> Model[[30, 11, 10, 30, 0]] 9.308
> Model[[30, 11, 20, 11, 0]] 8.402
> Model[[30, 11, 20, 21, 0]] 8.384
> Model[[30, 11, 20, 30, 0]] 8.433
> Model[[30, 11, 30, 11, 0]] 8.416
> Model[[30, 11, 30, 21, 0]] 8.313
> Model[[30, 11, 30, 30, 0]] 8.316
> Model[[30, 15, 10, 11, 0]] 8.318
> Model[[30, 15, 10, 21, 0]] 8.442
> Model[[30, 15, 10, 30, 0]] 8.383
> Model[[30, 15, 20, 11, 0]] 8.717
> Model[[30, 15, 20, 21, 0]] 8.370
> Model[[30, 15, 20, 30, 0]] 8.395
> Model[[30, 15, 30, 11, 0]] 8.195
> Model[[30, 15, 30, 21, 0]] 8.182
> Model[[30, 15, 30, 30, 0]] 8.232
> Model[[12, 5, 10, 11, 0]] 8.757
> Model[[12,