In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
import os, json, pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler

from helpers.DataSaham import DataSaham

Model yang akan digunakan adalah model multi-step time series. Model tersebut akan memprediksi 5 hari ke depan dengan pendekatan one vector output (bukan autoregressive). Model tersebut akan dilatih ulang setiap hari agar dapat mempelajari pola dari data yang baru ditambahkan. Oleh karena itu, tujuan pemodelan kali ini adalah mencari arsitektur model yang performanya paling bagus seiring bertambahnya waktu, karena tidak mungkin melakukan hyperparameter tuning yang sangat memakan waktu setiap kali data bertambah.

In [2]:
# Build a dummy frame of 10 variables and 100 consecutive observations:
# the cells simply count 1, 2, 3, ... row by row, so every column is a
# perfectly regular sequence and the last column plays the role of the target.

n_var  = 10
n_row = 100

col_names = [f"col_{i}" for i in range(1, n_var)] + ['target']
dummy = pd.DataFrame(
    np.arange(1, n_var * n_row + 1).reshape(n_row, n_var),
    columns=col_names
)
dummy.head(5)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,target
0,1,2,3,4,5,6,7,8,9,10
1,11,12,13,14,15,16,17,18,19,20
2,21,22,23,24,25,26,27,28,29,30
3,31,32,33,34,35,36,37,38,39,40
4,41,42,43,44,45,46,47,48,49,50


In [3]:
class TSMultistepSplit():
    """Expanding-window splitter for multi-step time-series validation.

    Each split yields a pair of positional index arrays:

    * train: ``indices[:end_train]`` -- everything before the fold boundary;
    * test:  the ``look_back`` rows immediately before the boundary followed by
      the ``n_step`` rows after it, i.e. exactly one input window plus its
      multi-step target.

    The boundary advances by one row per split and the last split's test
    block ends on the final sample.

    Parameters
    ----------
    n_splits : int
        Number of (train, test) pairs to generate.
    n_step : int
        Number of future rows contained in each test block.
    look_back : int
        Number of past rows prepended to each test block.
    """

    def __init__(self, n_splits, n_step, look_back):
        self.n_splits = n_splits
        self.n_step = n_step
        self.look_back = look_back

    def split(self, X):
        """Yield ``(train_indices, test_indices)`` for every split.

        Parameters
        ----------
        X : sized object
            Only ``len(X)`` is used.

        Yields
        ------
        tuple of np.ndarray
            Positional indices of the train rows and of the test rows.

        Raises
        ------
        ValueError
            If the first training fold is shorter than ``look_back + n_step``
            rows, i.e. too short to contain a single complete window. Because
            this is a generator, the error surfaces on the first iteration.
        """
        import warnings

        n_samples = len(X)
        indices = np.arange(n_samples)
        n_train = n_samples - (self.n_step + self.n_splits) + 1
        min_window = self.look_back + self.n_step

        # Below this size the train fold cannot hold even one window, and for
        # n_train < look_back the test slice start would go negative and wrap
        # around, silently returning the wrong rows.
        if n_train < min_window:
            raise ValueError(
                f"Sample size is not enough to make train data: {n_samples} samples "
                f"leave {n_train} training rows for the first split, but at least "
                f"{min_window} (look_back + n_step) are required"
            )

        # Softer threshold kept from the original check: the split is valid
        # but the first training folds are very small.
        if n_train < min_window + self.n_splits:
            warnings.warn(
                "Sample size is barely enough to make train data: "
                f"only {n_train} training rows in the first split"
            )

        for i in range(self.n_splits):
            end_train = n_train + i
            yield indices[:end_train], indices[end_train - self.look_back:end_train + self.n_step]
        

class DataStore():
    """Bundle a data frame with its scalers and its windowed (x, y) arrays.

    Parameters
    ----------
    data : pd.DataFrame
        Source rows; a copy is stored on the instance.
    target_column : str
        Name of the column used as the prediction target.
    look_back : int
        Number of consecutive rows in each input window.
    n_steps : int
        Number of consecutive target values in each output vector.
    format : str, optional
        When equal to ``"rnn"`` the windowed arrays are built immediately.
    scaler_x, scaler_y : optional
        Scalers to use as-is; a supplied scaler is never fitted here.
    default_scaler : callable
        Factory called (without arguments) for each scaler that was not
        supplied; the resulting scaler is fitted on this object's data.
    """

    def __init__(
        self, data, target_column, 
        look_back, n_steps, format=None,
        scaler_x=None, scaler_y=None, 
        default_scaler=MinMaxScaler
    ):
        self.data = data.copy()
        self.target_column = target_column
        self.look_back = look_back
        self.n_steps = n_steps
        # NOTE(review): this is one less than the column count, whereas the
        # feature matrix built in this class keeps every column -- confirm
        # which of the two is intended.
        self.n_features = data.shape[1] - 1

        # Remember which scalers came from the caller: those are only used
        # for transform, never fitted.
        self.defined_scaler_x = scaler_x is not None
        self.defined_scaler_y = scaler_y is not None
        self.scaler_x = scaler_x if self.defined_scaler_x else default_scaler()
        self.scaler_y = scaler_y if self.defined_scaler_y else default_scaler()

        if format == "rnn":
            self.format_for_rnn()

    def save(self, path):
        """Pickle this whole object to ``path``."""
        with open(path, 'wb') as handle:
            pickle.dump(self, handle, pickle.HIGHEST_PROTOCOL)

    def format_for_rnn(self):
        """Populate ``x``/``y`` and ``scaled_x``/``scaled_y`` with windowed arrays."""
        feature_pair, target_pair = self.__get_independent_dependent_data(
            self.data, self.target_column
        )
        raw_features, scaled_features = feature_pair
        raw_targets, scaled_targets = target_pair

        self.x, self.y = self.__lstm_output_vector(raw_features, raw_targets)
        self.scaled_x, self.scaled_y = self.__lstm_output_vector(
            scaled_features, scaled_targets
        )

    def __get_independent_dependent_data(self,
                                       data : pd.DataFrame, 
                                       target_column : str) -> tuple:
        """Return ``((x, scaled_x), (y, scaled_y))`` for the given frame.

        ``x`` is the full frame as an array (all columns); ``y`` is the target
        column as a single-column 2-D array. Scalers that were not supplied
        by the caller are fitted here before transforming.
        """
        features = np.array(data)
        targets = np.array(data[target_column]).reshape(-1, 1)

        if not self.defined_scaler_x:
            self.scaler_x = self.scaler_x.fit(features)
        if not self.defined_scaler_y:
            self.scaler_y = self.scaler_y.fit(targets)

        scaled_features = self.scaler_x.transform(features)
        # squeeze() drops the length-1 column axis of the scaled target,
        # unlike the raw target which stays 2-D.
        scaled_targets = self.scaler_y.transform(targets).squeeze()

        return (features, scaled_features), (targets, scaled_targets)

    def __lstm_output_vector(self, data_x: np.ndarray,  data_y: np.ndarray) -> tuple:
        """Slice the series into sliding (input window, output vector) pairs.

        Window ``k`` takes rows ``k .. k+look_back-1`` of ``data_x`` as input
        and the following ``n_steps`` entries of ``data_y`` as output; windows
        that would run past the end of the data are not produced.
        """
        total_rows = len(data_x)
        span = self.look_back + self.n_steps
        # Number of start positions whose full span still fits in the data
        # (never more than one per row, never negative).
        window_count = max(0, min(total_rows, total_rows - span + 1))

        inputs = []
        outputs = []
        for start in range(window_count):
            boundary = start + self.look_back
            inputs.append(data_x[start:boundary, :])
            outputs.append(data_y[boundary:boundary + self.n_steps])

        return np.array(inputs), np.array(outputs)

In [61]:
class MyTuner(kt.Tuner):
    """Tuner base class that persists per-trial artifacts in the trial directory.

    For each trial it can store/reload the Keras model (``model.h5``), pickled
    data stores (``data_<type>.pkl``) and a metrics file (``metrics.json``).

    ``_data_store`` reads ``self.target_column`` and ``self.n_steps``, which
    this class does not set itself; a subclass has to define them.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __get_folder(self, trial_id, fname):
        """Return the path of ``fname`` inside the directory of ``trial_id``."""
        return os.path.join(self.get_trial_dir(trial_id), fname)

    def __load_pickle(self, trial_id, fname):
        """Unpickle ``fname`` from the directory of ``trial_id``.

        SECURITY: unpickling executes arbitrary code; only use this on trial
        directories written by this tuner, never on untrusted files.
        """
        with open(self.__get_folder(trial_id, fname), 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def save_model(self, trial_id, model, step=0):
        """Save ``model`` as ``model.h5`` in the trial directory (``step`` is unused)."""
        fname = self.__get_folder(trial_id, "model.h5")
        model.save(fname)

    def _save_data(self, trial_id, data_store, type_data, step=0):
        """Save ``data_store`` as ``data_<type_data>.pkl`` (``step`` is unused)."""
        fname = self.__get_folder(trial_id, f"data_{type_data}.pkl")
        data_store.save(fname)

    def _save_metrics(self, trial_id, metrics):
        """Write ``metrics`` as JSON to ``metrics.json`` in the trial directory."""
        fname = self.__get_folder(trial_id, "metrics.json")
        with open(fname, 'w') as file:
            json.dump(metrics, file)

    def __load_model(self, trial_id):
        fname = self.__get_folder(trial_id, "model.h5")
        return tf.keras.models.load_model(fname)

    def load_model(self, trial):
        """Reload the model saved for ``trial``."""
        return self.__load_model(trial.trial_id)

    def _get_callbacks(self):
        """Return fresh callbacks: LR reduction on plateau, then early stopping.

        Both monitor ``val_loss``; early stopping restores the best weights.
        """
        early_stopping = EarlyStopping(
            monitor="val_loss", patience=25, restore_best_weights=True
        )

        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=15, min_lr=1e-7
        )

        return [reduce_lr, early_stopping]

    def _data_store(
        self, data, index, look_back,
        scaler_x=None, scaler_y=None
    ):
        """Build an RNN-formatted ``DataStore`` from the rows of ``data`` at ``index``.

        ``index`` is used positionally (``iloc``). Scalers passed here are
        handed to the ``DataStore`` unchanged.
        """
        return DataStore(
            data=data.iloc[index, :],
            target_column=self.target_column,
            look_back=look_back,
            n_steps=self.n_steps,
            scaler_x=scaler_x,
            scaler_y=scaler_y,
            format="rnn"
        )

    def get_metrics(self, errors):
        """Summarise an array of errors as RMSE and MAE.

        Parameters
        ----------
        errors : array-like
            Error values; ``axis=0`` is averaged out for the ``*_eachday``
            entries.

        Returns
        -------
        dict
            ``rmse_total`` / ``mae_total`` as built-in floats and
            ``rmse_eachday`` / ``mae_eachday`` as plain Python lists.
        """
        squared_errors = np.power(errors, 2)
        abs_errors = np.abs(errors)
        return {
            # Cast to built-in float so the totals stay JSON-serialisable
            # whatever the dtype of the incoming array is.
            'rmse_total' : float(np.sqrt(squared_errors.mean())),
            'rmse_eachday' : np.sqrt(squared_errors.mean(axis=0)).tolist(),
            'mae_total' : float(abs_errors.mean()),
            'mae_eachday' : abs_errors.mean(axis=0).tolist()
        }

    def get_file_in_directory(self, best_trial=True, trial_id=None):
        """Reload everything that was saved for one trial.

        Parameters
        ----------
        best_trial : bool
            When true (default) the oracle's best trial is used and
            ``trial_id`` is ignored.
        trial_id : str, optional
            Trial to load when ``best_trial`` is false.

        Returns
        -------
        tuple
            ``(model, data_train, data_test, metrics)``.

        Raises
        ------
        ValueError
            If ``best_trial`` is false and no ``trial_id`` is given.
        """
        if best_trial:
            trial_id = self.oracle.get_best_trials()[0].trial_id
        elif trial_id is None:
            # Fail early instead of building a path for a non-existent trial.
            raise ValueError("trial_id must be provided when best_trial is False")

        data_train = self.__load_pickle(trial_id, "data_train.pkl")
        data_test = self.__load_pickle(trial_id, "data_test.pkl")

        path_metrics = self.__get_folder(trial_id, "metrics.json")
        with open(path_metrics, 'r') as file:
            metrics = json.load(file)

        model = self.__load_model(trial_id)
        return model, data_train, data_test, metrics
        

class WFVTuner(MyTuner):
    """Tuner that scores each trial by walk-forward validation.

    For every trial the data is split ``n_splits`` times with
    ``TSMultistepSplit``; a fresh model is trained on each expanding train
    fold and its ``n_steps``-ahead errors on the following test block are
    collected. The overall RMSE of those errors is reported to the oracle
    as ``val_rmse``.

    Parameters
    ----------
    target_column : str
        Name of the target column in the data passed to ``search``.
    n_splits : int
        Number of walk-forward folds per trial.
    n_steps : int
        Forecast horizon (size of the model output vector).
    **kwargs
        Forwarded to the parent tuner.
    """

    def __init__(self, target_column, n_splits, n_steps, **kwargs):
        self.target_column = target_column
        self.n_splits = n_splits
        self.n_steps = n_steps
        super().__init__(**kwargs)

    def __get_data(self, data, train_indices, test_indices, look_back):
        """Build the train/test data stores for one fold.

        The test store reuses the scalers of the train store, so the test
        rows never take part in fitting the scalers.
        """
        data_train = self._data_store(data, train_indices, look_back)
        data_test = self._data_store(
            data, test_indices, look_back,
            scaler_x=data_train.scaler_x,
            scaler_y=data_train.scaler_y
        )
        return data_train, data_test

    def get_prediction(self, model, x, scaler=None):
        """Predict with ``model`` and optionally map back to the original scale.

        When ``scaler`` is given the prediction is flattened to a single
        column and passed through ``scaler.inverse_transform``; otherwise the
        raw ``model.predict`` output is returned.
        """
        prediction = model.predict(x)
        if scaler is not None:
            prediction = np.array(prediction).reshape(-1, 1)
            prediction = scaler.inverse_transform(prediction)
        return prediction

    def run_trial(self, trial, x, y, *args, **kwargs):
        """Train and evaluate one hyperparameter configuration.

        Parameters
        ----------
        trial : keras_tuner trial
            Provides the hyperparameters and the trial id.
        x : pd.DataFrame
            Full data set, features and target column together.
        y, *args, **kwargs
            Accepted but not used.

        Raises
        ------
        ValueError
            If the splitter produced no folds (e.g. ``n_splits`` is 0).
        """
        hp = trial.hyperparameters
        look_back = hp.Choice('look_back', [5, 10, 20, 30])
        # Start the range at 8: 0 is not a valid batch size, and if the
        # framework substituted its own default for a falsy value the
        # recorded hyperparameter would no longer match what was trained.
        batch_size = hp.Int('batch_size', 8, 64, step=8)
        epochs = 100

        tss = TSMultistepSplit(
            n_splits=self.n_splits, n_step=self.n_steps, look_back=look_back
        )

        errors = []
        for train_indices, test_indices in tss.split(x):
            data_train, data_test = self.__get_data(
                x, train_indices, test_indices, look_back
            )

            # A new model per fold; the held-out block doubles as the
            # validation data watched by the callbacks.
            model = self.hypermodel.build(hp)
            model.fit(
                data_train.scaled_x, data_train.scaled_y,
                shuffle=False, batch_size=batch_size,
                epochs=epochs, verbose=2,
                validation_data=(data_test.scaled_x, data_test.scaled_y),
                callbacks=self._get_callbacks()
            )

            prediction = self.get_prediction(model, data_test.scaled_x, data_test.scaler_y)
            # Flatten the targets to the same single-column layout as the
            # prediction instead of relying on implicit broadcasting.
            error = (prediction - data_test.y.reshape(-1, 1)).reshape(-1)
            errors.append(error)

        if not errors:
            raise ValueError("No validation folds were produced; n_splits must be at least 1")

        errors = np.array(errors)
        metrics = self.get_metrics(errors)

        trial_id = trial.trial_id
        # NOTE(review): newer KerasTuner releases may expect run_trial to
        # return the metrics rather than update the oracle here -- confirm
        # against the installed version.
        self.oracle.update_trial(trial_id, {'val_rmse': metrics['rmse_total']})

        # Only the artifacts of the last fold are persisted.
        self.save_model(trial_id, model)
        self._save_data(trial_id, data_train, "train")
        self._save_data(trial_id, data_test, "test")

        metrics['errors'] = errors.tolist()
        self._save_metrics(trial_id, metrics)

In [62]:
class MyHyperModel(kt.HyperModel):
    """Hypermodel: one LSTM layer followed by a Dense layer of ``n_steps`` units.

    Parameters
    ----------
    n_features : int
        Size of the last axis of the model input.
    n_steps : int
        Number of output units (one per forecast step).
    **kwargs
        Forwarded to ``kt.HyperModel``.
    """

    # Window length used when the hyperparameter set has no usable
    # 'look_back' entry (that hyperparameter is registered outside build()).
    DEFAULT_LOOK_BACK = 10

    def __init__(self, n_features, n_steps, **kwargs):
        self.n_features = n_features
        self.n_steps = n_steps
        super().__init__(**kwargs)

    def build(self, hp):
        """Build and compile the model for the hyperparameters in ``hp``.

        Tunes ``units`` (100..500, step 100). The input window length is
        read from the ``look_back`` hyperparameter when available, otherwise
        ``DEFAULT_LOOK_BACK`` is used. Compiled with Adam (lr 0.001) and MSE.
        """
        try:
            look_back = hp.get('look_back')
        except (KeyError, ValueError):
            # Catch only the lookup failure (unknown or inactive name), so
            # interrupts and real errors are no longer swallowed.
            look_back = self.DEFAULT_LOOK_BACK

        model = Sequential()
        model.add(LSTM(
            units=hp.Int('units', 100, 500, step=100),
            input_shape=(look_back, self.n_features),
            activation='tanh'
        ))

        model.add(Dense(self.n_steps))
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="mse",
        )
        return model

In [64]:
# Forecast horizon for this run, and the model input width: the count of
# all columns in `dummy`, target included.
n_steps = 10
n_features = len(dummy.columns)

hypermodel = MyHyperModel(n_features=n_features, n_steps=n_steps)

# Bayesian optimisation over three trials, minimising the 'val_rmse' value
# that the tuner reports for each trial.
oracle = kt.oracles.BayesianOptimization(
    objective=kt.Objective(name='val_rmse', direction='min'),
    max_trials=3,
)

# Walk-forward tuner: 10 folds per trial, artifacts under model/one_vector.
tuner = WFVTuner(
    oracle=oracle,
    hypermodel=hypermodel,
    target_column='target',
    n_splits=10,
    n_steps=n_steps,
    directory="model",
    project_name="one_vector",
)

In [65]:
# Run the hyperparameter search on the whole dummy frame. `y` is passed as
# None because WFVTuner.run_trial never reads it (the target column is taken
# from the frame itself).
tuner.search(dummy, y=None)

Trial 3 Complete [00h 01m 59s]
val_rmse: 28.060833827405702

Best val_rmse So Far: 17.28286298322715
Total elapsed time: 00h 04m 57s
INFO:tensorflow:Oracle triggered exit


In [66]:
# Reload the artifacts saved for the best trial (the default of
# get_file_in_directory), then display the stored errors of the last
# validation split.
model, data_train, data_test, metrics = tuner.get_file_in_directory()
metrics['errors'][-1]

[-24.0966796875,
 -30.692138671875,
 -28.5023193359375,
 -13.79547119140625,
 -29.8031005859375,
 -14.0010986328125,
 -26.9931640625,
 -20.8192138671875,
 -0.85589599609375,
 -23.7109375]

In [67]:
# Sanity check: recompute the errors from the reloaded model and test data;
# they should match the stored errors of the last split shown above.
prediction = tuner.get_prediction(model, data_test.scaled_x, data_test.scaler_y)
(prediction - data_test.y).reshape(-1)

array([-24.09667969, -30.69213867, -28.50231934, -13.79547119,
       -29.80310059, -14.00109863, -26.99316406, -20.81921387,
        -0.855896  , -23.7109375 ])