# Testing Data

This notebook applies the best models to the test data. In addition, two models from other studies are applied to the same data.

In [1]:
import ast
import os, json, pickle, time

import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler

from IPython.display import clear_output

## Define Class

In [2]:
class TSMultistepSplit:
    """
    Index generator for Walk-Forward Validation with a sliding
    window of 1.

    Each fold extends the training span by one sample. The samples
    that follow the training span are handed out either as a single
    test window, or as an early-stopping window followed by a test
    window (see `with_early_stopping_set` in `split`). Every window
    after the training span covers `look_back + n_steps` positions.
    """

    def __init__(self, n_splits, n_steps, look_back):
        """
        Args:
            n_splits (int): Number of folds to generate.

            n_steps (int): Number of steps to forecast. A value
            larger than 1 means multi-step forecasting.

            look_back (int): Number of past samples used as input
            for one forecast.
        """
        self.n_splits = n_splits
        self.n_steps = n_steps
        self.look_back = look_back

    def split(self, X, with_early_stopping_set=True):
        """
        Yield positional index arrays for every fold.

        With `with_early_stopping_set=True` each fold is a tuple
        `(train, early_stopping, test)`, otherwise `(train, test)`.
        Only `len(X)` is used; the content of `X` is never read.

        ```python

            tss = TSMultistepSplit(n_splits=3, n_steps=5, look_back=10)
            splits = tss.split(x, with_early_stopping_set=True)
            for train_indices, early_stopping_indices, test_indices in splits:
                print(train_indices, early_stopping_indices, test_indices)
                # your code
        ```
        """
        total = len(X)
        positions = np.arange(total)

        # Samples reserved after the training span: one forecast
        # horizon for testing, plus one more for early stopping.
        if with_early_stopping_set:
            held_out = self.n_steps * 2
        else:
            held_out = self.n_steps

        first_train_len = total - (held_out + self.n_splits) + 1
        required_len = self.n_steps + self.look_back + self.n_splits
        if first_train_len < required_len:
            # Only a warning: the folds are still generated.
            print("Sample size don't enough to make train data")

        for fold in range(self.n_splits):
            train_stop = first_train_len + fold
            window_start = train_stop - self.look_back
            window_stop = train_stop + self.n_steps

            train_part = positions[:train_stop]
            # Window directly after the training span; it starts
            # `look_back` positions early so inputs can be built.
            next_part = positions[window_start:window_stop]

            if not with_early_stopping_set:
                yield train_part, next_part
                continue

            # The test window is the same window shifted by `n_steps`.
            test_part = positions[
                window_start + self.n_steps:window_stop + self.n_steps
            ]
            yield train_part, next_part, test_part


class DataStore():
    """
    Holds a data set together with its scalers and, when requested,
    the windowed arrays used by RNN models such as LSTM.
    """

    def __init__(
        self, data, target_column,
        look_back, n_steps, format=None,
        scaler_x=None, scaler_y=None,
        default_scaler=MinMaxScaler
    ):
        """
        Args:
            data: Data to store. It is copied, and must contain the
            column named by `target_column`.

            target_column: Name of the dependent variable.

            look_back: Number of past rows used as input for one forecast.

            n_steps: Number of rows to forecast. A value larger than 1
            means multi-step forecasting.

            format (optional): Only "rnn" is supported; any other value
            leaves the windowed arrays unbuilt. Defaults to None.

            scaler_x (optional): Already fitted object with a `transform`
            method, applied to the input data. When None, a new
            `default_scaler()` is created and fitted on `data`.
            Defaults to None.

            scaler_y (optional): Already fitted object with a `transform`
            method, applied to the target. When None, a new
            `default_scaler()` is created and fitted on the target
            column. Defaults to None.

            default_scaler (optional): Scaler class providing `fit` and
            `transform`. Defaults to MinMaxScaler.
        """
        self.data = data.copy()
        self.target_column = target_column
        self.look_back = look_back
        self.n_steps = n_steps
        self.n_features = data.shape[1] - 1

        # Remember whether the caller supplied a scaler: supplied
        # scalers are used as-is, fresh ones are fitted later.
        self.defined_scaler_x = scaler_x is not None
        self.defined_scaler_y = scaler_y is not None
        self.scaler_x = scaler_x if self.defined_scaler_x else default_scaler()
        self.scaler_y = scaler_y if self.defined_scaler_y else default_scaler()

        if format == "rnn":
            self.format_for_rnn()

    def save(self, path):
        """
        Pickle this object into the file at `path`.
        """
        with open(path, 'wb') as handle:
            pickle.dump(self, handle, pickle.HIGHEST_PROTOCOL)

    def format_for_rnn(self):
        """
        Build the windowed arrays `x`, `y`, `scaled_x` and `scaled_y`.
        The input arrays have 3 dimensions, namely
        (n_data, look_back, n_columns).
        """
        inputs, targets = self.__get_independent_dependent_data(
            self.data, self.target_column
        )
        raw_inputs, scaled_inputs = inputs
        raw_targets, scaled_targets = targets

        self.x, self.y = self.__lstm_output_vector(raw_inputs, raw_targets)
        self.scaled_x, self.scaled_y = self.__lstm_output_vector(
            scaled_inputs, scaled_targets
        )

    def __get_independent_dependent_data(self,
                                       data : pd.DataFrame,
                                       target_column : str) -> tuple:
        """
        Return `((x, scaled_x), (y, scaled_y))`. `x` holds every column
        of `data` (the target column included) and `y` the target
        column as a single-column array. Scalers that were not supplied
        by the caller are fitted here.
        """
        inputs = np.array(data)
        target = np.array(data[target_column]).reshape(-1, 1)

        if not self.defined_scaler_x:
            self.scaler_x = self.scaler_x.fit(inputs)
        if not self.defined_scaler_y:
            self.scaler_y = self.scaler_y.fit(target)

        scaled_inputs = self.scaler_x.transform(inputs)
        scaled_target = self.scaler_y.transform(target).squeeze()

        return (inputs, scaled_inputs), (target, scaled_target)

    def __lstm_output_vector(self, data_x: np.ndarray,  data_y: np.ndarray) -> tuple:
        """
        Slide a window over the rows: each sample pairs `look_back`
        rows of `data_x` with the next `n_steps` entries of `data_y`.
        """
        windows_x = []
        windows_y = []
        n_rows = len(data_x)
        span = self.look_back + self.n_steps

        start = 0
        # Stop as soon as a full input + output span no longer fits.
        while start < n_rows and start + span <= n_rows:
            split_at = start + self.look_back
            windows_x.append(data_x[start:split_at, :])
            windows_y.append(data_y[split_at:start + span])
            start += 1

        return np.array(windows_x), np.array(windows_y)
    
    
class SuccessiveEarlyStopping(Callback):
    """
    This class implements the successive early stops described in the paper: 
    
    L. Prechelt, “Early Stopping - But When?,” in Neural Networks: Tricks of 
    the Trade, vol. 1524, G. B. Orr and K.-R. Müller, Eds. Berlin, Heidelberg: 
    Springer Berlin Heidelberg, 1998, pp. 55–69. doi: 10.1007/3-540-49430-8_3.
    
    This class extends Callback class from Keras Tensorflow so it's easy to
    integrate with keras model.
    """
    def __init__(self, patience=0, monitor='val_loss', min_epochs=30):
        """
        If the `monitor` value fails to go down `patience` times in a
        row, the training process will be stopped.

        Args:
            patience (int): Number of successive non-improving epochs
            that triggers the stop.

            monitor (str): Key of the metric read from the epoch logs.

            min_epochs (int): Non-improving epochs are only counted
            once this many epochs have run, so training never stops
            earlier than epoch `min_epochs`.
        """
        super().__init__()
        self.patience = patience
        self.monitor = monitor
        self.min_epochs = min_epochs

    def on_train_begin(self, logs=None):
        """
        This method will be executed when the modeling process starts. 
        It resets the counters and the best value seen so far.
        """
        self.wait = 0
        self.stopped_epoch = 0
        # `np.inf` instead of the `np.Inf` alias, which NumPy 2.0 removed.
        self.best = np.inf
        self.last_epoch_loss = np.inf
        self.best_weights_epoch = 0
        self.last_loss_down = 0

    def on_epoch_end(self, epoch, logs=None):
        """
        Every epoch end, this method will be executed to
        check if the termination criteria are met.

        The monitored value is compared with the value of the previous
        epoch (not with the best value), so only successive increases
        are counted.
        """
        current = logs.get(self.monitor)
        
        if np.less(current, self.last_epoch_loss):
            if np.less(current, self.best): 
                # New overall best: remember the weights to restore later.
                self.best = current
                self.best_weights = self.model.get_weights()
                self.best_weights_epoch = epoch + 1
                
            self.wait = 0
            self.last_loss_down = epoch + 1
        elif epoch + 1 >= self.min_epochs:
            # The stop check lives inside the `min_epochs` gate so that
            # `min_epochs` is also honoured when `patience` is 0. For
            # `patience >= 1` this is equivalent to checking it outside,
            # because `wait` only grows inside this branch.
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True
                print(f"Early Stopping on epoch {epoch + 1}")
                                
        self.last_epoch_loss = current
        self.stopped_epoch = epoch + 1

    def on_train_end(self, logs=None):
        """
        This method will be executed when the modeling process end.
        If the best value is lower than the value of the last epoch,
        the weights of the best epoch are restored.
        """
        if np.less(self.best, self.last_epoch_loss):
            print(f"Restoring model weights from epoch {self.best_weights_epoch} with {self.monitor} = {self.best}")
            self.model.set_weights(self.best_weights)
            
class WalkForwardValidation():
    """
    This class implements Walk Forward validation
    without hyperparameter optimization. There are 
    3 models implemented in this step. The first is built
    from a hyperparameter dictionary (the best variable
    combination with its best hyperparameters). The second
    and third come from other publications:
        1.  A. Heiden and R. S. Parpinelli, “Applying LSTM for Stock 
            Price Prediction with Sentiment Analysis,” in Anais do 15. Congresso 
            Brasileiro de Inteligência Computacional, Jan. 2021, 
            pp. 1–8. doi: 10.21528/CBIC2021-45.
       
        2.  S. T. Mndawe, B. S. Paul, and W. Doorsamy, “Development of 
            a Stock Price Prediction Framework for Intelligent Media 
            and Technical Analysis,” Appl. Sci., vol. 12, no. 2, 
            p. 719, Jan. 2022, doi: 10.3390/app12020719
    """
    def __init__(self, target_column, n_splits, n_steps, max_epochs=100, verbose_fit_model=2):
        """
        Args:
            target_column: Name of the column to forecast.

            n_splits: Number of walk-forward folds.

            n_steps: Number of steps forecast in every fold.

            max_epochs: Upper bound of training epochs for the default
            model. The Heiden / Mndawe models always train 100 epochs.

            verbose_fit_model: Passed as `verbose` to `model.fit`.
        """
        self.target_column = target_column
        self.n_splits = n_splits
        self.n_steps = n_steps
        self.max_epochs = max_epochs
        self.verbose_fit_model = verbose_fit_model
        
    def get_metrics(self, errors):
        """
        Summarize a 2D collection of errors (one row per fold, one
        column per forecast step).

        Returns:
            dict with `rmse_total` / `mae_total` (per-fold value,
            averaged over folds) and `rmse_eachday` / `mae_eachday`
            (one value per forecast step, as a list).
        """
        squared_errors = np.power(errors, 2)
        abs_errors = np.abs(errors)
        return {
            'rmse_total' : np.sqrt(squared_errors.mean(axis=1)).mean(),
            'rmse_eachday' : np.sqrt(squared_errors.mean(axis=0)).tolist(),
            'mae_total' : abs_errors.mean(axis=1).mean(),
            'mae_eachday' : abs_errors.mean(axis=0).tolist()
        }
            
    def _get_callbacks(self):
        """
        Create fresh early-stopping and learning-rate callbacks, both
        monitoring `val_loss`.
        """
        early_stopping = SuccessiveEarlyStopping(
            patience=3, monitor='val_loss', min_epochs=40
        )
        
        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=20, min_lr=1e-7
        )
        
        return early_stopping, reduce_lr
    
    def _data_store(
        self, data, index, look_back,
        scaler_x=None, scaler_y=None
    ):
        """
        Wrap the rows of `data` selected by the positional `index`
        in a `DataStore` formatted for RNN models.
        """
        return DataStore(
            data=data.iloc[index, :],
            target_column=self.target_column,
            look_back=look_back,
            n_steps=self.n_steps,
            scaler_x=scaler_x,
            scaler_y=scaler_y,
            format="rnn"
        )
        
    def __get_data(self, data, train_indices, test_indices, look_back, es_indices=None):
        """
        Build the data stores of one fold. The scalers are fitted on
        the training rows only and reused for the other sets.

        Returns:
            `(train, early_stopping, test)` when `es_indices` is given,
            otherwise `(train, test)`.
        """
        data_train = self._data_store(data, train_indices, look_back)
        data_test = self._data_store(
            data, test_indices, look_back,
            scaler_x=data_train.scaler_x,
            scaler_y=data_train.scaler_y
        )
        if es_indices is not None:
            data_early_stopping = self._data_store(
                data, es_indices, look_back, 
                scaler_x=data_train.scaler_x, 
                scaler_y=data_train.scaler_y
            )
            return data_train, data_early_stopping, data_test
        
        return data_train, data_test
    
    def get_prediction(self, model, x, scaler=None):
        """
        Predict with `model`. When `scaler` is given, the prediction
        is flattened to a single column and inverse-transformed.
        """
        prediction = model.predict(x)
        if scaler is not None:
            prediction = np.array(prediction).reshape(-1, 1)
            prediction = scaler.inverse_transform(prediction)
        return prediction
    
    def convert2float(self, data: list):
        """Return the values of `data` as a list of built-in floats."""
        return [float(x) for x in data]
    
    def _save_file2json(self, file, fname):
        """Serialize `file` as JSON into the path `fname`."""
        with open(fname, 'w') as f:
            f.write(json.dumps(file))

    def _history_to_dict(self, history, with_validation=True):
        """
        Extract the per-epoch curves of a Keras `History` object as
        plain float lists.

        Args:
            history: Object whose `history` attribute maps metric
            names to per-epoch values.

            with_validation: When True, `val_loss` and the learning
            rate are included next to `loss`.

        Returns:
            dict with key `loss` and, optionally, `val_loss` and `lr`.
        """
        logs = history.history
        curves = {'loss' : self.convert2float(logs['loss'])}
        if with_validation:
            curves['val_loss'] = self.convert2float(logs['val_loss'])
            # ReduceLROnPlateau logs the learning rate as 'lr' in
            # Keras 2 and as 'learning_rate' in Keras 3. The output
            # key stays 'lr' so stored summaries keep one schema.
            lr_key = 'lr' if 'lr' in logs else 'learning_rate'
            curves['lr'] = self.convert2float(logs[lr_key])
        return curves
            
    def _get_model_default(self, hp, n_features):
        """
        Build and compile the stacked LSTM described by `hp`.

        `hp` must provide `look_back`, `num_layers`, `optimizer`
        ("adam" selects Adam, anything else RMSprop), `lr` and, for
        every layer `i`, `units_{i}` and `dropout_rate_{i}`.
        """
        look_back = hp['look_back']

        model = Sequential()
        num_layers = hp["num_layers"]
        for i in range(num_layers):        
            
            if i == 0:
                model.add(Input(shape=(look_back, n_features)))
                
            # Only the last LSTM layer collapses the time dimension.
            is_last_layer = i == num_layers - 1
            return_sequences = not is_last_layer    
            model.add(
                LSTM(
                    units=hp[f"units_{i}"],
                    return_sequences=return_sequences, name=f"lstm_layer_{i}"
                )
            )
            model.add(Dropout(
                rate=hp[f'dropout_rate_{i}'],
                name=f"dropuout_layer_{i}"
            ))

        model.add(Dense(self.n_steps, name="dense_layer_output"))
        
        optimizer = Adam if hp["optimizer"] == "adam" else RMSprop
        learning_rate = hp["lr"]
        
        model.compile(
            optimizer=optimizer(learning_rate=learning_rate), loss="mse"
        )
        
        return model
    
    def _model_default(self, x, params, train_every_n_day=1):
        """
        Run the walk-forward loop for the default model.

        A new model is trained on every `train_every_n_day`-th fold.
        On the other folds the most recently trained model is reused,
        together with its training and early-stopping rows, and the
        training statistics of that model are reported again.

        Returns:
            `(errors, summary_iter)`: the flattened test error of every
            fold and one summary dictionary per fold.
        """
        look_back = params['look_back']
        batch_size = params['batch_size']
        n_features = x.shape[1]
                
        tss = TSMultistepSplit(
            n_splits=self.n_splits, n_steps=self.n_steps, look_back=look_back
        )

        errors = []
        summary_iter = []
        iteration = 1
        
        splits = tss.split(x, with_early_stopping_set=True)
        for train_indices, early_stopping_indices, test_indices in splits:
            print("Fold ke-", iteration)
            train_in_this_fold = (iteration - 1) % train_every_n_day == 0
            
            if train_in_this_fold:
                print('Train in this fold with last 3 indices:', train_indices[-3:], test_indices[-3:])
                data_train, data_early_stopping, data_test = self.__get_data(
                    x, train_indices, test_indices, look_back, 
                    early_stopping_indices
                )
                
                # Remembered for the folds that reuse this model.
                last_train_indices = train_indices
                last_early_stopping_indices = early_stopping_indices
                
                early_stopping, reduce_lr = self._get_callbacks()
                
                model = self._get_model_default(params, n_features)
            
                start_time = time.time()
                history = model.fit(
                    data_train.scaled_x, data_train.scaled_y, 
                    shuffle=False, batch_size=batch_size,
                    epochs=self.max_epochs, verbose=self.verbose_fit_model, 
                    validation_data=(data_early_stopping.scaled_x, data_early_stopping.scaled_y), 
                    callbacks=[early_stopping, reduce_lr]
                )
                training_time = time.time() - start_time
                
            else :
                print('No train in this fold with last 3 indices:', last_train_indices[-3:], test_indices[-3:])
                # Scalers are refitted on the rows the reused model
                # was trained on, so the scaling matches that model.
                data_train, data_early_stopping, data_test = self.__get_data(
                    x, last_train_indices, test_indices, look_back, 
                    last_early_stopping_indices
                )
            
            prediction_test = self.get_prediction(model, data_test.scaled_x, data_test.scaler_y)
            prediction_es = self.get_prediction(model, data_early_stopping.scaled_x, data_early_stopping.scaler_y)
            error = (prediction_test - data_test.y).reshape(-1)
            errors.append(error)
            
            summary_iter.append({
                'iteration' : iteration,
                'rmse' : np.sqrt(np.power(error, 2).mean()),
                'mae' : np.abs(error).mean(),
                'best_epoch' : early_stopping.best_weights_epoch,
                'stopped_epoch' : early_stopping.stopped_epoch,
                'training_time' : training_time,
                'prediction_early_stopping' : {
                    'y_true' : data_early_stopping.y.reshape(-1).tolist(),
                    'y_pred' : prediction_es.reshape(-1).tolist()
                },
                'prediction_test' : {
                    'y_true' : data_test.y.reshape(-1).tolist(),
                    'y_pred' : prediction_test.reshape(-1).tolist(),
                },
                'history' : self._history_to_dict(history, with_validation=True)
            })
            iteration += 1
            
        return errors, summary_iter
    
    def _get_model_heiden_mndawe(self, hp, n_features, model_type):
        """
        Build and compile one of the fixed architectures.

        `model_type == 'heiden'` gives two LSTM layers (64 and 32
        units), a look-back of 60, a ReLU output layer and the learning
        rate `hp["lr"]`. Any other value gives one LSTM layer of 200
        units, a look-back of 20, a linear output layer and a learning
        rate of 0.0001. The optimizer is chosen from `hp["optimizer"]`
        in both cases.
        """
        look_back = 60 if model_type == 'heiden' else 20
        num_layers = 2 if model_type == 'heiden' else 1
        units = [64, 32] if model_type == 'heiden' else [200]

        model = Sequential()
        for i in range(num_layers):        
            
            if i == 0:
                model.add(Input(shape=(look_back, n_features)))
                
            is_last_layer = i == num_layers - 1
            return_sequences = not is_last_layer    
            model.add(
                LSTM(
                    units=units[i],
                    return_sequences=return_sequences, name=f"lstm_layer_{i}"
                )
            )

        if model_type == 'heiden':
            model.add(Dense(self.n_steps, name="dense_layer_output", activation='relu'))
        else:
            model.add(Dense(self.n_steps, name="dense_layer_output"))
        
        optimizer = Adam if hp["optimizer"] == "adam" else RMSprop
        learning_rate = hp["lr"] if model_type == 'heiden' else 0.0001
        
        model.compile(
            optimizer=optimizer(learning_rate=learning_rate), loss="mse"
        )
        
        return model
    
    def _model_heiden_mndawe(self, x, params, model_type='heiden', train_every_n_day=1):
        """
        Run the walk-forward loop for the Heiden / Mndawe models.

        No early-stopping set is used: every trained model runs for a
        fixed 100 epochs, which is also what `best_epoch` and
        `stopped_epoch` report. A new model is trained on every
        `train_every_n_day`-th fold and reused on the folds in between.

        Returns:
            `(errors, summary_iter)`: the flattened test error of every
            fold and one summary dictionary per fold.
        """
        look_back = 60 if model_type == 'heiden' else 20
        batch_size = params['batch_size']
        n_features = x.shape[1]
                
        tss = TSMultistepSplit(
            n_splits=self.n_splits, n_steps=self.n_steps, look_back=look_back
        )

        errors = []
        summary_iter = []
        iteration = 1
        
        splits = tss.split(x, with_early_stopping_set=False)
        for train_indices, test_indices in splits:
            print("Fold ke-", iteration)
            train_in_this_fold = (iteration - 1) % train_every_n_day == 0
            
            if train_in_this_fold:
                print('Train in this fold with last 3 indices', train_indices[-3:], test_indices[-3:])
                data_train, data_test = self.__get_data(
                    x, train_indices, test_indices, look_back, 
                )
                # Remembered for the folds that reuse this model.
                last_train_indices = train_indices
                
                model = self._get_model_heiden_mndawe(params, n_features, model_type=model_type)
                
                start_time = time.time()
                history = model.fit(
                    data_train.scaled_x, data_train.scaled_y, 
                    shuffle=False, batch_size=batch_size,
                    epochs=100, verbose=self.verbose_fit_model
                )
                training_time = time.time() - start_time
                
            else :
                print('No train in this fold with last 3 indices:', last_train_indices[-3:], test_indices[-3:])
                data_train, data_test = self.__get_data(
                    x, last_train_indices, test_indices, look_back, 
                )
            
            prediction_test = self.get_prediction(model, data_test.scaled_x, data_test.scaler_y)
            error = (prediction_test - data_test.y).reshape(-1)
            errors.append(error)
            
            summary_iter.append({
                'iteration' : iteration,
                'rmse' : np.sqrt(np.power(error, 2).mean()),
                'mae' : np.abs(error).mean(),
                'best_epoch' : 100,
                'stopped_epoch' : 100,
                'training_time' : training_time,
                'prediction_test' : {
                    'y_true' : data_test.y.reshape(-1).tolist(),
                    'y_pred' : prediction_test.reshape(-1).tolist(),
                },
                'history' : self._history_to_dict(history, with_validation=False)
            })
            iteration += 1
            
        return errors, summary_iter
    
    def fit(
        self, x, params, foldername, directory="kombinasi_variabel", 
        model_type="default", train_every_n_day=1
    ):
        """
        Run the walk-forward validation and store the results.

        Args:
            x: Data frame holding the target column and the inputs.

            params: Hyperparameter dictionary of the model.

            foldername: Folder, relative to `directory`, that receives
            `metrics.json` and `summary_iter.json`.

            directory: Root folder of the results.

            model_type: "default" runs the hyperparameter-driven model,
            any other value runs the Heiden / Mndawe variant.

            train_every_n_day: Train a new model every n-th fold.
        """
        
        if model_type == 'default' :
            errors, summary_iter = self._model_default(x, params, train_every_n_day=train_every_n_day)  
        else :
            errors, summary_iter = self._model_heiden_mndawe(x, params, model_type=model_type, train_every_n_day=train_every_n_day)
            
        errors = np.array(errors)
        metrics = self.get_metrics(errors)
        
        metrics['errors'] = errors.tolist()
        
        # Creates `directory` and the nested result folder in one call
        # and tolerates folders that already exist.
        foldername = os.path.join(directory, foldername)
        os.makedirs(foldername, exist_ok=True)
        
        self._save_file2json(metrics, os.path.join(foldername, "metrics.json"))
        self._save_file2json(summary_iter, os.path.join(foldername, "summary_iter.json"))
        
class SummaryHPO():
    """
    Summary of a Hyperparameter Optimization process.

    The summary is either loaded from a CSV file or collected from a
    result folder by reading the `metrics.json`, `trial.json` and
    `summary_iter.json` file of every trial.
    """

    # Variable-combination folder name -> short label.
    _KV_LABELS = {
        'close': 'KV 1',
        'close_sentiment': 'KV 2',
        'close_kurs': 'KV 3',
        'close_technical': 'KV 4',
        'close_sentiment_kurs': 'KV 5',
        'close_technical_sentiment': 'KV 6',
        'close_technical_kurs': 'KV 7',
        'close_technical_sentiment_kurs': 'KV 8',
    }

    def __init__(self, file_path=None, root=None, with_details=False):
        """
        Args:
            file_path: CSV file holding a stored summary. When given,
            it is used even if `root` is given as well.

            root: Root folder of the optimization results, laid out as
            `root/<variable>/<iteration>/<trial>/`.

            with_details: When False, the `details` column is dropped.

        Raises:
            Exception: If both `file_path` and `root` are None.
        """
        if file_path is None and root is None:
            raise Exception('File Path and root can\'t be None at the same time')
        
        self.data = pd.read_csv(file_path) if file_path is not None else self._read_from_folder(root)
        
        if not with_details and 'details' in self.data.columns:
            self.data.drop('details', axis=1, inplace=True)
          
    def _read_from_folder(self, root):
        """
        Collect one row per trial folder below `root`. Entries on the
        trial level that are not directories, or that are named
        `trial_data`, are skipped.
        """
        trials = []
        for dir1 in os.listdir(root):
            for dir2 in os.listdir(os.path.join(root, dir1)):
                for dir3 in os.listdir(os.path.join(root, dir1, dir2)):
                    dir4 = os.path.join(root, dir1, dir2, dir3)
                    
                    if not os.path.isdir(dir4) or dir3 == 'trial_data':                
                        continue
                
                    with open(os.path.join(dir4, 'metrics.json'), 'r') as f:
                        metrics = json.load(f)
                        
                    with open(os.path.join(dir4, 'trial.json'), 'r') as f:
                        trial_json = json.load(f)
                        
                    with open(os.path.join(dir4, 'summary_iter.json'), 'r') as f:
                        summary = pd.DataFrame(json.load(f))
                           
                    trials.append({
                        'variabel' : dir1,
                        'iterasi' : dir2,
                        'trial' : dir3,
                        'rmse_total' : metrics['rmse_total'], 
                        'mae_total' : metrics['mae_total'], 
                        'mean_best_epoch' : summary.best_epoch.mean(),
                        'mean_stopped_epoch' : summary.stopped_epoch.mean(),
                        'mean_training_time' : summary.training_time.mean(),
                        'rmse_eachday' : metrics['rmse_eachday'], 
                        'mae_eachday' : metrics['mae_eachday'],
                        'hyperparameters' : trial_json['hyperparameters']['values'],
                        'details' : summary.to_json(orient='records')
                    })               
        trials = pd.DataFrame(trials)     
        trials['kombinasi_variabel'] = trials.variabel.apply(self._define_kombinasi_variabel)
        return trials
    
    def _define_kombinasi_variabel(self, variabel):
        """
        Translate a variable-combination folder name into its label
        ('KV 1' ... 'KV 8'). Unknown names give None.
        """
        return SummaryHPO._KV_LABELS.get(variabel)

    @staticmethod
    def _parse_hyperparameters(raw):
        """
        Return the hyperparameters as a dictionary.

        Summaries built from a folder already hold dictionaries, while
        summaries loaded from CSV hold their text form. The text is
        parsed with `ast.literal_eval` instead of `eval`, so only
        Python literals are accepted and no code from the file is
        executed.
        """
        if isinstance(raw, dict):
            return raw
        return ast.literal_eval(raw)
        
    def __groupby(self, columns, target='rmse_total'):
        """
        Return, for every group of `columns`, the row with the
        smallest `target`, sorted by `columns`.
        """
        idx_best =  self.data.groupby(columns)[target].idxmin()
        return self.data.loc[idx_best].reset_index(drop=True).sort_values(columns)
    
    @property
    def best_per_kv(self):
        """Best trial (lowest `rmse_total`) of every variable combination."""
        return self.__groupby(['kombinasi_variabel'])
    
    @property
    def best_per_kv_iterasi(self):
        """Best trial of every variable combination and iteration."""
        return self.__groupby(['kombinasi_variabel', 'iterasi'])
    
    @property
    def mean_rmse(self):
        """
        Mean, standard deviation, minimum and maximum of the best
        `rmse_total` per iteration, for every variable combination.
        """
        per_kv = self.best_per_kv_iterasi
        # String names select the pandas group-by aggregations
        # explicitly instead of relying on how pandas maps NumPy
        # callables.
        return per_kv.groupby('kombinasi_variabel')['rmse_total'].agg(['mean', 'std', 'min', 'max'])
    
    @property
    def mean_mae(self):
        """
        Mean, standard deviation, minimum and maximum of `mae_total`
        over the best trial (by `rmse_total`) of every iteration, for
        every variable combination.
        """
        per_kv = self.best_per_kv_iterasi
        return per_kv.groupby('kombinasi_variabel')['mae_total'].agg(['mean', 'std', 'min', 'max'])
    
    @property
    def best_params(self):
        """
        Hyperparameters of the best trial of every variable
        combination, one row each. Units and dropout rates are listed
        for up to 3 layers; layers beyond `num_layers` are None.
        """
        params  = []
        
        for _, row in self.best_per_kv.iterrows():
            param = self._parse_hyperparameters(row['hyperparameters'])
            num_layers = param['num_layers']
            temp = {
                'kombinasi_variabel' : row['kombinasi_variabel'],
                'look_back' : param['look_back'],
                'optimizer' : param['optimizer'],
                'learning_rate' : param['lr'],
                'num_layers' : num_layers
            }
            
            for i in range(1, 4):
                layer_exists = i <= num_layers
                temp[f'layer_{i}_unit'] = param[f"units_{i-1}"] if layer_exists else None
                # The dropout rate has its own key; it is not the unit count.
                temp[f'layer_{i}_dropout'] = param[f"dropout_rate_{i-1}"] if layer_exists else None

            params.append(temp)

        return pd.DataFrame(params)

class RunWalkForwardValidation():
    """
    Interface that runs the `WalkForwardValidation` class for one or
    more variable combinations and repetitions.
    """

    # Variable combinations defined by the columns they keep ...
    _KEPT_COLUMNS = {
        'KV 1': ['close'],
        'KV 2': ['close', 'sentiment_score'],
        'KV 3': ['close', 'kurs'],
        'KV 5': ['close', 'sentiment_score', 'kurs'],
    }
    # ... and those defined by the columns they remove.
    _DROPPED_COLUMNS = {
        'KV 4': ['sentiment_score', 'sentiment_category_score', 'kurs'],
        'KV 6': ['sentiment_category_score', 'kurs'],
        'KV 7': ['sentiment_score', 'sentiment_category_score'],
        'KV 8': ['sentiment_category_score'],
    }

    def __init__(
        self, data, n_splits, n_steps, n_iter=10,
        verbose_fit_model=0, directory="kombinasi_variabel"
    ):
        """
        Args:
            data: Data frame with the `close` target and the inputs.

            n_splits: Number of walk-forward folds.

            n_steps: Number of steps forecast in every fold.

            n_iter: Number of repetitions used when `fit` is called
            without `iterasi`.

            verbose_fit_model: Passed on to `WalkForwardValidation`.

            directory: Root folder of the results.
        """
        self.n_splits = n_splits
        self.n_steps = n_steps
        self.data = data
        self.n_iter = n_iter
        self.verbose_fit_model = verbose_fit_model
        self.directory = directory
        
        self.wfv = WalkForwardValidation(
            target_column='close', n_splits=n_splits, 
            n_steps=n_steps, verbose_fit_model=verbose_fit_model
        )

    @staticmethod
    def _parse_hyperparameters(raw):
        """
        Return the hyperparameters as a dictionary. Text values (as
        stored in CSV) are parsed with `ast.literal_eval` instead of
        `eval`, so only Python literals are accepted and no code from
        the file is executed. Dictionaries are returned unchanged.
        """
        if isinstance(raw, dict):
            return raw
        return ast.literal_eval(raw)
        
    def __get_data(self, kv):
        """
        Return the columns of `self.data` that belong to the variable
        combination `kv` ('KV 1' ... 'KV 8').

        Raises:
            ValueError: If `kv` is not a known variable combination.
        """
        if kv in self._KEPT_COLUMNS:
            return self.data[self._KEPT_COLUMNS[kv]]
        if kv in self._DROPPED_COLUMNS:
            return self.data.drop(self._DROPPED_COLUMNS[kv], axis=1)
        # Fail here with a clear message instead of handing None on.
        raise ValueError(f"Unknown kombinasi_variabel: {kv!r}")
        
    def fit(self, params, iterasi=None, model_type='default', train_every_n_day=1):
        """
        Run the walk-forward validation for every row of `params`.

        Args:
            params: Data frame with the columns `kombinasi_variabel`
            and `hyperparameters`.

            iterasi: Iterable of repetition numbers to run. Defaults
            to `1 .. n_iter`.

            model_type: "default" or the name of another model; it is
            passed to `WalkForwardValidation.fit` and, when not
            "default", names the result folder.

            train_every_n_day: Train a new model every n-th fold.

        Repetitions whose result folder already exists are skipped.
        """
        if iterasi is None:
            iterasi = range(1, self.n_iter + 1)

        for _, row in params.iterrows():
            kv = row['kombinasi_variabel']
            hyperparameters = self._parse_hyperparameters(row['hyperparameters'])

            subfolder = f"{kv}" if model_type == 'default' else f"{model_type}"
            subfolder = f"{subfolder}-{train_every_n_day}_days"
            
            for i in iterasi:
                clear_output(wait=True)
                print(f"{subfolder} - Iterasi ke-{i}")
                
                run_folder = os.path.join(subfolder, f"iterasi_{i}")
                if os.path.exists(os.path.join(self.directory, run_folder)):
                    continue
                
                data = self.__get_data(kv)
                self.wfv.fit(
                    x = data, params=hyperparameters, 
                    foldername=run_folder, 
                    directory=self.directory,
                    model_type=model_type, train_every_n_day=train_every_n_day
                )

## Run Model in Testing Data

In [3]:
# Load the stored hyperparameter-optimization summary from its CSV file.
summary = SummaryHPO('data/summary_hpo_nodetail.csv')

In [None]:
# Testing data; the first CSV column is used as the index.
bri = pd.read_csv('data/testing_data.csv', index_col=0)

# n_splits = 57 because the testing data from January - March 2022
# has 61 observations, reduced by 5 because 5 days ahead are predicted
# NOTE(review): presumably 61 - 5 + 1 = 57 forecast windows — confirm
n_splits = 57
n_steps = 5
n_iterasi = 10

run = RunWalkForwardValidation(
    data=bri, n_splits=n_splits, n_steps=n_steps, n_iter=n_iterasi, 
    verbose_fit_model=0, directory="testing_data"
)

# Best trial (lowest rmse_total) of every variable combination.
params = summary.best_per_kv[['kombinasi_variabel', 'hyperparameters']]

# Only the KV 2 model is used because it is the best model
# based on the results obtained
params = params[params.kombinasi_variabel == "KV 2"]

# Run repetitions 7 and 8 of the 'heiden' model, retraining on every fold.
for m_type in ['heiden']:
    for n_day in [1]:
        run.fit(params, iterasi=[7, 8], train_every_n_day=n_day, model_type=m_type)