#### Contents

0. [Load data and preprocess](#Load-data-and-preprocess)
1. [Initialize VRAE object](#Initialize-VRAE-object)
2. [Fit the model onto dataset](#Fit-the-model-onto-dataset)
3. [Transform the input timeseries to encoded latent vectors](#Transform-the-input-timeseries-to-encoded-latent-vectors)
4. [Save the model to be fetched later](#Save-the-model-to-be-fetched-later)

### Import required modules

In [1]:
from model.vrae import VRAE

from model.utils import *
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import trange
import tqdm

import pickle

from sklearn.preprocessing import MinMaxScaler


### Input parameters

In [2]:
dload = './saved_model' # directory passed to VRAE as its `dload` argument (the author calls it the download directory)

### utils.load_data

In [3]:
def load_gen_data(file_name, cols_to_remove=None, data_dir='./data/netis',
                  test_start=20211103184400, test_end=20211106084400):
    """Load a pickled DataFrame, split it by time, and min-max scale the splits.

    The pickle at ``{data_dir}/{file_name}.pkl`` must contain a pandas
    DataFrame with a ``Time`` column and a ``label`` column. The first and
    last columns are dropped before scaling (the original code treats them
    as Time and label respectively).

    Parameters
    ----------
    file_name : str
        Name of the pickle file, without the ``.pkl`` extension.
    cols_to_remove : label or list of labels, optional
        Columns to drop before any other processing. ``None`` keeps all.
    data_dir : str, optional
        Directory that contains the pickle file.
    test_start, test_end : optional
        Inclusive bounds of the test window, compared against ``Time``.
        The defaults look like YYYYMMDDHHMMSS integers -- TODO confirm.

    Returns
    -------
    tuple
        ``(total, train_scaled, test_scaled)`` where ``total`` is the whole
        NaN-free frame as a NumPy array (Time and label included), and the
        other two are the ``label == 0`` rows outside / inside the test
        window, scaled with a ``MinMaxScaler`` fitted on the train rows only.
    """
    data_loc = f'{data_dir}/{file_name}.pkl'

    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    with open(data_loc, 'rb') as f:
        df = pickle.load(f)

    # Drop columns the caller does not want (previously referenced the
    # undefined name `df_total`, which raised NameError).
    if cols_to_remove is not None:
        df = df.drop(cols_to_remove, axis=1)

    df = df.dropna()

    in_test_window = (df['Time'] >= test_start) & (df['Time'] <= test_end)
    is_normal = df['label'] == 0

    # Train: normal rows outside the test window. The original query
    # `A or B and label==0` bound `and` tighter than `or`, so abnormal rows
    # before the window were not filtered out.
    train_df = df[~in_test_window & is_normal]

    # Test: normal rows inside the test window.
    test_df = df[in_test_window & is_normal]

    total = df.to_numpy()

    # Remove the first and last columns (Time and label).
    train_features = train_df.iloc[:, 1:-1]
    test_features = test_df.iloc[:, 1:-1]

    # Fit the scaler on train only so no test statistics leak into training.
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    return total, train_scaled, test_scaled

### Hyper parameters

### Load data and preprocess
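- `file_name` : name of the pickle file to load, without the `.pkl` extension (read from `./data/netis/`)
- `cols_to_remove` : columns to drop before processing, i.e. columns that should be excluded from generation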

In [4]:
# Name of the pickle file (without extension) to load.
file_name = 'netis'

# Full array plus the scaled train / test splits.
TOTAL_DF, TRAIN_DF, TEST_DF = load_gen_data(file_name)

# Print the shape of each array, one per line.
print(*(arr.shape for arr in (TOTAL_DF, TRAIN_DF, TEST_DF)), sep='\n')

(26002, 94)
(22363, 92)
(3627, 92)


In [5]:
# Display the scaled training array returned by load_gen_data.
TRAIN_DF

array([[3.66328864e-04, 2.52639355e-05, 2.90615333e-05, ...,
        3.42446043e-01, 4.98597475e-01, 1.65289256e-01],
       [2.66795942e-04, 3.25753827e-05, 3.13032504e-05, ...,
        3.51079137e-01, 4.97896213e-01, 1.65289256e-01],
       [2.01613776e-04, 1.37864421e-05, 3.04728091e-05, ...,
        3.51079137e-01, 4.97896213e-01, 1.23966942e-01],
       ...,
       [2.66813941e-03, 9.48559406e-04, 3.24553957e-06, ...,
        6.04316547e-01, 9.89481066e-01, 2.89256198e-01],
       [1.45200665e-03, 2.26151271e-03, 3.08280452e-06, ...,
        6.04316547e-01, 9.88779804e-01, 3.05785124e-01],
       [8.30917887e-03, 8.96945321e-04, 7.79900694e-06, ...,
        6.04316547e-01, 9.88779804e-01, 2.97520661e-01]])

In [6]:
# Display the scaled test array returned by load_gen_data.
TEST_DF

array([[4.02355635e-03, 6.62881732e-05, 1.69243372e-05, ...,
        6.79136691e-01, 3.52033661e-01, 2.14876033e-01],
       [4.07135202e-03, 1.18662977e-04, 6.92800865e-06, ...,
        6.79136691e-01, 3.52734923e-01, 2.14876033e-01],
       [4.02378633e-03, 2.82489361e-04, 2.07942738e-05, ...,
        6.79136691e-01, 3.52734923e-01, 2.14876033e-01],
       ...,
       [3.98419630e-03, 7.85747554e-05, 1.75819403e-05, ...,
        5.98561151e-01, 4.25666199e-01, 2.56198347e-01],
       [3.98799304e-03, 4.90275879e-05, 3.81572787e-06, ...,
        6.00000000e-01, 4.24964937e-01, 2.56198347e-01],
       [3.93781591e-03, 3.97621553e-05, 8.13602856e-06, ...,
        6.04316547e-01, 4.24964937e-01, 2.47933884e-01]])

In [7]:
class GenerationDataset(Dataset):
    """Map-style dataset of non-overlapping, fixed-length windows.

    Sample ``i`` is rows ``[i * window, (i + 1) * window)`` of ``data``.
    Trailing rows that do not fill a complete window are never returned.

    Args:
        data: array-like accepted by ``torch.Tensor`` (e.g. a NumPy array);
            the first axis is the one that gets split into windows.
        window: number of consecutive rows per sample; must be positive.

    Raises:
        ValueError: if ``window`` is not positive.
    """

    def __init__(self, data, window):
        if window <= 0:
            raise ValueError(f"window must be a positive integer, got {window}")
        self.data = torch.Tensor(data)
        self.window = window

    def __len__(self):
        # Count every complete window. The previous `// window - 1` dropped
        # the last full window and went negative for inputs shorter than
        # `window`, which makes len() raise.
        return len(self.data) // self.window

    def __getitem__(self, index):
        n_windows = len(self)
        if index < 0:
            # Support Python-style negative indexing.
            index += n_windows
        if not 0 <= index < n_windows:
            # Raising here lets plain iteration over the dataset terminate
            # instead of yielding empty slices forever.
            raise IndexError(f"window index out of range for {n_windows} windows")
        start = index * self.window
        return self.data[start:start + self.window]

In [8]:
# Number of consecutive rows per sample; GenerationDataset cuts the series
# into non-overlapping chunks of this length.
window = 100

In [9]:
# Wrap the scaled training array in fixed-length windows.
train_dataset = GenerationDataset(data=TRAIN_DF, window=window)
train_dataset

<__main__.GenerationDataset at 0x7efbee52de80>

In [10]:
# Wrap the scaled test array in fixed-length windows.
test_dataset = GenerationDataset(data=TEST_DF, window=window)
test_dataset

<__main__.GenerationDataset at 0x7efbee5764e0>

In [11]:
# Shape of one sample: (rows per window, number of feature columns).
train_dataset[0].shape

torch.Size([100, 92])

**Fetch `sequence_length` from dataset**

In [12]:
# Length of the first axis of a sample, i.e. rows per window.
sequence_length = train_dataset[0].size(0)
sequence_length

100

**Fetch `number_of_features` from dataset**

This value is the number of input features (columns) at each time step.

In [13]:
# Length of the second axis of a sample, i.e. feature columns per row.
number_of_features = train_dataset[0].size(1)
number_of_features

92

### Parameters

In [14]:
# Hyperparameters for the model; each name is passed as the same-named keyword
# argument to the VRAE constructor in the next cell.
# NOTE(review): per-parameter meanings are inferred from the names only --
# confirm against model/vrae.py.
n_epochs = 1000
hidden_size = 90
hidden_layer_depth = 1
latent_length = 20
batch_size = 1
learning_rate = 0.0005
# NOTE(review): the truncated warning printed under the constructor cell looks
# like PyTorch's notice that RNN dropout has no effect when num_layers == 1;
# with hidden_layer_depth = 1 this value may be a no-op -- confirm.
dropout_rate = 0.2
optimizer = 'Adam' # options listed by the author: ADAM, SGD -- NOTE(review): value is spelled 'Adam'; confirm the casing VRAE expects
cuda = True # options: True, False
print_every=30 # the training log below reports a line every 30 batches
clip = True # options: True, False
max_grad_norm=5
loss = 'MSELoss' # options: SmoothL1Loss, MSELoss
block = 'LSTM' # options: LSTM, GRU

### Initialize VRAE object

VRAE inherits from `sklearn.base.BaseEstimator` and overrides `fit`, `transform` and `fit_transform` functions, similar to sklearn modules

In [15]:
# Build the model from the values defined in the cells above.
vrae = VRAE(
    sequence_length=sequence_length,
    number_of_features=number_of_features,
    hidden_size=hidden_size,
    hidden_layer_depth=hidden_layer_depth,
    latent_length=latent_length,
    batch_size=batch_size,
    learning_rate=learning_rate,
    n_epochs=n_epochs,
    dropout_rate=dropout_rate,
    optimizer=optimizer,
    cuda=cuda,
    print_every=print_every,
    clip=clip,
    max_grad_norm=max_grad_norm,
    loss=loss,
    block=block,
    dload=dload,
)

  "num_layers={}".format(dropout, num_layers))


### Fit the model onto dataset

In [None]:
# Train on the windowed training set; the return value is kept so that it can
# be plotted in the next cell.
loss_arr = vrae.fit(train_dataset)

# To also save the model with its learned parameters, call instead:
# vrae.fit(train_dataset, save = True)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Epoch: 0
Batch 30, loss = 13.5689, recon_loss = 13.5587, kl_loss = 0.0102
Batch 60, loss = 1.2331, recon_loss = 0.9340, kl_loss = 0.2991
Batch 90, loss = 1.3064, recon_loss = 1.1876, kl_loss = 0.1188
Batch 120, loss = 1.2437, recon_loss = 1.1185, kl_loss = 0.1252
Batch 150, loss = 114.6840, recon_loss = 114.3879, kl_loss = 0.2962
Batch 180, loss = 30.6796, recon_loss = 30.4654, kl_loss = 0.2141
Batch 210, loss = 141.6872, recon_loss = 141.4629, kl_loss = 0.2244
Average loss: 38.4770
Epoch: 1
Batch 30, loss = 20.6469, recon_loss = 20.5294, kl_loss = 0.1175
Batch 60, loss = 0.8517, recon_loss = 0.7684, kl_loss = 0.0833
Batch 90, loss = 1.2049, recon_loss = 1.1256, kl_loss = 0.0793
Batch 120, loss = 0.5958, recon_loss = 0.5386, kl_loss = 0.0572
Batch 150, loss = 129.7405, recon_loss = 129.6276, kl_loss = 0.1129
Batch 180, loss = 59.1760, recon_loss = 59.0556, kl_loss = 0.1204
Batch 210, loss = 84.6632, recon_loss = 84.6073, kl_loss = 0.0559
Average loss: 44.8867
Epoch: 2
Batch 30, loss = 

In [None]:
import matplotlib.pyplot as plt

# Draw the values returned by `vrae.fit` on the current axes.
plt.gca().plot(loss_arr)

### Save the model to be fetched later

In [None]:
# Save the model under the file name 'vrae.pth'.
# NOTE(review): presumably written inside the `dload` directory -- confirm
# against VRAE.save.
vrae.save('vrae.pth')

# To load a previously saved model, run:
# vrae.load('vrae.pth')

In [None]:
# Inspect the model's `is_fitted` attribute (presumably set by `fit` -- confirm).
vrae.is_fitted

### Transform the input timeseries to encoded latent vectors

In [None]:
# Encode the windowed test set into latent vectors and display the result.
z_run = vrae.transform(test_dataset)
z_run

In [None]:
# Shape of the encoded output from `vrae.transform`.
z_run.shape

### Reconstruct

In [None]:
# Run the windowed test set through `vrae.reconstruct` and display the result.
reconstruction = vrae.reconstruct(test_dataset)
reconstruction

In [None]:
# Shape of the scaled (un-windowed) test array, for comparison with the
# reconstruction shape in the next cell.
TEST_DF.shape

In [None]:
# Shape of the output of `vrae.reconstruct`.
reconstruction.shape