#### Contents

0. [Load data and preprocess](#Load-data-and-preprocess)
1. [Initialize VRAE object](#Initialize-VRAE-object)
2. [Fit the model onto dataset](#Fit-the-model-onto-dataset)
3. [Transform the input timeseries to encoded latent vectors](#Transform-the-input-timeseries-to-encoded-latent-vectors)
4. [Save the model to be fetched later](#Save-the-model-to-be-fetched-later)

In [1]:
import random
import torch
import numpy as np

def fix_seed(seed: int) -> None:
    """Apply one seed to every random number generator used in this notebook.

    The seed is passed, in order, to PyTorch's global generator, the current
    CUDA device, all CUDA devices, NumPy's global generator and Python's
    ``random`` module.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [2]:
# Seed the Python, NumPy and PyTorch generators before any data or model code runs.
fix_seed(555)

### Import required modules

In [3]:
from model.vrae import VRAE

from model.utils import *
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import trange
import tqdm

import pickle

from sklearn.preprocessing import MinMaxScaler


### Input parameters

In [4]:
# Directory handed to VRAE through its `dload` argument below
# (presumably where the model is saved to / loaded from -- confirm in model.vrae).
dload = './saved_model' # download directory

### Data loading helper (`load_gen_data`)

In [5]:
def load_gen_data(file_name, cols_to_remove=None):
    """Load the pickled dataset, split it by time period and min-max scale it.

    The pickle at ``./data/netis/{file_name}.pkl`` must contain a pandas
    DataFrame with a ``Time`` column and a ``label`` column. The first and
    last columns are dropped by position before scaling (presumably ``Time``
    and ``label`` respectively -- confirm against the data file).

    Rows containing NaN are dropped. Only rows with ``label == 0`` are kept
    for both splits:

    * test: ``20211103184400 <= Time <= 20211106084400``
    * train: everything outside that period

    Args:
        file_name: Base name (without ``.pkl``) of the file in ``./data/netis``.
        cols_to_remove: Optional column label, or list of labels, to drop
            before any other processing.

    Returns:
        A tuple ``(TOTAL_DF, TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED,
        cols, MM_scaler)`` where

        * ``TOTAL_DF`` is the NaN-free frame (all remaining columns) as a
          NumPy array,
        * ``TRAIN_DF`` / ``TEST_DF`` are the unscaled feature arrays,
        * ``TRAIN_SCALED`` / ``TEST_SCALED`` are the same arrays transformed
          by a ``MinMaxScaler`` fitted on the training features only,
        * ``cols`` holds the feature column labels,
        * ``MM_scaler`` is the fitted scaler.
    """
    # Define path (the data must be stored as a .pkl file).
    data_loc = f'./data/netis/{file_name}.pkl'

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so only point this at trusted data.
    with open(data_loc, 'rb') as f:
        df = pickle.load(f)

    # Drop columns that are not needed. The original referenced an undefined
    # name (`df_total`) here, which raised NameError whenever columns were given.
    if cols_to_remove is not None:
        df = df.drop(cols_to_remove, axis=1)

    df = df.dropna()

    # TRAIN / TEST SPLIT
    # TRAIN: outside the test period, normal rows only. The parentheses matter:
    # `and` binds tighter than `or`, so without them the label filter applied
    # only to the `Time > ...` branch.
    TRAIN_DF = df.query('(Time < 20211103184400 or Time > 20211106084400) and label==0')

    # TEST: inside the test period, normal rows only.
    TEST_DF = df.query('Time >= 20211103184400 and Time <= 20211106084400 and label==0')

    TOTAL_DF = df.to_numpy()

    # Remove the first and last columns (time and label) by position.
    TRAIN_DF = TRAIN_DF.iloc[:, 1:-1]
    cols = TRAIN_DF.columns
    TRAIN_DF = TRAIN_DF.to_numpy()
    TEST_DF = TEST_DF.iloc[:, 1:-1].to_numpy()

    # Fit the scaler on the training features only, then reuse it for test.
    MM_scaler = MinMaxScaler()
    TRAIN_SCALED = MM_scaler.fit(TRAIN_DF).transform(TRAIN_DF)
    TEST_SCALED = MM_scaler.transform(TEST_DF)

    return TOTAL_DF, TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED, cols, MM_scaler

### Hyper parameters

### Load data and preprocess
- `file_name` : base name of the `.pkl` file to load (without the extension)
- `cols_to_remove` : columns to drop because generation should not be performed on them

In [6]:
# params
# params
file_name = 'netis'  # base name of the pickle read from ./data/netis/

# load data (no columns are removed: cols_to_remove keeps its default of None)
TOTAL_DF, TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED, cols, MM_scaler = load_gen_data(file_name)

# shape: full array, scaled train features, scaled test features
print(TOTAL_DF.shape)
print(TRAIN_SCALED.shape)
print(TEST_SCALED.shape)

(26002, 94)
(22363, 92)
(3627, 92)


In [7]:
# Display the min-max scaled training features.
TRAIN_SCALED

array([[3.66328864e-04, 2.52639355e-05, 2.90615333e-05, ...,
        3.42446043e-01, 4.98597475e-01, 1.65289256e-01],
       [2.66795942e-04, 3.25753827e-05, 3.13032504e-05, ...,
        3.51079137e-01, 4.97896213e-01, 1.65289256e-01],
       [2.01613776e-04, 1.37864421e-05, 3.04728091e-05, ...,
        3.51079137e-01, 4.97896213e-01, 1.23966942e-01],
       ...,
       [2.66813941e-03, 9.48559406e-04, 3.24553957e-06, ...,
        6.04316547e-01, 9.89481066e-01, 2.89256198e-01],
       [1.45200665e-03, 2.26151271e-03, 3.08280452e-06, ...,
        6.04316547e-01, 9.88779804e-01, 3.05785124e-01],
       [8.30917887e-03, 8.96945321e-04, 7.79900694e-06, ...,
        6.04316547e-01, 9.88779804e-01, 2.97520661e-01]])

In [8]:
# Display the test features scaled with the scaler fitted on the training data.
TEST_SCALED

array([[4.02355635e-03, 6.62881732e-05, 1.69243372e-05, ...,
        6.79136691e-01, 3.52033661e-01, 2.14876033e-01],
       [4.07135202e-03, 1.18662977e-04, 6.92800865e-06, ...,
        6.79136691e-01, 3.52734923e-01, 2.14876033e-01],
       [4.02378633e-03, 2.82489361e-04, 2.07942738e-05, ...,
        6.79136691e-01, 3.52734923e-01, 2.14876033e-01],
       ...,
       [3.98419630e-03, 7.85747554e-05, 1.75819403e-05, ...,
        5.98561151e-01, 4.25666199e-01, 2.56198347e-01],
       [3.98799304e-03, 4.90275879e-05, 3.81572787e-06, ...,
        6.00000000e-01, 4.24964937e-01, 2.56198347e-01],
       [3.93781591e-03, 3.97621553e-05, 8.13602856e-06, ...,
        6.04316547e-01, 4.24964937e-01, 2.47933884e-01]])

In [9]:
class GenerationDataset(Dataset):
    """Expose a series of rows as consecutive, non-overlapping windows.

    Item ``i`` is the slice of rows ``[i * window, (i + 1) * window)``.
    Trailing rows that do not fill a complete window are never returned.

    Args:
        data: Array-like of rows; converted with ``torch.Tensor(data)``.
        window: Number of rows per item; must be positive.

    Raises:
        ValueError: If ``window`` is not positive.
    """

    def __init__(self, data, window):
        if window <= 0:
            raise ValueError(f"window must be a positive integer, got {window!r}")
        self.data = torch.Tensor(data)
        self.window = window

    def __len__(self):
        """Return the number of complete windows contained in the data."""
        return len(self.data) // self.window

    def __getitem__(self, index):
        """Return the ``index``-th window as a tensor of ``window`` rows.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError``; without it, plain iteration over the dataset would
        never stop because tensor slicing silently returns empty results.
        """
        n_windows = len(self)
        if index < 0:
            index += n_windows
        if not 0 <= index < n_windows:
            raise IndexError(
                f"window index out of range for dataset with {n_windows} windows"
            )
        start = index * self.window
        return self.data[start:start + self.window]

In [10]:
# Rows per dataset item; also becomes the model's sequence_length below.
window = 100

In [11]:
# Wrap the scaled training features into non-overlapping windows of `window` rows.
train_dataset = GenerationDataset(TRAIN_SCALED, window)
train_dataset

<__main__.GenerationDataset at 0x7f2268a34ef0>

In [12]:
# Same windowing for the scaled test features.
test_dataset = GenerationDataset(TEST_SCALED, window)
test_dataset

<__main__.GenerationDataset at 0x7f2268a34780>

In [13]:
# Sanity check: one item is (window, number of features).
train_dataset[0].shape

torch.Size([100, 92])

**Fetch `sequence_length` from dataset**

In [14]:
# Rows in one item, i.e. the window size used to build the dataset.
sequence_length = train_dataset[0].shape[0]
sequence_length

100

**Fetch `number_of_features` from dataset**

This value is the number of input features in each time step.

In [15]:
# Columns in one item, i.e. the number of feature columns kept by load_gen_data.
number_of_features = train_dataset[0].shape[1]
number_of_features

92

### Parameters

In [16]:
# Hyperparameters forwarded unchanged to the VRAE constructor below.
n_epochs = 1000
hidden_size = 90
hidden_layer_depth = 1
latent_length = 20
batch_size = 1
learning_rate = 0.0002
# NOTE(review): the warning printed when VRAE is constructed ends in
# "num_layers={}".format(dropout, num_layers), which looks like PyTorch's notice
# that recurrent dropout is ignored when there is a single layer -- confirm
# whether dropout_rate has any effect with hidden_layer_depth = 1.
dropout_rate = 0.2
# NOTE(review): the option list was spelled "ADAM" while the value used is
# 'Adam' -- confirm the exact strings VRAE accepts.
optimizer = 'Adam' # options: Adam, SGD
cuda = True # options: True, False
print_every=30
clip = True # options: True, False
max_grad_norm=5
loss = 'MSELoss' # options: SmoothL1Loss, MSELoss
block = 'LSTM' # options: LSTM, GRU

### Initialize VRAE object

VRAE inherits from `sklearn.base.BaseEstimator` and overrides `fit`, `transform` and `fit_transform` functions, similar to sklearn modules

In [17]:
# Build the model from the dataset-derived dimensions and the hyperparameters
# defined above; `dload` is the directory configured near the top of the notebook.
vrae = VRAE(sequence_length=sequence_length,
            number_of_features = number_of_features,
            hidden_size = hidden_size, 
            hidden_layer_depth = hidden_layer_depth,
            latent_length = latent_length,
            batch_size = batch_size,
            learning_rate = learning_rate,
            n_epochs = n_epochs,
            dropout_rate = dropout_rate,
            optimizer = optimizer, 
            cuda = cuda,
            print_every=print_every, 
            clip=clip, 
            max_grad_norm=max_grad_norm,
            loss = loss,
            block = block,
            dload = dload)

  "num_layers={}".format(dropout, num_layers))


### Fit the model onto dataset

In [None]:
# Train on the windowed training set; the returned value is plotted in the next cell.
loss_arr = vrae.fit(train_dataset)

# If the model has to be saved with the learnt parameters, use:
# vrae.fit(dataset, save = True)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Epoch: 0
Batch 30, loss = 72.3787, recon_loss = 72.3705, kl_loss = 0.0081
Batch 60, loss = 33.8839, recon_loss = 33.8686, kl_loss = 0.0153
Batch 90, loss = 2.8417, recon_loss = 2.7706, kl_loss = 0.0712
Batch 120, loss = 1.4384, recon_loss = 1.2654, kl_loss = 0.1730
Batch 150, loss = 130.5443, recon_loss = 130.1898, kl_loss = 0.3545
Batch 180, loss = 65.8884, recon_loss = 65.5564, kl_loss = 0.3320
Batch 210, loss = 86.6773, recon_loss = 86.4329, kl_loss = 0.2444
Average loss: 54.9296
Epoch: 1
Batch 30, loss = 24.2688, recon_loss = 24.1388, kl_loss = 0.1300
Batch 60, loss = 4.9921, recon_loss = 4.8392, kl_loss = 0.1529
Batch 90, loss = 1.5662, recon_loss = 1.3917, kl_loss = 0.1746
Batch 120, loss = 0.8952, recon_loss = 0.7551, kl_loss = 0.1401
Batch 150, loss = 143.5255, recon_loss = 143.1471, kl_loss = 0.3784
Batch 180, loss = 98.2070, recon_loss = 97.8368, kl_loss = 0.3702
Batch 210, loss = 52.7060, recon_loss = 52.4864, kl_loss = 0.2197
Average loss: 47.9244
Epoch: 2
Batch 30, loss = 

In [None]:
import matplotlib.pyplot as plt

# Plot the values returned by vrae.fit (presumably one loss per epoch -- confirm in model.vrae).
plt.plot(loss_arr)

### Save the model to be fetched later

In [None]:
# Persist the trained model under the file name 'vrae.pth'
# (presumably inside the `dload` directory -- confirm in model.vrae).
vrae.save('vrae.pth')

# To load a presaved model, execute:
# vrae.load('vrae.pth')

In [None]:
# Inspect the model's `is_fitted` attribute after training.
vrae.is_fitted

### Transform the input timeseries to encoded latent vectors

In [None]:
# Encode the test windows; per the section heading this yields the latent vectors.
z_run = vrae.transform(test_dataset)
z_run

In [None]:
# Check the dimensions of the encoded output.
z_run.shape

In [None]:
from einops import rearrange
import torch.nn as nn

In [None]:
def concat_recon(recon_output):
    """Collapse a 3-D reconstruction into a 2-D array of rows.

    For an input of shape ``(w, b, f)`` the first two axes are swapped and
    then merged, giving shape ``(w * b, f)``. Rows are grouped by the input's
    second axis first.

    Args:
        recon_output: 3-D array or tensor accepted by ``einops.rearrange``.

    Returns:
        The reshaped 2-D array or tensor.
    """
    n_first, n_second, n_feat = recon_output.shape
    swapped = rearrange(recon_output, 'w b f -> b w f')
    return swapped.reshape(n_first * n_second, n_feat)

def inverse_minmax(x):
    """Undo min-max scaling using the notebook-level ``MM_scaler``.

    Args:
        x: Scaled values accepted by ``MM_scaler.inverse_transform``.

    Returns:
        The result of ``MM_scaler.inverse_transform(x)``.
    """
    return MM_scaler.inverse_transform(x)

def eval_recon(recon, real, undo = True):
    """Compute the mean squared error between a reconstruction and real data.

    Args:
        recon: 2-D array or tensor of reconstructed rows.
        real: 2-D array or tensor of reference rows. Only its first
            ``recon.shape[0]`` rows are compared, so it may be longer than
            ``recon``.
        undo: When truthy, ``recon`` is first passed through
            ``inverse_minmax`` so the comparison happens on the unscaled data.

    Returns:
        A zero-dimensional tensor holding the MSE.
    """
    if undo:
        # undo minmax scaling
        recon = inverse_minmax(recon)

    # Trim the reference to the rows covered by the reconstruction.
    n_rows = recon.shape[0]
    real = real[:n_rows, :]

    # get shape
    print(recon.shape)
    print(real.shape)

    # as_tensor avoids the copy-construct warning torch.tensor emits for
    # tensor inputs; detach keeps the original "no autograd history" behaviour.
    recon_t = torch.as_tensor(recon).detach()
    real_t = torch.as_tensor(real).detach()

    # Model outputs are typically float32 while NumPy arrays are float64;
    # promote both sides to one floating dtype so the loss never hits a
    # dtype mismatch.
    common = torch.promote_types(recon_t.dtype, real_t.dtype)
    if not common.is_floating_point:
        common = torch.get_default_dtype()

    # compute loss
    criterion = nn.MSELoss()
    return criterion(recon_t.to(common), real_t.to(common))

## Reconstruct

### Train

In [None]:
# train reconstruct
train_recon = vrae.reconstruct(train_dataset)
print(train_recon.shape)
train_recon

In [None]:
# Flatten the 3-D reconstruction into 2-D rows so it can be compared with the data arrays.
train_recon = concat_recon(train_recon)
train_recon.shape

In [None]:
# MSE on the original scale: the reconstruction is inverse-scaled (undo defaults to True)
# and compared with the unscaled training features.
eval_recon(train_recon, TRAIN_DF)

In [None]:
# MSE on the min-max scale: no inverse scaling, compared with the scaled training features.
eval_recon(train_recon, TRAIN_SCALED, False)

### Test

In [None]:
# test reconstruct
test_recon = vrae.reconstruct(test_dataset)
print(test_recon.shape)
test_recon

In [None]:
# Flatten the 3-D reconstruction into 2-D rows so it can be compared with the data arrays.
test_recon = concat_recon(test_recon)
test_recon.shape

In [None]:
# MSE on the original scale: the reconstruction is inverse-scaled (undo defaults to True)
# and compared with the unscaled test features.
eval_recon(test_recon, TEST_DF)

In [None]:
# MSE on the min-max scale: no inverse scaling, compared with the scaled test features.
eval_recon(test_recon, TEST_SCALED, False)

### VISUALIZE