#### Contents

0. [Load data and preprocess](#Load-data-and-preprocess)
1. [Initialize VRAE object](#Initialize-VRAE-object)
2. [Fit the model onto dataset](#Fit-the-model-onto-dataset)
3. [Transform the input timeseries to encoded latent vectors](#Transform-the-input-timeseries-to-encoded-latent-vectors)
4. [Save the model to be fetched later](#Save-the-model-to-be-fetched-later)

### Import required modules

In [22]:
%load_ext autoreload
%autoreload 2

from model.vrae import VRAE
from model.utils import *
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import trange
import tqdm

### Input parameters

In [2]:
dload = './saved_model' #download directory

### Hyper parameters

### Load data and preprocess
- `folder` : data location
- `cols_to_remove` : generation 수행하지 않을 column 제거

**TODO : 해당 변수에 대한 처리를 어떻게 해줘야하는가 확인 작업이 필요함**

~~~
YYYYMMDD : 년월일
HHMMSS : 시분초
MNG_NO : 장비번호
IF_IDX : 회선 index
~~~

- 현재는 분석의 편의를 위해 ['YYYYMMDD', 'HHMMSS']만 제거해줌

In [3]:
# params
folder = 'data'
cols_to_remove = ['YYYYMMDD', 'HHMMSS']

# load data
df_total = load_data(folder, cols_to_remove)

# shape
print(df_total.shape)

(23195128, 56)


In [4]:
class HamonDataset(Dataset):
    def __init__(self, data, window, stride):
        self.data = torch.Tensor(data)
        self.window = window
 
    def __len__(self):
        return len(self.data) -  self.window 
    
    def __getitem__(self, index):
        x_index = index*self.window
        x = self.data[x_index:x_index+self.window]
        return x

In [5]:
data = df_total
stride = 10
window = 100

In [6]:
train_dataset = HamonDataset(data, window, stride)
train_dataset

<__main__.HamonDataset at 0x7f7cbaa3f940>

In [7]:
train_dataset[0].shape

torch.Size([100, 56])

**Fetch `sequence_length` from dataset**

In [8]:
sequence_length = train_dataset[0].shape[0]
sequence_length

100

**Fetch `number_of_features` from dataset**

This config corresponds to number of input features

In [9]:
number_of_features = train_dataset[0].shape[1]
number_of_features

56

### Parameters

In [10]:
n_epochs = 1
hidden_size = 90
hidden_layer_depth = 1
latent_length = 20
batch_size = 32
learning_rate = 0.0005
dropout_rate = 0.2
optimizer = 'Adam' # options: ADAM, SGD
cuda = True # options: True, False
print_every=30
clip = True # options: True, False
max_grad_norm=5
loss = 'MSELoss' # options: SmoothL1Loss, MSELoss
block = 'LSTM' # options: LSTM, GRU

In [11]:
train_loader = DataLoader(dataset = train_dataset,
                          batch_size = batch_size,
                          shuffle = False,
                          drop_last=True)

In [12]:
X = iter(train_loader).next()
X

tensor([[[2.7220e+03, 1.2400e+02, 1.8431e+05,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.7220e+03, 1.2400e+02, 3.8349e+05,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.7220e+03, 1.2400e+02, 2.3519e+05,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [2.8500e+03, 1.2400e+02, 2.3200e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00]],

        [[2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [2.8630e+03, 1.2400e+02, 1.8664e+04,  ..., 0.0000e+00,
          0.000

In [13]:
X.shape

torch.Size([32, 100, 56])

### Initialize VRAE object

VRAE inherits from `sklearn.base.BaseEstimator` and overrides `fit`, `transform` and `fit_transform` functions, similar to sklearn modules

In [23]:
vrae = VRAE(sequence_length=sequence_length,
            number_of_features = number_of_features,
            hidden_size = hidden_size, 
            hidden_layer_depth = hidden_layer_depth,
            latent_length = latent_length,
            batch_size = batch_size,
            learning_rate = learning_rate,
            n_epochs = n_epochs,
            dropout_rate = dropout_rate,
            optimizer = optimizer, 
            cuda = cuda,
            print_every=print_every, 
            clip=clip, 
            max_grad_norm=max_grad_norm,
            loss = loss,
            block = block,
            dload = dload)

### Fit the model onto dataset

In [24]:
train_dataset[0]

tensor([[2.7220e+03, 1.2400e+02, 1.8431e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.7220e+03, 1.2400e+02, 3.8349e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.7220e+03, 1.2400e+02, 2.3519e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [2.8500e+03, 1.2400e+02, 2.3200e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [28]:
train_loader_test = DataLoader(dataset = train_dataset[0],
                          batch_size = 32,
                          shuffle = False,
                          drop_last=True)

In [29]:
tmp = iter(train_loader_test).next()
print(tmp.shape)

torch.Size([32, 56])


In [30]:
vrae.fit(train_dataset)

#If the model has to be saved, with the learnt parameters use:
# vrae.fit(dataset, save = True)

fit result
<__main__.HamonDataset object at 0x7f7cbaa3f940>
tensor([[2.7220e+03, 1.2400e+02, 1.8431e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.7220e+03, 1.2400e+02, 3.8349e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.7220e+03, 1.2400e+02, 2.3519e+05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [2.8500e+03, 1.2400e+02, 2.3200e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.8500e+03, 1.2400e+02, 2.4000e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])
torch.Size([32, 100, 56])
Epoch: 0
--------------------------
DEBUGGING
torch.Size([32, 100, 56])
--------------------------


RuntimeError: Expected hidden[0] size (1, 32, 90), got [1, 100, 90]

### Transform the input timeseries to encoded latent vectors

In [None]:
z_run = vrae.transform(test_dataset)

#If the latent vectors have to be saved, pass the parameter `save`
# z_run = vrae.transform(dataset, save = True)

### Save the model to be fetched later

In [None]:
vrae.save('vrae.pth')

# To load a presaved model, execute:
# vrae.load('vrae.pth')

### Visualize using PCA and tSNE

In [None]:
plot_clustering(z_run, y_val, engine='matplotlib', download = False)

# If plotly to be used as rendering engine, uncomment below line
#plot_clustering(z_run, y_val, engine='plotly', download = False)