# TimeGAN para HAR (Human Activity Recognition)

- A implementação original, disponível [aqui](https://github.com/jsyoon0823/TimeGAN/tree/master), usou o TensorFlow v1.
- Uma implementação alternativa usando TensorFlow v2 (com alguns erros) está disponível [aqui](https://www.kaggle.com/code/alincijov/stocks-generate-synthetic-data-timegan).
- Uma implementação usando PyTorch foi desenvolvida em 2021 e está disponível [aqui](https://github.com/benearnthof/TimeGAN/tree/main).
- Outra implementação usando PyTorch está disponível [aqui](https://github.com/zzw-zwzhang/TimeGAN-pytorch/tree/main).

A proposta aqui é tentar adaptar o modelo original para dados de sensores inerciais em tarefas de HAR.

In [1]:
import numpy as np
import torch 
from torch import nn
from torch import functional as F
import os
import plotly.express as px
import plotly.graph_objects as go
from torchinfo import summary

In [24]:
# Path of the CSV file holding the stock data (relative to the working directory).
file = 'stock_data.csv'
# Whole table as a NumPy array; the first line is skipped (presumably a header row).
raw = np.loadtxt(file, delimiter = ",",skiprows = 1)

In [11]:
def Normalize(dta):
  """Min-max scale ``dta`` along axis 0.

  Args:
    - dta: array whose minimum and maximum are taken along axis 0
      (one value per column for a 2-D array)

  Returns:
    - array of the same shape as dta, rescaled to approximately [0, 1]
  """
  col_min = np.min(dta, 0)
  col_span = np.max(dta, 0) - col_min
  # The small constant keeps the division defined for constant columns.
  return (dta - col_min) / (col_span + 1e-7)

In [12]:
def real_data_loading (path, seq_len):
  """Load a CSV time series and cut it into shuffled sliding windows.

  Args:
    - path: path to a comma-separated file; its first line is skipped
    - seq_len: sequence length (number of consecutive rows per window)

  Returns:
    - data: list of len(rows) - seq_len windows in random order; each window
      holds seq_len consecutive rows of the normalized data
      (shape [seq_len, n_variables] for a multi-column file).

  Raises:
    - FileNotFoundError: if path does not point to an existing file.
  """
  # Explicit check instead of `assert`, which is stripped under `python -O`.
  if not os.path.isfile(path):
    raise FileNotFoundError(f"Data file not found: {path}")
  ori_data = np.loadtxt(path, delimiter = ",", skiprows = 1)

  # Reverse the row order (assumes the file lists the newest rows first,
  # so that the result is chronological -- TODO confirm for new data sets)
  ori_data = ori_data[::-1]
  # Normalize the data
  ori_data = Normalize(ori_data)

  # Cut data by sequence length: one window per start index
  windows = [ori_data[i:i + seq_len] for i in range(len(ori_data) - seq_len)]

  # Mix the windows (to make them similar to i.i.d. samples)
  idx = np.random.permutation(len(windows))
  return [windows[i] for i in idx]

In [13]:
# Window length (rows per sequence) used when cutting the series.
seq_len = 24
# List of windows returned by real_data_loading for the CSV file.
ori_data = real_data_loading(file, seq_len)

In [25]:
def train_test_divide (data_x, data_x_hat, data_t, data_t_hat, train_rate = 0.8):
  """Randomly split original and synthetic data into train and test parts.

  The original pair (data_x, data_t) and the synthetic pair
  (data_x_hat, data_t_hat) are shuffled independently of each other; within
  a pair the same permutation is applied to both sequences.

  Args:
    - data_x: original data
    - data_x_hat: generated data
    - data_t: original time
    - data_t_hat: generated time
    - train_rate: fraction of each data set that goes into the train part

  Returns:
    - train_x, train_x_hat, test_x, test_x_hat,
      train_t, train_t_hat, test_t, test_t_hat: lists of selected items
  """
  def _shuffle_and_cut(values, times):
    # One permutation per pair keeps values[k] aligned with times[k].
    count = len(values)
    order = np.random.permutation(count)
    cut = int(count*train_rate)
    first, rest = order[:cut], order[cut:]
    return (
      [values[k] for k in first],
      [values[k] for k in rest],
      [times[k] for k in first],
      [times[k] for k in rest],
    )

  # Original data first, synthetic data second (two separate permutations).
  train_x, test_x, train_t, test_t = _shuffle_and_cut(data_x, data_t)
  train_x_hat, test_x_hat, train_t_hat, test_t_hat = _shuffle_and_cut(data_x_hat, data_t_hat)

  return train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat

In [26]:
def extract_time (data):
  """Collect the length of every sequence and the longest length.

  Args:
    - data: sequence of 2-D arrays (each one is indexed as seq[:, 0])

  Returns:
    - time: list with the number of rows of each sequence
    - max_seq_len: largest entry of time, or 0 when data is empty
  """
  time = [len(seq[:, 0]) for seq in data]
  max_seq_len = max(time, default = 0)
  return time, max_seq_len

In [27]:
# Length of every window and the longest length found in the data set.
seq_lengths, max_seq_len = extract_time(ori_data)

In [32]:
def batch_generator(data, time, batch_size):
  """Draw one random mini-batch without replacement.

  Args:
    - data: time-series data
    - time: time information, aligned with data by index
    - batch_size: the number of samples in each batch

  Returns:
    - X_mb: list with the selected time-series
    - T_mb: list with the matching time information

  If batch_size exceeds len(data), all samples are returned in shuffled order.
  """
  chosen = np.random.permutation(len(data))[:batch_size]

  X_mb = [data[k] for k in chosen]
  T_mb = [time[k] for k in chosen]
  return X_mb, T_mb

In [33]:
# Sequence lengths of all windows and the longest one.
ori_time, max_seq_len = extract_time(ori_data)
# Stacking the windows gives (number of windows, sequence length, number of features).
no, seq_len, dim = np.asarray(ori_data).shape
batch_size = 128
# The random vectors get the same feature dimension as the data.
z_dim = dim

In [38]:
def random_generator (batch_size, z_dim, T_mb, max_seq_len):
  """Random vector generation.

  Every sample gets T_mb[i] rows of uniform noise in [0, 1); the remaining
  rows up to max_seq_len stay zero, so all returned arrays share one shape.

  Args:
    - batch_size: number of random sequences to generate
    - z_dim: dimension of random vector
    - T_mb: sequence length of each sample (at least batch_size entries,
      each at most max_seq_len)
    - max_seq_len: maximum sequence length

  Returns:
    - Z_mb: list of batch_size arrays of shape [max_seq_len, z_dim]
  """
  Z_mb = list()
  for i in range(batch_size):
    padded = np.zeros([max_seq_len, z_dim])
    padded[:T_mb[i], :] = np.random.uniform(0., 1, [T_mb[i], z_dim])
    # Append the zero-padded array (not the raw noise) so that sequences
    # shorter than max_seq_len still stack into one rectangular batch.
    Z_mb.append(padded)
  return Z_mb

In [39]:
# Draw one random mini-batch of windows together with their sequence lengths.
X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 

In [45]:
# One random noise sequence per sample of the mini-batch.
Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len)
# Notebook display of the stacked shape: (batch, sequence length, z_dim).
np.array(Z_mb).shape

(128, 24, 6)

In [46]:
# Inputs of the model, written as TensorFlow placeholders:
# X = tf.placeholder(tf.float32, [None, max_seq_len, dim], name = "myinput_x")
# Z = tf.placeholder(tf.float32, [None, max_seq_len, z_dim], name = "myinput_z")
# T = tf.placeholder(tf.int32, [None], name = "myinput_t")
# X is the original data => (batch, 24, 6) for the stock data, flexible batch size
# Z is the random noise from random_generator => (batch, 24, 6) for the stock data
# T holds one sequence length per sample => (batch,), i.e. 128 entries per mini-batch here
# Hyperparameters for the embedding network (the RNN class defined below).
num_layers = 2
hidden_dim = 24
seq_len = 24
# input_size is the number of features per time step (6 for the stock data),
# not the sequence length (24)
input_size = 6
device = "cpu"
batch_size = 128

# from the RNN documentation:
# arguments: input_size = num_features, hidden_size, num_layers
# rnn = nn.RNN(6, 24, 2, batch_first=True)
# input with batch_first=True: (batch, seq_len, features) = (128, 24, 6)
# input = torch.randn(128, 24, 6)
# initial hidden state: (num_layers, batch, hidden_size)
# h0 = torch.randn(2, 128, 24)
# output, hn = rnn(input, h0)

class RNN(nn.Module):
  """Stacked nn.RNN followed by a linear layer and a sigmoid per time step.

  Args:
    - input_size: number of features per time step
    - hidden_dim: size of the hidden state and of the output embedding
    - num_layers: number of stacked recurrent layers
  """
  def __init__(self, input_size, hidden_dim, num_layers):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    # batch_first = True: inputs are (batch_size, seq_len, num_features)
    self.rnn = nn.RNN(input_size, hidden_dim, num_layers, batch_first = True)
    self.fc = nn.Linear(hidden_dim, hidden_dim)
    self.nonlinearity = nn.Sigmoid()

  def forward(self, x):
    """Embed every time step of x.

    Args:
      - x: float tensor of shape (batch_size, seq_len, input_size)

    Returns:
      - tensor of shape (batch_size, seq_len, hidden_dim) with values
        produced by the sigmoid
    """
    # Zero initial state of shape (num_layers, batch_size, hidden_dim).
    # It is created on the device of x, so the module does not depend on a
    # global `device` variable and works wherever the input lives.
    h0 = torch.zeros(
      self.num_layers, x.size(0), self.hidden_dim,
      dtype = torch.float32, device = x.device,
    )
    # The final hidden state is not needed; only the per-step outputs are used.
    out, _ = self.rnn(x, h0)
    # The linear layer and the sigmoid act on the last dimension, i.e. on
    # every time step independently.
    out = self.fc(out)
    return self.nonlinearity(out)

In [55]:
# testing the RNN module
net = RNN(6, 24, 5)
# converting everything to float to avoid data type runtime errors
net.float()

# using one minibatch as example data
# astype returns a new array, so its result must be kept for the cast to take effect
dta = np.array(X_mb).astype(np.float32)
dta = torch.from_numpy(dta)

emb = net(dta.float())
# emb has shape (batch, seq_len, hidden_dim) = (128, 24, 24): one 24-dimensional
# embedding per time step for each of the 128 samples in the batch
# => the 6 input features are embedded in a higher dimensional space in this case
# (NOTE(review): the original note says this is as mentioned in the paper -- confirm)

In [58]:
# Notebook display: shape of the output for the first sample of the batch.
emb[0].shape

torch.Size([24, 24])

In [60]:
# torchinfo overview of the layers, output shapes and parameter counts
# for an input of shape (batch_size, 24, 6).
summary(net, input_size=(batch_size, 24, 6))

Layer (type:depth-idx)                   Output Shape              Param #
RNN                                      [128, 24, 24]             --
├─RNN: 1-1                               [128, 24, 24]             5,568
├─Linear: 1-2                            [128, 24, 24]             600
├─Sigmoid: 1-3                           [128, 24, 24]             --
Total params: 6,168
Trainable params: 6,168
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 17.18
Input size (MB): 0.07
Forward/backward pass size (MB): 1.18
Params size (MB): 0.02
Estimated Total Size (MB): 1.28