In [97]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch import nn, Tensor

from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

import random
import math
import time
import sys
sys.path.insert(1, '../code/')
import ssm
# insert at 1, 0 is the script path (or '' in REPL)
import util

import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


### Set the path and specification that you want to predict

In [98]:
data_path = '../data/transformer_data'
spec = 'airworthy' #'hover'#'interference'#'dist' #'mass' #'airworthy'
max_mass = 35. #(kg)

### Download the dataset and build the data loaders

In [99]:
torch.manual_seed(0)
np.random.seed(0)
batch_size = 2 # 512
batch_size_val = 2 # 512
frac_train = 0.4
frac_val = 0.4

dataloader_tr, dataloader_val, dataloader_test, scale_1, scale_2 = ssm.prepare_sequence_data(data_path, spec, batch_size = batch_size ,batch_size_val = batch_size_val, frac_train = frac_train, frac_val = frac_val)

In [4]:
dic = torch.load(data_path)
print(dic.keys())
# dic['X'][-1]
K = len(dic['encoding_dict_keys'])
V = len(dic['encoding_dict_values'])

dict_keys(['X', 'X_norm', 'y', 'airworthy', 'hover_time', 'max_speed', 'max_distance', 'interference_list', 'encoding_dict_keys', 'encoding_dict_values', 'norm_dict', 'path', 'folders'])


In [5]:
len(dic['X'])

15

In [6]:
dic['X'][-6][:,:K].shape, dic['X'][-6][:,K:K+V].shape

(torch.Size([69, 43]), torch.Size([69, 577]))

In [7]:
batch, train_data=next(enumerate(dataloader_test))
data, targets, mask = train_data
data.size(), targets.size(), mask.size()


(torch.Size([3, 69, 653]), torch.Size([3]), torch.Size([3, 69]))

In [8]:
X_k=data[:,:,:K] #19
X_v=data[:,:,K:K+V] #V (above code)

In [9]:
X_k.shape, dic['X'][-1][:,:K].shape

(torch.Size([3, 69, 43]), torch.Size([37, 43]))

In [10]:
X_v.shape, dic['X'][-1][:,K:K+V].shape, data.shape

(torch.Size([3, 69, 577]), torch.Size([37, 577]), torch.Size([3, 69, 653]))

In [11]:
dic['X'][-1].shape, dic['airworthy']

(torch.Size([37, 653]), [1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1])

In [12]:
dic['X'][-1][:,:K].argmax(1).size()

torch.Size([37])

In [13]:
data[-1][:,:K].argmax(1)

tensor([19, 19, 10, 18, 27, 33, 41, 19,  2, 39, 24, 35, 21, 30, 12, 22, 36,  6,
        15,  1,  3, 13, 16, 23, 37, 40, 34, 28, 29,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [14]:
print(K, V)

43 577


In [15]:
def decode(dic, tokens, values):
    '''
    Take an encoding dictionary {'token0':0, 'token1':1, ... } and list of tokens [0, 1, 4, ... ]
    and return the corresponding strings of tokens ['token0', 'token1', 'token4', ...]
    '''
    
    seq = []
    dic_reverse = {value:key for key, value in dic.items()}
    for n, tok in enumerate(tokens):
        if dic_reverse[int(tok)] == 'Value':
            seq.append(float(values[n]))
        else:
            seq.append(dic_reverse[int(tok)])
    return seq

    
# decoded_design = []
# for i in range(len(int(dic['X'][-1]):
k = decode(dic['encoding_dict_keys'], dic['X'][-6][:,:K].argmax(1), None)
v = decode(dic['encoding_dict_values'], dic['X'][-6][:,K:K+V].argmax(1), dic['X_norm'][-6][:,-1])
    # decoded_design.append({k[0]:v[0]}

In [16]:
[{k[i]:v[i]} for i in range(len(k))]

[{'node_type': 'ConnectedHub4_1_2_1'},
 {'node_type': 'PropArm'},
 {'armLength': -0.9728665351867676},
 {'motorType': 't_motor_U7_V2.0KV490'},
 {'propType': 'apc_propellers_17x7E'},
 {'offset': -0.8203427195549011},
 {'angle': -0.34535443782806396},
 {'node_type': 'SidewaysBendWithTopSegment'},
 {'angle': -1.3278722763061523},
 {'armLength': 1.0017277002334595},
 {'node_type': 'PropArm'},
 {'armLength': -1.2462646961212158},
 {'motorType': 'kde_direct_KDE700XF_455_G3'},
 {'propType': 'apc_propellers_16x8'},
 {'offset': 1.362557053565979},
 {'angle': 1.7003532648086548},
 {'node_type': 'CrossSegment'},
 {'armLength': -1.3186391592025757},
 {'node_type': 'AngledPropArm'},
 {'armLength': 1.0142735242843628},
 {'motorType': 't_motor_U7_V2.0KV490'},
 {'propType': 'apc_propellers_16x6E'},
 {'node_type': 'AngledPropArm'},
 {'armLength': -1.4505165815353394},
 {'motorType': 'kde_direct_KDE4012XF_400'},
 {'propType': 'apc_propellers_17x7E'},
 {'node_type': 'CrossSegment'},
 {'armLength': 0.2025

In [17]:
def decode_embedding(x):
    ''' 
    x is one embedding sample [69(timestep), 653(embedding size)]
    '''
    def decode(dic, tokens, values):
        '''
        Take an encoding dictionary {'token0':0, 'token1':1, ... } and list of tokens [0, 1, 4, ... ]
        and return the corresponding strings of tokens ['token0', 'token1', 'token4', ...]
        '''
    
        seq = []
        dic_reverse = {value:key for key, value in dic.items()}
        for n, tok in enumerate(tokens):
            if dic_reverse[int(tok)] == 'Value':
                seq.append(float(values[n]))
            else:
                seq.append(dic_reverse[int(tok)])
        return seq
    data_path = '../data/transformer_data'
    dic = torch.load(data_path)
    K = len(dic['encoding_dict_keys'])
    V = len(dic['encoding_dict_values'])
    k = decode(dic['encoding_dict_keys'], x[:,:K].argmax(1), None)
    v = decode(dic['encoding_dict_values'], x[:,K:K+V].argmax(1), dic['X_norm'][-6][:,-1])
    end_idx = k.index('varioY') #end token
    k = k[:end_idx+1]
    v = v[:end_idx+1]
    output = [{k[i]:v[i]} for i in range(len(k))]
    return output
    

In [90]:
batch, test_data=next(enumerate(dataloader_test))
data, targets, mask = test_data
data.size()

torch.Size([3, 69, 653])

In [91]:
dic['X_norm'][-6][:,-1]

tensor([ 0.0000,  0.0000, -0.9729,  0.0000,  0.0000, -0.8203, -0.3454,  0.0000,
        -1.3279,  1.0017,  0.0000, -1.2463,  0.0000,  0.0000,  1.3626,  1.7004,
         0.0000, -1.3186,  0.0000,  1.0143,  0.0000,  0.0000,  0.0000, -1.4505,
         0.0000,  0.0000,  0.0000,  0.2025,  0.0000, -1.1318, -0.4462,  0.0000,
         0.0000, -0.7071,  0.7071,  0.7071,  1.0000, -0.7071,  0.7071, -0.7071,
        -0.7071,  0.0000,  0.0000,  0.0000, -2.5834,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0882, -0.1040,  0.9496, -0.1040,  0.0000,  1.1547,  0.0000,
        -1.1547,  0.2582, -0.9496,  0.2582,  0.9496,  0.2582, -0.9496,  0.2582,
         0.9496,  0.2582, -0.9496,  0.2582,  0.9496])

In [92]:
data

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.9729],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.9496],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.2582],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.9496]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.4256],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0

In [93]:
data2 = torch.clip(data, min=0, max = 1)

In [96]:
len(data[0][:,:K].argmax(1)), len(data[0][:,K:K+V].argmax(1)), data[0][:,:K].argmax(1), data[0][:,K:K+V].argmax(1), data[:,:,:].argmax(2)

(69,
 69,
 tensor([19, 19, 10, 18, 27, 33, 41, 19, 41, 10, 19, 10, 18, 27, 33, 41, 19, 10,
         19, 10, 18, 27, 19, 10, 18, 27, 19, 10, 19, 14, 31,  4,  0,  9,  8, 32,
         20, 26, 11, 42, 38,  5,  7, 19, 10, 18, 27, 19,  2, 39, 24, 35, 21, 30,
         12, 22, 36,  6, 15,  1,  3, 13, 16, 23, 37, 40, 34, 28, 29]),
 tensor([221, 543, 576, 428, 361, 576, 576, 561, 576, 576, 543, 576,  35,  98,
         576, 576, 246, 576, 513, 576, 428, 495, 513, 576,  15, 361, 246, 576,
         328, 576, 576,  67, 353, 576, 576, 576, 576, 576, 576, 576, 576, 576,
         354, 513, 576, 262, 484, 284, 305, 576, 576, 576, 576, 576, 576, 576,
         576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576]),
 tensor([[ 19,  19,  10,  18, 647,  33,  41,  19,  41, 652,  19,  10, 621, 648,
          652, 652,  19,  10,  19, 652,  18,  27,  19,  10,  18, 647,  19,  10,
           19,  14,  31,   4,   0,   9,   8,  32,  20,  26,  11,  42,  38,   5,
            7,  19,  10, 621, 650,  19, 638,

In [95]:
len(data2[0][:,:K].argmax(1)), len(data2[0][:,K:K+V].argmax(1)), data2[0][:,:K].argmax(1), data2[0][:,K:K+V].argmax(1)

(69,
 69,
 tensor([19, 19, 10, 18, 27, 33, 41, 19, 41, 10, 19, 10, 18, 27, 33, 41, 19, 10,
         19, 10, 18, 27, 19, 10, 18, 27, 19, 10, 19, 14, 31,  4,  0,  9,  8, 32,
         20, 26, 11, 42, 38,  5,  7, 19, 10, 18, 27, 19,  2, 39, 24, 35, 21, 30,
         12, 22, 36,  6, 15,  1,  3, 13, 16, 23, 37, 40, 34, 28, 29]),
 tensor([221, 543, 576, 428, 361, 576, 576, 561, 576, 576, 543, 576,  35,  98,
         576, 576, 246, 576, 513, 576, 428, 495, 513, 576,  15, 361, 246, 576,
         328, 576, 576,  67, 353, 576, 576, 576, 576, 576, 576, 576, 576, 576,
         354, 513, 576, 262, 484, 284, 305, 576, 576, 576, 576, 576, 576, 576,
         576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576, 576]))

In [82]:
torch.max(data[0]), data[0][:,1], data.size(), data[0][2,K:K+V]

(tensor(1.8281),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 torch.Size([2, 69, 653]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [21]:
dic_reverse = {value:key for key, value in dic['encoding_dict_keys'].items()}

In [22]:
dic_reverse

{0: 'nacaProfile',
 1: 'autoPilotX',
 2: 'batteryType',
 3: 'autoPilotY',
 4: 'wingType',
 5: 'tubeRot',
 6: 'rpmX',
 7: 'servoType',
 8: 'chordInner',
 9: 'span',
 10: 'armLength',
 11: 'flapBias',
 12: 'battery1Y',
 13: 'currentX',
 14: 'arm1Length',
 15: 'rpmY',
 16: 'currentY',
 17: 'batteryY',
 18: 'motorType',
 19: 'node_type',
 20: 'taperOffset',
 21: 'floorHeight',
 22: 'battery2X',
 23: 'voltageX',
 24: 'vertDiameter',
 25: 'batteryX',
 26: 'aileronBias',
 27: 'propType',
 28: 'varioX',
 29: 'varioY',
 30: 'battery1X',
 31: 'arm2Length',
 32: 'chordOuter',
 33: 'offset',
 34: 'gpsY',
 35: 'horzDiameter',
 36: 'battery2Y',
 37: 'voltageY',
 38: 'tubeOffset',
 39: 'length',
 40: 'gpsX',
 41: 'angle',
 42: 'load'}

In [23]:
len(decode_embedding(data[2])), data[2][:,:K].argmax(1)

(29,
 tensor([19, 19, 10, 18, 27, 33, 41, 19,  2, 39, 24, 35, 21, 30, 12, 22, 36,  6,
         15,  1,  3, 13, 16, 23, 37, 40, 34, 28, 29,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [24]:
decode_embedding(data[2])

[{'node_type': 'ConnectedHub6_Sym'},
 {'node_type': 'PropArm'},
 {'armLength': -0.9728665351867676},
 {'motorType': 't_motor_AT4130KV300'},
 {'propType': 'apc_propellers_14x14N'},
 {'offset': -0.8203427195549011},
 {'angle': -0.34535443782806396},
 {'node_type': 'DualBatteryFuselageWithComponents'},
 {'batteryType': 'TurnigyGraphene5000mAh6S75C'},
 {'length': 1.0017277002334595},
 {'vertDiameter': 0.0},
 {'horzDiameter': -1.2462646961212158},
 {'floorHeight': 0.0},
 {'battery1X': 0.0},
 {'battery1Y': 1.362557053565979},
 {'battery2X': 1.7003532648086548},
 {'battery2Y': 0.0},
 {'rpmX': -1.3186391592025757},
 {'rpmY': 0.0},
 {'autoPilotX': 1.0142735242843628},
 {'autoPilotY': 0.0},
 {'currentX': 0.0},
 {'currentY': 0.0},
 {'voltageX': -1.4505165815353394},
 {'voltageY': 0.0},
 {'gpsX': 0.0},
 {'gpsY': 0.0},
 {'varioX': 0.20251135528087616},
 {'varioY': 0.0}]

In [25]:
print(f'Training Data:   {dataloader_tr.dataset.y_train.shape[0]}')
print(f'Validation Data: {dataloader_val.dataset.y_train.shape[0]}')
print(f'Test Data:       {dataloader_test.dataset.y_train.shape[0]}')

Training Data:   6
Validation Data: 6
Test Data:       3


### Set up model from the seq_to_spec_model.py

In [26]:
torch.manual_seed(0)
np.random.seed(0)

emsize = 200  # embedding dimension
d_hid = 512  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 8  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 20  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
D_out = 200
D = dataloader_tr.dataset.x_train.shape[-1]



In [27]:
"""
class TransformerEncoder(nn.Module):
  def __init__(self, input_size, num_head, hidden_size, num_layers):

    super(TransformerEncoder, self).__init__()

    self.embd = nn.Embedding(word_count,input_size)
    encoder_layer = nn.TransformerEncoderLayer(input_size, num_head, hidden_size)
    self.transformer_enc = nn.TransformerEncoder(encoder_layer, num_layers)
    self.linear1 = nn.Linear(input_size,270)

def forward(self, x):
    x = x.long()
    emb = self.embd(x)
    mem = self.transformer_enc(emb)
    out = self.linear1(mem)
    return out, mem
"""

class TransformerModel(nn.Module):

    def __init__(self, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.01, D: int = 741,
                 emsize: int = 200): #dropout was 0.5
        super().__init__()
        
        self.D = D
        self.D_out = emsize
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Linear(self.D, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, self.D_out)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
#         Need to permute from B x SL x D to SL x B x D
        src = self.encoder(src.permute(1, 0, 2)) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        mem = self.transformer_encoder(src, src_key_padding_mask=src_mask)
        output = self.decoder(mem)
        return output, mem

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1) 
    
class TransformerDecoder(nn.Module):
  def __init__(self, input_size, num_head, output_size, hidden_size, num_layers, dropout: float = 0.01):
    super(TransformerDecoder, self).__init__()

    decoder_layer = nn.TransformerDecoderLayer(input_size, num_head, hidden_size, dropout)
    self.transformer_dec = nn.TransformerDecoder(decoder_layer, num_layers)
    self.linear1 = nn.Linear(input_size, output_size)
      # nn.linear(inpute_size, K+V+1)

  def forward(self, x: Tensor, mem: Tensor, tgt_mask: Tensor = None)-> Tensor:
    out = self.transformer_dec(x, mem) #, tgt_key_padding_mask=tgt_mask)
    out = self.linear1(out)
    return out

class TransformerAutoencoder(nn.Module):
    def __init__(self, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.01, D: int = 741,
                 emsize: int = 200): 
        super().__init__()
        self.transformer_encoder = TransformerModel(d_model, nhead, d_hid, nlayers, dropout, D, emsize)
        self.transformer_decoder = TransformerDecoder(d_model, nhead, 653, d_hid, nlayers, dropout)
    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        out, mem = self.transformer_encoder(src,src_mask)
        print(out.size)
        print(mem.size)
        out  =self.transformer_decoder(out,mem, src_mask)
        out = out.permute(1, 0, 2)
        return out

def loss(out):
    l1 = crossentropy_loss(out[:K],X_k) #K=19 X_k: input up to K
    l2 = crossentropy_loss(out[K:K+V],X_v)
    if X_v == 'Value':
        l3 = mse(out[-1], X_float)
    return l1 + l2 + l3

In [28]:
device='cpu'
model = TransformerAutoencoder( emsize, nhead, d_hid, nlayers, dropout, D, D_out).to(device)

In [29]:
batch, train_data=next(enumerate(dataloader_tr))
data, targets, mask = train_data

In [30]:
data.size(), targets.size(), mask.size()

(torch.Size([2, 69, 653]), torch.Size([2]), torch.Size([2, 69]))

In [31]:
X_k: data[:,:,:K] #19
X_v: data[:,:,K:K+V] #V (above code)

In [32]:
output= model(data.to(device), mask.to(device))

<built-in method size of Tensor object at 0x7fa317161350>
<built-in method size of Tensor object at 0x7fa3170fa2a0>


In [33]:
output.size()

torch.Size([2, 69, 653])

### Set up the training routine

In [34]:
import copy
import time

criterion = nn.MSELoss()
    
lr = .1  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 100.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    num_batches = 0 #bptt
    for batch, train_data in enumerate(dataloader_tr):
        data, targets, mask = train_data
        output = model(data.to(device), mask.to(device))
        loss = criterion(output, data)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    return total_loss/num_batches

def evaluate(model: nn.Module) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    num_batches = 0
    with torch.no_grad():
        for batch, val_data in enumerate(dataloader_val):
            data, targets, mask = val_data
            output = model(data.to(device), mask.to(device))
            loss = criterion(output, data)
            num_batches += 1
            total_loss += loss.item()
    return total_loss/num_batches

### Train the model and save the best according to the validation data

In [35]:
best_loss = float('inf')
epochs = 2 #300
best_model = None

loss_list = []
val_loss_list = []

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    loss = train(model)
    val_loss = evaluate(model)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'loss {loss:5.4f} | ')
          #f'val loss {val_loss:5.4f}' )
    print('-' * 89)

    if val_loss < best_loss:
        best_loss = val_loss
        best_model = copy.deepcopy(model)
    loss_list.append(loss)
    val_loss_list.append(val_loss)
 

<built-in method size of Tensor object at 0x7fa317166c00>
<built-in method size of Tensor object at 0x7fa317494310>
<built-in method size of Tensor object at 0x7fa317166e30>
<built-in method size of Tensor object at 0x7fa3174990d0>
<built-in method size of Tensor object at 0x7fa316fbef20>
<built-in method size of Tensor object at 0x7fa3174a9170>
<built-in method size of Tensor object at 0x7fa3170c5440>
<built-in method size of Tensor object at 0x7fa3171670b0>
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  5.68s | loss 0.2991 | 
-----------------------------------------------------------------------------------------
<built-in method size of Tensor object at 0x7fa3168c05e0>
<built-in method size of Tensor object at 0x7fa2ea3c1710>
<built-in method size of Tensor object at 0x7fa317167dd0>
<built-in method size of Tensor object at 0x7fa2ea3c1a80>
<built-in method size of Tensor object at 0x7fa3170f4950>
<built-in metho

In [36]:
### Save and load model

In [46]:
torch.save(model, "./temp.pt")

In [51]:
del model

In [52]:
model = TransformerAutoencoder( emsize, nhead, d_hid, nlayers, dropout, D, D_out).to(device)

In [53]:
model = torch.load("./temp.pt")

In [54]:
model

TransformerAutoencoder(
  (transformer_encoder): TransformerModel(
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-7): 8 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
          )
          (linear1): Linear(in_features=200, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=512, out_features=200, bias=True)
          (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (encoder): Linear(in_features=653, out_features=200, bias=True)
    (decoder): Linear(in

In [59]:
batch, data = next(enumerate(dataloader_tr))
data, targets, mask = data
print(data.size(), mask.size())
print( model.transformer_encoder)
embedding, mem = model.transformer_encoder(data, mask)
print(embedding.size(), mem.size())
embedding_copy = model.transformer_encoder.decoder(mem)
print(embedding_copy.size())
#mem=mem.permute(1,0,2)
print(mem.size())
mem2 = mem.permute(1,0,2)
print(mem2.size())
out  =model.transformer_decoder(embedding,mem,mask)
out = out.permute(1, 0, 2)
print(out.size())

torch.Size([2, 69, 653]) torch.Size([2, 69])
TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=512, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (encoder): Linear(in_features=653, out_features=200, bias=True)
  (decoder): Linear(in_features=200, out_features=200, bias=True)
)


### Plot the training and validation loss

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(loss_list)
plt.plot(val_loss_list)
plt.grid()
plt.show()

In [201]:
i = 0
for x,y,m in dataloader_test:
    with torch.no_grad():
        print(x.shape)
        best_model.eval().to(device)
        output = best_model(x.to(device), m.to(device)).cpu()
        print("MSE btw input and output of autoencoder",criterion(x, output).item())

torch.Size([3, 69, 653])
<built-in method size of Tensor object at 0x7f9b060ac4f0>
<built-in method size of Tensor object at 0x7f9b1139ffb0>
MSE btw input and output of autoencoder 0.004527577199041843


In [None]:
### Convert Output of Autoencoder back to original sequence