In [8]:
import copy
import glob
import gc
import os
import sys
import warnings
import os
import sys
import math
import time
import re
sys.path.append("/share/tml_package")
# sys.path.append("/share/tml_package/tml")
sys.path.append("/share/uspto_pkg")
from tml import utils
from scipy import io
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.optim as optim

import json

from data import TechDataset, CVSampler
from models import Transformer, init_weights
from train_utils import run_epoch, EarlyStopping, perf_eval
from utils import token2class, DotDict

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import cleantext
from cleantext.sklearn import CleanTransformer

from collections.abc import Iterable

## Scratch

In [5]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    ``cfg.key`` is equivalent to ``cfg['key']`` and ``cfg.key = v`` stores
    ``v`` under ``'key'``. Nested dictionaries are returned as stored (they are
    not converted to ``Config``).
    """

    def __getattr__(self, name):
        """Return ``self[name]``; raise AttributeError when the key is absent.

        Python only calls this hook after normal attribute lookup has failed.
        A missing key must surface as AttributeError (not KeyError) so that
        ``hasattr``, ``getattr(obj, name, default)`` and ``copy.deepcopy`` -
        which all probe for optional attributes - keep working.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    # Attribute assignment writes straight into the mapping.
    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        """Read the JSON document at path ``file`` and wrap it in ``cls``.

        Using ``cls`` (rather than a hard-coded ``Config``) lets subclasses
        get instances of their own type back.
        """
        with open(file, 'r') as f:
            return cls(json.load(f))

In [122]:
# Load the single combined configuration file. `load` is a classmethod, so it
# is called on the class directly instead of on a throwaway empty instance.
# NOTE(review): later cells read `train_conf` and `model_conf`, whose loaders
# are disabled below - confirm whether they should be restored or whether
# those cells should use `configs` instead.
# train_conf = Config({}).load('train_config.json')
# model_conf = Config({}).load('model_config.json')
configs = Config.load('configs.json')

In [23]:
# Preferred GPU for this notebook. `torch.cuda.is_available()` only says that
# *some* GPU exists, so the index is checked against the number of visible
# devices; otherwise fall back to the first GPU, or to the CPU when there is none.
GPU_INDEX = 2
if torch.cuda.is_available():
    device = torch.device(f'cuda:{GPU_INDEX}' if GPU_INDEX < torch.cuda.device_count() else 'cuda:0')
else:
    device = torch.device('cpu')

In [116]:
# Build the dataset, then copy the data-dependent sizes (vocabulary size,
# sequence length, padding index) and the target device into the model section
# of `configs`.
# NOTE(review): `train_conf` is only assigned in a commented-out line of an
# earlier cell, so on a fresh kernel this call presumably raises NameError -
# confirm which config object is intended.
# NOTE(review): `data_dir` is a hard-coded absolute path; consider a
# configurable data directory.
tech_dataset = TechDataset(data_dir="/home2/glee/Tech_Gen/data/", params=train_conf)

# The same vocabulary size and sequence length are used for encoder and decoder.
configs.model.update({'device': device,
                  'n_enc_vocab': tech_dataset.vocab_size,
                  'n_dec_vocab': tech_dataset.vocab_size,
                  'n_enc_seq': tech_dataset.seq_len,
                  'n_dec_seq': tech_dataset.seq_len,
                  'i_pad': tech_dataset.vocab_w2i['<PAD>']})

In [180]:
# Same update payload as the `configs.model.update(...)` cell, applied to `dd`.
# NOTE(review): `dd` is not assigned in any visible cell, so this cell depends
# on hidden kernel state (or is leftover scratch) - confirm and remove or define it.
dd.model.update({'device': device,
                  'n_enc_vocab': tech_dataset.vocab_size,
                  'n_dec_vocab': tech_dataset.vocab_size,
                  'n_enc_seq': tech_dataset.seq_len,
                  'n_dec_seq': tech_dataset.seq_len,
                  'i_pad': tech_dataset.vocab_w2i['<PAD>']})

In [54]:
# Batches of 16 samples; no `shuffle` argument is passed. This one loader is
# later handed to both train() and evaluate() in the training cell.
data_loader = DataLoader(tech_dataset, batch_size=16)

In [57]:
# Pull the first batch and move it to `device` for interactive inspection;
# `X[0]` is reused by the decoding cells at the end of the notebook.
X, Y = next(iter(data_loader))
X, Y = X.to(device), Y.to(device)

In [58]:
# Scratch values: the dataset's vocabulary size and a hidden width of 128.
# NOTE(review): neither name is read by any later visible cell (the functions
# below take `d_hidn` as a parameter) - likely removable; confirm.
n_vocab = tech_dataset.vocab_size
d_hidn = 128

In [59]:
def get_sinusoid_encoding_table(n_seq, d_hidn):
    """Build the sinusoidal position table as a NumPy array.

    Args:
        n_seq: number of positions (rows); positions run from 0 to n_seq - 1.
        d_hidn: number of columns.

    Returns:
        Array of shape (n_seq, d_hidn). Entry [pos, i] is computed from the
        angle ``pos / 10000 ** (2 * (i // 2) / d_hidn)``: sine of the angle in
        even columns, cosine in odd columns.
    """
    # One divisor per column; columns 2k and 2k + 1 share the same divisor.
    divisors = [np.power(10000, 2 * (col // 2) / d_hidn) for col in range(d_hidn)]

    table = np.array([[pos / divisor for divisor in divisors] for pos in range(n_seq)])
    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns -> sin
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns -> cos
    return table

def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """Return a mask that is True wherever the key token equals ``i_pad``.

    Args:
        seq_q: 2-D tensor of query token ids (batch, len_q); only its length is used.
        seq_k: 2-D tensor of key token ids (batch, len_k).
        i_pad: id of the padding token.

    Returns:
        Boolean tensor of shape (batch, len_q, len_k); every query row carries
        the same per-key padding flags (an expanded view, not a copy).
    """
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    key_is_pad = seq_k.data.eq(i_pad)
    # Insert a query axis and broadcast the key flags across it.
    return key_is_pad[:, None, :].expand(batch_size, len_q, len_k)

def get_attn_decoder_mask(seq):
    """Return the look-ahead mask for a batch of sequences.

    Args:
        seq: tensor whose first two dimensions are (batch, length); only its
            shape, dtype and device are used.

    Returns:
        Tensor of shape (batch, length, length) with the dtype of ``seq``:
        1 strictly above the main diagonal (positions after the current one),
        0 elsewhere.
    """
    n_batch, n_seq = seq.size(0), seq.size(1)
    all_ones = torch.ones_like(seq)[..., None].expand(n_batch, n_seq, n_seq)
    # Keep only the strictly upper-triangular part of each (length, length) matrix.
    return torch.triu(all_ones, diagonal=1)

In [60]:
from torch.nn import functional as F

In [61]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention with masking and dropout on the weights.

    Reads ``device``, ``dropout`` and ``d_head`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device
        self.dropout = nn.Dropout(config.dropout)
        # Scores are multiplied by 1 / sqrt(d_head).
        self.scale = 1 / (config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """Attend from Q over K / V.

        Args:
            Q, K, V: tensors whose last two dimensions are (seq, d_head); the
                callers in this notebook pass (batch, n_head, seq, d_head).
            attn_mask: boolean tensor matching the score shape; True entries
                are filled with -1e9 before the softmax.

        Returns:
            (context, attn_prob): the weighted sum of V and the attention
            weights after dropout.
        """
        logits = torch.matmul(Q, K.transpose(-1, -2))
        # In-place scaling and masking on the freshly created score tensor.
        logits.mul_(self.scale)
        logits.masked_fill_(attn_mask, -1e9)

        attn_prob = self.dropout(torch.softmax(logits, dim=-1))
        context = torch.matmul(attn_prob, V)
        return context, attn_prob

class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project.

    Reads ``device``, ``d_hidn``, ``n_head``, ``d_head`` and ``dropout`` from
    ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        model_width = config.d_hidn
        heads_width = config.n_head * config.d_head

        # Creation order matters: it fixes the order in which random initial
        # weights are drawn.
        self.W_Q = nn.Linear(model_width, heads_width).to(self.device)
        self.W_K = nn.Linear(model_width, heads_width).to(self.device)
        self.W_V = nn.Linear(model_width, heads_width).to(self.device)
        self.scaled_dot_attn = ScaledDotProductAttention(config)
        self.linear = nn.Linear(heads_width, model_width).to(self.device)
        self.dropout = nn.Dropout(config.dropout).to(self.device)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, n_head * d_head) to (batch, n_head, seq, d_head)."""
        per_head = projected.view(batch_size, -1, self.config.n_head, self.config.d_head)
        return per_head.transpose(1, 2)

    def forward(self, X_Q, X_K, X_V, attn_mask):
        """Run attention from X_Q over X_K / X_V.

        Args:
            X_Q, X_K, X_V: inputs with leading batch dimension and last
                dimension ``d_hidn``.
            attn_mask: (batch, n_q_seq, n_k_seq) mask; it is repeated once per
                head before being passed to the attention kernel.

        Returns:
            (output, attn_prob): output is (batch, n_q_seq, d_hidn) after the
            final projection and dropout; attn_prob is what the attention
            kernel returned.
        """
        batch_size = X_Q.size(0)

        Q = self._split_heads(self.W_Q(X_Q), batch_size)
        K = self._split_heads(self.W_K(X_K), batch_size)
        V = self._split_heads(self.W_V(X_V), batch_size)

        # One copy of the mask per head: (batch, n_head, n_q_seq, n_k_seq).
        per_head_mask = attn_mask.unsqueeze(1).repeat(1, self.config.n_head, 1, 1)

        context, attn_prob = self.scaled_dot_attn(Q, K, V, per_head_mask)

        # Put heads back next to their features and flatten them together.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.config.n_head * self.config.d_head)

        return self.dropout(self.linear(merged)), attn_prob
    
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block built from two kernel-size-1 convolutions.

    Reads ``device``, ``d_hidn``, ``d_ff`` and ``dropout`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        width, inner_width = config.d_hidn, config.d_ff
        # Expansion then contraction; created in this order so random
        # initialisation draws happen in a fixed sequence.
        self.conv1 = nn.Conv1d(in_channels=width, out_channels=inner_width, kernel_size=1).to(self.device)
        self.conv2 = nn.Conv1d(in_channels=inner_width, out_channels=width, kernel_size=1).to(self.device)
        self.activation = F.relu
        self.dropout = nn.Dropout(config.dropout).to(self.device)

    def forward(self, inputs):
        """Map (batch, n_seq, d_hidn) to a tensor of the same shape."""
        # Conv1d expects channels on dim 1, so swap the sequence and feature dims.
        channels_first = inputs.transpose(1, 2)
        hidden = self.activation(self.conv1(channels_first))  # (batch, d_ff, n_seq)
        projected = self.conv2(hidden).transpose(1, 2)         # (batch, n_seq, d_hidn)
        return self.dropout(projected)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    a residual connection and layer normalisation.

    Reads ``device``, ``d_hidn`` and ``layer_norm_epsilon`` from ``config``
    (the sub-modules read their own settings from the same object).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        width = config.d_hidn
        norm_eps = config.layer_norm_epsilon

        # Creation order matters: it fixes the order in which random initial
        # weights are drawn.
        self.self_attn = MultiHeadAttention(config)
        self.layer_norm1 = nn.LayerNorm(width, eps=norm_eps).to(self.device)
        self.pos_ffn = PoswiseFeedForwardNet(config)
        self.layer_norm2 = nn.LayerNorm(width, eps=norm_eps).to(self.device)

    def forward(self, inputs, attn_mask):
        """Apply the block.

        Args:
            inputs: (batch, n_enc_seq, d_hidn) activations.
            attn_mask: mask forwarded to the self-attention module.

        Returns:
            (outputs, attn_prob): the block output with the same shape as
            ``inputs`` and the self-attention weights.
        """
        attended, attn_prob = self.self_attn(inputs, inputs, inputs, attn_mask)
        attended = self.layer_norm1(inputs + attended)  # residual + norm

        transformed = self.pos_ffn(attended)
        transformed = self.layer_norm2(transformed + attended)  # residual + norm

        return transformed, attn_prob
    
class Encoder(nn.Module):
    """Token embedding plus fixed sinusoidal position embedding, followed by a
    stack of ``n_layer`` encoder layers.

    Reads ``device``, ``n_enc_vocab``, ``n_enc_seq``, ``d_hidn``, ``n_layer``
    and ``i_pad`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        self.enc_emb = nn.Embedding(config.n_enc_vocab, config.d_hidn).to(self.device)

        # One extra row: row 0 is used for padding, real positions start at 1.
        position_table = get_sinusoid_encoding_table(config.n_enc_seq + 1, config.d_hidn)
        position_table = torch.tensor(position_table, dtype=torch.float64).to(self.device)
        self.pos_emb = nn.Embedding.from_pretrained(position_table, freeze=True).to(self.device)

        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.n_layer)])

    def forward(self, inputs):
        """Encode a batch of token ids.

        Args:
            inputs: (batch, n_enc_seq) integer token ids.

        Returns:
            (outputs, attn_probs): final activations (batch, n_enc_seq, d_hidn)
            in float32 and a list with one attention-weight tensor per layer.
        """
        n_batch, n_seq = inputs.size(0), inputs.size(1)
        pad_token = self.config.i_pad

        # Positions 1..n_seq for every row, with padding tokens mapped to position 0.
        positions = torch.arange(n_seq, device=inputs.device, dtype=inputs.dtype)
        positions = positions.expand(n_batch, n_seq).contiguous() + 1
        positions.masked_fill_(inputs.eq(pad_token), 0)

        # The position table is float64, so the sum is cast back to float32.
        hidden = (self.enc_emb(inputs) + self.pos_emb(positions)).to(dtype=torch.float32)

        # Keys that are padding are hidden from every query.
        attn_mask = get_attn_pad_mask(inputs, inputs, pad_token)

        attn_probs = []
        for encoder_layer in self.layers:
            hidden, layer_prob = encoder_layer(hidden, attn_mask)
            attn_probs.append(layer_prob)

        return hidden, attn_probs
    
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward net, each followed by residual + layer norm.

    Reads ``device``, ``d_hidn`` and ``layer_norm_epsilon`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        width = config.d_hidn
        norm_eps = config.layer_norm_epsilon

        # Creation order matters: it fixes the order in which random initial
        # weights are drawn.
        self.masked_self_attn = MultiHeadAttention(config)
        self.layer_norm1 = nn.LayerNorm(width, eps=norm_eps).to(self.device)
        self.dec_enc_attn = MultiHeadAttention(config)
        self.layer_norm2 = nn.LayerNorm(width, eps=norm_eps).to(self.device)
        self.pos_ffn = PoswiseFeedForwardNet(config)
        self.layer_norm3 = nn.LayerNorm(width, eps=norm_eps).to(self.device)

    def forward(self, dec_inputs, enc_outputs, masked_self_attn_mask, dec_enc_attn_mask):
        """Apply the block.

        Args:
            dec_inputs: (batch, n_dec_seq, d_hidn) decoder activations.
            enc_outputs: encoder activations used as keys and values.
            masked_self_attn_mask: mask for the decoder self-attention.
            dec_enc_attn_mask: mask for the decoder-to-encoder attention.

        Returns:
            (outputs, masked_self_attn_prob, dec_enc_attn_prob).
        """
        # 1) Self-attention over the decoder's own positions.
        self_attended, masked_self_attn_prob = self.masked_self_attn(
            dec_inputs, dec_inputs, dec_inputs, masked_self_attn_mask
        )
        self_attended = self.layer_norm1(dec_inputs + self_attended)

        # 2) Queries from the decoder, keys/values from the encoder output.
        cross_attended, dec_enc_attn_prob = self.dec_enc_attn(
            self_attended, enc_outputs, enc_outputs, dec_enc_attn_mask
        )
        cross_attended = self.layer_norm2(self_attended + cross_attended)

        # 3) Position-wise feed-forward.
        transformed = self.pos_ffn(cross_attended)
        transformed = self.layer_norm3(cross_attended + transformed)

        return transformed, masked_self_attn_prob, dec_enc_attn_prob
    
class Decoder(nn.Module):
    """Token + sinusoidal position embedding, ``n_layer`` decoder layers and a
    final linear projection onto the decoder vocabulary.

    Reads ``device``, ``n_dec_vocab``, ``n_dec_seq``, ``d_hidn``, ``n_layer``
    and ``i_pad`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = config.device

        self.dec_emb = nn.Embedding(config.n_dec_vocab, config.d_hidn).to(self.device)

        # One extra row: row 0 is used for padding, real positions start at 1.
        position_table = get_sinusoid_encoding_table(config.n_dec_seq + 1, config.d_hidn)
        position_table = torch.tensor(position_table, dtype=torch.float64).to(self.device)
        self.pos_emb = nn.Embedding.from_pretrained(position_table, freeze=True).to(self.device)

        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.n_layer)])

        self.out = nn.Linear(config.d_hidn, config.n_dec_vocab).to(self.device)

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        """Decode a batch of token ids against encoder outputs.

        Args:
            dec_inputs: (batch, n_dec_seq) integer token ids fed to the decoder.
            enc_inputs: (batch, n_enc_seq) encoder token ids; used only to find
                padded encoder positions.
            enc_outputs: encoder activations attended to by every layer.

        Returns:
            (dec_outputs, masked_self_attn_probs, dec_enc_attn_probs): the
            output of the final linear layer, (batch, n_dec_seq, n_dec_vocab),
            and two lists holding one attention-weight tensor per layer.
        """
        n_batch, n_seq = dec_inputs.size(0), dec_inputs.size(1)
        pad_token = self.config.i_pad

        # Positions 1..n_seq for every row, with padding tokens mapped to position 0.
        positions = torch.arange(n_seq, device=dec_inputs.device, dtype=dec_inputs.dtype)
        positions = positions.expand(n_batch, n_seq).contiguous() + 1
        positions.masked_fill_(dec_inputs.eq(pad_token), 0)

        # The position table is float64, so the sum is cast back to float32.
        hidden = (self.dec_emb(dec_inputs) + self.pos_emb(positions)).to(dtype=torch.float32)

        # Self-attention hides a key if it is padding OR lies after the query position.
        pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, pad_token)
        look_ahead_mask = get_attn_decoder_mask(dec_inputs)
        dec_self_attn_mask = (pad_mask + look_ahead_mask).gt(0)

        # Decoder-to-encoder attention only hides padded encoder positions.
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, pad_token)

        masked_self_attn_probs = []
        dec_enc_attn_probs = []
        for decoder_layer in self.layers:
            hidden, self_prob, cross_prob = decoder_layer(
                hidden, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask
            )
            masked_self_attn_probs.append(self_prob)
            dec_enc_attn_probs.append(cross_prob)

        return self.out(hidden), masked_self_attn_probs, dec_enc_attn_probs

class Transformer(nn.Module):
    """Encoder-decoder model: an ``Encoder`` and a ``Decoder`` built from the
    same ``config`` object."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.encoder = Encoder(config)
        self.decoder = Decoder(config)

    def forward(self, enc_inputs, dec_inputs):
        """Encode ``enc_inputs`` and decode ``dec_inputs`` against the result.

        Returns:
            (dec_outputs, enc_self_attn_probs, dec_self_attn_probs,
            dec_enc_attn_probs). ``dec_outputs`` is returned exactly as the
            decoder produced it - no softmax is applied here.
        """
        enc_outputs, enc_self_attn_probs = self.encoder(enc_inputs)

        decoded = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_outputs, dec_self_attn_probs, dec_enc_attn_probs = decoded

        return dec_outputs, enc_self_attn_probs, dec_self_attn_probs, dec_enc_attn_probs

In [64]:
# NOTE(review): these imports rebind `get_sinusoid_encoding_table` and every
# model class name (ScaledDotProductAttention ... Transformer) that the cells
# above define in this notebook, so `Transformer(...)` below uses the `models`
# package versions rather than the in-notebook ones. The mask helpers are
# imported under different names (`get_pad_mask`, `get_subsequent_mask`) than
# the notebook's `get_attn_pad_mask` / `get_attn_decoder_mask`. Confirm which
# implementation is intended and drop the other.
from models import init_weights, get_sinusoid_encoding_table, get_pad_mask, get_subsequent_mask
from models import ScaledDotProductAttention, MultiHeadAttention, PoswiseFeedForwardNet, EncoderLayer, Encoder, DecoderLayer, Decoder, Transformer

In [65]:
# Instantiate the network and run the project's `init_weights` on it.
# NOTE(review): `model_conf` is only assigned in a commented-out line earlier
# in this notebook, so a fresh kernel would presumably raise NameError here -
# confirm which config object should be passed.
# NOTE(review): because of the preceding `from models import ...` cell,
# `Transformer` here is the imported class, not the one defined in this notebook.
model = Transformer(model_conf)
init_weights(model)

In [66]:
# AdamW over all model parameters, and a cross-entropy loss that ignores
# targets equal to `model_conf.i_padding`.
# NOTE(review): the config-update cells in this notebook store the padding
# index under the key 'i_pad', while this line reads `i_padding` - confirm the
# key name. `train_conf` / `model_conf` are not assigned in any visible,
# non-commented cell.
optimizer = optim.AdamW(model.parameters(), lr=train_conf.learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index = model_conf.i_padding)

In [67]:
def train(model, data_loader, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    Each batch ``(X, Y)`` is used as a reconstruction task: ``X`` is both the
    encoder input and the decoder target. The decoder is fed ``X[:, :-1]`` and
    is scored against ``X[:, 1:]``. ``Y`` is not used.

    Args:
        model: callable as ``model(src, trg)`` returning a tuple whose first
            item is the prediction tensor (batch, seq - 1, vocab).
        data_loader: sized iterable of ``(X, Y)`` batches.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss taking (N, vocab) predictions and (N,) target ids.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    # Follow the model's own device instead of a notebook-global `device`,
    # which is reassigned in more than one cell and may not match the model.
    model_device = next(model.parameters()).device
    epoch_loss = 0

    for X, _ in data_loader:
        # Source and target are the same sequence, so one transfer is enough.
        src = X.to(model_device)
        trg = src

        optimizer.zero_grad()

        # Decoder input drops the last position of the target sequence.
        pred_trg, *_ = model(src, trg[:, :-1])
        output_dim = pred_trg.shape[-1]
        pred_trg = pred_trg.contiguous().view(-1, output_dim)  # (batch * (seq - 1), vocab)
        # Targets drop the first position, i.e. they are shifted by one step.
        true_trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(pred_trg, true_trg)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

def evaluate(model, data_loader, criterion):
    """Compute the mean batch loss without updating the model.

    Mirrors ``train``: for each batch ``(X, Y)`` the model is called as
    ``model(X, X[:, :-1])`` and scored against ``X[:, 1:]``. ``Y`` is not used.

    Args:
        model: callable as ``model(src, trg)`` returning a tuple whose first
            item is the prediction tensor (batch, seq - 1, vocab).
        data_loader: sized iterable of ``(X, Y)`` batches.
        criterion: loss taking (N, vocab) predictions and (N,) target ids.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-global `device`.
    model_device = next(model.parameters()).device
    epoch_loss = 0

    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # avoids building a graph for every batch and saves memory.
    with torch.no_grad():
        for X, _ in data_loader:
            src = X.to(model_device)
            trg = src

            # Decoder input drops the last position of the target sequence.
            pred_trg, *_ = model(src, trg[:, :-1])
            output_dim = pred_trg.shape[-1]
            pred_trg = pred_trg.contiguous().view(-1, output_dim)  # (batch * (seq - 1), vocab)
            # Targets drop the first position, i.e. they are shifted by one step.
            true_trg = trg[:, 1:].contiguous().view(-1)

            epoch_loss += criterion(pred_trg, true_trg).item()

    return epoch_loss / len(data_loader)

def epoch_time(start, end):
    """Split the duration ``end - start`` (seconds) into whole minutes and
    whole remaining seconds.

    Returns:
        (minutes, seconds) as ints, each truncated toward zero.
    """
    duration = end - start
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
## Training
clip = 1          # maximum gradient norm handed to train()
n_epochs = 100
best_valid_loss = float('inf')

# NOTE(review): evaluate() receives the same `data_loader` as train(), so the
# "Validation" figures printed below are measured on the training data -
# confirm whether a held-out loader should be used instead.
for ep in range(n_epochs):
    start_time = time.time()
    train_loss = train(model, data_loader, optimizer, criterion, clip)
    valid_loss = evaluate(model, data_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the running minimum so the variable reflects the best epoch so far.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {ep + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')

Epoch: 01 | Time: 1m 5s
	Train Loss: 0.711 | Train PPL: 2.036
	Validation Loss: 0.676 | Validation PPL: 1.965
Epoch: 02 | Time: 1m 4s
	Train Loss: 0.598 | Train PPL: 1.818
	Validation Loss: 0.748 | Validation PPL: 2.112
Epoch: 03 | Time: 1m 5s
	Train Loss: 0.518 | Train PPL: 1.678
	Validation Loss: 2.066 | Validation PPL: 7.896
Epoch: 04 | Time: 1m 5s
	Train Loss: 0.507 | Train PPL: 1.660
	Validation Loss: 0.868 | Validation PPL: 2.381
Epoch: 05 | Time: 1m 3s
	Train Loss: 0.497 | Train PPL: 1.644
	Validation Loss: 4.720 | Validation PPL: 112.145
Epoch: 06 | Time: 1m 4s
	Train Loss: 0.489 | Train PPL: 1.630
	Validation Loss: 1.000 | Validation PPL: 2.718
Epoch: 07 | Time: 1m 3s
	Train Loss: 0.486 | Train PPL: 1.626
	Validation Loss: 0.890 | Validation PPL: 2.436
Epoch: 08 | Time: 1m 3s
	Train Loss: 0.479 | Train PPL: 1.614
	Validation Loss: 6.335 | Validation PPL: 563.858
Epoch: 09 | Time: 1m 3s
	Train Loss: 0.480 | Train PPL: 1.617
	Validation Loss: 3.576 | Validation PPL: 35.725
Epoch

In [None]:
# Seed list of generated token ids: starts with the id of the '<SOS>' token.
example = [tech_dataset.vocab_w2i['<SOS>']]

In [None]:
# First sequence of the inspected batch `X`, with a leading batch dimension of 1.
# The same assignment is repeated at the top of the decoding cell below.
enc_inputs = X[0].unsqueeze(0).to(device)

In [None]:
# `transformer` is not defined in any visible cell; the trained network is
# bound to `model`. Inference needs no gradients, so autograd is disabled.
with torch.no_grad():
    enc_outputs, enc_self_attn_probs = model.encoder(enc_inputs)

In [None]:
# Greedy decoding of 10 tokens for the first sequence of the inspected batch.
# `transformer` is not defined in any visible cell; the trained network is
# bound to `model`.
model.eval()  # make sure dropout is off even if training was interrupted mid-epoch

# Start from <SOS> every time so re-running this cell does not keep appending
# to a list left over from a previous run.
example = [tech_dataset.vocab_w2i['<SOS>']]
enc_inputs = X[0].unsqueeze(0).to(device)

with torch.no_grad():
    # The encoder output does not change during decoding, so compute it once.
    enc_outputs, enc_self_attn_probs = model.encoder(enc_inputs)

    for _ in range(10):
        # Feed everything generated so far, shape (1, current_length).
        trg_tensor = torch.tensor(example).unsqueeze(0).to(device)
        dec_outputs, dec_self_attn_probs, dec_enc_attn_probs = model.decoder(trg_tensor, enc_inputs, enc_outputs)

        # Highest-scoring vocabulary id at the last position becomes the next token.
        pred_token = dec_outputs.argmax(2)[:, -1].item()
        example.append(pred_token)

In [None]:
# Pass the generated ids and the dataset's index-to-word vocabulary to the
# project's `token2class` helper (imported from `utils`); presumably this
# returns the decoded tokens - confirm against the helper.
token2class(example, vocabulary=tech_dataset.vocab_i2w)