In [1]:
!dir


sweet_preprocessed.pkl	sweet_pretrained.pth


In [None]:
from google.colab import drive
drive.mount('/content/drive')

#### import required libraries

In [1]:
# !pip install Cython
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import jieba
import random
import unicodedata
import string
import re
from sklearn.utils import shuffle
from math import floor, ceil
import pickle
import os

MAX_LENGTH=10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
BATCH_SIZE=2

ENVIR_PATH = ''

cuda


#### chinese text data preprocessing

In [10]:
class Lang:
  def __init__(self, name):
    self.name = name
    self.word2index = {'S':0, 'P':1}
    self.word2count = {'S':0, 'P':0}
    self.index2word = {0:'S', 1:'P'}
    self.n_words = 2

  def addSentence(self, sentence):
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1
    else:
      self.word2count[word] += 1
      

def unicodeToAscii(s):
  return ''.join(
      c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn'
  )

def sentence_padding(sentence, mode='enc_input', max_length=MAX_LENGTH):
  ss = sentence.split()
  sentence_length = len(ss)
  if mode == 'enc_input' or mode == 1:
    for i in range(max_length):
      ss.append('P')
    return ' '.join(ss[:max_length])
  if mode == 'dec_input' or mode == 2:
    for i in range(max_length):
      ss.append('P')
    ss.insert(0, 'S')
    return ' '.join(ss[:max_length])
  if mode == 'tgt_input' or mode == 3:
    for i in range(max_length):
      ss.append('P')
    return ' '.join(ss[:max_length])
    
def normalizeString_eng(s):
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r'([.!?])', r' \1', s)
  s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
  return sentence_padding(s, mode=1)
def process_chn_sentence(s, mode):
  s = ' '.join(jieba.cut(s.split('\n')[0].replace(' ', '')))
  return sentence_padding(s, mode)

def make_batch(language_pairs, batch_size=BATCH_SIZE):
    tmp_pairs = [random.choice(language_pairs) for n in range(batch_size)]
    input_batch = [[src_vocab[n] for n in tmp_pair[0].split()] for tmp_pair in tmp_pairs]
    output_batch = [[tgt_vocab[n] for n in tmp_pair[1].split()] for tmp_pair in tmp_pairs]
    target_batch = [[tgt_vocab[n] for n in tmp_pair[2].split()] for tmp_pair in tmp_pairs]
    return torch.LongTensor(input_batch).to(device), torch.LongTensor(output_batch).to(device), torch.LongTensor(target_batch).to(device)


# generate a starting decoder input sequence used for firing the decoder
def greedy_decoder(model, enc_input, start_symbol):
    """
    For simplicity, a Greedy Decoder is Beam search when K=1. This is necessary for inference as we don't know the
    target sequence input. Therefore we try to generate the target input word by word, then feed it into the transformer.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding
    :param model: Transformer Model
    :param enc_input: The encoder input
    :param start_symbol: The start symbol. In this example it is 'S' which corresponds to index 4
    :return: The target input
    """

    enc_outputs, enc_self_attns = model.encoder(enc_input)
    batch_size, seq_len = enc_outputs.size()[0], enc_outputs.size()[1]
    dec_input = torch.zeros(batch_size, seq_len).type_as(enc_input.data).to(device)
    next_symbol = start_symbol
    
    for i in range(0, seq_len):
        dec_input[:, i] = next_symbol
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        # dec_outputs [batch_size, seq_len, d_model]
        # print(dec_outputs.size())

        projected = model.projection(dec_outputs)
        # projected: [batch_size, seq_len, tgt_vocab_size]
        # print(projected.size())
        
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)
        # prob:   torch.return_types.max()
        # prob[0]:  values  [seq_len, ]
        # prob[1]:  indices  [seq_len, ] every element is the index of the word in tgt_vocab
        prob = prob[1]
        next_word = prob.data[i]
        next_symbol = next_word.data
    return dec_input

def PairFilter(pairs):
  def filterPair(p):
    return p[0].find('P') > 0 and p[1].find('P') > 0
  return [pair for pair in pairs if filterPair(pair)]

if not os.path.exists(ENVIR_PATH + 'sweet_preprocessed.pkl'):
  corpus_path = ENVIR_PATH + 'english-simplified.txt'

  lines = open(corpus_path, encoding='utf-8').read().strip().split('\n')

  pairs = [[normalizeString_eng(l.split('\t')[0]), process_chn_sentence(l.split('\t')[1], 2), process_chn_sentence(l.split('\t')[1], 3)] for l in lines]
  filtered_pairs = PairFilter(pairs)
  print(f'Constraint(s): MAX_LENGTH={MAX_LENGTH}')
  print('filtered pairs:', len(filtered_pairs))
  
  shuffled_pairs = shuffle(filtered_pairs, random_state=88)
  
  shuffled_pairs = shuffled_pairs[:50]
  output_lang = Lang('chinese')
  input_lang = Lang('english')
  for pair in shuffled_pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])
  print('Counted words:')
  print(input_lang.name, input_lang.n_words)
  print(output_lang.name, output_lang.n_words)
  print(random.choice(filtered_pairs))
  train_test_ratio = 0.9
  split_index = floor(len(shuffled_pairs) * train_test_ratio)
  train_pairs = shuffled_pairs[:split_index]
  test_pairs = shuffled_pairs[split_index:]
  print('train_pairs:', split_index)
  print('test_pairs:', len(shuffled_pairs) - split_index)
  # filtered_pairs = filtered_pairs[:200]

  big_dict = {'BATCH_SIZE':BATCH_SIZE, 'shuffled_pairs':shuffled_pairs, 'input_lang':input_lang, 'output_lang':output_lang, 'train_pairs':train_pairs, 'test_pairs':test_pairs}
  f=open(ENVIR_PATH + 'sweet_preprocessed.pkl','wb')
  pickle.dump(big_dict,f)
  f.close()
else:
  print('preprocessed data exists!')
  f=open(ENVIR_PATH + 'sweet_preprocessed.pkl','rb')
  big_dict=pickle.load(f)
  input_lang = big_dict['input_lang']
  output_lang = big_dict['output_lang']
  train_pairs = big_dict['train_pairs']
  test_pairs = big_dict['test_pairs']
  print('load done!')

preprocessed data exists!
load done!


#### model construction

In [3]:
# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps




src_vocab = input_lang.word2index 
src_vocab_size = input_lang.n_words

tgt_vocab = output_lang.word2index 
number_dict = output_lang.index2word 
tgt_vocab_size = output_lang.n_words

src_len = MAX_LENGTH
tgt_len = MAX_LENGTH

# Transformer Parameters
d_model = 128  # Embedding Size
d_ff = 512 # Position-wise FeedForward dimension
d_k = d_v = 32  # dimension of K(=Q), V
n_layers = 4  # number of Encoder of Decoder Layer
n_heads = 4  # number of heads in Multi-Head Attention
DROPOUT_PROB = 0.1

def construc_pos_emb(input_vec):
  batch_size, seq_len = input_vec.size()
  a = np.arange(seq_len).reshape(1,seq_len)
  for row in range(1, batch_size):
    m = np.arange(seq_len).reshape(1,seq_len)
    a = np.vstack((a, m))
  return torch.from_numpy(a).type(torch.LongTensor).to(device)

class batch_generator(object):
    def __init__(self, data, batch_size=BATCH_SIZE):
      self.data = data
      self.n = len(data)
      self.current = 0
      self.batch_size = batch_size


    def __iter__(self):
      return self

    def __next__(self):
      if self.current < self.n:
          tmp_pairs = [pair for pair in self.data[self.current : self.current + self.batch_size]]
          input_batch =  [[src_vocab[n] for n in tmp_pair[0].split()] for tmp_pair in tmp_pairs]
          output_batch = [[tgt_vocab[n] for n in tmp_pair[1].split()] for tmp_pair in tmp_pairs]
          target_batch = [[tgt_vocab[n] for n in tmp_pair[2].split()] for tmp_pair in tmp_pairs]  
          self.current = self.current + self.batch_size
          res_ = (torch.tensor(input_batch, dtype=torch.long, device=device), torch.tensor(output_batch, dtype=torch.long, device=device), torch.tensor(target_batch, dtype=torch.long, device=device))
          return res_

      else:
          raise StopIteration

    def __len__(self):
      return ceil(self.n/self.batch_size)



def get_sinusoid_encoding_table(n_position, d_model):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table).to(device)

def get_attn_pad_mask(seq_q, seq_k):
    # print(seq_q)
    # seq_q/seq_k: [batch_size, sentence_length]
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k).to(device)  # batch_size x len_q x len_k

def get_attn_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.from_numpy(subsequent_mask).byte()
    return subsequent_mask.to(device)

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) 
        # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)

        # Fills elements of self tensor with value where mask is one.
        scores.masked_fill_(attn_mask, -1e9) 
        attn = nn.Softmax(dim=-1)(scores)
        # attn : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)
        # V: [batch_size x n_heads x len_q(=len_k) x d_v]
        context = torch.matmul(attn, V)
        # context: [batch_size x n_heads x len_q(=len_k) x d_v
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear_affin = nn.Linear(n_heads * d_v, d_model)
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(DROPOUT_PROB)
        self.dotproduct  =  ScaledDotProductAttention().to(device)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) 
        # attn_mask : [batch_size x n_heads x len_q x len_k]

        # q_s: [batch_size x n_heads x len_q x d_k]
        # k_s: [batch_size x n_heads x len_k x d_k]
        # v_s: [batch_size x n_heads x len_k x d_v]
        # scores: (=q_s * k_s^T) [batch_size x n_heads x len_q x len_k]
        # weighted sum: (=softmax(scores) * v_s) [batch_size x n_heads x len_q x d_v]
        # weighted sum = context 
        # context: [batch_size x n_heads x len_q x d_v]
        # attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        # attn = softmax(scores) 
        context, attn = self.dotproduct(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        
        # context: [batch_size x len_q x n_heads * d_v]
        output = self.linear_affin(context)
        # output: [batch_size x len_q x d_model]
        output = self.layernorm(output + residual)
        output = self.dropout(output)
        return output, attn 

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(DROPOUT_PROB)
    def forward(self, inputs):
        # inputs : [batch_size, len_q, d_model]
        residual = inputs 
        # residual : [batch_size, len_q, d_model]
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        # output: [batch_size, d_ff, len_q]
        output = self.conv2(output).transpose(1, 2)
        # output: [batch_size, len_q, d_model]
        # the two subsequent convolutions is configured with k=1 and padding=zeros,
        # so, the output sequence won't be changed.
        output = self.layernorm(output + residual)
        output = self.dropout(output)
        return output

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention().to(device)
        self.pos_ffn = PoswiseFeedForwardNet().to(device)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        # enc_inputs: [batch_size x len_q x d_model]
        # attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        # enc_outputs: [batch_size x len_q x d_model]
        enc_outputs = self.pos_ffn(enc_outputs) 
        # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention().to(device)
        self.dec_enc_attn = MultiHeadAttention().to(device)
        self.pos_ffn = PoswiseFeedForwardNet().to(device)

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # the K and V matrix of the encoder-decoder attention layer is generated with the encoder outputs information
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([EncoderLayer().to(device) for _ in range(n_layers)])

    def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(construc_pos_emb(enc_inputs))
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, d_model),freeze=True)
        self.layers = nn.ModuleList([DecoderLayer().to(device) for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(construc_pos_emb(dec_inputs))
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
# mask matrix(dec_self_attn_mask) used for decoder-end self attention computation should take the padding information of decoder input 
# and the unsolved mystery of target sentence into account.
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder().to(device)
        self.decoder = Decoder().to(device)
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)
    def forward(self, enc_inputs, dec_inputs):
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x seq_len x tgt_vocab_size]
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns



def showgraph(attn):
    attn = attn[-1].squeeze(0)[0]
    attn = attn.squeeze(0).data.numpy()
    fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attn, cmap='viridis')
    ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)
    ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})
    plt.show()

def validate(model, valid_generator):
  model.eval()
  loss_recorder = 0
  for data_ele in valid_generator:
    enc_inputs, dec_inputs, target_batch = data_ele[0], data_ele[1], data_ele[2]
    outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
    target_batch = target_batch.contiguous().view(-1)
    loss_recorder += float(criterion(outputs, target_batch))
  return loss_recorder/len(valid_generator)


save_path = ENVIR_PATH + 'sweet_pretrained.pth'




#### train

In [43]:
# model = Transformer().to(device)
# model.load_state_dict(torch.load(save_path))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-09)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5,15,50,70], gamma=0.1)
EPOCH = 100
BATCH_SIZE=5



Recorder_loss = {'valid_loss':[], 'train_loss':[]}

# get the current level of training loss
model.eval()
valid_gen = batch_generator(test_pairs)
valid_loss = validate(model, valid_gen)
print('last saved test loss: ', valid_loss)
# set the starting min loss for model saving
min_valid_loss = valid_loss #float('inf')

print('start training...')
for epoch in range(1, EPOCH + 1):
    torch.cuda.empty_cache()
    train_gen = batch_generator(train_pairs)
    valid_gen = batch_generator(test_pairs)

    model.train()
    optimizer.zero_grad()

    loss_recorder = 0
    for data_ele in train_gen:
      enc_inputs, dec_inputs, target_batch = data_ele[0], data_ele[1], data_ele[2] 
      outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
      target_batch = target_batch.contiguous().view(-1)
      # outputs = greedy_decoder(model, enc_inputs, start_symbol=tgt_vocab["S"])
      loss = criterion(outputs, target_batch)
      loss.backward()
      optimizer.step()
      loss_recorder += loss.detach_()
    train_loss = loss_recorder/len(train_gen)
    Recorder_loss['train_loss'].append(train_loss)

    # scheduler.step()

    valid_loss = validate(model, valid_gen)
    Recorder_loss['valid_loss'].append(valid_loss)

    print('Epoch:', '%03d' % (epoch), 'train_loss =', '{:.6f}'.format(train_loss), 'valid_loss =', '{:.6f}'.format(valid_loss))
    
    if valid_loss < min_valid_loss:
      min_valid_loss = valid_loss
      torch.save(model.state_dict(), save_path)
 
# torch.save(model.state_dict(), save_path)



last saved test loss:  5.062276522318522
start training...
Epoch: 001 train_loss = 0.676232 valid_loss = 5.334757
Epoch: 002 train_loss = 3.170341 valid_loss = 5.557751
Epoch: 003 train_loss = 4.221232 valid_loss = 5.474510
Epoch: 004 train_loss = 5.379053 valid_loss = 8.375980
Epoch: 005 train_loss = 6.126348 valid_loss = 10.050239
Epoch: 006 train_loss = 7.302420 valid_loss = 9.874002
Epoch: 007 train_loss = 6.219340 valid_loss = 7.726750
Epoch: 008 train_loss = 4.829407 valid_loss = 7.442229
Epoch: 009 train_loss = 4.303938 valid_loss = 6.690504
Epoch: 010 train_loss = 3.927466 valid_loss = 6.473931
Epoch: 011 train_loss = 3.964869 valid_loss = 6.964736
Epoch: 012 train_loss = 4.224590 valid_loss = 6.720912
Epoch: 013 train_loss = 4.416991 valid_loss = 6.842773
Epoch: 014 train_loss = 4.444931 valid_loss = 7.097198
Epoch: 015 train_loss = 4.101233 valid_loss = 7.062989
Epoch: 016 train_loss = 4.028417 valid_loss = 7.045493
Epoch: 017 train_loss = 4.279836 valid_loss = 6.876467
Epoch

#### saving the loss along the training process

In [14]:
import pickle
f=open(ENVIR_PATH + 'train_loss_log.pkl','wb')
pickle.dump(Recorder_loss,f)
f.close()



#### model loading

In [11]:

# torch.save(model.state_dict(), save_path)
model = 0
model = Transformer().to(device)
model.load_state_dict(torch.load(save_path))


#### scoring the model

In [54]:
# evaluation
model.eval()
valid_gen = batch_generator(train_pairs)
valid_loss = validate(model, valid_gen)
valid_loss

0.1985220133488917

#### examinating its performance

In [13]:


# Test
enc_inputs, dec_inputs, target_batch = make_batch(train_pairs, batch_size=1)
greedy_dec_input = greedy_decoder(model, enc_inputs, start_symbol=tgt_vocab["S"])

# greedy_dec_input = greedy_dec_input
predict, _, _, _ = model(enc_inputs, greedy_dec_input)
# predict [seq_len, tgt_vocab_size]
predict = predict.data.max(dim=-1, keepdim=True)[1]
# predict [seq_len, 1]


source_seq = ' '.join([input_lang.index2word[n.item()] for n in enc_inputs[0]]) 
output_seq1 = ' '.join([number_dict[n.item()] for n in greedy_dec_input.squeeze()])
output_seq2 = ' '.join([number_dict[n.item()] for n in predict.squeeze()])
print('source:', source_seq)
print('result:', output_seq1)
print('result:', output_seq2)
# print([input_lang.index2word[n.item()] for n in enc_inputs[0]], '->', [number_dict[n.item()] for n in predict.squeeze()])

# print('first head of last state enc_self_attns')
# showgraph(enc_self_attns)

# print('first head of last state dec_self_attns')
# showgraph(dec_self_attns)

# print('first head of last state dec_enc_attns')
# showgraph(dec_enc_attns)

source: i ve always been proud of you . P P
result: S 你 最好 不要 在 这里 等 。 P P
result: 你 最好 不要 在 这里 等 。 P P P


#### have a try on it 
(translate a sentence)

In [33]:
def prepare_single_sentence(sentn):
  normalized_sentn = normalizeString_eng(sentn)
  input_batch = [[src_vocab[n] for n in normalized_sentn.split()]]
  return torch.LongTensor(input_batch).to(device)

sentence_to_be_translated = 'I '

# Test
enc_inputs = prepare_single_sentence(sentence_to_be_translated)
greedy_dec_input = greedy_decoder(model, enc_inputs, start_symbol=tgt_vocab["S"])

# greedy_dec_input = greedy_dec_input
predict, _, _, _ = model(enc_inputs, greedy_dec_input)
# predict [seq_len, tgt_vocab_size]
predict = predict.data.max(dim=-1, keepdim=True)[1]
# predict [seq_len, 1]


source_seq = ' '.join([input_lang.index2word[n.item()] for n in enc_inputs[0]]) 
output_seq1 = ' '.join([number_dict[n.item()] for n in greedy_dec_input.squeeze()])
output_seq2 = ' '.join([number_dict[n.item()] for n in predict.squeeze()])
print('source:', source_seq)
print('result1:', output_seq1)
print('result2:', output_seq2)

source: i P P P P P P P P P
result1: S 你 最好 不要 在 这里 等 。 P P
result2: 你 最好 不要 在 这里 等 。 P P P


#### miscellaeous

In [52]:
import moxing as mox
# mox.file.copy('sweet_pretrained.pth', 'obs://class-1275-41781/Lab-2208/mo1/sweet_pretrained.pth')
mox.file.copy('obs://class-1275-41781/Lab-2208/m91/sweet_pretrained.pth', 'sweet_pretrained.pth')

In [None]:
import numpy as np
from pprint import pprint

bool_m = np.random.randn(6).reshape(2,3)
pprint(bool_m)
zero_m = np.zeros((2,3))
bool_m = bool_m>zero_m
pprint(bool_m)
value_m = np.array([[1,1,0],[0,1,1]])
pprint(value_m)

pprint(bool_m+value_m)


