# Sequence 2 Sequence
#### Reference:
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [1]:
from torch import optim

import codecs
import random
import re
import time
import torch
import torch.nn as nn
import torch.nn.functional as f

# activate cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

TSV_PATH = "C:\\Users\\User\\Desktop\\Ricardo\\KnowledgeGraph_materials\\data_kg\\sequence2sequence\\Sentence pairs in English-Mandarin Chinese - 2021-06-27.tsv"
MAX_LENGTH = 10

cuda


# Preparing data

In [2]:
# beginning token and ending token of sentence
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            0: "SOS",
            1: "EOS"
        }
        self.n_words = 2 # only SOS & EOS at this moment
    
    def addSentence(self, sentence, language):
        if language == "EN":
            for word in sentence.split(' '):
                self.addWord(word)
        elif language == "ZH":
            for word in sentence:
                self.addWord(word)
                
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def normalizationString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Read lines....")
    pairs = []
    
    # Read the files and split into lines
    data = codecs.open(TSV_PATH, encoding="utf8", errors="ignore")
    for lineIndex, line in enumerate(data.readlines()):
        line_list = line.split("\t")
        first_sequence = line_list[1]
        second_sequece = line_list[3]
        
        # deal with format of mandarin and english sequence
        second_sequece = second_sequece.replace("\r\n", "")
        
        pairs.append([normalizationString(first_sequence), second_sequece])
        
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

In [4]:
'''
Since there are a lot of example sentences and we want to train something quickly,
we’ll trim the data set to only relatively short and simple sentences. 
Here the maximum length is 10 words (that includes ending punctuation) 
and we’re filtering to sentences that translate to the form “I am” or “He is” etc.
(accounting for apostrophes replaced earlier).
'''

eng_prefixed = (
    "i am", "i m",
    "he is", "he s",
    "she is", "she s",
    "you are", "you re",
    "we are", "we re",
    "they are", "they re"
)

def filterPair(p):
    return 

In [5]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    
    print("Read %s sentence pairs" % len(pairs))
    
    for pair in pairs:
        input_lang.addSentence(pair[0], "EN")
        output_lang.addSentence(pair[1], "ZH")
        
    print("Counted words...")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData("EN", "ZH", True)
print(random.choice(pairs))

Read lines....
Read 56574 sentence pairs
Counted words...
ZH 51582
EN 32
['她是被迫和他結婚的。', 'she didn t marry him of her own will .']


# Model Stage
#### Documetations
* Pytorch Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
* Pytorh GRU: https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
* Pytorch Softmax: https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html
* Pytorch View: https://pytorch.org/docs/stable/generated/torch.Tensor.view.html
* Pytorch Linear: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

In [6]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size   
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input_vector, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [7]:
# Decoder
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input_vector, hidden):
        output = self.embedding(input_vector).View(1, 1, -1)
        output = F.relu(output)
        output_hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [8]:
# Attention Decoder
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        # initiate model attributes
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input_vector, hidden, encoder_outputs):
        embedded = self.embedding(input_vector).View(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weight = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0], dim=1))
        return output, hidden, attn_weights
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [9]:
def indexFromSentence(lang, sentence):
    if lang == "EN":
        return [lang.word2index[word] for word in sentence.split(" ")]
    elif lang == "ZH":
        return [lang.word2index[word] for word in sentence]
    
def tensorFromSentence(lang, sentence):
    indexes = indexFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [10]:
# trainging model
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tenosr, encoder, decoder, encoder_optimizer, decoder_optimizer,
         criterion, max_length=MAX_LENGTH):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    loss = 0
    
    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [11]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [12]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [13]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

NameError: name 'time' is not defined