# Sequence 2 Sequence
#### Reference:
* https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [1]:
from torch import optim

import codecs
import random
import re
import torch
import torch.nn as nn
import torch.nn.functional as f

# activate cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

TSV_PATH = "C:\\Users\\User\\Desktop\\Ricardo\\KnowledgeGraph_materials\\data_kg\\sequence2sequence\\Sentence pairs in English-Mandarin Chinese - 2021-06-27.tsv"
MAX_LENGTH = 10

cuda


In [2]:
# beginning token and ending token of sentence
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            0: "SOS",
            1: "EOS"
        }
        self.n_words = 2 # only SOS & EOS at this moment
    
    def addSentence(self, sentence, language):
        if language == "EN":
            for word in sentence.split(' '):
                self.addWord(word)
        elif language == "ZH":
            for word in sentence:
                self.addWord(word)
                
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def normalizationString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Read lines....")
    pairs = []
    
    # Read the files and split into lines
    data = codecs.open(TSV_PATH, encoding="utf8", errors="ignore")
    for lineIndex, line in enumerate(data.readlines()):
        line_list = line.split("\t")
        first_sequence = line_list[1]
        second_sequece = line_list[3]
        
        # deal with format of mandarin and english sequence
        second_sequece = second_sequece.replace("\r\n", "")
        
        pairs.append([normalizationString(first_sequence), second_sequece])
        
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

In [4]:
'''
Since there are a lot of example sentences and we want to train something quickly,
we’ll trim the data set to only relatively short and simple sentences. 
Here the maximum length is 10 words (that includes ending punctuation) 
and we’re filtering to sentences that translate to the form “I am” or “He is” etc.
(accounting for apostrophes replaced earlier).
'''

eng_prefixed = (
    "i am", "i m",
    "he is", "he s",
    "she is", "she s",
    "you are", "you re",
    "we are", "we re",
    "they are", "they re"
)

def filterPair(p):
    return 

In [5]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    
    print("Read %s sentence pairs" % len(pairs))
    
    for pair in pairs:
        input_lang.addSentence(pair[0], "EN")
        output_lang.addSentence(pair[1], "ZH")
        
    print("Counted words...")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData("EN", "ZH", True)
print(random.choice(pairs))

Read lines....
Read 56574 sentence pairs
Counted words...
ZH 51582
EN 32
['我在一條河的附近長大。', 'i grew up near a river .']
