<a href="https://colab.research.google.com/github/gunadhineha/molecularGNN_smiles/blob/master/Machine_Translation_Student_Notebook_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import random
import math

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
!gdown --folder https://drive.google.com/drive/folders/1VcPv6Pi-P9a0fXek9g6vwru2TFy1yPkp?usp=sharing -O .

Retrieving folder list
Processing file 1oxBeBA2os9AbRaFKu1cnHx3jN0bcQn0X eng_cleaned.npy
Processing file 1yVsXqSzymhHultyVJ-nh5pmQk8c-Py8N jpn_cleaned.npy
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1oxBeBA2os9AbRaFKu1cnHx3jN0bcQn0X
To: /content/eng_cleaned.npy
100% 6.62M/6.62M [00:00<00:00, 99.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yVsXqSzymhHultyVJ-nh5pmQk8c-Py8N
To: /content/jpn_cleaned.npy
100% 10.4M/10.4M [00:00<00:00, 88.4MB/s]
Download completed


In [None]:
# Load the Japanese and English data files fetched by the gdown cell above.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, and unpickling can execute code -- keep it only while the download
# source is trusted.
# Presumably each array holds one token list per sentence and the two files
# are aligned index-by-index (they are zipped together below) -- TODO confirm.
jpn_dataset = np.load("jpn_cleaned.npy", allow_pickle=True)
eng_dataset = np.load("eng_cleaned.npy", allow_pickle=True)

In [None]:
def _make_vocab(token_lists):
    """Build a vocabulary over ``token_lists`` with the four special tokens.

    Tokens that are not in the vocabulary are mapped to the ``<unk>`` index.
    """
    vocab = build_vocab_from_iterator(token_lists, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab.set_default_index(vocab["<unk>"])
    return vocab


# One vocabulary per language, built from its own side of the corpus.
jpn_vocab = _make_vocab(jpn_dataset)
eng_vocab = _make_vocab(eng_dataset)

In [None]:
def _to_ids(vocab, tokens):
    """Look up ``tokens`` in ``vocab`` and return the indices as a 1-D long tensor."""
    return torch.tensor(vocab(tokens), dtype=torch.long)


# Parallel corpus as (japanese_ids, english_ids) tensor pairs, one per sentence.
dataset = [
    (_to_ids(jpn_vocab, jpn_text), _to_ids(eng_vocab, eng_text))
    for jpn_text, eng_text in zip(jpn_dataset, eng_dataset)
]

In [None]:
# Indices of the padding and sentence-boundary tokens in each vocabulary
# (J_* for Japanese, E_* for English).
J_PAD_IDX, J_BOS_IDX, J_EOS_IDX = (jpn_vocab[tok] for tok in ('<pad>', '<bos>', '<eos>'))
E_PAD_IDX, E_BOS_IDX, E_EOS_IDX = (eng_vocab[tok] for tok in ('<pad>', '<bos>', '<eos>'))

def generate_batch(batch):
    """Collate (japanese_ids, english_ids) pairs into two equally sized batches.

    Each sequence is wrapped in its language's <bos>/<eos> indices. Both
    languages are then right-padded with their own <pad> index to one shared
    length: the longest wrapped sequence in the batch, whichever language it
    belongs to.

    Args:
        batch: list of (jpn_ids, eng_ids) pairs of 1-D index tensors.

    Returns:
        Tuple ``(jpn, eng)`` of tensors shaped (len(batch), shared_len), both
        moved to the module-level ``device``.
    """
    jpn_seqs, eng_seqs = [], []
    for jpn_ids, eng_ids in batch:
        jpn_seqs.append(torch.cat([torch.tensor([J_BOS_IDX]), jpn_ids, torch.tensor([J_EOS_IDX])], dim=0))
        eng_seqs.append(torch.cat([torch.tensor([E_BOS_IDX]), eng_ids, torch.tensor([E_EOS_IDX])], dim=0))

    shared_len = max([seq.size(0) for seq in jpn_seqs + eng_seqs], default=0)

    # Stretching only the first sequence of each language to the shared length
    # is enough: pad_sequence then pads every other sequence up to it.
    jpn_seqs[0] = F.pad(jpn_seqs[0], (0, shared_len - jpn_seqs[0].size(0)), value=J_PAD_IDX)
    eng_seqs[0] = F.pad(eng_seqs[0], (0, shared_len - eng_seqs[0].size(0)), value=E_PAD_IDX)

    jpn_padded = pad_sequence(jpn_seqs, batch_first=True, padding_value=J_PAD_IDX)
    eng_padded = pad_sequence(eng_seqs, batch_first=True, padding_value=E_PAD_IDX)
    return jpn_padded.to(device), eng_padded.to(device)

In [None]:
# Shuffled mini-batches of 64 sentence pairs; generate_batch adds <bos>/<eos>
# and pads every batch before it reaches the model.
trainloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=generate_batch,
)

In [None]:
class SimplifiedAttention(nn.Module):
    """Single-head scaled dot-product self-attention without masking.

    Queries, keys and values are three separate learned linear projections of
    the same input, each mapping D features to D features. There is no output
    projection and no mask, so every position attends to every other position
    of the same sequence (padding positions included).

    Args:
        D: feature size of both the input and the output.
    """

    def __init__(self, D):
        super(SimplifiedAttention, self).__init__()
        self.D = D
        self.q = nn.Linear(D, D)
        self.k = nn.Linear(D, D)
        self.v = nn.Linear(D, D)

    def forward(self, X):
        """Apply self-attention along the second-to-last axis of ``X``.

        Args:
            X: tensor shaped (..., T, D); any number of leading batch
                dimensions (including none) is accepted.

        Returns:
            Tensor with the same shape as ``X``.
        """
        Q = self.q(X)
        K = self.k(X)
        V = self.v(X)
        # Swap the last two axes instead of the fixed axes (1, 2) so the layer
        # is not tied to exactly one batch dimension.
        S = Q @ K.transpose(-2, -1) / (self.D ** 0.5)
        # Normalise over the key axis: each query's weights sum to 1.
        A = F.softmax(S, dim=-1)
        return A @ V

class TransformerLayer(nn.Module):
    """Self-attention followed by a two-layer ReLU feed-forward network.

    Each sub-block adds its input back to its output (residual connection) and
    then applies LayerNorm to the sum. The feed-forward network widens the
    features from D to 2*D and projects them back to D.

    Args:
        D: feature size of the input and the output.
    """

    def __init__(self, D):
        super().__init__()
        hidden_dim = 2 * D
        self.sa = SimplifiedAttention(D)
        self.ln1 = nn.LayerNorm(D)
        self.linear1 = nn.Linear(D, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, D)
        self.relu = nn.ReLU()
        self.ln2 = nn.LayerNorm(D)

    def forward(self, X):
        """Return the transformed features; the shape of ``X`` is preserved."""
        # Attention sub-block: residual add, then normalise.
        attended = self.ln1(self.sa(X) + X)
        # Feed-forward sub-block: residual add, then normalise.
        expanded = self.relu(self.linear1(attended))
        projected = self.linear2(expanded)
        return self.ln2(projected + attended)

class PositionalEncoding(nn.Module):
    """Learned additive position embeddings, initialised to zeros.

    Args:
        d_model: feature size of the embeddings being offset.
        max_len: number of positions for which an embedding is stored.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        initial_table = torch.zeros(1, max_len, d_model)
        self.pos_embeddings = nn.Parameter(initial_table)

    def forward(self, x):
        """Add the embeddings of the first ``x.shape[1]`` positions to ``x``.

        The leading size-1 axis of the table broadcasts over the batch axis.
        """
        seq_len = x.size(1)
        positions = self.pos_embeddings[:, :seq_len]
        return x + positions

class Model(nn.Module):
    """Token-wise translation network: embeddings, six transformer layers, classifier.

    The network produces one vector of ``num_class`` logits for every input
    position, so the output sequence has the same length as the input.

    Args:
        vocab_size: number of rows in the input embedding table.
        embed_dim: feature size used by the embeddings and every layer.
        num_class: size of the output vocabulary (logits per position).
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = PositionalEncoding(embed_dim)
        # Six identical layers, kept as individual attributes (trans1..trans6).
        self.trans1 = TransformerLayer(embed_dim)
        self.trans2 = TransformerLayer(embed_dim)
        self.trans3 = TransformerLayer(embed_dim)
        self.trans4 = TransformerLayer(embed_dim)
        self.trans5 = TransformerLayer(embed_dim)
        self.trans6 = TransformerLayer(embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, input):
        """Map index tensor ``input`` to per-position logits over ``num_class``."""
        hidden = self.pos_embedding(self.embedding(input))
        stack = (self.trans1, self.trans2, self.trans3,
                 self.trans4, self.trans5, self.trans6)
        for layer in stack:
            hidden = layer(hidden)
        return self.fc(hidden)

In [None]:
def train(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    For every batch the source indices are fed through the model, the logits
    are flattened to (N, vocabulary) and scored against the flattened target
    indices with ``criterion``, and one optimizer step is taken. On the first
    batch only, the first source sentence, its reference translation and the
    model's argmax prediction are printed as a quick sanity check.

    Args:
        model: network called as ``model(src)``.
        loader: iterable of (src, trg) index-tensor batches that supports len().
        optimizer: optimizer updating the model's parameters.
        criterion: loss taking flattened logits and flattened target indices.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    loss_total = 0

    for step, (src, trg) in enumerate(loader):
        src = src.to(device)
        trg = trg.to(device)

        logits = model(src)
        flat_logits = logits.reshape(-1, logits.shape[-1])
        loss = criterion(flat_logits, trg.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_total += loss.item()

        if step != 0:
            continue

        # Preview the first example of the first batch (logits are the ones
        # computed before this step's parameter update).
        src_tokens = jpn_vocab.lookup_tokens(src[0].cpu().numpy())
        ref_tokens = eng_vocab.lookup_tokens(trg[0].cpu().numpy())
        hyp_tokens = eng_vocab.lookup_tokens(logits[0].argmax(dim=-1).squeeze().cpu().numpy())

        # Drop the first token (the <bos> slot) and everything from <eos> on.
        src_text = ''.join(src_tokens[1:src_tokens.index('<eos>')])
        ref_text = ' '.join(ref_tokens[1:ref_tokens.index('<eos>')])
        # The prediction may never emit <eos>; then keep it to the end.
        hyp_end = hyp_tokens.index('<eos>') if '<eos>' in hyp_tokens else len(hyp_tokens)
        hyp_text = ' '.join(hyp_tokens[1:hyp_end])

        print("Original sentence: \t {}".format(src_text))
        print("Target sentence: \t {}".format(ref_text))
        print("Model's sentence: \t {}".format(hyp_text))

    return loss_total / len(loader)

In [None]:
def translate(sentence, pad_len=16):
    """Print the model's English translation of a tokenised Japanese sentence.

    The tokens are converted to indices, wrapped in <bos>/<eos> and followed
    by ``pad_len`` <pad> indices. The module-level ``model`` is then run once
    and the argmax token at every position is decoded; the printed text is
    everything after the first position up to (not including) the first
    predicted <eos>, or up to the end if no <eos> is predicted.

    Args:
        sentence: list of Japanese tokens.
        pad_len: number of <pad> indices appended to the input. The prediction
            has exactly as many positions as the input, so this is the head
            room for translations longer than the source. Defaults to 16.

    Returns:
        None. The translation is printed.

    Side effects:
        Switches the module-level ``model`` to eval mode and leaves it there.
    """
    token_ids = torch.tensor(jpn_vocab(sentence), dtype=torch.long)
    # An explicit long dtype keeps the concatenation integral even when
    # pad_len is 0 (an empty Python list would yield a float tensor).
    padding = torch.full((pad_len,), J_PAD_IDX, dtype=torch.long)
    model_input = torch.cat(
        [torch.tensor([J_BOS_IDX]), token_ids, torch.tensor([J_EOS_IDX]), padding],
        dim=0,
    ).unsqueeze(0)
    model_input = model_input.to(device)

    model.eval()
    with torch.no_grad():
        output = model(model_input)
        # Remove only the batch axis so the result is always 1-D.
        predicted_word_idxs = output.argmax(dim=-1).squeeze(0).cpu().numpy()

    translation = eng_vocab.lookup_tokens(predicted_word_idxs)
    end = translation.index('<eos>') if '<eos>' in translation else len(translation)
    print(' '.join(translation[1:end]))

In [None]:
# Feature width shared by the embeddings and every transformer layer.
EMBED_DIM = 128

# The model reads Japanese indices and classifies over the English vocabulary.
input_vocab_size = len(jpn_vocab)
num_class = len(eng_vocab)

model = Model(input_vocab_size, EMBED_DIM, num_class)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters())
# Positions whose target is English <pad> do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=E_PAD_IDX)

In [None]:
NUM_EPOCHS = 50

for epoch in range(NUM_EPOCHS):
    train_loss = train(model, trainloader, optimizer, criterion)
    # Perplexity reported as exp of the epoch's mean batch loss.
    train_ppl = math.exp(train_loss)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}')

Original sentence: 	 トムとよく口論になるの？
Target sentence: 	 do you and tom argue often ?
Model's sentence: 	 do you and tom often ? ?
Epoch: 01
	Train Loss: 0.707 | Train PPL:   2.028
Original sentence: 	 心から尊敬してます。
Target sentence: 	 i really respect you .
Model's sentence: 	 i really respect you .
Epoch: 02
	Train Loss: 0.701 | Train PPL:   2.015
Original sentence: 	 彼は医者になることを望んでいる。
Target sentence: 	 he wishes to become a doctor .
Model's sentence: 	 he wishes to a a . .
Epoch: 03
	Train Loss: 0.700 | Train PPL:   2.013
Original sentence: 	 誰かここに日本語の話せる人はいませんか。
Target sentence: 	 does anyone here speak japanese ?
Model's sentence: 	 is anybody here speak japanese ? ?
Epoch: 04
	Train Loss: 0.695 | Train PPL:   2.005
Original sentence: 	 私に怒らないで。
Target sentence: 	 do n't be mad at me .
Model's sentence: 	 do n't be mad at me .
Epoch: 05
	Train Loss: 0.691 | Train PPL:   1.996
Original sentence: 	 明日の朝、電話するよ。
Target sentence: 	 i 'll call you up tomorrow morning .
Model's sentence: 	 i 'll

In [None]:
# Pre-tokenised Japanese sentence to translate with the trained model.
sample_tokens = ['私', 'は', 'コーヒー', 'が', '大嫌い', 'です', '。']
translate(sample_tokens)

i hate coffee .
