# MT Indian Language 2 English

In [4]:
#!pip3 install nltk

In [5]:
import json
from tqdm import tqdm
import sentencepiece as spm
import random
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk

In [17]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Preprocess

In [6]:
def pre_process_indian(s):
    unicodes = {"hindi": 2304,
     "bengali": 2432,
     "gujarati": 2688,
     "tamil": 2944,
     "telgu": 3072,
     "kannada": 3200,
     "malayalam": 3328}
    re = s
    for i in s:
        if ord(i) not in range(2304, 3328+128) and i != " " and i not in "01234567890":
            re = re.replace(i, "")
    return ' '.join(re.split())

def pre_process_english(s):
    re = s
    cond1 = lambda x : ord(x) not in range(48, 58)
    cond2 = lambda x : ord(x) not in range(65, 91)
    cond3 = lambda x : ord(x) not in range(97, 123)
    for i in s:
        if cond1(i) and cond2(i) and cond3(i) and i != " " and i != "." :
            re = re.replace(i, "")
    return ' '.join(re.split())

In [7]:
def write_preprocess_data(data):
    source = open("source.txt", "w")
    target = open("target.txt", "w")
    for key in data.keys():
        for i in tqdm(data[key]["Train"].items()):
            source.writelines(f'{pre_process_indian(i[1]["source"])}\n')
            target.writelines(f'{pre_process_english(i[1]["target"])}\n')
    source.close()
    target.close()

In [None]:
data = json.load(open("/content/drive/MyDrive/MT2/train_data2.json", "r"))
write_preprocess_data(data)

100%|██████████| 68848/68848 [00:09<00:00, 7357.81it/s]
100%|██████████| 47482/47482 [00:07<00:00, 6214.13it/s]
100%|██████████| 80797/80797 [00:13<00:00, 6185.41it/s]
100%|██████████| 46794/46794 [00:05<00:00, 8420.88it/s]
100%|██████████| 54057/54057 [00:08<00:00, 6251.70it/s]
100%|██████████| 58361/58361 [00:09<00:00, 5947.05it/s]
100%|██████████| 44904/44904 [00:05<00:00, 8671.58it/s]


### Training Sentencepiece Tokenizer

In [None]:
src_vocab = 64000
tgt_vocab = 16000
spm.SentencePieceTrainer.train(input="source.txt", model_prefix="src", vocab_size=src_vocab,model_type='bpe', pad_id=0,unk_id=3)
spm.SentencePieceTrainer.train(input="target.txt", model_prefix="tgt", vocab_size=tgt_vocab,model_type='bpe',pad_id=0,unk_id=3)

In [None]:
def random_split(data):
    train = []
    val = []
    for key in data.keys():
        tmp = data[key]["Train"].items()
        new = []
        for i in tqdm(tmp):
            src = pre_process_indian(i[1]["source"])
            if len(src.split(" ")) <= 32:
                new.append(i)
        n = int(0.9 * len(new))
        random.shuffle(new)
        train += new[:n]
        val += new[n:]
    json.dump(train,open("train.json", "w"))
    json.dump(val,open("val.json", "w"))

In [None]:
random_split(data)
del data

100%|██████████| 68848/68848 [00:03<00:00, 18481.57it/s]
100%|██████████| 47482/47482 [00:02<00:00, 18783.69it/s]
100%|██████████| 80797/80797 [00:02<00:00, 29083.28it/s]
100%|██████████| 46794/46794 [00:01<00:00, 31166.33it/s]
100%|██████████| 54057/54057 [00:02<00:00, 24679.99it/s]
100%|██████████| 58361/58361 [00:02<00:00, 19679.73it/s]
100%|██████████| 44904/44904 [00:02<00:00, 16169.42it/s]


## Dataloader

In [8]:
class MTDataset(Dataset):
    def __init__(self, filename="train.json",  src_tokenizer=None, tgt_tokenizer=None):
        self.data = json.load(open(filename, "r"))
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.ids = [i[0] for i in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = [1] + self.src_tokenizer.encode(pre_process_indian(self.data[idx][1]['source'])) + [2]   # 1 for <sos>
        tgt = [1] + self.tgt_tokenizer.encode(pre_process_english(self.data[idx][1]['target'])) + [2]  # 2 for <eos>
        return torch.tensor(src), torch.tensor(tgt)

def add_padding(batch):
    pad_idx = 0
    ss,ts = [],[]
    for s,t in batch:
        ss.append(s)
        ts.append(t)
    ss = pad_sequence(ss,batch_first=False,padding_value=pad_idx)
    ts = pad_sequence(ts, batch_first=False, padding_value=pad_idx)
    return ss, ts

## Model

In [9]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math


class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.2):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


def train_epoch(model, optimizer, train_dataloader):
    model.train()
    losses = 0
    i = 0
    for src, tgt in tqdm(train_dataloader):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        i += 1
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / i

def evaluate(model, val_dataloader):
  model.eval()
  losses = 0
  i = 0
  for src, tgt in val_dataloader:
      src = src.to(DEVICE)
      tgt = tgt.to(DEVICE)
      i += 1
      tgt_input = tgt[:-1, :]

      src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

      logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

      tgt_out = tgt[1:, :]
      loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
      losses += loss.item()

  return losses / i

## Decodeing (Beam)

In [56]:
def greedy_decode_batch(model, src, src_mask, max_len=25):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, memory.shape[1]).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=-1)
        ys = torch.cat([ys,next_word.unsqueeze(0)], dim=0)
    return ys


def bleu_score(true, translated):
    true = [[nltk.word_tokenize(i)] for i in true]
    translated = [nltk.word_tokenize(i) for i in translated]
    return nltk.translate.bleu_score.corpus_bleu(true, translated)


def validation_batchwise(model, val_loader, src_tok, tgt_tok):
    mode = model.eval()
    true = []
    translations = []
    for src, tgt in tqdm(val_loader):
        src = src.to(DEVICE)
        true += tgt_tok.decode(tgt.T.tolist())
        mask = (torch.zeros(src.shape[0], src.shape[0])).type(torch.bool).to(DEVICE)
        out = greedy_decode_batch(model, src, mask, src.shape[0]+1)
        translations += tgt_tok.decode(out.T.tolist())
    return bleu_score(true, translations)


def beam_search_decode(model, src, src_mask, max_length=45, beam_width=2):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    EOS_IDX = 2
    memory = model.encode(src, src_mask)
    beam = [([1], 1.0)]
    final_candidates = []
    for _ in range(max_length):
        candidates = []
        for candidate_seq, candidate_score in beam:
            tgt = torch.LongTensor(candidate_seq).unsqueeze(1).to(DEVICE)
            tgt_mask = generate_square_subsequent_mask(tgt.shape[0]).type(torch.bool).to(DEVICE)
            out = model.decode(tgt, memory, tgt_mask)
            prob = model.generator(out[-1,:]).softmax(-1).squeeze(0)
            topk_probs, topk_indices = torch.topk(prob, beam_width, dim=-1)
            for i in range(beam_width):
                word_index = topk_indices[i].item()
                word_prob = topk_probs[i].item()
                new_seq = candidate_seq + [word_index]
                new_score = candidate_score * word_prob
                candidates.append((new_seq, new_score))
        candidates.sort(key=lambda x: -x[1])
        beam = candidates[:beam_width]
        for seq, score in beam:
            if seq[-1] == EOS_IDX:
                final_candidates.append((seq, score))
        if len(final_candidates) >= beam_width:
            break
    if not final_candidates:
        final_candidates = beam
    final_candidates.sort(key=lambda x: -x[1])
    best_seq, best_score = final_candidates[0]
    return best_seq

def translate_greedy(model, src_sentence, src_tok, tgt_tok):
    model.eval()
    src = torch.tensor(src_tok.encode(src_sentence)).view(1, -1).to(DEVICE)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(DEVICE)
    tgt_tokens = greedy_decode_batch(
        model,  src, src_mask, max_len=num_tokens + 2).flatten()
    tgt_tokens = tgt_tokens.T.tolist()
    return tgt_tok.decode(tgt_tokens)

def translate_beam_search(model, src_sentence, src_tok, tgt_tok,  beam=5):
    model.eval()
    src = torch.tensor(src_tok.encode(src_sentence)).view(-1, 1).to(DEVICE)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(DEVICE)
    tgt_tokens = beam_search_decode(model,  src, src_mask, num_tokens+4, beam)
    #print(tgt_tokens)
    return " ".join(tgt_tok.decode(tgt_tokens).replace('⁇', '').split())

### Training

In [None]:
src_tok = spm.SentencePieceProcessor(model_file='src.model')
tgt_tok = spm.SentencePieceProcessor(model_file='tgt.model')

In [None]:
train_set = MTDataset("train.json", src_tok, tgt_tok)
train_loader = DataLoader(train_set, batch_size=128,shuffle=True, collate_fn=add_padding)

In [None]:
val_set = MTDataset("val.json", src_tok, tgt_tok)
val_loader = DataLoader(val_set, batch_size=128,shuffle=False, collate_fn=add_padding)

In [None]:
DEVICE = torch.device("cuda")
PAD_IDX = 0
torch.manual_seed(0)

SRC_VOCAB_SIZE = 64000
TGT_VOCAB_SIZE = 16000
EMB_SIZE = 256
NHEAD = 4
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX,label_smoothing=0.3)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)



In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 20

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, train_loader)
    end_time = timer()
    torch.save(transformer, "model.pt")
    val_loss = evaluate(transformer, val_loader)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

100%|██████████| 3058/3058 [07:44<00:00,  6.59it/s]


Epoch: 1, Train loss: 6.931, Epoch time = 464.210s


100%|██████████| 3058/3058 [07:49<00:00,  6.51it/s]


Epoch: 2, Train loss: 6.303, Epoch time = 469.540s


100%|██████████| 3058/3058 [07:45<00:00,  6.57it/s]


Epoch: 3, Train loss: 6.039, Epoch time = 465.523s


100%|██████████| 3058/3058 [07:48<00:00,  6.52it/s]


Epoch: 4, Train loss: 5.885, Epoch time = 468.920s


100%|██████████| 3058/3058 [07:47<00:00,  6.54it/s]


Epoch: 5, Train loss: 5.785, Epoch time = 467.554s


100%|██████████| 3058/3058 [07:42<00:00,  6.60it/s]


Epoch: 6, Train loss: 5.715, Epoch time = 463.003s


100%|██████████| 3058/3058 [07:42<00:00,  6.62it/s]


Epoch: 7, Train loss: 5.662, Epoch time = 462.151s


100%|██████████| 3058/3058 [07:46<00:00,  6.55it/s]


Epoch: 8, Train loss: 5.620, Epoch time = 466.708s


100%|██████████| 3058/3058 [07:46<00:00,  6.55it/s]


Epoch: 9, Train loss: 5.586, Epoch time = 466.956s


100%|██████████| 3058/3058 [07:47<00:00,  6.54it/s]


Epoch: 10, Train loss: 5.557, Epoch time = 467.275s


100%|██████████| 3058/3058 [07:48<00:00,  6.53it/s]


Epoch: 11, Train loss: 5.533, Epoch time = 468.077s


100%|██████████| 3058/3058 [07:48<00:00,  6.53it/s]


Epoch: 12, Train loss: 5.512, Epoch time = 468.549s


100%|██████████| 3058/3058 [07:50<00:00,  6.50it/s]


Epoch: 13, Train loss: 5.494, Epoch time = 470.224s


100%|██████████| 3058/3058 [07:46<00:00,  6.55it/s]


Epoch: 14, Train loss: 5.477, Epoch time = 466.965s


100%|██████████| 3058/3058 [07:46<00:00,  6.56it/s]


Epoch: 15, Train loss: 5.463, Epoch time = 466.112s


100%|██████████| 3058/3058 [07:46<00:00,  6.56it/s]


Epoch: 16, Train loss: 5.450, Epoch time = 466.181s


100%|██████████| 3058/3058 [07:46<00:00,  6.55it/s]


Epoch: 17, Train loss: 5.439, Epoch time = 466.590s


100%|██████████| 3058/3058 [07:45<00:00,  6.57it/s]


Epoch: 18, Train loss: 5.428, Epoch time = 465.774s


100%|██████████| 3058/3058 [07:44<00:00,  6.59it/s]


Epoch: 19, Train loss: 5.418, Epoch time = 464.224s


100%|██████████| 3058/3058 [07:44<00:00,  6.58it/s]


Epoch: 20, Train loss: 5.409, Epoch time = 464.671s


# Inference Code

In [12]:
# Downloading pretrained tokenizer and model
!gdown 1rUutTmQw6UtONYnNYA82gaX7m8UidxV8
!unzip trained.zip

Downloading...
From (uriginal): https://drive.google.com/uc?id=1rUutTmQw6UtONYnNYA82gaX7m8UidxV8
From (redirected): https://drive.google.com/uc?id=1rUutTmQw6UtONYnNYA82gaX7m8UidxV8&confirm=t&uuid=56ac3263-046c-4480-aec9-bc111008ad92
To: /home/pshivam/MT/MT2/trained.zip
100%|████████████████████████████████████████| 107M/107M [00:10<00:00, 10.3MB/s]
Archive:  trained.zip
   creating: trained/
  inflating: trained/model5.pt       
  inflating: trained/src.model       
  inflating: trained/tgt.model       


In [14]:
model = torch.load("trained/model5.pt",map_location=torch.device('cpu'))
src_tok = spm.SentencePieceProcessor(model_file='trained/src.model')
tgt_tok = spm.SentencePieceProcessor(model_file='trained/tgt.model')

In [59]:
sent = "जीवन का रहस्य  केवल आनंद ही नहीं है, बल्कि अनुभव के माध्यम से सीखना भी है।"
#sent = "उठो, जागो और तब तक नहीं रुको जब तक लक्ष्य ना प्राप्त हो जाये"
translate_beam_search(model, sent, src_tok, tgt_tok)

'The mystery of life is not only enjoy but also learning through experience.'

In [60]:
sent = "জীবনের রহস্য শুধু উপভোগই নয়, অভিজ্ঞতার মাধ্যমে শেখাও।"
translate_beam_search(model, sent, src_tok, tgt_tok)

'Not just enjoy life she also taught with experience.'

In [61]:
sent = "જીવનનું રહસ્ય માત્ર આનંદ જ નહીં પણ અનુભવ દ્વારા શીખવું એ પણ છે."
translate_beam_search(model, sent, src_tok, tgt_tok)

'The secret of life is also not only to learn through experience .'

In [62]:
sent = "ಜೀವನದ ರಹಸ್ಯವೆಂದರೆ ಕೇವಲ ಆನಂದವಲ್ಲ, ಅನುಭವದ ಮೂಲಕ ಕಲಿಯುವುದು."
translate_beam_search(model, sent, src_tok, tgt_tok)

'The secret of life is only enjoy learning through experience.'

In [63]:
sent = "ജീവിതത്തിന്റെ രഹസ്യം ആസ്വാദനം മാത്രമല്ല, അനുഭവത്തിലൂടെയുള്ള പഠനം കൂടിയാണ്."
translate_beam_search(model, sent, src_tok, tgt_tok)

'The secret of life is not only a study of life but also with experience.'

In [64]:
sent = "வாழ்க்கையின் ரகசியம் இன்பம் மட்டுமல்ல, அனுபவத்தின் மூலம் கற்றுக்கொள்வதும் ஆகும்."
translate_beam_search(model, sent, src_tok, tgt_tok)

'The secret of life is not just a pleasant and learning through experience.'

In [65]:
sent = "జీవితం యొక్క రహస్యం ఆనందాన్ని మాత్రమే కాదు, అనుభవం ద్వారా నేర్చుకోవడం కూడా."
translate_beam_search(model, sent, src_tok, tgt_tok)

'The secret of life is not just enjoy and learning through experience .'

In [73]:
test = json.load(open("./sample_test_phase2.json", "r"))

In [85]:
sents = []
ids = []
for k in test.keys():
    ss = test[k]["Test"]
    ids += [i for i,j in ss.items()]
    sents += [j["source"] for i,j in ss.items()]


In [78]:
len(ids)

700

In [81]:
trs = []

In [82]:
for s in tqdm(sents):
    trs.append(translate_beam_search(model, pre_process_indian(s), src_tok, tgt_tok))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 700/700 [04:15<00:00,  2.74it/s]


In [83]:
trs

['The rhizomes should be sown very finely.',
 'It is located in the middle of the pond earlier pond.',
 'These reasons include good contact details of good contact information is good option.',
 'Raja Jai Singh II built in 1701 was used against Buddha on the world of 23rd and 1730 Bundi and 1730 AD in the fight of Buddha Singh II .',
 'However you will feel a good experience before treatment 2 to 4 weeks.',
 'The disease occurs from mental depression and mental depression.',
 'Sit down towards the disease .',
 'The last part of the rope should be kept inside the outside .',
 'Beautiful beautiful beautiful beautiful beautiful',
 'The island is popular for fun on a day for fun in the open road and those who disadvantages are for those who do not look at the future.',
 'turn my alarm on my alarm please turn',
 'The Portuguese of the Red stone have been discovered by the c of the Qutub of the Qutub of the',
 'If you are present in any of those programs then a strange matter where people wi