Adapted mainly from the PyTorch translation-transformer tutorial: https://torchtutorialstaging.z5.web.core.windows.net/beginner/translation_transformer.html





In [1]:
import math
import torchtext
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List

from torch import Tensor
import io
import time

torch.manual_seed(0)
# torch.use_deterministic_algorithms(True)

<torch._C.Generator at 0x7f6a0c4aa5f0>

In [2]:
!python -m spacy download xx_ent_wiki_sm

2023-04-14 22:25:52.410516: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-14 22:25:55.726420: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-14 22:25:55.727052: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-

In [3]:
# from transformers import AutoTokenizer

# Sanity check: build a spaCy-backed tokenizer using the `xx_ent_wiki_sm`
# model and run it on a single Hindi sentence.
tokenizer = get_tokenizer('spacy', language='xx_ent_wiki_sm')

text = "यह हिंदी में टोकनाइज करने का उदाहरण है।"

tokens = tokenizer(text)

# Show the resulting list of tokens.
print(tokens)

['यह', 'हिंदी', 'में', 'टोकनाइज', 'करने', 'का', 'उदाहरण', 'है', '।']


In [4]:
import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import LogisticRegression

from google.colab import drive
# Mount Google Drive so the CSV files under /content/drive can be read.
drive.mount('/content/drive')

# NOTE(review): label_dict is only referenced by the commented-out line below;
# it appears to be unused in this notebook — confirm before removing.
label_dict = {-1:0,0:1,1:2}
# Both files are read with header=None, so column names are assigned by hand.
data = pd.read_csv("/content/drive/MyDrive/eng_Hindi_data_train.csv",header = None)
test = pd.read_csv("/content/drive/MyDrive/eng_Hindi_data_test_X.csv",header = None)
data.columns = ["english_text", "hindi_text"]
test.columns = ["hindi_text"]
# NOTE: this is not the last expression of the cell, so nothing is displayed here.
data.head()
# data.drop(columns = ["text_id"], inplace = True)
# data['label'] = data['gold_label'].map(label_dict)
# 80/20 split: sample 80% of the rows with a fixed seed for training; the
# rows that were not sampled form the validation set.
trn_data = data.sample(frac=0.8, random_state=42)
vl_data = data.drop(trn_data.index)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Preview the first rows of the full parallel corpus.
data.head()

Unnamed: 0,english_text,hindi_text
0,and deliver us by Thy mercy from the people of...,और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे)...
1,Transformed position of fourth point,चौथे बिन्दु का रूपांतरित स्थान
2,"Oh, woe to me; I wish I never took so - and - ...",हाए अफसोस काश मै फला शख्स को अपना दोस्त न बनाता
3,The PS file is to be translated into a PDF fil...,पीएस2पीडीएफ के इस्तेमाल से पीएस फ़ाइल को पीडीए...
4,Receiving LDAP search results...,LDAP खोज परिणाम पा रहा है...


In [6]:
# Replace the index labels inherited from `data` with a fresh 0..n-1 range
# (drop=True discards the old labels instead of keeping them as a column).
trn_data = trn_data.reset_index(drop = True)
vl_data = vl_data.reset_index(drop = True)

In [7]:
import torch
from torch.utils.data import IterableDataset, DataLoader

class MyIterableDataset(IterableDataset):
    """Stream sentence pairs from two parallel sequences.

    Iteration yields ``(hindi_sentences[i], english_sentences[i])`` tuples,
    i.e. the item taken from the *second* constructor argument comes first in
    each pair. Iteration stops at the end of the shorter sequence.

    NOTE(review): the call sites in this notebook pass the Hindi column as
    ``english_sentences`` and the English column as ``hindi_sentences``, so
    the pairs they actually receive are ``(english, hindi)``.
    """

    def __init__(self, english_sentences, hindi_sentences):
        self.hindi_sentences = hindi_sentences
        self.english_sentences = english_sentences

    def __iter__(self):
        # Items are passed through unchanged; nothing is converted to tensors here.
        yield from zip(self.hindi_sentences, self.english_sentences)

# Example usage: wrap the training and validation frames as sentence-pair streams.
# NOTE(review): the Hindi column is passed to the parameter named
# `english_sentences` and the English column to `hindi_sentences`
# (see MyIterableDataset.__init__), so the yielded pairs are (english, hindi).
train_iter = MyIterableDataset(trn_data['hindi_text'], trn_data['english_text'])
eval_iter = MyIterableDataset(vl_data['hindi_text'], vl_data['english_text'])


In [8]:
# spaCy-backed tokenizers: `xx_ent_wiki_sm` is used for Hindi text and
# `en_core_web_sm` for English text.
# NOTE(review): only xx_ent_wiki_sm is downloaded in the visible cells;
# en_core_web_sm is assumed to be installed already — confirm.
hi_tokenizer = get_tokenizer('spacy', language='xx_ent_wiki_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [9]:


# Translation direction: Hindi (source) -> English (target).
SRC_LANGUAGE = 'hi'
TGT_LANGUAGE = 'en'

# Place-holders
# Both dicts are keyed by language code ('hi' / 'en').
token_transform = {}
vocab_transform = {}


# Same tokenizer configuration as hi_tokenizer / en_tokenizer, stored per language.
token_transform['hi'] = get_tokenizer('spacy', language='xx_ent_wiki_sm')
token_transform['en'] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize one side of every sentence pair in ``data_iter``.

    Args:
        data_iter: Iterable of indexable samples; position 0 is read for
            ``'en'`` and position 1 for ``'hi'``.
        language: ``'hi'`` or ``'en'``. Selects both the tokenizer from the
            module-level ``token_transform`` dict and the sample position.

    Yields:
        The tokenizer output for each sample, one sample at a time.

    Raises:
        KeyError: If ``language`` is not ``'hi'`` or ``'en'``.
    """
    language_index = {'hi': 1, 'en': 0}

    # Resolve the tokenizer and the sample position once, not per sample.
    tokenize = token_transform[language]
    position = language_index[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[position])

# Ids reserved for the special tokens; the order matches `special_symbols`,
# which is inserted first into each vocabulary (special_first=True).
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Build the Hindi vocabulary from the training split only.
# max_tokens=5000 bounds the vocabulary size.
train_iter = MyIterableDataset(trn_data['hindi_text'], trn_data['english_text'])
vocab_transform['hi'] = build_vocab_from_iterator(yield_tokens(train_iter, 'hi'),
                                                min_freq=1,
                                                specials=special_symbols,
                                                special_first=True,
                                                max_tokens = 5000)
# Same procedure for the English vocabulary, using a fresh dataset object.
train_iter = MyIterableDataset(trn_data['hindi_text'], trn_data['english_text'])
vocab_transform['en'] = build_vocab_from_iterator(yield_tokens(train_iter, 'en'),
                                                min_freq=1,
                                                specials=special_symbols,
                                                special_first=True,
                                                max_tokens = 5000)

# Tokens that are not in the vocabulary are mapped to the <unk> id.
vocab_transform['hi'].set_default_index(UNK_IDX)
vocab_transform['en'].set_default_index(UNK_IDX)


In [10]:
# Print every token string stored in the English vocabulary (large output).
print(vocab_transform['en'].get_stoi().keys())



In [11]:
def process(data):
  """Convert one raw Hindi sentence into a 1-D ``torch.long`` tensor of token ids.

  The sentence is split with the module-level ``hi_tokenizer`` and every token
  is looked up in ``vocab_transform['hi']``.
  """
  token_ids = []
  for token in hi_tokenizer(data):
    token_ids.append(vocab_transform['hi'][token])
  return torch.tensor(token_ids, dtype=torch.long)

In [12]:
from torch.utils.data import Dataset, IterableDataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
class MyDataset(Dataset):
    """Map-style dataset of pre-numericalized (Hindi ids, English ids) pairs.

    Every row is tokenized and converted to vocabulary ids once, in
    ``__init__``, and kept in memory as a pair of 1-D ``torch.long`` tensors.

    Args:
        data_: DataFrame with ``'hindi_text'`` and ``'english_text'`` columns.
        vocab_transform: Mapping with ``'hi'`` and ``'en'`` entries, each
            supporting ``vocab[token] -> int``.
        token_transform: Mapping with ``'hi'`` and ``'en'`` entries, each a
            callable that turns a sentence into a sequence of tokens.
    """

    def __init__(self, data_, vocab_transform,token_transform):
        # Use the tokenizers and vocabularies that were passed in instead of
        # reaching for module-level globals.
        hi_tokenize, en_tokenize = token_transform['hi'], token_transform['en']
        hi_vocab, en_vocab = vocab_transform['hi'], vocab_transform['en']

        self.data = []
        # Walking the two columns directly avoids building a Series per row,
        # which is what iterrows() does.
        for hi_sen, en_sen in zip(data_['hindi_text'], data_['english_text']):
            hi_tensor_ = torch.tensor([hi_vocab[token] for token in hi_tokenize(hi_sen)],
                                      dtype=torch.long)
            en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenize(en_sen)],
                                      dtype=torch.long)
            self.data.append((hi_tensor_, en_tensor_))

        self.n_samples = len(self.data)

    def __getitem__(self, index):
        """Return the ``(hi_tensor, en_tensor)`` pair stored at ``index``.

        Raises:
            IndexError: If ``index`` is out of range. This is the exception
                the sequence protocol expects; ``StopIteration`` is reserved
                for iterators.
        """
        return self.data[index]

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.n_samples

In [13]:
# Tokenize and numericalize both splits up front; MyDataset does this work
# once, inside its constructor.
train_data= MyDataset(trn_data,vocab_transform,token_transform)
val_data = MyDataset(vl_data,vocab_transform,token_transform)

In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


BATCH_SIZE = 128
# Special-token ids are read from the Hindi vocabulary and reused for English
# batches too (generate_batch applies them to both sides); both vocabularies
# are built with the same `special_symbols` list.
PAD_IDX = vocab_transform['hi']['<pad>']
BOS_IDX = vocab_transform['hi']['<bos>']
EOS_IDX = vocab_transform['hi']['<eos>']
print(PAD_IDX,BOS_IDX,EOS_IDX)

1 2 3


In [15]:
# from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  """Collate ``(hi_tensor, en_tensor)`` pairs into two padded batch tensors.

  Every sequence is wrapped as ``[BOS_IDX] + ids + [EOS_IDX]``; the sequences
  of each language are then padded with ``PAD_IDX`` by ``pad_sequence`` using
  its default layout.

  Returns:
    Tuple ``(hi_batch, en_batch)``.
  """
  def _wrap(ids):
    # Surround one id sequence with the begin/end-of-sentence markers.
    return torch.cat([torch.tensor([BOS_IDX]), ids, torch.tensor([EOS_IDX])], dim=0)

  # Single pass over the batch, so one-shot iterables are handled too.
  wrapped = [(_wrap(hi_item), _wrap(en_item)) for (hi_item, en_item) in data_batch]
  hi_seqs = [pair[0] for pair in wrapped]
  en_seqs = [pair[1] for pair in wrapped]

  hi_padded = pad_sequence(hi_seqs, padding_value=PAD_IDX)
  en_padded = pad_sequence(en_seqs, padding_value=PAD_IDX)
  return hi_padded, en_padded

# NOTE: `train_iter` is rebound here from the earlier MyIterableDataset to a
# DataLoader over the pre-numericalized training set.
train_iter = DataLoader(dataset = train_data, batch_size=BATCH_SIZE,
                        shuffle=True,collate_fn=generate_batch)
# NOTE(review): the validation loader is shuffled as well; shuffle=False would
# keep the evaluation batches identical from epoch to epoch — confirm intent.
valid_iter = DataLoader(dataset = val_data, batch_size=BATCH_SIZE,
                        shuffle=True,collate_fn=generate_batch)
# test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
#                        shuffle=True, collate_fn=generate_batch)

In [16]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    Token ids are embedded with ``TokenEmbedding``, combined with
    ``PositionalEncoding``, passed through a ``TransformerEncoder`` /
    ``TransformerDecoder`` stack and projected onto the target vocabulary by
    the ``generator`` linear layer.

    NOTE: the number of attention heads is read from the module-level ``NHEAD``
    constant when the object is constructed, so ``NHEAD`` must be defined
    before this class is instantiated.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        emb_size: Embedding / model dimension (``d_model``).
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (width of the logits).
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability, applied to the positional encoding and
            to every encoder/decoder layer.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # Forward `dropout` to the layers as well; previously it only reached
        # the positional encoding and the layers silently used their default.
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Return target-vocabulary logits for every position of ``trg``.

        ``src_mask`` / ``tgt_mask`` are the attention masks of the encoder and
        decoder self-attention; the ``*_padding_mask`` arguments are the
        key-padding masks. No mask is applied between decoder positions and
        encoder memory other than ``memory_key_padding_mask``.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # Positional order: tgt, memory, tgt_mask, memory_mask,
        # tgt_key_padding_mask, memory_key_padding_mask.
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on ``src`` (no padding mask) and return its memory."""
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on ``tgt`` against ``memory``; the output is not projected to logits."""
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [17]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position signal to token embeddings, then apply dropout.

    A ``(maxlen, 1, emb_size)`` table is precomputed and registered as the
    ``pos_embedding`` buffer. Even feature columns hold ``sin(pos * den)`` and
    odd columns hold ``cos(pos * den)``, where
    ``den = exp(-k * ln(10000) / emb_size)`` for ``k = 0, 2, 4, ...``.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # The inserted axis of size 1 lets the table broadcast over dim 1 of the input.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``; dim 0 of the input selects the positions."""
        seq_len = token_embedding.size(0)
        positioned = token_embedding + self.pos_embedding[:seq_len, :]
        return self.dropout(positioned)

class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings for ``tokens`` (cast to int64 before the lookup)."""
        token_ids = tokens.long()
        scale = math.sqrt(self.emb_size)
        return self.embedding(token_ids) * scale

In [18]:
def generate_square_subsequent_mask(sz):
    """Build the ``(sz, sz)`` additive causal attention mask on ``DEVICE``.

    Entry ``[i, j]`` is ``0.0`` where ``j <= i`` and ``-inf`` where ``j > i``,
    so position ``i`` cannot attend to later positions.
    """
    upper_triangle = torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1
    # Transposing the upper triangle gives the lower triangle (diagonal included).
    visible = upper_triangle.transpose(0, 1)

    additive = visible.float()
    additive = additive.masked_fill(visible == 0, float('-inf'))
    additive = additive.masked_fill(visible == 1, float(0.0))
    return additive

def create_mask(src, tgt):
  """Build the attention and padding masks for one ``(src, tgt)`` batch.

  Returns:
    Tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
    an all-False boolean ``src_mask``, the causal mask for ``tgt`` from
    ``generate_square_subsequent_mask``, and two boolean masks that are True
    where the input equals ``PAD_IDX`` (with dims 0 and 1 swapped).
  """
  src_len, tgt_len = src.shape[0], tgt.shape[0]

  # All-False boolean mask: no source position is hidden from any other.
  src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
  tgt_mask = generate_square_subsequent_mask(tgt_len)

  src_is_pad = src == PAD_IDX
  tgt_is_pad = tgt == PAD_IDX
  return src_mask, tgt_mask, src_is_pad.transpose(0, 1), tgt_is_pad.transpose(0, 1)

In [19]:
# Hyperparameters for a deliberately small model.
SRC_VOCAB_SIZE = len(vocab_transform['hi'])
TGT_VOCAB_SIZE = len(vocab_transform['en'])
EMB_SIZE = 64
# NHEAD is read as a global inside Seq2SeqTransformer.__init__.
NHEAD = 1
FFN_HID_DIM = 64
# NOTE: the DataLoaders were created in an earlier cell with the earlier
# BATCH_SIZE; this reassignment does not change them.
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
# NOTE: the training cell reassigns NUM_EPOCHS before looping.
NUM_EPOCHS = 16

# NOTE(review): DEVICE is used by the mask helpers and greedy_decode, while the
# lower-case `device` from an earlier cell is used for the model and batches;
# both select CUDA when it is available.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

# Xavier-uniform initialization for every parameter with more than one
# dimension; 1-D parameters keep their default initialization.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Targets equal to PAD_IDX do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)

In [20]:
def train_epoch(model, train_iter, optimizer):
  """Run one optimization pass over ``train_iter`` and return the mean batch loss.

  Uses the module-level ``device``, ``create_mask`` and ``loss_fn``.

  Args:
    model: Model called as ``model(src, tgt_input, src_mask, tgt_mask,
      src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
    train_iter: Sized iterable of ``(src, tgt)`` batches.
    optimizer: Optimizer stepped once per batch.

  Returns:
    Sum of the per-batch losses divided by ``len(train_iter)``.
  """
  model.train()
  running_loss = 0
  for src, tgt in train_iter:
    src, tgt = src.to(device), tgt.to(device)

    # The decoder is fed every target token except the last and is scored
    # against every target token except the first.
    decoder_input, expected = tgt[:-1, :], tgt[1:, :]

    masks = create_mask(src, decoder_input)
    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = masks

    # The source padding mask is also used as the memory key-padding mask.
    logits = model(src, decoder_input, src_mask, tgt_mask,
                   src_padding_mask, tgt_padding_mask, src_padding_mask)

    optimizer.zero_grad()

    flat_logits = logits.reshape(-1, logits.shape[-1])
    loss = loss_fn(flat_logits, expected.reshape(-1))
    loss.backward()

    optimizer.step()
    running_loss += loss.item()
  return running_loss / len(train_iter)


def evaluate(model, val_iter):
  """Return the mean per-batch loss of ``model`` over ``val_iter``.

  The model is switched to eval mode and gradients are disabled; no
  parameters are updated. Uses the module-level ``device``, ``create_mask``
  and ``loss_fn``.

  Args:
    model: Model called as ``model(src, tgt_input, src_mask, tgt_mask,
      src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
    val_iter: Sized iterable of ``(src, tgt)`` batches.

  Returns:
    Sum of the per-batch losses divided by ``len(val_iter)``.
  """
  model.eval()
  losses = 0
  # Evaluation needs no gradients; disabling them avoids building the graph.
  with torch.no_grad():
    # Iterate the loader that was passed in, not the module-level `valid_iter`.
    for src, tgt in val_iter:
      src = src.to(device)
      tgt = tgt.to(device)

      tgt_input = tgt[:-1, :]

      src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

      logits = model(src, tgt_input, src_mask, tgt_mask,
                     src_padding_mask, tgt_padding_mask, src_padding_mask)
      tgt_out = tgt[1:,:]
      loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
      losses += loss.item()
  return losses / len(val_iter)

In [None]:
# NOTE: replaces the NUM_EPOCHS value set in the configuration cell.
NUM_EPOCHS = 18
for epoch in range(1, NUM_EPOCHS+1):
  # Only the training pass is timed; validation runs after end_time is taken.
  start_time = time.time()
  train_loss = train_epoch(transformer, train_iter, optimizer)
  end_time = time.time()
  val_loss = evaluate(transformer, valid_iter)
  print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s"))



Epoch: 1, Train loss: 6.492, Val loss: 5.601, Epoch time = 39.554s


In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily generate target token ids for a single source sequence.

    At every step the token with the highest score at the last position is
    appended; generation stops after ``max_len - 1`` steps or as soon as
    ``EOS_IDX`` is produced. Works on one sequence at a time (the next token
    is extracted with ``.item()``).

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to ``DEVICE``.
        src_mask: Encoder attention mask; moved to ``DEVICE``.
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Token id placed at the start of the output.

    Returns:
        Tensor of shape ``(generated_len, 1)`` holding the generated ids,
        beginning with ``start_symbol``.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Pure inference: disable autograd so no graph is kept for the decoding steps.
    with torch.no_grad():
        # Encode once and move the memory to DEVICE once; it does not change
        # between decoding steps.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for _ in range(max_len-1):
            # Casting the additive mask to bool turns -inf into True (hidden)
            # and 0.0 into False (visible).
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Scores over the target vocabulary for the last generated position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one source sentence with greedy decoding and return the text.

    The sentence is numericalized with ``process``, decoded for at most
    ``num_tokens + 5`` target positions, and the ids are mapped back to tokens
    with the target-language vocabulary. The ``<bos>`` / ``<eos>`` markers are
    removed from the joined string; the spaces that surrounded them remain.
    """
    model.eval()

    src = process(src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    # All-False boolean mask: every source position may attend to every other.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    decoded = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX)
    token_ids = list(decoded.flatten().cpu().numpy())

    words = vocab_transform[TGT_LANGUAGE].lookup_tokens(token_ids)
    sentence = " ".join(words)
    return sentence.replace("<bos>", "").replace("<eos>", "")

In [None]:
# Sample Hindi sentence used for a quick translation check.
src_sentence = "ईमान लाओ और उसके रसूल के साथ होकर जिहाद करो"

In [None]:
# Numericalize the sample sentence into Hindi vocabulary ids.
# NOTE(review): translate() calls process() itself, so `src` does not appear
# to be used by the visible cells that follow — confirm.
src = process(src_sentence)

In [None]:
# Translate the sample sentence with the trained model.
print(translate(transformer, src_sentence))

In [None]:
import pickle
filename = 'transformer.pkl'
# Serializes the whole model object with pickle.
# NOTE(review): PyTorch recommends saving `state_dict()` with torch.save
# rather than pickling the model object — consider switching.
with open(filename, 'wb') as file:
    pickle.dump(transformer, file)
print("File saved successfully")

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickle files that you created yourself or otherwise trust.
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)
print("model loaded successfully")

In [None]:
# Check that the reloaded model can translate the same sample sentence.
print(translate(loaded_model, "ईमान लाओ और उसके रसूल के साथ होकर जिहाद करो"))

In [None]:
# Translate every test sentence and write one translation per line.
answer_filename = 'answer.txt'
count = 0  # number of sentences written so far
with open(answer_filename, 'w', encoding = 'utf-8') as f:
  # The test frame's only column is named 'hindi_text' (assigned when the CSV
  # was loaded); there is no 'sentence' column, so indexing by it raises KeyError.
  for sentence in test['hindi_text']:
    translated = translate(transformer, sentence)
    count+=1
    f.write(translated + '\n')