In [1]:
import pandas as pd

df = pd.read_csv("data/english-newari.csv")

In [2]:
df.head()

Unnamed: 0,SN,en,new
0,,welcome to Ideax,Ideax ए लसकुस​
1,,this is a test,थो test ख​
2,,wa are tongue techies,जिपि tongue techies ख​
3,,this is just a demo,थो demo जक ख​
4,,I am from Urlabari,जि उर्लाबारी च्वंम्ह


In [3]:
data = df[['en','new']]

In [4]:
data.astype(str)

Unnamed: 0,en,new
0,welcome to Ideax,Ideax ए लसकुस​
1,this is a test,थो test ख​
2,wa are tongue techies,जिपि tongue techies ख​
3,this is just a demo,थो demo जक ख​
4,I am from Urlabari,जि उर्लाबारी च्वंम्ह
...,...,...
1006,lets go to see Indra Jatra,नु ईन्द्रजात्रा स्वो वोने ।
1007,Lakhe,लशिँ – पुलुकिशि
1008,You are like a lakhe,छ लाखे थें चोँ ।
1009,Is your beloved well?,छिमी यज्जु म्ह फु ला ?


In [5]:
import re
import string

def preprocessing(df):
    def process_text(text):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.strip()
        text = re.sub("\s+", " ", text)
        return text

    df["en"] = df["en"].apply(process_text)
    
    def clean_text(text):
        text = re.sub(r'[०-९]', '', text)
        text = re.sub(r'[()#/@;:<>‘+=।?!|,’‘’]".', '', text)
        text = text.strip()
        return text

    df["new"] = df["new"].apply(clean_text)    
    return df
data = preprocessing(data)
data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["en"] = df["en"].apply(process_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["new"] = df["new"].apply(clean_text)


Unnamed: 0,en,new
0,welcome to ideax,Ideax ए लसकुस​
1,this is a test,थो test ख​
2,wa are tongue techies,जिपि tongue techies ख​
3,this is just a demo,थो demo जक ख​
4,i am from urlabari,जि उर्लाबारी च्वंम्ह
5,i am from damak,जि दमक च्वंम्ह
6,i am from birtamod,जि बिर्तामोड च्वंम्ह
7,my name is nishant,जिगू नां निशान्त खः।
8,my name is drishya,जिगू नां दृश्य खः।
9,hello my name is sushan,ज्वजलपा जिगू नां सुशन खः।


In [6]:
data.isna().sum()

en     0
new    0
dtype: int64

In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from transformers import AutoTokenizer

tokenizer_nepali = AutoTokenizer.from_pretrained('sakonii/deberta-base-nepali')

def newari_tokenizer(sentence):
    tokens = tokenizer_nepali.tokenize(sentence)
    return tokens

# Create source and target language tokenizer.
SRC_LANGUAGE = 'new'
TGT_LANGUAGE = 'en'

# Placeholders
token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer(newari_tokenizer)
token_transform[TGT_LANGUAGE] = get_tokenizer('basic_english')

def yield_tokens(data_iter: Iterable, language: str) -> List[str]:    
    for index,data_sample in data_iter:
        yield token_transform[language](data_sample[language])

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = df.iterrows()
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import torch
from torch import Tensor
import math
from torch.nn import Transformer
import torch.nn as nn 

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #Check whether running on gpu or cpu

class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float = 0.1,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size) 
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)
   
    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [9]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


torch.manual_seed(0)
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8 
FFN_HID_DIM = 512
BATCH_SIZE = 10
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
DROP_OUT = 0.1

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM,DROP_OUT)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
     
# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch



In [10]:
# Split data to tran test set
split_ratio = 0.9
split = round(df.shape[0]* split_ratio)
train = df.iloc[:split]
train_ds = list(zip(train['new'],train['en']))
valid = df.iloc[split:]
val_ds = list(zip(valid['new'],valid['en']))

In [11]:
class EarlyStopping():
    def __init__(self, tolerance=5, min_delta=0):

        self.tolerance = tolerance
        self.min_delta = min_delta
        self.counter = 0
        self.early_stop = False

    def __call__(self, train_loss, validation_loss):
        if (validation_loss - train_loss) > self.min_delta:
            self.counter +=1
            if self.counter >= self.tolerance:  
                self.early_stop = True

from torch.utils.data import DataLoader

accumulation_steps = 5

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    val_los = 0
    train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    optimizer.zero_grad() 
    for i, (src, tgt) in enumerate(train_dataloader):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)   
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss = loss / accumulation_steps # Normalize our loss (if averaged)
        loss.backward()
        
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step() # Now we can do an optimizer step
            optimizer.zero_grad() # Reset gradients tensor  

        losses += loss.item()

    return losses / len(train_dataloader)

def evaluate(model):
    model.eval()
    losses = 0

    #val_iter = valid.iterrows()
    val_dataloader = DataLoader(val_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss = loss / accumulation_steps # Normalize our loss (if averaged)
        losses += loss.item()

    return losses / len(val_dataloader)

In [12]:
from timeit import default_timer as timer

early_stopping = EarlyStopping(tolerance=5, min_delta=0.1)
NUM_EPOCHS = 100
history = {
        "loss": [], 
        "val_los": []
        }

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    history['loss'].append(train_loss)
    history['val_los'].append(val_loss)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    # Early Stopping
    # early_stopping(train_loss, val_loss)
    # if early_stopping.early_stop:
    #     print("We are at epoch:", epoch)
    #     break



Epoch: 1, Train loss: 1.209, Val loss: 1.186, Epoch time = 3.746s
Epoch: 2, Train loss: 1.116, Val loss: 1.190, Epoch time = 2.793s
Epoch: 3, Train loss: 1.073, Val loss: 1.171, Epoch time = 2.705s
Epoch: 4, Train loss: 1.038, Val loss: 1.164, Epoch time = 2.663s
Epoch: 5, Train loss: 0.997, Val loss: 1.139, Epoch time = 2.632s
Epoch: 6, Train loss: 0.953, Val loss: 1.138, Epoch time = 2.711s
Epoch: 7, Train loss: 0.906, Val loss: 1.137, Epoch time = 2.690s
Epoch: 8, Train loss: 0.870, Val loss: 1.150, Epoch time = 2.760s
Epoch: 9, Train loss: 0.837, Val loss: 1.151, Epoch time = 2.746s
Epoch: 10, Train loss: 0.809, Val loss: 1.113, Epoch time = 2.750s
Epoch: 11, Train loss: 0.759, Val loss: 1.132, Epoch time = 2.698s
Epoch: 12, Train loss: 0.716, Val loss: 1.114, Epoch time = 2.752s
Epoch: 13, Train loss: 0.673, Val loss: 1.140, Epoch time = 2.769s
Epoch: 14, Train loss: 0.639, Val loss: 1.154, Epoch time = 2.690s
Epoch: 15, Train loss: 0.611, Val loss: 1.153, Epoch time = 2.705s
Epoc

In [13]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [14]:
torch.save(transformer.state_dict(),"models/newari-english.pth")

In [21]:
def detokenize_subword_tokens(tokens):
    tokens = [token.replace("▁", " ") for token in tokens]
    complete_word = " ".join(tokens)
    return complete_word

In [33]:
def translate_with_postprocessing(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    
    tgt_tokens = [token for token in tgt_tokens if token not in [BOS_IDX, EOS_IDX]]
    
    tgt_tokens = torch.tensor(tgt_tokens)
    
    translated_sentence = detokenize_subword_tokens(
        vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))
    )
    
    return translated_sentence

translated_sentence = translate_with_postprocessing(transformer, "थो test ख​")
print(translated_sentence)


this is a test
