In [2]:
import numpy as np 
import torch
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import random
from tqdm import tqdm

In [3]:
# Show every file Kaggle has mounted below the input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/roman-to-telgu/tel_test.csv
/kaggle/input/roman-to-telgu/tel_valid.csv
/kaggle/input/roman-to-telgu/tel_train.csv
/kaggle/input/roman-to-hindi/hin_valid.csv
/kaggle/input/roman-to-hindi/hin_test.csv
/kaggle/input/roman-to-hindi/hin_train.csv


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [5]:
class Hyperparameters:
    """Plain container holding every model / training setting in one object.

    Attributes mirror the constructor arguments one-to-one, plus:
        cell      -- the torch.nn recurrent class selected by ``cell_type``
        cell_name -- the ``cell_type`` string itself ('rnn', 'gru' or 'lstm')

    Raises:
        KeyError: if ``cell_type`` is not one of 'rnn', 'gru', 'lstm'.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        encoder_layers=1,
        decoder_layers=1,
        hidden_size=64,
        embed_dim=512,
        num_layers=1,
        cell_type: str = 'rnn',
        bidirectional: bool = False,
        dropout: float = 0,
        beam_search: int = 0,
        learning_rate=0.001,
    ):
        # Vocabulary sizes: input_dim for the input language, output_dim for the output language.
        self.input_dim, self.output_dim = input_dim, output_dim

        # Network shape.
        self.encoder_layers, self.decoder_layers = encoder_layers, decoder_layers
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed_dim = embed_dim

        # Recurrent cell: keep both the class (to instantiate) and its name (to branch on).
        available_cells = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}
        self.cell_name = cell_type
        self.cell = available_cells[cell_type]

        # Remaining options are stored verbatim.
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.beam_search = beam_search
        self.learning_rate = learning_rate

In [6]:
# parameters - this class contains all the configurations for the model
class EncoderRNN(nn.Module):
    """Encoder: embedding -> dropout -> stacked recurrent cell (RNN / GRU / LSTM).

    Built entirely from a ``Hyperparameters`` object. The recurrent cell is
    created with ``batch_first=True``, so inputs are laid out batch-first.

    NOTE: ``parameters.bidirectional`` is not consumed here; the cell is
    always unidirectional.
    """

    def __init__(self,parameters:Hyperparameters):
        super(EncoderRNN,self).__init__()
        # hidden_dim - number of neurons in the hidden state
        self.hidden_dim = parameters.hidden_size
        # num_layers - number of stacked recurrent layers in the encoder
        self.num_layers = parameters.encoder_layers
        # parameters.input_dim - size of the input vocabulary
        # parameters.embed_dim - size of each embedding vector
        # padding_idx=2 is the padding symbol's index (the notebook's PAD_token).
        self.embedding = nn.Embedding(parameters.input_dim,parameters.embed_dim,padding_idx = 2)
        # Store the cell's *name* ('rnn' / 'gru' / 'lstm'); the class itself
        # is only needed to build self.cell below.
        self.cell_name = parameters.cell_name
        self.dropout = nn.Dropout(parameters.dropout)
        # Recurrent dropout acts only between stacked layers; with a single
        # layer it does nothing and PyTorch warns, so pass 0 in that case.
        recurrent_dropout = parameters.dropout if self.num_layers > 1 else 0
        self.cell = parameters.cell(
            parameters.embed_dim,
            parameters.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=recurrent_dropout,
        )

    def forward(self,input_data,h_0):
        """Embed ``input_data``, apply dropout and run the recurrent cell.

        Args:
            input_data: tensor of token indices, batch-first.
            h_0: initial state for the cell (a tensor from
                ``hidden_initializer``; callers pass a pair of such tensors
                for LSTM).

        Returns:
            ``(output, hidden)`` exactly as returned by the recurrent cell.
        """
        embedded = self.embedding(input_data)
        embedded = self.dropout(embedded)
        output, hidden = self.cell(embedded,h_0)
        return output,hidden

    def hidden_initializer(self,batch_size):
        """Return a zero state of shape (num_layers, batch_size, hidden_dim).

        The tensor is allocated on the device that holds this module's
        weights, so it always matches the model regardless of globals.
        """
        return torch.zeros(
            self.num_layers,
            batch_size,
            self.hidden_dim,
            device=self.embedding.weight.device,
        )
    
class DecoderRNN(nn.Module):
    """Decoder: embedding -> ReLU -> dropout -> recurrent cell -> linear -> log-softmax.

    Built entirely from a ``Hyperparameters`` object. The recurrent cell is
    created with ``batch_first=True``; the log-softmax is taken over the last
    (vocabulary) dimension of the projected output.
    """

    def __init__(self,parameters:Hyperparameters):
        super(DecoderRNN,self).__init__()
        # hidden_dim - number of neurons in the hidden state
        self.hidden_dim = parameters.hidden_size
        # num_layers - number of stacked recurrent layers in the decoder
        self.num_layers = parameters.decoder_layers
        # Store the cell's *name* ('rnn' / 'gru' / 'lstm'), not the class.
        self.cell_name = parameters.cell_name
        self.embedding = nn.Embedding(parameters.output_dim,parameters.embed_dim)
        self.dropout = nn.Dropout(parameters.dropout)
        # Recurrent dropout acts only between stacked layers; with a single
        # layer it does nothing and PyTorch warns, so pass 0 in that case.
        recurrent_dropout = parameters.dropout if self.num_layers > 1 else 0
        self.cell = parameters.cell(
            parameters.embed_dim,
            self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=recurrent_dropout,
        )
        # Projects every hidden state onto the output vocabulary.
        self.out = nn.Linear(parameters.hidden_size,parameters.output_dim)
        # dim=2 is the vocabulary axis of the (batch, steps, vocab) projection.
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self,input_data,h_0):
        """Decode the given input tokens starting from state ``h_0``.

        Args:
            input_data: tensor of token indices, batch-first (callers in this
                notebook pass one token per sequence, i.e. shape (batch, 1)).
            h_0: recurrent state to continue from (a tensor, or a pair of
                tensors for LSTM).

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` holds log-probabilities
            over the output vocabulary for every decoded step and ``hidden``
            is the updated recurrent state.
        """
        embedded = self.embedding(input_data)
        activation = F.relu(embedded)
        activation = self.dropout(activation)
        output, hidden = self.cell(activation, h_0)
        output = self.softmax(self.out(output))
        return output,hidden

In [7]:
# Indices reserved for the special symbols shared by both vocabularies.
SOS_token = 0
EOS_token = 1
PAD_token = 2
def characterFetching(x):
    """Build the character vocabulary of the words in ``x``.

    The special symbols '<' (start), '>' (end) and '_' (padding) always own
    indices 0, 1 and 2; every other character receives the next free index
    in order of first appearance.

    Returns:
        [char_to_index dict, index_to_char dict, total number of symbols]
    """
    ch2ind = {'<': SOS_token, '>': EOS_token, '_': PAD_token}
    ind2ch = {index: symbol for symbol, index in ch2ind.items()}
    for word in x:
        for letter in word:
            if letter in ch2ind:
                continue
            # The next free index always equals the current vocabulary size.
            fresh_index = len(ch2ind)
            ch2ind[letter] = fresh_index
            ind2ch[fresh_index] = letter
    return [ch2ind, ind2ch, len(ch2ind)]
def wordPairs(x,y):
    """Return ``[[x[0], y[0]], [x[1], y[1]], ...]`` with one pair per index of ``x``."""
    pairs = []
    for position in range(len(x)):
        pairs.append([x[position], y[position]])
    return pairs

In [8]:
def dataLoading(data_type, data_dir="/kaggle/input/roman-to-telgu", prefix="tel"):
    """Read one split of a two-column, header-less transliteration CSV.

    Args:
        data_type: split name that appears in the file name, e.g. 'train',
            'valid' or 'test'.
        data_dir: directory holding the CSV files (defaults to the Telugu
            dataset this notebook uses).
        prefix: file-name prefix in front of ``_<data_type>.csv``.

    Returns:
        Tuple ``(column 0, column 1)`` as NumPy arrays of Python strings.
    """
    path = os.path.join(data_dir, "{}_{}.csv".format(prefix, data_type))
    # dtype=str + keep_default_na=False: words that happen to spell "nan",
    # "null", "NA", ... must stay strings; pandas would otherwise turn them
    # into float NaN, which breaks per-character iteration later on.
    df = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)
    return df[0].to_numpy(), df[1].to_numpy()

In [9]:
# Each split yields two parallel arrays: CSV column 0 (model input) and column 1 (target).
train_input_data, train_output_data = dataLoading("train")
val_input_data, val_output_data = dataLoading("valid")

In [10]:
# Eyeball both splits: the input words followed by their targets.
for inputs, targets in (
    (train_input_data, train_output_data),
    (val_input_data, val_output_data),
):
    print(inputs, targets)

['vargaalavaarine' 'vastadira' 'factamfos' ... 'venakkiteesukoovaalane'
 'roopaantaraalu' 'chendindindi'] ['వర్గాలవారినే' 'వస్తాదిరా' 'ఫ్యాక్టమ్ఫోస్' ... 'వెనక్కితీసుకోవాలనే'
 'రూపాంతరాలు' 'చెందిందింది']
['bheeshmudini' 'vinyasaanni' 'kaavachhunu' ... 'asramam' 'divine' 'dis'] ['భీష్ముడిని' 'విన్యాసాన్ని' 'కావచ్చును' ... 'ఆశ్రమం' 'డివైన్' 'డిస్']


In [11]:
# train_en and train_tel are each a three-element list built from the training split:
#   [0] dictionary mapping character -> index
#   [1] dictionary mapping index -> character
#   [2] number of distinct symbols (the three special tokens included)
# en  - romanized (Latin-script) input words
# tel - Telugu output words
# train_wordpairs / valid_wordpairs hold [input word, output word] pairs per split.
train_wordpairs = wordPairs(train_input_data, train_output_data)
valid_wordpairs = wordPairs(val_input_data, val_output_data)
train_en, train_tel = map(characterFetching, (train_input_data, train_output_data))

In [12]:
# Vocabulary sizes: input side first, output side second.
print(train_en[2], train_tel[2], sep="\n")

29
65


In [13]:
# input_t / output_t are the character-to-index dictionaries of the input and output languages; pair is a single [input word, output word] pair.
def mannualPadding(x,padding_index,max_length):
    """Force the list ``x`` to exactly ``max_length`` items, in place.

    Shorter lists are right-padded with ``padding_index``; longer lists are
    truncated on the right. Without the truncation an over-long sequence
    would come back longer than ``max_length`` and could not be stacked into
    a fixed-size batch.

    Args:
        x: list of indices; it is modified in place.
        padding_index: value appended as filler.
        max_length: required final length (must be >= 0).

    Returns:
        The same list object ``x``, now of length ``max_length``.

    Raises:
        ValueError: if ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError("max_length must be non-negative, got {}".format(max_length))
    # Drop anything beyond the limit (no-op when x is already short enough).
    del x[max_length:]
    x.extend([padding_index] * (max_length - len(x)))
    return x
def gettingTensorFromPair(pair,input_t,output_t,padding_index,max_length):
    """Encode one [input word, output word] pair as two fixed-length index tensors.

    Each word is mapped character by character through its dictionary,
    terminated with ``EOS_token`` and right-padded with ``padding_index`` up
    to ``max_length``. Words too long to fit are cut to ``max_length - 1``
    characters first, so the EOS marker is always kept and every tensor has
    the same length.

    Args:
        pair: sequence whose item 0 is the input word and item 1 the output word.
        input_t: character -> index dictionary for the input language.
        output_t: character -> index dictionary for the output language.
        padding_index: index used as filler after EOS.
        max_length: length of both returned tensors.

    Returns:
        ``(input_tensor, output_tensor)``: 1-D ``torch.long`` tensors created
        on the global ``device``.

    Raises:
        KeyError: if a word contains a character missing from its dictionary.
    """
    word_en, word_tel = pair[0], pair[1]
    # Reserve one slot for EOS.
    keep = max(max_length - 1, 0)
    indexes_en = [input_t[char] for char in word_en[:keep]]
    indexes_tel = [output_t[char] for char in word_tel[:keep]]
    indexes_en.append(EOS_token)
    indexes_tel.append(EOS_token)

    indexes_en = mannualPadding(indexes_en,padding_index,max_length)
    indexes_tel = mannualPadding(indexes_tel,padding_index,max_length)

    input_tensor = torch.tensor(indexes_en,dtype=torch.long,device=device)
    output_tensor = torch.tensor(indexes_tel,dtype=torch.long,device=device)
    return input_tensor,output_tensor

In [14]:

BATCH_SIZE = 32
MAX_LENGTH = 30

# Encode every word pair as fixed-length index tensors. Both splits use the
# vocabularies built from the training data, and PAD_token is the filler index.
train_tensors = [
    gettingTensorFromPair(pair, train_en[0], train_tel[0], PAD_token, MAX_LENGTH)
    for pair in train_wordpairs
]
val_data = [
    gettingTensorFromPair(pair, train_en[0], train_tel[0], PAD_token, MAX_LENGTH)
    for pair in valid_wordpairs
]

# Shuffle only the training batches; evaluation does not depend on order, so
# the validation loader keeps a stable order for reproducible inspection.
train_data = DataLoader(train_tensors, batch_size=BATCH_SIZE, shuffle=True)
valid_data = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Peek at the first training batch: tensor shapes plus the first input and
# target word decoded back into characters.
for input_t, labels in train_data:
    print(input_t.size())
    print([train_en[1][index.item()] for index in input_t[0]])
    print([train_tel[1][index.item()] for index in labels[0]])
    print(labels.size())
    break

torch.Size([32, 30])
['m', 'a', 'r', 'u', 'v', 'a', 'd', 'd', 'u', '>', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
['మ', 'ర', 'ు', 'వ', 'ద', '్', 'ద', 'ు', '>', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
torch.Size([32, 30])


In [16]:
def printingString(predicted_sequences,output_tensor,input_tensor,type_of_data,num_examples=5):
    """Print the first few (prediction, target, input) triples of a batch as text.

    Indices are decoded with the global vocabularies: ``train_tel[1]`` for
    predictions and targets, ``train_en[1]`` for inputs. Each printed line
    has the form ``"<predicted> <target> <input>"``.

    Args:
        predicted_sequences: 2-D tensor of predicted output indices (batch, length).
        output_tensor: 2-D tensor of target output indices.
        input_tensor: 2-D tensor of input indices.
        type_of_data: accepted for compatibility with existing callers; it
            does not change what is printed.
        num_examples: maximum number of rows to print (default 5). Fewer are
            printed when the batch is smaller, instead of raising IndexError.
    """
    def decode(row, index_to_char):
        # Turn one row of indices into a string.
        return "".join(index_to_char[index.item()] for index in row)

    rows_to_show = min(num_examples, predicted_sequences.size(0))
    # All three strings span the prediction's sequence length.
    length = predicted_sequences.size(1)
    for i in range(rows_to_show):
        predicted_string = decode(predicted_sequences[i, :length], train_tel[1])
        target_string = decode(output_tensor[i, :length], train_tel[1])
        input_string = decode(input_tensor[i, :length], train_en[1])
        print("{} {} {}".format(predicted_string,target_string,input_string))

In [17]:
def accuracy(para,encoder,decoder,data,batch_size,type_of_data):
    """Greedy-decode every batch of ``data`` and report exact-match accuracy and loss.

    Decoding starts from ``SOS_token`` and feeds each step's most likely
    token back in as the next input. A sequence counts as correct only if
    every position (padding included) equals the target.

    Args:
        para: Hyperparameters; only ``cell_name`` is read (LSTM needs an
            (h, c) state pair).
        encoder: encoder module providing ``hidden_initializer`` and ``forward``.
        decoder: decoder module returning ``(log_probs, hidden)``.
        data: iterable of ``(input_batch, output_batch)`` index tensors that
            also supports ``len()``.
        batch_size: kept for interface compatibility. The real size of each
            batch is read from the tensors, so a smaller final batch is
            handled correctly.
        type_of_data: kept for interface compatibility; unused.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of sequences predicted exactly
        and the per-step loss averaged over batches; ``(0.0, 0.0)`` when
        ``data`` yields no samples.

    Side effects:
        Puts ``encoder`` and ``decoder`` into eval mode.
    """
    encoder.eval()
    decoder.eval()
    criterion = nn.NLLLoss()
    correct_predictions = 0
    total = 0
    total_loss = 0
    batch_length = len(data)
    with torch.no_grad():
        for input_batch, output_batch in data:
            input_tensor = input_batch.to(device)
            output_tensor = output_batch.to(device)
            # Use the real batch size: the last batch may be smaller than batch_size.
            current_batch = input_tensor.size(0)
            target_length = output_tensor.size(1)

            encoder_hidden = encoder.hidden_initializer(current_batch)
            if para.cell_name=='lstm':
                encoder_hidden = (encoder_hidden, encoder.hidden_initializer(current_batch))
            _, encoder_hidden = encoder(input_tensor, encoder_hidden)

            decoder_input = torch.full((current_batch,1),SOS_token,dtype=torch.long,device=device)
            decoder_hidden = encoder_hidden

            loss = 0
            step_predictions = []
            for j in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                step_log_probs = decoder_output[:, -1, :]
                loss += criterion(step_log_probs, output_tensor[:, j])
                # Greedy choice, shape (batch, 1); it is also the next decoder input.
                _, topi = step_log_probs.topk(1)
                step_predictions.append(topi)
                decoder_input = topi
            total_loss += loss.item()/target_length

            # Stay in tensor form (no squeeze/tolist round trip) so a batch
            # of size 1 keeps its 2-D (batch, length) shape.
            predicted_sequences = torch.cat(step_predictions, dim=1)
            correct_predictions += torch.sum((predicted_sequences == output_tensor).all(dim=1)).item()
            total += current_batch
        if total == 0:
            return 0.0, 0.0
        return correct_predictions/total, total_loss/batch_length

In [18]:
def train(encoder:EncoderRNN,decoder:DecoderRNN,epochs:int,para:Hyperparameters,train_data,valid_data,batch_size,teacher_forcing_ratio):
    """Train the encoder/decoder pair and report validation metrics every epoch.

    For each batch the input is encoded, then the decoder is unrolled one
    step per target position starting from ``SOS_token``. At step ``j`` the
    decoder must predict ``target[j]`` from the *previous* token:
    ``target[j-1]`` when teacher forcing is on for the batch, otherwise its
    own previous greedy prediction. The decoder is never shown the token it
    is currently being asked to predict.

    Args:
        encoder: encoder module (provides ``hidden_initializer``).
        decoder: decoder module returning ``(log_probs, hidden)``.
        epochs: number of passes over ``train_data``.
        para: Hyperparameters; ``learning_rate`` and ``cell_name`` are read.
        train_data: iterable of ``(input_batch, output_batch)`` index tensors.
        valid_data: validation batches, passed on to ``accuracy``.
        batch_size: nominal batch size, forwarded to ``accuracy``. Tensor
            shapes use the real size of each batch, so a smaller final batch
            is fine.
        teacher_forcing_ratio: probability, drawn once per batch, of feeding
            ground-truth tokens instead of the decoder's own predictions.

    Returns:
        None. Progress and metrics are printed.
    """
    encoder_opt = optim.Adam(encoder.parameters(),para.learning_rate)
    decoder_opt = optim.Adam(decoder.parameters(),para.learning_rate)
    criterion = nn.NLLLoss()
    total_batches = len(train_data)
    for epch in range(epochs):
        total_loss = 0
        # accuracy() switches the models to eval mode, so re-enable training
        # mode at the start of every epoch.
        encoder.train()
        decoder.train()
        for input_tensor, output_tensor in tqdm(train_data, desc=f'Training Progress {epch+1}'):
            encoder_opt.zero_grad()
            decoder_opt.zero_grad()

            input_tensor = input_tensor.to(device)
            output_tensor = output_tensor.to(device)
            # Use the real batch size: the last batch may be smaller than batch_size.
            current_batch = input_tensor.size(0)
            target_length = output_tensor.size(1)

            # State shape: (num_layers, batch, hidden); LSTM needs an (h, c) pair.
            encoder_hidden = encoder.hidden_initializer(current_batch)
            if para.cell_name=='lstm':
                encoder_hidden = (encoder_hidden, encoder.hidden_initializer(current_batch))
            _, encoder_hidden = encoder(input_tensor, encoder_hidden)

            # Decoding always starts from SOS, exactly as at evaluation time.
            decoder_input = torch.full((current_batch,1),SOS_token,dtype=torch.long,device=device)
            decoder_hidden = encoder_hidden
            teacher_forcing = random.random() < teacher_forcing_ratio

            loss = 0
            for j in range(target_length):
                decoder_out, decoder_hidden = decoder(decoder_input, decoder_hidden)
                step_log_probs = decoder_out[:, -1, :]
                loss += criterion(step_log_probs, output_tensor[:, j])
                if teacher_forcing:
                    # Next input is the token that was just predicted (ground truth).
                    decoder_input = output_tensor[:, j].unsqueeze(1)
                else:
                    # Next input is the decoder's own greedy guess, cut from the graph.
                    _, topi = step_log_probs.topk(1)
                    decoder_input = topi.detach()

            total_loss += loss.item()/target_length
            loss.backward()
            encoder_opt.step()
            decoder_opt.step()

        val_acc, val_loss = accuracy(para,encoder,decoder,valid_data,batch_size,'valid')
        # max(..., 1) avoids a ZeroDivisionError when train_data is empty.
        print("Training loss for epoch {} is - {}".format((epch+1),total_loss/max(total_batches,1)))
        print("Validation accuracy is - {} and loss -{}".format(val_acc,val_loss))

In [19]:
# Model configuration: LSTM cells, 15 layers in both encoder and decoder,
# 256 hidden units; everything not listed keeps its Hyperparameters default.
parameters = Hyperparameters(
    input_dim=train_en[2],
    output_dim=train_tel[2],
    encoder_layers=15,
    decoder_layers=15,
    cell_type='lstm',
    bidirectional=True,
    hidden_size=256,
)
encoder = EncoderRNN(parameters).to(device)
decoder = DecoderRNN(parameters).to(device)

In [20]:
# Train for 20 epochs with a teacher-forcing ratio of 0.5.
train(
    encoder,
    decoder,
    epochs=20,
    para=parameters,
    train_data=train_data,
    valid_data=valid_data,
    batch_size=BATCH_SIZE,
    teacher_forcing_ratio=0.5,
)

Training Progress 1:   0%|          | 0/1600 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# import gc
# encoder = None
# decoder = None
# gc.collect()
# torch.cuda.empty_cache()