# **`Checking GPU availability`**

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue Apr 13 16:08:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Checking RAM availability**

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything from 20 GB upwards is reported as a high-RAM runtime.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


# **Importing Libraries and Dependencies**

In [None]:
!pip install torchtext==0.6.0 --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import pandas as pd
import spacy
import random
#from torchtext.data.metrics import bleu_score
#from pprint import pprint
from torch.utils.tensorboard import SummaryWriter
#from torchsummary import summary



[K     |████████████████████████████████| 71kB 5.2MB/s 
[K     |████████████████████████████████| 1.2MB 15.8MB/s 
[?25h

In [None]:
!python -m spacy download en --quiet


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"

fatal: destination path 'indic_nlp_library' already exists and is not an empty directory.


In [None]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

fatal: destination path 'indic_nlp_resources' already exists and is not an empty directory.


In [None]:
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"

In [None]:
import sys
# Make the cloned Indic NLP library importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
if INDIC_NLP_LIB_HOME not in sys.path:
    sys.path.append(INDIC_NLP_LIB_HOME)

In [None]:
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
from indicnlp import loader
loader.load()

In [None]:
from indicnlp.tokenize import indic_tokenize

# Sample Hindi sentence used to sanity-check the Indic NLP tokenizer.
indic_string='सुनो, कुछ आवाज़ आ रही है। फोन?'
print('Input String: {}'.format(indic_string))

# Show the tokens one per line first, then the complete token list.
print('Tokens: ')
for token in indic_tokenize.trivial_tokenize(indic_string):
    print(token)

print(indic_tokenize.trivial_tokenize(indic_string))

Input String: सुनो, कुछ आवाज़ आ रही है। फोन?
Tokens: 
सुनो
,
कुछ
आवाज़
आ
रही
है
।
फोन
?
['सुनो', ',', 'कुछ', 'आवाज़', 'आ', 'रही', 'है', '।', 'फोन', '?']


# **Mounting Google Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
spacy_english = spacy.load("en")



# **Defining Tokenizers for English (spacy) and Hindi (Indic NLP)**

In [None]:
def tokenize_english(text):
  """Split an English string into token strings using spaCy.

  Only the tokenizer of the module-level ``spacy_english`` pipeline is run.
  Returns a list holding the ``text`` attribute of every produced token.
  """
  words = []
  for token in spacy_english.tokenizer(text):
    words.append(token.text)
  return words


sample_text = "I am, going to work"
print(tokenize_english(sample_text))

['I', 'am', ',', 'going', 'to', 'work']


In [None]:
def tokenize_hindi(text):
  """Split a Hindi string into tokens with Indic NLP's ``trivial_tokenize``."""
  hindi_tokens = indic_tokenize.trivial_tokenize(text)
  return hindi_tokens

sample_text = 'सुनो, कुछ आवाज़ आ रही है। फोन?'
print(tokenize_hindi(sample_text))

['सुनो', ',', 'कुछ', 'आवाज़', 'आ', 'रही', 'है', '।', 'फोन', '?']


In [None]:
import pandas as pd


raw_data=pd.read_csv('/content/drive/MyDrive/AssignmentNLP/train/train.csv')

In [None]:
!ls '/content/drive/MyDrive/AssignmentNLP/train/train.csv'

/content/drive/MyDrive/AssignmentNLP/train/train.csv


# **Preprocessing**

In [None]:
# Keep only the parallel-text columns, selecting them by name instead of by
# position: `iloc[:, 1:]` would strip one more column every time the cell is
# re-run, whereas this selection gives the same frame on every execution.
raw_data = raw_data[['hindi', 'english']]
raw_data.head(10)


Unnamed: 0,hindi,english
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,-हटाओ रिक.,"Fuck them, Rick."
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,The thought reaching the eyes...,The thought reaching the eyes...
5,मैंने तुमे School से हटवा दिया .,I got you suspended.
6,"यह Vika, एक फूल है.","It's a flower, Vika."
7,पर मेरे लिए उसका यहुदी विरोधी होना उसके कार्यो...,"But personally, for me, the fact that Picquart..."
8,"नहीं, नहीं, नहीं... ठीक है, हम उह हूँ... हम का...","No, no, no... fine, we'll uh... we'll use the ..."
9,- क्या भाषा क्या वे वहाँ बात की?,- What language do they speak there?


In [None]:
raw_data.hindi.head(10)

0    एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...
1                  मैं उनके साथ कोई लेना देना नहीं है.
2                                           -हटाओ रिक.
3                 क्योंकि यह एक खुशियों भरी फ़िल्म है.
4                     The thought reaching the eyes...
5                     मैंने तुमे School से हटवा दिया .
6                                  यह Vika, एक फूल है.
7    पर मेरे लिए उसका यहुदी विरोधी होना उसके कार्यो...
8    नहीं, नहीं, नहीं... ठीक है, हम उह हूँ... हम का...
9                     - क्या भाषा क्या वे वहाँ बात की?
Name: hindi, dtype: object

In [None]:
df = raw_data


In [None]:
# Use the number of spaces in each sentence as a cheap proxy for its length.
# `assign` returns a new frame, so the frame `df` was aliased from is not mutated
# and re-running the cell on the already-filtered `df` raises no
# SettingWithCopyWarning.
df = df.assign(
    hin_len=df['hindi'].str.count(' '),
    eng_len=df['english'].str.count(' '),
)
# Drop very long and very short sentence pairs ...
df = df.query('hin_len<100 & eng_len<100')
df = df.query('hin_len>2  & eng_len>2')
# ... and pairs where one side is at least twice as long as the other.
df = df.query('hin_len<eng_len*2 & hin_len*2>eng_len')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the filtered pairs for validation. The fixed random_state makes
# the split (and hence the saved CSVs and the vocabularies built from them)
# reproducible across kernel restarts.
train, val = train_test_split(df, test_size=0.1, random_state=42)
train.to_csv("/content/drive/MyDrive/train.csv", index=False)
val.to_csv("/content/drive/MyDrive/val.csv", index=False)

In [None]:
# Both languages use the same Field settings: lower-case every token and wrap
# each sequence in <sos> ... <eos> markers. Only the tokenizer differs.
shared_field_options = dict(lower=True, init_token="<sos>", eos_token="<eos>")

hindi = Field(tokenize=tokenize_hindi, **shared_field_options)

english = Field(tokenize=tokenize_english, **shared_field_options)

In [None]:
train.head(10)

Unnamed: 0,hindi,english,hin_len,eng_len
96864,इस गोदाम में सब लोग प्राप्त करने के लिए?,To get everybody into this warehouse?,8,5
23450,क्या आप अब भी ऐसा करना चाहते हैं?,Do you still want to do this?,7,6
1150,"अब, शायद मैं इन्ही शब्दों का उपयोग ना करूं अपन...","Now, I'm not sure if I would use any of these ...",21,23
80468,"फ़िर हमने सीमा पार की और सीरिया गये, फ़िर अलेप्प...","Then we crossed the border into Syria, went to...",16,16
46719,"देखिए, माफ़ी चाहूँगा, पर हम पूजा के लिए बैठने ह...","Look, I'm so sorry, but we're about to sit dow...",11,11
997,आज हम अपने दरवाजे पर कर रहे हैं कि राक्षसों का...,Today we face the monsters that are at our door,11,9
53707,मैं तुम्हें कल देखेंगे.,I'll see you tomorrow.,3,3
98251,अभी आपने क्या कहा?,What did you just say?,3,4
13912,सोचो ... जिसका गंतव्य पृष्ठ है नहीं। 65।,Think... whose destination is page no. 65.,7,6
23607,आज के कलाकार जान सकते हैं हम क्या महसूस कर रहे...,Today's artists can know what we're feeling.,11,6


In [None]:
# Associate the leading CSV columns with the `hindi` and `english` fields.
# The CSVs were written with a header row, so it has to be skipped here;
# otherwise the column names themselves are loaded as a sentence pair.
from torchtext.data import TabularDataset
data_fields = [('hindi', hindi), ('english', english)]
train,val = TabularDataset.splits(path='/content/drive/MyDrive/', train='train.csv', validation='val.csv', format='csv', fields=data_fields, skip_header=True)

# **Creating the English and Hindi Vocabularies**

In [None]:
hindi.build_vocab(train, min_freq=2)
english.build_vocab(train, min_freq=2)

In [None]:
print(f"Unique tokens in source (hi) vocabulary: {len(hindi.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(english.vocab)}")

Unique tokens in source (hi) vocabulary: 19405
Unique tokens in target (en) vocabulary: 16903


In [None]:
hindi.vocab

<torchtext.vocab.Vocab at 0x7feeb4087310>

In [None]:
print(english.vocab.stoi['the'])
print(english.vocab.itos[6])

6
the


In [None]:
print(hindi.vocab.itos[6])

है


In [None]:
train_iter = BucketIterator(train, batch_size=20, sort_key=lambda x: len(x.hindi), shuffle=True)

In [None]:
#batch=next(iter(train_iter))
#print(batch.hindi)

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# **Defining the Encoder (LSTM) architecture**

In [None]:
class EncoderLSTM(nn.Module):
  """Embeds token indices and runs them through a multi-layer LSTM."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    """Build the embedding table, the dropout layer and the LSTM stack.

    Args:
      input_size: number of rows in the embedding table.
      embedding_size: dimension of each embedding vector (LSTM input size).
      hidden_size: size of the LSTM hidden and cell states.
      num_layers: number of stacked LSTM layers.
      p: dropout probability, used on the embeddings and passed to ``nn.LSTM``.
    """
    super().__init__()
    self.input_size = input_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # Sub-modules are registered in the same order as before so the module
    # repr and the state_dict layout stay unchanged.
    self.dropout = nn.Dropout(p)
    self.tag = True
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

  def forward(self, x):
    """Return the final ``(hidden_state, cell_state)`` of the LSTM for ``x``.

    ``x`` holds token indices; the LSTM is not batch-first, so it is
    presumably laid out as (sequence length, batch) - confirm against callers.
    The per-step LSTM outputs are discarded.
    """
    embedded = self.dropout(self.embedding(x))
    _, final_states = self.LSTM(embedded)
    hidden_state, cell_state = final_states
    return hidden_state, cell_state
# Encoder hyperparameters; the embedding table is sized to the Hindi vocabulary.
input_size_encoder = len(hindi.vocab)
encoder_embedding_size = 100
hidden_size = 512
num_layers = 3
encoder_dropout = 0.4

encoder_lstm = EncoderLSTM(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)



# **Defining the Decoder(LSTM) Architecture**

In [None]:
class DecoderLSTM(nn.Module):
  """Single-step LSTM decoder that maps the previous token to vocabulary scores."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    """Build the embedding table, dropout, LSTM stack and output projection.

    Args:
      input_size: number of rows in the embedding table.
      embedding_size: dimension of each embedding vector (LSTM input size).
      hidden_size: size of the LSTM hidden and cell states.
      num_layers: number of stacked LSTM layers.
      p: dropout probability, used on the embeddings and passed to ``nn.LSTM``.
      output_size: number of output features of the final linear layer.
    """
    super().__init__()
    self.input_size = input_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size
    # Registration order is kept so the module repr and state_dict layout
    # are the same as before.
    self.dropout = nn.Dropout(p)
    self.tag = True
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.LSTM = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden_state, cell_state):
    """Run one decoding step.

    ``x`` is given a leading length-1 axis before embedding, so it is
    presumably a 1-D tensor with one token index per batch element.
    Returns ``(predictions, hidden_state, cell_state)`` where ``predictions``
    is the linear layer's output with that leading axis squeezed away again.
    """
    step_input = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_input))
    previous_states = (hidden_state, cell_state)
    outputs, (hidden_state, cell_state) = self.LSTM(embedded, previous_states)
    predictions = self.fc(outputs).squeeze(0)
    return predictions, hidden_state, cell_state

# Decoder hyperparameters; both the embedding table and the output layer are
# sized to the English vocabulary.
input_size_decoder = len(english.vocab)
decoder_embedding_size = 100
hidden_size = 512
num_layers = 3
decoder_dropout = 0.4
output_size = len(english.vocab)

decoder_lstm = DecoderLSTM(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=decoder_dropout,
    output_size=output_size,
)
decoder_lstm = decoder_lstm.to(device)



# **Defining the Sequence-to-Sequence Model**

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target one step at a time."""

  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    """Store the encoder and decoder modules as sub-modules."""
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    """Decode ``target`` step by step with teacher forcing.

    Args:
      source: tensor passed to the encoder; its second axis is the batch size.
      target: tensor whose first axis is the target length; ``target[0]`` is
        fed to the decoder as the first input.
      tfr: teacher-forcing ratio - probability of feeding the ground-truth
        token (rather than the decoder's own argmax) as the next input.

    Returns:
      Tensor of shape (target_len, batch_size, decoder.output_size). Row 0 is
      never written and stays all zeros.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Take the vocabulary size from the decoder and the device from the input
    # so the module does not rely on notebook-level globals.
    target_vocab_size = self.Decoder_LSTM.output_size
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)
    hidden_state, cell_state = self.Encoder_LSTM(source)
    x = target[0]
    for i in range(1, target_len):
      output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
      outputs[i] = output
      best_guess = output.argmax(1)
      # Teacher forcing: sometimes feed the true token, sometimes the prediction.
      x = target[i] if random.random() < tfr else best_guess
    return outputs


# **Defining Hyperparameters of the model**

In [None]:
# Learning rate handed to the Adam optimizer below.
learning_rate = 0.001
#writer = SummaryWriter(f"runs/loss_plot")
# Running count of optimizer updates; incremented once per batch in the training loop.
step = 0
# Full encoder-decoder model, built from the modules instantiated above.
model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Target positions holding the padding index are excluded from the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
len(train_iter)

3380

# **Defining Utility functions**

In [None]:
def translate_sentence(model, sentence, hindi, english, device, max_length=50):
    """Greedily translate one Hindi sentence into a list of English tokens.

    Args:
        model: object exposing ``Encoder_LSTM`` and ``Decoder_LSTM`` modules.
            This function does not change the train/eval mode; callers are
            expected to call ``model.eval()`` beforehand.
        sentence: raw Hindi text; it is tokenized with ``tokenize_hindi``.
        hindi: source field providing ``init_token``, ``eos_token`` and ``vocab``.
        english: target field providing ``vocab``.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. A trailing
        ``<eos>`` is included when the decoder produced one within
        ``max_length`` steps.
    """
    tokens = tokenize_hindi(sentence)
    # The vocabulary is built from lower-cased tokens when the field uses
    # lower=True; apply the same normalisation here so mixed-case tokens
    # (e.g. embedded Latin-script words) are not needlessly mapped to <unk>.
    if getattr(hindi, "lower", False):
        tokens = [token.lower() for token in tokens]
    tokens = [hindi.init_token] + tokens + [hindi.eos_token]
    text_to_indices = [hindi.vocab.stoi[token] for token in tokens]
    # Add a batch axis of size 1 after the sequence axis.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]
    with torch.no_grad():
        hidden, cell = model.Encoder_LSTM(sentence_tensor)
        for _ in range(max_length):
            # Feed the most recent prediction back in as the next decoder input.
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)
            if best_guess == eos_idx:
                break
    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]



In [None]:
def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss,
                        checkpoint_path='/content/drive/MyDrive/checkpoint-week1',
                        state_dict_path='/content/drive/MyDrive/checkpoint-state-dict-week1'):
    """Write a full training checkpoint and a weights-only file to disk.

    Args:
        model: the module to save.
        best_loss: best loss seen so far; stored under ``'best_loss'``.
        epoch: epoch index; stored under ``'epoch'``.
        optimizer: optimizer whose ``state_dict()`` is stored under ``'optimizer'``.
        epoch_loss: loss of the epoch being saved; stored under ``'epoch_loss'``.
        checkpoint_path: destination of the full checkpoint dictionary.
        state_dict_path: destination of ``model.state_dict()``.
    """
    state = {
        # NOTE(review): this pickles the whole module object, so loading the
        # checkpoint needs the same class definitions to be importable. The
        # separate state-dict file written below does not have that constraint.
        'model': model,
        'best_loss': best_loss,
        'epoch': epoch,
        'epoch_loss': epoch_loss,
        'rng_state': torch.get_rng_state(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, checkpoint_path)
    torch.save(model.state_dict(), state_dict_path)

# **Training the Model**

In [None]:
epoch_loss = 0.0
num_epochs = 100
best_loss = 10000000
best_epoch = -1
# Fixed probe sentence, translated before every epoch to watch progress.
sentence1="वे कहते हैं कि जहाज पर आप की जरूरत है।"
ts1 = []
for epoch in range(num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, hindi, english, device, max_length=50)
  print(translated_sentence1)
  ts1.append(translated_sentence1)
  model.train(True)
  # Start every epoch from zero; without this reset the running total only
  # grows, so no epoch after the first could ever count as an improvement.
  epoch_loss = 0.0
  for batch in train_iter:
    source = batch.hindi.to(device)
    target = batch.english.to(device)
    output = model(source, target)
    # Drop the first time step (the <sos> position, never predicted) and
    # flatten time and batch so the loss sees one row per target token.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)
    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()
    step += 1
    epoch_loss += loss.item()
    #writer.add_scalar("Training loss", loss, global_step=step)

  # Report the mean batch loss of the epoch rather than the last batch's loss.
  print("Epoch_Loss - {}".format(epoch_loss / len(train_iter)))
  print()
  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)
  elif ((epoch - best_epoch) >= 10):
    # Early stopping: only reachable when the current epoch did NOT improve.
    print("no improvement in 10 epochs, break")
    break
print(epoch_loss / len(train_iter))
print('------------done---------')

#while True:pass

In [None]:
#checkpoint = torch.load('/content/drive/MyDrive/checkpoint-week1')

In [None]:
#state=torch.load('/content/drive/MyDrive/checkpoint-state-dict-week1')


In [None]:
#model.load_state_dict(torch.load('/content/drive/MyDrive/checkpoint-state-dict-week1'))

<All keys matched successfully>

In [None]:
model.eval()
#print(checkpoint['best_loss'])

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.4, inplace=False)
    (embedding): Embedding(19405, 100)
    (LSTM): LSTM(100, 512, num_layers=3, dropout=0.4)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.4, inplace=False)
    (embedding): Embedding(16903, 100)
    (LSTM): LSTM(100, 512, num_layers=3, dropout=0.4)
    (fc): Linear(in_features=512, out_features=16903, bias=True)
  )
)

In [None]:
model.eval()
sentence="वे कहते हैं कि जहाज पर आप की जरूरत है।"
translated_sentence = translate_sentence(model, sentence, hindi, english, device, max_length=50)
print(translated_sentence)


In [None]:
#checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)

# **Generating the translated sentences of the development set**

In [None]:

hs=pd.read_csv('/content/drive/MyDrive/AssignmentNLP/week2/hindistatements.csv')

In [None]:
hs.head(6)
#raw_data=raw_data.iloc[:,1:]
#raw_data.head(10)

Unnamed: 0.1,Unnamed: 0,id,hindi
0,0,0,कौन वे अपनी आस्तीन ऊपर है क्या अन्य तरकीबें जा...
1,1,1,हम कहानियों के ज़रिये अपने ज्ञान को आगे देते हैं।
2,2,2,फिर वे मुझे भी साथ लाते।
3,3,3,"- हाँ, दुर्भाग्य से."
4,4,4,मुलाक़ात नहीं हो पाई
5,5,5,"और जब आप इस बारे में में सोचते हैं, कि हम संयु..."


In [None]:
hs.hindi[1]
print(len(hs))

5000


# **Defining the English De-tokenizer**

In [None]:
# Translate every development-set sentence and join the predicted tokens with
# single spaces. Iterating over the column itself avoids hard-coding the number
# of rows.
op=[]
for sentence in hs.hindi:
  translated_sentence = translate_sentence(model, sentence, hindi, english, device, max_length=50)
  words = []
  for wd in translated_sentence:
    if wd=='<eos>':
      break       # nothing after the end-of-sentence marker belongs to the output
    if wd=='<unk>':
      continue    # unknown-word placeholders are left out of the text
    words.append(wd)
  op.append(' '.join(words))



In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Same translation pass as for `op`, but the tokens are merged with NLTK's
# Treebank detokenizer instead of plain spaces.
detokenizer = TreebankWordDetokenizer()
op2=[]
for sentence in hs.hindi:
  translated_sentence = translate_sentence(model, sentence, hindi, english, device, max_length=50)
  # Strip the special tokens the same way the `op` loop does.
  words = []
  for wd in translated_sentence:
    if wd=='<eos>':
      break
    if wd=='<unk>':
      continue
    words.append(wd)
  # Store the whole detokenized sentence; indexing it with [-1] kept only its
  # final character and raised IndexError for an empty translation.
  op2.append(detokenizer.detokenize(words))


In [None]:
print(op[0])
#print(op2[1])

# **Saving the outputs**

In [None]:
# Collect the source sentences in row order, looked up by integer label.
ip = [hs.hindi[row] for row in range(len(hs))]


In [None]:

#with open('/content/drive/MyDrive/AssignmentNLP/hin.txt', 'w') as f2:
#    for item in op:
#        f2.write("%s\n" % item)

In [None]:


# Write one translation per line, in the order the translations were produced.
with open('/content/drive/MyDrive/AssignmentNLP/english.txt', 'w') as f:
    f.writelines("%s\n" % item for item in op)

In [None]:
!ls '/content/drive/MyDrive/AssignmentNLP'

english.txt  evaluationscript  hindistatements.csv  hin.txt  train  week2


In [None]:
# Block this cell forever (presumably to keep the runtime from going idle).
# Sleeping inside the loop avoids spinning a CPU core at 100%; interrupt the
# kernel to stop it.
import time
while True:
    time.sleep(60)