# **Setup**

---




*   Connect with Gdrive.
*   Download and load libraries/packages.

In [None]:
# Special vocabulary indices; they match the "<sos>", "<eos>" and "<ukn>" entries
# that the English `Language` vocabulary is seeded with.
SOS_token = 0
EOS_token = 1
UKN_token = 2
# Default `max_length` of train/validate/evaluate; caps the number of decoding steps in `evaluate`.
MAX_LENGTH = 500
# Probability that a training step feeds the ground-truth token to the decoder (teacher forcing).
teacher_forcing_ratio = 0.5
# True: draw training pairs at random; False: walk through them in order, wrapping around.
random_trainData = False
# Number of passes over the training set (the training cell runs N_Epochs * len(train set) steps).
N_Epochs = 3

In [None]:
# Mount Google Drive under /content/gdrive so the project folder is reachable from Colab.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/CS779:\ Competition

/content/gdrive/MyDrive/CS779: Competition


In [None]:
import numpy as np
import pandas as pd

import time
import math
import random

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Running on =", device)

Running on = cuda


Setup SpaCy

---



In [None]:
!python3 -m spacy download en >> /dev/null

In [None]:
import spacy
# English spaCy pipeline; the notebook uses it to tokenize English sentences.
eng = spacy.load("en_core_web_sm")

Setup indicnlp library

---



In [None]:
#### Only run to download the Indic NLP library and resources (first time only)
# !git clone https://github.com/anoopkunchukuttan/indic_nlp_library >> /dev/null
# !git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git >> /dev/null

In [None]:
# Installed right before the Indic NLP library is initialised - presumably one of
# its dependencies (TODO confirm). pip output is discarded.
!pip install Morfessor >> /dev/null

In [None]:
# Both paths are relative to the current working directory (the project folder
# selected with %cd above) and point at the local git clones.

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME = "indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES = "indic_nlp_resources"

# Add Library to Python path (must happen before the `indicnlp` imports below)
import sys
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))

# Tell the library where its resource files live
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

# Initialize the Indic NLP library
from indicnlp import loader
loader.load()

In [None]:
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

Setup iNLTK library

---



In [None]:
# iNLTK provides the Hindi tokenizer (`inltk_tokenize`) used for the source sentences.
!pip install inltk >> /dev/null

In [None]:
from inltk.inltk import setup

# Note: run setup('<code-of-language>') for a language the FIRST TIME ONLY.
# This downloads all the models required to do inference for that language.
try:
    setup("hi")
except Exception as exc:
    # Best effort on purpose: setup() can raise inside a notebook while the download
    # is presumably still in progress (TODO confirm the exact exception type).
    # `Exception` rather than a bare `except` so that KeyboardInterrupt is not
    # swallowed, and the reason is reported instead of being hidden.
    print("Downloading 'hindi' setup... Wait before running next cell!")
    print("(setup raised %s: %s)" % (type(exc).__name__, exc))

Downloading 'hindi' setup... Wait before running next cell!


In [None]:
from inltk.inltk import tokenize as inltk_tokenize

Setup NLTK library - For evaluation

---



In [None]:
# Upgrade NLTK to the latest release; it supplies the BLEU and METEOR scorers used for evaluation.
!pip install -U nltk >> /dev/null

In [None]:
import nltk
# WordNet corpus - presumably needed by the METEOR scorer imported below (TODO confirm).
nltk.download('wordnet')

# Sentence-level translation metrics used in the evaluation section.
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

Downloading Model. This might take time, depending on your internet connection. Please be patient.
We'll only do this for the first time.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Load Data**


---



*   Load training data
*   Load Embedding matrix, word2index and index2word for Hindi








In [None]:
# Read the parallel corpus and keep only the two text columns (Hindi first, English second).
df = pd.read_csv("train/train.csv").loc[:, ["hindi", "english"]]
display(df.head())
print("\nTotal hindi-english sentence pairs = ", len(df))

Unnamed: 0,hindi,english
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,-हटाओ रिक.,"Fuck them, Rick."
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,The thought reaching the eyes...,The thought reaching the eyes...



Total hindi-english sentence pairs =  102322


In [None]:
# #####################################################################
# #### Testing on smaller data
# #####################################################################
# df = df[:500]

In [None]:
# Pre-computed Hindi resources stored as NumPy arrays in the project folder:
#   hindi_embeddings.npy           - embedding matrix handed to the encoder's embedding layer
#   word2index.npy, index2word.npy - two-column arrays turned into dicts in the next cell
my_embeddings = np.load("hindi_embeddings.npy")
word2index_npy = np.load("word2index.npy")
index2word_npy = np.load("index2word.npy")

In [None]:
# Rebuild the Hindi vocabulary lookups from the two-column arrays loaded above.
# word2index rows are (token, index); index2word rows are (index, token).
hindi_word2index = {row[0]: int(row[1]) for row in word2index_npy}
hindi_index2word = {int(row[0]): row[1] for row in index2word_npy}

# **Data Normalisation and Pre-processing**

---

*   Remove nuktas from Hindi sentences.
*   Create **embedding matrix** using iNLTK library.





In [None]:
# Normalise every Hindi sentence (nuktas removed) and write the result back into
# the "hindi" column.
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas = True)

# Iterate over the column values by position instead of using the index label as
# an `iloc` position: the result no longer depends on `df` having a default
# RangeIndex, and the column is written once rather than cell by cell.
normalized_sentences = []
for position, hindi_sentence in enumerate(df["hindi"]):
    normalized_sentences.append(normalizer.normalize(hindi_sentence))

    if(position != 0 and position%10000 == 0):
        print("Iterations done =", position)

df["hindi"] = normalized_sentences
print("...Done")

Iterations done = 10000
Iterations done = 20000
Iterations done = 30000
Iterations done = 40000
Iterations done = 50000
Iterations done = 60000
Iterations done = 70000
Iterations done = 80000
Iterations done = 90000
Iterations done = 100000
...Done


In [None]:
class Language:
    """Vocabulary (word <-> index lookups) for one side of the translation pair.

    "english" starts with the three special tokens only and grows through
    `addSentence` / `addWord`. "hindi" reuses the module-level
    `hindi_word2index` / `hindi_index2word` dictionaries (the same dict objects,
    not copies).

    Attributes:
        name: language name given to the constructor.
        n_words: number of entries currently in the vocabulary.
        word2index: dict mapping token text to its index.
        index2word: dict mapping index to token text.
    """

    def __init__(self, name):
        """Create the vocabulary for `name`.

        Raises:
            ValueError: if `name` is neither "english" nor "hindi". Without this
                check the object would be created without `n_words`,
                `word2index` and `index2word` and fail later with an
                AttributeError.
        """
        self.name = name
        if (name == "english"):
            self.n_words = 3  # UKN, SOS and EOS
            self.word2index = {"<sos>":0, "<eos>":1, "<ukn>":2}
            self.index2word = {0: "<sos>", 1: "<eos>", 2: "<ukn>"}
        elif (name == "hindi"):
            self.n_words = len(hindi_word2index)
            self.word2index = hindi_word2index
            self.index2word = hindi_index2word
        else:
            raise ValueError("Unsupported language %r (expected 'english' or 'hindi')" % (name,))

    def addSentence(self, sentence):
        """Tokenize `sentence` with the English spaCy pipeline `eng` and add every token."""
        tokens = eng(sentence)
        for token in tokens:
            self.addWord(token.text)

    def addWord(self, word):
        """Give `word` the next free index unless it is already in the vocabulary."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

In [None]:
def readData():
    """Build both vocabularies and the list of raw sentence pairs from `df`.

    Returns:
        (input_lang, output_lang, pairs): the Hindi `Language`, the English
        `Language` filled with every token of the English column, and a list
        with one `[hindi, english]` list per row of `df`.
    """
    print("Reading data...")
    sentence_pairs = df.values.tolist()
    source_lang = Language("hindi")
    target_lang = Language("english")

    print("Data normalisation...")
    # Placeholder: no additional normalisation is applied at this point.

    print("Data preprocessing...")
    for count, sentence_pair in enumerate(sentence_pairs):
        # Only the English side extends its vocabulary; the Hindi one is pre-built.
        target_lang.addSentence(sentence_pair[1])

        if count != 0 and count % 10000 == 0:
            print("Iterations done = ", count)
    print("... Done")

    return source_lang, target_lang, sentence_pairs

In [None]:
# Build the Hindi (input) and English (output) vocabularies plus the raw sentence pairs.
input_lang, output_lang, pairs = readData()

Reading data...
Data normalisation...
Data preprocessing...
Iterations done =  10000
Iterations done =  20000
Iterations done =  30000
Iterations done =  40000
Iterations done =  50000
Iterations done =  60000
Iterations done =  70000
Iterations done =  80000
Iterations done =  90000
Iterations done =  100000
... Done


# **Utility Functions**

---



In [None]:
def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s' (both truncated to ints)."""
    whole_minutes = math.floor(s/60)
    leftover_seconds = s - whole_minutes*60
    return '%dm %ds' %(whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time still needed.

    Args:
        since: start timestamp as returned by `time.time()`.
        percent: fraction of the work already done; must be non-zero because
            the elapsed time is divided by it.

    Returns:
        A one-line message ending in a newline.
    """
    elapsed = time.time() - since
    projected_total = elapsed/(percent)
    remaining = projected_total - elapsed
    return "Time Elapsed - %s (Expected time remaining - %s)\n" %(asMinutes(elapsed), asMinutes(remaining))

In [None]:
def showPlot(points):
    """Plot a sequence of loss values with a y-axis tick every 0.2.

    Only one figure is created: the extra `plt.figure()` call that used to
    precede `plt.subplots()` opened a second, empty figure on every call.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base = 0.2)
    ax.yaxis.set_major_locator(loc)
    # `ax` is the current axes, so the pyplot call draws on it.
    plt.plot(points)
    plt.show();

In [None]:
def indexesFromSentence(lang, sentence):
    """Convert a sentence into the list of vocabulary indices of its tokens.

    English sentences are tokenized with the spaCy pipeline `eng`; any other
    language goes through `inltk_tokenize(sentence, "hi")`. Tokens missing from
    `lang.word2index` become `UKN_token`; for the non-English branch each miss
    is also printed.

    NOTE(review): `UKN_token` is the "<ukn>" index of the English vocabulary;
    verify that the same index is a sensible fallback in the Hindi vocabulary.
    """
    vocab = lang.word2index

    if lang.name == "english":
        return [vocab.get(token.text, UKN_token) for token in eng(sentence)]

    indexes = []
    for piece in inltk_tokenize(sentence, "hi"):
        if piece not in vocab:
            indexes.append(UKN_token)
            print("No token in emb.", lang.name, piece)
            continue
        indexes.append(vocab[piece])
    return indexes

def tensorFromSentence(lang, sentence):
    """Return the sentence as a (length + 1, 1) long tensor on `device`, terminated by `EOS_token`."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype = torch.long, device = device)
    return flat.view(-1, 1)

def tensorsFromPair(pair):
    """Turn a `[hindi, english]` sentence pair into an `(input_tensor, target_tensor)` tuple."""
    source_sentence, target_sentence = pair[0], pair[1]
    source_tensor = tensorFromSentence(input_lang, source_sentence)
    reference_tensor = tensorFromSentence(output_lang, target_sentence)
    return (source_tensor, reference_tensor)

In [None]:
# Derive a 90/10 train/validation split of the row positions of `df`.
# The random features and labels are placeholders with one row per sentence pair:
# only the index of the frames returned by train_test_split is used below.
data = np.reshape(np.random.randn(2*df.shape[0]),(df.shape[0], 2)) # dummy features, one row per sentence pair
labels = np.random.randint(2, size = df.shape[0]) # dummy labels, one per sentence pair
X = pd.DataFrame(data, columns = ['Column_1', 'Column_2'])
y = pd.Series(labels)
# test_size = 0.1 holds out 10% of the rows; random_state is fixed to 0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

# Row positions belonging to each side of the split.
train_indexes = list(X_train.index)
test_indexes = list(X_test.index)

In [None]:
# Validation rows of the corpus. The tensor pairs are loaded from "val.pt";
# presumably that file holds the output of the commented-out line below (TODO confirm).
test_df = df.iloc[test_indexes]
# validation_pairs = [tensorsFromPair(list(test_df.iloc[i])) for i in range(test_df.shape[0])]
validation_pairs = torch.load("val.pt")

# Training rows of the corpus. The tensor pairs are loaded from "train.pt";
# presumably that file holds the output of the commented-out lines below (TODO confirm).
# NOTE(review): inputData indexes train_pairs using train_df.shape[0], so the two must have the same length.
train_df = df.iloc[train_indexes]
# train_pairs = [[sent for sent in pair] for _, pair in train_df.iterrows()]
# train_pairs = [tensorsFromPair(train_pairs[idx]) for idx in range(len(train_pairs))]
train_pairs = torch.load("train.pt")

In [None]:
def inputData(n_iter):
    """Pick the training pair for the 1-based iteration number `n_iter`.

    With `random_trainData` enabled a pair is drawn at random from
    `train_pairs`. Otherwise the pairs are visited in order, wrapping around
    after `train_df.shape[0]` iterations.
    """
    if random_trainData == True:
        return random.choice(train_pairs)

    n_rows = train_df.shape[0]
    offset = n_iter - 1
    completed_passes = int(offset/n_rows)
    # Position inside the current pass over the training set.
    position = int(offset - n_rows*completed_passes)
    return train_pairs[position]

# **Model**

---



In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that reads one source token per call.

    The embedding layer is initialised from the module-level `my_embeddings`
    matrix and kept frozen.
    """

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size (int): size of the source vocabulary.
            hidden_size (int): GRU hidden size; it is also the GRU input size,
                so it must equal the width of the pretrained embeddings.

        Raises:
            ValueError: if `my_embeddings` is not a 2-D matrix of width
                `hidden_size` with at least `input_size` rows.
        """
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size

        # `nn.Embedding.from_pretrained` is a classmethod that RETURNS a new layer.
        # Calling it on an existing instance and discarding the result (as before)
        # left the randomly initialised, trainable embedding in place, so the
        # pretrained vectors were never used.
        pretrained = torch.from_numpy(my_embeddings).float()
        if pretrained.dim() != 2 or pretrained.size(1) != hidden_size or pretrained.size(0) < input_size:
            raise ValueError(
                "Pretrained embeddings of shape %s do not fit input_size = %d, hidden_size = %d"
                %(tuple(pretrained.shape), input_size, hidden_size))
        # Keep exactly `input_size` rows so the layer has the same shape as the
        # original `nn.Embedding(input_size, hidden_size)`.
        self.embedding = nn.Embedding.from_pretrained(pretrained[:input_size].contiguous(), freeze = True)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: tensor holding a single token index.
            hidden: previous hidden state, shaped like the result of `initHidden`.

        Returns:
            (output, hidden) as returned by the GRU for this single step.
        """
        embedded = self.embedding(input)
        # The GRU is fed a sequence of length 1 with batch size 1.
        embedded = embedded.view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device = device)

In [None]:
class DecoderRNN(nn.Module):
    """Single-layer GRU decoder that emits log-probabilities over the target vocabulary, one token per call."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size (int): embedding width and GRU hidden size.
            output_size (int): size of the target vocabulary.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: tensor holding the index of the previous target token.
            hidden: previous hidden state of shape (1, 1, hidden_size).

        Returns:
            (log_probs, hidden): log-probabilities of shape (1, output_size)
            and the updated hidden state.
        """
        # Sequence length 1, batch size 1.
        embedded = self.embedding(input).view(1, 1, -1)
        activated = F.relu(embedded)
        gru_output, next_hidden = self.gru(activated, hidden)
        logits = self.out(gru_output[0])
        return self.softmax(logits), next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        zeros = torch.zeros(1, 1, self.hidden_size, device = device)
        return zeros

# **Train**

---



In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    Args:
        input_tensor: source token indices, one token per row.
        target_tensor: target token indices, one token per row.
        encoder, decoder: the two networks being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to every decoder step.
        max_length: kept for interface compatibility; not used. It used to size
            an `encoder_outputs` buffer that was allocated on every step and
            never read.

    Returns:
        float: summed loss of the step divided by the target length.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Only the final hidden state of the encoder is handed to the decoder.
    encoder_hidden = encoder.initHidden()
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device = device)
    decoder_hidden = encoder_hidden

    # One random draw per training step decides the decoding mode.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()/target_length

In [None]:
def validate(validation_pairs, encoder, decoder, criterion, max_length = MAX_LENGTH):
    """Compute the mean per-token loss over the first half of `validation_pairs`.

    The decoder always consumes its own previous prediction (no teacher
    forcing) and decoding of a pair stops early once it predicts `EOS_token`.

    Fixes over the previous version:
      * the per-pair loss is accumulated inside the loop; before, only the last
        evaluated pair contributed to the result;
      * the result is the mean over the evaluated pairs, so it stays on the same
        per-token scale as the training loss;
      * an input with fewer than two pairs raises a clear ValueError instead of
        a NameError.

    Args:
        validation_pairs: sequence of (input_tensor, target_tensor) pairs.
        encoder, decoder: networks to evaluate.
        criterion: loss applied to every decoder step.
        max_length: kept for interface compatibility; not used.

    Returns:
        float: average over the evaluated pairs of (summed loss / target length).

    Raises:
        ValueError: if there are fewer than two validation pairs.
    """
    # Only the first half of the pairs is scored, as before.
    n_eval = len(validation_pairs) // 2
    if n_eval == 0:
        raise ValueError("validate() needs at least two validation pairs")

    with torch.no_grad():
        val_loss = 0.0

        for i in range(n_eval):
            # Loaded tensors may live on another device than the models.
            input_tensor = validation_pairs[i][0].to(device)
            target_tensor = validation_pairs[i][1].to(device)

            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            encoder_hidden = encoder.initHidden()
            for ei in range(input_length):
                _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

            decoder_input = torch.tensor([[SOS_token]], device = device)
            decoder_hidden = encoder_hidden

            loss = 0
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # detach from history as input

                loss += criterion(decoder_output, target_tensor[di])
                if decoder_input.item() == EOS_token:
                    break

            val_loss += loss.item()/target_length

    return val_loss/n_eval

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience = 10, verbose = False):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best (negated) validation loss seen so far
        self.early_stop = False   # set once `counter` reaches `patience`
        # `np.inf`, not `np.Inf`: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, enc, dec):
        """Record a new validation loss.

        An equal or better loss saves a checkpoint and resets the counter; a
        worse one increments the counter and sets `early_stop` once it reaches
        `patience`.
        """
        # Negated so that "larger is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, enc, dec)
        elif score < self.best_score:
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, enc, dec)
            self.counter = 0

    def save_checkpoint(self, val_loss, enc, dec):
        '''Saves both state dicts ("encoder.pt", "decoder.pt") when validation loss decreases.'''
        if self.verbose:
            print(f"Validation loss decreased ({self.val_loss_min:.5f} --> {val_loss:.5f}).  Saving model ...")
        torch.save(enc.state_dict(), "encoder.pt")
        torch.save(dec.state_dict(), "decoder.pt")
        self.val_loss_min = val_loss

In [None]:
def trainIters(encoder, decoder, n_iters = 0, print_every = 1000, plot_every = 100, val_every = 1, learning_rate = 0.01):
    """Train both networks for `n_iters` single-pair steps with plain SGD.

    Args:
        encoder, decoder: networks to train.
        n_iters: number of training steps.
        print_every: report the mean training loss every this many steps.
        plot_every: store one averaged loss for the final plot every this many steps.
        val_every: run `validate` on the global `validation_pairs` every this many steps.
        learning_rate: learning rate of both optimizers.

    Side effects: an `EarlyStopping` tracker (patience 3) is updated after each
    validation and saves checkpoints on improvement, but its `early_stop` flag
    is not acted upon, so all `n_iters` steps always run. The final weights are
    written to "encoder_last.pt" / "decoder_last.pt" and the loss curve is plotted.
    """
    started_at = time.time()
    averaged_losses = []
    running_print_loss = 0  # cleared after every report
    running_plot_loss = 0   # cleared after every plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr = learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr = learning_rate)
    criterion = nn.NLLLoss()
    early_stopping = EarlyStopping(patience = 3, verbose = True)

    for step in range(1, n_iters + 1):
        sample = inputData(step)
        input_tensor = sample[0].to(device)
        target_tensor = sample[1].to(device)

        step_loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss/print_every
            running_print_loss = 0
            progress = step/n_iters
            print("%sIterations completed - %d (%d%%)\nLoss = %.4f\n" %(timeSince(started_at, progress), step, progress*100, mean_print_loss))

        if step % plot_every == 0:
            averaged_losses.append(running_plot_loss/plot_every)
            running_plot_loss = 0

        if step % val_every == 0:
            val_loss = validate(validation_pairs, encoder, decoder, criterion)
            print("Validate loss = ", val_loss, "\n")

            # Checkpoints on improvement; the early-stop break is deliberately left disabled.
            early_stopping(val_loss, encoder, decoder)

    torch.save(encoder.state_dict(), "encoder_last.pt")
    torch.save(decoder.state_dict(), "decoder_last.pt")
    showPlot(averaged_losses)

In [None]:
# hidden_size is both the embedding width and the GRU hidden size of the two models.
hidden_size = 400
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# N_Epochs passes over the training set, one sentence pair per iteration;
# validation (and checkpointing on improvement) runs every 20000 iterations.
trainIters(encoder, decoder, n_iters = N_Epochs*train_df.shape[0], print_every = 10000, plot_every = 1000, val_every = 20000)

Time Elapsed - 8m 1s (Expected time remaining - 213m 50s)
Iterations completed - 10000 (3%)
Loss = 4.8928

Time Elapsed - 16m 38s (Expected time remaining - 213m 8s)
Iterations completed - 20000 (7%)
Loss = 4.5703

Validate loss =  4.740001678466797 

Validation loss decreased (inf --> 4.74000).  Saving model ...
Time Elapsed - 25m 59s (Expected time remaining - 213m 18s)
Iterations completed - 30000 (10%)
Loss = 4.3820

Time Elapsed - 34m 37s (Expected time remaining - 204m 29s)
Iterations completed - 40000 (14%)
Loss = 4.2902

Validate loss =  3.9529294967651367 

Validation loss decreased (4.74000 --> 3.95293).  Saving model ...
Time Elapsed - 43m 59s (Expected time remaining - 199m 3s)
Iterations completed - 50000 (18%)
Loss = 4.2085

Time Elapsed - 52m 36s (Expected time remaining - 189m 39s)
Iterations completed - 60000 (21%)
Loss = 4.1299

Validate loss =  2.4829049110412598 

Validation loss decreased (3.95293 --> 2.48290).  Saving model ...
Time Elapsed - 61m 58s (Expected tim

NameError: ignored

In [None]:
# Save the current weights under the same file names trainIters writes when it
# finishes - presumably needed because the training run above was interrupted (TODO confirm).
torch.save(encoder.state_dict(), "encoder_last.pt")
torch.save(decoder.state_dict(), "decoder_last.pt")

# **Evaluation**

---



In [None]:
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    """Translate one Hindi sentence by greedy decoding.

    The previous version also copied every encoder output into a buffer with
    `max_length` rows that was never read; besides the wasted allocation, it
    raised an IndexError for inputs longer than `max_length` tokens. The buffer
    is gone, so input length is no longer limited.

    Args:
        encoder, decoder: trained networks.
        sentence: raw source sentence; it is converted with `tensorFromSentence`.
        max_length: maximum number of decoding steps.

    Returns:
        list of str: the predicted target tokens, without the end-of-sentence marker.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]

        # Only the final encoder hidden state is passed on to the decoder.
        encoder_hidden = encoder.initHidden()
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device = device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            # Greedy choice: the single most likely next token.
            _, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                break
            decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
def evaluateRandomly(encoder, decoder, n = 100):
    """Print the average sentence-level BLEU and METEOR over `n` randomly drawn pairs.

    Pairs are drawn (with replacement) from the global `pairs` list.

    Fixes over the previous version:
      * `sentence_bleu` expects a LIST of reference token lists; passing the
        flat token list made every word a separate "reference" that was then
        compared character by character;
      * `single_meteor_score` is given token lists, which recent NLTK releases
        require (the notebook installs the latest NLTK);
      * a non-positive `n` raises a ValueError instead of a ZeroDivisionError.

    Raises:
        ValueError: if `n` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of samples")

    total_bleu_scores = 0
    total_meteor_scores = 0

    for i in range(n):
        pair = random.choice(pairs)
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)

        # Same whitespace tokenisation as before for both sides.
        reference_tokens = pair[1].split(" ")
        hypothesis_tokens = output_sentence.split(" ")

        total_bleu_scores += sentence_bleu([reference_tokens], hypothesis_tokens)
        total_meteor_scores += single_meteor_score(reference_tokens, hypothesis_tokens)
        print(i)

    bleu_result = total_bleu_scores/n
    meteor_result = total_meteor_scores/n
    print("bleu score: ",bleu_result)
    print("meteor score: ",meteor_result)

In [None]:
# Score the in-memory models on 100 sentence pairs drawn at random from `pairs`.
evaluateRandomly(encoder, decoder, n = 100)

# **Submission**

---



In [None]:
# Test sentences for the submission; evaluateTest reads the "hindi" column of this frame.
test = pd.read_csv("test_week1.csv")

In [None]:
def evaluateTest():
    """Translate every sentence of the global `test` frame and write the results to "week1.txt".

    Fresh encoder/decoder instances are built and loaded from "encoder_last.pt"
    and "decoder_last.pt". One translation is written per line, each followed
    by " \\n".

    Fixes over the previous version:
      * the output file is opened in a `with` block, so it is flushed and closed
        even when a translation fails (it was never closed before, although the
        file is read back in a later cell);
      * the checkpoints are loaded with `map_location = device`, so weights
        saved on a GPU can also be loaded on a CPU-only runtime.
    """
    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)
    encoder.load_state_dict(torch.load("encoder_last.pt", map_location = device))
    decoder.load_state_dict(torch.load("decoder_last.pt", map_location = device))

    with open("week1.txt", 'w') as file1:
        for i, data in test.iterrows():
            print(i)
            sentence = data["hindi"]
            output_words = evaluate(encoder, decoder, sentence)
            output_sentence = ' '.join(output_words)
            file1.write(output_sentence + " \n")

In [None]:
# Generate the submission file "week1.txt" from the saved "*_last.pt" weights.
evaluateTest()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1164
1165
1166
1167
No token in emb. hindi ▁खड़ा
No token in emb. hindi ▁बड़ा
1168
1169
1170
No token in emb. hindi ड़ा
1171
1172
1173
1174
1175
1176
No token in emb. hindi ▁उड़
1177
1178
1179
1180
No token in emb. hindi |
1181
1182
1183
No token in emb. hindi ▁फ़र्क
1184
1185
1186
1187
No token in emb. hindi ▁फ़िर
No token in emb. hindi ▁तरफ़
No token in emb. hindi ▁फ़िर
No token in emb. hindi ▁दूरभाष
1188
No token in emb. hindi ▁बढ़ी
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
No token in emb. hindi |
No token in emb. hindi |
1204
1205
1206
No token in emb. hindi |
1207
No token in emb. hindi ़ी
No token in emb. hindi |
1208
1209
1210
1211
No token in emb. hindi |
1212
1213
1214
1215
No token in emb. hindi ▁अंग्रेज़
1216
1217
1218
No token in emb. hindi ▁पड़
No token in emb. hindi ▁ज़्यादा
1219
1220
1221
No token in emb. hindi ज़ा
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233

In [None]:
# Re-open the generated submission file for inspection.
# NOTE(review): this handle is never closed in the visible cells.
file2 = open("week1.txt", 'r')

In [None]:
# Read every predicted line into a list and show it as the cell output.
ref = file2.readlines()
ref

['- Will ? \n',
 'The you you you you you you you , \n',
 'They see come and see . \n',
 "I think I know about of you you , and you know , of you 're , and you . of , \n",
 'If I I , , , , , , , , , \n',
 'In the , on the , on , \n',
 'I was back and the and the and \n',
 'And , , \n',
 "I 'll have you the you before you they \n",
 "Everything n't do n't have \n",
 'Well , because you think you can tell you , because where we say , " where , and you , , you , , " and you , , world we you , and ? \n',
 "In , that 's n't that we , , , we , , , we , this , , we , we , we that the we , , we this , , we the , we that the we , , the we , the , we that the \n",
 "It 's a . \n",
 'MAN : \n',
 "Do n't stop that we want to go to . \n",
 "- Who 's fucking ... \n",
 "This 's a \n",
 "The n't n't you \n",
 "Here 's the world world world the world world world world the world world world world the world world world world the world world world world the world world world world , we world world world w