## Import libraries

In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.5 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 31.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64

In [2]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from keras.preprocessing.sequence import pad_sequences 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import nltk
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel


In [3]:
data = pd.read_csv('all_train.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,QA,ANS,TAG
0,0,what movies are about [ginger rogers],Top Hat|Kitty Foyle|The Barkleys of Broadway,has_tags_inv
1,1,which movies can be described by [moore],Fahrenheit 9/11|Far from Heaven,has_tags_inv
2,2,what films can be described by [occupation],Red Dawn|The Teahouse of the August Moon,has_tags_inv
3,3,which films are about [jacques tati],Mon Oncle|Playtime|Trafic,has_tags_inv
4,4,what movies are about [donnie darko],S. Darko,has_tags_inv


In [4]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """
    Normalizes latin chars with accent to their canonical decomposition
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z0-9_?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [5]:
# Now we do the preprocessing using pandas and lambdas
data["QA"] = data.QA.apply(lambda w: preprocess_sentence(w))
data["TAG"] = data.TAG.apply(lambda w: preprocess_sentence(w))
data.sample(10)


Unnamed: 0.1,Unnamed: 0,QA,ANS,TAG
285553,70467,<start> the movies that share actors with the ...,Victor Schertzinger|Joshua Logan|Raoul Walsh|S...,<start> starred_actors starred_actors_inv dire...
144288,48182,<start> what genres do the movies written by a...,Drama,<start> written_by_inv has_genre <end>
223463,8377,<start> who are the actors in the movies direc...,Simon Yam|Andy Lau|Melissa George|Nick Cheung|...,<start> directed_by directed_by_inv starred_ac...
243700,28614,<start> when did the movies directed by the th...,1957|1951|1997|1952,<start> directed_by directed_by_inv release_ye...
151545,55439,<start> which person wrote the movies starred ...,Jeff Pope,<start> starred_actors_inv written_by <end>
310938,95852,<start> what were the release years of the mov...,2002|2014|2000,<start> written_by written_by_inv release_year...
251138,36052,<start> what were the release years of the fil...,1965|1972|1963,<start> directed_by directed_by_inv release_ye...
319499,104413,<start> what were the release years of the mov...,2000,<start> written_by written_by_inv release_year...
269315,54229,<start> what types are the films directed by t...,Action|Drama|Horror|Comedy|Documentary,<start> directed_by directed_by_inv has_genre ...
16567,16567,<start> which words describe anchors aweigh <end>,bd-r|gene kelly|frank sinatra|george sidney,<start> has_tags <end>


#### Building Vocabulary Index


In [6]:
class LanguageIndex():
    def __init__(self, lang):
        """ lang are the list of phrases from each language"""
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))
        self.vocab = sorted(self.vocab)
        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1 # +1 because of pad token
        for word, index in self.word2idx.items():
            self.idx2word[index] = word      


inp_lang = LanguageIndex(data["QA"].values.tolist())
targ_lang = LanguageIndex(data["TAG"].values.tolist())
input_tensor = [[inp_lang.word2idx[s] for s in es.split(' ')]  for es in data["QA"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["TAG"].values.tolist()]
input_tensor[:10]

[[191, 25724, 16627, 1175, 264, 9402, 20321, 190],
 [191, 25736, 16627, 3840, 2087, 6271, 3677, 16481, 190],
 [191, 25724, 8277, 3840, 2087, 6271, 3677, 17517, 190],
 [191, 25736, 8277, 1175, 264, 11908, 23648, 190],
 [191, 25724, 16627, 1175, 264, 6706, 5854, 190],
 [191, 25724, 16627, 3840, 2087, 6271, 25971, 2278, 22922, 190],
 [191, 25724, 8277, 1175, 264, 6476, 190],
 [191, 25724, 16627, 3840, 2087, 6271, 25971, 4588, 17350, 190],
 [191, 25724, 16627, 1175, 264, 9213, 17962, 190],
 [191, 25724, 16627, 1175, 264, 20285, 14629, 190]]

In [7]:
target_tensor[:10]

[[2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1],
 [2, 9, 1]]

In [8]:
def max_length(tensor):
    return max(len(t) for t in tensor)
max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

In [9]:

input_tensor = pad_sequences(input_tensor, max_length_inp)
target_tensor = pad_sequences(target_tensor, max_length_tar)
len(target_tensor)

329282

In [10]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor,shuffle=True, test_size=0.2)
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(263425, 263425, 65857, 65857)

## Load data into DataLoader for Batching
This is just preparing the dataset so that it can be efficiently fed into the model through batches.

In [11]:
from torch.utils.data import Dataset, DataLoader

In [12]:
class MyData(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x,y,x_len
    
    def __len__(self):
        return len(self.data)

## Parameters
Let's define the hyperparameters and other things we need for training our NMT model.

In [13]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

In [14]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, device):
        x = self.embedding(x)
        self.hidden = self.initialize_hidden_state(device)
        output, self.hidden = self.gru(x, self.hidden) 
        return output, self.hidden

    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_sz, self.enc_units)).to(device)

In [15]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim + self.enc_units, 
                          self.dec_units,
                          batch_first=True)
        self.fc = nn.Linear(self.enc_units, self.vocab_size)
        
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        self.V = nn.Linear(self.enc_units, 1)
    
    def forward(self, x, hidden, enc_output):
        enc_output = enc_output.permute(1,0,2)
        hidden_with_time_axis = hidden.permute(1, 0, 2)
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        attention_weights = torch.softmax(self.V(score), dim=1)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        x = self.embedding(x)
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        output, state = self.gru(x)
        output =  output.view(-1, output.size(2))
        x = self.fc(output)
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

In [16]:
criterion = nn.CrossEntropyLoss()
def loss_function(real, pred):
    mask = real.ge(1).type(torch.cuda.FloatTensor)
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [17]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

encoder.to(device)
decoder.to(device)
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), 
                       lr=0.001)

In [18]:
def sort_batch(X, y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]    
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)

In [19]:
EPOCHS = 1
def eval2(encoder, decoder, sentence, max_length=120):
    encoder.eval()
    decoder.eval()
    total_loss = 0
    
    sentence = torch.unsqueeze(sentence, dim=1)
    with torch.no_grad():
        print(sentence.size())
        enc_output, enc_hidden = encoder(sentence.to(device), [sentence.size(0)], device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
        out_sentence = []
        for t in range(1, sentence.size(0)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                        dec_hidden.to(device), 
                                        enc_output.to(device))
            dec_input = predictions.argmax(dim=1).unsqueeze(1)
            # print(dec_input)
            out_sentence.append(targ_lang.idx2word[predictions.squeeze().argmax().item()])
            # print(out_sentence)
            
            # print(predictions.size())
    return out_sentence


encoder.batch_sz = 64
encoder.initialize_hidden_state(device)
decoder.batch_sz = 64
decoder.initialize_hidden_state()

for epoch in range(EPOCHS):    
    encoder.train()
    decoder.train()
    total_loss = 0
    
    for (batch, (inp, targ, inp_len)) in enumerate(dataset):
        loss = 0
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
            
            loss += loss_function(ys[:, t].long().to(device), predictions.to(device))
            dec_input = ys[:, t].unsqueeze(1)

        batch_loss = (loss / int(ys.size(1)))
        total_loss += batch_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.detach().item()))
       
        


Epoch 1 Batch 0 Loss 2.0728
Epoch 1 Batch 100 Loss 0.0334
Epoch 1 Batch 200 Loss 0.0013
Epoch 1 Batch 300 Loss 0.0007
Epoch 1 Batch 400 Loss 0.0003
Epoch 1 Batch 500 Loss 0.0002
Epoch 1 Batch 600 Loss 0.1978
Epoch 1 Batch 700 Loss 0.0139
Epoch 1 Batch 800 Loss 0.0003
Epoch 1 Batch 900 Loss 0.0001
Epoch 1 Batch 1000 Loss 0.0001
Epoch 1 Batch 1100 Loss 0.0001
Epoch 1 Batch 1200 Loss 0.0001
Epoch 1 Batch 1300 Loss 0.0000
Epoch 1 Batch 1400 Loss 0.0000
Epoch 1 Batch 1500 Loss 0.0001
Epoch 1 Batch 1600 Loss 0.0000
Epoch 1 Batch 1700 Loss 0.0000
Epoch 1 Batch 1800 Loss 0.0000
Epoch 1 Batch 1900 Loss 0.0000
Epoch 1 Batch 2000 Loss 0.0000
Epoch 1 Batch 2100 Loss 0.0011
Epoch 1 Batch 2200 Loss 0.0001
Epoch 1 Batch 2300 Loss 0.0002
Epoch 1 Batch 2400 Loss 0.0000
Epoch 1 Batch 2500 Loss 0.0000
Epoch 1 Batch 2600 Loss 0.0000
Epoch 1 Batch 2700 Loss 0.0000
Epoch 1 Batch 2800 Loss 0.0074
Epoch 1 Batch 2900 Loss 0.0000
Epoch 1 Batch 3000 Loss 0.0000
Epoch 1 Batch 3100 Loss 0.0000
Epoch 1 Batch 3200 L

In [55]:
# def translate_sentence(encoder, decoder, sentence, max_length=120):
#     encoder.eval()
#     decoder.eval()
    # total_loss = 0
    # sentence = sentence.transpose(0,1) 
    # with torch.no_grad():
    #     enc_output, enc_hidden = encoder(sentence.to(device),device)
    #     dec_hidden = enc_hidden
    #     dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
    #     out_sentence = []
    #     for t in range(1, sentence.size(0)):
    #         predictions, dec_hidden, _ = decoder(dec_input.to(device), 
    #                                     dec_hidden.to(device), 
    #                                     enc_output.to(device))
    #         dec_input = predictions.argmax(dim=1).unsqueeze(1)
    #         out_sentence.append(targ_lang.idx2word[predictions.squeeze().argmax().item()])

    # return out_sentence
def translate_sentence(encoder, decoder, sentence, max_length=120):
    encoder.eval()
    decoder.eval()
    total_loss = 0
    sentence = sentence.transpose(0, 1)
    with torch.no_grad():
        enc_output, enc_hidden = encoder(sentence.to(device), device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
        out_sentence = []
        for t in range(1, sentence.size(0)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                 dec_hidden.to(device),
                                                 enc_output.to(device))
            dec_input = predictions.argmax(dim=1).unsqueeze(1)
            next_word = targ_lang.idx2word[predictions.squeeze().argmax().item()]
            out_sentence.append(next_word)
            if next_word == '<end>':
                break
            

    return out_sentence


encoder.batch_sz = 1
encoder.initialize_hidden_state(device)
decoder.batch_sz = 1
decoder.initialize_hidden_state()

test_sentence = "<start> which films have the same screenwriter of a tree grows in brooklyn <end>"
test_sentence = [[inp_lang.word2idx[s] for s in test_sentence.split(' ')]]
test_sentence = pad_sequences(test_sentence, max_length_inp)
ret = translate_sentence(encoder, decoder, torch.tensor(test_sentence), max_length=max_length_tar)
ret

['<start>', 'written_by', 'written_by_inv', '<end>']

In [56]:
from torchtext.data.metrics import bleu_score

def predict_sentences(sentences):
    def predict_sentence(test_sentence):
        test_sentence = preprocess_sentence(test_sentence)
        test_sentence = [[inp_lang.word2idx[s] for s in test_sentence.split(' ') if s in inp_lang.word2idx]]
        test_sentence = pad_sequences(test_sentence, max_length_inp)
        return translate_sentence(encoder, decoder, torch.tensor(test_sentence), max_length=max_length_tar)

    return [predict_sentence(sentence) for sentence in sentences]





In [57]:

data_test = pd.read_csv('all_test.csv').dropna().iloc[:2000]

tags_pred = predict_sentences(data_test['QA'].values)
tags_true = [preprocess_sentence(sentence).split(' ') for sentence in data_test['TAG']]



In [58]:
tags_original = tags_pred.copy()
true_tags_original = tags_true.copy()
def clean_tags(tags):
    while len(tags) > 0 and tags[0].startswith("<"):
        tags.pop(0)
    if '<end>' in tags:
        return tags[:tags.index('<end>')]
    return [tag for tag in tags if not tag.startswith("<")]
new_tags_pred = [clean_tags(tags) for tags in tags_original]
new_tags_true = [clean_tags(tags) for tags in true_tags_original]
print(new_tags_pred[0], new_tags_true[0])

['starred_actors_inv'] ['starred_actors_inv']


In [None]:
print(data_test.QA.shape, len(tags_true[0]), len(tags_pred[0]))
tags_true_processed = np.array([' '.join(words) for words in new_tags_pred])
tags_pred_processed = np.array([' '.join(words) for words in new_tags_true])
print(tags_pred_processed.shape, tags_true_processed.shape)

In [80]:
results = pd.DataFrame(np.array([data_test.QA.values,tags_true_processed,tags_pred_processed]).transpose(),
                       columns=['QA','Original','Predicted'])
results.to_csv('results.csv')