In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 10.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [7]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from keras.preprocessing.sequence import pad_sequences 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import nltk
from transformers import BertTokenizer, BertModel


In [26]:
# Load the question / answer / tag table; the path is relative to the
# notebook's working directory.
data = pd.read_csv('parttrain.csv')
# Show the first five rows as a quick sanity check of the columns.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,QA,ANS,TAG
0,0,what movies are about [ginger rogers],Top Hat|Kitty Foyle|The Barkleys of Broadway,has_tags_inv
1,1,which movies can be described by [moore],Fahrenheit 9/11|Far from Heaven,has_tags_inv
2,2,what films can be described by [occupation],Red Dawn|The Teahouse of the August Moon,has_tags_inv
3,3,which films are about [jacques tati],Mon Oncle|Playtime|Trafic,has_tags_inv
4,4,what movies are about [donnie darko],S. Darko,has_tags_inv


In [151]:
# Strips accent marks from a unicode string
def unicode_to_ascii(s):
    """
    Remove combining marks from `s`.

    The string is put into NFD form (base character followed by its
    combining marks) and every character whose Unicode category is 'Mn'
    is dropped. Characters without a decomposition are kept unchanged, so
    the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def preprocess_sentence(w):
    """
    Normalise one sentence and wrap it in '<start>' / '<end>' markers.

    Steps, in order: lowercase and strip, remove accents via
    unicode_to_ascii, put spaces around ? . ! , ¿, collapse runs of spaces
    and double quotes, replace every run of characters outside
    a-z A-Z 0-9 _ ? . ! , ¿ with a single space, trim, and add the markers.
    """
    text = unicode_to_ascii(w.lower().strip())

    # Separate punctuation from the neighbouring word,
    # e.g. "he is a boy." => "he is a boy . "
    text = re.sub(r"([?.!,¿])", r" \1 ", text)

    # The character class contains both the space and the double quote, so
    # any run of those collapses to one space.
    text = re.sub(r'[" "]+', " ", text)

    # Anything that is not a letter, digit, underscore or one of ? . ! , ¿
    # (brackets, slashes, ...) becomes a single space.
    text = re.sub(r"[^a-zA-Z0-9_?.!,¿]+", " ", text)

    text = text.strip()

    # Start / end markers tell the model where a sequence begins and ends.
    return '<start> ' + text + ' <end>'

In [None]:
# Normalise the question and tag columns, overwriting them in place.
# NOTE: this cell is not idempotent - running it again feeds the already
# wrapped "<start> ... <end>" strings through preprocess_sentence a second time.
for text_column in ("QA", "TAG"):
    data[text_column] = data[text_column].apply(preprocess_sentence)
data.sample(10)


#### Building Vocabulary Index


In [None]:

class LanguageIndex():
    """
    Word <-> integer-id lookup tables built from a list of phrases.

    Attributes:
        lang: the phrases passed to the constructor.
        vocab: sorted list of the distinct space-separated tokens.
        word2idx: token -> id, with '<pad>' at id 0 and the sorted
            vocabulary at ids 1..len(vocab).
        idx2word: the inverse mapping of word2idx.
    """

    def __init__(self, lang):
        """ lang is the list of phrases to index; the index is built immediately """
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()

    def create_index(self):
        """ Fill vocab, word2idx and idx2word from self.lang """
        # Tokens are obtained by splitting on single spaces only.
        for sentence in self.lang:
            self.vocab.update(sentence.split(' '))
        self.vocab = sorted(self.vocab)

        # Id 0 is reserved for padding, so real tokens are numbered from 1.
        self.word2idx['<pad>'] = 0
        self.word2idx.update(
            (token, position) for position, token in enumerate(self.vocab, start=1)
        )
        self.idx2word.update(
            (position, token) for token, position in self.word2idx.items()
        )


# One vocabulary per side: questions are the input, tags are the target.
inp_lang = LanguageIndex(data["QA"].tolist())
targ_lang = LanguageIndex(data["TAG"].tolist())

# Encode each sentence as a list of token ids (same single-space split that
# LanguageIndex used, so every token is guaranteed to be in the vocabulary).
input_tensor = [
    [inp_lang.word2idx[token] for token in sentence.split(' ')]
    for sentence in data["QA"].tolist()
]
target_tensor = [
    [targ_lang.word2idx[token] for token in sentence.split(' ')]
    for sentence in data["TAG"].tolist()
]
input_tensor[:10]

In [None]:
from transformers import BertTokenizer, BertModel

# Small hand-written sample used to look at what the BERT tokenizer produces.
# Only the first sentence carries a leading [CLS]; each one ends in [SEP].
dd = [
    '[CLS] A man is eating food. [SEP]',
    'A man is eating a piece of bread. [SEP]',
    'The girl is carrying a baby. [SEP]',
]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The sentences are concatenated with no separator before tokenizing.
tokenized_text = tokenizer.tokenize(''.join(dd))

# Map the token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# The round-trip back to a string is computed but not displayed, because it
# is not the last expression of the cell.
tokenizer.convert_tokens_to_string(tokenized_text)

indexed_tokens

In [31]:
from transformers import BertTokenizer, BertModel
from transformers import logging

# Set the transformers library log level to WARNING.
# NOTE(review): the "Some weights of the model checkpoint ... were not used"
# message still appears in this cell's output at this level - presumably the
# goal was to silence it; confirm whether a stricter level is wanted.
logging.set_verbosity_warning()

def get_word_embeddings(sentences=None):
    """
    Compute contextual BERT vectors for every token of every sentence.

    For each sentence the token vector is the concatenation of the last four
    hidden layers of 'bert-base-uncased', in the order [-1, -2, -3, -4].

    Args:
        sentences: iterable of strings to embed. Defaults to the 'QA' column
            of the notebook-level `data` frame.

    Returns:
        A list with one tensor per sentence. Each tensor has one row per
        word-piece token (including the added [CLS] / [SEP]); a list is used
        because sentences differ in token count.
    """
    if sentences is None:
        sentences = data['QA'].values

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load the model once. Re-loading the checkpoint for every sentence is
    # what made the original version impractically slow.
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_hidden_states=True)
    model.eval()

    embedding_matrix = []
    with torch.no_grad():
        for sent in sentences:
            tokenized_text = tokenizer.tokenize("[CLS] " + sent + " [SEP]")
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            # Batch of one sentence.
            tokens_tensor = torch.tensor([indexed_tokens])

            outputs = model(tokens_tensor)
            # With output_hidden_states=True the third output holds one
            # tensor per layer, each with a leading batch axis of size 1.
            hidden_states = outputs[2]

            # Concatenate the last four layers along the feature axis and
            # drop the batch axis, giving one row per token.
            token_vecs_cat = torch.cat(
                [hidden_states[layer] for layer in (-1, -2, -3, -4)], dim=-1
            ).squeeze(0)
            embedding_matrix.append(token_vecs_cat)

    return embedding_matrix
        



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.trans

KeyboardInterrupt: ignored

In [None]:
# NOTE(review): `embedding_matrix` is only assigned as a local list inside
# get_word_embeddings(), which does not return it, and no visible cell binds
# the name at notebook scope - so this lookup presumably fails on a fresh
# kernel (a plain Python list also has no `.shape`). Confirm and fix the
# producer before relying on this cell.
embedding_matrix.shape

In [155]:
def max_length(tensor):
    """Return the length of the longest sequence contained in `tensor`."""
    return max(map(len, tensor))
# Longest encoded question and longest encoded tag; used as padding widths.
max_length_inp = max_length(input_tensor)
max_length_tar = max_length(target_tensor)

In [156]:

# Pad every id sequence to a common width so they can be stacked into arrays.
# NOTE(review): no `padding=` argument is given, so the Keras default applies -
# presumably zeros are added in front of each sequence; confirm that is wanted.
input_tensor = pad_sequences(input_tensor, maxlen=max_length_inp)
target_tensor = pad_sequences(target_tensor, maxlen=max_length_tar)
len(target_tensor)

329282

In [157]:
# Hold out 20% of the pairs for validation. The seed is fixed so that a
# Restart & Run All produces the same split (and comparable losses) every time.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, shuffle=True, random_state=42)
# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(263425, 263425, 65857, 65857)

In [158]:
from torch.utils.data import Dataset, DataLoader

In [159]:
class MyData(Dataset):
    """
    Dataset of (input ids, target ids, input length) triples.

    The length of each input row is the number of entries that differ
    from 0, i.e. padding zeros are not counted.
    """

    def __init__(self, X, y):
        """Store inputs `X`, targets `y` and pre-compute each row's non-zero count."""
        self.data = X
        self.target = y
        self.length = [np.not_equal(row, 0).sum() for row in X]

    def __getitem__(self, index):
        """Return (x, y, x_len) for the sample at `index`."""
        return self.data[index], self.target[index], self.length[index]

    def __len__(self):
        """Number of samples."""
        return len(self.data)

In [160]:
# --- Hyper-parameters ---
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)
N_BATCH = BUFFER_SIZE // BATCH_SIZE   # full batches per epoch
embedding_dim = 256
units = 1024

# Vocabulary sizes include the '<pad>' entry at id 0.
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

# --- Datasets and loaders ---
train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

# drop_last=True discards the final short batch, so every batch has exactly
# BATCH_SIZE rows.
dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [161]:
class Encoder(nn.Module):
    """
    Embedding + single-layer GRU encoder.

    Args:
        vocab_size: number of rows of the embedding table when it is created
            from scratch.
        embedding_matrix: either an int, taken as the embedding dimension of a
            freshly initialised trainable nn.Embedding, or a 2-D table of
            pretrained weights (rows = token ids, columns = features) that is
            loaded with nn.Embedding.from_pretrained (frozen by default).
        enc_units: hidden size of the GRU.
        batch_sz: default batch size used by initialize_hidden_state when no
            explicit size is given.
    """

    def __init__(self, vocab_size, embedding_matrix, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size

        if isinstance(embedding_matrix, int):
            # Callers in this notebook pass the embedding dimension here.
            self.embedding_dim = embedding_matrix
            self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        else:
            weights = torch.as_tensor(embedding_matrix, dtype=torch.float)
            if weights.dim() != 2:
                raise ValueError(
                    "embedding_matrix must be an int or a 2-D weight table, "
                    "got a tensor with {} dimension(s)".format(weights.dim()))
            self.embedding_dim = weights.size(1)
            self.embedding = nn.Embedding.from_pretrained(weights)

        # The GRU is not batch_first: it consumes (seq_len, batch, features).
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)

    def forward(self, x, device):
        """
        Encode a batch of id sequences.

        Args:
            x: integer tensor laid out as (seq_len, batch).
            device: device on which the initial hidden state is created.

        Returns:
            (output, hidden): the GRU outputs for every time step and the
            final hidden state of shape (1, batch, enc_units).
        """
        x = self.embedding(x)
        # Size the initial state from the actual batch so that a batch of any
        # width works without mutating self.batch_sz first.
        self.hidden = self.initialize_hidden_state(device, batch_sz=x.size(1))
        output, self.hidden = self.gru(x, self.hidden)
        return output, self.hidden

    def initialize_hidden_state(self, device, batch_sz=None):
        """Return a zero hidden state of shape (1, batch_sz, enc_units) on `device`.

        `batch_sz` defaults to self.batch_sz.
        """
        if batch_sz is None:
            batch_sz = self.batch_sz
        return torch.zeros((1, batch_sz, self.enc_units), device=device)

In [162]:
class Decoder(nn.Module):
    """
    One-step GRU decoder with additive attention over the encoder outputs.

    Args:
        vocab_size: size of the target vocabulary (width of the logits).
        embedding_dim: size of the target token embeddings.
        dec_units: hidden size of the decoder GRU.
        enc_units: feature size of the encoder outputs.
        batch_sz: batch size used by initialize_hidden_state.

    NOTE: `hidden` is fed through W2, which expects enc_units features, while
    the state returned by forward has dec_units features. Feeding the returned
    state back in (as the decode loops do) therefore requires
    enc_units == dec_units.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        # Input at each step is the token embedding concatenated with the
        # attention context (which has enc_units features).
        self.gru = nn.GRU(self.embedding_dim + self.enc_units,
                          self.dec_units,
                          batch_first=True)
        # The projection consumes the GRU output, which is dec_units wide.
        self.fc = nn.Linear(self.dec_units, self.vocab_size)

        # used for attention
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        # V scores the tanh(W1 + W2) activations, which are dec_units wide.
        self.V = nn.Linear(self.dec_units, 1)

    def forward(self, x, hidden, enc_output):
        """
        Run one decoding step.

        Args:
            x: token ids for this step, laid out as (batch, 1).
            hidden: state used to query the attention, (1, batch, enc_units).
            enc_output: encoder outputs, (seq_len, batch, enc_units).

        Returns:
            (logits, state, attention_weights) with shapes
            (batch, vocab_size), (1, batch, dec_units), (batch, seq_len, 1).
        """
        # Move to batch-major so the attention sums over the time axis (dim 1).
        enc_output = enc_output.permute(1, 0, 2)
        hidden_with_time_axis = hidden.permute(1, 0, 2)

        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        # Normalise over source positions.
        attention_weights = torch.softmax(self.V(score), dim=1)
        context_vector = torch.sum(attention_weights * enc_output, dim=1)

        x = self.embedding(x)
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        # NOTE(review): the GRU is called without an initial state, so `hidden`
        # only influences the attention scores - confirm this is intended.
        output, state = self.gru(x)
        output = output.view(-1, output.size(2))
        x = self.fc(output)
        return x, state, attention_weights

    def initialize_hidden_state(self):
        """Return a zero state of shape (1, batch_sz, dec_units) on the CPU."""
        return torch.zeros((1, self.batch_sz, self.dec_units))

In [163]:
# reduction='none' keeps one loss value per target so that padded positions
# can be zeroed individually; with the default mean reduction the mask would
# only rescale an already-averaged scalar.
criterion = nn.CrossEntropyLoss(reduction='none')
def loss_function(real, pred):
    """
    Cross-entropy that ignores padded targets.

    Args:
        real: integer class ids, shape (batch,); id 0 marks padding.
        pred: unnormalised scores, shape (batch, num_classes).

    Returns:
        Scalar tensor: per-target losses with padded positions set to zero,
        averaged over all positions of the batch (padded ones included in
        the denominator).
    """
    # Build the mask on the same device / dtype as the predictions so the
    # function works on both CPU and GPU.
    mask = real.ge(1).to(pred.dtype)
    loss_ = criterion(pred, real) * mask
    return torch.mean(loss_)

In [164]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Encoder and decoder share the same hidden size (`units`).
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
for seq2seq_module in (encoder, decoder):
    seq2seq_module.to(device)

# A single optimizer updates the parameters of both networks.
optimizer = optim.Adam([*encoder.parameters(), *decoder.parameters()],
                       lr=0.001)

In [165]:
def sort_batch(X, y, lengths):
    """
    Reorder a batch by decreasing sequence length.

    Returns the inputs transposed from (batch x seq) to (seq x batch), the
    targets in the same new row order (not transposed), and the sorted lengths.
    """
    sorted_lengths, order = torch.sort(lengths, dim=0, descending=True)
    X_sorted, y_sorted = X[order], y[order]
    return torch.transpose(X_sorted, 0, 1), y_sorted, sorted_lengths

In [166]:
EPOCHS = 10
def eval2(encoder, decoder, sentence, max_length=120):
    """
    Greedily decode a single encoded sentence.

    Args:
        encoder: module called as encoder(x, device) -> (outputs, hidden).
        decoder: module called as decoder(dec_input, hidden, enc_output).
        sentence: tensor of input token ids; a batch axis of size 1 is
            inserted at dim 1 before encoding.
        max_length: currently unused; kept so existing calls keep working.

    Returns:
        List of predicted target words, one per decoding step. The number
        of steps is the input length minus one.
    """
    encoder.eval()
    decoder.eval()

    sentence = torch.unsqueeze(sentence, dim=1)
    out_sentence = []
    with torch.no_grad():
        # Encoder.forward takes (x, device); the extra lengths argument that
        # used to be passed here made the call fail.
        enc_output, enc_hidden = encoder(sentence.to(device), device)
        dec_hidden = enc_hidden
        # Decoding starts from the <start> token for a batch of one.
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
        for t in range(1, sentence.size(0)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                        dec_hidden.to(device),
                                        enc_output.to(device))
            # Greedy decoding: the most likely token becomes the next input.
            dec_input = predictions.argmax(dim=1).unsqueeze(1)
            out_sentence.append(targ_lang.idx2word[predictions.squeeze().argmax().item()])
    return out_sentence


# Training batches are always BATCH_SIZE wide (the loader drops the last short
# batch), so size both modules' default hidden state accordingly.
encoder.batch_sz = BATCH_SIZE
decoder.batch_sz = BATCH_SIZE

for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()
    # Plain float accumulator: adding loss tensors here would keep every
    # batch's autograd history referenced for the whole epoch.
    total_loss = 0.0

    for (batch, (inp, targ, inp_len)) in enumerate(dataset):
        loss = 0
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), device)
        dec_hidden = enc_hidden
        # Every target sequence starts decoding from the <start> token.
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * ys.size(0))
        # run code below for every timestep in the ys batch
        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                         dec_hidden.to(device),
                                         enc_output.to(device))

            loss += loss_function(ys[:, t].long().to(device), predictions.to(device))
            # use teacher forcing - feeding the target as the next input
            dec_input = ys[:, t].unsqueeze(1)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Average per time step, detached to a Python float for reporting.
        batch_loss = loss.item() / int(ys.size(1))
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss))

    # Mean batch loss over the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / max(len(dataset), 1)))
       
        


Epoch 1 Batch 0 Loss 1.9958
Epoch 1 Batch 100 Loss 0.0107
Epoch 1 Batch 200 Loss 0.0010
Epoch 1 Batch 300 Loss 0.0081
Epoch 1 Batch 400 Loss 0.0094
Epoch 1 Batch 500 Loss 0.0002
Epoch 1 Batch 600 Loss 0.0002
Epoch 1 Batch 700 Loss 0.0002
Epoch 1 Batch 800 Loss 0.0003
Epoch 1 Batch 900 Loss 0.0001
Epoch 1 Batch 1000 Loss 0.0005
Epoch 1 Batch 1100 Loss 0.0125
Epoch 1 Batch 1200 Loss 0.0000
Epoch 1 Batch 1300 Loss 0.0000
Epoch 1 Batch 1400 Loss 0.0000
Epoch 1 Batch 1500 Loss 0.0000
Epoch 1 Batch 1600 Loss 0.0000
Epoch 1 Batch 1700 Loss 0.0001
Epoch 1 Batch 1800 Loss 0.0000
Epoch 1 Batch 1900 Loss 0.0000
Epoch 1 Batch 2000 Loss 0.0000
Epoch 1 Batch 2100 Loss 0.0000
Epoch 1 Batch 2200 Loss 0.0000
Epoch 1 Batch 2300 Loss 0.0000
Epoch 1 Batch 2400 Loss 0.0000
Epoch 1 Batch 2500 Loss 0.0000
Epoch 1 Batch 2600 Loss 0.0000
Epoch 1 Batch 2700 Loss 0.0007
Epoch 1 Batch 2800 Loss 0.0015
Epoch 1 Batch 2900 Loss 0.0001
Epoch 1 Batch 3000 Loss 0.0001
Epoch 1 Batch 3100 Loss 0.0000
Epoch 1 Batch 3200 L

KeyboardInterrupt: ignored

In [None]:
def translate_sentence(encoder, decoder, sentence, max_length=120):
    """
    Greedily decode one encoded sentence into target words.

    `sentence` has its first two axes swapped before encoding - presumably it
    arrives as (batch=1, seq_len); confirm against the caller. One word is
    produced per decoding step and the number of steps is the input length
    minus one. `max_length` is accepted but not used.
    """
    encoder.eval()
    decoder.eval()

    source = sentence.transpose(0, 1)
    decoded_words = []

    with torch.no_grad():
        enc_output, enc_hidden = encoder(source.to(device), device)
        dec_hidden = enc_hidden
        # Start decoding from the <start> token (batch of one).
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)

        for step in range(1, source.size(0)):
            predictions, dec_hidden, _attention = decoder(
                dec_input.to(device),
                dec_hidden.to(device),
                enc_output.to(device),
            )
            # The arg-max token is both emitted and fed back as the next input.
            best_word_id = predictions.squeeze().argmax().item()
            decoded_words.append(targ_lang.idx2word[best_word_id])
            dec_input = predictions.argmax(dim=1).unsqueeze(1)

    return decoded_words

# Inference decodes one sentence at a time, so switch both modules to a batch
# size of 1 before calling translate_sentence.
for seq2seq_module in (encoder, decoder):
    seq2seq_module.batch_sz = 1
# These calls only build and return zero tensors; nothing is stored.
encoder.initialize_hidden_state(device)
decoder.initialize_hidden_state()
