In [1]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import Counter

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, WeightedRandomSampler
from transformers import BertTokenizer,BertModel

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def Bert_Tokenizer(model_name):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    return tokenizer
tokenizer = Bert_Tokenizer('bert-base-uncased')

In [3]:
def text_split():
    #filename = sys.argv[1]
    filename = 'Twitter_URL_Corpus_train.txt'
    file = open(filename, encoding='utf-8')
    lines = file.readlines()
    count = 0
    raw = []
    label = []
    for line in lines:
        tokens = line.split('\t')
        if int(tokens[2][1]) !=3:
            raw.append([tokens[0].strip(), tokens[1].strip()])
            if int(tokens[2][1]) <= 2:  
                label.append(0)  
            else:
                label.append(1)  
        count += 1
       
    return raw, label 

train, train_label=text_split()
print(train[0])

['How the metaphors we use to describe discovery affect men and women in the sciences', 'Light Bulbs or Seeds ? How Metaphors for Ideas Influence Judgments About Genius']


In [4]:
class CustomDataset(Dataset):
    def __init__(self, questions_list, tokenizer, labels, max_len):
        self.questions_list = questions_list
        self.labels = labels
        self.bert_encode = tokenizer
        
#         self.texts = df.text.values
#         self.labels = df.target.values
        self.max_len = max_len

    def __len__(self):
        return len(self.questions_list)
    
    def __getitem__(self, index):
        questions_pair = self.questions_list[index]
        q1 = questions_pair[0]
        q2 = questions_pair[1]
        q1_tokens = self.get_token_mask(q1,self.max_len)
        q2_tokens = self.get_token_mask(q2,self.max_len)
        
        return q1_tokens, q2_tokens, self.labels[index]
    
    def get_token_mask(self,text,max_len):
        
        tokens = []
        mask = []
        text = self.bert_encode.encode(text)
        size = len(text)
        pads = self.bert_encode.encode(['PAD']*(max(0,max_len-size)))
        tokens[:max(max_len,size)] = text[:max(max_len,size)]
        tokens = tokens + pads[1:-1]
        mask = [1]*size+[0]*len(pads[1:-1])
        tokens_len = len(tokens)
        
        return tokens
    
train_dataset = CustomDataset(train,tokenizer, train_label, 120)
print(next(enumerate(train_dataset)))

(0, ([2129, 1996, 19240, 2015, 2057, 2224, 2000, 6235, 5456, 7461, 2273, 1998, 2308, 1999, 1996, 4163, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100], [2422, 25548, 2030, 8079, 1029, 2129, 19240, 2015, 2005, 4784, 3747, 26186, 2055, 11067, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10

In [20]:
class CustomCollate:
    def custom_collate(self, batch):

        # batch = list of tuples where each tuple is of the form ([i1, i2, i3], [j1, j2, j3], label)
        q1_list = []
        q2_list = []
        labels = []
        for training_example in batch:
#             print(batch)
            q1_list.append(training_example[0])
            q2_list.append(training_example[1])
            labels.append(training_example[2])
          
        
        q1_lengths = [len(q) for q in q1_list]
        q2_lengths = [len(q) for q in q2_list]
        
        return q1_list, q1_lengths, q2_list, q2_lengths, labels
#         return q1_list, q2_list, labels

    def __call__(self, batch):
        return self.custom_collate(batch)

In [21]:
validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
shuffle_dataset = True
random_seed = 32

if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# print(train_indices)
train_sampler = SubsetRandomSampler(train_indices)
# print(next(enumerate(train_sampler)))
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, sampler=train_sampler, collate_fn=CustomCollate())
val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, sampler=validation_sampler, collate_fn=CustomCollate())

for i, (q1_list, q1_lengths, q2_list, q2_lengths, labels) in enumerate(train_loader):
    print(i, (q1_list, q1_lengths, q2_list, q2_lengths, labels))
    break
print ('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

0 ([[2129, 2865, 11730, 2024, 9992, 4214, 3889, 7221, 8540, 1037, 16939, 17678, 16098, 16089, 3367, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100], [7279, 3401, 7283, 2635, 2058, 6653, 3947, 2013, 13144, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100

In [26]:
# EMBEDDING_REQUIRES_GRAD = False
# HIDDEN_CELLS = 25
# NUM_LAYERS = 1
class CustomNetwork(nn.Module):
    def __init__(self,pre_trained='bert-base-uncased'):
        super().__init__()
        
        self.bert = BertModel.from_pretrained(pre_trained)
        self.hidden_size = self.bert.config.hidden_size
        self.lstm = nn.LSTM(self.hidden_size,self.hidden_size,bidirectional=True)
        self.clf = nn.Linear(self.hidden_size*2,1)
    
    # Manhattan Distance Calculator
    def exponent_neg_manhattan_distance(self, x1, x2):
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=0))

    def forward_once(self,inputs, input_length):
        encoded_layers, pooled_output = self.bert(input_ids=torch.tensor(inputs))
        encoded_layers = encoded_layers.permute(1, 0, 2)
        enc_hiddens, (last_hidden, last_cell) = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(encoded_layers, input_length))
        output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
#         output_hidden = F.dropout(output_hidden,0.2)
        output = self.clf(output_hidden)
        
        return output

    def forward(self, q1, q1_lengths, q2, q2_lengths):
        output1 = self.forward_once(q1, q1_lengths)
        print('----------q1----------')
        print(output1)
        output2 = self.forward_once(q2, q2_lengths)
        print('----------q2----------')
        similarity_score = torch.zeros(output1.size()[0])
        # Calculate Similarity Score between both questions in a single pair
        for index in range(output1.size()[0]):
            # Sequence lenghts are being used to index and retrieve the activations before the zero padding since they were not part of original question
            q1 = output1[index, q1_lengths[index] - 1, :]
#             print('oh')
#             print(q1.size())
            q2 = output2[index, q2_lengths[index] - 1, :]
#             print('ho')
#             print(q2.size())
            similarity_score[index] = self.exponent_neg_manhattan_distance(q1, q2)
        return similarity_score

In [27]:
model = CustomNetwork()

total_step = len(train_loader)
# Threshold 0.5. Since similarity score will be a value between 0 and 1, we will consider all question pair with values greater than threshold as Duplicate
threshold = torch.Tensor([0.5])

# define hyperparameter
num_epochs = 1
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001 )

for epoch in range(num_epochs):
    loss_history = []
    model.train(True)
    train_correct_total = 0
    for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(train_loader):
#         print(labels)
        labels = torch.FloatTensor(labels)
        
        # Clear grads
        optimizer.zero_grad()
        
        # Run the forward pass
        similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
        print(1)

----------q1----------
tensor([[-0.1724],
        [-0.2094]], grad_fn=<AddmmBackward0>)
----------q2----------


IndexError: too many indices for tensor of dimension 2