In [1]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import Counter

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, WeightedRandomSampler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def text_split():
    #filename = sys.argv[1]
    filename = 'Twitter_URL_Corpus_train.txt'
    file = open(filename, encoding='utf-8')
    lines = file.readlines()
    count = 0
    raw = []
    #raw1 = []
    #raw2 = []
    label = []
    # Strips the newline character
    for line in lines:
        tokens = line.split('\t')
        if int(tokens[2][1]) !=3:
            raw.append([tokens[0].strip(), tokens[1].strip()])
            if int(tokens[2][1]) <= 2:  
                label.append(0)  
            else:
                label.append(1)  
#         if int(tokens[2][1]) !=3:
#             raw1.append(tokens[0].strip())
#             raw2.append(tokens[1])
#             if int(tokens[2][1]) <= 2:  
#                 label.append('False')  
#             else:
#                 label.append('True')  
           
        count += 1
       
    return raw, label 
#text_split()
train, train_label=text_split()
print(train[0])

['How the metaphors we use to describe discovery affect men and women in the sciences', 'Light Bulbs or Seeds ? How Metaphors for Ideas Influence Judgments About Genius']


In [3]:

def word2index(input_list):
    encoded_list=[]
    for sentence_list in input_list:
        vec_list=[]
        for sentences in sentence_list:
            words = sentences.split(' ')
        
            vocab = Counter(words) 
            vocab = sorted(vocab, key=vocab.get, reverse=True)
            vocab_size = len(vocab)
        
            word2idx = {word: ind for ind, word in enumerate(vocab)} 
        
            encoded_sentences = [word2idx[word] for word in words]
            vec_list.append(encoded_sentences)
        encoded_list.append(vec_list)
    return encoded_list
encoded_train=word2index(train)
#encoded_train_2=word2index(train_2)

print(encoded_train[0])
#encoded_train.shape
#print(encoded_train_2[0])
#print(encoded_train_2[1])

# emb_dim = 3 
# emb_layer = nn.Embedding(vocab_size, emb_dim)
# word_vectors = emb_layer(torch.LongTensor(encoded_sentences))

[[1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 13], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]


In [4]:
class Language:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words + 1
            self.word2count[word] = 1
            self.index2word[self.n_words + 1] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

language = Language()
for data in [train]:
    for question_pair in data:
#         print(question_pair)
        q1 = question_pair[0]
        q2 = question_pair[1]
        language.addSentence(q1)
        language.addSentence(q2)
# print(language.word2index)
n_vocabulary_words = len(language.word2index)
print ('Total Unique Vocabulary Words: ', n_vocabulary_words)

Total Unique Vocabulary Words:  48361


In [5]:
class QuestionsDataset(Dataset):
    def __init__(self, questions_list, word2index, labels):
        self.questions_list = questions_list
        self.labels = labels
        self.word2index = word2index
        
    def __len__(self):
        return len(self.questions_list)
    
    def __getitem__(self, index):
        questions_pair = self.questions_list[index]
        q1 = questions_pair[0]
        q1_indices = []
        for word in q1.split():
            q1_indices.append(self.word2index[word])
            
        q2 = question_pair[1]
        q2_indices = []
        for word in q2.split():
            q2_indices.append(self.word2index[word])
            
        # q1_indices and q2_indices are lists of indices against words used in the sentence 
        return q1_indices, q2_indices, self.labels[index]
    
train_dataset = QuestionsDataset(train, language.word2index, train_label)
print(next(enumerate(train_dataset)))

(0, ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14], [170, 240, 1591, 5308, 1070, 8069, 6, 59, 60, 2, 1253, 151, 172, 40688, 43579, 93, 3138, 736, 6, 172, 29430, 11, 43580, 11, 6, 555, 4, 43581, 2388, 117, 172, 43582], 0))


In [6]:
class CustomCollate:
    def custom_collate(self, batch):

        # batch = list of tuples where each tuple is of the form ([i1, i2, i3], [j1, j2, j3], label)
        q1_list = []
        q2_list = []
        labels = []
        for training_example in batch:
#             print(batch)
            q1_list.append(training_example[0])
            q2_list.append(training_example[1])
            labels.append(training_example[2])
          
        
        q1_lengths = [len(q) for q in q1_list]
        q2_lengths = [len(q) for q in q2_list]
        
        return q1_list, q1_lengths, q2_list, q2_lengths, labels

    def __call__(self, batch):
        return self.custom_collate(batch)

In [7]:
validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
shuffle_dataset = True
random_seed = 32

if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# print(train_indices)
train_sampler = SubsetRandomSampler(train_indices)
# print(next(enumerate(train_sampler)))
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, sampler=train_sampler, collate_fn=CustomCollate())
val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, sampler=validation_sampler, collate_fn=CustomCollate())

for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(train_loader):
    print(i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels))
    break
print ('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

0 ([[444, 199, 10137, 238, 472, 12979, 11, 238, 155, 7579, 472, 12979, 322]], [13], [[170, 240, 1591, 5308, 1070, 8069, 6, 59, 60, 2, 1253, 151, 172, 40688, 43579, 93, 3138, 736, 6, 172, 29430, 11, 43580, 11, 6, 555, 4, 43581, 2388, 117, 172, 43582]], [32], [0])
Training Set Size 33760, Validation Set Size 8440


In [8]:
EMBEDDING_PATH = './GoogleNews-vectors-negative300.bin'
EMBEDDING_DIMENSION = 300
# Load pre-trained embeddings from word2vec
word2vec_model = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=True)
print(word2vec_model)
# Convert word2vec embeddings into FloatTensor
word2vec_weights = torch.FloatTensor(word2vec_model.vectors)

# Create a random weight tensor of the shape (n_vocabulary_words + 1, EMBEDDING_DIMENSION) and place each word's embedding from word2vec at the index assigned to that word
# Two key points:
# 1. Weights tensor has been initialized randomly so that the words which are part of our dataset vocabulary but are not present in word2vec are given a random embedding
# 2. Embedding at 0 index is all zeros. This is the embedding for the padding that we will do for batch processing
weights = torch.randn(n_vocabulary_words + 1, EMBEDDING_DIMENSION)
weights[0] = torch.zeros(EMBEDDING_DIMENSION)
for word, lang_word_index in language.word2index.items():
    if word in word2vec_model:
        weights[lang_word_index] = torch.FloatTensor(word2vec_model.word_vec(word))

# del word2vec_model
# del word2vec_weights

<gensim.models.keyedvectors.KeyedVectors object at 0x000001FE130A6048>




In [9]:
EMBEDDING_DIMENSION = 300
test_weights = torch.randn(n_vocabulary_words + 1, EMBEDDING_DIMENSION)
for word, lang_word_index in language.word2index.items():
    if word in word2vec_model:
        test_weights[lang_word_index] = torch.FloatTensor(word2vec_model.word_vec(word))
        print(len(word2vec_model.word_vec(word)))
        print(word, lang_word_index)
        print(test_weights[1].size())
        print(test_weights[1])
        break

300
How 1
torch.Size([300])
tensor([ 0.1602,  0.2168,  0.0549,  0.2070, -0.1533,  0.1338,  0.1069,  0.0183,
         0.1406, -0.1025,  0.1797, -0.1445, -0.3457, -0.0703, -0.2598,  0.2168,
         0.0771,  0.1973,  0.0537, -0.0430, -0.0322,  0.0317,  0.4688, -0.1963,
        -0.1484,  0.1357, -0.2305, -0.0679,  0.1245,  0.0845,  0.0835,  0.0598,
        -0.0352, -0.1709,  0.0253,  0.3750, -0.0084,  0.0898, -0.0396,  0.2793,
        -0.1660,  0.0060,  0.1318, -0.2812, -0.0352,  0.0038, -0.0197, -0.0786,
         0.0640,  0.2637, -0.3027,  0.2158,  0.0366, -0.1826,  0.1494, -0.0525,
        -0.2100,  0.0496,  0.3066,  0.1348,  0.1572,  0.0060, -0.4453, -0.0347,
        -0.0332, -0.2334,  0.0007,  0.2236, -0.1001, -0.1992,  0.1309,  0.1748,
         0.1816,  0.0154, -0.1572, -0.4023, -0.0464,  0.2236, -0.0317,  0.2227,
        -0.1504,  0.0859, -0.1699,  0.1196, -0.4863, -0.2109, -0.1777,  0.1836,
        -0.0320,  0.2246, -0.0781,  0.1406, -0.3574,  0.0164, -0.0771, -0.0806,
         0.2

  """
  


In [10]:
# class NLPModel(nn.Module):

#     def __init__(self, vocab_size, embedding_dim, context_size):
#         super(NLPModel, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.linear1 = nn.Linear(context_size * embedding_dim, 128)
#         self.linear2 = nn.Linear(128, vocab_size)

#     def forward(self, inputs):
#         embeds = self.embeddings(inputs).view((1, -1))
#         out = F.relu(self.linear1(embeds))
#         out = self.linear2(out)
#         log_probs = F.log_softmax(out, dim=1)
#         return log_probs

In [32]:
EMBEDDING_REQUIRES_GRAD = False
HIDDEN_CELLS = 25
NUM_LAYERS = 1
class SiameseNetwork(nn.Module):
    def __init__(self, pretrained_weights):
        super(SiameseNetwork, self).__init__()
        # Creating embedding object from the pre-trained weights
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights)
        self.embedding.weight.requires_grad = EMBEDDING_REQUIRES_GRAD
        # Create a single LSTM since this is a Siamese Network and the weights are shared
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIMENSION, hidden_size=HIDDEN_CELLS, num_layers = NUM_LAYERS, batch_first = True)
    
    # Manhattan Distance Calculator
    def exponent_neg_manhattan_distance(self, x1, x2):
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=0)).to(device)

    def forward_once(self, x, input_lengths):
      
        # x is of the shape (batch_dim, sequence)
        # e.g. x = [
        #  [i1, i2, i3],
        #  [j1, j2, j3, j4]
        # ]
        
        # input_lengths is the list that contains the sequence lengths for each sequence
        # e.g. input_lengths = [3, 4]
        
        # Reverse sequence lengths indices in decreasing order as per the requirement from PyTorch before Padding and Packing
        sorted_indices = np.flipud(np.argsort(input_lengths))
        input_lengths = np.flipud(np.sort(input_lengths))
        input_lengths = input_lengths.copy() # https://github.com/facebookresearch/InferSent/issues/99
        
        # Reorder questions in the decreasing order of their lengths
        ordered_questions = [torch.LongTensor(x[i]).to(device) for i in sorted_indices]
        # Pad sequences with 0s to the max length sequence in the batch
        ordered_questions = torch.nn.utils.rnn.pad_sequence(ordered_questions, batch_first=True)
        # Retrieve Embeddings
        embeddings = self.embedding(ordered_questions).to(device)
        # Pack the padded sequences and pass it through LSTM
        packed = torch.nn.utils.rnn.pack_padded_sequence(embeddings, input_lengths, batch_first=True)
        out, (hn, cn) = self.lstm(packed)
        unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=int(input_lengths[0]))
        
        # The following step reorders the calculated activations to the original order in which questions were passed
        result = torch.FloatTensor(unpacked.size())
        for i, encoded_matrix in enumerate(unpacked):
            result[sorted_indices[i]] = encoded_matrix
        print(result.size())
        return result

    def forward(self, q1, q1_lengths, q2, q2_lengths):
        output1 = self.forward_once(q1, q1_lengths)
        output2 = self.forward_once(q2, q2_lengths)
        similarity_score = torch.zeros(output1.size()[0]).to(device)
        # Calculate Similarity Score between both questions in a single pair
        for index in range(output1.size()[0]):
            # Sequence lenghts are being used to index and retrieve the activations before the zero padding since they were not part of original question
            q1 = output1[index, q1_lengths[index] - 1, :]
            print('oh')
            print(q1.size())
            q2 = output2[index, q2_lengths[index] - 1, :]
            print('ho')
            print(q2.size())
            similarity_score[index] = self.exponent_neg_manhattan_distance(q1, q2)
        return similarity_score

In [33]:
model = SiameseNetwork(weights).to(device)

total_step = len(train_loader)
# Threshold 0.5. Since similarity score will be a value between 0 and 1, we will consider all question pair with values greater than threshold as Duplicate
threshold = torch.Tensor([0.5]).to(device)

# define hyperparameter
num_epochs = 1
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001 )

for epoch in range(num_epochs):
    loss_history = []
    model.train(True)
    train_correct_total = 0
    for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(train_loader):
#         print(labels)
        labels = torch.FloatTensor(labels).to(device)
        
        # Clear grads
        optimizer.zero_grad()
        
        # Run the forward pass
        similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
        predictions = (similarity_score > threshold).float() * 1
        total = labels.size()[0]
#         print(total)
        correct = (predictions == labels).sum().item()
        train_correct_total += correct
        
        # Calculate Loss
        loss = criterion(similarity_score, labels)
        
        # Calculate gradients
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        if (i + 1) % 300 == 0:
            loss_history.append(loss.item())
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'.format(epoch + 1, num_epochs, i + 1, total_step, np.mean(loss_history), (train_correct_total / (i+1)) * 100))
            
    print('Training Loss: {:.4f}, Training Accuracy: {:.4f}'.format(np.mean(loss_history), (train_correct_total / len(train_indices)) * 100))
    
    model.train(False)
    val_correct_total = 0
    with torch.no_grad():
        for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(val_loader):

            labels = torch.FloatTensor(labels).to(device)

            similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
            predictions = (similarity_score > threshold).float() * 1
            total = labels.size()[0]
            correct = (predictions == labels).sum().item()
            val_correct_total += correct
        
        avg_acc_val =  val_correct_total * 100 / len(val_indices)
        print ('Validation Set Size {}, Correct in Validation {}, Validation Accuracy {:.2f}%'.format(len(val_indices), val_correct_total, avg_acc_val))
 

torch.Size([1, 12, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 17, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 16, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 15, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 10, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 19, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 9, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 9, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 16, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 13, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 11, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 8, 25])
torch.Size(

torch.Size([1, 14, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 12, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 11, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 14, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 9, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 9, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 16, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 14, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 9, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 16, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 11, 25])
torch.Size([1, 32, 25])
oh
torch.Size([25])
ho
torch.Size([25])
torch.Size([1, 14, 25])
torch.Size(

KeyboardInterrupt: 