In [1]:
import pandas as pd
import networkx

In [2]:
data = pd.read_csv('./datasets/quora-question-pairs/train.csv')

In [3]:
graph = networkx.MultiGraph()

In [4]:
graph.add_nodes_from(data['qid1'].unique())

graph.add_nodes_from(data['qid2'].unique())

In [None]:
graph.add_edges_from(data[data['is_duplicate'] == 1][['qid1', 'qid2']].values)

In [18]:
data['qid1'].unique()

array([     1,      3,      5, ..., 537928, 537930, 537932])

In [35]:
networkx.number_connected_components(graph)

200803

In [None]:
networkx.nodes(graph)

In [38]:
from tqdm import tqdm

In [40]:
dic = {}

In [42]:
for _, (qid1, qid2, q1, q2) in tqdm(data[['qid1', 'qid2', 'question1', 'question2']].iterrows()):
    dic[qid1] = q1
    dic[qid2] = q2

404290it [01:13, 5517.98it/s]


In [48]:
subgraphs = (graph.subgraph(c) for c in networkx.connected_components(graph))

for sg in subgraphs:
    nn = networkx.number_of_nodes(sg)
    ne = networkx.number_of_edges(sg)
    if ne != nn * (nn - 1) // 2:
        print(nn, ne)
        ids = networkx.nodes(sg)
        print([dic[vertex] for vertex in ids][:40])
        break

53 268
["How will Trump's presidency affect prospective international students from Syria?", "What would be the effect of Trump's presidency on international Master's students who plan to work in the US after completing the degree?", 'How would a Trump presidency affect schools accessibility for international students?', 'I am an international student in the US - do I have to leave if Trump wins?', "What effect will Donald Trump's victory have on international students looking to study in USA?", 'Will international college students be negatively affected if Trump becomes president?', "What does Donald Trump's win mean for Indian students in USA?", "What does Donald Trump's victory mean for the graduate students from Islamic countries?", 'What happens to the Indian students trying to pursue Masters of PHD in USA, now that Trump is the president?', "What does a Trump presidency mean for Indian students looking to pursue Master's degrees in USA?", 'I am an Indian, planning to go to US for

In [None]:
graph.add_edges_from()

In [30]:
data[['qid1', 'qid2']].values

array([[     1,      2],
       [     3,      4],
       [     5,      6],
       ...,
       [537928, 537929],
       [537930, 537931],
       [537932, 537933]])

In [1]:
import torch

In [2]:
def masked_softmax(input, masks, dim, epsilon=1e-12):
    masked_vec = input * masks
    max_vec = torch.max(masked_vec, dim=dim, keepdim=True).values
    exps = torch.exp(masked_vec - max_vec)
    masked_exps = exps * masks
    masked_sums = masked_exps.sum(dim, keepdim=True)
    return masked_exps / masked_sums

In [60]:
class VectorAttention(torch.nn.Module):
    def __init__(self, dim):
        super(VectorAttention, self).__init__()
        self.dim = dim
        self.param = torch.nn.Parameter(data=(torch.ones(self.dim, 1) / self.dim))
        
    def forward_one(self, batch_vectors, batch_masks):
        #batch_masks: batch_size x seq_len x 1
        coef = masked_softmax((batch_vectors @ self.param), batch_masks.unsqueeze(2), dim=1)
        return (batch_vectors * coef).sum(dim=1)

    def forward(self, batch_1, mask_1, batch_2, mask_2):
        return self.forward_one(batch_1, mask_1), self.forward_one(batch_2, mask_2)

In [69]:
class NNAttention(torch.nn.Module):
    def __init__(self, dim):
        super(NNAttention, self).__init__()
        self.dim = dim
        self.nn = torch.nn.Sequential(torch.nn.Linear(dim, 1), torch.nn.Tanh())
    
    def forward_one(self, batch_vectors, batch_masks):
        coef = masked_softmax(self.nn(batch_vectors), batch_masks.unsqueeze(2), dim=1)
        return (batch_vectors * coef).sum(dim=1)
    
    def forward(self, batch_1, mask_1, batch_2, mask_2):
        return self.forward_one(batch_1, mask_1), self.forward_one(batch_2, mask_2)

In [70]:
class Seq2SeqAttention(torch.nn.Module):
    def __init__(self, dim, hidden_dim=50):
        super(Seq2SeqAttention, self).__init__()
        self.dim = dim #embedding dim
        self.hidden_dim = hidden_dim #attention dim
        self.mapping = torch.nn.Linear(dim, hidden_dim, bias=False)
        
    def forward(self, batch_1, mask_1, batch_2, mask_2):
        mapped_1 = self.mapping(batch_1)
        mapped_2 = self.mapping(batch_2)
        
        coef_matrix = torch.bmm(mapped_1, mapped_2.permute(0, 2, 1))
        coef_matrix = coef_matrix * mask_1.unsqueeze(2) * mask_2.unsqueeze(1)
        
        coef_1 = coef_matrix.sum(dim=2)
        coef_2 = coef_matrix.sum(dim=1)
        
        print(mask_1.shape)
        print(coef_1.shape)
        coef_1 = masked_softmax(coef_1, mask_1, dim=1).unsqueeze(2)
        coef_2 = masked_softmax(coef_2, mask_2, dim=1).unsqueeze(2)
        
        res_1 = (batch_1 * coef_1).sum(dim = 1)
        res_2 = (batch_2 * coef_2).sum(dim = 1)
        
        return res_1, res_2

In [71]:
test_batch_1 = torch.ones(5, 30, 784)
test_mask_1 = torch.ones(5, 30)
test_mask_1[:, 10:] = 0

test_batch_2 = torch.ones(5, 20, 784)
test_mask_2 = torch.ones(5, 20)
test_mask_2[:, 4:] = 0

In [72]:
va = VectorAttention(784)

na = NNAttention(784)

ssa = Seq2SeqAttention(784, 50)

In [73]:
va(test_batch_1, test_mask_1, test_batch_2, test_mask_2)

(tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
        grad_fn=<SumBackward2>), tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SumBackward2>))

In [74]:
na(test_batch_1, test_mask_1, test_batch_2, test_mask_2)

(tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SumBackward2>),
 tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SumBackward2>))

In [75]:
ssa(test_batch_1, test_mask_1, test_batch_2, test_mask_2)

torch.Size([5, 30])
torch.Size([5, 30])


(tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
        grad_fn=<SumBackward2>), tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]], grad_fn=<SumBackward2>))

In [6]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [13]:
data['qid1'].nunique()

290654

In [18]:
data['qid1'].value_counts()

8461      50
14110     48
25984     47
20628     47
1749      47
          ..
415339     1
411241     1
413288     1
409190     1
524288     1
Name: qid1, Length: 290654, dtype: int64

In [16]:
data['qid2'].nunique()

299364

In [14]:
len(data['qid1'])

404290

In [15]:
len(data['qid2'])

404290