In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import itertools
from time import time

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', 65)  # default: 50

train = pd.read_csv("train_preprocessed.csv")


In this nb we'll play around with positive examples, trying to infer the possible types of questions available. 

First, let's drop all the negative examples. 

In [2]:
train.columns

Index(['Unnamed: 0', 'id', 'qid1', 'qid2', 'question1', 'question2',
       'is_duplicate'],
      dtype='object')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,0,1,2,what is the step by step guide to invest in share market in i...,what is the step by step guide to invest in share market?,0
1,1,1,3,4,what is the story of kohinoor (koh-i-noor) diamond?,what would happen if the indian government stole the kohinoor...,0
2,2,2,5,6,how can i increase the speed of my internet connection while ...,how can internet speed be increased by hacking through dns?,0
3,4,4,9,10,"which one dissolve in water quikly sugar, salt, methane and c...",which fish would survive in salt water?,0
4,5,5,11,12,astrology: i am a capricorn sun cap moon and cap rising...wha...,"i'm a triple capricorn (sun, moon and ascendant in capricorn)...",1


In [4]:
train = train.loc[ train.is_duplicate == 1 , lambda df: ['question1', 'question2'] ] 

We'll now change the questions to be lists of words, and remove all words below some frequency. This should leave only the interesting words (the correct thing to do is probably use them as well as word2vec, so that some of the meaning remains)

In [5]:
words_lst_iterator1 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question1])
words_lst_iterator2 = itertools.chain(*[''.join(map(lambda x: x if str.isalnum(x) else ' ', q)).split() for q in train.question2])

In [6]:
from collections import Counter
word_counter = Counter(itertools.chain(words_lst_iterator1, words_lst_iterator2))
words = sorted(word_counter.keys(), key = lambda x: word_counter[x], reverse = True)

In [7]:
q_words = [p[0] for p in word_counter.most_common()[:59]]

In [8]:
word_counter["than"]

2825

In [9]:
%pprint
q_words

Pretty printing has been turned OFF


['the', 'what', 'is', 'how', 'i', 'to', 'do', 'in', 'a', 'are', 'of', 'can', 'and', 'you', 'best', 'why', 'for', 'my', 'on', 'it', 'which', 'be', 'some', 'does', 'your', 'get', 'if', 's', 'india', 'should', 'that', 'will', 'quora', 'have', 'or', 'with', 'people', 'from', 'an', 'who', 'money', 'way', 'would', 'life', 'good', 'about', 'there', 'make', 'most', 'trump', 'we', 'learn', 'when', 'one', 'like', 'did', 'as', 'between', '500']

In [10]:
def word_in_qwords(w, q_words = q_words):
    if w in q_words:
        return w
    else:
        return "*"
    
def jokerize_question(question):
    q = (''.join(map(lambda x: x if str.isalnum(x) else ' ', question))).split()
    q = list(map( word_in_qwords, q))
    l = []
    for i in range(len(q)-1):
        if q[i] == q[i+1] == '*':
            continue
        else:
            l.append(q[i])
    return l

In [11]:
train.columns

Index(['question1', 'question2'], dtype='object')

In [12]:
pped_data = train.applymap(jokerize_question)

In [13]:
pped_data.head()

Unnamed: 0,question1,question2
4,"[*, i, *, a, *, and, *, what, does, that, *, about]","[i, *, a, *, and, *, in, *, what, does, *, about]"
6,"[how, can, i, be, a, good]","[what, should, i, do, to, be, a]"
9,"[how, do, i, *, and, *, my]","[how, can, i, *, my]"
10,"[what, can, make, *, to]","[how, can, you, make, *, to]"
11,"[what, *, your, *]","[what, *, your]"


In [14]:
len(q_words)

59

In [15]:
q_words.append('*')

In [16]:
import keras

Using TensorFlow backend.


In [17]:
word_to_freq = {q_words[i]: i for i in range(len(q_words))}

In [18]:
pped_data = pped_data.applymap(lambda l: list(map(lambda w: word_to_freq[w], l)))

In [19]:
# The training data, to our lstm model. Each pair of questions starting on even indices will correspond to duplicate questions.
X = [None]*(len(pped_data))*2
X[::2] = pped_data['question1'].tolist()
X[1::2] = pped_data['question2'].tolist()

In [20]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import Dense
from keras.preprocessing import sequence

In [21]:
max(map(len, X))

52

In [22]:
sum([len(x)>25 for x in X])

145

In [23]:
len(X)

292962

In [24]:
X = sequence.pad_sequences(X, maxlen=25)

In [25]:
X[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 59,  4, 59,  8,
       59, 12, 59,  1, 23, 30, 59, 45])

In [26]:
output_len = 20

In [27]:
model = Sequential()
model.add(Embedding(len(q_words), 5))
model.add(LSTM(5))
model.add(Dense(output_len,activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 5)           300       
_________________________________________________________________
lstm_1 (LSTM)                (None, 5)                 220       
_________________________________________________________________
dense_1 (Dense)              (None, 20)                120       
Total params: 640.0
Trainable params: 640
Non-trainable params: 0.0
_________________________________________________________________
None


The goal of this model is to take a sentence as an input, and return a probability estimate of what "category" the sentence lies in. We have `output_len` number of possible categories, and the model ends with a softmax layer to make sure that we get probabilities. 

If we know the number of categories we want, and their distribution (i.e. in what probability each data point will be assigned to any specific category), we can try and enforce two things:
1. Make the probability of a pair of duplicate questions to be in a similar category as high as possible.
2. Make the distribution of the categories close to the given apriori distribution.

(Later we will try to give other solutions, since the distribution of categories is not known...)

We will do that by composing the loss from the sum of two values.

In [28]:
def my_loss(y_true, y_pred, gamma=1):
    # We do not need y_true. 
    
    loss = 0
    
    # Motivate duplicate question to go together:
    loss += cluster_duplicates(y_pred)
    
    # Motivate a proper distribution of categories
    loss += gamma*distribute_cats(y_pred)
    
    return loss

The function `cluster_duplicates` returns the part of the loss responsible for enforcing high probability of duplicate questions having the same category. 

Let $x,x'$ be two questions, and let $M$ be the model we are training. We think of $M(x)_i$ as the probability of $x$ belonging to category $i$ (note that $x$ can only be in one category at a time). The "probability" of $x$ and $x'$ having the same category is thus $\sum_i M(x)_i M(x')_i = \langle M(x), M(x') \rangle$. Hence we return the sum of logarithms of the scalar product.

In [29]:
from keras import backend as K

In [30]:
def cluster_duplicates(y): # In this method we return the sum of log probabilities of the duplicate questions having the same category.
    y_pairs = K.reshape(y, [-1, output_len, 2])    
    y_first = K.reshape( K.slice(y_pairs, [0,0,0], [-1,-1,1]), [-1, output_len, 1])
    y_second= K.reshape( K.slice(y_pairs, [0,0,1], [-1,-1,1]), [-1, output_len, 1])
    return K.sum( K.log( K.dot(y_first, K.transpose(y_second) ) ) )

We will also minimize the KL divergence of the apriori distribution of categories from what we'll get. 

The first distribution we'll try out is by choosing each probability uniformly. (intuitively this should not be the case, and we would prefer some kind of harmonic descending sequence, but I think my intuition does not worth much..) 
Eventually we'd not want to fix a strict distribution but instead try and enforce some rules for the distribution. However all simple ideas I had so far do not converge well, so we shall try this first.

In [31]:
l_temp = [np.random.rand() for _ in range(output_len) ]
s_temp = sum(l_temp)
l_temp = [p_temp / s_temp for p_temp in l_temp]

In [32]:
uniformly_random_distribution = np.array(l_temp)

In [33]:
def distribute_cats(y):
    K.sum(axis=)

SyntaxError: invalid syntax (<ipython-input-33-0fd1a9920f1a>, line 2)

In [None]:
y = K.constant([[1,2,2],[3,4,4]])
K.sum(y,axis=0)

In [34]:
test_model = Sequential()

In [35]:
from keras.layers import Lambda

In [81]:
test_model = Sequential()
test_model.add( Lambda( lambda x: K.sum(x, axis=0), input_shape=(2,3), output_shape = (3,)) )

In [83]:
x = np.reshape(np.arange(12), (2,2,3))
test_model.predict(x)

array([[  6.,   8.,  10.],
       [ 12.,  14.,  16.]], dtype=float32)

In [73]:
x

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]],

       [[12, 13, 14],
        [15, 16, 17]]])

In [70]:
x[0,1,2]

5