# Quora question pairs: data preparation

## Import packages

In [1]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

Using TensorFlow backend.


## Initialize global variables

In [2]:
# --- Input data: local cache directory, file names and download URLs ---
# The trailing slash matters: file names are appended to this directory by
# plain string concatenation elsewhere in the notebook.
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')

QUESTION_PAIRS_FILE = 'quora_duplicate_questions.tsv'
QUESTION_PAIRS_FILE_URL = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'

GLOVE_FILE = 'glove.840B.300d.txt'          # member extracted from the zip archive
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

# --- Output artefacts (written relative to the current working directory) ---
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# --- Preprocessing parameters ---
EMBEDDING_DIM = 300          # number of columns in the word embedding matrix
MAX_SEQUENCE_LENGTH = 25     # `maxlen` used when padding the question sequences
MAX_NB_WORDS = 200000        # `num_words` cap handed to the tokenizer

## Download and extract question pairs data

In [3]:
# Download the question-pairs TSV unless a copy is already present in the
# local datasets directory (get_file is expected to store it there).
if not exists(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE):
    get_file(QUESTION_PAIRS_FILE, QUESTION_PAIRS_FILE_URL)

print("Processing", QUESTION_PAIRS_FILE)

# Three parallel lists: position k in each list comes from the same TSV row.
question1 = []
question2 = []
is_duplicate = []  # kept as the raw string read from the file

# newline='' hands line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded inside quoted fields
# may be translated or misinterpreted by the text layer.
with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(row['is_duplicate'])

print('Question pairs: %d' % len(question1))

Processing quora_duplicate_questions.tsv
Question pairs: 404290


In [4]:
# Sanity check: show the first 20 entries of the question1 column, then the
# first 20 duplicate labels.
for column in (question1, is_duplicate):
    print(column[:20])

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'How can I increase the speed of my internet connection while using a VPN?', 'Why am I mentally very lonely? How can I solve it?', 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', 'Should I buy tiago?', 'How can I be a good geologist?', 'When do you use シ instead of し?', 'Motorola (company): Can I hack my Charter Motorolla DCX3400?', 'Method to find separation of slits using fresnel biprism?', 'How do I read and find my YouTube comments?', 'What can make Physics easy to learn?', 'What was your first sexual experience like?', 'What are the laws to change your status from a student visa to a green card in the US, how do they compare to the immigration laws in Canada?', 'What would a Trump presidency mean for current international master’s studen

## Build tokenized word index

In [5]:
# Fit a single tokenizer on both question columns so that they share one
# vocabulary / word index.
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index

# Encode each column separately into lists of integer word indices.
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(column) for column in (question1, question2)
)

print("Words in index: %d" % len(word_index))

Words in index: 95596


In [6]:
# Inspect the integer-encoded form of the first 20 question1 entries.
print(question1_word_sequences[:20])
# Left disabled on purpose: this would dump the entire vocabulary dictionary.
# print(word_index)

[[2, 3, 1, 1222, 57, 1222, 2581, 7, 576, 8, 763, 383, 8, 35], [2, 3, 1, 559, 10, 14300, 13598, 5, 21311, 4565], [4, 13, 5, 217, 1, 440, 10, 17, 361, 1827, 200, 146, 6, 2773], [16, 72, 5, 2774, 312, 2757, 4, 13, 5, 649, 19], [23, 49, 7131, 8, 231, 35496, 1891, 2047, 10570, 12, 1928, 10924, 6456], [2371, 5, 72, 6, 9925, 940, 4451, 813, 12, 4451, 5037, 2, 21, 28, 238, 46, 60], [31, 5, 126, 31238], [4, 13, 5, 24, 6, 42, 25874], [37, 9, 15, 74, 42132, 482, 10, 42133], [6939, 186, 13, 5, 445, 17, 7896, 55635, 42134], [959, 7, 87, 4663, 10, 31239, 146, 31240, 55636], [4, 9, 5, 223, 12, 87, 17, 286, 1727], [2, 13, 52, 609, 632, 7, 71], [2, 59, 34, 94, 1403, 299, 39], [2, 11, 1, 888, 7, 179, 34, 1161, 33, 6, 234, 517, 7, 6, 653, 233, 8, 1, 105, 4, 9, 66, 168, 7, 1, 2287, 888, 8, 523], [2, 43, 6, 92, 1137, 101, 14, 475, 495, 8504, 288, 20, 32, 3502, 517], [2, 21, 11495, 101], [16, 9, 307, 99, 7, 24, 283, 29, 1, 287, 66, 4839], [16, 11, 62, 79, 54, 834, 3124, 119, 28, 11, 10571, 1297, 20, 139], [

## Download and process GloVe embeddings

In [7]:
# Make sure the extracted GloVe text file is available. The guard checks the
# extracted file (not the archive), so a previous run that downloaded the zip
# but did not finish extracting is repaired on the next run. get_file is
# expected to reuse an already-downloaded archive instead of fetching it again.
if not exists(KERAS_DATASETS_DIR + GLOVE_FILE):
    # The context manager closes the archive handle once extraction is done.
    with ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL)) as zipfile:
        zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

# Maps each token in the GloVe file to its vector (float32 numpy array).
embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only: the first field is the token, every
        # remaining field is one component of its vector.
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Processing glove.840B.300d.txt
Word embeddings: 2196016


In [13]:
# Peek at the raw format of the GloVe file by printing its first 20 lines.
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    count = 0
    while count < 20:
        line = f.readline()
        if not line:
            # Reached end of file before 20 lines were printed.
            break
        print(line)
        count += 1


, -0.082752 0.67204 -0.14987 -0.064983 0.056491 0.40228 0.0027747 -0.3311 -0.30691 2.0817 0.031819 0.013643 0.30265 0.0071297 -0.5819 -0.2774 -0.062254 1.1451 -0.24232 0.1235 -0.12243 0.33152 -0.006162 -0.30541 -0.13057 -0.054601 0.037083 -0.070552 0.5893 -0.30385 0.2898 -0.14653 -0.27052 0.37161 0.32031 -0.29125 0.0052483 -0.13212 -0.052736 0.087349 -0.26668 -0.16897 0.015162 -0.0083746 -0.14871 0.23413 -0.20719 -0.091386 0.40075 -0.17223 0.18145 0.37586 -0.28682 0.37289 -0.16185 0.18008 0.3032 -0.13216 0.18352 0.095759 0.094916 0.008289 0.11761 0.34046 0.03677 -0.29077 0.058303 -0.027814 0.082941 0.1862 -0.031494 0.27985 -0.074412 -0.13762 -0.21866 0.18138 0.040855 -0.113 0.24107 0.3657 -0.27525 -0.05684 0.34872 0.011884 0.14517 -0.71395 0.48497 0.14807 0.62287 0.20599 0.58379 -0.13438 0.40207 0.18311 0.28021 -0.42349 -0.25626 0.17715 -0.54095 0.16596 -0.036058 0.08499 -0.64989 0.075549 -0.28831 0.40626 -0.2802 0.094062 0.32406 0.28437 -0.26341 0.11553 0.071918 -0.47215 -0.18366 -0.3

In [15]:
# Show the first 20 (token, vector) entries of the parsed GloVe index.
count = 0
for count, (key, vector) in enumerate(embeddings_index.items(), start=1):
    print(key)
    print(vector)
    if count >= 20:
        break
    

Responsibility
[ -3.51539999e-01  -8.28119963e-02  -2.05290005e-01   1.28999993e-01
  -1.37429997e-01   3.53890002e-01   1.59290005e-02   1.36770005e-03
   1.52919993e-01   1.10300004e+00  -1.12950003e+00  -2.87380010e-01
  -1.01870000e-01   1.22919999e-01  -9.71259996e-02  -1.00160003e-01
  -1.87570006e-01   1.90450009e-02  -2.40840003e-01   7.04530001e-01
   7.02069998e-01   1.20559998e-01   4.22919989e-01   1.77010000e-01
   4.79240000e-01  -8.04859959e-03  -3.27560008e-01   9.90099981e-02
  -3.48379999e-01   9.92320031e-02   1.57650001e-02   1.58019997e-02
  -2.20909998e-01  -3.04310005e-02   4.01100010e-01  -8.97369981e-02
   9.23670009e-02  -1.12760000e-01   5.36440015e-02   3.40769999e-03
   2.09729999e-01   1.65839996e-02  -3.58850002e-01   3.42489988e-01
  -2.45770007e-01  -7.02650011e-01  -3.94800007e-01   1.24629997e-01
  -6.40669987e-02   1.82400003e-01  -9.30930004e-02   2.39040002e-01
   3.34879994e-01  -3.56929988e-01  -2.80889988e-01   8.66940022e-02
  -1.20909996e-01  

In [21]:
# Show 20 (word, index) pairs from the tokenizer's word index.
# zip() stops as soon as the 1..20 counter is exhausted, so at most 20 pairs
# are printed.
count = 0
for count, (word, i) in zip(range(1, 21), word_index.items()):
    print(word, i)

anakin's 51169
poosiya 49965
waitresses 28625
gross 4469
army 873
forum 4367
bg 21506
proofs 6779
7cfalse 78248
© 29465
c26000 61460
boos 54151
“you 23880
clot 18615
pernicious 94797
deduplication 43044
bluff 24411
rudras 86967
demonstrated 19840
forusmle 75746


## Prepare word embedding matrix

In [16]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows are left as zeros for words without a GloVe
# vector. The matrix has nb_words + 1 rows so that index nb_words is valid.
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    # Skip indices that fall outside the matrix (only possible when the
    # vocabulary is larger than MAX_NB_WORDS, in which case nb_words equals
    # MAX_NB_WORDS).
    if i > nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

# Count rows that are entirely zero. Testing every component (rather than
# whether the row sum is zero) cannot mistake a non-zero vector whose
# components cancel out for a missing embedding. The count covers all rows of
# the matrix, including any row that was never assigned in the loop above.
print('Null word embeddings: %d' % np.sum(~word_embedding_matrix.any(axis=1)))

Null word embeddings: 29276


In [42]:
# Inspect the embedding matrix row stored for word index 1.
print(word_embedding_matrix[1])

[  2.72040009e-01  -6.20299987e-02  -1.88400000e-01   2.32250001e-02
  -1.81580000e-02   6.71919994e-03  -1.38769999e-01   1.77080005e-01
   1.77090004e-01   2.58820009e+00  -3.51790011e-01  -1.73120007e-01
   4.32850003e-01  -1.07079998e-01   1.50059998e-01  -1.99819997e-01
  -1.90929994e-01   1.18710005e+00  -1.62070006e-01  -2.35379994e-01
   3.66399996e-03  -1.91560000e-01  -8.56619999e-02   3.91989984e-02
  -6.64490014e-02  -4.20899987e-02  -1.91220000e-01   1.16790002e-02
  -3.71380001e-01   2.18860000e-01   1.14229997e-03   4.31899995e-01
  -1.42049998e-01   3.80589992e-01   3.06540012e-01   2.01670006e-02
  -1.83160007e-01  -6.51860004e-03  -8.05489998e-03  -1.20630004e-01
   2.75069997e-02   2.98390001e-01  -2.28960007e-01  -2.28819996e-01
   1.46709993e-01  -7.63010010e-02  -1.26800001e-01  -6.66509988e-03
  -5.27950004e-02   1.42580003e-01   1.56100005e-01   5.55099994e-02
  -1.61489993e-01   9.62899998e-02  -7.65329972e-02  -4.99709994e-02
  -1.01950001e-02  -4.76410016e-02

In [46]:
# Reverse lookup: find the word whose tokenizer index is 1.
# The generator scans word_index lazily and stops at the first match instead
# of materialising full lists of all keys and all values.
# Raises StopIteration if no word has index 1.
s = next(word for word, index in word_index.items() if index == 1)
s

'the'

In [47]:
# Display the GloVe vector for the word found above, for visual comparison
# with row 1 of the embedding matrix.
embeddings_index[s]

array([  2.72040009e-01,  -6.20299987e-02,  -1.88400000e-01,
         2.32250001e-02,  -1.81580000e-02,   6.71919994e-03,
        -1.38769999e-01,   1.77080005e-01,   1.77090004e-01,
         2.58820009e+00,  -3.51790011e-01,  -1.73120007e-01,
         4.32850003e-01,  -1.07079998e-01,   1.50059998e-01,
        -1.99819997e-01,  -1.90929994e-01,   1.18710005e+00,
        -1.62070006e-01,  -2.35379994e-01,   3.66399996e-03,
        -1.91560000e-01,  -8.56619999e-02,   3.91989984e-02,
        -6.64490014e-02,  -4.20899987e-02,  -1.91220000e-01,
         1.16790002e-02,  -3.71380001e-01,   2.18860000e-01,
         1.14229997e-03,   4.31899995e-01,  -1.42049998e-01,
         3.80589992e-01,   3.06540012e-01,   2.01670006e-02,
        -1.83160007e-01,  -6.51860004e-03,  -8.05489998e-03,
        -1.20630004e-01,   2.75069997e-02,   2.98390001e-01,
        -2.28960007e-01,  -2.28819996e-01,   1.46709993e-01,
        -7.63010010e-02,  -1.26800001e-01,  -6.66509988e-03,
        -5.27950004e-02,

## Prepare training data tensors

In [49]:
# Turn both lists of word-index sequences into 2-D integer arrays with
# MAX_SEQUENCE_LENGTH columns each.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (question1_word_sequences, question2_word_sequences)
)

# Labels were read from the file as strings; convert them to an integer array.
labels = np.array(is_duplicate, dtype=int)

# Report the shape of each tensor.
for caption, tensor in (
    ('Shape of question1 data tensor:', q1_data),
    ('Shape of question2 data tensor:', q2_data),
    ('Shape of label tensor:', labels),
):
    print(caption, tensor.shape)

Shape of question1 data tensor: (404290, 25)
Shape of question2 data tensor: (404290, 25)
Shape of label tensor: (404290,)


In [53]:
# Inspect the first 20 padded question1 rows and the first 20 labels.
print(q1_data[:20])
print(labels[:20])

[[    0     0     0     0     0     0     0     0     0     0     0     2
      3     1  1222    57  1222  2581     7   576     8   763   383     8
     35]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     2     3     1   559    10 14300 13598     5 21311
   4565]
 [    0     0     0     0     0     0     0     0     0     0     0     4
     13     5   217     1   440    10    17   361  1827   200   146     6
   2773]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0    16    72     5  2774   312  2757     4    13     5   649
     19]
 [    0     0     0     0     0     0     0     0     0     0     0     0
     23    49  7131     8   231 35496  1891  2047 10570    12  1928 10924
   6456]
 [    0     0     0     0     0     0     0     0  2371     5    72     6
   9925   940  4451   813    12  4451  5037     2    21    28   238    46
     60]
 [    0     0     0     0     0     0     0     0     0   

## Persist training and configuration data to files

In [51]:
# Persist the prepared arrays. Passing the path (rather than an open file
# object) lets np.save open and close the file itself, so no file handle is
# left dangling. The names already end in '.npy', so np.save does not alter
# them.
np.save(Q1_TRAINING_DATA_FILE, q1_data)
np.save(Q2_TRAINING_DATA_FILE, q2_data)
np.save(LABEL_TRAINING_DATA_FILE, labels)
np.save(WORD_EMBEDDING_MATRIX_FILE, word_embedding_matrix)

# Store the vocabulary size used to dimension the embedding matrix
# (the matrix has nb_words + 1 rows).
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)

In [56]:
# Read the embedding matrix back from disk to check the saved file.
# Uses the same constant as the save step so the two cannot drift apart.
loaded = np.load(WORD_EMBEDDING_MATRIX_FILE)

In [57]:
# Show the matrix that was read back from disk (numpy abbreviates large arrays).
print(loaded)

[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.27204001 -0.06203    -0.1884     ...,  0.13015001 -0.18317001  0.1323    ]
 [-0.038548    0.54251999 -0.21843    ...,  0.11798     0.24590001
   0.22872999]
 ..., 
 [-0.1681      0.0039697   0.26023999 ..., -0.22747999 -0.45528999
   0.072328  ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 1.06700003 -0.23154999 -0.026282   ...,  0.30162001  0.051228    0.53963   ]]
