In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Location of the question-pair CSV read by the next cell.
# NOTE(review): absolute path — presumably specific to the authoring machine; adjust when running elsewhere.
data_path = '/home/jupyter/data/data.csv'

In [3]:
# Load the raw CSV at data_path into a DataFrame; later cells filter it into new names rather than modifying it.
orig_data = pd.read_csv(data_path)

In [4]:
# Preview the first rows to see the available columns.
orig_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# (rows, columns) of the raw data.
orig_data.shape

(363861, 6)

In [6]:
# Class balance of the is_duplicate label; the later split is stratified on this column.
orig_data['is_duplicate'].value_counts()

0    228686
1    135175
Name: is_duplicate, dtype: int64

In [7]:
# Keep only the pairs labelled as duplicates (is_duplicate == 1) and preview ten of them.
is_duplicate_mask = orig_data['is_duplicate'] == 1
duplicate_qns_rows = orig_data[is_duplicate_mask]
duplicate_qns_rows.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
15,15,31,32,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,1
16,16,33,34,What does manipulation mean?,What does manipulation means?,1
18,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1
20,20,41,42,Why do rockets look white?,Why are rockets and boosters painted white?,1
29,29,59,60,How should I prepare for CA final law?,How one should know that he/she completely pre...,1


## Check whether any rows contain NaN values

In [12]:
# True if at least one cell anywhere in the frame is null.
orig_data.isnull().values.any()

True

In [13]:
# Select every row that has a null in at least one column.
has_null_in_row = orig_data.isnull().any(axis=1)
nan_rows = orig_data[has_null_in_row]

In [15]:
# Inspect the rows with missing values to see which columns are affected.
nan_rows.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [16]:
# Drop every pair in which either question text is missing. The rows shown
# above only lack question2, but a null question1 would be just as unusable
# for the tokenizer, so both text columns are checked.
clean_data = orig_data.dropna(subset=['question1', 'question2'])

## Check number of words in vocabulary

In [17]:
# Pool the text of both question columns into one flat list:
# every question1 first, followed by every question2.
all_question_1 = clean_data['question1']
all_question_2 = clean_data['question2']

all_questions = list(all_question_1)
all_questions.extend(all_question_2)

In [18]:
# Fit a Keras Tokenizer on the pooled question texts to build the vocabulary.
# NOTE(review): num_words=200000 is a vocabulary cap; word_index presumably still
# lists every token seen regardless of the cap — confirm against the Keras docs.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(all_questions)

In [19]:
# word_index is the tokenizer's word -> integer id mapping; its length is the
# number of distinct tokens found in the fitted texts.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)

print("Words in index: %d" % vocabulary_size)

Words in index: 91013


## Check whether any rows have identical qid1 and qid2

In [20]:
# Rows in which a question is paired with itself (qid1 equals qid2).
same_qid_mask = clean_data['qid1'] == clean_data['qid2']
duplicate_qid = clean_data[same_qid_mask]

In [21]:
# An empty frame here means no question is paired with itself.
duplicate_qid.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


## Create train/test split using stratified sampling

In [26]:
def get_train_test_inds(y, train_proportion=0.8, random_state=None):
    '''Generate boolean masks for a random stratified train/test split.

    Each distinct value in ``y`` is split separately, so both sets keep
    (up to rounding) the class proportions of the full sample.

    Parameters
    ----------
    y : iterable
        Class label of each observation in the sample.
    train_proportion : float, default 0.8
        Fraction of each class assigned to the training set; must lie in
        [0, 1]. The per-class training count is rounded down.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness for the shuffle. ``None`` draws from NumPy's
        global RNG (so ``np.random.seed`` still controls the result); an int
        seeds a private generator, giving a reproducible split without
        touching global state.

    Returns
    -------
    (train_inds, test_inds) : tuple of numpy.ndarray of bool
        Two masks of length ``len(y)``; True marks membership of the
        training set and the testing set respectively.

    Raises
    ------
    ValueError
        If ``train_proportion`` is outside [0, 1].
    '''
    # A negative proportion would otherwise be accepted silently: the negative
    # slice bound below turns e.g. -0.2 into an 80/20 split.
    if not 0.0 <= train_proportion <= 1.0:
        raise ValueError(
            "train_proportion must be between 0 and 1, got %r" % (train_proportion,)
        )

    if random_state is None:
        rng = np.random  # legacy behaviour: module-level global RNG
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    y = np.array(y)
    train_inds = np.zeros(len(y), dtype=bool)
    test_inds = np.zeros(len(y), dtype=bool)

    for value in np.unique(y):
        # Positions of this class, in random order.
        value_inds = np.nonzero(y == value)[0]
        rng.shuffle(value_inds)

        # First n shuffled positions go to train, the remainder to test.
        n = int(train_proportion * len(value_inds))
        train_inds[value_inds[:n]] = True
        test_inds[value_inds[n:]] = True

    return train_inds, test_inds

In [29]:
# Seed NumPy's global RNG right before splitting so that the shuffle inside
# get_train_test_inds — and therefore the saved train/test files — is the
# same on every Restart & Run All.
np.random.seed(42)
train_inds,test_inds = get_train_test_inds(clean_data['is_duplicate'],0.8)

In [31]:
# Apply the boolean masks to materialise the two splits; the original row labels are kept.
train_df = clean_data.loc[train_inds]
test_df = clean_data.loc[test_inds]

In [32]:
# Preview the training split; gaps in the row labels are rows that went to the test split.
train_df.head(20)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [38]:
# qid1 174363 is the question1 of one of the rows that had a missing question2
# (see the NaN check); list the test rows that use the same question.
test_df.loc[test_df.qid1==174363]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
263095,263095,174363,239239,How can I develop android app?,What is the best way of creating Android apps?...,0


In [40]:
# Same lookup for qid1 174363 (a question from the NaN-check rows), this time in the training split.
train_df.loc[train_df.qid1==174363]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [46]:
# Write the training split to CSV without the row index.
# NOTE(review): absolute output path — adjust when running elsewhere.
train_df.to_csv('/home/jupyter/data/train.csv',index=False)

In [47]:
# Write the test split to CSV without the row index.
# NOTE(review): absolute output path — adjust when running elsewhere.
test_df.to_csv('/home/jupyter/data/test.csv',index=False)