In [20]:
import pandas as pd
import numpy as np
import re
import codecs
import csv

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [3]:
#import nltk
#nltk.download()

In [4]:
path = '/cinc/data/quora-question-pairs/'
save_path = '/cinc/data/quora-question-pairs/save_data/'

MAX_SEQUENCE_LENGTH = 30

In [33]:
stops = set(stopwords.words("english"))
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
#    if stem_words:
#        text = text.split()
#        stemmer = SnowballStemmer('english')
#        stemmed_words = [stemmer.stem(word) for word in text]
#        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)


# read training text

In [34]:
#train_data = pd.read_csv(path + 'train/train-sample.csv')
train_data = pd.read_csv(path + 'train/train.csv')

In [35]:
nb_samples = train_data.shape[0]
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [36]:
train_data['question1'][1], text_to_wordlist(str(train_data['question1'][1]))

('What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'what is the story of kohinoor koh - i - noor diamond ')

In [37]:
train_text_1 = [None]*train_data.shape[0]
train_text_2 = [None]*train_data.shape[0]
train_labels = [None]*train_data.shape[0]

In [38]:
for i in range(train_data.shape[0]):
    train_text_1[i] = (text_to_wordlist(str(train_data['question1'][i]), remove_stopwords=True))
    train_text_2[i] = (text_to_wordlist(str(train_data['question2'][i]), remove_stopwords=True))
    train_labels[i] = (train_data['is_duplicate'][i])

In [39]:
len(train_text_1), len(train_text_2), len(train_labels), train_text_1[0], train_text_2[0], train_labels[0]

(404290,
 404290,
 404290,
 'step step guide invest share market india ',
 'step step guide invest share market ',
 0)

In [40]:
len(list(train_data['question1']) + list(train_data['question2']))

808580

# read test data

In [41]:
test_data = pd.read_csv(path + 'test/test.csv')
print test_data.shape[0]
test_data.head()

2345796


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [45]:
test_text_1 = [None] * test_data.shape[0]
test_text_2 = [None] * test_data.shape[0]

In [46]:
for i in range(test_data.shape[0]):
    test_text_1[i] = text_to_wordlist(str(test_data['question1'][i]), remove_stopwords=True)
    test_text_2[i] = text_to_wordlist(str(test_data['question2'][i]), remove_stopwords=True)


In [47]:
len(test_text_1), len(test_text_2), test_text_1[0], test_text_2[0]

(2345796,
 2345796,
 'surface pro 4 compare ipad pro ',
 'microsoft choose core m3 core i3 home surface pro 4 ')

# tokenizer

In [48]:
tokenizer = Tokenizer(nb_words=20000)

In [49]:
tokenizer.fit_on_texts(train_text_1 + train_text_2 + test_text_1 + test_text_2)

In [50]:
tokenizer.word_index['what']

16

In [51]:
train_sequence_1 = tokenizer.texts_to_sequences(train_text_1)
train_sequence_2 = tokenizer.texts_to_sequences(train_text_2)

In [52]:
test_sequence_1  = tokenizer.texts_to_sequences(test_text_1)
test_sequence_2  = tokenizer.texts_to_sequences(test_text_2)

In [58]:
def max_length(text_list):
    max_len = 0
    for text in text_list:
        if (len(text) > max_len):
            max_len = len(text)
    return max_len

max_length(train_sequence_1), max_length(train_sequence_2), max_length(test_sequence_1), max_length(test_sequence_2)

(59, 118, 118, 119)

In [59]:
train_sequence_1[0], train_sequence_2[0], test_sequence_1[0], test_sequence_2[0]

([1183, 1183, 2854, 499, 685, 289, 3],
 [1183, 1183, 2854, 499, 685, 289],
 [1967, 753, 103, 286, 1586, 753],
 [655, 336, 837, 17733, 837, 4199, 172, 1967, 753, 103])

In [60]:
train_sequence_1 = pad_sequences(train_sequence_1, maxlen=MAX_SEQUENCE_LENGTH)
train_sequence_2 = pad_sequences(train_sequence_2, maxlen=MAX_SEQUENCE_LENGTH)

In [61]:
test_sequence_1 = pad_sequences(test_sequence_1, maxlen=MAX_SEQUENCE_LENGTH)
test_sequence_2 = pad_sequences(test_sequence_2, maxlen=MAX_SEQUENCE_LENGTH)

In [62]:
train_sequence_1.shape, train_sequence_2.shape, test_sequence_1.shape, test_sequence_2.shape

((404290, 30), (404290, 30), (2345796, 30), (2345796, 30))

# save for later

In [63]:
save_array(save_path + 'train_sequence_1', train_sequence_1)
save_array(save_path + 'train_sequence_2', train_sequence_1)
save_array(save_path + 'train_labels',  train_labels)

save_array(save_path + 'test_sequence_1',  test_sequence_1)
save_array(save_path + 'test_sequence_2',  test_sequence_2)

save_array(save_path + 'word_index',  tokenizer.word_index)

# load back

In [64]:
train_sequence_1 = load_array(save_path + 'train_sequence_1')
train_sequence_2 = load_array(save_path + 'train_sequence_2')

test_sequence_1 = load_array(save_path + 'test_sequence_1')
test_sequence_2 = load_array(save_path + 'test_sequence_2')

train_labels = load_array(save_path + 'train_labels')

word_index = load_array(save_path + 'word_index')

# random split

In [65]:
msk = np.random.rand(nb_samples) < 0.8
msk

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [66]:
(train_sequence_1.shape, train_sequence_2.shape)

((404290, 30), (404290, 30))

In [67]:
X_all = np.hstack((train_sequence_1, train_sequence_2))
X_all.shape

(404290, 60)

In [68]:
y_all = np.array(train_labels)
y_all.shape, y_all[0]

((404290,), 0)

In [69]:
X_train = X_all[msk]
X_valid = X_all[~msk]
y_train = y_all[msk]
y_valid = y_all[~msk]
(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

((323433, 60), (323433,), (80857, 60), (80857,))

# simple model

In [70]:
model = Sequential([
        Embedding(20000, 50, input_length=60),
        Flatten(),
        #Dropout(0.5),
        #BatchNormalization(),
        Dense(200, activation='sigmoid'),
        #Dropout(0.5),
        #BatchNormalization(),
        Dense(1, activation='sigmoid')
    ])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 60, 50)        1000000     embedding_input_2[0][0]          
____________________________________________________________________________________________________
flatten_2 (Flatten)              (None, 3000)          0           embedding_2[0][0]                
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 200)           600200      flatten_2[0][0]                  
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             201         dense_3[0][0]                    
Total params: 1,600,401
Trainable params: 1,600,401
Non-trainable params: 0
_______________

In [71]:
#tokenizer.nb_words
model.compile(loss='binary_crossentropy', optimizer=Adam())

In [72]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=0),
    #ModelCheckpoint(model_path + 'word2vec_dropout.h5', monitor='val_loss', save_best_only=True, verbose=0)
]

In [73]:
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), 
          batch_size=64, nb_epoch=20, verbose=2,
          callbacks=callbacks
         )

Train on 323433 samples, validate on 80857 samples
Epoch 1/20
217s - loss: 0.5313 - val_loss: 0.5099
Epoch 2/20
300s - loss: 0.4617 - val_loss: 0.5154
Epoch 3/20
284s - loss: 0.4227 - val_loss: 0.5257
Epoch 4/20
283s - loss: 0.3968 - val_loss: 0.5564


<keras.callbacks.History at 0x7f43f372b810>

# predict_train

In [38]:
pred_train = model.predict_classes(X_train, verbose=2)
sum(pred_train == y_train.reshape(len(y_train), 1)) * 1.0 / len(y_train)

array([ 0.82588512])

In [39]:
pred_valid = model.predict_classes(X_valid, verbose=2)
sum(pred_valid == y_valid.reshape(len(y_valid), 1)) * 1.0 / len(y_valid)
#pred_valid.shape, y_valid.shape, len(y_valid)

array([ 0.7577278])

In [41]:
zip(model.predict_proba(X_valid, verbose=2)[0:10], y_valid[0:10])

[(array([ 0.17802152], dtype=float32), 0),
 (array([ 0.38128787], dtype=float32), 0),
 (array([ 0.42360657], dtype=float32), 1),
 (array([ 0.24209851], dtype=float32), 0),
 (array([ 0.60948128], dtype=float32), 0),
 (array([ 0.71274632], dtype=float32), 0),
 (array([ 0.43072778], dtype=float32), 1),
 (array([ 0.09452189], dtype=float32), 0),
 (array([ 0.73663753], dtype=float32), 0),
 (array([ 0.97041601], dtype=float32), 1)]

In [42]:
X_test = np.hstack((test_sequence_1, test_sequence_2))
X_test.shape

(2345796, 60)

In [43]:
pred_test = model.predict_proba(X_test, verbose=2)

In [44]:
sum(pred_test)

array([ 650316.0625], dtype=float32)

# generate submission file

In [45]:
df_submission = pd.read_csv(path + 'submission/sample_submission.csv')

In [46]:
print df_submission.shape
df_submission.head()

(2345796, 2)


Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [47]:
#pred_test.shape
df_submission['is_duplicate'] = pred_test
df_submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.10053
1,1,0.419504
2,2,0.425452
3,3,0.266448
4,4,0.382258


In [48]:
df_submission.to_csv(path + 'submission/h1_2epochs_prob_20170504.csv', index=False)