In [1]:
import numpy as np
import pandas as pd

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, CuDNNGRU
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

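`num_words` limits the vocabulary to the 500,000 most frequent words in the dataset. Every question will be exactly 50 tokens long (`max_tokens`), and each word is represented by a 300-dimensional vector (`embedding_size`).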

In [2]:
# Width of each word vector; used for the embedding matrix and the Embedding layer.
embedding_size = 300
# Fixed length, in tokens, that every question is padded or truncated to.
max_tokens = 50
# Vocabulary cap shared by the Tokenizer and the Embedding layer's input_dim.
num_words = 500000

Loading the training and test sets.

In [3]:
# Read the train and test CSVs; both paths are relative to the working directory.
dataset, dataset_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [4]:
# Raw question strings for both splits, as plain Python lists.
x_train = dataset['question_text'].tolist()
x_test = dataset_test['question_text'].tolist()
# Training labels, also as a plain Python list.
y_train = dataset['target'].tolist()

# Train and test questions together: the corpus the tokenizer is fitted on.
data = [*x_train, *x_test]

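Tokenizing the questions. Every question will consist of exactly 50 tokens. If a question has fewer than 50 tokens, it is padded with zeros; if it has more than 50 tokens, the extra tokens are cut. Because `pad_sequences` is called with its default settings, both the padding and the truncation happen at the start of the sequence.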

In [5]:
# Build the word index over the combined corpus, capped at num_words.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data)

# Map each question to its sequence of integer word ids.
x_train_tokens, x_test_tokens = [
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
]

# Force every sequence to max_tokens ids. Only maxlen is passed, so the side
# on which padding/truncation happens is whatever pad_sequences defaults to.
x_train_pad, x_test_pad = [
    pad_sequences(token_ids, maxlen=max_tokens)
    for token_ids in (x_train_tokens, x_test_tokens)
]

For the embedding layer we use Stanford's pretrained GloVe vectors (`glove.840B.300d`). If a word isn't in the GloVe file, its vector keeps a random initialization.

In [6]:
# Parse the GloVe text file into a {token: vector} lookup.
# Each line holds a token followed by `embedding_size` space-separated floats.
word2vec = {}
with open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding='UTF-8') as f:
    for line in f:
        # Split from the right so a token that itself contains spaces stays in
        # one piece. Taking only the first whitespace-separated field would
        # file the vector under a truncated key and could overwrite the real
        # vector of another word. rstrip() (rather than line[:-1]) also keeps
        # the final digit intact when the last line has no trailing newline.
        fields = line.rstrip().rsplit(' ', embedding_size)
        if len(fields) != embedding_size + 1:
            # Blank or malformed line: skip it instead of failing the whole load.
            continue
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Every row starts as uniform noise in [-1, 1); rows whose word has a
# pretrained vector are then overwritten, so unknown words stay random.
embedding_matrix = np.random.uniform(-1, 1, (num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    # word_index holds the full vocabulary; only ids below num_words have a row.
    if i < num_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Creating the model. We are using 3 layers of GRUs.

In [7]:
# Embedding initialised from embedding_matrix and kept frozen (trainable=False),
# followed by three stacked 32-unit GRU layers and a single sigmoid output unit.
# The first two GRUs return full sequences so the next GRU has a sequence to read.
model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              weights=[embedding_matrix],
              trainable=False,
              name='embedding_layer'),
    CuDNNGRU(units=32, return_sequences=True),
    CuDNNGRU(units=32, return_sequences=True),
    CuDNNGRU(units=32),
    Dense(1, activation='sigmoid'),
])

optimizer = RMSprop(lr=1e-3)

# Binary cross-entropy matches the single sigmoid output; accuracy is reported per epoch.
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

Training the model for 10 epochs with a batch size of 256.

In [8]:
# Fit on the padded training questions. Left as a bare expression so the
# returned History object is shown as the cell's output.
model.fit(x=x_train_pad, y=y_train, batch_size=256, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e07d1170b8>

Lastly, we predict on the test set, turn the predicted probabilities into 0/1 labels with a 0.5 threshold, and create a submission file.

In [9]:
# Sigmoid outputs for the padded test questions.
y_pred = model.predict(x=x_test_pad)

# Flatten to 1-D and threshold at 0.5 in one vectorised step. This replaces a
# per-row Python loop that relied on the truthiness of 1-element arrays; the
# resulting 0/1 integer labels are the same.
cls_pred = (np.ravel(y_pred) > 0.5).astype(int)

# Attach the labels to the test frame, then write every column except the
# question text to the submission file.
dataset_test['prediction'] = cls_pred
df_sub = dataset_test.drop('question_text', axis=1)
df_sub.to_csv('submission.csv', index=False)