In [91]:
import pandas as pd 
import numpy as np
import re

from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.noise import GaussianNoise
from keras.layers.core import Lambda
from keras.layers.merge import concatenate, add, multiply
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint

from nltk.corpus import stopwords

In [None]:
np.random.seed(0)

In [112]:
min_occurence = 200
unknown = "memento"
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 30
BATCH_SIZE = 1024
n_features = 10
n_folds=3

In [113]:
en_stop = set(stopwords.words('english'))

glove_file = "./data/word2vec.glove.840B.300d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_file)

In [11]:
train = pd.read_csv("./train.csv", names=['row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text', 'text_b_text', 'have_same_meaning'], index_col=0)
test = pd.read_csv("./test.csv", names=['row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text', 'text_b_text', 'have_same_meaning'], index_col=0)
submission_sample = pd.read_csv("./sample_submission_file.csv")

In [4]:
lemmatizer = WordNetLemmatizer()

def lemmatize(word, lemmatizer):
    return lemmatizer.lemmatize(word)

In [133]:
def clean(q):
    q = str(q).lower()
    q = q.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    q = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', q)
    q = re.sub(r"([0-9]+)000000", r"\1m", q)
    q = re.sub(r"([0-9]+)000", r"\1k", q)
    q = ' '.join([lemmatize(w,lemmatizer) for w in q.split()])
    return q

In [134]:
def get_model(embedding_matrix, nb_words, n_features):    
    embedding_layer = Embedding(nb_words,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
    lstm_layer = LSTM(75, recurrent_dropout=0.2)

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)

    features_input = Input(shape=(n_features,), dtype="float32")
    features_dense = BatchNormalization()(features_input)
    features_dense = Dense(200, activation="relu")(features_dense)
    features_dense = Dropout(0.2)(features_dense)

    addition = add([x1, y1])
    minus_y1 = Lambda(lambda x: -x)(y1)
    merged = add([x1, minus_y1])
    merged = multiply([merged, merged])
    merged = concatenate([merged, addition])
    merged = Dropout(0.4)(merged)

    merged = concatenate([merged, features_dense])
    merged = BatchNormalization()(merged)
    merged = GaussianNoise(0.1)(merged)

    merged = Dense(150, activation="relu")(merged)
    merged = Dropout(0.2)(merged)
    merged = BatchNormalization()(merged)

    out = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, features_input], outputs=out)
    return model

In [137]:
def preprocess(q):
    processed_q = []
    for word in q.split():#[::-1]:
        if word in top_words:
            processed_q = [word] + processed_q
        #elif word not in en_stop:
        #    processed_q = [unknown] + processed_q
            
        if len(processed_q) == MAX_SEQUENCE_LENGTH:
            break
    return " ".join(processed_q)

In [138]:
train["text_a_text_clean"] = train["text_a_text"].fillna("").apply(clean)
train["text_b_text_clean"] = train["text_b_text"].fillna("").apply(clean)

unique_questions = pd.Series(train["text_a_text_clean"] + train["text_b_text_clean"]).unique()
count_vectorizer = CountVectorizer(lowercase=True, token_pattern="\S+", min_df=min_occurence)
count_vectorizer.fit(unique_questions)

top_words = set(count_vectorizer.vocabulary_.keys())
top_words.add(unknown)

train_q_a = train["text_a_text_clean"].apply(preprocess)
train_q_b = train["text_b_text_clean"].apply(preprocess)

tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.append(train_q_a, train_q_b))
word_index = tokenizer.word_index

train_q_a_padded = pad_sequences(tokenizer.texts_to_sequences(train_q_a), maxlen=MAX_SEQUENCE_LENGTH)
train_q_b_padded = pad_sequences(tokenizer.texts_to_sequences(train_q_a), maxlen=MAX_SEQUENCE_LENGTH)

labels = np.array(train["have_same_meaning"])

nb_words = len(word_index) + 1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector=None
    if word in glove_model.wv:
        embedding_vector = glove_model.wv[word]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
train_nlp_features = pd.read_csv("data/nlp_features_train.csv")
train_non_nlp_features = pd.read_csv("data/non_nlp_features_train.csv")
features_train = np.hstack((train_nlp_features, train_non_nlp_features))

n_features = features_train.shape[1]

In [139]:
test["text_a_text_clean"] = test["text_a_text"].fillna("").apply(clean)
test["text_b_text_clean"] = test["text_b_text"].fillna("").apply(clean)

test_q_a = test["text_a_text_clean"].apply(preprocess)
test_q_b = test["text_b_text_clean"].apply(preprocess)

test_q_a_padded = pad_sequences(tokenizer.texts_to_sequences(test_q_a), maxlen=MAX_SEQUENCE_LENGTH)
test_q_a_padded = pad_sequences(tokenizer.texts_to_sequences(test_q_a), maxlen=MAX_SEQUENCE_LENGTH)

test_nlp_features = pd.read_csv("data/nlp_features_test.csv")
test_non_nlp_features = pd.read_csv("data/non_nlp_features_test.csv")
features_test = np.hstack((test_nlp_features, test_non_nlp_features))

In [140]:
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True)

In [141]:
model_count = 0

In [142]:
for train_indices, validation_indices in kfold.split(train["have_same_meaning"], train["have_same_meaning"]):
    model_count+=1
    train_fold_a = train_q_a_padded[train_indices]
    train_fold_b = train_q_b_padded[train_indices]
    train_fold_features = features_train[train_indices]
    train_fold_labels = labels[train_indices]

    val_fold_a = train_q_a_padded[validation_indices]
    val_fold_b = train_q_a_padded[validation_indices]
    val_fold_features = features_train[validation_indices]
    val_fold_labels = labels[validation_indices]
    
    model = get_model(embedding_matrix, nb_words, n_features)
    model.compile(loss="binary_crossentropy", optimizer="adam")
    
    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    best_model_path = "best_model" + str(model_count) + ".h5"
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

    hist = model.fit([train_fold_a, train_fold_b, train_fold_features], train_fold_labels,
                     validation_data=([val_fold_a, val_fold_b, val_fold_features], val_fold_labels),
                     epochs=15, batch_size=BATCH_SIZE, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint], verbose=1)

    model.load_weights(best_model_path)
    print(model_count, "validation loss:", min(hist.history["val_loss"]))

Train on 53400 samples, validate on 26700 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
1 validation loss: 0.16166811871171444
Train on 53400 samples, validate on 26700 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15

KeyboardInterrupt: 

In [105]:
nb_words

581