In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from utils.clean import text_to_wordlist
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from utils.tweaks import reduce_mem_usage

In [2]:
%%time
train = pd.read_csv('data/train_clean.csv')
train = reduce_mem_usage(train)

Memory usage of dataframe is 104.87 MB --> 55.91 MB (Decreased by 46.7%)
CPU times: user 2.34 s, sys: 325 ms, total: 2.67 s
Wall time: 2.74 s


In [4]:
# Peek at the first five rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,...,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,wmd,jaccard
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,0.0,0.0,0.107084,0.049209,...,1.0,0.008772,0.093863,0.982759,1.0,0.93,0.93,1.0,0.147856,0.916667
1,1,3,4,what is the story of kohinoor koh - i - noor d...,what would happen if the indian government sto...,0,0.061224,0.0,0.085667,0.078207,...,1.0,0.02193,0.104693,0.611111,0.86,0.63,0.67,0.75,0.55437,0.444444
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,0.0,0.0,0.118616,0.050967,...,1.0,0.017544,0.086643,0.166667,0.66,0.66,0.54,0.54,0.56244,0.2
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 ^ 24 math is d...,0,0.0,0.0,0.079077,0.053603,...,0.0,0.013158,0.090253,0.04,0.36,0.36,0.35,0.39,1.18491,0.0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,0.040816,0.0,0.120264,0.033392,...,1.0,0.026316,0.072202,0.175,0.67,0.47,0.45,0.56,0.79578,0.25


In [6]:
# Every column from position 6 onward is used as a model feature.
# NOTE(review): this relies on the column order of train_clean.csv — confirm that the
# label column sits before position 6 whenever the CSV layout changes.
features_list = list(train.columns[6:])
features_list

['freq_qid1',
 'freq_qid2',
 'q1len',
 'q2len',
 'q1_n_words',
 'q2_n_words',
 'word_Common',
 'word_Total',
 'word_share',
 'freq_q1+q2',
 'freq_q1-q2',
 'cwc_min',
 'cwc_max',
 'csc_min',
 'csc_max',
 'ctc_min',
 'ctc_max',
 'last_word_eq',
 'first_word_eq',
 'abs_len_diff',
 'mean_len',
 'longest_substr_ratio',
 'token_set_ratio',
 'token_sort_ratio',
 'fuzz_ratio',
 'fuzz_partial_ratio',
 'wmd',
 'jaccard']

In [None]:
# Hold out 33% of the rows for evaluation; stratifying on the label keeps the
# duplicate / non-duplicate ratio the same in both partitions.
target = train['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(
    train[features_list],
    target,
    test_size=0.33,
    random_state=42,
    stratify=target,
)

In [None]:
# Booster hyper-parameters: binary classifier scored with log-loss, a small
# learning rate (eta) and shallow trees.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
    }

# Wrap both partitions in DMatrix objects, the input format xgb.train expects.
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Datasets whose metric is reported during training. Early stopping watches the
# LAST entry of this list, i.e. the 'test' partition.
# NOTE(review): stopping on the held-out split means later scores on that same split
# are optimistic — consider carving out a separate validation set.
watchlist = [(d_train, 'train'), (d_test, 'test')]

# Train for at most 400 rounds, stop after 50 rounds without improvement and log
# every 10th round. `num_boost_round` and `evals` are passed by keyword because
# current xgboost releases no longer accept `evals` positionally.
model = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
# Draw the first tree of the booster. The canvas is sized up front and handed to
# plot_tree: changing rcParams after drawing would not resize this figure and would
# instead leak the oversized default into whichever plot is created next.
fig, ax = plt.subplots(figsize=(50, 100))
xgb.plot_tree(model, num_trees=0, ax=ax)
plt.show()

In [None]:
# Plot feature importances on a 10x10 canvas with a larger font. The style must be
# active while the figure is created and rendered, so everything (including show)
# happens inside a temporary rc_context; global matplotlib settings stay untouched.
with plt.rc_context({'font.size': 17}):
    fig, ax = plt.subplots(figsize=(10, 10))
    xgb.plot_importance(model, ax=ax)
    plt.show()

In [None]:
# Baseline: scikit-learn style XGBoost classifier with library-default settings,
# fitted on the same training partition as the booster above.
clf = xgb.XGBClassifier()
clf.fit(X=X_train, y=y_train)

In [None]:
# Per-class precision / recall / F1 of the baseline classifier on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [61]:
%%capture --no-display
# Keras building blocks used by the neural model defined further down.
import keras.layers as layers
from keras.models import Model
from keras import backend as K
import os
# NOTE(review): this assignment runs AFTER `keras` has already been imported above.
# Keras presumably picks its backend at import time, so this line likely has no effect
# in this kernel; the later cells also use TensorFlow-specific calls (tf.Session,
# K.set_session). Confirm whether the PlaidML backend is actually wanted before
# moving or removing this line.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [62]:
%%capture --no-display
import tensorflow as tf
import tensorflow_hub as hub

In [46]:
# TF-Hub handle of the sentence encoder used to embed both questions.
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(USE_MODULE_URL)

In [63]:
# Dropout rate passed to every layers.Dropout created in create_model().
DROPOUT = 0.1

def UniversalEmbedding(x):
    """Embed a batch of sentences with the module-level `embed` encoder.

    Args:
        x: tensor of shape (batch, 1) with one sentence per row, as produced by the
            `Input(shape=(1,), dtype=tf.string)` layers in `create_model`.

    Returns:
        The result of calling `embed` on the flattened 1-D batch of strings.
    """
    # Drop only the trailing feature axis. An unrestricted tf.squeeze would also
    # remove the batch axis when the batch holds a single row, handing `embed` a
    # scalar string instead of a 1-element batch.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(sentences)

def create_model(hidden_units=200, n_hidden_blocks=4, dropout=None):
    """Build and compile the two-input question-pair classifier.

    Each question is a single string input that is embedded by `UniversalEmbedding`;
    the two embeddings are concatenated and passed through a stack of identical
    Dense -> Dropout -> BatchNormalization blocks before a 2-unit output layer.
    Calling it with no arguments builds the same architecture as before.

    Args:
        hidden_units: width of every hidden Dense layer.
        n_hidden_blocks: number of Dense/Dropout/BatchNormalization blocks.
        dropout: dropout rate; when None the module-level `DROPOUT` is read at
            call time.

    Returns:
        The compiled Keras `Model`. Its summary is printed as a side effect.
    """
    if dropout is None:
        dropout = DROPOUT

    # One string input per question, each embedded through the Lambda wrapper.
    q1 = layers.Input(shape=(1,), dtype=tf.string)
    embedding_q1 = layers.Lambda(UniversalEmbedding, output_shape=(512,))(q1)
    q2 = layers.Input(shape=(1,), dtype=tf.string)
    embedding_q2 = layers.Lambda(UniversalEmbedding, output_shape=(512,))(q2)

    # Concatenate both embeddings into a single feature vector.
    merged = layers.concatenate([embedding_q1, embedding_q2])

    # Fully connected stack. Layers are created in the same order as the original
    # hand-unrolled version (Dense, Dropout, BatchNormalization per block), so layer
    # ordering in existing weight files is unaffected.
    for _ in range(n_hidden_blocks):
        merged = layers.Dense(hidden_units, activation='relu')(merged)
        merged = layers.Dropout(dropout)(merged)
        merged = layers.BatchNormalization()(merged)

    # Two sigmoid outputs trained with binary cross-entropy against one-hot labels.
    # NOTE(review): this scores the two classes independently; softmax with
    # categorical_crossentropy may be the intended pairing — confirm before changing,
    # since previously saved checkpoints were trained with this head.
    pred = layers.Dense(2, activation='sigmoid')(merged)
    model = Model(inputs=[q1, q2], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [37]:
def _as_string_column(questions):
    """Convert a Series of questions into an object array of shape (n, 1).

    Missing values are replaced by empty strings so that every element is a `str`;
    a cleaned question that became empty is read back from CSV as NaN otherwise.
    """
    as_text = questions.fillna('').astype(str)
    return np.array(as_text.tolist(), dtype=object)[:, np.newaxis]


X1 = train['question1']
X2 = train['question2']
y = train['is_duplicate']
# Split both question columns and the label together, 80% train / 20% test.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# Model inputs: one (n, 1) string column per question.
train_q1 = _as_string_column(X1_train)
train_q2 = _as_string_column(X2_train)
test_q1 = _as_string_column(X1_test)
test_q2 = _as_string_column(X2_test)

# One-hot encode the labels to match the 2-unit output layer of the network.
train_labels = np.asarray(pd.get_dummies(y_train), dtype=np.int8)
test_labels = np.asarray(pd.get_dummies(y_test), dtype=np.int8)


In [64]:
# Build and compile the network; create_model() also prints the layer summary shown below.
nn_model = create_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
lambda_10 (Lambda)              (None, 512)          0           input_11[0][0]                   
__________________________________________________________________________________________________
lambda_11 (Lambda)              (None, 512)          0           input_12[0][0]                   
__________________________________________________________________________________________________
concatenat

In [None]:
import os

from keras.callbacks import ModelCheckpoint

# Train inside an explicit TensorFlow session and write a weights-only checkpoint
# after every epoch.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    # NOTE(review): the `val_acc` placeholder must match the metric key this Keras
    # version logs (some releases log `val_accuracy` instead) — confirm.
    filepath = "models/model-{epoch:02d}-{val_acc:.2f}.hdf5"
    # Make sure the target folder exists so the first checkpoint write cannot fail
    # on a missing directory after a full epoch of training.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=True, mode='auto', period=1)
    callbacks_list = [checkpoint]
    # NOTE(review): the held-out test arrays double as validation data here.
    history = nn_model.fit([train_q1, train_q2],
            train_labels,
            validation_data=([test_q1, test_q2], test_labels),
            epochs=10,
            batch_size=512, callbacks=callbacks_list)

In [69]:
# Ask for the two questions to compare. Each one is repeated so the model receives
# a batch of two identical rows; only the first prediction is inspected below.
q1 = np.array([[input("Type Question 1 here -->")]] * 2)
q2 = np.array([[input("Type Question 2 here -->")]] * 2)

# Open a session, initialise variables and lookup tables, then restore the
# checkpointed weights before predicting.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    nn_model.load_weights('models/model-01-0.50.hdf5')

    # Index of the highest-scoring output unit per row; 1 is reported as "similar".
    predicts = nn_model.predict([q1, q2], verbose=0)
    predict_logits = predicts.argmax(axis=1)
    is_similar = predict_logits[0] == 1

    print("----FINAL RESULT----")
    print("****Questions are Similar****" if is_similar else "****Questions are not Similar****")

Type Question 1 here --> how are you
Type Question 2 here --> are you alright


----FINAL RESULT----
****Questions are not Similar****
