In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adadelta

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove6b/glove.6B.200d.txt
/kaggle/input/glove6b/glove.6B.50d.txt
/kaggle/input/glove6b/glove.6B.300d.txt
/kaggle/input/glove6b/glove.6B.100d.txt
/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


reference : https://towardsdatascience.com/quora-question-pairs-detecting-text-similarity-using-siamese-networks-a370f039731b

In [23]:
dirname = '/kaggle/input/quora-question-pairs/'
filename = 'train.csv.zip'

df = pd.read_csv(os.path.join(dirname, filename))
print(df.info())
print()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404287 non-null  int64 
 1   qid1          404287 non-null  int64 
 2   qid2          404287 non-null  int64 
 3   question1     404287 non-null  object
 4   question2     404287 non-null  object
 5   is_duplicate  404287 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 21.6+ MB
None



Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
print('ratio of duplicates:', round(df['is_duplicate'].mean()*100, 2))

qids = pd.Series(df['qid1'].to_list() + df['qid2'].to_list())
print('total # of questions:', len(np.unique(qids)))
print('# of questions appearing multiple times:', np.sum(qids.value_counts() > 1))

ratio of duplicates: 36.92
total # of questions: 537933
# of questions appearing multiple times: 111780


In [7]:
sample = pd.read_csv(os.path.join(dirname, 'sample_submission.csv.zip'))
sample.head()

Unnamed: 0,test_id,is_duplicate
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [6]:
# check baseline

test_filename = 'test.csv'
df_test = pd.read_csv(os.path.join(dirname, test_filename))
p = df['is_duplicate'].mean()
baseline = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': p})
baseline.to_csv('submission.csv', index=False)
baseline.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.369198
1,1,0.369198
2,2,0.369198
3,3,0.369198
4,4,0.369198


Score: 0.55525

## Train data preprocessing

In [24]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def no_abbv(phrase):
    pharse = phrase.lower()
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "cannot", phrase)
    
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s'", " ", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"e-mail", " email", phrase)
    phrase = re.sub(r" e.g. ", "  eg ", phrase)
    phrase = re.sub(r" u.s. ", "  american ", phrase)
    
    phrase = re.sub(r"[^A-Za-z0-9]+", " ", phrase)
    
    return phrase.strip()


def remove_stopwords(phrase):
    words = word_tokenize(phrase)
    sent = ' '.join(str(j) for j in words if (j not in stop_words) and (len(j) != 1))
    return sent

def preprocess_text(phrase):
    phrase = no_abbv(phrase)
    return remove_stopwords(phrase)

In [25]:
df = df.dropna()

df['question1'] = df['question1'].astype('string')
df['question2'] = df['question2'].astype('string')

df['q1_re'] = df['question1'].apply(preprocess_text)
df['q2_re'] = df['question2'].apply(preprocess_text)

'What step step guide invest share market india What step step guide invest share market'

In [29]:
df.shape, df[['q1_re', 'q2_re']].drop_duplicates().shape

((404287, 9), (395197, 2))

In [30]:
from sklearn.model_selection import train_test_split

X_tmp, X_test, y_tmp, y_test = train_test_split(df[['q1_re', 'q2_re']], df['is_duplicate'], 
                                                test_size=0.2, random_state=24)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=24)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(258743, 2)
(258743,)
(64686, 2)
(64686,)
(80858, 2)
(80858,)


In [27]:
qs = list(X_train['q1_re'].values) + list(X_train['q2_re'].values)

max_words = 10000
tok = Tokenizer(num_words=max_words, oov_token="<OOV>")

tok.fit_on_texts(qs)

sequences = tok.texts_to_sequences(qs)
sequences = pad_sequences(sequences, maxlen=300, padding='post')



Found 400000 word vectors.
85401


In [None]:
glove_dir = '/kaggle/input/glove6b/'
embeddings_index = {}
word_index = len(tok.word_index) + 1
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()

print(f'Found {len(embeddings_index)} word vectors.')
print(word_index)

In [None]:
from tensorflow.keras.layers import Dense, Flatten, Embedding, Bidirectional, LSTM

In [None]:
lstm_layer = Bidirectional(LSTM(20, dropout=0.2, recurrent_dropout=0.2))
emb = Embedding(max_words, embedding_dim, input_length=300, weights=[embedding_matrix], trainable=False)
input1 = tf.keras.Input(shape=(300,))
e1 = emb(input1)
x1 = lstm_layer(e1)

input2 = tf.keras.Input(shape=(300,))
e2 = emb(input2)
x2 = lstm_layer(e2)

mhd = lambda x: tf.keras.backend.abs(x[0] - x[1])
merged = Lambda(function=mhd, output_shape=lamba x: x[0], name='L1_distance')([x1, x2])
preds = Dense(1, activation='sigmoid')(merged)
model = tf.keras.Model(inputs=[input1, input2], outputs=preds)
model.compile(loss='mse', optimizer='adam')