In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the labelled question pairs straight from the zipped CSV.
# (The previous empty-DataFrame assignment was dead code: it was overwritten immediately.)
train = pd.read_csv('../input/quora-question-pairs/train.csv.zip')

In [3]:
train.head()

In [4]:
test = pd.read_csv('../input/quora-question-pairs/test.csv.zip')

In [5]:
test.head()

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

In [7]:
train.shape

In [8]:
test.shape

In [9]:
train.info()

In [10]:
train.groupby("is_duplicate")['id'].count().plot.bar()

In [11]:
print("total number of questions for training are:- {}".format(len(train)))

In [12]:
# Share of each class in percent, rounded to two decimals.
# The precision argument must be passed to round(); it used to sit outside the
# round() call, so it became an ignored extra argument of format() and the
# percentage was rounded to a whole number.
n_pairs = len(train)
print('~> Question pairs are not Similar (is_duplicate = 0):\n   {}%'.format(round(len(train[train['is_duplicate']==0])/n_pairs * 100, 2)))
print('\n~> Question pairs are Similar (is_duplicate = 1):\n   {}%'.format(round(len(train[train['is_duplicate']==1])/n_pairs * 100, 2)))

In [13]:
# Pool the question ids of both columns into one Series, 'qids'
qids = pd.Series(train['qid1'].tolist() + train['qid2'].tolist())

# Occurrence count of every question id in the pooled Series (computed once, reused below)
qid_counts = qids.value_counts()

# Number of distinct question ids
unique_qs = len(np.unique(qids))

# Number of question ids that occur more than once
qs_morethan_onetime = np.sum(qid_counts > 1)

print ('Total number of  Unique Questions are: {}'.format(unique_qs))
print ('Number of unique questions that appear more than one time: {} ({}%)'.format(qs_morethan_onetime,qs_morethan_onetime/unique_qs*100))
print ('Max number of times a single question is repeated: {}'.format(max(qid_counts)))

In [14]:
# Bar chart comparing the number of distinct questions with the number of
# questions that occur more than once.
x = ["unique_questions" , "Repeated Questions"]
y = [unique_qs , qs_morethan_onetime]

plt.figure(figsize=(10, 6))
plt.title ("Plot representing unique and repeated questions  ")
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=x, y=y)
plt.show()

In [15]:
# Check for repeated (qid1, qid2) pairs: grouping collapses identical pairs into
# one row, so the number of surplus rows is total rows minus distinct pairs.
# (The operands used to be reversed, which can only yield zero or a negative number.)
duplicate_pairs = train[['qid1','qid2','is_duplicate']].groupby(['qid1','qid2']).count().reset_index()
print("total number of duplicate questions: {}".format(train.shape[0] - duplicate_pairs.shape[0]))

In [16]:
# Histogram of how many times each question id occurs (log-scaled y axis)
plt.figure(figsize=(20,10))
plt.hist(qids.value_counts(),bins=160)

# 'nonposy' was renamed to 'nonpositive' in matplotlib 3.3 and the old keyword
# was removed later; 'clip' keeps empty bins drawable on the log axis.
plt.yscale('log', nonpositive='clip')

plt.title("Occurence of questions")
plt.xlabel("number of occurence of question")
plt.ylabel("number of question")

print ('Maximum number of times a single question is repeated: {}\n'.format(max(qids.value_counts())))

In [17]:
#check for null values in an entire dataframe
null_rows = train[train.isnull().any(axis = 1)]
null_rows

In [18]:
#fill the nan values with a space
train = train.fillna(" ")
null_rows = train[train.isnull().any(axis = 1)]
print(null_rows)

**Basic Feature Extraction before cleaning**
* **freq_qid1** = Frequency of qid1 (number of rows sharing that qid1)
* **freq_qid2** = Frequency of qid2 (number of rows sharing that qid2)
* **q1len** = Length of question 1 in characters
* **q2len** = Length of question 2 in characters
* **q1_n_words** = Number of words in Question 1
* **q2_n_words** = Number of words in Question 2
* **word_Common** = Number of common unique words in Question 1 and Question 2
* **word_Total** = Total number of words in Question 1 + total number of words in Question 2
* **word_share** = word_Common / word_Total
* **freq_q1+q2** = Sum of the frequencies of qid1 and qid2
* **freq_q1-q2** = Absolute difference of the frequencies of qid1 and qid2

In [19]:
def _n_space_pieces(text):
    """Return how many pieces `text` yields when split on single spaces."""
    return len(text.split(" "))


# Number of rows in which each question id appears, per id column.
train['freq_qid1'] = train['qid1'].groupby(train['qid1']).transform('count')
train['freq_qid2'] = train['qid2'].groupby(train['qid2']).transform('count')

# Character length of each question.
q1_text = train['question1'].astype(str)
q2_text = train['question2'].astype(str)
train['q1len'] = q1_text.str.len()
train['q2len'] = q2_text.str.len()

# Word count of each question (split on single spaces).
train['q1_n_words'] = q1_text.apply(_n_space_pieces)
train['q2_n_words'] = q2_text.apply(_n_space_pieces)

In [20]:
train.head()

In [21]:
train.values

In [22]:
# Unique whitespace-separated words shared by the two questions of each pair.
# The text columns are addressed by name rather than by position in
# train.values, so the result does not depend on column order and no
# object array of the whole frame is materialised.
train['word_Overlap'] = [
    set(q1.split()) & set(q2.split())
    for q1, q2 in zip(train['question1'], train['question2'])
]
# .str.len() on a column of sets yields the size of each set.
train['word_Common'] = train['word_Overlap'].str.len()
train.head()

In [23]:
train['word_Total'] = train['question1'].str.split().map(len) + train['question2'].str.split().map(len)

In [24]:
train.head()

In [25]:
train.drop('word_Overlap' , axis = 1, inplace = True)
train.head()

In [26]:
train['word_share'] = train['word_Common'] / train['word_Total'] 

In [27]:
train.head()

In [28]:
train['freq_q1+q2'] = train['freq_qid1'] + train['freq_qid2']
train['freq_q1-q2'] = abs(train['freq_qid1'] - train['freq_qid2'])

In [29]:
train.head()

In [30]:
#Analysis of extracted features
print ("Minimum length of the questions in question1 : " , min(train['q1_n_words']))
print ("Minimum length of the questions in question2 : " , min(train['q2_n_words']))

In [31]:
print ("Number of Questions with minimum length [question1] :", train[train['q1_n_words']== 1].shape[0])
print ("Number of Questions with minimum length [question2] :", train[train['q2_n_words']== 1].shape[0])

In [32]:
# word_share by class: violin plot (left) and overlaid distributions (right).
plt.figure(figsize=(12, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_share', data = train)

plt.subplot(1,2,2)
sns.distplot(train.loc[train['is_duplicate'] == 1.0, 'word_share'], label = "1", color = 'red')
sns.distplot(train.loc[train['is_duplicate'] == 0.0, 'word_share'], label = "0", color = 'blue')
# Without a legend the labels passed above are never displayed.
plt.legend(title = 'is_duplicate')
plt.show()

In [33]:
# word_Common by class: violin plot (left) and overlaid distributions (right).
plt.figure(figsize=(12, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_Common', data = train)

plt.subplot(1,2,2)
sns.distplot(train.loc[train['is_duplicate'] == 1.0, 'word_Common'], label = "1", color = 'red')
sns.distplot(train.loc[train['is_duplicate'] == 0.0, 'word_Common'], label = "0", color = 'blue')
# Without a legend the labels passed above are never displayed.
plt.legend(title = 'is_duplicate')
plt.show()

In [34]:
train.head(2)

## Preprocessing of Text
* Removing HTML tags
* Remove punctuation
* Stemming
* Removing stop words
* Expanding contractions

In [35]:
STOP_WORDS = stopwords.words("english")

# Built once instead of on every preprocess() call.
_PORTER = PorterStemmer()
_NON_WORD = re.compile(r'\W')


def preprocess(x):
    """Normalise one question string for feature extraction.

    Steps, in order: lowercase, strip HTML markup, expand common
    contractions / currency and percent symbols, abbreviate trailing
    thousands and millions in numbers ("k" / "m"), replace every non-word
    character with a space, and finally pass the text through the Porter
    stemmer.

    Parameters
    ----------
    x : object
        The raw question; it is converted with ``str(x)`` first, so any
        value is accepted.

    Returns
    -------
    str
        The cleaned text.
    """
    x = str(x).lower()

    # Markup must be removed while '<', '>' and '&' are still present. This
    # step used to run after the \W substitution, where it could no longer
    # recognise any tag and tag names leaked into the text as words.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    # NOTE(review): text that merely looks like a tag (e.g. "a<b and c>d")
    # is dropped as markup too -- confirm this is acceptable for the data.
    x = BeautifulSoup(x, "html.parser").get_text()

    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)

    # Punctuation and every other non-word character become spaces.
    x = _NON_WORD.sub(' ', x)

    # NOTE(review): stem() is given the whole question as a single "word", as
    # before. Stemming token by token would also alter stop words and so
    # change the stop-word features computed against STOP_WORDS; left as is.
    x = _PORTER.stem(x)

    return x

<h2> 3.5 Advanced Feature Extraction (NLP and Fuzzy Features) </h2>

In [36]:
SAFE_DIV = 0.0001

In [37]:
!pip install --user distance

In [38]:
def get_token_features(q1, q2):
    """Compute ten token-overlap features for a pair of question strings.

    Both strings are split on whitespace. Tokens found in STOP_WORDS count
    as stop words, all others as content words.

    Returns a list of ten values, in this order:
        0 cwc_min        shared content words / smaller content-word set
        1 cwc_max        shared content words / larger content-word set
        2 csc_min        shared stop words / smaller stop-word set
        3 csc_max        shared stop words / larger stop-word set
        4 ctc_min        shared distinct tokens / shorter token count
        5 ctc_max        shared distinct tokens / longer token count
        6 last_word_eq   1 if both questions end with the same token, else 0
        7 first_word_eq  1 if both questions start with the same token, else 0
        8 abs_len_diff   absolute difference of the token counts
        9 mean_len       mean of the two token counts

    Every ratio has SAFE_DIV added to its denominator. If either question
    has no tokens, a list of ten 0.0 values is returned.
    """
    tokens_1 = q1.split()
    tokens_2 = q2.split()

    # Nothing to compare when one side is empty.
    if not tokens_1 or not tokens_2:
        return [0.0] * 10

    # Distinct content words (not in STOP_WORDS) on each side.
    content_1 = {tok for tok in tokens_1 if tok not in STOP_WORDS}
    content_2 = {tok for tok in tokens_2 if tok not in STOP_WORDS}

    # Distinct stop words on each side.
    stops_1 = {tok for tok in tokens_1 if tok in STOP_WORDS}
    stops_2 = {tok for tok in tokens_2 if tok in STOP_WORDS}

    shared_content = len(content_1 & content_2)
    shared_stops = len(stops_1 & stops_2)
    shared_tokens = len(set(tokens_1) & set(tokens_2))

    def _ratio(shared, size):
        # SAFE_DIV keeps the denominator non-zero.
        return shared / (size + SAFE_DIV)

    n_content = (len(content_1), len(content_2))
    n_stops = (len(stops_1), len(stops_2))
    n_tokens = (len(tokens_1), len(tokens_2))

    return [
        _ratio(shared_content, min(n_content)),   # cwc_min
        _ratio(shared_content, max(n_content)),   # cwc_max
        _ratio(shared_stops, min(n_stops)),       # csc_min
        _ratio(shared_stops, max(n_stops)),       # csc_max
        _ratio(shared_tokens, min(n_tokens)),     # ctc_min
        _ratio(shared_tokens, max(n_tokens)),     # ctc_max
        int(tokens_1[-1] == tokens_2[-1]),        # last_word_eq
        int(tokens_1[0] == tokens_2[0]),          # first_word_eq
        abs(n_tokens[0] - n_tokens[1]),           # abs_len_diff
        (n_tokens[0] + n_tokens[1]) / 2,          # mean_len
    ]

# Longest-common-substring ratio
def get_longest_substr_ratio(a, b):
    """Return len(first common substring) / (min(len(a), len(b)) + 1).

    The substrings come from ``distance.lcsubstrings(a, b)``; the first one
    of the listed results is used. Returns 0 when there is none.
    """
    common = list(distance.lcsubstrings(a, b))
    if not common:
        return 0
    shorter = min(len(a), len(b))
    return len(common[0]) / (shorter + 1)

def extract_features(df):
    """Clean both question columns and append token and fuzzy features.

    The frame is modified in place: "question1" / "question2" are replaced
    by their preprocess()-ed text (missing values become ""), then ten token
    features from get_token_features() and five string-similarity features
    (four fuzz scorers plus get_longest_substr_ratio) are added as columns.
    The same frame is returned.
    """
    # Clean the raw text of both questions.
    for text_col in ("question1", "question2"):
        df[text_col] = df[text_col].fillna("").apply(preprocess)

    print("token features...")

    # One list of ten values per row.
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)

    # Column names in the order get_token_features() returns its values.
    token_columns = (
        "cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [row_values[position] for row_values in token_features]

    # Fuzzy string-similarity scores for every pair.
    print("fuzzy features..")

    def _score_pairs(scorer):
        # Apply a two-string scorer to each (question1, question2) row.
        return df.apply(lambda row: scorer(row["question1"], row["question2"]), axis=1)

    df["token_set_ratio"]       = _score_pairs(fuzz.token_set_ratio)
    # token_sort_ratio tokenizes each string, sorts the tokens alphabetically,
    # joins them back together and compares the results with a simple ratio().
    df["token_sort_ratio"]      = _score_pairs(fuzz.token_sort_ratio)
    df["fuzz_ratio"]            = _score_pairs(fuzz.QRatio)
    df["fuzz_partial_ratio"]    = _score_pairs(fuzz.partial_ratio)
    df["longest_substr_ratio"]  = _score_pairs(get_longest_substr_ratio)
    return df

In [39]:
from nltk.corpus import stopwords
# This package is used for finding longest common subsequence between two strings
# you can write your own dp code for this
import distance
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from sklearn.manifold import TSNE
# Import the Required lib packages for WORD-Cloud generation
# https://stackoverflow.com/questions/45625434/how-to-install-wordcloud-in-python3-6
from wordcloud import WordCloud, STOPWORDS
from os import path
from PIL import Image

In [40]:
train = extract_features(train)
train.head()

In [41]:
train.head(2)

# Analysis of Extracted Features

In [42]:
train_duplicate = train[train['is_duplicate'] == 1]
train_nonduplicate = train[train['is_duplicate'] == 0]

In [43]:
# Converting 2d array of q1 and q2 and flatten the array: like {{1,2},{3,4}} to {1,2,3,4}
p = np.dstack([train_duplicate["question1"], train_duplicate["question2"]]).flatten()
n = np.dstack([train_nonduplicate["question1"], train_nonduplicate["question2"]]).flatten()

In [44]:
print ("Number of data points in class 1 (duplicate pairs) :",len(p))
print ("Number of data points in class 0 (non duplicate pairs) :",len(n))

In [45]:
#Saving the np array into a text file
np.savetxt('train_p.txt', p, delimiter=' ', fmt='%s')
np.savetxt('train_n.txt', n, delimiter=' ', fmt='%s')

In [46]:
# reading the text files and removing the Stop Words:
#d = path.dirname('.')

textp_w = open('train_p.txt').read()
textn_w = open('train_n.txt').read()
stopwords = set(STOPWORDS)
stopwords.add("said")
stopwords.add("br")
stopwords.add(" ")
stopwords.remove("not")

stopwords.remove("no")
#stopwords.remove("good")
#stopwords.remove("love")
stopwords.remove("like")
#stopwords.remove("best")
#stopwords.remove("!")
print ("Total number of words in duplicate pair questions :",len(textp_w))
print ("Total number of words in non duplicate pair questions :",len(textn_w))

In [47]:
wc = WordCloud(background_color="white", max_words=len(textp_w), stopwords=stopwords)
wc.generate(textp_w)
print ("Word Cloud for Duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [48]:
wc = WordCloud(background_color="white", max_words=len(textn_w), stopwords=stopwords)
wc.generate(textp_w)
print ("Word Cloud for Non Duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [49]:
n = train.shape[0]
sns.pairplot(train[['ctc_min', 'cwc_min', 'csc_min', 'token_sort_ratio', 'is_duplicate']][0:n], hue='is_duplicate', vars=['ctc_min', 'cwc_min', 'csc_min', 'token_sort_ratio'])
plt.show()

In [50]:
train.is_duplicate.value_counts()

In [51]:
# Distribution of the token_sort_ratio
# token_sort_ratio by class: violin plot (left) and overlaid distributions (right).
plt.figure(figsize=(10, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'token_sort_ratio', data = train)

plt.subplot(1,2,2)
sns.distplot(train.loc[train['is_duplicate'] == 1.0, 'token_sort_ratio'], label = "1", color = 'red')
sns.distplot(train.loc[train['is_duplicate'] == 0.0, 'token_sort_ratio'], label = "0", color = 'blue')
# Without a legend the labels passed above are never displayed.
plt.legend(title = 'is_duplicate')
plt.show()

In [52]:
# fuzz_ratio by class: violin plot (left) and overlaid distributions (right).
plt.figure(figsize=(10, 8))

plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'fuzz_ratio', data = train)

plt.subplot(1,2,2)
sns.distplot(train.loc[train['is_duplicate'] == 1.0, 'fuzz_ratio'], label = "1", color = 'red')
sns.distplot(train.loc[train['is_duplicate'] == 0.0, 'fuzz_ratio'], label = "0", color = 'blue')
# Without a legend the labels passed above are never displayed.
plt.legend(title = 'is_duplicate')
plt.show()

In [53]:
train.shape

In [54]:
train.head(1)

In [55]:
pip install spacy

In [56]:
# One text per pair for fitting Word2Vec / TF-IDF. The separating space stops
# the last word of question1 from fusing with the first word of question2.
questions = train['question1'] + " " + train['question2']

In [57]:
# Plain Python list of the combined question texts (tokenized in a later cell).
list_of_sentence = list(questions)

In [58]:
import nltk

In [59]:
list_of_sentence = [nltk.word_tokenize(sent) for sent in list_of_sentence]

In [60]:
import gensim
#from gensim.models import Word2Vec
w2vmodel = gensim.models.Word2Vec(list_of_sentence,min_count = 5, vector_size = 50, workers = 4)


In [61]:
print(w2vmodel.wv.most_similar('sun'))

In [62]:
print(w2vmodel.wv.most_similar('best'))

In [63]:
w2v_words = list(w2vmodel.wv.key_to_index)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

In [64]:
# Untokenized copy of the combined question texts for the TF-IDF vectorizer
# (list_of_sentence now holds token lists).
list_of_sentences = list(questions)

In [65]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF over the combined question texts; terms in fewer than 10 documents are dropped.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,1), min_df=10)
tf_idf_vect.fit(list_of_sentences)
# vocabulary_ maps term -> column index; ordering the terms by index gives the
# feature-name list without get_feature_names(), which scikit-learn 1.2 removed.
sample_features = sorted(tf_idf_vect.vocabulary_, key=tf_idf_vect.vocabulary_.get)[0:10]
print("some sample features(unique words in the corpus)",sample_features)
print('='*50)

final_tf_idf = tf_idf_vect.transform(list_of_sentences)
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
# ngram_range=(1,1) produces unigrams only, so the message no longer mentions bigrams.
print("the number of unique words (unigrams only) ", final_tf_idf.get_shape()[1])

In [66]:
# term -> idf weight. vocabulary_ gives each term's column index into idf_,
# which avoids get_feature_names() (removed in scikit-learn 1.2).
dictionary = {term: tf_idf_vect.idf_[col] for term, col in tf_idf_vect.vocabulary_.items()}

In [67]:
dictionary['step']

In [79]:
q1_feat = [nltk.word_tokenize(sent) for sent in train['question1']]
q2_feat = [nltk.word_tokenize(sent) for sent in train['question2']]

In [81]:
import tqdm
# TF-IDF weighted Word2Vec for question1
# Feature names ordered by column index, built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2.
tfidf_feat = sorted(tf_idf_vect.vocabulary_, key=tf_idf_vect.vocabulary_.get) # tfidf words/col-names

# Sets make each membership test O(1); testing against the lists scanned the
# whole vocabulary for every token of every question.
w2v_vocab = set(w2v_words)
tfidf_vocab = set(tfidf_feat)
vector_dim = w2vmodel.wv.vector_size  # dimensionality of the trained word vectors

tfidf_sent_vectors_q1 = []  # one tfidf-weighted sentence vector per question1
for sent in tqdm.tqdm(q1_feat):  # each tokenized question
    sent_vec = np.zeros(vector_dim)  # accumulator with the same length as a word vector
    weight_sum = 0  # sum of the tf-idf weights of the words that contributed
    for word in sent:
        if word in w2v_vocab and word in tfidf_vocab:
            vec = w2vmodel.wv[word]
            # dictionary[word]           = idf of the word over the whole corpus
            # sent.count(word)/len(sent) = tf of the word within this question
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors_q1.append(sent_vec)

In [82]:
tfidf_sent_vectors_q1[0:2]

In [84]:
import tqdm
# TF-IDF weighted Word2Vec for question2
# Feature names ordered by column index, built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2.
tfidf_feat = sorted(tf_idf_vect.vocabulary_, key=tf_idf_vect.vocabulary_.get) # tfidf words/col-names

# Sets make each membership test O(1); testing against the lists scanned the
# whole vocabulary for every token of every question.
w2v_vocab = set(w2v_words)
tfidf_vocab = set(tfidf_feat)
vector_dim = w2vmodel.wv.vector_size  # dimensionality of the trained word vectors

tfidf_sent_vectors_q2 = []  # one tfidf-weighted sentence vector per question2
for sent in tqdm.tqdm(q2_feat):  # each tokenized question
    sent_vec = np.zeros(vector_dim)  # accumulator with the same length as a word vector
    weight_sum = 0  # sum of the tf-idf weights of the words that contributed
    for word in sent:
        if word in w2v_vocab and word in tfidf_vocab:
            vec = w2vmodel.wv[word]
            # dictionary[word]           = idf of the word over the whole corpus
            # sent.count(word)/len(sent) = tf of the word within this question
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors_q2.append(sent_vec)

In [85]:
tfidf_sent_vectors_q2[0:2]

In [86]:
df = pd.DataFrame()
df['q1_feat'] = list(tfidf_sent_vectors_q1)

In [87]:
df.head()

In [88]:
df['q2_feat'] = tfidf_sent_vectors_q2
df.head()

In [91]:
df3_q1 = pd.DataFrame(df.q1_feat.values.tolist(), index= df.index)
df3_q2 = pd.DataFrame(df.q2_feat.values.tolist(), index= df.index)

In [94]:
df3_q1.head()

In [95]:
train.head()

In [96]:
df3_q2.head()

In [97]:
train.columns

In [98]:
train.drop(['qid1', 'qid2', 'question1', 'question2'],inplace = True, axis = 1)
#train.head

In [99]:
train.head()

In [100]:
# Give both sentence-vector frames the pair id, then left-join them onto
# train one after the other, using 'id' as the key.
pair_ids = train['id']
df3_q1['id'] = pair_ids
df3_q2['id'] = pair_ids
train = (
    train
    .merge(df3_q1, on='id', how='left')
    .merge(df3_q2, on='id', how='left')
)
train.head()

In [101]:
train.head()

In [102]:
train.drop('is_duplicate', inplace= True, axis = 1)

In [111]:
# Re-read the original training file (used below to recover the labels).
# The previous empty-DataFrame assignment was dead code: it was overwritten immediately.
train_new = pd.read_csv('../input/quora-question-pairs/train.csv.zip')

In [112]:
y = train_new['is_duplicate']

In [114]:
test.head()