In [2]:
import pandas as pd
import numpy as np

# Read the train and test splits from their CSV files under data/.
train_csv, test_csv = "data/train.csv", "data/test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Size of each dataset: number of rows ('length') and number of columns ('attributes')

pd_index   = ['train', 'test']
pd_columns = ['length', 'attributes']
# DataFrame.shape is (rows, columns), i.e. the same pair as len(df) and len(df.keys())
pd_data    = [list(frame.shape) for frame in (df_train, df_test)]

pd.DataFrame(data = pd_data, columns = pd_columns, index = pd_index)

Unnamed: 0,length,attributes
train,404290,6
test,2345796,3


In [4]:
# Preview the first five rows of the training data

train_subset = df_train.head(5)
train_subset

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# Preview the first five rows of the test data
test_subset = df_test.head(5)
test_subset

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [6]:
# Class balance of the training labels: duplicate vs. non-duplicate pairs
n_total = len(df_train)
n_dup   = df_train.is_duplicate.sum()   # labels are summed, so this counts the 1s

pd_index   = ['train']
pd_columns = ['duplicate', 'not duplicate', 'total', '%duplication']

# Single row: duplicate count, remaining count, total, and duplicate share in percent
pd_data    = [[n_dup, n_total - n_dup, n_total, n_dup / n_total * 100]]

pd.DataFrame(pd_data, index = pd_index, columns = pd_columns)

Unnamed: 0,duplicate,not duplicate,total,%duplication
train,149263,255027,404290,36.919785


In [7]:
# Drop duplicate rows and rows with missing values, reporting the row count before and after

len_before = df_train.shape[0]

# Both calls mutate df_train in place
df_train.drop_duplicates(inplace=True)
df_train.dropna(inplace=True)

pd_index   = ['before cleaning', 'after cleaning']
pd_columns = ['length']
pd_data    = [len_before, df_train.shape[0]]

pd.DataFrame(data = pd_data, columns = pd_columns, index = pd_index)

Unnamed: 0,length
before cleaning,404290
after cleaning,404287


In [16]:
# Split the cleaned training data, fit the vectorizer, and separate the pairs by label
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import copy

# Positional split: the first 300000 rows are used for training, the rest are held out
df_train_part_1 = copy.deepcopy(df_train.iloc[:300000])
df_train_part_2 = copy.deepcopy(df_train.iloc[300000:])

print(df_train_part_1.shape, df_train_part_2.shape, sep="\n")

# From here on df_train refers to the training portion only
df_train = df_train_part_1
print(df_train.shape)

# Learn the vocabulary from every question (both columns) in the training portion
question_cols = ['question1', 'question2']
corpus = pd.concat([df_train[col] for col in question_cols])

cv = CountVectorizer().fit(corpus)

# Separate the question pairs by label: 1 = duplicate, 0 = not duplicate
labels = df_train.is_duplicate
duplicates = df_train.loc[labels == 1, question_cols]
nondupes = df_train.loc[labels == 0, question_cols]

(300000, 6)
(104287, 6)
(300000, 6)


In [17]:
# Cosine similarity between the count vectors of each non-duplicate question pair
non_dupes_csim = [
    cosine_similarity(cv.transform([first]), cv.transform([second])).ravel()[0]
    for first, second in zip(nondupes.question1, nondupes.question2)
]

In [18]:
# Cosine similarity between the count vectors of each duplicate question pair
duplicates_csim = []

for first, second in zip(duplicates['question1'], duplicates['question2']):
    first_counts = cv.transform([first])
    second_counts = cv.transform([second])

    # cosine_similarity returns a 1x1 matrix here; pull out its single value
    duplicates_csim.append(cosine_similarity(first_counts, second_counts).ravel()[0])

In [19]:
# Average cosine similarity over all non-duplicate pairs
mean_nondup = np.asarray(non_dupes_csim).mean()
mean_nondup

0.434002797142203

In [20]:
# Average cosine similarity over all duplicate pairs
mean_dup = np.asarray(duplicates_csim).mean()
mean_dup

0.6245984483328468

In [None]:
# Using the mean similarities computed above, predict duplicates for the held-out portion of the training data

In [23]:
# Switch evaluation over to the held-out slice of the training data.
# Note: this rebinds df_test, which previously held the frame read from data/test.csv.
df_test = df_train_part_2

print(df_test.shape)

# Refit the vectorizer so that its vocabulary comes from the held-out questions
test_corpus = pd.concat([df_test[col] for col in ('question1', 'question2')])

cv = CountVectorizer().fit(test_corpus)

(104287, 6)


In [25]:
# Classify each held-out pair by whichever class mean its cosine similarity is nearer to
test_sim = []
total_correct = 0

for j in range(len(df_test)):
    row = df_test.iloc[j]

    q1_counts = cv.transform([row.question1])
    q2_counts = cv.transform([row.question2])
    cosine_sim = cosine_similarity(q1_counts, q2_counts).ravel()[0]

    # Non-duplicate (0) when the similarity is at or below the non-duplicate mean, or
    # when it is below the duplicate mean and strictly closer to the non-duplicate mean;
    # duplicate (1) in every other case.
    nearer_nondup = abs(cosine_sim - mean_nondup) < abs(cosine_sim - mean_dup)
    below_dup_mean = not cosine_sim >= mean_dup
    dup_bin = 0 if (cosine_sim <= mean_nondup or (below_dup_mean and nearer_nondup)) else 1

    # 1 when the prediction matches the label, otherwise 0
    correct_classify = 1 if dup_bin == row.is_duplicate else 0
    total_correct += correct_classify

    test_sim.append([row.id, dup_bin, row.is_duplicate, correct_classify])


# Fraction of held-out pairs classified correctly
print(total_correct/len(df_test))

# Per-pair results: id, predicted label, true label, and whether the two agree
test_sim_df = pd.DataFrame(test_sim, columns = ['id', 'dup_bin', 'is_duplicate', 'correct_classify'])
test_sim_df

0.6592000920536596


Unnamed: 0,id,dup_bin,is_duplicate,correct_classify
0,300002,0,0,1
1,300003,0,0,1
2,300004,0,0,1
3,300005,0,0,1
4,300006,0,1,0
5,300007,1,0,0
6,300008,1,1,1
7,300009,0,1,0
8,300010,1,1,1
9,300011,1,1,1


In [None]:
# Write the per-pair results to submission.csv.
# index=False keeps the pandas row index out of the file, so the CSV contains only the
# named columns instead of an extra unnamed leading column.
# NOTE(review): test_sim_df holds results for the held-out training rows (columns id,
# dup_bin, is_duplicate, correct_classify), not predictions for data/test.csv — confirm
# this is the intended content before submitting to Kaggle.

test_sim_df.to_csv(r'submission.csv', index=False)