In [1]:
from time import time
start_nb = time()

In [3]:
# Initialize logging.

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'
sentence_obama = sentence_obama.lower().split()
sentence_president = sentence_president.lower().split()

In [4]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.

# Remove stopwords.
stop_words = stopwords.words('english')
sentence_obama = [w for w in sentence_obama if w not in stop_words]
sentence_president = [w for w in sentence_president if w not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darpa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
start = time()
import os
import pyemd
global PYEMD_EXT
try:
    from pyemd import emd
    PYEMD_EXT = True
except ImportError:
    PYEMD_EXT = False
import gensim
from gensim.similarities import WmdSimilarity
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import TaggedLineDocument
from gensim.models import Word2Vec


In [6]:
#GoogleNews-vectors-negative300.bin.gz
if not os.path.exists('data/GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("SKIP: You need to download the google news model")
    
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

print('Cell took %.2f seconds to run.' % (time() - start))



Cell took 148.68 seconds to run.


In [7]:
distance = model.wmdistance(sentence_obama, sentence_president)
print(distance)

3.3741233214730024


In [8]:
sentence_orange = 'Oranges are a great fruits'
sentence_orange = sentence_orange.lower().split()
sentence_orange = [w for w in sentence_orange if w not in stop_words]

distance = model.wmdistance(sentence_obama, sentence_orange)
print(distance)

4.380239402988511


In [9]:
start = time()

model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

# After normalization
distance1 = model.wmdistance(sentence_obama, sentence_president)  # Compute WMD as normal.
distance2 = model.wmdistance(sentence_obama, sentence_orange)

print(distance1)
print(distance2)

1.0174646259300113
1.3663488311444436


In [12]:
# Read train data
import pandas as pd

train_data = pd.read_csv("data/train.csv", delimiter=',', encoding="utf-8-sig")
train_data.dropna(inplace=True)

In [25]:
# With this done, let's apply this model on our data of Quora Questions

import numpy as np
import pandas as pd
import nltk
import re
import random
import collections
from nltk.corpus import stopwords

def question_to_words(text, lower=False, remove_not=False):
    lemmetizer = nltk.WordNetLemmatizer()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    if lower == True:
        text = text.lower()
    
    words = nltk.word_tokenize(text)
    
    
    stops = set(stopwords.words("english"))
    not_stop = ['no', 'nor', 'not', "don't", "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
                'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn',
                "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
                "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    if remove_not == True:
        # stop = stops.difference(not_stop)
        stops = ['the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what', 'which', 'this', 'that', 'these',
                 'those', 'then',
                 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while', 'during', 'to',
                 'What', 'Which',
                 'Is', 'If', 'While', 'This']
    
    meaningful_words = [w for w in words if not w in stops]

    meaningful_words = [lemmetizer.lemmatize(w) for w in meaningful_words]
    
    return meaningful_words
    # return (" ".join(meaningful_words))
    
from nltk.stem import SnowballStemmer
import re
from string import punctuation
    
stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

def text_to_wordlist(text, remove_stop_words=True, stem_words=True):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)
    

def que_2_word_simple(question):
    question = re.sub("[^a-zA-Z]", " ", question).lower().split()
    question = [w for w in question if question not in stop_words]
    return question

In [26]:
# Prepare data for experiment

import copy
from sklearn.metrics import *

exp_data = copy.deepcopy(train_data[:10000])

# exp_data = train_data

exp_data['question1'] = exp_data['question1'].apply(question_to_words)
exp_data['question2'] = exp_data['question2'].apply(question_to_words)

exp_results = []
total_correct = 0

for j in range(len(exp_data)):
    q1 = exp_data.iloc[j].question1
    q2 = exp_data.iloc[j].question2
    
    #q1 = re.sub("[^a-zA-Z]", " ", q1).lower().split()
    #q1 = [w for w in q1 if w not in stop_words]
    
    #q2 = re.sub("[^a-zA-Z]", " ", q2).lower().split()
    #q2 = [w for w in q2 if w not in stop_words]

    
    dist = model.wmdistance(q1, q2)
    
    if(dist > 2):
        continue
        
    mean_nondup = 0.77
    mean_dup = 0.47
    
    if dist <= 0.60:
        dup_bin = 1
    else:
        dup_bin = 0
    
    if dup_bin == exp_data.iloc[j].is_duplicate:
        total_correct = total_correct + 1
    
    exp_results.append([exp_data.iloc[j].id, q1, q2, dist, dup_bin, exp_data.iloc[j].is_duplicate])
        
    
print(total_correct/len(exp_data))   
    
    
exp_results_df = pd.DataFrame(exp_results, columns = ['id', 'q1', 'q2', 'wmd_dist', 'dup_bin', 'is_duplicate']) 


print("f1 " + str(f1_score(exp_results_df['is_duplicate'], exp_results_df['dup_bin'])))
print("accu " + str(accuracy_score(exp_results_df['is_duplicate'], exp_results_df['dup_bin'])))
# exp_results_df

0.6675
f1 0.6101854024876789
accu 0.6677003100930279


In [90]:
duplicates = exp_results_df.loc[exp_results_df.is_duplicate==1, ['wmd_dist']]
nondupes = exp_results_df.loc[exp_results_df.is_duplicate==0, ['wmd_dist']]

duplicates.describe()

Unnamed: 0,wmd_dist
count,1907.0
mean,0.450566
std,0.270715
min,0.0
25%,0.270481
50%,0.453403
75%,0.649198
max,1.367003
