In [121]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from text_processor import TextProcessor
from gensim.models.doc2vec import Doc2Vec


pal = sns.color_palette()

## Loading Data

In [3]:
# Read the training set from the relative path data/train.csv into a DataFrame.
df_train = pd.read_csv('data/train.csv')

In [4]:
# Label column and the two text columns used as model inputs.
target = 'is_duplicate'
features = ['question1', 'question2']

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and works on old and new pandas alike.
X = df_train[features].values
y = df_train[target].values

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Extraction

In [59]:
# Shared TextProcessor instance; its tokenize() method is used by the feature
# extraction and adversarial-search cells below.
tp = TextProcessor()

In [81]:
from nltk.corpus import stopwords

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# English stop words; after this line the reader is no longer reachable by name.
stopwords = set(stopwords.words("english"))

def words_in_common_similarity(q1, q2):
    """
    Jaccard-style word overlap between two questions.

    The score is the number of distinct tokens that appear in both questions
    divided by the number of distinct tokens that appear in either one.
    Both questions are lower-cased before tokenizing and stop words are excluded.

    Parameters
    ----------
    q1, q2 : str
        Question texts. Non-string values (e.g. NaN from a missing row) are
        converted with str() first, matching how the training features are built.

    Returns
    -------
    float
        Overlap ratio in [0, 1]. Returns 0.0 when neither question has any
        token left after stop-word removal.
    """
    q1_words = {w for w in tp.tokenize(str(q1).lower()) if w not in stopwords}
    q2_words = {w for w in tp.tokenize(str(q2).lower()) if w not in stopwords}

    total_words = len(q1_words | q2_words)
    if total_words == 0:
        # Both questions are empty or consist only of stop words; without this
        # guard the division below raises ZeroDivisionError.
        return 0.0

    return len(q1_words & q2_words) / total_words

## Predictions

In [82]:
# One similarity score per training pair, shaped (n_samples, 1) for the estimator.
# Each row is [question1, question2]; both are passed through str() before scoring.
X_train_num = np.apply_along_axis(
    lambda pair: words_in_common_similarity(str(pair[0]), str(pair[1])),
    axis=1,
    arr=X_train,
).reshape(-1, 1)

In [113]:
# Linear regression with default settings, fitted on the single overlap feature.
clf = LinearRegression()

In [114]:
%%time
# Fit the regressor on the word-overlap feature against the duplicate labels.
clf.fit(X_train_num, y_train)

CPU times: user 14.4 ms, sys: 28.4 ms, total: 42.8 ms
Wall time: 50.4 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [85]:
# Build the held-out feature column exactly like the training one: both
# questions go through str() so a missing (NaN) question does not crash on
# .lower() and train/test features are computed consistently.
X_test_num = np.apply_along_axis(
    lambda x: words_in_common_similarity(str(x[0]), str(x[1])), 1, X_test
).reshape(-1, 1)

In [115]:
# Continuous regression outputs for the held-out pairs (not hard 0/1 labels).
y_pred = clf.predict(X_test_num)

In [116]:
# Log loss of the regression outputs against the held-out labels.
# NOTE(review): LinearRegression outputs are not constrained to [0, 1];
# confirm they are valid probabilities before trusting this metric.
log_loss(y_test, y_pred)

0.565030182540562

In [92]:
def get_features(X):
    """
    Turn an array of question pairs into the model's single-column feature matrix.

    Parameters
    ----------
    X : array-like of shape (n_pairs, 2)
        Each row holds [question1, question2].

    Returns
    -------
    numpy.ndarray of shape (n_pairs, 1)
        Word-overlap similarity for every pair.
    """
    # str() mirrors how the training features are built, so non-string
    # entries (e.g. NaN) do not fail on .lower() inside the similarity function.
    return np.apply_along_axis(
        lambda pair: words_in_common_similarity(str(pair[0]), str(pair[1])), 1, X
    ).reshape(-1, 1)

In [111]:
# Sanity check: compute the overlap feature for one hand-written pair.
# The commented-out variant below is the original pair (see X[79]) before
# "purpose" was swapped for "meaning" in the second question.
# x = np.array(['What is purpose of life?', "What's the purpose of life? What is life actually about?"])
x = np.array(['What is purpose of life?', "What's the meaning of life? What is life actually about?"])
# Reshape to a single row of two columns, the layout get_features expects.
x = get_features(x.reshape(1, -1))
x
# model.predict(x)
# np.array(["What is your name?", "What is your name"]).reshape(1, -1)

array([[0.33333333]])

In [104]:
# Show the raw question pair stored at row 79 of the feature matrix.
X[79]

array(['What is purpose of life?',
       "What's the purpose of life? What is life actually about?"],
      dtype=object)

In [94]:
# Preview the first 100 rows of the training frame.
df_train[:100]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


## Load Doc2Vec Model (Word Vectors)

In [124]:
# Load a previously saved gensim Doc2Vec model from models/doc2vec.
# Despite the variable name, this is a Doc2Vec object; only its word vectors (.wv) are used below.
w2v = Doc2Vec.load('models/doc2vec')

In [159]:
# List the 10 nearest neighbours of "purpose" in the embedding space.
w2v.wv.most_similar(['purpose'], topn=10)

[('importance', 0.628302812576294),
 ('essence', 0.5861237049102783),
 ('meaning', 0.5789660811424255),
 ('motive', 0.5542106628417969),
 ('significance', 0.5441163778305054),
 ('origin', 0.534876823425293),
 ('definition', 0.530587911605835),
 ('aim', 0.5116595029830933),
 ('Importance', 0.5113579630851746),
 ('derivation', 0.5106101632118225)]

## Adversarial White Box Algorithm



In [133]:
def get_closest_word(word):
    """Return the top-ranked neighbour of `word` as reported by `w2v.wv.most_similar`."""
    neighbours = w2v.wv.most_similar([word], topn=1)
    # Each entry is a (token, similarity) pair; only the token is needed.
    best_match = neighbours[0]
    return best_match[0]

In [155]:
def model(q1, q2, clf):
    """
    Score a tokenized question pair with a fitted estimator.

    q1, q2 : sequences of string tokens; each is joined with single spaces.
    clf    : object exposing predict(), applied to the one-row feature matrix.

    Returns whatever `clf.predict` returns for that single row.
    """
    sentences = [' '.join(tokens) for tokens in (q1, q2)]
    pair = np.array(sentences).reshape(1, -1)
    return clf.predict(get_features(pair))

In [156]:
def adversarial_change(q1, q2, model, max_replacements=5, threshold=0.5):
    """
    Greedy white-box attack on the similarity classifier.

    Initially, q1 and q2 are expected to be detected as similar by the classifier.
    Words of q2 are replaced one at a time by their nearest embedding neighbour,
    so that q2 keeps roughly the same meaning but is now detected as not similar
    to q1.

    Parameters
    ----------
    q1, q2 : str
        Raw question texts; both are tokenized with `tp.tokenize`.
    model : callable
        Called as ``model(q1_tokens, q2_tokens, clf)`` and must return a score
        comparable with ``<`` (the module-level `clf` is passed through).
    max_replacements : int, optional
        Upper bound on the number of word substitutions applied to q2.
    threshold : float, optional
        Score below which the pair counts as "not similar".

    Returns
    -------
    None
        The outcome is printed.
    """
    q1_tokenized = tp.tokenize(q1)
    q2_tokenized = tp.tokenize(q2)
    replaced_words = 0

    # Score of the current q2; updated only when a substitution is accepted,
    # so the model is not re-evaluated on an unchanged pair.
    current_score = model(q1_tokenized, q2_tokenized, clf)

    # Stop on success OR when the substitution budget is exhausted. The
    # previous condition (`not successful or replaced_words >= 5`) never
    # advanced the counter and looped forever when no substitution helped.
    while not current_score < threshold and replaced_words < max_replacements:
        # Try changing each word in q2 and keep the single change that lowers
        # the score the most.
        best_score = current_score
        best_q2 = None

        for i, word in enumerate(q2_tokenized):
            if word not in w2v.wv.vocab:
                continue
            q2_modified = list(q2_tokenized)
            q2_modified[i] = get_closest_word(word)
            score = model(q1_tokenized, q2_modified, clf)

            if score < best_score:
                best_score = score
                best_q2 = q2_modified

        if best_q2 is None:
            # No single substitution lowers the score; further passes would
            # evaluate exactly the same candidates, so give up.
            break

        q2_tokenized = best_q2
        current_score = best_score
        replaced_words += 1

    if current_score < threshold:
        print("q1: '{}'".format(' '.join(q1_tokenized)))
        print("q2: '{}'".format(' '.join(q2_tokenized)))
        print("Similarity: {}".format(current_score))
    else:
        print("No adversarial change found after {} replacement(s); similarity: {}".format(
            replaced_words, current_score))

In [160]:
# Run the attack on the pair stored at X[79].
adversarial_change('What is purpose of life?', "What's the purpose of life? What is life actually about?", model)

q1: 'What is purpose of life ?'
q2: 'What's the importance of life ? What is life actually about ?'
Similarity: [0.28415551]
