In [66]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from text_processor import TextProcessor
from gensim.models.doc2vec import Doc2Vec
import gensim
from sklearn.metrics import accuracy_score

from util import get_balanced_data
from word2vec import Word2Vec
from adversarial_algos import adversarial_white_box_change
from word_in_common_feature_extractor import WordsInCommonFeatureExtractor

%load_ext autoreload

%autoreload 2


pal = sns.color_palette()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading Data

In [3]:
X_train, X_test, y_train, y_test = get_balanced_data()

We have 298526 positive samples
We have 298526 negative samples


## Feature Extraction

In [67]:
tp = TextProcessor()

In [68]:
feature_extractor = WordsInCommonFeatureExtractor()

In [69]:
feature_extractor.get_features_for_sample("hello", "hallo")

[0.0]

In [70]:
# from nltk.corpus import stopwords

# stopwords = set(stopwords.words("english"))

# def words_in_common_similarity(q1, q2):
#     """
#     The similarity score is the ratio of words that appear in both questions over words which appear in either.
#     Stop words are excluded.
#     """
#     q1_words = set([w for w in tp.tokenize(q1.lower()) if w not in stopwords])
#     q2_words = set([w for w in tp.tokenize(q2.lower()) if w not in stopwords])
    
#     words_in_common = len(q1_words & q2_words)
#     total_words = len(q1_words | q2_words) 
    
#     frac_words_in_common = words_in_common / total_words
    
#     return frac_words_in_common

## Predictions

In [71]:
X_train_num = feature_extractor.get_features(X_train)

In [None]:
X_train_num

In [72]:
clf = LinearRegression()

In [73]:
%%time
clf.fit(X_train_num, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [74]:
X_test_num = feature_extractor.get_features(X_test)

In [75]:
y_pred = clf.predict(X_test_num)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
log_loss(y_test, y_pred)

In [None]:
y_pred_bin = [0 if y < 0.38 else 1 for y in y_pred]

In [None]:
accuracy_score(y_test, y_pred_bin)

In [66]:
def get_features(X):
    return np.apply_along_axis(lambda x: words_in_common_similarity(x[0], x[1]), 1, X).reshape(-1, 1)

## Load Word2Vec

In [67]:
%%time
w2v = Word2Vec()

CPU times: user 41 s, sys: 3.26 s, total: 44.2 s
Wall time: 44.2 s


## Adversarial White Box Algorithm



In [97]:
def model(q1, q2):
    x = np.array([q1, q2]).reshape(1, -1)
    x_num = get_features(x)
    
    return clf.predict(x_num)

In [98]:
# def adversarial_change(q1, q2, model):
#     """
#     Initially, q1 and q2 are detected as similar by the classifier.
#     Change q2 so that it retains the same meaning and is now detected as not similar to q2
#     """    
#     q1_tokenized = tp.tokenize(q1)
#     q2_tokenized = tp.tokenize(q2)
#     successful = False
#     replaced_words = 0
#     print("initial q1: {}".format(q1))
#     print("initial q2: {}".format(q2))
#     print("Initial similarity is {}".format(model(q1_tokenized, q2_tokenized, clf)))
    
#     while not successful or replaced_words >= 5:
#         # Try changing each word in q2. At the end, select the change that gives the best improvement
#         min_score = model(q1_tokenized, q2_tokenized, clf)
#         new_q2 = q2_tokenized
        
#         for i, word in enumerate(q2_tokenized):
#             if word in w2v.model.vocab:
#                 closest_word = w2v.get_closest_word(word)
#                 q2_modified = list(q2_tokenized)
#                 q2_modified[i] = closest_word
#                 score = model(q1_tokenized, q2_modified, clf)

#                 if score < min_score:
#                     min_score = score
#                     new_q2 = q2_modified
        
#         if min_score < model(q1_tokenized, q2_tokenized, clf):
#             q2_tokenized = new_q2
#             replaced_words += 1
        
#         print()
#         print("q1: '{}'".format(tp.detokenize(q1_tokenized)))
#         print("q2: '{}'".format(tp.detokenize(new_q2)))
#         print("Replacing {} words we have a similarity of {}".format(replaced_words, min_score))
        
#         if min_score < 0.4:
#             successful = True

In [99]:
q1 = 'Who is the richest gambler of all time and how can I reach his level?'
q2 = 'Who is the richest gambler of all time and how can I reach his level as a gambler?'

adversarial_white_box_change(q1, q2, model, tp, w2v)

initial q1: Who is the richest gambler of all time and how can I reach his level?
initial q2: Who is the richest gambler of all time and how can I reach his level as a gambler?
Initial similarity is [0.94920687]

q1: 'Who is the richest gambler of all time and how can I reach his level?'
q2: 'Who is the wealthiest gambler of all time and how can I reach his level as a gambler?'
Replacing 1 words we have a similarity of [0.70803868]

q1: 'Who is the richest gambler of all time and how can I reach his level?'
q2: 'Who is the wealthiest gambler of all day and how can I reach his level as a gambler?'
Replacing 2 words we have a similarity of [0.52716254]

q1: 'Who is the richest gambler of all time and how can I reach his level?'
q2: 'Who is the wealthiest gambler of all day and how can I reaching his level as a gambler?'
Replacing 3 words we have a similarity of [0.38648109]


In [82]:
model(tp.tokenize('Who is the  of n I reach his level?'), tp.tokenize('Who is the richest gambler of all time and how can I reach his level as a gambler?'))

array([0.46687049])

In [91]:
for i in range(100):
    q1 = X_test[i, 0]
    q2 = X_test[i, 1]
    q1_tokenized = tp.tokenize(q1)
    q2_tokenized = tp.tokenize(q2)
    if model(q1_tokenized, q2_tokenized) > 0.6:
        adversarial_white_box_change(q1, q2, model, tp, w2v)
        print()
#         print(i)
#         print(q1)
#         print(q2)
#         print


initial q1: Can an introvert become an extrovert?
initial q2: Can an introvert and an extrovert be together?
Initial similarity is [0.6115714]

q1: 'Can an introvert become an extrovert?'
q2: 'Can an introverted and an extrovert be together?'
Replacing 1 words we have a similarity of [0.38648109]

initial q1: Is "Out" based on real events?
initial q2: Is "G" based on real events?
Initial similarity is [0.82862278]

q1: 'Is``Out"based on real events?'
q2: 'Is``G"based on genuine events?'
Replacing 1 words we have a similarity of [0.63267362]

q1: 'Is``Out"based on real events?'
q2: 'Is``G"based on genuine event?'
Replacing 2 words we have a similarity of [0.48026872]

q1: 'Is``Out"based on real events?'
q2: 'Is``##m_Tc_EC"based on genuine event?'
Replacing 3 words we have a similarity of [0.44275367]

q1: 'Is``Out"based on real events?'
q2: 'Is``##m_Tc_EC"based onthe genuine event?'
Replacing 4 words we have a similarity of [0.41205954]
UNSUCCESSFULL

initial q1: How hair grow after hai