In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.optimize import minimize
stops = set(stopwords.words("english"))
import xgboost as xgb
from sklearn.cross_validation import train_test_split
import multiprocessing
import difflib



In [1]:
import logging

import gensim

# Configure root logging at INFO level with a timestamped format, so the
# progress messages of the load below are shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Read the binary word2vec-format vector file into a KeyedVectors object.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

2017-04-12 01:56:23,571 : INFO : loading projection weights from ../data/GoogleNews-vectors-negative300.bin.gz
2017-04-12 01:58:57,250 : INFO : loaded (3000000, 300) matrix from ../data/GoogleNews-vectors-negative300.bin.gz


In [2]:
# Spot-check the loaded vectors: ask the model for the similarity score of one word pair.
model.similarity('pee', 'urine')

0.43420972721801765

In [2]:
# Feature names to load for every data split; each one names a pickle file
# that gets merged onto the raw CSV.
# Other variants, currently disabled:
#   clean_q1_tokenized / clean_q2_tokenized
#   clean_q1_stem / clean_q2_stem
#   clean_q1_pos_tagged / clean_q2_pos_tagged
#   clean_q1_lemmatized / clean_q2_lemmatized
features = ['clean_q1_lemmatized_stem', 'clean_q2_lemmatized_stem']

In [3]:
def read_data(t, features):
    """Load one data split and merge the requested feature pickles onto it.

    Parameters
    ----------
    t : str
        Split name; selects ``../input/<t>.csv`` and the
        ``../feature/<t>_<feature>.pkl`` files.
    features : iterable of str
        Feature names; one pickle is read and merged per name, in order.

    Returns
    -------
    pandas.DataFrame
        The merged frame with every missing value replaced by ``0.0``.

    Notes
    -----
    ``pd.merge`` is called without ``on``/``how``, so pandas' defaults apply
    (inner join on the columns the two frames have in common).
    Pickle files can execute code when loaded; only read feature files that
    this project produced itself.
    """
    data = pd.read_csv('../input/{}.csv'.format(t))
    for feature in features:
        feature_frame = pd.read_pickle('../feature/{}_{}.pkl'.format(t, feature))
        data = pd.merge(data, feature_frame)
    # fillna returns a new frame rather than modifying `data`; the result must
    # be returned, otherwise the NaNs silently survive.
    return data.fillna(0.0)

In [4]:
# Load both splits with the selected features merged in and keep only the
# first 1000 rows of each.
# The explicit .copy() turns each subset into an independent frame: later
# cells add columns to `train` and `test`, and assigning into a bare row
# slice is what triggers pandas' SettingWithCopyWarning.
train = read_data('train', features)[:1000].copy()
test = read_data('test', features)[:1000].copy()

In [5]:
# Collect the stem-column entries of both questions, train first and then
# test (order: train q1, train q2, test q1, test q2).
stems = [
    tokens
    for frame in (train, test)
    for column in ('clean_q1_lemmatized_stem', 'clean_q2_lemmatized_stem')
    for tokens in frame[column].tolist()
]

# Space-join every entry and de-duplicate the resulting strings.
# Assumes each entry is a list of token strings -- TODO confirm.
uniq_stems = {' '.join(tokens) for tokens in stems}
len(uniq_stems)

3896

In [6]:
# Give both frames a space-joined string version of each stem column,
# stored under the original column name plus a '_str' suffix.
for _frame in (train, test):
    for _column in ('clean_q1_lemmatized_stem', 'clean_q2_lemmatized_stem'):
        _frame[_column + '_str'] = _frame[_column].map(' '.join)

In [7]:
# Unigram TF-IDF vectorizer using the 'english' stop-word setting.
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
#cvect = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Corpus: the joined-stem strings of both questions from train and then test
# (order: train q1, train q2, test q1, test q2), coerced to str.
tfidf_txt = pd.Series([
    text
    for frame in (train, test)
    for column in ('clean_q1_lemmatized_stem_str', 'clean_q2_lemmatized_stem_str')
    for text in frame[column].tolist()
]).astype(str)
tfidf.fit_transform(tfidf_txt)

<4000x4600 sparse matrix of type '<class 'numpy.float64'>'
	with 19769 stored elements in Compressed Sparse Row format>

In [6]:
from nltk import FreqDist

# Every unique string contributes each of its lower-cased words at most once
# (via set), so the resulting counts are "number of unique strings containing
# the word" rather than raw occurrence counts.
words = [
    token
    for text in uniq_stems
    for token in set(text.lower().split())
]
freq = FreqDist(words)

In [34]:
# Number the (word, count) pairs whose count exceeds 1000, most common first,
# then display that numbered list back to front.
list(enumerate(pair for pair in freq.most_common() if pair[1] > 1000))[::-1]

[(2902, ('healthcar', 1001)),
 (2901, ('lee', 1001)),
 (2900, ('micro', 1001)),
 (2899, ('aadhaar', 1002)),
 (2898, ('login', 1003)),
 (2897, ('environment', 1003)),
 (2896, ('pink', 1003)),
 (2895, ('rural', 1004)),
 (2894, ('banana', 1004)),
 (2893, ('hydroxid', 1004)),
 (2892, ('possess', 1004)),
 (2891, ('km', 1004)),
 (2890, ('plug', 1005)),
 (2889, ('steam', 1005)),
 (2888, ('sync', 1007)),
 (2887, ('matur', 1007)),
 (2886, ('syndrom', 1007)),
 (2885, ('amiti', 1008)),
 (2884, ('draft', 1009)),
 (2883, ('induct', 1009)),
 (2882, ('will', 1011)),
 (2881, ('bean', 1012)),
 (2880, ('sensit', 1013)),
 (2879, ('anywher', 1013)),
 (2878, ('compress', 1013)),
 (2877, ('nexus', 1015)),
 (2876, ('cinema', 1015)),
 (2875, ('guest', 1015)),
 (2874, ('oneplus', 1015)),
 (2873, ('hdfc', 1016)),
 (2872, ('bca', 1016)),
 (2871, ('bull', 1016)),
 (2870, ('hobbi', 1017)),
 (2869, ('cant', 1017)),
 (2868, ('youth', 1018)),
 (2867, ('penni', 1018)),
 (2866, ('sibl', 1019)),
 (2865, ('stabl', 1019))

In [7]:
def get_weight(word):
    """Return a weight for `word` that decreases as the word gets more common.

    The weight is one minus the ratio between the word's count in the
    notebook-level `freq` and the number of entries in the notebook-level
    `uniq_stems`.
    """
    share = freq[word] / len(uniq_stems)
    return 1 - share

In [None]:
def sum_weight_common_words(row):
    """Sum the weights of the stems that both questions of `row` share.

    `row` must expose `clean_q1_lemmatized_stem` and
    `clean_q2_lemmatized_stem` as iterables of hashable stems. Duplicates
    within a question are ignored. Returns 0 when the questions have no stem
    in common.
    """
    first = set(row.clean_q1_lemmatized_stem)
    second = set(row.clean_q2_lemmatized_stem)
    # Summing over an empty intersection already yields 0, so no special case
    # is needed for "nothing in common".
    return sum(get_weight(stem) for stem in first & second)

In [None]:
# Score every row of both frames with the summed weight of the stems its two
# questions share, and store the result as a new column.
for _frame in (train, test):
    _frame['sum_prob_weight_common_words'] = _frame.apply(sum_weight_common_words, axis=1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.rcParams['figure.figsize'] = (15.0, 15.0)

In [None]:
# plt.hist(train.sum_prob_weight_common_words[train['is_duplicate'] == 0], bins=20, normed=True, label='0')
# plt.hist(train.sum_prob_weight_common_words[train['is_duplicate'] == 1], bins=20, normed=True, label='1', alpha=0.7)
# None

In [None]:
# features_to_save = [
#     'sum_prob_weight_common_words'
# ]

In [None]:
# import util
# util.save_feature(train, 'train', features_to_save, 'id')
# util.save_feature(test, 'test', features_to_save, 'test_id')