# word2vec: embedding Quora questions with pretrained word vectors (GoogleNews model planned; a smaller model is loaded instead)

Goals:

- Load a pretrained word-vector model in word2vec format.
- Get vector representations of Quora words and questions.
- Calculate the similarity between pairs of vectors (words, questions).

In [2]:
import gensim
import nltk
import numpy as np
import pandas as pd
import itertools as it
import scipy as sc

In [3]:
# Load the validation question pairs. Column names are supplied explicitly,
# so pandas does not take them from the first line of the file.
csv = pd.read_csv(
    "Processed_validation.csv",
    names=["q1id", "q2id", "pid", "q1", "q2", "label"],
)

In [4]:
# Preview only the first rows instead of dumping the whole frame into the output.
csv.head(10)

Unnamed: 0,q1id,q2id,pid,q1,q2,label
0,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
1,6,13,14,should i buy tiago,what keeps childern active and far from phone ...,0
2,14,29,30,what are the laws to change your status from a...,what are the laws to change your status from a...,0
3,15,31,32,what would a trump presidency mean for current...,how will a trump presidency affect the student...,1
4,31,63,64,what are some special cares for someone with a...,how can i keep my nose from getting stuffy at ...,1
5,37,75,76,when a girlfriend asks her boyfriend why did ...,my girlfriend said that we should end this bec...,0
6,41,83,84,when can i expect my cognizant confirmation mail,when can i expect cognizant confirmation mail,0
7,47,95,96,what are the stages of breaking up between cou...,who is affected more by a breakup the boy or ...,0
8,49,99,100,how do i make friends,how to make friends,1
9,57,115,116,what are some good rap songs to dance to,what are some of the best rap songs,0


In [20]:
# Stack every q1 followed by every q2 into a single Series with a fresh
# 0..2N-1 index. pd.concat replaces Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
sentences = pd.concat([csv.loc[:, "q1"], csv.loc[:, "q2"]], ignore_index=True)

In [21]:
# Disabled alternative 1: the GoogleNews 300-d vectors, too big for RAM (needs 8GB+).
# NOTE(review): a .bin.gz word2vec file is normally read with
# KeyedVectors.load_word2vec_format(..., binary=True) rather than KeyedVectors.load — confirm before re-enabling.
#google_news_model = gensim.models.KeyedVectors.load("GoogleNews-vectors-negative300.bin.gz")  # too big for RAM (needs +8GB)
# Disabled alternative 2: train a small word2vec model on the question text itself.
# NOTE(review): Word2Vec expects an iterable of token lists, while `sentences` holds raw strings — confirm before re-enabling.
#qmodel = gensim.models.Word2Vec(sentences, iter=3, size= 20)
# Active model: pretrained vectors read from a word2vec-format text file.
# NOTE(review): the file name suggests 50-dimensional vectors (presumably GloVe 6B converted to word2vec format) — confirm.
wgmodel = gensim.models.KeyedVectors.load_word2vec_format("6B.50d.w2v.txt")

In [28]:
#qmodel
# Echo the loaded KeyedVectors object to confirm the load succeeded.
wgmodel

<gensim.models.keyedvectors.KeyedVectors at 0x7f4e37522a50>

In [29]:
#qmodel.save("quora_model.w2v")

In [30]:
# Peek at the first five stacked questions.
sentences.head(5)

0    how can i increase the speed of my internet co...
1                                  should i buy tiago 
2    what are the laws to change your status from a...
3    what would a trump presidency mean for current...
4    what are some special cares for someone with a...
dtype: object

In [31]:
# Show the complete q1 text of the row labelled 0 (the table preview truncates long strings).
csv.loc[0, "q1"]

'how can i increase the speed of my internet connection while using a vpn '

In [32]:
# Embed the first question word by word: split on whitespace, keep only the
# tokens the model knows, and look up each token's vector.
[
    wgmodel[token]
    for token in sentences[0].split()
    if token in wgmodel
]

[array([  6.89379990e-01,  -1.06440000e-01,   1.70829996e-01,
         -3.75829995e-01,   7.51699984e-01,   7.81490002e-04,
         -5.31019986e-01,  -1.99029997e-01,  -1.44189999e-01,
          1.27480000e-01,  -2.80380011e-01,   7.07229972e-01,
         -5.41000009e-01,   1.96250007e-01,   9.66350019e-01,
          6.05189979e-01,   4.09179986e-01,  -3.16120014e-02,
          5.38999975e-01,  -8.70859981e-01,  -2.09120005e-01,
          5.68530023e-01,   6.59829974e-01,   1.45830005e-01,
          1.01119995e+00,  -2.07360005e+00,  -1.12419999e+00,
          5.96620026e-04,   7.03320026e-01,  -8.26080024e-01,
          3.44449997e+00,   3.29840004e-01,  -3.53240013e-01,
         -1.03349996e+00,  -1.47530004e-01,  -1.48739994e-01,
         -4.12459999e-01,   3.34890008e-01,   1.98410004e-01,
         -2.54779994e-01,  -4.71929997e-01,   6.67010024e-02,
          3.27769995e-01,   6.87810004e-01,   3.64279985e-01,
          2.15220004e-01,   1.64940000e-01,   4.17609990e-01,
        

In [33]:
def trim_vocab(model, q):
    """Lazily yield the whitespace-separated tokens of `q` that are in `model`.

    Parameters
    ----------
    model : any object supporting ``token in model`` (e.g. a KeyedVectors instance)
    q : str
        Question text. A non-string value (``None``, or the float NaN pandas
        produces for an empty CSV field) is treated as an empty question
        instead of raising ``AttributeError`` on ``.split()``.

    Returns
    -------
    generator of str
        The in-vocabulary tokens in their original order, duplicates kept.
    """
    # Splitting happens eagerly (at call time); only the vocabulary filter is lazy.
    tokens = q.split() if isinstance(q, str) else ()
    return (w for w in tokens if w in model)
def words_pairwise_similarity(model, wtuple):
    """Return ``model.similarity`` of the first two entries of `wtuple`.

    `wtuple` is an indexable pair ``(word_a, word_b)``; the result is whatever
    ``model.similarity(word_a, word_b)`` returns.
    """
    left_word = wtuple[0]
    right_word = wtuple[1]
    return model.similarity(left_word, right_word)
def questions_pairwise_similarity(model, q1, q2):
    """Lazily yield the similarity of every (q1 word, q2 word) combination.

    Both questions are first reduced to their in-vocabulary tokens with
    ``trim_vocab``. The result is a generator producing one
    ``words_pairwise_similarity`` value per element of the Cartesian product,
    iterating over q1's words in the outer position and q2's words in the
    inner position.
    """
    known_q1 = trim_vocab(model, q1)
    known_q2 = trim_vocab(model, q2)
    # it.product consumes both token streams here, at call time; only the
    # similarity look-ups are deferred until the generator is iterated.
    word_pairs = it.product(known_q1, known_q2)
    return (words_pairwise_similarity(model, pair) for pair in word_pairs)
    


In [34]:
# Smoke-test the word-level similarities on two questions.
# NOTE(review): `sentences` holds all q1 texts followed by all q2 texts, so
# indices 0 and 2 are the q1 texts of rows 0 and 2, not the (q1, q2) pair of a
# single row — confirm this comparison is the intended one.
list(questions_pairwise_similarity(wgmodel, sentences[0], sentences[2]))

[0.96343386002995013,
 0.79678858739099934,
 0.67698766058109849,
 0.51018870242585823,
 0.79243584222637364,
 0.83026613775409475,
 0.77743724910541312,
 0.49998871331055705,
 0.620057548960111,
 0.64737116974393072,
 0.48134601828705398,
 0.26588552540069837,
 0.79243584222637364,
 0.64737116974393072,
 0.4620663330205067,
 0.45008945766561487,
 0.60215991740486297,
 0.67698766058109849,
 0.66494753609247792,
 1.0000000000000002,
 0.90919292878570435,
 0.88291424002502272,
 0.7512901278552806,
 0.79243584222637364,
 0.67698766058109849,
 0.45285267310777988,
 0.51018870242585823,
 0.60215991740486297,
 0.35645779564134267,
 0.80200359990077608,
 0.84066853237812189,
 0.67386800904998367,
 0.47727271193076776,
 0.81005056397484565,
 0.80614209758321598,
 0.78535160397338422,
 0.52460149489808794,
 0.62704629260623046,
 0.6248660820317834,
 0.40576099972687918,
 0.33414444804986221,
 0.81005056397484565,
 0.6248660820317834,
 0.52761178953620769,
 0.5639722548094237,
 0.576267965688914

In [35]:
# Word-level similarities for up to 1000 duplicate (label == 1) and up to 1000
# non-duplicate (label == 0) question pairs.
# - random_state pins the sample, so the table and the file saved from it are
#   reproducible after a kernel restart.
# - min(...) keeps sample() from raising when a class has fewer than 1000 rows.
# - Rows are iterated explicitly rather than asking DataFrame.apply to box
#   generator objects into a Series.
duplicate_pairs = csv[csv["label"] == 1]
distinct_pairs = csv[csv["label"] == 0]

true_sims = it.chain.from_iterable(
    questions_pairwise_similarity(wgmodel, row["q1"], row["q2"])
    for _, row in duplicate_pairs.sample(min(1000, len(duplicate_pairs)), random_state=42).iterrows()
)
false_sims = it.chain.from_iterable(
    questions_pairwise_similarity(wgmodel, row["q1"], row["q2"])
    for _, row in distinct_pairs.sample(min(1000, len(distinct_pairs)), random_state=42).iterrows()
)

# One record per word pair: (pair label, similarity). Building the frame
# consumes both one-shot iterators above.
distributions = pd.DataFrame.from_records(
    it.chain(((1, s) for s in true_sims), ((0, s) for s in false_sims)),
    columns=["label", "similarity"],
)



In [36]:
# Preview only the first rows; the full table has one row per word pair.
distributions.head(10)

Unnamed: 0,label,similarity
0,1,0.963434
1,1,0.796789
2,1,0.817944
3,1,0.545038
4,1,0.760687
5,1,0.675733
6,1,0.647371
7,1,0.299587
8,1,0.221480
9,1,0.890818


In [37]:
# Persist the labelled similarities as a tab-separated file, without the row index.
distributions.to_csv(
    "similarity_distributions.tsv",
    sep="\t",
    index=False,
)

In [46]:
# modified from: https://docs.python.org/release/2.3.5/lib/itertools-example.html
def window(seq, n=2):
    """Yield consecutive overlapping n-tuples taken from the iterable `seq`.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Nothing is yielded when `seq` has fewer than `n` items.
    """
    source = iter(seq)
    # Prime the first frame with up to n items.
    frame = tuple(it.islice(source, n))
    if len(frame) == n:
        yield frame
    # Slide by one: drop the oldest item, append the next one.
    for item in source:
        frame = (*frame[1:], item)
        yield frame

def words_pairwise_distance(model, wtuple):
    """Chebyshev distance between the summed word vectors of two n-grams.

    Parameters
    ----------
    model : mapping-like
        Supports ``model[word]`` and returns a numeric vector for each word.
    wtuple : sequence
        ``(ngram1, ngram2)``, where each n-gram is an iterable of words that
        are all present in `model`.

    Returns
    -------
    float
        The largest absolute coordinate difference between the element-wise
        sum of ngram1's vectors and the element-wise sum of ngram2's vectors.
    """
    # Import the submodule explicitly: `import scipy as sc` alone does not
    # guarantee that `sc.spatial` has been loaded, so the attribute access
    # could fail unless another library happened to import it first.
    from scipy.spatial import distance

    ngram1, ngram2 = wtuple[0], wtuple[1]
    # Represent each n-gram by the sum of its word vectors.
    summed1 = np.sum([model[w] for w in ngram1], axis=0)
    summed2 = np.sum([model[w] for w in ngram2], axis=0)
    return distance.chebyshev(summed1, summed2)
    
def questions_windowed_distance(model, q1, q2, n = 3):
    """Lazily yield the distance of every (q1 n-gram, q2 n-gram) combination.

    Each question is reduced to its in-vocabulary tokens with ``trim_vocab``
    and cut into sliding windows of `n` tokens with ``window``. The result is
    a generator producing one ``words_pairwise_distance`` value per element of
    the Cartesian product of the two window sequences. A question with fewer
    than `n` in-vocabulary tokens has no windows, so the result is then empty.
    """
    grams_q1 = window(trim_vocab(model, q1), n = n)
    grams_q2 = window(trim_vocab(model, q2), n = n)
    # it.product consumes both window streams here, at call time; only the
    # distance computations are deferred until the generator is iterated.
    gram_pairs = it.product(grams_q1, grams_q2)
    return (words_pairwise_distance(model, pair) for pair in gram_pairs)

In [47]:
# 9-gram windowed distances for up to 1000 duplicate (label == 1) and up to
# 1000 non-duplicate (label == 0) question pairs.
# - random_state pins the sample, so the table and the file saved from it are
#   reproducible after a kernel restart.
# - min(...) keeps sample() from raising when a class has fewer than 1000 rows.
# - Rows are iterated explicitly rather than asking DataFrame.apply to box
#   generator objects into a Series.
duplicate_pairs = csv[csv["label"] == 1]
distinct_pairs = csv[csv["label"] == 0]

true_sims = it.chain.from_iterable(
    questions_windowed_distance(wgmodel, row["q1"], row["q2"], n=9)
    for _, row in duplicate_pairs.sample(min(1000, len(duplicate_pairs)), random_state=42).iterrows()
)
false_sims = it.chain.from_iterable(
    questions_windowed_distance(wgmodel, row["q1"], row["q2"], n=9)
    for _, row in distinct_pairs.sample(min(1000, len(distinct_pairs)), random_state=42).iterrows()
)

# One record per window pair: (pair label, distance). Building the frame
# consumes both one-shot iterators above.
window_dists = pd.DataFrame.from_records(
    it.chain(((1, s) for s in true_sims), ((0, s) for s in false_sims)),
    columns=["label", "chebychev"],
)


In [43]:
# Preview only the first rows; the full table has one row per window pair.
window_dists.head(10)

Unnamed: 0,label,chebychev
0,1,1.014953
1,1,1.043160
2,1,1.471683
3,1,2.939780
4,1,0.754749
5,1,1.309930
6,1,1.170558
7,1,3.154870
8,1,1.580890
9,1,1.926900


In [48]:
# Persist the labelled 9-gram distances as a tab-separated file, without the row index.
window_dists.to_csv(
    "chebychev_9gram_distributions.tsv",
    sep="\t",
    index=False,
)