# word2vec: embedding quora questions in GoogleNews model

Goals:

- load a word2vec model
- get vector representations of quora words and questions.
- calculate similarity between pairs of vectors (words, questions)

In [1]:
import gensim
import nltk
import numpy as np
import pandas as pd
import itertools as it

In [2]:
csv = pd.read_csv("Processed_validation.csv", names = ["q1id", "q2id", "pid", "q1", "q2", "label"])

In [3]:
sentences = csv.loc[:,"q1"].append(csv.loc[:,"q2"], ignore_index = True)

In [4]:
#google_news_model = gensim.models.KeyedVectors.load("GoogleNews-vectors-negative300.bin.gz")  # too big for RAM (needs +8GB)
#qmodel = gensim.models.Word2Vec(sentences, iter=3, size= 20)
wgmodel = gensim.models.KeyedVectors.load_word2vec_format("6B.50d.w2v.txt")

In [5]:
qmodel
wgmodel

NameError: name 'qmodel' is not defined

In [None]:
qmodel.save("quora_model.w2v")

In [None]:
sentences[:5]

In [None]:
csv.loc[0, "q1"]

In [12]:
[wgmodel[w] for w in sentences[0].split() if w in wgmodel]

[array([  6.89379990e-01,  -1.06440000e-01,   1.70829996e-01,
         -3.75829995e-01,   7.51699984e-01,   7.81490002e-04,
         -5.31019986e-01,  -1.99029997e-01,  -1.44189999e-01,
          1.27480000e-01,  -2.80380011e-01,   7.07229972e-01,
         -5.41000009e-01,   1.96250007e-01,   9.66350019e-01,
          6.05189979e-01,   4.09179986e-01,  -3.16120014e-02,
          5.38999975e-01,  -8.70859981e-01,  -2.09120005e-01,
          5.68530023e-01,   6.59829974e-01,   1.45830005e-01,
          1.01119995e+00,  -2.07360005e+00,  -1.12419999e+00,
          5.96620026e-04,   7.03320026e-01,  -8.26080024e-01,
          3.44449997e+00,   3.29840004e-01,  -3.53240013e-01,
         -1.03349996e+00,  -1.47530004e-01,  -1.48739994e-01,
         -4.12459999e-01,   3.34890008e-01,   1.98410004e-01,
         -2.54779994e-01,  -4.71929997e-01,   6.67010024e-02,
          3.27769995e-01,   6.87810004e-01,   3.64279985e-01,
          2.15220004e-01,   1.64940000e-01,   4.17609990e-01,
        

In [13]:
def trim_vocab(model, q):
    return (w for w in q.split() if w in model)
def words_pairwise_similarity(model, wtuple):
    return model.similarity(wtuple[0], wtuple[1])
def questions_pairwise_similarity(model, q1, q2):
    ws1 = trim_vocab(model, q1)
    ws2 = trim_vocab(model, q2)
    return (words_pairwise_similarity(model, wtuple) for wtuple in it.product(ws1, ws2))
    


In [14]:
list(questions_pairwise_similarity(wgmodel, sentences[0], sentences[2]))

[0.96343386002995013,
 0.79678858739099934,
 0.67698766058109849,
 0.51018870242585823,
 0.79243584222637364,
 0.83026613775409475,
 0.77743724910541312,
 0.49998871331055705,
 0.620057548960111,
 0.64737116974393072,
 0.48134601828705398,
 0.26588552540069837,
 0.79243584222637364,
 0.64737116974393072,
 0.4620663330205067,
 0.45008945766561487,
 0.60215991740486297,
 0.67698766058109849,
 0.66494753609247792,
 1.0000000000000002,
 0.90919292878570435,
 0.88291424002502272,
 0.7512901278552806,
 0.79243584222637364,
 0.67698766058109849,
 0.45285267310777988,
 0.51018870242585823,
 0.60215991740486297,
 0.35645779564134267,
 0.80200359990077608,
 0.84066853237812189,
 0.67386800904998367,
 0.47727271193076776,
 0.81005056397484565,
 0.80614209758321598,
 0.78535160397338422,
 0.52460149489808794,
 0.62704629260623046,
 0.6248660820317834,
 0.40576099972687918,
 0.33414444804986221,
 0.81005056397484565,
 0.6248660820317834,
 0.52761178953620769,
 0.5639722548094237,
 0.576267965688914

In [15]:
true_sims = it.chain(*csv[csv["label"] == 1].sample(1000).apply(lambda row: questions_pairwise_similarity(wgmodel, row['q1'], row['q2']), axis = 1))
false_sims = it.chain(*csv[csv["label"] == 0].sample(1000).apply(lambda row: questions_pairwise_similarity(wgmodel, row['q1'], row['q2']), axis = 1))

distributions = pd.DataFrame.from_records(it.chain(((1, s) for s in true_sims), ((0, s) for s in false_sims)), columns=["label", "similarity"])



In [16]:
distributions

Unnamed: 0,label,similarity
0,1,0.698936
1,1,0.683368
2,1,0.680569
3,1,0.722995
4,1,0.439510
5,1,0.512781
6,1,0.517065
7,1,0.909193
8,1,1.000000
9,1,0.881150


In [17]:
distributions.to_csv("similarity_distributions.tsv", sep = "\t", index = False)