### Document embeddings
* Computing the position vectors using word2vec
* Computing the document embeddings using doc2Vec
* Computing the document embeddings using a subset of the methods proposed in the review

In [57]:
!pip install --upgrade gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

skipTraining = False

Requirement already up-to-date: gensim in /opt/conda/lib/python3.7/site-packages (4.0.1)


In [22]:
if not skipTraining:
    from preprocessing import *
    from cosine_sim import *
    from synonym_enrich import *

In [23]:
if not skipTraining:
    docs = parseDocs("cran/cran.all.1400")
    docs = tokenize_and_clean(docs)
    docs = lemmatize(docs)

## Position vector

In [29]:
vec_size = 50
if not skipTraining:

    # sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.
    # min_count: The minimum count of words to consider when training the model;
    w2v_model = Word2Vec(docs, min_count=2, vector_size= vec_size, sg = 1)
    w2v_model.save("w2v.model")

In [26]:
if skipTraining:
    w2v_model = Word2Vec.load("w2v.model")

In [31]:
vector = w2v_model.wv['computer']  # get numpy vector of a word
vector

array([ 0.16547997, -0.23902543,  0.34613046, -0.15685175, -0.33932897,
       -0.14475358,  0.35155395,  0.5407517 , -0.12688686, -0.30697745,
        0.12583013, -0.28550646,  0.14946432, -0.07647571, -0.21525793,
        0.01549983,  0.016679  , -0.12470072, -0.2797611 ,  0.0069176 ,
        0.15511991, -0.3263179 ,  0.6232955 , -0.064994  , -0.01753527,
        0.05150538, -0.35302025, -0.2049817 , -0.63992643,  0.21758004,
       -0.33853486,  0.0651901 , -0.04681535, -0.05625008,  0.00447547,
       -0.3965827 ,  0.02142886,  0.03432292, -0.4676598 , -0.13731252,
        0.6782075 , -0.04655042,  0.03315324, -0.18289414,  0.72673374,
       -0.12769197,  0.27449325, -0.35474014, -0.2160631 ,  0.23292093],
      dtype=float32)

In [33]:
w2v_model.wv.most_similar('computer')

[('digital', 0.979901909828186),
 ('automatic', 0.9666916131973267),
 ('evaluation', 0.9542814493179321),
 ('computing', 0.9386852383613586),
 ('complex', 0.9368526339530945),
 ('practical', 0.9331501722335815),
 ('table', 0.9330777525901794),
 ('applying', 0.9317920207977295),
 ('formulation', 0.931194007396698),
 ('aid', 0.9302501082420349)]

In [50]:
raise NotImplementedError #set weights
w = 1 #needed for weighted averaging

pos_vecs = []
for doc in docs:
    n_words = len(doc)
    weighted_sum = np.zeros((vec_size,))
    for word in doc:
        if word in w2v_model.wv:
            weighted_sum += w * w2v_model.wv[word]
        
    avg = weighted_sum/n_words
    pos_vecs.append(avg)

NotImplementedError: 

## Doc2Vec

In [59]:
vec_size = 50
if not skipTraining:
    tagged_docs = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(docs)]
    d2v_model = Doc2Vec(tagged_docs, min_count=2, vector_size= vec_size)
    d2v_model.save("d2v.model")

In [60]:
if skipTraining:
    d2v_model = Word2Vec.load("d2v.model")

In [63]:
v1 = d2v_model.infer_vector(docs[0])
v1

array([ 0.19874685, -0.12971978,  0.147887  ,  0.26760656,  0.09977112,
       -0.18255714,  0.14067312,  0.1964975 , -0.23258264, -0.07927153,
        0.35591024,  0.03969877,  0.05411862, -0.02184379, -0.32557228,
        0.02802402,  0.214599  , -0.66473687, -0.21687463, -0.13439621,
        0.10326765,  0.02690773,  0.03022747, -0.0941508 ,  0.15157737,
       -0.1088073 ,  0.02512744, -0.36371958, -0.2775667 , -0.12016089,
       -0.14123534,  0.11724566,  0.03826113,  0.13443409, -0.16424286,
        0.425728  ,  0.26057693, -0.16866845,  0.04461066, -0.02106278,
        0.31882682,  0.25649196, -0.0412026 ,  0.09873915, -0.04683453,
        0.06451657, -0.21350868, -0.1803241 ,  0.00790678,  0.31618693],
      dtype=float32)