### Document embeddings
* Computing the position vectors using Word2Vec
* Computing the document embeddings using Doc2Vec
* Computing the document embeddings using a subset of the methods proposed in the review

In [1]:
!pip install --upgrade gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

skipTraining = False

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 7.8 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-5.0.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 2.3 MB/s  eta 0:00:01
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.0.1 smart-open-5.0.0




In [9]:
# Project helpers are imported unconditionally: later cells call
# create_term_doc_matrix and use `np` whether or not training is skipped, so
# gating these imports on skipTraining left those names undefined when
# skipTraining is True.
# NOTE(review): the star imports hide which module provides which name
# (presumably including `np`) — replace them with explicit imports once the
# providing modules are confirmed.
from preprocessing import *
from cosine_sim import *
from synonym_enrich import *
from baseline import *

In [3]:
# The preprocessed corpus is needed even when training is skipped: the
# term-document matrix, the position vectors and the Doc2Vec inference further
# down all read `docs`. It is therefore built unconditionally.
docs = parseDocs("cran/cran.all.1400")
docs = tokenize_and_clean(docs)
docs = lemmatize(docs)

## Position vector

In [4]:
vec_size = 50  # dimensionality of the word embeddings

if not skipTraining:
    # Train on the preprocessed corpus, then persist the model to disk.
    #   sg=1        -> skip-gram training (sg=0, the default, is CBOW)
    #   min_count=2 -> minimum count a word needs to be considered in training
    w2v_model = Word2Vec(
        docs,
        vector_size=vec_size,
        min_count=2,
        sg=1,
    )
    w2v_model.save("w2v.model")

In [5]:
# When training is skipped, restore the Word2Vec model from the "w2v.model" file.
if skipTraining:
    w2v_model = Word2Vec.load("w2v.model")

In [6]:
# Sanity check: look up the learned embedding of one vocabulary word and display it.
vector = w2v_model.wv['computer']  # numpy vector for the given word
vector

array([ 0.01503546, -0.19407776,  0.28160238, -0.17297941, -0.4450978 ,
       -0.10153174,  0.49798927,  0.28953522, -0.26928496, -0.3553878 ,
        0.15962848, -0.11752975,  0.01413823, -0.05062268, -0.11307315,
        0.01968868,  0.10698026,  0.06178449, -0.2272061 ,  0.11730936,
        0.0034541 , -0.24787344,  0.6591028 ,  0.07714592,  0.20062515,
        0.17791298, -0.26672757, -0.12411867, -0.7317228 , -0.0645718 ,
       -0.32005432,  0.15800847,  0.03260247,  0.14398654,  0.08191456,
       -0.42531094,  0.03000608,  0.07745912, -0.44342226, -0.16953444,
        0.3397785 , -0.03678326,  0.26648775, -0.21699211,  0.71049553,
       -0.11359502,  0.39135873, -0.24128321, -0.2730923 ,  0.48383117],
      dtype=float32)

In [7]:
# Sanity check: words most similar to 'computer', each paired with its similarity score.
w2v_model.wv.most_similar('computer')

[('digital', 0.9919171929359436),
 ('automatic', 0.9600348472595215),
 ('evaluation', 0.953161358833313),
 ('computing', 0.9528289437294006),
 ('complex', 0.9485465884208679),
 ('aid', 0.9430157542228699),
 ('formulation', 0.9394195675849915),
 ('suitable', 0.9387032389640808),
 ('practical', 0.9384737014770508),
 ('applying', 0.9383900761604309)]

In [27]:
# Build the per-document term weights and the vocabulary list used to index them.
# NOTE(review): create_term_doc_matrix is defined outside this notebook; judging
# by how vanilla_pos_vec reads it, tf_docs is indexed [term_index][doc_index] and
# a term's position in sorted_vocab is its row — confirm against the helper.
tf_docs, sorted_vocab = create_term_doc_matrix(docs)

In [40]:
def vanilla_pos_vec(w2v_model, tf_matrix, sorted_vocab, docs):
    """Compute one weighted-average word-embedding vector per document.

    For every token occurrence that the Word2Vec model knows, the token's
    embedding is scaled by that token's weight in the document (read from
    ``tf_matrix``) and accumulated. The sum is divided by the total number of
    tokens in the document, including tokens the model does not know.

    Parameters
    ----------
    w2v_model
        Trained gensim Word2Vec model; only ``w2v_model.wv`` is used.
    tf_matrix
        Term weights, indexed as ``tf_matrix[term_index][doc_index]``.
    sorted_vocab
        Sequence of terms; a term's position is its row in ``tf_matrix``.
    docs
        Sequence of tokenised documents (each a sequence of tokens).

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``w2v_model.wv.vector_size`` per document, in
        input order. A document with no tokens yields the zero vector.

    Raises
    ------
    ValueError
        If a token known to the model is missing from ``sorted_vocab``.
    """
    # Take the dimensionality from the model itself rather than from a
    # notebook-level global that other cells may reassign.
    dim = w2v_model.wv.vector_size

    # term -> row index, built once instead of a linear list.index() scan per
    # token. setdefault keeps the first occurrence, matching list.index().
    row_of = {}
    for row, term in enumerate(sorted_vocab):
        row_of.setdefault(term, row)

    pos_vecs = []
    for i, doc in enumerate(docs):
        weighted_sum = np.zeros((dim,))
        # NOTE(review): a token repeated k times adds its weighted vector k
        # times; if tf_matrix already holds occurrence counts this weights
        # repeated tokens twice — confirm that this is intended.
        for word in doc:
            if word in w2v_model.wv:
                if word not in row_of:
                    # Same exception type list.index() raised for a missing term.
                    raise ValueError(f"{word!r} is not in sorted_vocab")
                w = tf_matrix[row_of[word]][i]
                weighted_sum += w * w2v_model.wv[word]

        n_words = len(doc)
        # An empty document would otherwise divide 0 by 0 and produce NaNs.
        pos_vecs.append(weighted_sum / n_words if n_words else weighted_sum)

    return pos_vecs

In [41]:
# Compute one position vector per document and display how many were produced.
pos_vecs = vanilla_pos_vec(w2v_model, tf_docs, sorted_vocab, docs)
len(pos_vecs)

1398

## Doc2Vec

In [None]:
vec_size = 50  # dimensionality of the document embeddings

if not skipTraining:
    # Tag every document with its position in the corpus (as a string), train
    # Doc2Vec on the tagged corpus, then persist the model to disk.
    tagged_docs = [
        TaggedDocument(words=tokens, tags=[str(idx)])
        for idx, tokens in enumerate(docs)
    ]
    d2v_model = Doc2Vec(
        tagged_docs,
        vector_size=vec_size,
        min_count=2,
    )
    d2v_model.save("d2v.model")

In [None]:
# When training is skipped, restore the Doc2Vec model from "d2v.model".
# The file holds a Doc2Vec model, so it is loaded through Doc2Vec rather than
# through Word2Vec.
if skipTraining:
    d2v_model = Doc2Vec.load("d2v.model")

In [None]:
# Infer an embedding for the first document with the Doc2Vec model and display it.
v1 = d2v_model.infer_vector(docs[0])
v1