In [1]:
import numpy as np
from time import time
from gensim.test.utils import datapath
from gensim import utils
import logging
from naive_word2vec import NaiveWord2VecNGS

import gensim
print(gensim.__version__)

4.1.2


In [2]:
# Raise the root logger threshold so that only WARNING-and-above records are
# emitted by the cells below.
logging.root.setLevel('WARNING')

In [3]:
def _nearest_words(vectors, model, word, top):
    """Return the words ranked most similar to ``word`` (at most ``top`` of them).

    ``vectors`` selects the lookup strategy:

    * a plain ``dict`` of embeddings -> ``model.most_similar_words(word, tops=top)``
    * a gensim ``KeyedVectors``      -> ``vectors.most_similar(positive=[word], topn=top)``

    Both lookups are expected to yield sequences whose first element is the
    neighbour word; only that first element is kept.

    Raises:
        TypeError: if ``vectors`` is neither a ``dict`` nor a ``KeyedVectors``.
    """
    if isinstance(vectors, dict):
        ranked = model.most_similar_words(word, tops=top)
    elif isinstance(vectors, gensim.models.keyedvectors.KeyedVectors):
        ranked = vectors.most_similar(positive=[word], topn=top)
    else:
        # Previously this case fell through silently and either crashed with
        # an unbound local or reused the neighbours of the previous word.
        raise TypeError(
            "expected a dict or gensim KeyedVectors, got " + type(vectors).__name__
        )
    return [entry[0] for entry in ranked]


def most_similar_score(vocab, source, target, source_model, target_model, top=5):
    """Average top-``top`` neighbour overlap between two embedding models.

    For every word in ``vocab`` the ``top`` nearest neighbours are fetched from
    the source and from the target; the per-word score is the number of source
    neighbours that also appear among the target neighbours, divided by ``top``.

    Args:
        vocab: iterable of words to evaluate.
        source: ``dict`` of embeddings or gensim ``KeyedVectors`` (source side).
        target: ``dict`` of embeddings or gensim ``KeyedVectors`` (target side).
        source_model: object exposing ``most_similar_words(word, tops=...)``;
            only used when ``source`` is a ``dict``.
        target_model: same as ``source_model``, for the target side.
        top: number of neighbours to compare per word.

    Returns:
        ``np.mean`` of the per-word scores (``nan`` for an empty ``vocab``).

    Raises:
        TypeError: if ``source`` or ``target`` has an unsupported type.
    """
    result = []

    for word in vocab:
        source_most = _nearest_words(source, source_model, word, top)
        target_most = set(_nearest_words(target, target_model, word, top))

        # Slice instead of indexing range(top): a lookup may return fewer than
        # ``top`` neighbours (e.g. tiny vocabularies), which used to raise
        # IndexError. The denominator stays ``top`` so scores are unchanged
        # whenever full neighbour lists are available.
        count = sum(1 for neighbour in source_most[:top] if neighbour in target_most)

        result.append(count / top)
    return np.mean(result)

# Training samples

In [4]:
tokens_size = 4000 

def samples(size=4000):
    """Collect the first ``size`` tokens of gensim's ``lee_background.cor`` test corpus.

    The file is read line by line, each line is tokenized with
    ``utils.simple_preprocess``, and reading stops as soon as at least
    ``size`` tokens have been gathered (the list is then cut to exactly
    ``size``). If the file holds fewer tokens, all of them are returned.

    Args:
        size: maximum number of tokens to return.

    Returns:
        tuple ``(corpus, tokens)`` where ``tokens`` is the flat list of tokens
        and ``corpus`` is the same tokens joined by single spaces.
    """
    tokens = []
    corpus_path = datapath('lee_background.cor')
    # The context manager closes the handle even though the loop usually
    # exits early through ``break``.
    with open(corpus_path) as corpus_file:
        for line in corpus_file:
            tokens += utils.simple_preprocess(line)
            if len(tokens) >= size:
                tokens = tokens[:size]
                break
    corpus = " ".join(tokens)
    return corpus, tokens

# Pass the configured size explicitly so that changing ``tokens_size`` takes effect.
# Note: ``sentences`` is a flat token list, not a list of sentences.
corpus, sentences = samples(size=tokens_size)

# Comparison of word similarity

In [5]:
# Hyper-parameter grid applied to both the naive and the gensim implementation.
vector_sizes = [300]
epochs = [20]
window_sizes = [13, 15]
Ks = [15, 20]  # negative samples count

# The whole grid is evaluated twice.
for i in range(2):
    print("Iter:", i + 1)
    # Flattened form of the loops nested as epoch -> vector_size -> window_size -> K;
    # the visiting order is the same.
    for epoch, vector_size, window_size, K in (
        (e, v, w, k)
        for e in epochs
        for v in vector_sizes
        for w in window_sizes
        for k in Ks
    ):
        model_naive = NaiveWord2VecNGS(
            window_size=window_size,
            learning_rate=0.1,
            n_dim=vector_size,
            epochs=epoch,
            neg_samples=K,
        )
        # Only the naive model's training call is timed.
        start_time = time()
        model_naive.train(corpus, compute_loss=True)
        end_time = time() - start_time  # elapsed seconds, despite the name

        # The token list is wrapped in a list, i.e. passed as one single sentence.
        gensim_model_ngs = gensim.models.Word2Vec(
            sentences=[sentences],
            vector_size=vector_size,
            window=window_size,
            alpha=0.1,
            epochs=epoch,
            min_count=0,
            sg=1,
            hs=0,
            negative=K,
            workers=1,
            compute_loss=False,
            batch_words=1,
            seed=42,
        )

        # Iterating this mapping yields the vocabulary words.
        vocab = gensim_model_ngs.wv.key_to_index

        # Share of top-10 neighbours on which the two models agree.
        accuracy = most_similar_score(
            vocab,
            model_naive.w2v,
            gensim_model_ngs.wv,
            source_model=model_naive,
            target_model=gensim_model_ngs,
            top=10,
        )
        print(
            "window_size:", window_size,
            "K:", K,
            "time:", round(end_time, 2),
            "accuracy:", round(accuracy, 2),
        )


Iter: 1
window_size: 13 K: 15 time: 32.31 accuracy: 0.64
window_size: 13 K: 20 time: 33.08 accuracy: 0.65
window_size: 15 K: 15 time: 35.13 accuracy: 0.66
window_size: 15 K: 20 time: 31.25 accuracy: 0.67
Iter: 2
window_size: 13 K: 15 time: 32.31 accuracy: 0.65
window_size: 13 K: 20 time: 30.35 accuracy: 0.65
window_size: 15 K: 15 time: 33.68 accuracy: 0.66
window_size: 15 K: 20 time: 37.03 accuracy: 0.67
