### Sample program to train a word2vec model using gensim  

- Uses the text8 corpus as training text  
- Download http://mattmahoney.net/dc/text8.zip and unzip it to `data/text8`

#### Import libraries  

In [6]:
import logging
from gensim.models import word2vec

#### Parameters  

In [7]:
# Input corpus and output model locations (relative to the notebook's working directory).
corpus = "data/text8"
model_file = "word2vec_text8.model"

# Training hyper-parameters.
words_per_sentence = 50  # sentence-length argument handed to Text8Corpus
embed_size = 300         # embedding dimension handed to Word2Vec
min_count = 30           # min_count handed to Word2Vec

#### Exec word2vec  

In [3]:
# Surface gensim's INFO-level progress messages in the cell output.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                    level=logging.INFO)

# Stream the corpus file as pseudo-sentences of `words_per_sentence` tokens each.
sentences = word2vec.Text8Corpus(corpus, words_per_sentence)

# CBOW: `sg` is left at its default. The training objective is also gensim's
# default (hs=0, negative=5), i.e. negative sampling with 5 noise words.
# NOTE: `size=` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size=`.
model = word2vec.Word2Vec(sentences, size=embed_size, min_count=min_count)

# Alternatives (use one of these instead of the CBOW line above).
# Passing only sg=1 would still train with negative sampling because of the
# defaults above; to really switch it off, enable hierarchical softmax and
# set negative=0.
#
# skip-gram, hierarchical softmax (no negative sampling)
#model = word2vec.Word2Vec(sentences, size=embed_size, min_count=min_count,
#                          sg=1, hs=1, negative=0)
# skip-gram with negative sampling (1 noise word)
#model = word2vec.Word2Vec(sentences, size=embed_size, min_count=min_count,
#                          sg=1, hs=0, negative=1)

# see https://radimrehurek.com/gensim/models/word2vec.html for more details

2021-10-31 16:51:54,353 : INFO : collecting all words and their counts
2021-10-31 16:51:54,354 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-31 16:51:54,477 : INFO : PROGRESS: at sentence #10000, processed 500000 words, keeping 33463 word types
2021-10-31 16:51:54,593 : INFO : PROGRESS: at sentence #20000, processed 1000000 words, keeping 52754 word types
2021-10-31 16:51:54,713 : INFO : PROGRESS: at sentence #30000, processed 1500000 words, keeping 65588 word types
2021-10-31 16:51:54,837 : INFO : PROGRESS: at sentence #40000, processed 2000000 words, keeping 78382 word types
2021-10-31 16:51:54,947 : INFO : PROGRESS: at sentence #50000, processed 2500000 words, keeping 88007 word types
2021-10-31 16:51:55,058 : INFO : PROGRESS: at sentence #60000, processed 3000000 words, keeping 96644 word types
2021-10-31 16:51:55,169 : INFO : PROGRESS: at sentence #70000, processed 3500000 words, keeping 104308 word types
2021-10-31 16:51:55,282 : INFO : PROGRE

#### Check model  

In [4]:
# First line: embedding dimension. Second line: number of words in the vocabulary.
print(model.wv.vector_size, len(model.wv.vocab), sep="\n")

300
25097


#### Save model to a file  

In [5]:
# L2-normalize the word vectors before saving. With replace=True the raw
# vectors are overwritten by the normalized ones; per the gensim docs the
# model cannot be trained further afterwards, so this must stay the last
# step before saving.
model.init_sims(replace=True)  # normalization of vectors
# Persist the whole Word2Vec object to the path given by `model_file`.
model.save(model_file)

2021-10-31 16:53:29,242 : INFO : precomputing L2-norms of word weight vectors
2021-10-31 16:53:29,272 : INFO : saving Word2Vec object under word2vec_text8.model, separately None
2021-10-31 16:53:29,273 : INFO : not storing attribute vectors_norm
2021-10-31 16:53:29,274 : INFO : not storing attribute cum_table
2021-10-31 16:53:29,898 : INFO : saved word2vec_text8.model
