# Computing the embeddings

- This is just a toy example that trains on a tiny corpus (`corpus.txt`, 1000 text articles).

In [None]:
import os
import fasttext

In [None]:
# Settings shared by the training and export cells.

# Values handed to fasttext.train_unsupervised.
n_dims = 300    # passed as `dim`
epochs = 10     # passed as `epoch`
min_count = 2   # passed as `minCount`
n_threads = 8   # passed as `thread`; match your machine

# Intended upper bound on the number of words written to the embeddings file.
max_vocab = 20_000

In [None]:
# Train unsupervised FastText word vectors on the corpus file.
print(f'Training FastText with {n_threads} threads')

model = fasttext.train_unsupervised(
    'corpus.txt',
    model='cbow',  # alternative: 'skipgram'
    dim=n_dims,  # size of each word vector
    ws=5,  # context window size
    epoch=epochs,
    minCount=min_count,  # minimal number of occurrences for a word to be kept
    thread=n_threads,
)

# max_vocab is not a training argument; it is reported here only so it can be
# compared with the size of the vocabulary the model actually learned.
print(
    f'FastText: epochs {epochs}, dims {n_dims}, max_vocab {max_vocab}, '
    f'model words {len(model.words)}'
)

In [None]:
embeddings_file = 'new_embeddings.txt'

# Save FastText embeddings as txt in the same format used by Gensim/Spacy.
# - https://stackoverflow.com/a/58342618/3248063
# - https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py

# Keep at most `max_vocab` entries from the front of the vocabulary. A slice
# simply returns the whole list when it is shorter than `max_vocab`, so no
# length check is needed (the previous check was inverted and never truncated).
words = model.words[:max_vocab]  # alternative: model.get_words()[:max_vocab]

# Pin the encoding so non-ASCII tokens are written the same way on every
# platform instead of depending on the locale default.
with open(embeddings_file, 'w', encoding='utf-8') as f:
    # write file header like: 11995 300
    f.write(f'{len(words)} {model.get_dimension()}\n')

    # One line per word: the token followed by its space-separated components.
    for w in words:
        v = model.get_word_vector(w)
        f.write(f'{w} {" ".join(str(dim) for dim in v)}\n')

print(f'Finished writing embeddings to {embeddings_file}')