# Word Embeddings

## Training a simple word2vec model on your own text

First, define some training data and train a model on it

In [None]:
from gensim.models import Word2Vec
# Toy corpus: every inner list is one already-tokenised sentence.
# Replace or extend these with your own sentences.
sentences = [
    ['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence'],
]

# Train on the toy corpus; the minimum word count is set explicitly to 1.
model = Word2Vec(sentences, min_count=1)

# Show a summary of the trained model.
print(model)

Inspect vocabulary

In [None]:
# Collect every word known to the model and show the list.
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4.0 replaced it with
# `wv.key_to_index` -- confirm which version this notebook targets.
words = [*model.wv.vocab]
print(words)

Inspect vectors

In [None]:
# Look up the learned vector of a single word and print it.
sentence_vector = model.wv['sentence']
print(sentence_vector)

Save and load model from disk

In [None]:
# Write the trained model to disk, then read it back from the same file.
model_path = 'model.bin'
model.save(model_path)
new_model = Word2Vec.load(model_path)

# Print a summary of the reloaded model.
print(new_model)

## Visualization

via PCA plot

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Read the vocabulary of the reloaded model once and use that same list for
# both the vectors and the labels, so row i of X always belongs to words[i].
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4.0 replaced it with
# `wv.key_to_index` -- confirm which version this notebook targets.
words = list(new_model.wv.vocab)
X = new_model.wv[words]

# Reduce the word vectors to two dimensions for plotting.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])

# label every point with the word it represents
for word, (x, y) in zip(words, result):
    pyplot.annotate(word, xy=(x, y))

# pyplot.show() returns nothing, so it is called directly instead of being
# passed to display(), which would only show `None`.
pyplot.show()

## Evaluation 

via Word Analogy

In [None]:

# Analogy query on the toy model: 'sentence' + 'word2vec' - 'this' = ?
# The query goes through the model's word vectors (`.wv`); calling
# most_similar on the Word2Vec model itself is deprecated and no longer
# available in gensim >= 4.0.
word2vec_result = new_model.wv.most_similar(positive=['sentence', 'word2vec'], negative=['this'], topn=1)
print(word2vec_result)

## Using Pretrained Embeddings

Download glove embeddings.
Downloading and accessing this might take up to several minutes.

In [None]:
import gensim.downloader as api

# Name of the pretrained embedding set to fetch through gensim's downloader.
glove_name = 'glove-wiki-gigaword-50'
glove_model = api.load(glove_name)
print('model loaded')

Again, evaluation via Word Analogy. This time, we can play with many more words.

In [None]:
# Analogy query: (king - man) + woman = ?
analogy_query = dict(positive=['woman', 'king'], negative=['man'], topn=1)
glove_result = glove_model.most_similar(**analogy_query)
print(glove_result)

Evaluation via word distance

In [None]:
# show the 10 words most similar to 'boy' (topn sets how many are returned)
glove_model.most_similar(positive=['boy'], topn=10)

## Training Embeddings on larger texts

In [None]:
from gensim.models.word2vec import LineSentence
# Ask the downloader for the local path of the text8 corpus and keep it,
# instead of hard-coding a location that only exists under /root.
text8_path = api.load('text8', return_path=True)
print(text8_path)

# Read the corpus from the path gensim reported.
sentences = LineSentence(text8_path)

# train model, using a minimum word count of 5
wiki_model = Word2Vec(sentences, min_count=5)

# summarize the trained model
print(wiki_model)

Inspect content of text

In [None]:
# Print the first entry of the corpus together with its index (0).
first_entry = next(enumerate(sentences))
print(first_entry)

Again, some evaluation

In [None]:
# find the 10 words most similar to 'woman'
wiki_model.wv.most_similar(positive=['woman'], topn=10)