In [1]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Toy corpus: three short sentences, one document per string
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings"
]

# Turn every document into a list of tokens via gensim's simple_preprocess
processed_corpus = list(map(simple_preprocess, corpus))

# Fit a Word2Vec model on the tokenized documents.
# min_count=1 keeps every token, which matters for a corpus this small.
model = Word2Vec(
    sentences=processed_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Look up the learned vector for a single vocabulary word
word = 'example'
embedding = model.wv[word]

# Show the vector, then its length (one entry per embedding dimension)
print(f"Embedding for '{word}':\n{embedding}")
print(len(embedding))


Embedding for 'example':
[ 9.7702928e-03  8.1651136e-03  1.2809718e-03  5.0975787e-03
  1.4081288e-03 -6.4551616e-03 -1.4280510e-03  6.4491653e-03
 -4.6173059e-03 -3.9930656e-03  4.9244044e-03  2.7130984e-03
 -1.8479753e-03 -2.8769434e-03  6.0107317e-03 -5.7167388e-03
 -3.2367026e-03 -6.4878250e-03 -4.2346325e-03 -8.5809948e-03
 -4.4697891e-03 -8.5112294e-03  1.4037776e-03 -8.6181965e-03
 -9.9166557e-03 -8.2016252e-03 -6.7726658e-03  6.6805850e-03
  3.7845564e-03  3.5616636e-04 -2.9579818e-03 -7.4283206e-03
  5.3341867e-04  4.9989222e-04  1.9561886e-04  8.5259555e-04
  7.8633073e-04 -6.8160298e-05 -8.0070542e-03 -5.8702733e-03
 -8.3829118e-03 -1.3120425e-03  1.8206370e-03  7.4171280e-03
 -1.9634271e-03 -2.3252917e-03  9.4871549e-03  7.9704521e-05
 -2.4045217e-03  8.6048469e-03  2.6870037e-03 -5.3439722e-03
  6.5881060e-03  4.5101536e-03 -7.0544672e-03 -3.2317400e-04
  8.3448651e-04  5.7473574e-03 -1.7176545e-03 -2.8065301e-03
  1.7484308e-03  8.4717153e-04  1.1928272e-03 -2.6342822e-03

In [None]:
from gensim.models import Word2Vec

# Three short sentences used as the training corpus
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

# Whitespace-split each sentence into tokens (case and punctuation are kept as-is)
tokenized_corpus = list(map(str.split, corpus))

# Fit Word2Vec on the tokens; sg=1 requests skip-gram training
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)

# Fetch and show the embedding learned for 'Walter'
vector = model.wv['Walter']
print("Vector for 'Walter':", vector)

# Ask the model for the nearest neighbours of 'Walter' and show them
similar_words = model.wv.most_similar('Walter')
print("Words most similar to 'Walter':", similar_words)


Vector for 'Walter': [ 8.1681199e-03 -4.4430327e-03  8.9854337e-03  8.2536647e-03
 -4.4352221e-03  3.0310510e-04  4.2744912e-03 -3.9263200e-03
 -5.5599655e-03 -6.5123225e-03 -6.7073823e-04 -2.9592158e-04
  4.4630850e-03 -2.4740540e-03 -1.7260908e-04  2.4618758e-03
  4.8675989e-03 -3.0808449e-05 -6.3394094e-03 -9.2608072e-03
  2.6657581e-05  6.6618943e-03  1.4660227e-03 -8.9665223e-03
 -7.9386048e-03  6.5519023e-03 -3.7856805e-03  6.2549924e-03
 -6.6810320e-03  8.4796622e-03 -6.5163244e-03  3.2880199e-03
 -1.0569858e-03 -6.7875278e-03 -3.2875966e-03 -1.1614120e-03
 -5.4709399e-03 -1.2113475e-03 -7.5633135e-03  2.6466595e-03
  9.0701487e-03 -2.3772502e-03 -9.7651005e-04  3.5135616e-03
  8.6650876e-03 -5.9218528e-03 -6.8875779e-03 -2.9329848e-03
  9.1476962e-03  8.6626766e-04 -8.6784009e-03 -1.4469790e-03
  9.4794659e-03 -7.5494875e-03 -5.3580985e-03  9.3165627e-03
 -8.9737261e-03  3.8259076e-03  6.6544057e-04  6.6607012e-03
  8.3127534e-03 -2.8507852e-03 -3.9923131e-03  8.8979173e-03
  2

In [3]:
import gensim.downloader as api
from gensim.models import Word2Vec

from pathlib import Path

# Where the trained model is persisted between runs
MODEL_PATH = Path('text8_word2vec.model')

# Load the text8 corpus (kept unconditional so `dataset` is always defined)
dataset = api.load("text8")

# Training on text8 is the expensive step, so only do it when no saved model
# exists yet. Delete the model file to force a retrain (e.g. after changing
# any of the hyperparameters below).
if not MODEL_PATH.exists():
    # Build the Word2Vec model and save it
    Word2Vec(dataset, vector_size=100, window=5, min_count=1, workers=4).save(str(MODEL_PATH))

# Load the saved model; `model` is always the instance read back from disk
model = Word2Vec.load(str(MODEL_PATH))

# Example: Get the vector for a word, guarding against out-of-vocabulary words
word = "king"
if word in model.wv.key_to_index:
    vector = model.wv[word]
    print(f"Embedding for the word '{word}':\n{vector}")
else:
    print(f"The word '{word}' is not in the vocabulary.")

Embedding for the word 'king':
[ 3.56425524e-01 -3.09262931e-01  1.64713883e+00  2.36713719e+00
  3.12558631e-03  2.07166791e+00  6.60514474e-01  1.89080298e+00
  2.70714343e-01  1.06550038e+00  6.26207441e-02 -4.32187825e-01
  6.50705248e-02  3.99803567e+00 -1.24391389e+00 -7.10372865e-01
  6.70010626e-01 -1.98204126e-02  3.31455112e+00 -2.61795402e-01
 -1.25638753e-01  1.85342634e+00 -1.34533727e+00 -1.12235701e+00
 -2.39253283e+00 -1.41270101e+00  1.59852898e+00  5.22346914e-01
 -2.51499921e-01 -3.88127542e-03  2.05755487e-01  1.02056563e+00
 -6.04657590e-01  2.67404288e-01 -2.42026914e-02  3.22725987e+00
  6.83496237e-01 -3.28612423e+00  9.98235881e-01  3.04610491e-01
  2.34247714e-01  6.73422873e-01 -1.04952776e+00  6.52106464e-01
 -2.08781362e+00  8.07376146e-01  2.95666289e-02 -1.16191316e+00
  4.77713645e-01 -3.15957165e+00  1.84285879e+00  1.37641823e+00
 -2.98691154e+00  3.61585236e+00  7.00471222e-01 -1.27060390e+00
  4.17793036e-01  1.98889291e+00  8.64774082e-03 -3.8375220