In [7]:
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
from pprint import pprint

In [8]:
print("Loading pretrained model...")
pretrained_model = api.load("word2vec-google-news-300")
print("Loaded Google News Word2Vec model!")

Loading pretrained model...
Loaded Google News Word2Vec model!


In [9]:
words = ['science', 'coffee', 'music', 'apple', 'teacher']

print("\n--- Similar Words from Pretrained Model ---")
for word in words:
    print(f"\nTop 5 similar words to '{word}':")
    for similar, score in pretrained_model.most_similar(word, topn=5):
        print(f"  {similar} ({score:.4f})")


--- Similar Words from Pretrained Model ---

Top 5 similar words to 'science':
  faith_Jezierski (0.6965)
  sciences (0.6821)
  biology (0.6776)
  scientific (0.6535)
  mathematics (0.6301)

Top 5 similar words to 'coffee':
  coffees (0.7213)
  gourmet_coffee (0.7057)
  Coffee (0.6900)
  o_joe (0.6891)
  Starbucks_coffee (0.6875)

Top 5 similar words to 'music':
  classical_music (0.7198)
  jazz (0.6835)
  Music (0.6596)
  Without_Donny_Kirshner (0.6416)
  songs (0.6396)

Top 5 similar words to 'apple':
  apples (0.7204)
  pear (0.6451)
  fruit (0.6410)
  berry (0.6302)
  pears (0.6134)

Top 5 similar words to 'teacher':
  teachers (0.7434)
  Teacher (0.7094)
  guidance_counselor (0.6960)
  elementary (0.6791)
  PE_teacher (0.6539)


In [10]:
print("\n--- Word Analogies (Pretrained Model) ---")

analogies = [
    ('king', 'man', 'woman'),    # queen
    ('paris', 'france', 'italy'),  # rome
    ('doctor', 'hospital', 'school')  # teacher
]

for a, b, c in analogies:
    result = pretrained_model.most_similar(positive=[a, c], negative=[b], topn=1)
    print(f"{a} - {b} + {c} ≈ {result[0][0]} ({result[0][1]:.4f})")


--- Word Analogies (Pretrained Model) ---
king - man + woman ≈ queen (0.7118)
paris - france + italy ≈ lohan (0.5070)
doctor - hospital + school ≈ guidance_counselor (0.5970)


In [11]:
# Sample custom corpus
sentences = [
    ["machine", "learning", "is", "fun"],
    ["deep", "learning", "uses", "neural", "networks"],
    ["natural", "language", "processing", "with", "word2vec"],
    ["word", "embeddings", "capture", "semantic", "meaning"],
    ["vectors", "are", "mathematical", "representations"]
]

# Train model
custom_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
print("\nTrained custom Word2Vec model.")


print("\n--- Similar Words from Custom Model ---")
test_words = ["learning", "word2vec", "vectors"]
for word in test_words:
    print(f"\nTop similar words to '{word}':")
    pprint(custom_model.wv.most_similar(word, topn=3))


Trained custom Word2Vec model.

--- Similar Words from Custom Model ---

Top similar words to 'learning':
[('uses', 0.21883946657180786),
 ('embeddings', 0.21617142856121063),
 ('language', 0.0931052565574646)]

Top similar words to 'word2vec':
[('meaning', 0.2529045045375824),
 ('deep', 0.14257237315177917),
 ('representations', 0.13725489377975464)]

Top similar words to 'vectors':
[('natural', 0.17825926840305328),
 ('mathematical', 0.13149219751358032),
 ('word', 0.07499314099550247)]
