In [2]:
import numpy as np
import faiss
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

def load_corpus():
    """Return the built-in demo sentences as a list of token lists.

    Every sentence is run through gensim's ``simple_preprocess`` with its
    default settings; the order of the sentences is preserved.
    """
    sentences = (
        "The quick brown fox jumps over the lazy dog",
        "I love natural language processing and machine learning",
        "Word embeddings help capture semantic relationships",
        "The fox is clever and quick",
        "Dogs are loyal and friendly animals",
        "Machine learning models improve with more data",
    )
    # One token list per sentence, in the same order as above.
    return list(map(simple_preprocess, sentences))

def train_word2vec(tokenized_corpus, vector_size=50, window=3, min_count=1):
    """Fit a gensim ``Word2Vec`` model on an already tokenized corpus.

    Args:
        tokenized_corpus: iterable of token lists, one per sentence.
        vector_size: forwarded to ``Word2Vec`` as ``vector_size``.
        window: forwarded to ``Word2Vec`` as ``window``.
        min_count: forwarded to ``Word2Vec`` as ``min_count``.

    Returns:
        The trained ``Word2Vec`` instance.
    """
    # A single worker and a fixed seed are hard-coded here
    # (presumably to keep runs repeatable -- confirm against gensim docs).
    settings = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": 1,
        "seed": 42,
    }
    return Word2Vec(sentences=tokenized_corpus, **settings)

def build_faiss_index(word_vectors):
    """Build an exact (brute-force) L2 FAISS index over a matrix of vectors.

    Args:
        word_vectors: 2-D array-like of shape ``(n_vectors, dim)``. It is
            coerced to a C-contiguous ``float32`` array before being handed
            to FAISS, so float64 arrays, non-contiguous views and nested
            lists are accepted as well.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``word_vectors``;
        row ``i`` of the input is stored under label ``i``.
    """
    # FAISS works on float32, C-contiguous data; normalise here instead of
    # relying on every caller to cast correctly. This is a no-op (no copy)
    # when the input already satisfies both requirements.
    vectors = np.ascontiguousarray(word_vectors, dtype=np.float32)

    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

def query_similar_words(model, index, word, top_k=5):
    """Print the ``top_k`` nearest vocabulary words to ``word``.

    Args:
        model: object exposing ``model.wv`` with membership test, vector
            lookup by word, and ``index_to_key`` (label -> word).
        index: search index whose ``search(query, k)`` returns
            ``(distances, labels)`` arrays of shape ``(1, k)``; labels must
            line up with ``model.wv.index_to_key``.
        word: the query word.
        top_k: maximum number of neighbours to print.

    Returns:
        None. Results (or a "not in vocabulary" message) are printed.
    """
    if word not in model.wv:
        print(f"Word '{word}' not in vocabulary.")
        return

    query_vec = model.wv[word].reshape(1, -1).astype(np.float32)
    # Request one extra hit: the query word itself is normally in the index
    # and has to be discarded from the results.
    distances, indices = index.search(query_vec, top_k + 1)

    print(f"\nTop {top_k} words similar to '{word}':")
    shown = 0
    for dist, idx in zip(distances[0], indices[0]):
        if shown >= top_k:
            break
        # FAISS pads with label -1 when fewer than k vectors are available;
        # indexing index_to_key with -1 would silently yield the last word.
        if idx < 0:
            continue
        similar_word = model.wv.index_to_key[idx]
        # Drop the query word by identity rather than by position: with
        # tied distances it is not guaranteed to be the first hit.
        if similar_word == word:
            continue
        print(f"  {similar_word} (distance: {dist:.4f})")
        shown += 1

def run_pipeline():
    """Run the whole demo: tokenize, train, index, then print neighbours.

    Prints the tokenized corpus, trains a Word2Vec model on it, builds a
    FAISS index over the model's word vectors and prints the nearest
    neighbours for a fixed list of probe words.
    """
    sentences = load_corpus()
    print("Tokenized corpus:")
    for tokens in sentences:
        print(tokens)

    w2v = train_word2vec(sentences)
    # Cast the embedding matrix to float32 before indexing it.
    index = build_faiss_index(w2v.wv.vectors.astype(np.float32))

    for probe in ('fox', 'machine', 'dog', 'language', 'quick'):
        query_similar_words(w2v, index, probe)

# Entry point: run the demo pipeline when this code is executed as the main
# module (which includes running this cell in the notebook, as the captured
# output below shows).
if __name__ == "__main__":
    run_pipeline()

Tokenized corpus:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
['love', 'natural', 'language', 'processing', 'and', 'machine', 'learning']
['word', 'embeddings', 'help', 'capture', 'semantic', 'relationships']
['the', 'fox', 'is', 'clever', 'and', 'quick']
['dogs', 'are', 'loyal', 'and', 'friendly', 'animals']
['machine', 'learning', 'models', 'improve', 'with', 'more', 'data']

Top 5 words similar to 'fox':
  semantic (distance: 0.0088)
  word (distance: 0.0090)
  natural (distance: 0.0096)
  are (distance: 0.0099)
  language (distance: 0.0101)

Top 5 words similar to 'machine':
  friendly (distance: 0.0068)
  the (distance: 0.0092)
  are (distance: 0.0092)
  learning (distance: 0.0095)
  capture (distance: 0.0095)

Top 5 words similar to 'dog':
  loyal (distance: 0.0092)
  love (distance: 0.0104)
  and (distance: 0.0106)
  relationships (distance: 0.0108)
  dogs (distance: 0.0115)

Top 5 words similar to 'language':
  fox (distance: 0.0101)
  semantic (dist

In [2]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.4.2-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
    --------------------------------------- 0.5/24.4 MB 2.1 MB/s eta 0:00:12
   -- ------------------------------------- 1.3/24.4 MB 2.5 MB/s eta 0:00:10
   --- ------------------------------------ 1.8/24.4 MB 2.7 MB/s eta 0:00:09
   ---- ----------------------------------- 2.6/24.4 MB 3.0 MB/s eta 0:00:08
   ------ --------------------------------- 3.7/24.4 MB 3.3 MB/s eta 0:00:07
   -------- ------------------------------- 5.0/24.4 MB 3.7 MB/s eta 0:00:06
   ---------- ----------------------------- 6.3/24.4 MB 4.1 MB


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
