# 05. Word Embeddings

Word embeddings represent words as dense, low-dimensional vectors in which words that appear in similar contexts lie close together in the vector space.

We will use **Word2Vec** from the `gensim` library.

In [None]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Ensure the tokenizer data used by word_tokenize is available.
# Older NLTK releases load the 'punkt' resource, while newer releases load
# 'punkt_tab' instead; fetching both keeps the notebook runnable on either.
# nltk.download skips resources that are already up to date.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

## Training a Word2Vec Model
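We will train a model on a small toy corpus. With only six short sentences the model cannot learn meaningful semantics, so treat the vectors and similarity scores below as a demonstration of the API rather than as real linguistic findings.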

In [None]:
# Miniature corpus: one raw sentence per list entry
sentences = [
    "I love machine learning",
    "I love deep learning",
    "Machine learning is great",
    "Deep learning is fantastic",
    "Natural Language Processing is a subfield of AI",
    "AI stands for Artificial Intelligence"
]

# Lowercase every sentence first, then split it into a list of tokens,
# giving one token list per sentence (the input shape Word2Vec expects).
lowercased_sentences = map(str.lower, sentences)
tokenized_sentences = list(map(word_tokenize, lowercased_sentences))
print("Tokenized:", tokenized_sentences)

In [None]:
# Train Word2Vec model
# vector_size: Dimension of the embedding vector
# window: Maximum distance between current and predicted word
# min_count: Ignores all words with total frequency lower than this
# sg: 0 for CBOW, 1 for Skip-gram
# seed: Seed for the random number generator, so weight initialisation and
#       sampling are repeatable
# workers: Number of training threads. A single worker removes the ordering
#          jitter of multi-threaded training, which gensim requires for a
#          deterministic run. The corpus is tiny, so there is no speed cost.
# NOTE: gensim's documentation also says that reproducibility across separate
# interpreter launches requires setting the PYTHONHASHSEED environment
# variable before the kernel starts.

model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=10,
    window=5,
    min_count=1,
    workers=1,
    sg=1,
    seed=42,
)

print("Model trained successfully!")

## Exploring Semantic Relationships

In [None]:
# The trained embeddings live on the model's `wv` attribute; bind it once
# and reuse it for both lookups below.
keyed_vectors = model.wv

# Embedding learned for a single vocabulary word
vector = keyed_vectors['learning']
print("Vector for 'learning':", vector)

# Nearest neighbours of that word, returned together with their scores
similar_words = keyed_vectors.most_similar('learning')
print("\nMost similar to 'learning':", similar_words)

## Similarity Calculation

In [None]:
# Pairwise similarity score between two vocabulary words
# (gensim documents this as the cosine similarity of their vectors).
similarity = model.wv.similarity(
    'machine',
    'learning',
)
print(f"Similarity between 'machine' and 'learning': {similarity:.4f}")