In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import torchvision.transforms as transforms



In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used below (a no-op when they are already cached).
# Recent NLTK releases load the tokenizer model from 'punkt_tab' instead of
# 'punkt', so both are requested to keep word_tokenize working across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample paragraph used as the (tiny) training corpus.
# NOTE: the text must end cleanly with "words." - any stray character glued to
# the final word would survive punctuation stripping as a bogus vocabulary token.
paragraph = """Word2Vec is a popular algorithm used to transform words into continuous vector representations, capturing their semantic and syntactic relationships.
It uses two key architectures: Continuous Bag of Words (CBOW), which predicts a target word from its surrounding context, and Skip-gram, which predicts the context words given a target word.
By training a neural network on a large corpus of text, Word2Vec learns dense embeddings where similar words are closer in vector space.
These embeddings can be used in downstream tasks such as text classification, clustering, or as input features for machine learning models.
Implementation typically involves libraries like Gensim or TensorFlow, where preprocessing includes tokenization, removal of stopwords, and creating a vocabulary from the dataset.
After training, the model generates vector representations that can be queried for similarity or arithmetic operations to explore relationships between words."""

# Normalise the corpus: lowercase it and strip every character that is
# neither a word character nor whitespace, then split it into tokens.
text = re.sub(r'[^\w\s]', '', paragraph.lower())
tokens = nltk.word_tokenize(text)

# Drop English stopwords so only content-bearing tokens remain.
stop_words = set(stopwords.words('english'))
tokens = [tok for tok in tokens if tok not in stop_words]

# Build CBOW training pairs: every interior token is a target, and the
# WINDOW_SIZE tokens on each side of it form its context. Tokens closer than
# WINDOW_SIZE to either end are skipped because their window would be incomplete.
WINDOW_SIZE = 2
data = []

for i in range(WINDOW_SIZE, len(tokens) - WINDOW_SIZE):
    # Left neighbours followed by right neighbours, target itself excluded.
    context = tokens[i - WINDOW_SIZE:i] + tokens[i + 1:i + WINDOW_SIZE + 1]
    target = tokens[i]
    data.append((context, target))

# Display the first 5 context-target pairs
print(f"Sample Context-Target Pairs: {data[:5]}")



In [53]:
# Vocabulary = unique tokens in first-occurrence order. Deduplicating matters:
# with repeated tokens the indices would have gaps, len(vocab) would overstate
# the number of words, and some embedding rows would never be trained or be
# mappable back to a word.
vocab = list(dict.fromkeys(tokens))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
id_to_word = {idx: word for word, idx in word_to_idx.items()}

# Replace every word in the (context, target) pairs by its integer index.
encoded_data = [
    ([word_to_idx[word] for word in context], word_to_idx[target])
    for context, target in data
]

In [54]:
class wordtovecdataset(Dataset):
    """Dataset wrapper around a sequence of (context indices, target index) pairs."""

    def __init__(self, data):
        """Store the already index-encoded (context, target) pairs."""
        self.data = data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return pair `idx` as two int64 tensors: (context, target)."""
        sample = self.data[idx]
        context_ids, target_id = sample
        context_tensor = torch.tensor(context_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_tensor, target_tensor
# Wrap the encoded pairs and serve them in shuffled mini-batches of 8.
dataset = wordtovecdataset(data=encoded_data)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=8)
    

In [55]:
class word2vec(nn.Module):
    """CBOW-style model: averaged context embeddings fed to a linear layer over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        vocab_size: number of rows in the embedding table and of output scores.
        embedding_dim: size of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        """Return one score per vocabulary entry for each row of `context`.

        `context` holds word indices; the embeddings looked up for them are
        averaged over dim 1 before the linear projection.
        """
        averaged_context = self.embeddings(context).mean(dim=1)
        return self.linear(averaged_context)


In [None]:
embedding_dim = 100
vocab_size = len(vocab)

# Seed PyTorch's global RNG before the model is built so that the weight
# initialisation (and the DataLoader shuffling, which draws from the same RNG)
# is reproducible across kernel restarts.
torch.manual_seed(42)

model = word2vec(vocab_size, embedding_dim)

# Cross-entropy over the vocabulary scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed number of passes over the data, reporting the loss
# summed over all mini-batches of each epoch.
epochs = 300
for epoch in range(epochs):
    total_loss = 0
    for context_batch, target_batch in dataloader:
        # Forward pass and loss for this mini-batch.
        logits = model(context_batch)
        batch_loss = criterion(logits, target_batch)

        # Clear stale gradients, backpropagate, and update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [None]:

import torch
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Extract all embeddings (weights of the embedding layer). detach() drops the
# autograd graph and cpu() keeps the NumPy conversion below working even when
# the model lives on a GPU.
word_embeddings = model.embeddings.weight.detach().cpu()

# Reduce to 3D using PCA
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(word_embeddings.numpy())  # Convert to NumPy for PCA

# Visualize the embeddings in a single 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot and label only the first MAX_WORDS_TO_PLOT words so the figure stays readable.
# One scatter call per word gives every point its own colour.
MAX_WORDS_TO_PLOT = 51
for word, idx in list(word_to_idx.items())[:MAX_WORDS_TO_PLOT]:
    x, y, z = reduced_embeddings[idx, :3]
    ax.scatter(x, y, z, label=word)
    ax.text(x, y, z, word, fontsize=8)

ax.set_title("3D Visualization of Word Embeddings")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_zlabel("PCA Component 3")

plt.show()



In [None]:
import torch
import torch.nn.functional as F

# Extract the embedding matrix from the model. detach() returns the weights
# without gradient tracking and is the supported replacement for the legacy
# `.data` attribute.
embedding_matrix = model.embeddings.weight.detach()  # Shape: (vocab_size, embedding_dim)

# Function to find the most similar word
def find_most_similar(word, word_to_idx, id_to_word, embedding_matrix, top_n=5):
    """Return up to `top_n` words whose embeddings are most cosine-similar to `word`.

    Args:
        word: Query word; must be a key of `word_to_idx`.
        word_to_idx: Mapping from word to its row index in `embedding_matrix`.
        id_to_word: Mapping from row index back to word. Only rows that appear
            here are considered as candidates.
        embedding_matrix: 2-D tensor with one embedding per row.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of words ordered from most to least similar, never containing
        the query itself. It is shorter than `top_n` when fewer candidates
        exist, and empty when `top_n` is not positive.

    Raises:
        KeyError: If `word` is not in `word_to_idx`.
    """
    # Get the index and the embedding of the input word
    word_idx = word_to_idx[word]
    word_embedding = embedding_matrix[word_idx]  # Shape: (embedding_dim,)

    # Cosine similarity between the query embedding and every row
    similarities = F.cosine_similarity(word_embedding.unsqueeze(0), embedding_matrix, dim=1)

    # Rank only rows that map back to a word, skipping the query itself, so a
    # row without an id_to_word entry can never cause a failed lookup.
    candidates = [idx for idx in id_to_word if idx != word_idx]
    if top_n <= 0 or not candidates:
        return []

    candidate_idx = torch.tensor(candidates, dtype=torch.long, device=similarities.device)

    # Clamp k: topk raises when asked for more items than are available.
    k = min(top_n, len(candidates))
    best_positions = torch.topk(similarities[candidate_idx], k).indices

    # Convert positions within the candidate list back to words
    return [id_to_word[candidates[pos]] for pos in best_positions.tolist()]

# Try the lookup on a few words from the corpus
for word in ("architectures", "implementation", "algorithm"):
    most_similar_words = find_most_similar(word, word_to_idx, id_to_word, embedding_matrix, top_n=5)
    print(f"Words most similar to '{word}': {most_similar_words}")

