# Word2Vec: Shakespeare
Word2Vec is an ingenious method for converting the semantic value and associations of words into a lower-dimensional vector space. 
<br> <br>
In this notebook, we will take the many words of Shakespeare and use them to train a machine learning model. 

### Get our Data

In [2]:
import re

# Read the whole corpus. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open("alllines.txt", 'r') as corpus_file:
    text = corpus_file.read()

# Lower-case so that "The" and "the" map to the same token.
text = text.lower()
# Keep only the first half of the characters to reduce the amount of training data.
text = text[: len(text) // 2]

# Extract tokens: split on runs of non-word characters and drop the empty
# strings that re.split produces at the start/end of the text.
tokens = [token for token in re.split(r'\W+', text) if token]

def mapping(tokens):
    """Build two-way lookup tables between unique tokens and integer ids.

    Ids are assigned in order of first appearance in ``tokens``, so the same
    token sequence always yields the same ids. Iterating over a ``set`` of
    strings does not guarantee a stable order between interpreter runs, which
    would make ids (and anything saved that depends on them) irreproducible.

    Args:
        tokens: iterable of hashable tokens (duplicates allowed).

    Returns:
        A ``(word_to_id, id_to_word)`` tuple of dicts; ids run from 0 to
        ``number_of_unique_tokens - 1``.
    """
    word_to_id = {}
    id_to_word = {}

    # dict.fromkeys de-duplicates while preserving first-seen order.
    for i, token in enumerate(dict.fromkeys(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

# Build the token -> id and id -> token lookups from the corpus tokens.
word_to_id, id_to_word = mapping(tokens)
# Uncomment to inspect the mapping:
# word_to_id

### Create Training Data (kinda)

Due to the size of the dataset and the huge vectors required (for 22,602 words total ;-;), we are writing to a CSV file to store the word pairs produced by sliding our context window over the text. 

In [3]:
# Return the relevant id for a given word
def get_id(word):
    """Look up the integer id stored for ``word`` in the module-level ``word_to_id`` dict.

    Raises:
        KeyError: if ``word`` is not a key of ``word_to_id``.
    """
    token_id = word_to_id[word]
    return token_id

# Define concat function for indices
def concat(*iterables):
    """Lazily yield every item of each iterable in turn, in the order given."""
    for source in iterables:
        for item in source:
            yield item

In [4]:
import csv

# Target CSV file name for the generated word pairs.
# NOTE(review): nothing in the visible cells reads or writes this path yet — confirm it is still needed.
csv_file = 'word_pairs.csv'

def generate_word_pairs(tokens, window, vocab=None):
    """Build (center_id, context_id) pairs for every token and its neighbours.

    For the token at position ``i`` the context is every position within
    ``window`` steps to the left and right of ``i`` (clipped to the sequence
    bounds), excluding ``i`` itself. Pairs are emitted in position order:
    left neighbours first, then right neighbours.

    Args:
        tokens: sequence of tokens.
        window: number of neighbours to take on each side of the center token.
        vocab: optional token -> id dict. Defaults to the module-level
            ``word_to_id`` so existing two-argument calls keep working.

    Returns:
        List of ``(center_id, context_id)`` tuples.

    Raises:
        KeyError: if a token is missing from the vocabulary.
    """
    if vocab is None:
        vocab = word_to_id

    # Translate each token once up front instead of on every inner-loop step.
    ids = [vocab[token] for token in tokens]
    n_tokens = len(ids)

    word_pairs = []
    for i, center_id in enumerate(ids):
        left_start = max(0, i - window)
        right_end = min(n_tokens, i + window + 1)
        # Two ranges that never include i, so no "skip the center" check is needed.
        word_pairs.extend((center_id, ids[j]) for j in range(left_start, i))
        word_pairs.extend((center_id, ids[j]) for j in range(i + 1, right_end))

    return word_pairs

# Build (center_id, context_id) training pairs using a context window of 2 tokens on each side.
pairs = generate_word_pairs(tokens, 2)


In [5]:

# pairs[0]
# id_to_word[11115]
# id_to_word[434]

In [6]:
import torch

def get_vec_from_id(ind):
    """Return a float32 one-hot vector with a 1 at position ``ind``.

    The vector length is the current size of the module-level ``word_to_id`` dict.
    """
    vocab_len = len(word_to_id)
    one_hot = torch.zeros(vocab_len, dtype=torch.float32)
    one_hot[ind] = 1
    return one_hot


def generate_training_data(pair):
    """Turn ONE (input_id, label_id) pair into its two one-hot vectors.

    Pass a single pair, not the whole list of pairs.

    Returns:
        ``[input_vector, label_vector]``, each produced by ``get_vec_from_id``.
    """
    input_id, label_id = pair[0], pair[1]
    return [get_vec_from_id(input_id), get_vec_from_id(label_id)]

# Testing a batch of 32 can load
# temp = []
# for i in range(0, 32):
#     temp.append(generate_training_data(pairs[i]))
    # print(temp)

### Building the Model

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class SimpleNN(nn.Module):
    """Two-layer feed-forward network: vocab-sized input -> hidden layer -> vocab-sized logits."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # Input projection: vocab_size features down to hidden_size.
        self.fc1 = nn.Linear(vocab_size, hidden_size)
        # Non-linearity applied to the hidden representation.
        self.activation = nn.ReLU()
        # Output projection: hidden_size back up to one score per vocabulary entry.
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return unnormalised scores (logits) of width vocab_size for input ``x``."""
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)

# Hyperparameters
seed = 42                      # RNG seed so weight initialisation is repeatable across runs
vocab_size = len(word_to_id)   # Number of distinct words; width of the model's input and output
pair_size = len(pairs)         # Number of (center, context) training pairs
hidden_size = 100              # Size of hidden layer
learning_rate = 0.01           # Learning rate passed to Adam
num_epochs = 10                # Number of passes over all pairs
batch_size = 32                # Number of pairs per optimisation step

# Seed PyTorch BEFORE building the model: nn.Linear draws its initial weights
# from the global RNG at construction time.
torch.manual_seed(seed)

# Initialize model, loss function, and optimizer
model = SimpleNN(vocab_size, hidden_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # Sum of per-sample losses over the epoch, so that dividing by the number
    # of pairs below gives a true per-sample average.
    total_loss = 0.0

    # Wrap the training loop with tqdm to show the progress bar
    with tqdm(total=len(pairs), desc=f'Epoch {epoch + 1}/{num_epochs}', unit=' samples') as pbar:
        for start_index in range(0, len(pairs), batch_size):
            batch_pairs = pairs[start_index:start_index + batch_size]
            # The last batch may hold fewer than batch_size pairs. Size everything
            # by the real count so no all-zero padding rows reach the model or
            # get counted in the loss average.
            n_samples = len(batch_pairs)

            pair_ids = torch.tensor(batch_pairs, dtype=torch.long)  # Shape: (n_samples, 2)
            center_ids = pair_ids[:, 0]
            context_ids = pair_ids[:, 1]

            # One-hot encode all center words in one vectorised call.
            center_batch = nn.functional.one_hot(
                center_ids, num_classes=vocab_size
            ).to(torch.float32)  # Shape: (n_samples, vocab_size)

            # Forward pass
            logits = model(center_batch)  # Shape: (n_samples, vocab_size)

            # CrossEntropyLoss accepts class indices as targets; this gives the
            # same value as a one-hot probability target without materialising
            # a second (n_samples, vocab_size) tensor.
            loss = loss_function(logits, context_ids)
            # loss is the mean over this batch; weight it by the batch size.
            total_loss += loss.item() * n_samples

            # Zero gradients, backward pass, and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Advance by the number of samples actually processed.
            pbar.update(n_samples)

    # max(..., 1) avoids ZeroDivisionError when there are no pairs at all.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / max(len(pairs), 1):.4f}')

# Persist the model weights and the optimizer state together in one checkpoint file.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, 'model_checkpoint.pth')


Epoch 1/10:   3%|▎         | 46560/1669986 [00:23<13:24, 2018.24 samples/s]


KeyboardInterrupt: 

In [None]:

import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Assume the model has already been trained
# fc1.weight has shape (hidden_size, vocab_size): column i holds the weights
# applied to input index i, i.e. the learned vector for word id i.
# Transpose so that each ROW is one word's embedding. That is the
# (n_samples, n_features) layout PCA / t-SNE expect, and it makes row i line
# up with word id i when the points are annotated.
embeddings = model.fc1.weight.data.cpu().numpy().T  # Shape: (vocab_size, hidden_size)

# Reduce dimensions using PCA or t-SNE
def visualize_embeddings(embeddings, words, method='pca', n_components=2):
    """Reduce embeddings to a few dimensions and draw an annotated scatter plot.

    Args:
        embeddings: 2-D array-like with one row per point to plot.
        words: labels for the points; ``words[i]`` annotates row ``i``.
        method: ``'pca'`` or ``'tsne'``; selects the dimensionality reducer.
        n_components: number of dimensions to reduce to. Only the first two
            are drawn.

    Raises:
        ValueError: if ``method`` is neither ``'pca'`` nor ``'tsne'``.
    """
    # Validate the method before doing any work, so a typo produces a clear
    # error rather than an unbound-variable failure further down.
    if method == 'pca':
        reducer = PCA(n_components=n_components)
    elif method == 'tsne':
        reducer = TSNE(n_components=n_components)
    else:
        raise ValueError(f"Unknown method {method!r}; expected 'pca' or 'tsne'")

    reduced_embeddings = reducer.fit_transform(embeddings)

    plt.figure(figsize=(30, 30))
    sns.scatterplot(x=reduced_embeddings[:, 0], y=reduced_embeddings[:, 1])

    # Annotate points with words from the vocabulary
    for i, word in enumerate(words):
        plt.annotate(word, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))

    plt.title(f"Word Embeddings Visualization using {method.upper()}")
    plt.show()

# Visualize the embeddings
# word_to_id was filled in ascending id order, so its keys come back ordered by id.
word_list = list(word_to_id.keys())  # Labels for the plot, one per vocabulary word
visualize_embeddings(embeddings, word_list)  # Uses the default method='pca'; pass method='tsne' for t-SNE
