# Word Embeddings


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x7f00cd11d950>

We create random tensors for the vocabulary, where each word has an embedding vector associated with it. We create an index where the word at index $i$ has it's embedding stored in the $i^{th}$ row of the matrix.

In [2]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5) # 2 words in vocab, 5 dimensional embeddings. Starts with random weights between -1 and 1
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long) # Converts the value of "hello" (0) to a tensor
hello_embed = embeds(lookup_tensor) # Lookup the tensor for "hello" at position 0
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)


Let's train a simple neural network called an **N-Gram Language Model**, which tries to predict the word at position $i$ given the preceeding 2 words. It will use the trainable vector embeddings of the input words as the input, and try to predict which word comes next. By representing the numbers as a long series of embedded numbers, the neural network can modify the weights to learn to associate different words together. 

First, let's set a **context size** of 2, meaning that the neural network will look at the previous two words when making it's prediction. We will set an **embedding dimension** of 10, which will give five *latent semantic attributes** to each word.

In [3]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
print(test_sentence[:5])

['When', 'forty', 'winters', 'shall', 'besiege']


We will split the test sentence into a list of tuples, where each tuple contains the word that should be predicted and a list of the two words preceding it.

In [4]:
ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
print(ngrams[:3])

[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]


Next, we make a set of the unique words in the passage as our vocabulary and create a word to index mapping.

In [5]:
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}
print(f"Vocab has {len(vocab)} unique words")

Vocab has 97 unique words


We can now define the structure of our neural network.

In [6]:
class NGramLanguageModeler(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)       # 97 unique words * 10 embeddings for each word
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)     # Input = 20 (context of 2 words * embedding size of 10 -> 128 neurons
        self.linear2 = nn.Linear(128, vocab_size)                       # Input = 128 neurons -> 97 unique words
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1, -1)                    # Squeezes the 10-length embeddings for the two words into a single 20-length tensor
        out = F.relu(self.linear1(embeds))                              # RelU activation function after the first neural network
        out = self.linear2(out)                                         # Runs the output of the first layer through the second layer
        log_probs = F.log_softmax(out, dim=1)                           # Returns the log probability for each word in the vocab using log softmax
        return log_probs
        

We will use the **negative log likelihood loss** function, which is useful for training classification problems. We will use the **stochastic gradient descent** optimizer for calculating parameter gradients.

In [7]:
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [8]:
print(f"Embedding layer: {model.embeddings.num_embeddings} unique words * {model.embeddings.embedding_dim} embedding dimension")
print(f"Linear1 layer: {model.linear1.in_features} inputs -> {model.linear1.out_features} outputs")
print(f"Linear2 layer: {model.linear2.in_features} inputs -> {model.linear2.out_features} outputs (1 for each word)")

Embedding layer: 97 unique words * 10 embedding dimension
Linear1 layer: 20 inputs -> 128 outputs
Linear2 layer: 128 inputs -> 97 outputs (1 for each word)


Now we can define the training loop

In [28]:
losses = []
first_context = []
for epoch in range(50):
    total_loss = 0
    for context, target in ngrams:
        
        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the words 
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        if len(first_context) == 0:
            first_context.append(context_idxs[:3])
        
        # Step 2. Recall that torch accumulates gradients. Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        model.zero_grad()
        
        # Step 3. Run the forward pass, getting the log probabilities over the next words
        log_probs = model(context_idxs)
        
        # Step 4. Compute your loss function (Again, Torch wants the target word wrapped
        # in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        
        # Get the Python number from a 1-element tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    
print(f"context_idxs = {first_context}")
print(f"Losses: {[round(i, 2) for i in losses[:5]]}...{[round(i, 2) for i in losses[-5:]]}")

# To get teh embedding of a particular word, e.g. "beauty"
word = "beauty"
print(f"Final word embedding for '{word}' \n\t{model.embeddings.weight[word_to_ix[word]]}")

context_idxs = [tensor([23, 72])]
Losses: [273.81, 271.05, 268.28, 265.51, 262.75]...[157.99, 155.78, 153.59, 151.42, 149.27]
Final word embedding for 'beauty' 
	tensor([-1.0367, -0.6847,  0.3079,  1.1811, -0.0095, -0.2741,  0.4733, -0.1618,
         0.3827,  2.3051], grad_fn=<SelectBackward0>)


## Continuous Bag-of-Words

Frequently used in NLP deep learning, this model tries to predict words given the context of a few words before and after the target word.

Given a target word $w_{i}$ and an $N$ context window on each side, $w_{i-1},...,w_{i-N}$ and $w_{i+1},...,w_{i+n}$, referring to all context words collectively as $C$, CBOW tries to minimize:

$$-\text{log}p(w_{i}|C)=-\text{log}\text{Softmax}\left(A(\sum_{w \in C}q_{w}+b)\right)$$

Where $q_{w}$ is the embedding for word $w$.

We will use a **context size** of 2, to look at two words to the left and two to the right of the target word.

In [42]:
CONTEXT_SIZE = 2
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []

# Iterate through each target word, from the 3rd word to the N-3rd word in the sequence
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)] +    # Get the words behind the target
        [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]      # Get the words in front of the target
    )
    target = raw_text[i]                                        # Get the target
    data.append((context, target))

print(data[:3])

[(['are', 'We', 'to', 'study'], 'about'), (['about', 'are', 'study', 'the'], 'to'), (['to', 'about', 'the', 'idea'], 'study')]


In [None]:
class CBOW(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)           # Vocab size of
        self.linear1 = nn.Linear(context_size * embedding_dim * 2, 128)     # Input = 40 (context of 2 words * embedding size of 10 * 2 for words before and after -> 128 neurons
        self.linear2 = nn.Linear(128, vocab_size)                           # Input = 128 neurons -> 97 unique words
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1, -1)                    # Squeezes the 10-length embeddings for the two words into a single 20-length tensor
        out = F.relu(self.linear1(embeds))                              # RelU activation function after the first neural network
        out = self.linear2(out)                                         # Runs the output of the first layer through the second layer
        log_probs = F.log_softmax(out, dim=1)                           # Returns the log probability for each word in the vocab using log softmax
        return log_probs

['are', 'We', 'to', 'study']


In [47]:
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [53]:
print(f"Embedding layer: {model.embeddings.num_embeddings} unique words * {model.embeddings.embedding_dim} embedding dimension")
print(f"Linear1 layer: {model.linear1.in_features} inputs -> {model.linear1.out_features} outputs")
print(f"Linear2 layer: {model.linear2.in_features} inputs -> {model.linear2.out_features} outputs (1 for each word)")

Embedding layer: 49 unique words * 10 embedding dimension
Linear1 layer: 40 inputs -> 128 outputs
Linear2 layer: 128 inputs -> 49 outputs (1 for each word)


In [52]:
def make_context_vector(context, word_to_ix):
    """Converts the input to a tensor with their corresponding indices in `word_to_ix`"""
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

losses = []
first_context = []
for epoch in range(50):
    total_loss = 0
    
    for context, target in data:
        
        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the words 
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        if len(first_context) == 0:
            first_context.append(context_idxs[:3])
        
        # Step 2. Recall that torch accumulates gradients. Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        model.zero_grad()
        
        # Step 3. Run the forward pass, getting the log probabilities over the next words
        log_probs = model(context_idxs)
        
        # Step 4. Compute your loss function (Again, Torch wants the target word wrapped
        # in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        
        # Get the Python number from a 1-element tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    
print(f"context_idxs = {first_context}")
print(f"Losses: {[round(i, 2) for i in losses[:5]]}...{[round(i, 2) for i in losses[-5:]]}")

context_idxs = [tensor([47, 27, 43])]
Losses: [36.06, 35.46, 34.86, 34.28, 33.7]...[17.75, 17.5, 17.26, 17.02, 16.79]


In [69]:

    
import pandas as pd
column_map = {i: f"Dim{i+1}" for i in range(EMBEDDING_DIM)}
bag_of_words_df = pd.DataFrame()

print("Final word embeddings for:")
for word in word_to_ix:
    bag_of_words_df[word] = [i for i in np.array(model.embeddings.weight[word_to_ix[word]].detach())]

bag_of_words_df = bag_of_words_df.T.rename(columns=column_map)

bag_of_words_df

Final word embeddings for:


Unnamed: 0,Dim1,Dim2,Dim3,Dim4,Dim5,Dim6,Dim7,Dim8,Dim9,Dim10
computational,-0.667542,0.334508,-1.247375,-0.56266,-1.771379,-0.75433,0.635134,-0.241553,0.438324,1.810085
processes,-0.314025,-0.45944,0.58965,1.720078,-0.568893,-0.85016,0.460905,-2.700313,-0.936856,-0.90749
spells.,-0.170851,1.58251,-1.186557,1.303331,-0.510969,2.334965,-1.935604,1.225858,0.773485,0.037681
about,-1.207395,0.341389,0.702124,1.028797,-0.751891,0.031643,-1.573422,-1.539577,0.963399,-0.595537
The,1.99317,-1.123482,0.694861,-0.57721,-0.569877,0.536461,0.929991,0.488416,-0.283366,0.33654
other,0.372892,-0.392172,-0.04296,0.914083,0.149853,-1.874531,-0.283943,0.23977,0.677913,1.253537
processes.,0.169697,0.29853,0.468822,0.62585,0.580149,0.087816,-1.116288,0.620169,1.067976,-1.2258
rules,-0.870946,-0.930983,-1.657568,-0.188027,-0.204704,-0.159206,0.238914,0.168362,0.750724,-0.648405
evolution,-0.307102,-0.053045,1.261929,-1.199492,0.879612,-1.045332,-0.249687,0.67651,-1.344324,-0.341031
Computational,2.054966,0.92757,-0.867204,-1.501737,-0.6875,1.194267,0.036527,-0.648118,-0.634796,0.635731


In [24]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import umap
from gensim.models import KeyedVectors
import argparse

def reduce_dimensions(embeddings, method, dims):
    """
    Reduce the dimensions of embeddings to 2D or 3D.

    Parameters:
    embeddings (array): High-dimensional embeddings.
    method (str): Dimensionality reduction method ('pca', 'tsne', or 'umap').
    dims (int): Number of dimensions to reduce embeddings to. 
    """
    n_samples = embeddings.shape[0]
    if method == 'pca':
        reducer = PCA(n_components=dims)
    elif method == 'tsne':
        perplexity = min(30, max(n_samples // 3, 5))  # Adjust perplexity for small datasets
        reducer = TSNE(n_components=dims, perplexity=perplexity, random_state=0)
    elif method == 'umap':
        reducer = umap.UMAP(n_components=dims)
    else:
        raise ValueError("Invalid method: choose 'pca', 'tsne', or 'umap'")

    return reducer.fit_transform(embeddings)

def plot_embeddings(embeddings, words, n_clusters):
    """
    Plot of the 2D embeddings using Plotly.
    """
    # Perform clustering on the reduced embeddings
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    cluster_labels = kmeans.fit_predict(embeddings)

    # Create a DataFrame for the embeddings, words, and clusters
    df = pd.DataFrame(embeddings, columns=['x', 'y', 'z'])
    df['word'] = words
    df['cluster'] = cluster_labels

    # Create the scatter plot
    fig = px.scatter_3d(df, x='x', y='y', z='z', hover_name='word', color='cluster',
                     title="Word Embeddings", template='plotly', width=800, height=800)

    fig.update_layout(coloraxis_showscale=False)
    
    # Add text labels
    for i, row in df.iterrows():
        fig.add_trace(
            go.Scatter3d(
                x=[row['x']], y=[row['y']], z=[row['z']], 
                mode='text', text=[row['word']],
                textposition='middle center',
                showlegend=False
                )
            )

    # Update traces and layout for better readability
    fig.update_traces(marker=dict(size=6, opacity=0.7))
    fig.update_layout(hovermode='closest', showlegend=True)
    fig.update_layout(scene=dict(xaxis_showgrid=False, yaxis_showgrid=False, zaxis_showgrid=False))
    fig.show()

In [50]:
all_final_embeddings = np.array([np.array(model.embeddings.weight[word_to_ix[i]].detach()) for i in vocab])
embeddings_d = reduce_dimensions(all_final_embeddings, 'umap', 3)
plot_embeddings(embeddings_d, list(vocab), n_clusters=10)