## Step 1: Preprocess Text

We will preprocess the text by converting it to lowercase, removing punctuation, and splitting it into sentences.


In [3]:
import numpy as np
from collections import defaultdict
import re

# Step 1: Split Sentences
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    sentences = text.split(".")
    sentences = [sentence.split() for sentence in sentences if sentence]
    return sentences

# Example text
text = "Do all the good you can, for all the people you can, in all the ways you can, as long as you can."
sentences = preprocess_text(text)
print("Sentences:", sentences)

Sentences: [['do', 'all', 'the', 'good', 'you', 'can', 'for', 'all', 'the', 'people', 'you', 'can', 'in', 'all', 'the', 'ways', 'you', 'can', 'as', 'long', 'as', 'you', 'can']]


## Step 2: Build Vocabulary

We will create a vocabulary of unique words and map each word to an index.


In [4]:
# Step 2: Make Vocabulary
def build_vocabulary(sentences):
    vocabulary = set()
    for sentence in sentences:
        vocabulary.update(sentence)
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    return word_to_index, index_to_word

word_to_index, index_to_word = build_vocabulary(sentences)
print("Word to Index:", word_to_index)

Word to Index: {'all': 0, 'ways': 1, 'the': 2, 'can': 3, 'in': 4, 'as': 5, 'long': 6, 'people': 7, 'for': 8, 'do': 9, 'you': 10, 'good': 11}


## Step 3: One-Hot Encoding

Convert each word into a one-hot encoded vector.


In [5]:
# Step 3: One-Hot Encode
def one_hot_encode(word, word_to_index):
    vector = np.zeros(len(word_to_index))
    vector[word_to_index[word]] = 1
    return vector

# Example of one-hot encoding
example_word = "you"
print(f"One-hot encoding for '{example_word}':", one_hot_encode(example_word, word_to_index))

One-hot encoding for 'you': [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


## Step 4: Prepare Training Data

Generate training pairs of a target word and its surrounding context words.


In [6]:
# Step 4: Prepare Training Data
def generate_training_data(sentences, word_to_index, window_size=2):
    training_data = []
    for sentence in sentences:
        for i, target_word in enumerate(sentence):
            context = []
            for j in range(-window_size, window_size + 1):
                if j != 0 and 0 <= i + j < len(sentence):
                    context.append(sentence[i + j])
            training_data.append((context, target_word))
    return training_data

training_data = generate_training_data(sentences, word_to_index)
print("Training Data:", training_data)

Training Data: [(['all', 'the'], 'do'), (['do', 'the', 'good'], 'all'), (['do', 'all', 'good', 'you'], 'the'), (['all', 'the', 'you', 'can'], 'good'), (['the', 'good', 'can', 'for'], 'you'), (['good', 'you', 'for', 'all'], 'can'), (['you', 'can', 'all', 'the'], 'for'), (['can', 'for', 'the', 'people'], 'all'), (['for', 'all', 'people', 'you'], 'the'), (['all', 'the', 'you', 'can'], 'people'), (['the', 'people', 'can', 'in'], 'you'), (['people', 'you', 'in', 'all'], 'can'), (['you', 'can', 'all', 'the'], 'in'), (['can', 'in', 'the', 'ways'], 'all'), (['in', 'all', 'ways', 'you'], 'the'), (['all', 'the', 'you', 'can'], 'ways'), (['the', 'ways', 'can', 'as'], 'you'), (['ways', 'you', 'as', 'long'], 'can'), (['you', 'can', 'long', 'as'], 'as'), (['can', 'as', 'as', 'you'], 'long'), (['as', 'long', 'you', 'can'], 'as'), (['long', 'as', 'can'], 'you'), (['as', 'you'], 'can')]


## Step 5: Initialize Weights

Randomly initialize weights for the neural network.


In [7]:
# Step 5: Initialize Weights
def initialize_weights(vocab_size, embedding_dim):
    W1 = np.random.rand(vocab_size, embedding_dim)
    W2 = np.random.rand(embedding_dim, vocab_size)
    return W1, W2

vocab_size = len(word_to_index)
embedding_dim = 10
W1, W2 = initialize_weights(vocab_size, embedding_dim)
print("W1 Shape:", W1.shape)
print("W2 Shape:", W2.shape)

W1 Shape: (12, 10)
W2 Shape: (10, 12)


## Step 6: Forward Pass

Use the context words to predict the target word.


In [8]:
# Step 6: Forward Pass
def forward_pass(context_words, W1, W2, word_to_index):
    context_vectors = np.sum([one_hot_encode(word, word_to_index) for word in context_words], axis=0)
    hidden_layer = np.dot(context_vectors, W1)
    output_layer = np.dot(hidden_layer, W2)
    predictions = softmax(output_layer)
    return predictions, hidden_layer

def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / exp_x.sum(axis=0)

# Example forward pass
context_words = ["the", "you"]
predictions, hidden_layer = forward_pass(context_words, W1, W2, word_to_index)
print("Predictions:", predictions)

Predictions: [0.1064354  0.03189628 0.09234783 0.0145338  0.05194879 0.05719299
 0.0235437  0.31184857 0.06356431 0.04207062 0.16598507 0.03863265]


## Step 7: Calculate Loss

Compute the loss to measure how far the predictions are from the actual target.


In [9]:
# Step 7: Calculate Loss
def calculate_loss(predictions, target_word, word_to_index):
    target_vector = one_hot_encode(target_word, word_to_index)
    loss = -np.sum(target_vector * np.log(predictions))
    return loss

# Example loss calculation
target_word = "people"
loss = calculate_loss(predictions, target_word, word_to_index)
print("Loss:", loss)

Loss: 1.1652375598773608


## Step 8: Update Weights

Adjust the weights using backpropagation to minimize the loss.


In [10]:
# Step 8: Update Weights
def backpropagate(W1, W2, hidden_layer, context_words, predictions, target_word, word_to_index, learning_rate=0.01):
    target_vector = one_hot_encode(target_word, word_to_index)
    error = predictions - target_vector
    dW2 = np.outer(hidden_layer, error)
    dW1 = np.outer(np.sum([one_hot_encode(word, word_to_index) for word in context_words], axis=0), np.dot(W2, error))
    W1 -= learning_rate * dW1
    W2 -= learning_rate * dW2
    return W1, W2

# Example weight update
W1, W2 = backpropagate(W1, W2, hidden_layer, context_words, predictions, target_word, word_to_index)
print("Updated W1:", W1)
print("Updated W2:", W2)

Updated W1: [[8.66833928e-01 3.03421037e-01 7.68318448e-01 6.41010692e-01
  6.41322655e-01 3.57514393e-01 8.23384213e-02 6.97104547e-01
  7.87357177e-01 5.89386297e-01]
 [8.12133911e-01 8.68827777e-01 4.04661078e-01 2.43038845e-01
  7.36329484e-01 8.20479664e-01 9.20181798e-01 6.97825512e-01
  8.73882711e-01 1.90498343e-02]
 [1.59094675e-01 6.68858645e-01 4.70379115e-01 4.83024865e-01
  7.73750775e-01 5.55570416e-01 7.36411009e-01 7.19100007e-01
  1.68371542e-03 8.50767853e-01]
 [1.75309363e-02 5.48053815e-02 6.54673559e-01 3.80399396e-04
  7.74482454e-01 4.64245582e-01 8.41407525e-01 2.15037160e-01
  4.71566777e-01 9.73851496e-01]
 [3.59470800e-01 8.84070442e-01 7.05206425e-01 5.71526428e-01
  4.37769582e-01 7.65634160e-01 8.44967976e-02 1.60965537e-01
  4.57091873e-01 5.68071428e-01]
 [1.58690309e-01 7.14605948e-01 2.85556443e-01 7.98283872e-01
  4.56681119e-01 9.85806779e-01 8.31624537e-03 8.41517722e-01
  8.70994433e-01 9.30731618e-01]
 [2.87768865e-01 5.13340680e-01 1.62914511e-01

## Training the CBOW Model

We will train the CBOW model on the example text corpus for multiple epochs.


In [12]:
# Training the CBOW Model
for epoch in range(1000):
    total_loss = 0
    for context_words, target_word in training_data:
        predictions, hidden_layer = forward_pass(context_words, W1, W2, word_to_index)
        loss = calculate_loss(predictions, target_word, word_to_index)
        total_loss += loss
        W1, W2 = backpropagate(W1, W2, hidden_layer, context_words, predictions, target_word, word_to_index)
    if epoch % 100 == 0:
        print(f"Epoch {epoch+100}, Loss: {total_loss:.4f}")

Epoch 100, Loss: 8.1494
Epoch 200, Loss: 8.1489
Epoch 300, Loss: 8.1484
Epoch 400, Loss: 8.1478
Epoch 500, Loss: 8.1473
Epoch 600, Loss: 8.1468
Epoch 700, Loss: 8.1463
Epoch 800, Loss: 8.1459
Epoch 900, Loss: 8.1454
Epoch 1000, Loss: 8.1449


In [11]:
# Training the CBOW Model
for epoch in range(10000):
    total_loss = 0
    for context_words, target_word in training_data:
        predictions, hidden_layer = forward_pass(context_words, W1, W2, word_to_index)
        loss = calculate_loss(predictions, target_word, word_to_index)
        total_loss += loss
        W1, W2 = backpropagate(W1, W2, hidden_layer, context_words, predictions, target_word, word_to_index)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch+1000}, Loss: {total_loss:.4f}")

Epoch 1000, Loss: 67.8641
Epoch 2000, Loss: 8.5707
Epoch 3000, Loss: 8.3330
Epoch 4000, Loss: 8.2597
Epoch 5000, Loss: 8.2227
Epoch 6000, Loss: 8.1996
Epoch 7000, Loss: 8.1836
Epoch 8000, Loss: 8.1718
Epoch 9000, Loss: 8.1626
Epoch 10000, Loss: 8.1554
