In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter

# 导入所需的库：PyTorch核心、神经网络模块、优化器、NumPy和Counter

text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules called a program.
People create programs to direct processes."""

# 定义一个简单的文本作为训练数据

words = text.split()
# 将文本分割成单词列表

vocab = set(words)
# 创建词汇表（去重）

word_to_ix = {word: i for i, word in enumerate(vocab)}
# 创建单词到索引的映射字典

ix_to_word = [word for word, i in word_to_ix.items()]
# 创建索引到单词的映射字典

In [15]:
print([element for idx, element in enumerate(vocab) if idx < 10])
print([{element: idx} for idx, element in enumerate(word_to_ix) if idx <10])
print(ix_to_word[:10])

['things', 'called', 'by', 'direct', 'As', 'computational', 'The', 'inhabit', 'that', 'beings']
[{'things': 0}, {'called': 1}, {'by': 2}, {'direct': 3}, {'As': 4}, {'computational': 5}, {'The': 6}, {'inhabit': 7}, {'that': 8}, {'beings': 9}]
['things', 'called', 'by', 'direct', 'As', 'computational', 'The', 'inhabit', 'that', 'beings']


In [16]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 创建词嵌入层
        self.linear = nn.Linear(embedding_dim, vocab_size)
        # 创建线性层，用于将嵌入映射回词汇表大小

    def forward(self, inputs):
        embeds = self.embeddings(inputs).mean(dim=0)
        # 获取输入单词的嵌入并计算平均值
        out = self.linear(embeds)
        # 通过线性层传递嵌入
        log_probs = nn.functional.log_softmax(out, dim=0)
        # 应用log softmax函数获取对数概率
        return log_probs

In [17]:
def create_cbow_dataset(words, context_size=2):
    data = []
    for i in range(context_size, len(words) - context_size):
        context = [words[i-2], words[i-1], words[i+1], words[i+2]]
        # 获取目标词的上下文（前后各两个词）
        target = words[i]
        # 获取目标词
        data.append((context, target))
    return data
    # 返回(上下文, 目标词)对的列表

In [18]:
def train_cbow(model, data, word_to_ix, learning_rate=0.1, num_epochs=100):
    loss_function = nn.NLLLoss()
    # 定义负对数似然损失函数
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    # 定义随机梯度下降优化器

    for epoch in range(num_epochs):
        total_loss = 0
        for context, target in data:
            context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
            # 将上下文单词转换为索引张量
            model.zero_grad()
            # 清除之前的梯度
            log_probs = model(context_idxs)
            # 前向传播
            loss = loss_function(log_probs.unsqueeze(0), torch.tensor([word_to_ix[target]]))
            # 计算损失
            loss.backward()
            # 反向传播
            optimizer.step()
            # 更新模型参数
            total_loss += loss.item()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {total_loss}')
            # 每10个epoch打印一次损失

In [19]:
def main():
    CONTEXT_SIZE = 2
    EMBEDDING_DIM = 10
    
    vocab_size = len(word_to_ix)
    
    cbow_dataset = create_cbow_dataset(words, CONTEXT_SIZE)
    # 创建CBOW数据集
    model = CBOW(vocab_size, EMBEDDING_DIM)
    # 初始化CBOW模型
    
    train_cbow(model, cbow_dataset, word_to_ix)
    # 训练模型
    
    context = ['We', 'are', 'to', 'study']
    context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
    # 准备测试上下文
    
    with torch.no_grad():
        log_probs = model(context_idxs)
    # 使用训练好的模型进行预测
    
    predicted_word_idx = torch.argmax(log_probs).item()
    predicted_word = ix_to_word[predicted_word_idx]
    # 获取预测的单词
    
    print(f"Context: {context}")
    print(f"Predicted word: {predicted_word}")
    # 打印结果

if __name__ == "__main__":
    main()
    # 如果直接运行此脚本，执行main函数

Epoch 0, Loss: 174.11824560165405
Epoch 10, Loss: 92.34939754009247
Epoch 20, Loss: 45.422476068139076
Epoch 30, Loss: 22.1721078902483
Epoch 40, Loss: 12.155173022300005
Epoch 50, Loss: 7.560733236372471
Epoch 60, Loss: 5.208675600588322
Epoch 70, Loss: 3.8647050466388464
Epoch 80, Loss: 3.0234724394977093
Epoch 90, Loss: 2.4582118475809693
Context: ['We', 'are', 'to', 'study']
Predicted word: about
