# 构造词向量

## 介绍

这里我们借鉴了 word embedding 的思路，将其改造为自己的 char embedding：把每个字母看成一个向量，而一个单词由多个字母组成，也就是由多个向量组成的一个矩阵。用这个矩阵来表示单词之后，我们就可以训练出自己的字母嵌入模型，从而观察字母之间的关联程度。

In [6]:
# 从https://raw.githubusercontent.com/dwyl/english-words/master/words.txt这个网址下载英文单词表
# 然后训练一个我们的字母嵌入模型
# 用于将字母转换为向量
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [2]:
# 下载单词表到本地
# !wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
# 已经下载好了，不需要再下载了，文件名为words.txt

In [4]:
# Read the raw word list, one entry per line. The context manager closes the
# file even if the read fails, and the explicit encoding keeps the result
# independent of the platform's default encoding.
with open('words.txt', 'r', encoding='utf-8') as open_file:
    words = open_file.read().split('\n')

In [6]:
# Quick sanity check of the loaded word list: container type, number of
# entries, and a peek at the first ten entries.
for label, value in (
    ('type of words: ', type(words)),
    ('Number of words: ', len(words)),
    ('First 10 words: ', words[:10]),
):
    print(label, value)

type of words:  <class 'list'>
Number of words:  466551
First 10 words:  ['2', '1080', '&c', '10-point', '10th', '11-point', '12-point', '16-point', '18-point', '1st']


## 数据预处理

In [12]:
# Data preprocessing.
# Lower-case every word.
words = [word.lower() for word in words]
# Keep only words made up purely of the ASCII letters a-z (drops entries with
# spaces, punctuation or digits). `str.isalpha()` on its own also accepts
# non-ASCII letters such as 'é', which a 26-letter lookup table cannot encode,
# so `isascii()` is checked as well.
words = [word for word in words if word.isascii() and word.isalpha()]

In [15]:
# Same sanity check after preprocessing: container type, number of entries,
# and the first ten entries.
for label, value in (
    ('type of words: ', type(words)),
    ('Number of words: ', len(words)),
    ('First 10 words: ', words[:10]),
):
    print(label, value)

type of words:  <class 'list'>
Number of words:  416296
First 10 words:  ['a', 'aa', 'aaa', 'aaaa', 'aaaaaa', 'aaal', 'aaas', 'aaberg', 'aachen', 'aae']


In [16]:
# # 保存预处理后的单词表
# with open('words_clean.txt', 'w') as f:
#     for word in words:
#         f.write(word + '\n')

## 字母向量（词向量）

在自然语言处理（NLP）中，当词表包含大量英文单词时，用 one-hot 向量来表示每个单词是不现实的，因此通常采用 embedding 的方法，把数据投影到指定维度的空间中，这种方法称为词嵌入模型。借鉴词嵌入的思路，我们提出了字母嵌入：把 26 个字母映射到同一个维度的空间中，使每个字母都对应一个向量，并用向量之间的夹角来衡量字母之间的相关程度。通过统计一个单词中两两字母的相关程度，我们就可以得出该单词的怪异程度。

In [1]:
# Alphabet handled by the character-embedding model, plus the two lookup
# tables that translate between a letter and its integer index
# ('a' -> 0 ... 'z' -> 25 and back).
chars = "abcdefghijklmnopqrstuvwxyz"
char2ind = dict(zip(chars, range(len(chars))))
ind2char = dict(enumerate(chars))

In [3]:
# Load the preprocessed word list (one word per line).
# The context manager guarantees the file is closed. Empty lines are dropped:
# a file that ends with a newline would otherwise contribute a trailing ''
# entry, i.e. a phantom zero-length "word".
with open('words_clean.txt', 'r', encoding='utf-8') as open_file:
    words_clean = [line for line in open_file.read().split('\n') if line]

In [8]:
# Length of the longest word in the cleaned list; printed for reference.
max_word_len = max(map(len, words_clean))
print('max_word_len: ', max_word_len)

max_word_len:  45


In [7]:
# Encode every word as one row of character indices, right-padded with 0 up
# to max_word_len. The rows are assembled as plain Python lists and converted
# to a tensor in a single call; writing each element into the tensor one at a
# time is far slower on a word list of this size.
# NOTE: the padding value 0 is also the index of 'a' in char2ind, so padding
# cannot be told apart from the letter 'a' inside X_char.
padded_rows = [
    [char2ind[char] for char in word] + [0] * (max_word_len - len(word))
    for word in words_clean
]
# The reshape keeps the (num_words, max_word_len) shape even for an empty list.
X_char = torch.tensor(padded_rows, dtype=torch.long).reshape(
    len(words_clean), max_word_len
)


KeyboardInterrupt: 

In [None]:
# Print the character tensor of ten randomly chosen words as a spot check.
for _ in range(10):
    idx = np.random.randint(0, len(words_clean))
    print("word: ", words_clean[idx])
    print("char tensor: ", X_char[idx])

# Draw a small random sample of words (indices are drawn independently, so
# repeats are possible) to be used for training later on.
sample_size = 20
idx = np.random.randint(0, len(words_clean), sample_size)
words_sample = [words_clean[i] for i in idx]
X_char_sample = X_char[idx]
for sample_word, sample_row in zip(words_sample, X_char_sample):
    print("word: ", sample_word)
    print("char tensor: ", sample_row)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Model hyperparameters: dimension of each character embedding and number of
# LSTM hidden units. Both values are arbitrary choices.
char_embedding_size, hidden_size = 3, 50

In [None]:
# Model layers.
# One embedding vector per letter in `chars`.
embedding_layer = nn.Embedding(num_embeddings=len(chars), embedding_dim=char_embedding_size)
# batch_first=True: the character tensor is laid out as (word, position), so
# its embedding is (batch, seq, feature). With the default batch_first=False
# the LSTM would treat the word axis as the time axis and run one enormous
# sequence across all words instead of one sequence per word.
bidirectional_layer = nn.LSTM(
    input_size=char_embedding_size,
    hidden_size=hidden_size,
    bidirectional=True,
    batch_first=True,
)

In [None]:
# Look up the embedding vector of every character index in X_char, then run
# the embedded characters through the bidirectional LSTM.
embeddings = embedding_layer(X_char)
lstm_result = bidirectional_layer(embeddings)
# The LSTM returns (output, (final hidden state, final cell state)).
output = lstm_result[0]
hidden, cell = lstm_result[1]

In [None]:
# Functional API, used for the cosine similarity below
import torch.nn.functional as F

# The two letters to compare and their indices in the alphabet table
char1, char2 = "h", "w"
ind1, ind2 = (char2ind[letter] for letter in (char1, char2))

# Look up the embedding vector of each letter
emb1, emb2 = (embedding_layer(torch.tensor(index)) for index in (ind1, ind2))

In [None]:
# Compute the cosine similarity between the two letter embeddings along their
# last dimension and print it.
# NOTE(review): no training step for embedding_layer is visible before this
# point, so the value presumably reflects the layer's initial weights — confirm.
similarity = F.cosine_similarity(emb1, emb2, dim=-1)
print(similarity)

```python

# 导入PyTorch库
import torch
import torch.nn as nn
import torch.optim as optim

# Word list and the word -> index mapping derived from it
word_list = ["apple", "banana", "strawberry"]
word_to_idx = {word: position for position, word in enumerate(word_list)}

# Embedding dimension, and a fixed seed so the random initialisation is reproducible
embedding_size = 3
torch.manual_seed(0)

# Embedding layer with one randomly initialised row per word
embedding = nn.Embedding(num_embeddings=len(word_list), embedding_dim=embedding_size)

# Show the embedding matrix before any training
print("Initial embedding matrix:", embedding.weight, sep="\n")

# A minimal network: an embedding lookup followed by a linear layer
class Model(nn.Module):
    """Map each input index to one scalar via an embedding and a linear layer.

    Args:
        embedding: An existing ``nn.Embedding``. It is stored as-is (not
            copied), so training the model updates that same layer object.
    """

    def __init__(self, embedding):
        super().__init__()
        self.embedding = embedding
        # Size the linear layer from the embedding that was passed in rather
        # than from a module-level constant, so the class works with an
        # embedding of any dimension.
        self.linear = nn.Linear(embedding.embedding_dim, 1)

    def forward(self, x):
        """Return one scalar per index in ``x``.

        Args:
            x: Integer tensor of indices into the embedding table.

        Returns:
            Tensor with the shape of ``x`` plus a trailing dimension of size 1.
        """
        embedded = self.embedding(x)
        return self.linear(embedded)

# Build the model and move it to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model(embedding).to(device)

# Loss function and optimiser
criterion = nn.MSELoss()  # mean squared error
optimizer = optim.SGD(model.parameters(), lr=0.01)  # stochastic gradient descent

# Toy training data and labels (placeholder values; define real ones for your task)
inputs = torch.tensor([0, 1, 2]).to(device)  # indices of the three words
labels = torch.tensor([0.5, -0.5, 0.8]).to(device)  # one scalar target per word

# Train the model (only 10 epochs here; adjust as needed)
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(inputs)  # forward pass
    # squeeze(-1) removes only the trailing size-1 output dimension; a bare
    # squeeze() would also collapse the batch dimension for a batch of one.
    loss = criterion(outputs.squeeze(-1), labels)
    loss.backward()  # back-propagate to compute gradients
    optimizer.step()  # update the parameters

    print(f"Epoch {epoch+1}, loss: {loss.item():.4f}")  # per-epoch loss

# Show the embedding matrix after training (demo only; it may not have converged)
print("Final embedding matrix:")
print(embedding.weight)

```