自然语言处理

1. 自然语言的张量表示, one-hot,word-bag,
2. 词嵌入: skip-gram, bow


定风波·莫听穿林打叶声

三月七日，沙湖道中遇雨，雨具先去，同行皆狼狈，余独不觉。已而遂晴，故作此(词)。

莫听穿林打叶声，何妨吟啸且徐行。

竹杖芒鞋轻胜马，谁怕？一蓑烟雨任平生。

料峭春风吹酒醒，微冷，山头斜照却相迎。

回首向来萧瑟处，归去，也无风雨也无晴。


## 对于自然语言，如何用向量来表示？

### 第一步，对于语料库的文本拆分统计，得到一个词典，为每个字分配一个位置编号

In [1]:
import random
import numpy as np

np.random.seed(0)

txt = "定风波·莫听穿林打叶声 三月七日，沙湖道中遇雨，雨具先去，同行皆狼狈，余独不觉。已而遂晴，故作此(词)。莫听穿林打叶声，何妨吟啸且徐行。竹杖芒鞋轻胜马，谁怕？一蓑烟雨任平生。料峭春风吹酒醒，微冷，山头斜照却相迎。回首向来萧瑟处，归去，也无风雨也无晴。"
tokens = set([token for token in txt]) 

print(set(tokens))

vocabs = {}
for idx,token in enumerate(tokens):
    vocabs[token]=idx

vocabs_len = len(vocabs)
print(vocabs_len,vocabs)

{'打', '徐', '先', '觉', '雨', '同', '作', '且', '(', '谁', '波', '日', '蓑', '狈', '首', '料', '独', '故', '遇', '也', ' ', '啸', '三', '一', '斜', '却', '归', '穿', '照', '风', '平', '具', '鞋', '相', '定', '春', '胜', '杖', '月', '处', ')', '叶', '无', '晴', '芒', '沙', '回', '此', '吟', '·', '瑟', '酒', '醒', '余', '妨', '莫', '，', '遂', '怕', '？', '任', '湖', '竹', '林', '轻', '而', '听', '。', '生', '峭', '道', '不', '山', '向', '微', '行', '去', '烟', '迎', '皆', '头', '已', '七', '马', '来', '中', '何', '词', '冷', '吹', '狼', '萧', '声'}
93 {'打': 0, '谁': 1, '波': 2, '日': 3, '首': 4, '料': 5, ' ': 6, '啸': 7, '三': 8, '一': 9, '斜': 10, '穿': 11, '照': 12, '具': 13, '鞋': 14, '相': 15, '定': 16, '春': 17, '胜': 18, ')': 19, '叶': 20, '晴': 21, '·': 22, '瑟': 23, '余': 24, '莫': 25, '，': 26, '遂': 27, '怕': 28, '？': 29, '竹': 30, '而': 31, '生': 32, '道': 33, '山': 34, '向': 35, '微': 36, '去': 37, '烟': 38, '七': 39, '中': 40, '词': 41, '冷': 42, '吹': 43, '狼': 44, '萧': 45, '徐': 46, '先': 47, '觉': 48, '雨': 49, '同': 50, '作': 51, '且': 52, '(': 53, '蓑': 54, '狈': 55, '独': 56, '故': 57, '遇': 58, '也': 59, 

In [None]:
# pytorch 针对自然语言处理的一些封装
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

counter = Counter([token for token in txt])
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)


v1 = vocab(ordered_dict)
print(v1['山'])
print(v1.lookup_indices(['山','定']))

# torch.txt 0.12支持
unk_token = '<unk>'
default_index = -1
# v2 = vocab(OrderedDict([(token, 1) for token in txt]), specials=[unk_token])
# v2.set_default_index(default_index)
# print(v2['<unk>']) #prints 0
# print(v2['out of vocab']) #prints -1
# #make default index same as index of unk_token
# v2.set_default_index(v2[unk_token])
# v2['out of vocab'] is v2[unk_token] #prints True



### 第二步，词向量表达

In [2]:
# 1. one-hot 方式，一个词典大小维度的向量，例如上面的词典总计有93维，"晴"在词典中位置=39，"晴"可以表示为39位置是1，其他位置都是0的向量
# 缺点：稀疏矩阵，参与计算时很浪费

# 单个字，映射为一个向量
def convert_word_to_vector(word):
    word_idx = vocabs[word]
#     print(word_idx)
    vector = np.zeros(vocabs_len)
    vector[word_idx]=1
#     print(vector)
    return vector
    
v = convert_word_to_vector(word = "晴")
print(v)

# 输入是一句话，映射为多个向量构成的矩阵，最后会产出一个n*m 的矩阵，m为词典大小，n为输入句子的长度
input = "莫听穿林打叶声"
vector_li = []
for token in input:
#     print(token)
    v = convert_word_to_vector(token)
    vector_li.append(v)

out = np.array(vector_li)
print(out.shape)
print(out)


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
(7, 93)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [3]:
# 2. 词袋法(Bag of Words), 对于输入一个句子来说，产出的不是一个矩阵，而是一个向量，维度为词典大小，其他位置0,出现的字的位置为1。
# 除了将出现字的位置设置为1外，还可以设置为该字在整个句子中出现的次数，或者tf-idf值
# idf(逆文档频率) = log(语料库中文档总数/包含该词的文档数+1)
# tf(词频) = 某个词在文章中的出现次数/文章总词数
# tf-idf = tf * idf 
# 缺点：输词的位置信息丢了。
input = "莫听穿林打叶声"

vector = np.zeros(vocabs_len)
for token in input:
    word_idx = vocabs[token]
    vector[word_idx]=1

print(vector)
    

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [4]:
# 3. 词嵌入 Word embedding
# 设定一个词为一个128维的向量，先随机初始化这个向量，通过某种方式调整这个向量大小，经过训练后会行程一个 n*m, n为词典大小，m为每个词向量大小。
vector_size = 5
WordEmbedding = np.random.rand(vocabs_len,vector_size)
print(WordEmbedding.shape)

txt_input = "莫听穿林打叶声"
words_idx = [vocabs[token] for token in txt_input]
print(words_idx)
print(WordEmbedding[words_idx,:])

(93, 5)
[25, 79, 11, 77, 0, 20, 92]
[[0.60639321 0.0191932  0.30157482 0.66017354 0.29007761]
 [0.95898272 0.35536885 0.35670689 0.0163285  0.18523233]
 [0.16130952 0.65310833 0.2532916  0.46631077 0.24442559]
 [0.90884372 0.81552382 0.15941446 0.62889844 0.39843426]
 [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.67781654 0.27000797 0.73519402 0.96218855 0.24875314]
 [0.2724369  0.3790569  0.37429618 0.74878826 0.23780724]]


In [5]:
# 基于pytorch的嵌入
import torch
import torch.nn as nn
embedding = nn.Embedding(vocabs_len,vector_size)
words_idx_t = torch.LongTensor([words_idx])
print(embedding(words_idx_t))

tensor([[[-0.0263,  0.4056, -0.5449,  0.3652, -1.1004],
         [-0.4647,  1.6371,  1.5121,  0.1880,  0.6070],
         [-0.5138, -0.9206, -1.1940,  0.2056,  0.8584],
         [-2.0510, -0.6589, -0.8776,  0.5585, -0.0444],
         [-0.2092, -0.7802,  1.3900, -0.9586,  0.2622],
         [ 1.5567, -1.1102, -1.2972,  0.0290, -0.0998],
         [-0.1247,  0.2919, -1.3336,  0.3391, -0.9337]]],
       grad_fn=<EmbeddingBackward>)


### 如何调整Word embedding矩阵中每个词向量的值

自然语言模型中，哪些词是经常一起出现的，哪些不是？ 给定左右出现的词后，它们中间的词会是哪些？CBOW(Continuous Bag-of-Words Model)；或者给定一个词或者一个字，它左右经常出现的词是哪些？Skip-Gram Continuous skip-gram Model.

* CBOW: 优点:训练稳定，容易收敛；缺点:生僻词效果不佳
* Skip-gram: 优点：对生僻字不敏感， 缺点：不太稳定？

接下来使用Skip-gram尝试训练词嵌入
1. 准备训练数据
设置一个滑动窗口，一般为5，演示方便设置为3，从语料库开始位置，选取滑动窗口大小的字，中间的为输入1，两边的为输入2
2. 搭建模型
3. 训练词嵌入
4. 词嵌入的一些特点

In [6]:
#思路1, 输入是中间词，输出是上下文词，过模型，softmax后预测词表中每个词的概率值，缺点：由于实际中词表非长大，计算量大，浪费资源
slide_windows_cnt = 5
for i in range(len(txt)):
    windows_token = txt[i:i+slide_windows_cnt]
    middle_idx = (slide_windows_cnt-1)//2
    middle = windows_token[middle_idx]
    for j in range(slide_windows_cnt):
        if j != middle_idx:
            print("in_out:",windows_token[middle_idx],windows_token[j])
#     print(middle,windows_token)
    if i>5:
        break 

in_out: 波 定
in_out: 波 风
in_out: 波 ·
in_out: 波 莫
in_out: · 风
in_out: · 波
in_out: · 莫
in_out: · 听
in_out: 莫 波
in_out: 莫 ·
in_out: 莫 听
in_out: 莫 穿
in_out: 听 ·
in_out: 听 莫
in_out: 听 穿
in_out: 听 林
in_out: 穿 莫
in_out: 穿 听
in_out: 穿 林
in_out: 穿 打
in_out: 林 听
in_out: 林 穿
in_out: 林 打
in_out: 林 叶
in_out: 打 穿
in_out: 打 林
in_out: 打 叶
in_out: 打 声


In [7]:
# 思路2, 模型接受2个输入，中间词+真实的上下文词(或从词典里随机抽些词)，
# 输出：中间词+真实上下文词输出1，中间词+负采样输出0，将多分类问题转化为一个二分类问题
def Negative_sampling(tokens):
    return random.choices(list(tokens),k=slide_windows_cnt-1) #random.randint(0,tokens)

def load_dataset(txt,tokens):
    dataset = []
    slide_windows_cnt = 5
    for i in range(len(txt)):
        windows_token = txt[i:i+slide_windows_cnt]
        middle_idx = (slide_windows_cnt-1)//2 
        if middle_idx>len(windows_token)-1:
            break
        middle = windows_token[middle_idx]
        for j in range(len(windows_token)):
            if j != middle_idx:
#                 print(j,middle_idx)
                dataset.append([1,windows_token[middle_idx],windows_token[j]])
#                 print("\t".join(["1",windows_token[middle_idx],windows_token[j]]))
        for token in Negative_sampling(tokens):
            dataset.append([0,windows_token[middle_idx],token])
#             print("\t".join(["0",windows_token[middle_idx],token])
    return dataset 

load_dataset(txt,tokens)

# 实际生成 label,middel,context时，需要考虑词在语料库中出现的次数，对于出现次数较多的词，需要减少他们在生成数据中的出现次数。
print(Negative_sampling(tokens))



# 使用pytorch, Dataset,DataLoader的方式实现训练数据的加载
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class LanguageDataset(Dataset):
    def __init__(self):
      self.dataset = load_dataset(txt,tokens)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx): 
        row = self.dataset[idx] 
        label = row[0]
        in_idx = vocabs[row[1]]
        cxt_idx = vocabs[row[2]]
#         print(label,in_idx,cxt_idx)
        
        return np.array(label,dtype=np.float32),in_idx,cxt_idx

data = LanguageDataset()
print(data[0])

train_dataloader = DataLoader(data, batch_size=10, shuffle=True)
label, in_idx,cxt_idx = next(iter(train_dataloader))
print(f"label batch shape: {label.size()}",label)
print(f"in_idx batch shape: {in_idx.size()}",in_idx)
print(f"cxt_idx batch shape: {cxt_idx.size()}",cxt_idx)

['无', '芒', '此', '一']
(array(1., dtype=float32), 2, 16)
label batch shape: torch.Size([10]) tensor([0., 0., 0., 0., 1., 0., 0., 1., 0., 1.])
in_idx batch shape: torch.Size([10]) tensor([39, 62, 32, 32, 36, 26, 88, 12, 67, 77])
cxt_idx batch shape: torch.Size([10]) tensor([69, 59, 49, 18, 26, 15,  9, 15, 73, 11])


In [8]:
# Skip-gram model
class SkipgramModel(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(SkipgramModel, self).__init__()
        self.emb_in = nn.Embedding(vocab_size, embedding_size)
        self.emb_cxt = nn.Embedding(vocab_size, embedding_size) 

    def forward(self, inToken,cxtToken): 
        in_tensor = self.emb_in(inToken)
#         print(in_tensor.shape,in_tensor)
        cxt_tensor = self.emb_cxt(cxtToken)
#         print(cxt_tensor.shape,cxt_tensor) 
#         tmp = in_tensor * cxt_tensor 
        out = torch.mul(in_tensor, cxt_tensor)
        print("out:",out)
        out = torch.sum(out, dim=-1)
        out = torch.sigmoid(out)
        print(out.shape,out)
        return out

model = SkipgramModel(vocabs_len,vector_size)

#单个例子test
model(torch.tensor([[1]]),torch.tensor([[2]]))

#多个测试
label, in_idx,cxt_idx = next(iter(train_dataloader))
print(label, in_idx,cxt_idx)
pred = model(in_idx,cxt_idx)
print(pred)

out: tensor([[[ 0.1401,  0.0610,  1.0358, -0.3441, -1.0490]]],
       grad_fn=<MulBackward0>)
torch.Size([1, 1]) tensor([[0.4610]], grad_fn=<SigmoidBackward>)
tensor([0., 0., 1., 0., 0., 1., 1., 0., 0., 1.]) tensor([ 0, 72, 83,  8, 80, 79, 80, 26, 59, 45]) tensor([89, 74, 80, 31, 78, 77, 21, 43, 92, 89])
out: tensor([[-0.0751,  1.0648,  0.0086,  0.9577,  0.4810],
        [-0.6918, -0.0039,  1.9472, -0.7381,  2.2029],
        [-1.5257, -0.0879, -0.3471,  0.9209,  0.2662],
        [ 0.8744,  0.1277, -1.2394,  0.2109, -1.4333],
        [-1.5130,  0.0870,  0.9916,  0.9618, -0.1716],
        [ 0.0224, -1.7983,  0.8457, -0.5641, -0.3229],
        [-0.6765, -0.5366,  0.7538,  0.6974, -0.2330],
        [-0.2077, -1.2751,  0.8477,  0.0059, -0.1134],
        [-2.5775,  0.6961, -0.6527,  1.1731,  0.2835],
        [-0.0856, -0.5757, -0.1493, -0.2413, -1.8482]], grad_fn=<MulBackward0>)
torch.Size([10]) tensor([0.9196, 0.9380, 0.3157, 0.1885, 0.5880, 0.1398, 0.5012, 0.3224, 0.2540,
        0.0522], 

In [9]:
# 训练, 纯演示，数据量比较小，很难收敛
learning_rate = 0.0001
criterion = nn.BCELoss() #定义损失函数
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) #定义最优化算法

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (label, in_idx,cxt_idx) in enumerate(dataloader):    
        pred = model(in_idx,cxt_idx)
        loss = criterion(pred,label)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 5 == 0:
            loss, current = loss.item(), batch * len(label)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

train(train_dataloader, model, criterion, optimizer)

out: tensor([[ 5.1338e-01, -3.2567e-02,  2.2558e-01, -1.1504e-01, -1.3168e+00],
        [ 3.2692e-03, -3.6338e+00,  1.7465e-01, -7.1994e-01,  1.2621e+00],
        [ 6.7210e-01, -3.1703e-01, -2.0234e+00,  3.5504e-01,  1.4407e+00],
        [ 7.7532e-01,  2.0064e-01, -1.1363e+00, -3.3789e-01,  6.0580e-01],
        [ 4.5659e-01, -1.0578e+00, -3.6325e-01,  1.5888e-01,  1.2875e-01],
        [ 2.9871e-03,  5.5018e-02, -7.7735e-01,  3.9210e-01,  1.4635e+00],
        [-3.1587e-01, -3.9140e-01,  7.9908e-01, -1.5911e-01,  1.1052e-01],
        [-4.6835e-02, -3.5396e-03, -8.7775e-01, -3.0353e-02, -1.4621e-01],
        [ 6.8691e-01, -1.6944e+00,  2.6739e-01, -6.6844e-02,  1.1297e+00],
        [ 1.2454e-01,  4.4316e-01, -3.2308e-01, -3.3373e-01, -9.2537e-01]],
       grad_fn=<MulBackward0>)
torch.Size([10]) tensor([0.3262, 0.0515, 0.5318, 0.5269, 0.3370, 0.7570, 0.5108, 0.2489, 0.5800,
        0.2661], grad_fn=<SigmoidBackward>)
loss: 0.720184  [    0/  981]
out: tensor([[ 1.0991e-01, -3.1363e-02, -4

In [10]:
# pytorch 针对自然语言处理的一些封装
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

counter = Counter([token for token in txt])
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)


v1 = vocab(ordered_dict)
print(v1['山'])
print(v1.lookup_indices(['山','定']))

# torch.txt 0.12支持
unk_token = '<unk>'
default_index = -1
# v2 = vocab(OrderedDict([(token, 1) for token in txt]), specials=[unk_token])
# v2.set_default_index(default_index)
# print(v2['<unk>']) #prints 0
# print(v2['out of vocab']) #prints -1
# #make default index same as index of unk_token
# v2.set_default_index(v2[unk_token])
# v2['out of vocab'] is v2[unk_token] #prints True


78
[78, 16]


In [None]:
# https://wmathor.com/index.php/archives/1569/
# n > 8.33logN, N词表大小