# 词向量

* 学习词向量的概念
* 使用 Skip-gram 模型
* 学习 Dataset 和 DataLoader 的用法
* 学习 PyTorch 中的 nn.Module 和 nn.Embedding

在计算机中，用向量来表示一个词。

* 离散表示 One-hot
* 离散表示 Bag of Words
 > 文档的向量表示：将各个词的向量表示加和
 > 词权重，TF-IDF ， Binary
* 离散表示 Bi-gram 和 N-gram
词编码需要保证词的相似性



In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Detect once whether a CUDA device can be used; later cells read this flag.
IS_CUDA = torch.cuda.is_available()

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and, when present, PyTorch CUDA) with the same fixed value.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if IS_CUDA:
    torch.cuda.manual_seed(1)


In [9]:
# Hyperparameters
C = 3                   # context window radius: C words on each side of the center word
K = 100                 # negative samples drawn per positive (context) word
Epochs = 2              # number of passes over the dataloader in the training loop
max_vocab_size = 30000  # vocabulary size, including the '<unk>' entry
batch_size = 128        # samples per batch handed to the DataLoader
learning_rate = 0.2     # step size given to the SGD optimizer
embedding_size = 100    # dimensionality of each word vector

def word_tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [10]:
train_file = r'E:\Edgar\网课学习\5. 2019最好Pytorch视频教程-七月在线\第二课资料\text8\text8\text8.train.txt'
# Decode explicitly as UTF-8 so that reading the corpus does not depend on the
# platform's default locale encoding.
with open(train_file, 'r', encoding='utf-8') as fin:
    text = fin.read()

# Tokenise with the notebook's own tokenizer so there is a single definition
# of what counts as a word.
words = word_tokenize(text)

# Keep the (max_vocab_size - 1) most frequent words together with their counts;
# the remaining slot is reserved for the '<unk>' entry below.
vocab = dict(Counter(words).most_common(max_vocab_size - 1))
# All occurrences of words outside the kept vocabulary are counted as '<unk>'.
vocab['<unk>'] = len(words) - sum(vocab.values())

# Index <-> word lookup tables; indices follow the insertion order of `vocab`.
idx_to_word = list(vocab)
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}
print(list(word_to_idx.items())[:100])


[('the', 0), ('of', 1), ('and', 2), ('one', 3), ('in', 4), ('a', 5), ('to', 6), ('zero', 7), ('nine', 8), ('two', 9), ('is', 10), ('as', 11), ('eight', 12), ('for', 13), ('s', 14), ('five', 15), ('three', 16), ('was', 17), ('by', 18), ('that', 19), ('four', 20), ('six', 21), ('seven', 22), ('with', 23), ('on', 24), ('are', 25), ('it', 26), ('from', 27), ('or', 28), ('his', 29), ('an', 30), ('be', 31), ('this', 32), ('he', 33), ('at', 34), ('which', 35), ('not', 36), ('also', 37), ('have', 38), ('were', 39), ('has', 40), ('but', 41), ('other', 42), ('their', 43), ('its', 44), ('first', 45), ('they', 46), ('had', 47), ('some', 48), ('more', 49), ('all', 50), ('can', 51), ('most', 52), ('been', 53), ('such', 54), ('who', 55), ('many', 56), ('new', 57), ('there', 58), ('used', 59), ('after', 60), ('american', 61), ('when', 62), ('time', 63), ('into', 64), ('these', 65), ('only', 66), ('see', 67), ('may', 68), ('than', 69), ('i', 70), ('world', 71), ('b', 72), ('d', 73), ('would', 74), ('no

In [11]:
# Occurrence count of every vocabulary entry, in vocabulary order.
word_counts = np.array(list(vocab.values()), dtype=np.float32)
# Relative frequency of each entry raised to the 3/4 power. The powered values
# are used as they are (they are not renormalised to sum to one).
word_freqs = np.power(word_counts / word_counts.sum(), 0.75)
# Number of distinct entries in the vocabulary.
vocab_size = len(idx_to_word)
print('vocab size:', vocab_size)

vocab size: 30000


## 实现Dataloader

1. 把单词（word）编码成数字索引
2. 保存词表（vocabulary）、单词计数（word counts）和词频（word frequencies）
3. 每次取样返回一个中心词、窗口内的周围词（正例）以及按词频采样得到的负例

In [2]:
# 定义一个 dataset

class WordEmbeddingDataset(torch.utils.data.Dataset):
    """Skip-gram training samples built from a tokenised corpus.

    Each item is a triple ``(center_word, pos_words, neg_words)`` of index
    tensors: the word at the requested position, the words inside the context
    window around it, and negative samples drawn according to ``word_freqs``.

    Args:
        words: Sequence of corpus tokens, in corpus order.
        word_to_idx: Mapping from token to integer index. It must contain the
            key ``'<unk>'``, which is used for tokens missing from the mapping.
        idx_to_word: Sequence mapping an index back to its token (stored only).
        word_freqs: Per-index sampling weights for negative sampling.
        word_counts: Per-index occurrence counts (stored only).
        window_size: Number of context words taken on each side of the center
            word. When ``None`` the module-level constant ``C`` is used.
        num_neg_samples: Negative samples drawn per context word. When ``None``
            the module-level constant ``K`` is used.
    """

    def __init__(self, words, word_to_idx, idx_to_word, word_freqs, word_counts,
                 window_size=None, num_neg_samples=None):
        super(WordEmbeddingDataset, self).__init__()
        # Look the fallback index up once instead of once per token.
        unk_idx = word_to_idx['<unk>']
        # `word_to_idx` is a mapping, so out-of-vocabulary tokens are handled
        # with `.get`, not by calling the mapping.
        self.text_encoded = torch.LongTensor(
            [word_to_idx.get(word, unk_idx) for word in words])
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
        self.window_size = window_size
        self.num_neg_samples = num_neg_samples

    def __len__(self):
        """Return the number of tokens in the encoded corpus."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(center_word, pos_words, neg_words)`` for position ``idx``.

        ``pos_words`` holds ``2 * window`` indices and ``neg_words`` holds
        ``num_neg * 2 * window`` indices.
        """
        # Fall back to the notebook-level constants at call time, as the
        # original implementation did.
        window = C if self.window_size is None else self.window_size
        num_neg = K if self.num_neg_samples is None else self.num_neg_samples

        center_word = self.text_encoded[idx]
        # Positions of the `window` words before and after the center word.
        pos_indices = (list(range(idx - window, idx))
                       + list(range(idx + 1, idx + window + 1)))
        # Wrap around the corpus boundaries so every item has a full window.
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        # Draw negatives with replacement, weighted by `word_freqs`.
        neg_words = torch.multinomial(
            self.word_freqs, num_neg * pos_words.shape[0], True)
        return center_word, pos_words, neg_words

NameError: name 'torch' is not defined

In [None]:
# Wrap the encoded corpus in the skip-gram dataset defined in this notebook.
dataset = WordEmbeddingDataset(
    words=words,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    word_freqs=word_freqs,
    word_counts=word_counts,
)
# Shuffled mini-batches, loaded by 4 worker processes.
# NOTE(review): worker processes may not work when the dataset class is
# defined inside a notebook on Windows - confirm, or use num_workers=0 there.
dataloader = tud.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)


In [None]:
# 定义模型

class EmbeddingModel(nn.Module):
    """Skip-gram model trained with the negative-sampling objective.

    Two embedding tables are kept: ``in_embed`` for center (input) words and
    ``out_embed`` for context and negative (output) words.

    Args:
        vocab_size: Number of rows in each embedding table.
        embed_size: Dimensionality of each embedding vector.
    """

    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        """Return the per-sample negative-sampling loss.

        Args:
            input_labels: Center word indices, shape ``[batch_size]``.
            pos_labels: Context word indices, shape ``[batch_size, n_pos]``.
            neg_labels: Negative word indices, shape ``[batch_size, n_neg]``.

        Returns:
            Tensor of shape ``[batch_size]`` holding, for each sample,
            ``-(sum(log sigmoid(u_pos . v)) + sum(log sigmoid(-u_neg . v)))``.
        """
        input_embedding = self.in_embed(input_labels)  # [batch_size, embed_size]
        # Context and negative words use the output table; looking them up in
        # `in_embed` would leave `out_embed` unused and untrained.
        pos_embedding = self.out_embed(pos_labels)     # [batch_size, n_pos, embed_size]
        neg_embedding = self.out_embed(neg_labels)     # [batch_size, n_neg, embed_size]

        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]
        # Batched dot products between the center vector and every other vector.
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)   # [batch_size, n_pos]
        # Negated center vector gives -u_neg . v for the negative term.
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)  # [batch_size, n_neg]

        # Negative sampling scores every pair independently with a sigmoid;
        # a softmax across the window would couple the pairs.
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)
        loss = log_pos + log_neg
        return -loss

    def input_embeddings(self):
        """Return the input embedding table as a NumPy array on the CPU."""
        return self.in_embed.weight.detach().cpu().numpy()


In [None]:
# Instantiate the embedding model with the vocabulary size and vector width
# chosen in the earlier cells.
model = EmbeddingModel(vocab_size=vocab_size, embed_size=embedding_size)
# Bare expression as the last statement of the cell.
model

In [None]:
# Train on the GPU when one is available; the model and every batch must live
# on the same device.
device = torch.device('cuda' if IS_CUDA else 'cpu')
model = model.to(device)

# Create the optimizer after the model has been moved to its device.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for e in range(Epochs):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()
        # The model returns one loss value per sample; average over the batch.
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        # Report progress every 100 iterations.
        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())