本教程实现原始的skip-gram模型

In [1]:
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
import torch.optim as optim
import random
import numpy as np

In [2]:
nltk.__version__

'3.2.4'

In [3]:
torch.__version__

'0.4.1.post2'

In [4]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
device

device(type='cuda')

In [5]:
def get_batch(batch_size, train_data):
    """
    Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled in place, then cut into consecutive slices of
    ``batch_size`` items. The last slice may be shorter. Nothing is yielded
    when ``train_data`` is empty.

    batch_size: positive number of examples per batch
    train_data: mutable sequence (e.g. a list) of training examples
    yield: a slice of ``train_data`` holding at most ``batch_size`` examples
    raises: ValueError (on first iteration) if ``batch_size`` is not positive
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [6]:
def prepare_sequence(seq, word2index):
    """
    Convert a sequence of words into a tensor of vocabulary indices.

    seq: a sequence of words, e.g. ['are', 'you', 'ok']
    word2index: mapping from word to index; words it does not contain are
        mapped to the index stored under "<UNK>"
    return: LongTensor holding one index per word of ``seq``
    """
    indices = []
    for token in seq:
        idx = word2index.get(token)
        if idx is None:
            # Unknown word: fall back to the <UNK> index.
            idx = word2index["<UNK>"]
        indices.append(idx)
    return torch.LongTensor(indices)

def prepare_word(word, word2index):
    """
    Convert a single word into a one-element tensor holding its index.

    word: a word, e.g. "love"
    word2index: mapping from word to index; a word it does not contain is
        mapped to the index stored under "<UNK>"
    return: LongTensor of shape (1,) with the word's index
    """
    idx = word2index.get(word)
    if idx is None:
        # Unknown word: fall back to the <UNK> index.
        idx = word2index["<UNK>"]
    return torch.LongTensor([idx])

# 准备数据

In [7]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [8]:
# Take the first 100 sentences of Moby Dick as a small test sample and
# lower-case every token.
corpus = [
    [word.lower() for word in sent]
    for sent in list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
]

拿出3个看看长啥样

In [9]:
corpus[0:3]

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
 ['etymology', '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')']]

In [10]:
def flatten(l):
    """
    Flatten one level of nesting.

    l: an iterable of iterables, e.g. a list of tokenized sentences
    return: a single list with the items of every sub-iterable, in order
    """
    return [word for sublist in l for word in sublist]
# Count every token in the corpus.
word_count = Counter(word for sent in corpus for word in sent)
# Number of words to cut at each end of the frequency ranking:
# 1% of the distinct words.
border = int(0.01 * len(word_count))

In [11]:
border

5

将出现频率最高的和频率最低的都去掉

In [12]:
# The `border` most frequent (word, count) pairs followed by the `border`
# least frequent ones (rarest first).
stopwords = (
    word_count.most_common()[:border]
    + word_count.most_common()[::-1][:border]
)

In [13]:
stopwords

[(',', 96),
 ('.', 66),
 ('the', 58),
 ('of', 36),
 ('and', 35),
 ('man', 1),
 ('artificial', 1),
 ('civitas', 1),
 ('--(', 1),
 ('state', 1)]

In [14]:
# Keep only the first item (the word) of every (word, count) pair.
stopwords = list(map(lambda pair: pair[0], stopwords))
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

In [15]:
# Vocabulary = distinct corpus words minus the stopwords, plus the
# placeholder token used for every out-of-vocabulary word.
vocab = list(set(flatten(corpus)) - set(stopwords)) + ['<UNK>']

In [16]:
print(len(set(flatten(corpus))), len(vocab))

592 583


In [17]:
# '<UNK>' always gets index 0; every other vocabulary word gets the next free
# index the first time it is seen.
word2index = {'<UNK>': 0}
for word in vocab:
    word2index.setdefault(word, len(word2index))

# Reverse mapping: index -> word.
index2word = {idx: word for word, idx in word2index.items()}

得到skip-gram中的窗口，这里先是给一个句子的前后都加了window_size个`<DUMMY>`，然后使用nltk.ngrams生成窗口内容，最后展开得到所有的窗口集合。

In [18]:
WINDOW_SIZE = 3
# Pad every sentence with WINDOW_SIZE '<DUMMY>' tokens on both sides, then
# collect every (2 * WINDOW_SIZE + 1)-gram of the padded sentence.
windows = [
    gram
    for sent in corpus
    for gram in nltk.ngrams(
        ['<DUMMY>'] * WINDOW_SIZE + sent + ['<DUMMY>'] * WINDOW_SIZE,
        2 * WINDOW_SIZE + 1,
    )
]
windows[:5]

[('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
 ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman'),
 ('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville'),
 ('[', 'moby', 'dick', 'by', 'herman', 'melville', '1851'),
 ('moby', 'dick', 'by', 'herman', 'melville', '1851', ']')]

接下来我们从所有的窗口中提取出训练需要的单词对。

In [19]:
train_data = []

for window in windows:
    # The middle element of the window is the center word; everything to its
    # left and right is context.
    center = window[WINDOW_SIZE]
    contexts = window[:WINDOW_SIZE] + window[WINDOW_SIZE + 1:]
    # Padding tokens do not form training pairs.
    train_data.extend(
        (center, context) for context in contexts if context != '<DUMMY>'
    )

print(train_data[:WINDOW_SIZE * 2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [20]:
# Index tensors of the center words (X_p) and of their context words (y_p).
X_p, y_p = [], []

In [21]:
train_data[0]

('[', 'moby')

将这些单词对都转为tensor类型的索引，放在X_p和y_p里面

In [22]:
# Turn every (center, context) word pair into two index tensors of shape (1, 1).
for center, context in train_data:
    X_p.append(prepare_word(center, word2index).unsqueeze(0))
    y_p.append(prepare_word(context, word2index).unsqueeze(0))

In [23]:
# Re-pair the tensors: one (center_tensor, context_tensor) tuple per example.
train_data = [(center, context) for center, context in zip(X_p, y_p)]

In [24]:
len(train_data)

7606

# 模型

skip-gram模型的计算图如下：
- 首先，将输入词向量(one-hot)投影到隐藏层(投影层)，获得中心词的词向量，即 $(1\times |V|)\cdot(|V| \times d)=1\times d$。这里的代码中没有用 one-hot 编码，直接用索引来代替，效果是一样的，都是得到中心词的词向量。
- 将隐藏层的结果乘以隐藏层到输出层的权重矩阵 $W$，即 $(1\times d)\cdot(d\times |V|)=1\times |V|$。在代码中是先提取表示target_word的词向量，与中心词词向量相乘，得到对应的score，然后所有词的词向量与中心词词向量相乘，得到所有的score。
- 将输出层的输出结果进行softmax归一化，$\hat{y}_j=\frac{e^{y_j}}{\sum_{i=1}^{|V|}e^{y_i}}$。代码中直接计算负对数似然 $-\log \hat{y}_j$（即 -log_softmax），直接输出损失。


In [25]:
class Skipgram(nn.Module):
    """
    Skip-gram model trained with a softmax over the supplied outer words.

    Two embedding tables of shape (vocab_size, projection_dim) are kept:
    ``embedding_v`` for center words and ``embedding_u`` for context words.
    """

    def __init__(self, vocab_size, projection_dim):
        """
        vocab_size: number of rows in each embedding table
        projection_dim: dimensionality of every word vector
        """
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        # Center vectors start uniformly in [-1, 1]; context vectors start at 0.
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.zero_()

    def forward(self, center_words, target_words, outer_words):
        """
        Compute the mean negative log-likelihood of the target words.

        center_words: LongTensor of center-word indices, B x 1
        target_words: LongTensor of context-word indices, B x 1
        outer_words: LongTensor of the indices to normalize over, B x V
        return: scalar tensor, mean over the batch of -log softmax(target)
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D
        outer_embeds = self.embedding_u(outer_words)  # B x V x D

        center_t = center_embeds.transpose(1, 2)  # B x D x 1
        scores = target_embeds.bmm(center_t).squeeze(2)  # BxDx1 after Bx1xD => B x 1
        norm_scores = outer_embeds.bmm(center_t).squeeze(2)  # BxVxD * BxDx1 => B x V

        # log(exp(s) / sum(exp(n))) == s - logsumexp(n). The logsumexp form does
        # not overflow to inf/inf (NaN) when the scores become large.
        log_norm = torch.logsumexp(norm_scores, 1, keepdim=True)  # B x 1

        return -torch.mean(scores - log_norm)  # negative log likelihood

    def prediction(self, inputs):
        """
        Look up the center-word vectors for ``inputs``.

        inputs: LongTensor of word indices
        return: the ``embedding_v`` vectors for those indices
        """
        embeds = self.embedding_v(inputs)

        return embeds

# 训练

In [26]:
# Hyper-parameters.
EPOCH = 100
BATCH_SIZE = 256
EMBEDDING_SIZE = 30

# Model (moved to the selected device), its optimizer, and the running list
# of per-step losses.
model = Skipgram(len(word2index), EMBEDDING_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
losses = []

In [27]:
# Indices of every vocabulary word, repeated once per batch row.
vocabs = prepare_sequence(list(vocab), word2index)
vocabs = vocabs.expand(BATCH_SIZE, len(vocab))  # shape: (batch_size, vocab_len)
vocabs.size()

torch.Size([256, 583])

In [28]:
# The vocabulary indices are identical for every batch: build them and move
# them to the device once instead of on every training step.
vocab_indices = prepare_sequence(list(vocab), word2index).to(device)

for epoch in range(EPOCH):
    for batch in get_batch(BATCH_SIZE, train_data):
        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs).to(device)  # shape: (batch_size, 1)
        targets = torch.cat(targets).to(device)  # shape: (batch_size, 1)
        # The last batch can be smaller than BATCH_SIZE, so size the view from
        # the actual batch; expand() does not copy the data.
        vocabs = vocab_indices.expand(inputs.size(0), len(vocab))  # shape: (batch_size, vocab_len)

        model.zero_grad()
        loss = model(inputs, targets, vocabs)
        loss.backward()
        optimizer.step()

        # Store a plain Python float: it keeps no reference to device memory
        # and np.mean can consume it regardless of where the model lives.
        losses.append(loss.item())

    # Report the mean loss accumulated since the previous report, then reset.
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []


Epoch : 0, mean_loss : 6.20
Epoch : 10, mean_loss : 4.38
Epoch : 20, mean_loss : 3.47
Epoch : 30, mean_loss : 3.31
Epoch : 40, mean_loss : 3.26
Epoch : 50, mean_loss : 3.23
Epoch : 60, mean_loss : 3.22
Epoch : 70, mean_loss : 3.21
Epoch : 80, mean_loss : 3.21
Epoch : 90, mean_loss : 3.20


# 预测

In [29]:
def word_similarity(target, vocab):
    """
    Compute the cosine similarity between ``target`` and the words of ``vocab``.

    Uses the module-level ``model``, ``word2index`` and ``device``.

    target: the query word
    vocab: iterable of candidate words
    return: list of [word, cosine_similarity] pairs in ``vocab`` order,
        skipping every entry equal to ``target``
    """
    similarities = []
    # Inference only: no autograd graph is needed for the lookups.
    with torch.no_grad():
        target_idx = prepare_word(target, word2index).to(device)
        target_v = model.prediction(target_idx)
        for word in vocab:
            if word == target:
                continue
            word_idx = prepare_word(word, word2index).to(device)
            vector = model.prediction(word_idx)
            cosine_sim = F.cosine_similarity(target_v, vector).tolist()[0]
            similarities.append([word, cosine_sim])
    return similarities

In [30]:
# Pick a random vocabulary word to use as the similarity query.
test = random.choice(vocab)
test

'solely'

In [31]:
# Rank every other vocabulary word by cosine similarity to the query word,
# most similar first.
result = word_similarity(test, vocab)
result.sort(key=lambda pair: pair[1], reverse=True)

看看和测试单词最相关的单词、最不相关的单词分别有哪些。

In [32]:
result[0:10]

[['valuable', 0.8013838529586792],
 ['appearing', 0.5342181324958801],
 ['are', 0.48857781291007996],
 ['entertaining', 0.4821161925792694],
 ['fishes', 0.47675496339797974],
 ['brain', 0.47199326753616333],
 ['cetology', 0.45881494879722595],
 ['extracts', 0.45706838369369507],
 ['heart', 0.45462527871131897],
 ['queen', 0.4418097138404846]]

In [33]:
result[-10:]

[['wine', -0.23280653357505798],
 ['boil', -0.2420017272233963],
 ['sub', -0.25242048501968384],
 ['no', -0.2563296854496002],
 ['prophet', -0.2904263734817505],
 ['anyways', -0.29620370268821716],
 [']', -0.30552300810813904],
 ['find', -0.32694023847579956],
 ['librarian', -0.34085381031036377],
 [').', -0.3557795584201813]]