# N-Gram 模型

下面我们直接用代码进行说明

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

CONTEXT_SIZE = 2 # 依据的单词数
EMBEDDING_DIM = 10 # 词向量的维度
# 我们使用莎士比亚的诗
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

这里的 `CONTEXT_SIZE` 表示我们希望由前面几个单词来预测这个单词，这里使用两个单词，`EMBEDDING_DIM` 表示词嵌入的维度。

接着我们建立训练集，便利整个语料库，将单词三个分组，前面两个作为输入，最后一个作为预测的结果。

In [2]:
 trigram = [((test_sentence[i], test_sentence[i+1]), test_sentence[i+2]) 
            for i in range(len(test_sentence)-2)]

In [3]:
# 总的数据量
len(trigram)

113

In [4]:
# 取出第一个数据看看
trigram[0]

(('When', 'forty'), 'winters')

In [5]:
# 建立每个词与数字的编码，据此构建词嵌入
vocb = set(test_sentence) # 使用 set 将重复的元素去掉
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [6]:
word_to_idx

{"'This": 53,
 'And': 22,
 'How': 21,
 'If': 63,
 'Proving': 60,
 'Shall': 4,
 'Then': 89,
 'This': 57,
 'Thy': 28,
 'To': 14,
 'Were': 38,
 'When': 84,
 'Where': 24,
 'Will': 82,
 'a': 3,
 'all': 2,
 'all-eating': 67,
 'an': 61,
 'and': 46,
 'answer': 52,
 'art': 35,
 'asked,': 80,
 'be': 37,
 'beauty': 81,
 "beauty's": 31,
 'being': 92,
 'besiege': 75,
 'blood': 91,
 'brow,': 55,
 'by': 65,
 'child': 17,
 'cold.': 85,
 'couldst': 77,
 'count,': 39,
 'days;': 70,
 'deep': 44,
 "deserv'd": 87,
 'dig': 76,
 "excuse,'": 90,
 'eyes,': 49,
 'fair': 56,
 "feel'st": 73,
 'field,': 54,
 'forty': 45,
 'gazed': 13,
 'held:': 78,
 'his': 74,
 'in': 62,
 'it': 66,
 'lies,': 95,
 'livery': 34,
 'lusty': 33,
 'made': 0,
 'make': 27,
 'mine': 50,
 'more': 96,
 'much': 64,
 'my': 25,
 'new': 6,
 'now,': 30,
 'of': 10,
 'old': 93,
 'old,': 20,
 'on': 48,
 'own': 5,
 'praise': 36,
 'praise.': 32,
 'proud': 29,
 'say,': 40,
 'see': 8,
 'shall': 41,
 'shame,': 16,
 'small': 15,
 'so': 12,
 'succession': 

从上面可以看到每个词都对应一个数字，且这里的单词都各不相同

接着我们定义模型，模型的输入就是前面的两个词，输出就是预测单词的概率

In [7]:
import tensorflow as tf
import tensorflow.contrib.slim as slim

  from ._conv import register_converters as _register_converters


In [8]:
def n_gram(inputs, vocab_size, context_size=CONTEXT_SIZE, n_dim=EMBEDDING_DIM, scope='n-gram', reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        with tf.device('/cpu:0'):
            embeddings = tf.get_variable('embeddings', shape=[vocab_size, n_dim], initializer=tf.random_uniform_initializer)
        embed = tf.nn.embedding_lookup(embeddings, inputs)
        
        net = tf.reshape(embed, (1, -1))
        net = slim.fully_connected(net, vocab_size, activation_fn=None, scope='classification')
        
        return net

In [9]:
input_ph = tf.placeholder(dtype=tf.int64, shape=[2 ], name='input')
label_ph = tf.placeholder(dtype=tf.int64, shape=[1,], name='label')

net = n_gram(input_ph, len(word_to_idx))

In [10]:
loss = tf.losses.sparse_softmax_cross_entropy(label_ph, net, scope='loss')

In [11]:
opt = tf.train.MomentumOptimizer(1e-2, 0.9)
train_op = opt.minimize(loss)

In [12]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [13]:
for e in range(100):
    train_loss = 0
    for word, label in trigram[:100]:
        word = [word_to_idx[i] for i in word]
        label = [word_to_idx[label]]
        
        _, curr_loss = sess.run([train_op, loss], feed_dict={input_ph: word, label_ph: label})
        train_loss += curr_loss
    
    if (e + 1) % 20 == 0:
        print('Epoch: {}, Loss: {:.6f}'.format(e + 1, train_loss / 100))

Epoch: 20, Loss: 0.211326
Epoch: 40, Loss: 0.064162
Epoch: 60, Loss: 0.045913
Epoch: 80, Loss: 0.037720
Epoch: 100, Loss: 0.033036


In [14]:
# 测试一下结果
word, label = trigram[19]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = [word_to_idx[i] for i in word]
out = sess.run(net, feed_dict={input_ph: word})
pred_label_idx = out[0].argmax()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ('so', 'gazed')
label: on

real word is on, predicted word is on


In [15]:
word, label = trigram[75]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = [word_to_idx[i] for i in word]
out = sess.run(net, feed_dict={input_ph: word})
pred_label_idx = out[0].argmax()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ("'This", 'fair')
label: child

real word is child, predicted word is child


可以看到网络在训练集上基本能够预测准确，不过这里样本太少，特别容易过拟合。

下一次课我们会讲一讲 RNN 如何应用在自然语言处理中