In [12]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf
import numpy as np

Populating the interactive namespace from numpy and matplotlib


## Word2Vec

# Task


* Represent a word as a vector
  - Two related words should have two vectors close by.
* Data. 
  - 全唐诗 + 全宋词

# Model

* Embedding of words (characters)
  - E.g., semantically related words should have nearby embedding vectors.


<img src='data/linear-relationships.png' />

* $p(w, c; \theta)$ is modeled as
$$
p(w, c; \theta) = \sigma(v_w^T v_c)
$$

* $\sigma(x)$ is the sigmoid function: $\frac{1}{1+e^{-x}}$
* $v_w^T v_c$ is the dot product of embedding vectors of $w$ and $c$

* $p(w, c; \theta)$ denotes the probability that a word ($w$) and a context ($c$) appear together in the training data.
* Objective:
$$ 
\operatorname*{arg\,max}_\theta \prod_{(w,c) \in D} p(w, c; \theta)\prod_{(w,c) \notin D} (1-p(w, c; \theta))
$$
$$ 
= \operatorname*{arg\,min}_\theta \left[-\sum_{(w,c) \in D} \log p(w, c; \theta) - \sum_{(w,c) \notin D} \log\bigl(1-p(w, c; \theta)\bigr)\right]
$$
* Pairs $(w,c) \notin D$ are called negative examples.
  - Generated randomly. Left as an exercise.

## Data

In [13]:
class TrainData(object):
    """Character-level skip-gram training data read from a text corpus.

    The corpus is read as one string, the characters '。', '，' and newline
    are removed, and every remaining character becomes one token.  Token ids
    are the positions of the characters in the sorted set of distinct
    characters.

    Args:
        corpus: Path of the text file to read.
        batch: Number of (center, context) pairs returned by ``get_batch``.
        windows: Number of neighbours taken on each side of the center token.
    """

    def __init__(self, corpus, batch, windows):
        self.batch = batch
        self.windows = windows
        # `with` guarantees the file handle is closed once the text is read.
        with open(corpus, mode='r') as corpus_file:
            words = corpus_file.read()
        words = words.replace('。', '').replace('，', '').replace('\n', '')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        print('Number of unique chars: ', len(self.id_to_word))
        print('Number of training chars: ', len(self.data))
        # Creating the generator runs none of its code; the first pair is
        # produced on the first `next()` call made by get_batch.
        self.seqgen = self.skipgram_generator()

    @property
    def vocab(self):
        """Number of distinct characters in the corpus."""
        return len(self.id_to_word)

    def skipgram_generator(self):
        """Yield (center_id, context_id) pairs forever, cycling over the data.

        For each position, the contexts are up to ``windows`` tokens on the
        left followed by up to ``windows`` tokens on the right; the window is
        truncated at both ends of the data rather than wrapping around.

        Raises:
            ValueError: On the first ``next()`` call, if no pair can ever be
                produced (fewer than two tokens, or ``windows`` < 1).  Without
                this check the loop would divide by zero or spin forever.
        """
        if len(self.data) < 2 or self.windows < 1:
            raise ValueError(
                'need at least two training chars and windows >= 1 '
                'to generate skip-gram pairs')
        curr = 0
        while True:
            curr %= len(self.data)
            x = self.data[curr]
            left = self.data[max(0, curr - self.windows):curr]
            right = self.data[curr + 1:curr + 1 + self.windows]
            for y in left + right:
                yield (x, y)
            curr += 1

    def get_batch(self):
        """Return the next ``batch`` pairs as two parallel numpy arrays.

        Returns:
            (inputs, targets): arrays of center ids and of context ids.
        """
        inputs, targets = [], []
        for _ in range(self.batch):
            x, y = next(self.seqgen)
            inputs.append(x)
            targets.append(y)
        return np.array(inputs), np.array(targets)

    def to_ids(self, words):
        """Map characters to ids, silently dropping unknown characters."""
        return [self.word_to_id[w] for w in words if w in self.word_to_id]


In [None]:
class TrainData(object):
    """Character-level training data read from a text corpus.

    The corpus is read as one string, the characters '。', '，' and newline
    are removed, and every remaining character becomes one token whose id is
    its position in the sorted set of distinct characters.

    Args:
        corpus: Path of the text file to read.
        batch: Batch size stored on the instance.
        windows: Context window size stored on the instance.
    """

    def __init__(self, corpus, batch, windows):
        self.batch = batch
        self.windows = windows
        # `with` guarantees the file handle is closed once the text is read.
        with open(corpus, mode='r') as corpus_file:
            words = corpus_file.read()
        words = words.replace('。', '').replace('，', '').replace('\n', '')
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        print('Number of unique chars: ', len(self.id_to_word))
        print('Number of training chars: ', len(self.data))
        # skipgram_generator is not defined in this cell; the class needs
        # that method to exist before it can be instantiated.
        self.seqgen = self.skipgram_generator()

    @property
    def vocab(self):
        """Number of distinct characters in the corpus."""
        return len(self.id_to_word)

    def to_ids(self, words):
        """Map characters to ids, silently dropping unknown characters."""
        return [self.word_to_id[w] for w in words if w in self.word_to_id]

In [3]:
    def skipgram_generator(self):
        """Yield (center_id, context_id) pairs forever, cycling over the data.

        For each position, the contexts are up to ``windows`` tokens on the
        left followed by up to ``windows`` tokens on the right; the window is
        truncated at both ends of the data rather than wrapping around.

        Raises:
            ValueError: On the first ``next()`` call, if no pair can ever be
                produced (fewer than two tokens, or ``windows`` < 1).  Without
                this check the loop would divide by zero or spin forever.
        """
        if len(self.data) < 2 or self.windows < 1:
            raise ValueError(
                'need at least two training chars and windows >= 1 '
                'to generate skip-gram pairs')
        curr = 0
        while True:
            # Wrap around so the generator never runs out of data.
            curr %= len(self.data)
            x = self.data[curr]
            left = self.data[max(0, curr - self.windows):curr]
            right = self.data[curr + 1:curr + 1 + self.windows]
            for y in left + right:
                yield (x, y)
            curr += 1
            
    def get_batch(self):
        """Pull ``self.batch`` pairs from ``self.seqgen``.

        Returns:
            Two parallel numpy arrays: the first elements of the pairs and
            the second elements of the pairs, in generation order.
        """
        pairs = [next(self.seqgen) for _ in range(self.batch)]
        centers = [center for center, _ in pairs]
        contexts = [context for _, context in pairs]
        return np.array(centers), np.array(contexts)
    

### Example

In [5]:
# Tiny configuration (two pairs per batch, one-sided window of two) so the
# generated pairs are easy to read.
data = TrainData('./data/poem.txt', batch=2, windows=2)

Number of unique chars:  7955
Number of training chars:  3987351


In [6]:
# Show ten consecutive batches as (inputs, targets) array pairs.
for _batch_index in range(10):
    batch_pair = data.get_batch()
    print(batch_pair)

(array([4562, 4562]), array([1707, 7209]))
(array([1707, 1707]), array([4562, 7209]))
(array([1707, 7209]), array([1746, 4562]))
(array([7209, 7209]), array([1707, 1746]))
(array([7209, 1746]), array([1472, 1707]))
(array([1746, 1746]), array([7209, 1472]))
(array([1746, 1472]), array([ 552, 7209]))
(array([1472, 1472]), array([1746,  552]))
(array([1472,  552]), array([6301, 1746]))
(array([552, 552]), array([1472, 6301]))


### Implement the Model

In [14]:

class Model(object):
    """Embedding model trained on (word, context) id pairs.

    A single ``[vocab, dims]`` embedding matrix is trained by minimising the
    negative mean of ``log(sigmoid(v_w . v_c))`` over the fed pairs; only
    observed pairs enter the loss (no negative-example term).  The same graph
    also exposes nearest-neighbour and analogy queries over the
    L2-normalised embeddings.

    Args:
        dims: Size of each embedding vector.
        vocab: Number of rows of the embedding matrix.
        lr: Learning rate given to the gradient-descent optimizer.
    """

    def __init__(self, dims, vocab, lr):
        # Configs.
        self.dims = dims
        self.vocab = vocab
        self.lr = lr

        # All ops live in a graph owned by this instance.
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Var
            self.embedding = tf.Variable(
                tf.random_uniform([vocab, dims], -0.02, 0.02))

            # Feeds.
            self.inputs = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)

            # Define forward.
            x_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
            y_emb = tf.nn.embedding_lookup(self.embedding, self.targets)

            # Compute the loss: -mean(log(sigmoid(dot(x, y)))).
            scores = tf.reduce_sum(x_emb * y_emb, [1])
            probs = tf.sigmoid(scores)
            logp = tf.log(probs)
            self.loss = - tf.reduce_mean(logp)

            # Define training.
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            params = tf.trainable_variables()
            grads = tf.gradients(self.loss, params)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = optimizer.apply_gradients(
                zip(grads, params), global_step=self.global_step)

            # Summary
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Inference

            # Nearest neighbors: rows are L2-normalised, so the matmul yields
            # dot products of unit vectors and top_k keeps the 10 largest.
            norm_embs = tf.nn.l2_normalize(self.embedding, 1)
            word_embs = tf.nn.embedding_lookup(norm_embs, self.inputs)
            distance = tf.matmul(word_embs, norm_embs, transpose_b=True)
            self.neighbors_topk = tf.nn.top_k(distance, k=10)

            # Analogy: for fed ids [w0, w1, w2] the query vector is
            # emb(w0) - emb(w1) + emb(w2).
            # NOTE(review): for "w0 : w1 as w2 : ?" the conventional offset is
            # emb(w1) - emb(w0) + emb(w2) -- confirm the sign is intended.
            a, b, c = word_embs[1, :], word_embs[0, :], word_embs[2, :]
            d = b - a + c
            target = tf.reshape(d, [1, -1])
            dist = tf.matmul(target, norm_embs, transpose_b=True)
            self.analogy_topk = tf.nn.top_k(dist, k=10)

            # Init
            self.init = tf.initialize_all_variables()

            # Saver
            self.saver = tf.train.Saver(tf.all_variables())

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init)

    def train(self, data, logdir, total_steps):
        """Train until the step counter reaches ``total_steps``.

        Resumes from the latest checkpoint found in ``logdir`` (if any),
        saves a checkpoint every 1000 steps and logs the loss and a summary
        every 100 steps.

        Args:
            data: Object whose ``get_batch()`` returns (inputs, targets).
            logdir: Directory for summaries and ``wv_params`` checkpoints.
            total_steps: Step count at which training stops.
        """
        swriter = tf.train.SummaryWriter(logdir)

        # Recover.
        self.load(tf.train.latest_checkpoint(logdir))

        steps = self.sess.run(self.global_step)
        while steps < total_steps:
            if steps % 1000 == 0:
                self.saver.save(
                    self.sess, logdir + '/wv_params', global_step=steps)
            x, y = data.get_batch()
            feed = {self.inputs: x, self.targets: y}
            if steps % 100 == 0:
                # Run train_op together with the loss and summary so that a
                # logging step still applies its update; otherwise the batch
                # is discarded and global_step falls behind `steps`.
                loss, summary, _ = self.sess.run(
                    [self.loss, self.summary, self.train_op],
                    feed_dict=feed)
                swriter.add_summary(summary, steps)
                swriter.flush()
                print('step %d: %.4f' % (steps, loss))
            else:
                self.sess.run(self.train_op, feed_dict=feed)
            steps += 1

    def load(self, checkpoint):
        """Restore all saved variables from ``checkpoint``; no-op if None."""
        if checkpoint is not None:
            # Interpolate the path; passing it as a second print argument
            # left a literal '%s' in the output.
            print('restore %s' % checkpoint)
            self.saver.restore(self.sess, checkpoint)

    def nearby(self, data, words):
        """Print the 10 nearest neighbours of every known character.

        Characters missing from the vocabulary are skipped.

        Args:
            data: Provides ``to_ids(words)`` and ``id_to_word``.
            words: Iterable of characters to look up.
        """
        ids = data.to_ids(words)
        print('ids = %s' % ids)
        if not ids:
            return
        _, neighbors = self.sess.run(
            self.neighbors_topk, feed_dict={self.inputs: ids})
        # Label rows from the ids that were actually fed: to_ids drops
        # unknown characters, so zipping with `words` could shift labels.
        known_words = [data.id_to_word[i] for i in ids]
        for (w, n) in zip(known_words, neighbors):
            print('nearby  %s --> %s' % (w, ''.join(
                [data.id_to_word[x] for x in n])))

    def analogy(self, data, words):
        """Print the 10 characters closest to the analogy query vector.

        Args:
            data: Provides ``to_ids(words)`` and ``id_to_word``.
            words: Three characters; the first three known ids are used.

        Raises:
            ValueError: If fewer than three characters are in the vocabulary.
        """
        ids = data.to_ids(words)
        if len(ids) < 3:
            raise ValueError(
                'analogy needs three in-vocabulary words, got %d' % len(ids))
        _, neighbors = self.sess.run(
            self.analogy_topk, feed_dict={self.inputs: ids})
        neighbors = [data.id_to_word[x] for x in neighbors[0, :]]
        print('analogy %s %s' % (words, ''.join(neighbors)))

### Parameters of the model
  * Embedding dimensions
    - dims = 256
  * Vocab size
    - data.vocab = 7955
  * Learning rate for SGD
    - lr = 0.5

In [None]:
class Model(object):
    """Holds the three hyper-parameters of the embedding model."""

    def __init__(self, dims, vocab, lr):
        """Store the embedding size, vocabulary size and learning rate."""
        # Configs.
        self.dims, self.vocab, self.lr = dims, vocab, lr


### Describe the model as a graph

In [None]:
        # Create a graph owned by this model and make it the default graph
        # for the op definitions that follow inside the `with` block.
        self.graph = tf.Graph()
        with self.graph.as_default():


### Declare embedding vectors

In [None]:
            # Embedding matrix variable with one `dims`-sized row per
            # vocabulary entry, initialised uniformly between -0.02 and 0.02.
            self.embedding = tf.Variable(
                tf.random_uniform([vocab, dims], -0.02, 0.02))



### Feeds: inputs to the model

In [None]:
            # Feeds: int64 placeholders for word ids. No shape is declared,
            # so the size of a fed batch is not fixed by the graph.
            self.inputs = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)

### Forward function
  - Really simple for this model. Just converts word ids into vectors.

In [None]:
            # Define forward: look up the embedding rows for the fed ids.
            # Inputs and targets share the same embedding matrix.
            x_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
            y_emb = tf.nn.embedding_lookup(self.embedding, self.targets)



### Loss: the optimization goal
$$ -\sum_{(w,c) \in D} \log p(w, c; \theta) $$

$$
p(w, c; \theta) = \sigma(v_w^T v_c)
$$


In [None]:
            # Compute the loss.
            # Element-wise product summed over axis 1: one dot product per
            # (input, target) pair.
            scores = tf.reduce_sum(x_emb * y_emb, [1])
            # p(w, c) = sigmoid(v_w . v_c)
            probs = tf.sigmoid(scores) 
            logp = tf.log(probs)
            # Negative mean log-probability of the fed pairs; no term for
            # negative examples is included here.
            self.loss = - tf.reduce_mean(logp)

### Optimizer: how to minimize the loss
  * Stochastic gradient descent.
    - Only hyper-parameter: learning rate

In [None]:
            # Define training.
            # global_step is created with trainable=False; it is passed to
            # apply_gradients below as the optimizer's step counter.
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            vars = tf.trainable_variables()
            # Gradients of the loss with respect to every trainable variable.
            grads = tf.gradients(self.loss, vars)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            # One op that applies the gradient-descent update.
            self.train_op = optimizer.apply_gradients(
                zip(grads, vars), global_step=self.global_step)

### Training support
  - Summary: pretty plots over time
  - Saver: checkpoint training state
  - Initialization

In [None]:
            # Summary: record the scalar loss and merge every summary op
            # registered in the graph's SUMMARIES collection into one op.
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(
                tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Init: op that initialises all variables defined so far.
            self.init = tf.initialize_all_variables()

            # Saver: handles checkpoint save/restore for all variables.
            self.saver = tf.train.Saver(tf.all_variables())

### Inference
  * Similar words for a given word
  * Analogy: A vs. B as C vs. ???

In [None]:
            # Inference

            # Nearest neighbors
            # L2-normalise each embedding row (axis 1), so the matmul below
            # yields dot products of unit vectors; larger means closer.
            norm_embs = tf.nn.l2_normalize(self.embedding, 1)
            word_embs = tf.nn.embedding_lookup(norm_embs, self.inputs)
            distance = tf.matmul(word_embs, norm_embs, transpose_b=True)
            # Keep the 10 highest-scoring vocabulary entries per input word.
            self.neighbors_topk = tf.nn.top_k(distance, k=10)

            # Analogy
            # Uses the first three fed ids [w0, w1, w2]; the query vector is
            # emb(w0) - emb(w1) + emb(w2).
            # NOTE(review): for "w0 : w1 as w2 : ?" the conventional offset is
            # emb(w1) - emb(w0) + emb(w2) -- confirm the sign is intended.
            a, b, c = word_embs[1, :], word_embs[0, :], word_embs[2, :]
            d = b - a + c
            # Reshape to a single row so it can be multiplied like a batch of one.
            target = tf.reshape(d, [1, -1])
            dist = tf.matmul(target, norm_embs, transpose_b=True)
            self.analogy_topk = tf.nn.top_k(dist, k=10)

### Session
  - Connect to TensorFlow runtime
  - Initialize everything

In [None]:
       self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init)


### Training
  - Iterative algorithm
  - Periodic checkpoint and summary

In [None]:
    def train(self, data, logdir, total_steps):
        """Train until the step counter reaches ``total_steps``.

        Resumes from the latest checkpoint found in ``logdir`` (if any),
        saves a checkpoint every 1000 steps and logs the loss and a summary
        every 100 steps.

        Args:
            data: Object whose ``get_batch()`` returns (inputs, targets).
            logdir: Directory for summaries and ``wv_params`` checkpoints.
            total_steps: Step count at which training stops.
        """
        swriter = tf.train.SummaryWriter(logdir)

        # Recover.
        self.load(tf.train.latest_checkpoint(logdir))

        steps = self.sess.run(self.global_step)
        while steps < total_steps:
            if steps % 1000 == 0:
                self.saver.save(
                    self.sess, logdir + '/wv_params', global_step=steps)
            x, y = data.get_batch()
            feed = {self.inputs: x, self.targets: y}
            if steps % 100 == 0:
                # Run train_op together with the loss and summary so that a
                # logging step still applies its update; otherwise the batch
                # is discarded and global_step falls behind `steps`.
                loss, summary, _ = self.sess.run(
                    [self.loss, self.summary, self.train_op],
                    feed_dict=feed)
                swriter.add_summary(summary, steps)
                swriter.flush()
                print('step %d: %.4f' % (steps, loss))
            else:
                self.sess.run(self.train_op, feed_dict=feed)
            steps += 1

### Restore

In [None]:
    def load(self, checkpoint):
        """Restore the saved variables from ``checkpoint``; no-op if None."""
        if checkpoint is None:
            return
        print('restore ', checkpoint)
        self.saver.restore(self.sess, checkpoint)

### Instantiate the data and model

In [15]:
# Data settings: context window on each side and pairs per batch.
windows = 8
batches = 32
data = TrainData('./data/poem.txt', batch=batches, windows=windows)

# Model settings: embedding size, vocabulary size (taken from the data)
# and SGD learning rate.
dims = 256
vocab = data.vocab
lr = 0.5
model = Model(dims=dims, vocab=vocab, lr=lr)

Number of unique chars:  7955
Number of training chars:  3987351


In [16]:
# Short demo run: checkpoints and summaries go to ./data, stop at step 200.
model.train(data, logdir='./data', total_steps=200)

step 0: 0.6931
step 100: 0.6933


## Nearest neighbors

In [59]:
# Neighbours from the step-0 checkpoint, as a baseline.
model.load(checkpoint='./data/wv_params-0')
model.nearby(data, words='明月几时有')

restore %s ./data/wv_params-0
ids = [2623, 2723, 540, 2611, 2724]
nearby  明 --> 明蚴檠咸眄垒尧眴牣锺
nearby  月 --> 月唏齰麦疗妠嚬泾耎鎗
nearby  几 --> 几咮柑臆祲允募咛涤憪
nearby  时 --> 时须遣搵汐遗荣磬鞨翛
nearby  有 --> 有贡鹦襋骠躔姣ㄒ缙疃


In [60]:
# Same query against the step-807000 checkpoint.
model.load(checkpoint='./data/wv_params-807000')
model.nearby(data, words='明月几时有')

restore %s ./data/wv_params-807000
ids = [2623, 2723, 540, 2611, 2724]
nearby  明 --> 明中道下上不水日月心
nearby  月 --> 月风下明夜落上归日起
nearby  几 --> 几不一上汉下西何来知
nearby  时 --> 时不相灵日东君如在当
nearby  有 --> 有无不何中山人青心门


## Analogy
  - 清:风 vs. 明:?

In [61]:
# Analogy query against the step-0 checkpoint, as a baseline.
model.load(checkpoint='./data/wv_params-0')
model.analogy(data, words='清风明')

restore %s ./data/wv_params-0
analogy 清风明 清明撚尧隔檠玺纥鳢猰


In [62]:
# Same analogy query against the step-807000 checkpoint.
model.load(checkpoint='./data/wv_params-807000')
model.analogy(data, words='清风明')

restore %s ./data/wv_params-807000
analogy 清风明 明清中德天归得心如思


### Exercise
  - Train a bigger model.
  - Different data set. 
    - http://mattmahoney.net/dc/text8.zip
  - Add negative samples
    - 