#!/usr/bin/env python
"""Sample script of word embedding model.

This code implements skip-gram model and continuous-bow model.
"""
import argparse
import collections

import numpy as np
import six

import chainer
from chainer.backends import cuda
import chainer.functions as F
import chainer.initializers as I
import chainer.links as L
import chainer.optimizers as O
from chainer import reporter
from chainer import training
from chainer.training import extensions
class ContinuousBoW(chainer.Chain):
    """Definition of Continuous Bag of Words Model.

    The embeddings of the context words are averaged and handed to
    ``loss_func`` together with the center words.

    Args:
        n_vocab: Number of rows of the embedding table.
        n_units: Dimensionality of each embedding vector.
        loss_func: Callable invoked as ``loss_func(h, x)``. It is assigned
            inside ``init_scope`` so that a link passed here is registered
            as a child of this chain.
    """

    def __init__(self, n_vocab, n_units, loss_func):
        super(ContinuousBoW, self).__init__()

        with self.init_scope():
            # Embeddings start uniformly distributed with scale 1 / n_units.
            self.embed = L.EmbedID(
                n_vocab, n_units, initialW=I.Uniform(1. / n_units))
            self.loss_func = loss_func

    def forward(self, x, contexts):
        """Compute and report the loss for one mini-batch.

        Args:
            x: Center word ids.
            contexts: Context word ids; axis 1 indexes the context
                positions of each center word.

        Returns:
            The value returned by ``loss_func``.
        """
        e = self.embed(contexts)
        # Mean of the context embeddings along the context axis.
        h = F.sum(e, axis=1) * (1. / contexts.shape[1])
        loss = self.loss_func(h, x)
        # Publish the loss under the key 'loss' for this link.
        reporter.report({'loss': loss}, self)
        return loss
class SkipGram(chainer.Chain):
    """Definition of Skip-gram Model.

    Every context word embedding is paired with its center word and the
    pairs are handed to ``loss_func``.

    Args:
        n_vocab: Number of rows of the embedding table.
        n_units: Dimensionality of each embedding vector.
        loss_func: Callable invoked as ``loss_func(e, x)``. It is assigned
            inside ``init_scope`` so that a link passed here is registered
            as a child of this chain.
    """

    def __init__(self, n_vocab, n_units, loss_func):
        super(SkipGram, self).__init__()

        with self.init_scope():
            # Embeddings start uniformly distributed with scale 1 / n_units.
            self.embed = L.EmbedID(
                n_vocab, n_units, initialW=I.Uniform(1. / n_units))
            self.loss_func = loss_func

    def forward(self, x, contexts):
        """Compute and report the loss for one mini-batch.

        Args:
            x: Center word ids, one per batch row.
            contexts: Context word ids; embedding them must yield a
                3-D array ``(batch_size, n_context, n_units)``.

        Returns:
            The value returned by ``loss_func``.
        """
        e = self.embed(contexts)
        batch_size, n_context, n_units = e.shape
        # Repeat each center word once per context position, then flatten
        # both arrays so that row k of ``e`` is paired with entry k of ``x``.
        x = F.broadcast_to(x[:, None], (batch_size, n_context))
        e = F.reshape(e, (batch_size * n_context, n_units))
        x = F.reshape(x, (batch_size * n_context,))
        loss = self.loss_func(e, x)
        # Publish the loss under the key 'loss' for this link.
        reporter.report({'loss': loss}, self)
        return loss
class SoftmaxCrossEntropyLoss(chainer.Chain):
    """Softmax cross entropy loss function preceded by linear transformation.

    Args:
        n_in: Input size of the linear layer.
        n_out: Output size of the linear layer (number of classes).
    """

    def __init__(self, n_in, n_out):
        super(SoftmaxCrossEntropyLoss, self).__init__()
        with self.init_scope():
            # The projection weights start at zero.
            self.out = L.Linear(n_in, n_out, initialW=0)

    def forward(self, x, t):
        """Return the softmax cross entropy of ``self.out(x)`` against ``t``.

        Args:
            x: Input to the linear layer.
            t: Target labels.
        """
        return F.softmax_cross_entropy(self.out(x), t)
class WindowIterator(chainer.dataset.Iterator):
    """Dataset iterator to create a batch of sequences at different positions.

    This iterator returns a pair of the current words and the context words.

    Args:
        dataset: Sequence of word ids; it is copied into an ``int32`` array.
        window: Size of the context window. The context width of each
            batch is drawn from ``[1, window - 1]``, so ``window`` must be
            at least 2.
        batch_size: Number of center positions per mini-batch.
        repeat: If false, iteration stops once one epoch has completed.
    """

    def __init__(self, dataset, window, batch_size, repeat=True):
        self.dataset = np.array(dataset, np.int32)
        self.window = window  # size of context window
        self.batch_size = batch_size
        self._repeat = repeat
        # order is the array which is shuffled ``[window, window + 1, ...,
        # len(dataset) - window - 1]``
        self.order = np.random.permutation(
            len(dataset) - window * 2).astype(np.int32)
        self.order += window
        self.current_position = 0
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False

    def __next__(self):
        """Return the next mini-batch as a ``(center, contexts)`` pair.

        Each row corresponds to a different position in the original
        sequence. ``center`` holds the word ids at those positions and
        ``contexts`` holds the ``2 * w`` surrounding word ids, where ``w``
        is drawn anew for every batch.

        Raises:
            StopIteration: If ``repeat`` is false and an epoch has already
                been completed.
        """
        if not self._repeat and self.epoch > 0:
            raise StopIteration

        i = self.current_position
        i_end = i + self.batch_size
        position = self.order[i:i_end]
        # Random context width in [1, window - 1].
        w = np.random.randint(self.window - 1) + 1
        # Relative offsets -w..-1 and 1..w (the center itself is excluded).
        offset = np.concatenate([np.arange(-w, 0), np.arange(1, w + 1)])
        pos = position[:, None] + offset[None, :]
        contexts = self.dataset.take(pos)
        center = self.dataset.take(position)

        if i_end >= len(self.order):
            # The sweep is complete: reshuffle and start over.
            np.random.shuffle(self.order)
            self.epoch += 1
            self.is_new_epoch = True
            self.current_position = 0
        else:
            self.is_new_epoch = False
            self.current_position = i_end

        return center, contexts

    @property
    def epoch_detail(self):
        """Completed epochs plus the fraction of the current one."""
        return self.epoch + float(self.current_position) / len(self.order)

    def serialize(self, serializer):
        """Save or restore the iteration state through ``serializer``."""
        self.current_position = serializer('current_position',
                                           self.current_position)
        self.epoch = serializer('epoch', self.epoch)
        self.is_new_epoch = serializer('is_new_epoch', self.is_new_epoch)
        # The visiting order lives in ``self.order``; it is always an array.
        serializer('_order', self.order)
def convert(batch, device):
    """Unpack a mini-batch and move it to the GPU when one is requested.

    Args:
        batch: ``(center, contexts)`` pair.
        device: Device id. A negative value or ``None`` leaves the arrays
            where they are; a non-negative value sends both arrays through
            ``cuda.to_gpu``.

    Returns:
        The ``(center, contexts)`` pair, possibly transferred.
    """
    center, contexts = batch
    # ``None >= 0`` raises TypeError on Python 3, so test for it first.
    if device is not None and device >= 0:
        center = cuda.to_gpu(center)
        contexts = cuda.to_gpu(contexts)
    return center, contexts
def main():
    """Train a word embedding model on the PTB words dataset.

    Parses the command line, builds the requested model and output layer,
    runs the trainer and finally writes the learned embeddings to
    ``word2vec.model`` as text: a header line ``<n_words> <n_units>``
    followed by one ``<word> <v_1> ... <v_n>`` line per word.

    Raises:
        Exception: If the model type or the output type is unknown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', default=100, type=int,
                        help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int,
                        help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000,
                        help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int,
                        help='number of epochs to learn')
    # Without a default, omitting the option would leave ``None`` and the
    # script would abort with "Unknown model type".
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'],
                        default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int,
                        help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'],
                        default='hsm',
                        help='output model type ("hsm": hierarchical softmax, '
                        '"ns": negative sampling, "original": '
                        'no approximation)')
    parser.add_argument('--out', default='result',
                        help='Directory to output the result')
    parser.add_argument('--test', dest='test', action='store_true')
    args = parser.parse_args()

    if args.gpu >= 0:
        # Make the requested GPU current and fail early if CUDA is missing.
        cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    # Load the dataset
    train, val, _ = chainer.datasets.get_ptb_words()
    counts = collections.Counter(train)
    # Also count validation words so that the Huffman tree / noise
    # distribution covers every word seen during evaluation.
    counts.update(collections.Counter(val))
    n_vocab = max(train) + 1

    if args.test:
        # Tiny subset for a quick smoke run.
        train = train[:100]
        val = val[:100]

    vocab = chainer.datasets.get_ptb_words_vocabulary()
    index2word = {wid: word for word, wid in six.iteritems(vocab)}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.array[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.array[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)
    val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert, device=args.gpu)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(
        val_iter, model, converter=convert, device=args.gpu))
    # PrintReport reads the entries collected by LogReport.
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.array)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
# Run the training only when executed as a script, not on import.
if __name__ == '__main__':
    main()