This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[API] Better handle case when backoff is not possible in TokenEmbedding (#459)

* Handle case when backoff is not possible

* Remove unknown_autoextend

* Improve doc

* Improve Vocab error checking if any idx_to_vec is None in set_embedding

* Improve path detection for fastText bin models

* Small refactor of evaluate_pretrained.py

* Add missing import

* Fix test
leezu authored and astonzhang committed Jan 9, 2019
1 parent f24ecfd commit f4275c0
Showing 6 changed files with 227 additions and 125 deletions.
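At its core, the commit drops `unknown_autoextend`: a TokenEmbedding now answers lookups for unknown tokens through `unknown_lookup` on the fly instead of growing its own index. A minimal usage sketch, assuming a fastText binary model at an illustrative path (mirroring `load_embedding_from_path` below):

import gluonnlp as nlp

# Load a fastText model that can compose vectors from subword information
# (the path is illustrative).
model = nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
    'lorem_ipsum.bin')

# New-style construction: no `unknown_autoextend` argument. Unknown tokens
# are resolved through `unknown_lookup` at lookup time and are not added
# to the embedding's own index.
embedding = nlp.embedding.TokenEmbedding(
    unknown_token=None, unknown_lookup=model, allow_extend=True)

# Vectors for tokens the embedding has never seen are computed on the fly
# from the fastText subwords; the known vocabulary stays unchanged.
vecs = embedding[['lorem', 'ipsum', 'unseenword']]
print(vecs.shape)                 # (3, embedding dimension)
print('unseenword' in embedding)  # False: nothing was auto-extended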
8 changes: 4 additions & 4 deletions scripts/tests/test_scripts.py
@@ -77,21 +77,21 @@ def test_embedding_evaluate_pretrained(fasttextloadngrams):

@pytest.mark.serial
@pytest.mark.remote_required
@pytest.mark.parametrize('evaluatanalogies', [True, False])
@pytest.mark.parametrize('evaluateanalogies', [True, False])
@pytest.mark.parametrize('maxvocabsize', [None, 16])
def test_embedding_evaluate_from_path(evaluatanalogies, maxvocabsize):
def test_embedding_evaluate_from_path(evaluateanalogies, maxvocabsize):
path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
path = os.path.join(
path, '../../tests/unittest/train/test_embedding/lorem_ipsum.bin')
cmd = [
'python', './scripts/word_embeddings/evaluate_pretrained.py',
'--embedding-path', path]
if evaluatanalogies:
if evaluateanalogies:
cmd += ['--analogy-datasets', 'GoogleAnalogyTestSet']
else:
cmd += ['--analogy-datasets']
if maxvocabsize is not None:
cmd += ['--max-vocab-size', str(maxvocabsize)]
cmd += ['--analogy-max-vocab-size', str(maxvocabsize)]
subprocess.check_call(cmd)
time.sleep(5)

111 changes: 60 additions & 51 deletions scripts/word_embeddings/evaluate_pretrained.py
@@ -30,6 +30,8 @@
import os
import sys

import mxnet as mx

import evaluation
import gluonnlp as nlp
import utils
@@ -61,7 +63,7 @@ def get_args():
help=('Specify load_ngrams=True '
'when loading pretrained fastText embedding.'))
group.add_argument(
'--max-vocab-size', type=int, default=None,
'--analogy-max-vocab-size', type=int, default=None,
help=('Only retain the X first tokens from the pre-trained embedding. '
'The tokens are ordered by decreasing frequency.'
'As the analogy task takes the whole vocabulary into account, '
@@ -131,37 +133,53 @@ def validate_args(args):

def load_embedding_from_path(args):
"""Load a TokenEmbedding."""
if 'bin' in args.embedding_path:
if args.embedding_path.endswith('.bin'):
with utils.print_time('load fastText model.'):
model = \
nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
args.embedding_path)
idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)

embedding = nlp.embedding.TokenEmbedding(
unknown_token=None, unknown_lookup=model, allow_extend=True,
unknown_autoextend=True)
unknown_token=None, unknown_lookup=model, allow_extend=True)

idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
# Analogy task is open-vocabulary, so must keep all known words.
# But if not evaluating analogy, no need to precompute now as all
# words for closed vocabulary task can be obtained via the unknown
# lookup
if not args.analogy_datasets:
# Prune tokens not used in evaluation datasets
eval_tokens_ = set(
evaluation.get_tokens_in_evaluation_datasets(args))
idx_to_token = [t for t in idx_to_token if t in eval_tokens_]
if args.max_vocab_size:
idx_to_token = idx_to_token[:args.max_vocab_size]

with utils.print_time('compute vectors from subwords '
'for {} words.'.format(len(idx_to_token))):
embedding[idx_to_token] = model[idx_to_token]

idx_to_token = []
elif args.analogy_datasets and args.analogy_max_vocab_size:
idx_to_token = idx_to_token[:args.analogy_max_vocab_size]

embedding['<unk>'] = mx.nd.zeros(model.weight.shape[1])
if idx_to_token:
with utils.print_time('compute vectors for {} known '
'words.'.format(len(idx_to_token))):
embedding[idx_to_token] = model[idx_to_token]
else:
embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

return embedding


def load_embedding_from_gluonnlp(args):
if args.embedding_name.lower() == 'fasttext':
token_embedding = nlp.embedding.create(
args.embedding_name,
source=args.embedding_source,
load_ngrams=args.fasttext_load_ngrams)
else:
token_embedding = nlp.embedding.create(
args.embedding_name, source=args.embedding_source)
return token_embedding


def enforce_max_size(token_embedding, size):
assert token_embedding.idx_to_vec is not None
if size and len(token_embedding.idx_to_token) > size:
assert size > 0
size = size + 1 if token_embedding.unknown_token is not None else size
token_embedding._idx_to_token = token_embedding._idx_to_token[:size]
token_embedding._idx_to_vec = token_embedding._idx_to_vec[:size]
token_embedding._token_to_idx = {
@@ -181,44 +199,35 @@ def enforce_max_size(token_embedding, size):

# Load pre-trained embeddings
if not args_.embedding_path:
if args_.embedding_name.lower() == 'fasttext':
token_embedding_ = nlp.embedding.create(
args_.embedding_name,
source=args_.embedding_source,
load_ngrams=args_.fasttext_load_ngrams,
allow_extend=True,
unknown_autoextend=True)
else:
token_embedding_ = nlp.embedding.create(
args_.embedding_name, source=args_.embedding_source)
token_embedding_ = load_embedding_from_gluonnlp(args_)
name = '-' + args_.embedding_name + '-' + args_.embedding_source
else:
token_embedding_ = load_embedding_from_path(args_)
name = ''

enforce_max_size(token_embedding_, args_.max_vocab_size)
enforce_max_size(token_embedding_, args_.analogy_max_vocab_size)
known_tokens = set(token_embedding_.idx_to_token)
# Auto-extend token_embedding with unknown extra eval tokens
if token_embedding_.unknown_lookup is not None:
eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
# pylint: disable=pointless-statement
token_embedding_[[
t for t in eval_tokens - known_tokens
if t in token_embedding_.unknown_lookup
]]

if args_.max_vocab_size is not None and len(
token_embedding_.idx_to_token) > args_.max_vocab_size:
logging.warning('Computing embeddings for OOV words that occur '
'in the evaluation dataset lead to having '
'more words than --max-vocab-size. '
'Have %s words (--max-vocab-size %s)',
len(token_embedding_.idx_to_token),
args_.max_vocab_size)

similarity_results = evaluation.evaluate_similarity(
args_, token_embedding_, ctx, logfile=os.path.join(
args_.logdir, 'similarity{}.tsv'.format(name)))
analogy_results = evaluation.evaluate_analogy(
args_, token_embedding_, ctx, logfile=os.path.join(
args_.logdir, 'analogy{}.tsv'.format(name)))

if args_.similarity_datasets:
with utils.print_time('find relevant tokens for similarity'):
tokens = evaluation.get_similarity_task_tokens(args_)
vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
with utils.print_time('set {} embeddings'.format(len(tokens))):
vocab.set_embedding(token_embedding_)
evaluation.evaluate_similarity(
args_, vocab.embedding, ctx, logfile=os.path.join(
args_.logdir, 'similarity{}.tsv'.format(name)))
if args_.analogy_datasets:
with utils.print_time('extend open vocabulary with '
'OOV tokens for analogy'):
tokens = evaluation.get_analogy_task_tokens(args_)
if token_embedding_.unknown_token is not None:
tokens.update(token_embedding_.idx_to_token[1:])
else:
tokens.update(token_embedding_.idx_to_token)
vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
with utils.print_time('set {} embeddings'.format(len(tokens))):
vocab.set_embedding(token_embedding_)
evaluation.evaluate_analogy(
args_, vocab.embedding, ctx, logfile=os.path.join(
args_.logdir, 'analogy{}.tsv'.format(name)))
12 changes: 11 additions & 1 deletion scripts/word_embeddings/evaluation.py
@@ -117,18 +117,28 @@ def iterate_analogy_datasets(args):
yield dataset_name, kwargs, nlp.data.create(dataset_name, **kwargs)


def get_tokens_in_evaluation_datasets(args):
def get_similarity_task_tokens(args):
"""Returns a set of all tokens occurring the evaluation datasets."""
tokens = set()
for _, _, dataset in iterate_similarity_datasets(args):
tokens.update(
itertools.chain.from_iterable((d[0], d[1]) for d in dataset))
return tokens


def get_analogy_task_tokens(args):
"""Returns a set of all tokens occuring the evaluation datasets."""
tokens = set()
for _, _, dataset in iterate_analogy_datasets(args):
tokens.update(
itertools.chain.from_iterable(
(d[0], d[1], d[2], d[3]) for d in dataset))
return tokens


def get_tokens_in_evaluation_datasets(args):
tokens = get_similarity_task_tokens(args)
tokens.update(get_analogy_task_tokens(args))
return tokens


70 changes: 26 additions & 44 deletions src/gluonnlp/embedding/token_embedding.py
@@ -172,24 +172,19 @@ class TokenEmbedding(object):
via token_embedding[tokens] = vecs. If False, only vectors for known
tokens can be updated.
unknown_lookup : object subscriptable with list of tokens returning nd.NDarray, default None
If not None, unknown_lookup[tokens] is called for any unknown tokens.
The result is cached if unknown_autoextend is True.
unknown_autoextend : bool, default True
If True, any unknown token for which a vector was looked up in
unknown_lookup together with the resulting vector will be added to
token_to_idx, idx_to_token and idx_to_vec, adding a new index. This
option is ignored if allow_extend is False.
If not None, the TokenEmbedding obtains embeddings for unknown tokens
automatically from `unknown_lookup[unknown_tokens]`. For example, in a
FastText model, embeddings for unknown tokens can be computed from the
subword information.
"""

def __init__(self, unknown_token='<unk>', init_unknown_vec=nd.zeros,
allow_extend=False, unknown_lookup=None,
unknown_autoextend=True):
allow_extend=False, unknown_lookup=None):
self._unknown_token = unknown_token
self._init_unknown_vec = init_unknown_vec
self._allow_extend = allow_extend
self._unknown_lookup = unknown_lookup
self._unknown_autoextend = unknown_autoextend
self._idx_to_token = [unknown_token] if unknown_token else []
if unknown_token:
self._token_to_idx = DefaultLookupDict(C.UNK_IDX)
@@ -457,8 +452,8 @@ def allow_extend(self):
def unknown_lookup(self):
"""Vector lookup for unknown tokens.
If not None, unknown_lookup[tokens] is called for any unknown tokens.
The result is cached if unknown_autoextend is True.
If not None, unknown_lookup[tokens] is automatically called for any
unknown tokens.
Returns
-------
@@ -473,7 +468,6 @@ def unknown_lookup(self, unknown_lookup):
"""Vector lookup for unknown tokens.
If not None, unknown_lookup[tokens] is called for any unknown tokens.
The result is cached if unknown_autoextend is True.
Parameters
----------
@@ -483,23 +477,6 @@ def unknown_lookup(self, unknown_lookup):
"""
self._unknown_lookup = unknown_lookup

@property
def unknown_autoextend(self):
"""Autoextension behavior for unknown token lookup.
If True, any unknown token for which a vector was looked up in
unknown_lookup together with the resulting vector will be added to
token_to_idx, idx_to_token and idx_to_vec, adding a new index. Applies
only if unknown_lookup is not None.
Returns
-------
bool
Autoextension behavior
"""

return self._unknown_autoextend

def __contains__(self, token):
"""Check if token is known.
@@ -552,19 +529,20 @@ def __getitem__(self, tokens):
if to_reduce:
tokens = [tokens]

if self.unknown_lookup is not None and (not self.allow_extend
or not self.unknown_autoextend):
vecs = [
self.idx_to_vec[self.token_to_idx[token]]
if token in self.token_to_idx else self.unknown_lookup[token]
for token in tokens
]
vecs = nd.stack(*vecs, axis=0)
if self.unknown_lookup is not None:
if self.idx_to_vec is None:
# May raise KeyError, but we cannot fallback to idx_to_vec's
# unknown vector, as idx_to_vec has not been initialized yet.
# Cannot initialize it, as we don't know the dimension.
vecs = self.unknown_lookup[tokens]
else:
vecs = [
self.idx_to_vec[self.token_to_idx[token]] if
(token in self.token_to_idx
or token not in self.unknown_lookup) else
self.unknown_lookup[token] for token in tokens]
vecs = nd.stack(*vecs, axis=0)
else:
if self.unknown_lookup is not None and self.allow_extend and self.unknown_autoextend:
new_tokens = [t for t in tokens if t not in self.token_to_idx]
self[new_tokens] = self.unknown_lookup[new_tokens]

indices = [self._token_to_idx[token] for token in tokens]
vecs = nd.Embedding(
nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0],
@@ -612,10 +590,14 @@ def __setitem__(self, tokens, new_embedding):
the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list
of multiple strings, it must be 2-D.
"""
if self.allow_extend and self._idx_to_vec is None:
if not isinstance(tokens, (list, tuple)):
tokens = [tokens]
if ((self.allow_extend or all(t in self.token_to_idx for t in tokens))
and self._idx_to_vec is None):
# Initialize self._idx_to_vec
assert C.UNK_IDX == 0
self._idx_to_vec = self._init_unknown_vec(shape=(1, new_embedding.shape[-1]))
self._idx_to_vec = self._init_unknown_vec(
shape=(1, new_embedding.shape[-1]))

tokens = self._check_vector_update(tokens, new_embedding)

Expand Down
7 changes: 5 additions & 2 deletions src/gluonnlp/vocab/vocab.py
@@ -297,6 +297,10 @@ def set_embedding(self, *embeddings):
assert isinstance(embs, emb.TokenEmbedding), \
'The argument `embeddings` must be an instance or a list of instances of ' \
'`gluonnlp.embedding.TokenEmbedding`.'
assert embs.idx_to_vec is not None, \
'For all specified `embeddings`, `embeddings.idx_to_vec` must be initialized. ' \
'Use e.g. `emb[emb.unknown_token] = nd.zeros(emsize)` to initialize, ' \
'where `emsize` is the desired embedding dimensionality.'

assert all([embs.unknown_token for embs in embeddings]) or \
all([not embs.unknown_token for embs in embeddings]), \
@@ -307,8 +311,7 @@ def set_embedding(self, *embeddings):
new_embedding._token_to_idx = self.token_to_idx
new_embedding._idx_to_token = self.idx_to_token

new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings
if embs and embs.idx_to_vec is not None)
new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings)
new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len))

col_start = 0

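For reference, a sketch of the pattern the new assertion expects, following what evaluate_pretrained.py does in this commit; the path and tokens are illustrative:

from mxnet import nd
import gluonnlp as nlp

# Load a fastText model and wrap it as an open-vocabulary TokenEmbedding,
# as in load_embedding_from_path above (the path is illustrative).
model = nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
    'lorem_ipsum.bin')
embedding = nlp.embedding.TokenEmbedding(
    unknown_token=None, unknown_lookup=model, allow_extend=True)

# idx_to_vec is still None here, so vocab.set_embedding(embedding) would
# trigger the new assertion. Initialize it first with a zero vector:
embedding['<unk>'] = nd.zeros(model.weight.shape[1])

# Vocabulary tokens unknown to `embedding` are filled in through
# unknown_lookup (fastText subwords) when the vectors are attached.
vocab = nlp.Vocab(nlp.data.count_tokens(['lorem', 'ipsum', 'unseenword']))
vocab.set_embedding(embedding)
print(vocab.embedding.idx_to_vec.shape)  # (len(vocab), embedding dimension)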