diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py
index c0b3f320b8..e23132dff2 100644
--- a/scripts/tests/test_scripts.py
+++ b/scripts/tests/test_scripts.py
@@ -77,21 +77,21 @@ def test_embedding_evaluate_pretrained(fasttextloadngrams):
 
 @pytest.mark.serial
 @pytest.mark.remote_required
-@pytest.mark.parametrize('evaluatanalogies', [True, False])
+@pytest.mark.parametrize('evaluateanalogies', [True, False])
 @pytest.mark.parametrize('maxvocabsize', [None, 16])
-def test_embedding_evaluate_from_path(evaluatanalogies, maxvocabsize):
+def test_embedding_evaluate_from_path(evaluateanalogies, maxvocabsize):
     path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     path = os.path.join(
         path, '../../tests/unittest/train/test_embedding/lorem_ipsum.bin')
     cmd = [
         'python', './scripts/word_embeddings/evaluate_pretrained.py',
         '--embedding-path', path]
-    if evaluatanalogies:
+    if evaluateanalogies:
         cmd += ['--analogy-datasets', 'GoogleAnalogyTestSet']
     else:
         cmd += ['--analogy-datasets']
     if maxvocabsize is not None:
-        cmd += ['--max-vocab-size', str(maxvocabsize)]
+        cmd += ['--analogy-max-vocab-size', str(maxvocabsize)]
     subprocess.check_call(cmd)
     time.sleep(5)
 
diff --git a/scripts/word_embeddings/evaluate_pretrained.py b/scripts/word_embeddings/evaluate_pretrained.py
index b3a73a957a..b430511445 100644
--- a/scripts/word_embeddings/evaluate_pretrained.py
+++ b/scripts/word_embeddings/evaluate_pretrained.py
@@ -30,6 +30,8 @@
 import os
 import sys
 
+import mxnet as mx
+
 import evaluation
 import gluonnlp as nlp
 import utils
@@ -61,7 +63,7 @@ def get_args():
         help=('Specify load_ngrams=True '
               'when loading pretrained fastText embedding.'))
     group.add_argument(
-        '--max-vocab-size', type=int, default=None,
+        '--analogy-max-vocab-size', type=int, default=None,
         help=('Only retain the X first tokens from the pre-trained embedding. '
               'The tokens are ordered by decreasing frequency.'
              'As the analogy task takes the whole vocabulary into account, '
@@ -131,37 +133,53 @@ def validate_args(args):
 
 def load_embedding_from_path(args):
     """Load a TokenEmbedding."""
-    if 'bin' in args.embedding_path:
+    if args.embedding_path.endswith('.bin'):
        with utils.print_time('load fastText model.'):
             model = \
                 nlp.model.train.FasttextEmbeddingModel.load_fasttext_format(
                     args.embedding_path)
+        idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
         embedding = nlp.embedding.TokenEmbedding(
-            unknown_token=None, unknown_lookup=model, allow_extend=True,
-            unknown_autoextend=True)
 
-        idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
+            unknown_token=None, unknown_lookup=model, allow_extend=True)
+
+        # Analogy task is open-vocabulary, so must keep all known words.
+        # But if not evaluating analogies, there is no need to precompute
+        # vectors now, as all words needed for the closed-vocabulary tasks
+        # can be obtained via the unknown lookup.
         if not args.analogy_datasets:
-            # Prune tokens not used in evaluation datasets
-            eval_tokens_ = set(
-                evaluation.get_tokens_in_evaluation_datasets(args))
-            idx_to_token = [t for t in idx_to_token if t in eval_tokens_]
-        if args.max_vocab_size:
-            idx_to_token = idx_to_token[:args.max_vocab_size]
-
-        with utils.print_time('compute vectors from subwords '
-                              'for {} words.'.format(len(idx_to_token))):
-            embedding[idx_to_token] = model[idx_to_token]
-
+            idx_to_token = []
+        elif args.analogy_datasets and args.analogy_max_vocab_size:
+            idx_to_token = idx_to_token[:args.analogy_max_vocab_size]
+
+        embedding['<unk>'] = mx.nd.zeros(model.weight.shape[1])
+        if idx_to_token:
+            with utils.print_time('compute vectors for {} known '
+                                  'words.'.format(len(idx_to_token))):
+                embedding[idx_to_token] = model[idx_to_token]
     else:
         embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
     return embedding
 
 
+def load_embedding_from_gluonnlp(args):
+    if args.embedding_name.lower() == 'fasttext':
+        token_embedding = nlp.embedding.create(
+            args.embedding_name,
+            source=args.embedding_source,
+            load_ngrams=args.fasttext_load_ngrams)
+    else:
+        token_embedding = nlp.embedding.create(
+            args.embedding_name, source=args.embedding_source)
+    return token_embedding
+
+
 def enforce_max_size(token_embedding, size):
+    assert token_embedding.idx_to_vec is not None
     if size and len(token_embedding.idx_to_token) > size:
+        assert size > 0
+        size = size + 1 if token_embedding.unknown_token is not None else size
         token_embedding._idx_to_token = token_embedding._idx_to_token[:size]
         token_embedding._idx_to_vec = token_embedding._idx_to_vec[:size]
         token_embedding._token_to_idx = {
@@ -181,44 +199,35 @@ def enforce_max_size(token_embedding, size):
 
     # Load pre-trained embeddings
     if not args_.embedding_path:
-        if args_.embedding_name.lower() == 'fasttext':
-            token_embedding_ = nlp.embedding.create(
-                args_.embedding_name,
-                source=args_.embedding_source,
-                load_ngrams=args_.fasttext_load_ngrams,
-                allow_extend=True,
-                unknown_autoextend=True)
-        else:
-            token_embedding_ = nlp.embedding.create(
-                args_.embedding_name, source=args_.embedding_source)
+        token_embedding_ = load_embedding_from_gluonnlp(args_)
         name = '-' + args_.embedding_name + '-' + args_.embedding_source
     else:
         token_embedding_ = load_embedding_from_path(args_)
         name = ''
 
-    enforce_max_size(token_embedding_, args_.max_vocab_size)
+    enforce_max_size(token_embedding_, args_.analogy_max_vocab_size)
     known_tokens = set(token_embedding_.idx_to_token)
-    # Auto-extend token_embedding with unknown extra eval tokens
-    if token_embedding_.unknown_lookup is not None:
-        eval_tokens = evaluation.get_tokens_in_evaluation_datasets(args_)
-        # pylint: disable=pointless-statement
-        token_embedding_[[
-            t for t in eval_tokens - known_tokens
-            if t in token_embedding_.unknown_lookup
-        ]]
-
-    if args_.max_vocab_size is not None and len(
-            token_embedding_.idx_to_token) > args_.max_vocab_size:
-        logging.warning('Computing embeddings for OOV words that occur '
-                        'in the evaluation dataset lead to having '
-                        'more words than --max-vocab-size. '
-                        'Have %s words (--max-vocab-size %s)',
-                        len(token_embedding_.idx_to_token),
-                        args_.max_vocab_size)
-
-    similarity_results = evaluation.evaluate_similarity(
-        args_, token_embedding_, ctx, logfile=os.path.join(
-            args_.logdir, 'similarity{}.tsv'.format(name)))
-    analogy_results = evaluation.evaluate_analogy(
-        args_, token_embedding_, ctx, logfile=os.path.join(
-            args_.logdir, 'analogy{}.tsv'.format(name)))
+
+    if args_.similarity_datasets:
+        with utils.print_time('find relevant tokens for similarity'):
+            tokens = evaluation.get_similarity_task_tokens(args_)
+        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
+        with utils.print_time('set {} embeddings'.format(len(tokens))):
+            vocab.set_embedding(token_embedding_)
+        evaluation.evaluate_similarity(
+            args_, vocab.embedding, ctx, logfile=os.path.join(
+                args_.logdir, 'similarity{}.tsv'.format(name)))
+    if args_.analogy_datasets:
+        with utils.print_time('extend open vocabulary with '
+                              'OOV tokens for analogy'):
+            tokens = evaluation.get_analogy_task_tokens(args_)
+            if token_embedding_.unknown_token is not None:
+                tokens.update(token_embedding_.idx_to_token[1:])
+            else:
+                tokens.update(token_embedding_.idx_to_token)
+        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
+        with utils.print_time('set {} embeddings'.format(len(tokens))):
+            vocab.set_embedding(token_embedding_)
+        evaluation.evaluate_analogy(
+            args_, vocab.embedding, ctx, logfile=os.path.join(
+                args_.logdir, 'analogy{}.tsv'.format(name)))
diff --git a/scripts/word_embeddings/evaluation.py b/scripts/word_embeddings/evaluation.py
index 737e592496..516e49eaf1 100644
--- a/scripts/word_embeddings/evaluation.py
+++ b/scripts/word_embeddings/evaluation.py
@@ -117,18 +117,28 @@ def iterate_analogy_datasets(args):
         yield dataset_name, kwargs, nlp.data.create(dataset_name, **kwargs)
 
 
-def get_tokens_in_evaluation_datasets(args):
+def get_similarity_task_tokens(args):
     """Returns a set of all tokens occurring the evaluation datasets."""
     tokens = set()
     for _, _, dataset in iterate_similarity_datasets(args):
         tokens.update(
             itertools.chain.from_iterable((d[0], d[1]) for d in dataset))
+    return tokens
+
+def get_analogy_task_tokens(args):
+    """Returns a set of all tokens occurring in the evaluation datasets."""
+    tokens = set()
     for _, _, dataset in iterate_analogy_datasets(args):
         tokens.update(
             itertools.chain.from_iterable(
                 (d[0], d[1], d[2], d[3]) for d in dataset))
+    return tokens
+
+def get_tokens_in_evaluation_datasets(args):
+    tokens = get_similarity_task_tokens(args)
+    tokens.update(get_analogy_task_tokens(args))
     return tokens
diff --git a/src/gluonnlp/embedding/token_embedding.py b/src/gluonnlp/embedding/token_embedding.py
index e5d0d5fd14..f2b9c486d6 100644
--- a/src/gluonnlp/embedding/token_embedding.py
+++ b/src/gluonnlp/embedding/token_embedding.py
@@ -172,24 +172,19 @@ class TokenEmbedding(object):
         via token_embedding[tokens] = vecs. If False, only vectors for known
         tokens can be updated.
     unknown_lookup : object subscriptable with list of tokens returning nd.NDarray, default None
-        If not None, unknown_lookup[tokens] is called for any unknown tokens.
-        The result is cached if unknown_autoextend is True.
-    unknown_autoextend : bool, default True
-        If True, any unknown token for which a vector was looked up in
-        unknown_lookup together with the resulting vector will be added to
-        token_to_idx, idx_to_token and idx_to_vec, adding a new index. This
-        option is ignored if allow_extend is False.
+        If not None, the TokenEmbedding obtains embeddings for unknown tokens
+        automatically from `unknown_lookup[unknown_tokens]`. For example, in a
+        FastText model, embeddings for unknown tokens can be computed from the
+        subword information.
 
     """
 
     def __init__(self, unknown_token='<unk>', init_unknown_vec=nd.zeros,
-                 allow_extend=False, unknown_lookup=None,
-                 unknown_autoextend=True):
+                 allow_extend=False, unknown_lookup=None):
         self._unknown_token = unknown_token
         self._init_unknown_vec = init_unknown_vec
         self._allow_extend = allow_extend
         self._unknown_lookup = unknown_lookup
-        self._unknown_autoextend = unknown_autoextend
         self._idx_to_token = [unknown_token] if unknown_token else []
         if unknown_token:
             self._token_to_idx = DefaultLookupDict(C.UNK_IDX)
@@ -457,8 +452,8 @@ def allow_extend(self):
     def unknown_lookup(self):
         """Vector lookup for unknown tokens.
 
-        If not None, unknown_lookup[tokens] is called for any unknown tokens.
-        The result is cached if unknown_autoextend is True.
+        If not None, unknown_lookup[tokens] is automatically called for any
+        unknown tokens.
 
         Returns
         -------
@@ -473,7 +468,6 @@ def unknown_lookup(self, unknown_lookup):
         """Vector lookup for unknown tokens.
 
         If not None, unknown_lookup[tokens] is called for any unknown tokens.
-        The result is cached if unknown_autoextend is True.
 
         Parameters
         ----------
@@ -483,23 +477,6 @@ def unknown_lookup(self, unknown_lookup):
         """
         self._unknown_lookup = unknown_lookup
 
-    @property
-    def unknown_autoextend(self):
-        """Autoextension behavior for unknown token lookup.
-
-        If True, any unknown token for which a vector was looked up in
-        unknown_lookup together with the resulting vector will be added to
-        token_to_idx, idx_to_token and idx_to_vec, adding a new index. Applies
-        only if unknown_lookup is not None.
-
-        Returns
-        -------
-        bool
-            Autoextension behavior
-        """
-
-        return self._unknown_autoextend
-
     def __contains__(self, token):
         """Check if token is known.
@@ -552,19 +529,20 @@ def __getitem__(self, tokens):
         if to_reduce:
             tokens = [tokens]
 
-        if self.unknown_lookup is not None and (not self.allow_extend
-                                                or not self.unknown_autoextend):
-            vecs = [
-                self.idx_to_vec[self.token_to_idx[token]]
-                if token in self.token_to_idx else self.unknown_lookup[token]
-                for token in tokens
-            ]
-            vecs = nd.stack(*vecs, axis=0)
+        if self.unknown_lookup is not None:
+            if self.idx_to_vec is None:
+                # May raise KeyError, but we cannot fall back to idx_to_vec's
+                # unknown vector, as idx_to_vec has not been initialized yet.
+                # Cannot initialize it, as we don't know the dimension.
+                vecs = self.unknown_lookup[tokens]
+            else:
+                vecs = [
+                    self.idx_to_vec[self.token_to_idx[token]] if
+                    (token in self.token_to_idx
+                     or token not in self.unknown_lookup) else
+                    self.unknown_lookup[token] for token in tokens]
+                vecs = nd.stack(*vecs, axis=0)
         else:
-            if self.unknown_lookup is not None and self.allow_extend and self.unknown_autoextend:
-                new_tokens = [t for t in tokens if t not in self.token_to_idx]
-                self[new_tokens] = self.unknown_lookup[new_tokens]
-
             indices = [self._token_to_idx[token] for token in tokens]
             vecs = nd.Embedding(
                 nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0],
@@ -612,10 +590,14 @@ def __setitem__(self, tokens, new_embedding):
             the glossary. If `tokens` is a singleton, it must be 1-D or 2-D.
             If `tokens` is a list of multiple strings, it must be 2-D.
 
         """
-        if self.allow_extend and self._idx_to_vec is None:
+        if not isinstance(tokens, (list, tuple)):
+            tokens = [tokens]
+        if ((self.allow_extend or all(t in self.token_to_idx for t in tokens))
+                and self._idx_to_vec is None):
             # Initialize self._idx_to_vec
             assert C.UNK_IDX == 0
-            self._idx_to_vec = self._init_unknown_vec(shape=(1, new_embedding.shape[-1]))
+            self._idx_to_vec = self._init_unknown_vec(
+                shape=(1, new_embedding.shape[-1]))
 
         tokens = self._check_vector_update(tokens, new_embedding)
diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py
index 10b9e7fdd0..f3e26561c5 100644
--- a/src/gluonnlp/vocab/vocab.py
+++ b/src/gluonnlp/vocab/vocab.py
@@ -297,6 +297,10 @@ def set_embedding(self, *embeddings):
             assert isinstance(embs, emb.TokenEmbedding), \
                 'The argument `embeddings` must be an instance or a list of instances of ' \
                 '`gluonnlp.embedding.TokenEmbedding`.'
+            assert embs.idx_to_vec is not None, \
+                'For all specified `embeddings`, `embeddings.idx_to_vec` must be initialized. ' \
+                'Use e.g. `emb[emb.unknown_token] = nd.zeros(emsize)` to initialize, ' \
+                'where `emsize` is the desired embedding dimensionality.'
 
         assert all([embs.unknown_token for embs in embeddings]) or \
             all([not embs.unknown_token for embs in embeddings]), \
@@ -307,8 +311,7 @@ def set_embedding(self, *embeddings):
         new_embedding._token_to_idx = self.token_to_idx
         new_embedding._idx_to_token = self.idx_to_token
 
-        new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings
-                          if embs and embs.idx_to_vec is not None)
+        new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings)
         new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len))
 
         col_start = 0
diff --git a/tests/unittest/test_vocab_embed.py b/tests/unittest/test_vocab_embed.py
index 8c23d8237f..390d3d6512 100644
--- a/tests/unittest/test_vocab_embed.py
+++ b/tests/unittest/test_vocab_embed.py
@@ -762,6 +762,53 @@ def test_vocab_set_embedding_with_two_custom_embeddings(tmpdir, allow_extend):
     )
 
 
+@pytest.mark.parametrize('allow_extend', [True, False])
+@pytest.mark.parametrize('unknown_token', [True, False])
+@pytest.mark.parametrize('vocab_unknown_token', [True, False])
+@pytest.mark.parametrize('initialize', [True, False])
+def test_vocab_set_embedding_with_subword_lookup_only_token_embedding(
+        allow_extend, unknown_token, vocab_unknown_token, initialize):
+    embsize = 5
+
+    class NaiveLookup(object):
+        def __contains__(self, token):
+            return True
+
+        def __getitem__(self, tokens):
+            if isinstance(tokens, _str_types):
+                return nd.ones(embsize)
+            else:
+                return nd.ones((len(tokens), embsize))
+
+    c = nlp.data.utils.Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
+    v = nlp.Vocab(c, max_size=None, min_freq=1,
+                  unknown_token='<unk>' if vocab_unknown_token else None,
+                  padding_token='<pad>')
+
+    assert v.embedding is None
+
+    e = nlp.embedding.TokenEmbedding(
+        unknown_lookup=NaiveLookup(), allow_extend=allow_extend,
+        unknown_token='<unk>' if unknown_token else None)
+
+    if initialize and unknown_token:
+        e[e.unknown_token] = nd.zeros(embsize)
+    elif initialize and allow_extend:
+        e["hello"] = e.unknown_lookup["hello"]
+    else:  # Cannot initialize, even if initialize is True
+        with pytest.raises(AssertionError):
+            v.set_embedding(e)
+        return  # cannot test more
+
+    v.set_embedding(e)
+    assert v.embedding is not None
+    assert v.embedding.idx_to_vec is not None
+    assert v.embedding.idx_to_vec.shape == (len(v), embsize)
+
+    for t in c.keys():
+        assert np.all(np.isclose(1, v.embedding[t].asnumpy()))
+
+
 @pytest.mark.serial
 @pytest.mark.remote_required
 def test_download_embed():
@@ -862,41 +909,92 @@ def test_token_embedding_from_S3_fasttext_with_ngrams(load_ngrams):
 
 
 @pytest.mark.parametrize('setinconstructor', [True, False])
-def test_token_embedding_unknown_lookup(setinconstructor):
+@pytest.mark.parametrize('lookup', ['naive', 'incapable'])
+@pytest.mark.parametrize('initializetokenembedding', [True, False])
+@pytest.mark.parametrize('unknown_token', [True, False])
+@pytest.mark.parametrize('allow_extend', [True, False])
+def test_token_embedding_unknown_lookup(setinconstructor, lookup,
+                                        initializetokenembedding,
+                                        unknown_token, allow_extend, tmpdir):
     class NaiveLookup(object):
-        dim = 300
+        dim = 5  # Must match _mk_my_pretrain_file
+
+        def __contains__(self, token):
+            return True
 
         def __getitem__(self, tokens):
             if isinstance(tokens, _str_types):
-                return nd.zeros(self.dim)
+                return nd.ones(self.dim)
             else:
-                return nd.zeros((len(tokens), self.dim))
+                return nd.ones((len(tokens), self.dim))
+
+    class IncapableLookup(object):
+        def __contains__(self, token):
+            return False
+
+        def __getitem__(self, tokens):
+            raise KeyError
+
+    if initializetokenembedding:
+        # Load a TokenEmbedding with idx_to_vec already initialized
+        embed_root = str(tmpdir)
+        embed_name = 'my_embed'
+        elem_delim = '\t'
+        pretrain_file = 'my_pretrain_file.txt'
+        _mk_my_pretrain_file(
+            os.path.join(embed_root, embed_name), elem_delim, pretrain_file)
+        pretrain_file_path = os.path.join(embed_root, embed_name,
+                                          pretrain_file)
+        TokenEmbedding = functools.partial(
+            nlp.embedding.TokenEmbedding.from_file, pretrain_file_path,
+            elem_delim)
+    else:
+        TokenEmbedding = nlp.embedding.token_embedding.TokenEmbedding
+
+    Lookup = NaiveLookup if lookup == 'naive' else IncapableLookup
 
     if setinconstructor:
         TokEmb = functools.partial(
-            nlp.embedding.token_embedding.TokenEmbedding,
-            unknown_lookup=NaiveLookup())
+            TokenEmbedding, unknown_lookup=Lookup(), allow_extend=allow_extend,
+            unknown_token='<unk>' if unknown_token else None)
     else:
+
         def TokEmb(*args, **kwargs):
-            token_embedding = nlp.embedding.token_embedding.TokenEmbedding(
-                *args, **kwargs)
-            token_embedding.unknown_lookup = NaiveLookup()
+            token_embedding = TokenEmbedding(
+                allow_extend=allow_extend,
+                unknown_token='<unk>' if unknown_token else None, *args,
+                **kwargs)
+            token_embedding.unknown_lookup = Lookup()
             return token_embedding
 
-    token_embedding = TokEmb(unknown_autoextend=False)
-    assert 'hello' not in token_embedding.token_to_idx
-    assert np.all(np.isclose(0, token_embedding['hello'].asnumpy()))
-    assert 'hello' not in token_embedding.token_to_idx
-
-    token_embedding = TokEmb(unknown_autoextend=True, allow_extend=True)
-    assert 'hello' not in token_embedding.token_to_idx
-    assert np.all(np.isclose(0, token_embedding['hello'].asnumpy()))
-    assert 'hello' in token_embedding.token_to_idx
-
-    token_embedding = TokEmb(unknown_autoextend=True, allow_extend=False)
-    assert 'hello' not in token_embedding.token_to_idx
-    assert np.all(np.isclose(0, token_embedding['hello'].asnumpy()))
-    assert 'hello' not in token_embedding.token_to_idx
+    token_embedding = TokEmb()
+    if lookup == 'incapable' and not initializetokenembedding:
+        with pytest.raises(KeyError):
+            token_embedding['hello']
+    elif lookup == 'incapable' and initializetokenembedding and not unknown_token:
+        with pytest.raises(KeyError):
+            token_embedding['hello']
+    elif lookup == 'incapable' and initializetokenembedding and unknown_token:
+        assert 'hello' not in token_embedding.token_to_idx
+        assert np.all(np.isclose(0, token_embedding['hello'].asnumpy()))
+        assert 'hello' not in token_embedding.token_to_idx
+    elif lookup != 'naive':
+        raise RuntimeError('Invalid test parameterization.')
+    else:
+        assert 'hello' not in token_embedding.token_to_idx
+        assert np.all(np.isclose(1, token_embedding['hello'].asnumpy()))
+        assert 'hello' not in token_embedding.token_to_idx
+
+    if allow_extend:
+        token_embedding['hello'] = token_embedding.unknown_lookup['hello']
+        assert 'hello' in token_embedding.token_to_idx
+        assert np.all(np.isclose(1, token_embedding['hello'].asnumpy()))
+
+        token_embedding[['hello2', 'world']] = \
+            token_embedding.unknown_lookup[['hello2', 'world']]
+        assert 'hello2' in token_embedding.token_to_idx
+        assert 'world' in token_embedding.token_to_idx
+        assert np.all(np.isclose(1, token_embedding['hello2'].asnumpy()))
 
 
 @pytest.mark.parametrize('initializeidxtovecbyextending', [True, False])
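
For illustration only, not part of the diff above: a minimal sketch of the explicit-extension workflow that replaces unknown_autoextend, assuming a gluonnlp build containing these changes. The OnesLookup class and the embedding size of 5 are made up for the example; in practice the lookup would be e.g. a trained nlp.model.train.FasttextEmbeddingModel.

import mxnet as mx
import gluonnlp as nlp


class OnesLookup(object):
    # Stand-in subword lookup (hypothetical): claims to know every token.
    def __contains__(self, token):
        return True

    def __getitem__(self, tokens):
        if isinstance(tokens, str):
            return mx.nd.ones(5)
        return mx.nd.ones((len(tokens), 5))


emb = nlp.embedding.TokenEmbedding(
    unknown_token=None, unknown_lookup=OnesLookup(), allow_extend=True)

# Unknown tokens are no longer cached automatically; extending is explicit.
# This first assignment also initializes idx_to_vec, which Vocab.set_embedding
# now asserts on.
emb['hello'] = emb.unknown_lookup['hello']

vocab = nlp.Vocab(nlp.data.count_tokens(['hello', 'world']))
vocab.set_embedding(emb)  # tokens not stored in emb resolve via unknown_lookup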