From 651463a3e865b2f34b94e45586653e09072d85d8 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 13:56:03 +0200 Subject: [PATCH 01/30] Add option to use fast HF tokenizer --- farm/infer.py | 21 ++++++++++++++++++--- farm/modeling/tokenization.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/farm/infer.py b/farm/infer.py index 6a4399ad6..180e6a577 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -147,8 +147,10 @@ def load( extraction_strategy=None, s3e_stats=None, num_processes=None, - disable_tqdm=False - + disable_tqdm=False, + tokenizer_class=None, + use_fast=False, + tokenizer_args=None, ): """ Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by @@ -191,9 +193,18 @@ def load( :type num_processes: int :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing) :type disable_tqdm: bool + :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`) + :type tokenizer_class: str + :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or + use the Python one (False). + :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method. + :type tokenizer_args: dict + :type use_fast: bool :return: An instance of the Inferencer. """ + if tokenizer_args is None: + tokenizer_args = {} device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None) name = os.path.basename(model_name_or_path) @@ -221,7 +232,11 @@ def load( model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type) config = AutoConfig.from_pretrained(model_name_or_path) - tokenizer = Tokenizer.load(model_name_or_path) + tokenizer = Tokenizer.load(model_name_or_path, + tokenizer_class=tokenizer_class, + use_fast=use_fast, + **tokenizer_args, + ) # TODO infer task_type automatically from config (if possible) if task_type == "question_answering": diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index e0b125cb1..c5c36df8d 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -24,10 +24,10 @@ import numpy as np from transformers.tokenization_albert import AlbertTokenizer -from transformers.tokenization_bert import BertTokenizer, load_vocab -from transformers.tokenization_distilbert import DistilBertTokenizer -from transformers.tokenization_electra import ElectraTokenizer -from transformers.tokenization_roberta import RobertaTokenizer +from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast, load_vocab +from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from transformers.tokenization_electra import ElectraTokenizer, ElectraTokenizerFast +from transformers.tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer @@ -48,7 +48,7 @@ class Tokenizer: """ @classmethod - def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): + def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs): """ Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from `pretrained_model_name_or_path` or define it manually via `tokenizer_class`. 
@@ -57,6 +57,9 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): :type pretrained_model_name_or_path: str :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`) :type tokenizer_class: str + :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or + use the Python one (False). + :type use_fast: bool :param kwargs: :return: Tokenizer """ @@ -98,15 +101,27 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): elif tokenizer_class == "XLMRobertaTokenizer": ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "RobertaTokenizer": - ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "DistilBertTokenizer": - ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "BertTokenizer": - ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "XLNetTokenizer": ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs) elif tokenizer_class == "ElectraTokenizer": - ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "EmbeddingTokenizer": ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "CamembertTokenizer": From a433483ed80817f0fea298d49d819e4db3833e16 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 14:14:54 +0200 Subject: [PATCH 02/30] Hand merge tests from PR #205 --- test/test_tokenization.py | 65 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index ec73773c1..29977577f 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -90,6 +90,45 @@ def test_truncate_sequences(caplog): assert len(trunc_a) + len(trunc_b) + tokenizer.num_special_tokens_to_add(pair=True) == max_seq_len +def test_fast_tokenizer(caplog): + fast_tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=True) + tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=False) + + texts = [ + "This is a sentence", + "Der entscheidende Pass", + "This is a sentence with multiple spaces", + "力加勝北区ᴵᴺᵀᵃছজটডণত", + "Thiso text is included tolod makelio sure Unicodeel is handled properly:", + "This is a sentence...", + "Let's see all on this text and. !23# neverseenwordspossible", + """This is a sentence. 
+ With linebreak""", + """Sentence with multiple + + + newlines + """, + "and another one\n\n\nwithout space", + "This is a sentence with tab", + "This is a sentence with multiple tabs", + ] + for text in texts: + + # plain tokenize function + tokenized = tokenizer.tokenize(text) + fast_tokenized = fast_tokenizer.tokenize(text) + + assert tokenized == fast_tokenized + + # our tokenizer with metadata on "whitespace tokenized words" + tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer) + fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer) + + # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words" + assert tokenized_meta == fast_tokenized_meta, f"Failed using {tokenizer.__class__.__name__}" + + def test_all_tokenizer_on_special_cases(caplog): caplog.set_level(logging.CRITICAL) @@ -173,5 +212,31 @@ def test_bert_custom_vocab(caplog): assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] +def test_fast_bert_custom_vocab(caplog): + caplog.set_level(logging.CRITICAL) + + lang_model = "bert-base-cased" + + tokenizer = Tokenizer.load( + pretrained_model_name_or_path=lang_model, + do_lower_case=False, use_fast=True + ) + + #deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") + tokenizer.add_tokens(new_tokens=["neverseentokens"]) + + basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" + + # original tokenizer from transformer repo + tokenized = tokenizer.tokenize(basic_text) + assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars'] + + # ours with metadata + tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer) + assert tokenized_meta["tokens"] == tokenized + assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] + assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From c20c5db7f2e62a129f4ccca6f6822fb52aee7b44 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 17:44:18 +0200 Subject: [PATCH 03/30] test_inferencer_with_fast_bert_tokenizer --- test/test_inference.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_inference.py b/test/test_inference.py index afefc8644..39647cfb0 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -1,5 +1,7 @@ import pytest import numpy as np +import transformers + from farm.infer import Inferencer @@ -95,5 +97,12 @@ def test_embeddings_extraction(num_processes): assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist'] assert np.isclose(result[0]["vec"][0], 1.50174605e-02) + +def test_inferencer_with_fast_bert_tokenizer(): + model = Inferencer.load("bert-base-german-cased", task_type='text_classification', use_fast=True) + tokenizer = model.processor.tokenizer + assert type(tokenizer) is transformers.tokenization_bert.BertTokenizerFast + + if __name__ == "__main__": test_embeddings_extraction() From 5f2b5ee70a7a95300963ec02cd7ab12c65a4aec6 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: 
Sat, 1 Aug 2020 17:46:34 +0200 Subject: [PATCH 04/30] test_fast_bert_tokenizer --- test/test_tokenization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 29977577f..254c0b2a9 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -1,6 +1,6 @@ import logging from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences -from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer +from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer import re @@ -238,5 +238,12 @@ def test_fast_bert_custom_vocab(caplog): assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] +def test_fast_bert_tokenizer(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True) + assert type(tokenizer) is BertTokenizerFast + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From fa3bd679713836b53823e6cbeae32eef59b24d2d Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 18:17:42 +0200 Subject: [PATCH 05/30] test_fast_bert_tokenizer_strip_accents --- test/test_tokenization.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 254c0b2a9..1ad4e41ac 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -245,5 +245,16 @@ def test_fast_bert_tokenizer(caplog): assert type(tokenizer) is BertTokenizerFast +def test_fast_bert_tokenizer_strip_accents(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("dbmdz/bert-base-german-uncased", + use_fast=True, + strip_accents=False) + assert type(tokenizer) is BertTokenizerFast + assert tokenizer._tokenizer._parameters['strip_accents'] is False + assert tokenizer._tokenizer._parameters['lowercase'] + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From cd7298ce27b708075a1d0e001dc2a5a816d6e16e Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 18:31:05 +0200 Subject: [PATCH 06/30] test_fast_electra_tokenizer --- test/test_tokenization.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 1ad4e41ac..4ca3b5a4d 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -1,6 +1,8 @@ import logging from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer +from transformers import ElectraTokenizerFast + import re @@ -256,5 +258,13 @@ def test_fast_bert_tokenizer_strip_accents(caplog): assert tokenizer._tokenizer._parameters['lowercase'] +def test_fast_electra_tokenizer(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("dbmdz/electra-base-german-europeana-cased-discriminator", + use_fast=True) + assert type(tokenizer) is ElectraTokenizerFast + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From 01e5ffb9c0d514e7ecdc2379ee1ebc66beb3e4d9 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 19:09:02 +0200 Subject: [PATCH 07/30] Fix OOM issue of CI - set num_processes=0 for Inferencer --- test/test_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_inference.py 
b/test/test_inference.py
index 39647cfb0..c05c4211d 100644
--- a/test/test_inference.py
+++ b/test/test_inference.py
@@ -99,7 +99,8 @@

 def test_inferencer_with_fast_bert_tokenizer():
-    model = Inferencer.load("bert-base-german-cased", task_type='text_classification', use_fast=True)
+    model = Inferencer.load("bert-base-german-cased", task_type='text_classification',
+                            use_fast=True, num_processes=0)
     tokenizer = model.processor.tokenizer
     assert type(tokenizer) is transformers.tokenization_bert.BertTokenizerFast

From 42f345f370f5b78836322baf192537b009e1c422 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sun, 2 Aug 2020 14:31:22 +0200
Subject: [PATCH 08/30] Extend test for fast tokenizer - electra - roberta

---
 test/test_tokenization.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index 4ca3b5a4d..f17c46b04 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -1,9 +1,10 @@
 import logging
-from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences
+import pytest
+import re
 from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
-from transformers import ElectraTokenizerFast
+from transformers import ElectraTokenizerFast, RobertaTokenizerFast

-import re
+from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences

 def test_basic_loading(caplog):
@@ -240,11 +241,16 @@ def test_fast_bert_custom_vocab(caplog):
     assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]

-def test_fast_bert_tokenizer(caplog):
+@pytest.mark.parametrize("model_name, tokenizer_type", [
+    ("bert-base-german-cased", BertTokenizerFast),
+    ("google/electra-small-discriminator", ElectraTokenizerFast),
+    ("distilroberta-base", RobertaTokenizerFast),
+    ])
+def test_fast_tokenizer_type(caplog, model_name, tokenizer_type):
     caplog.set_level(logging.CRITICAL)

-    tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True)
-    assert type(tokenizer) is BertTokenizerFast
+    tokenizer = Tokenizer.load(model_name, use_fast=True)
+    assert type(tokenizer) is tokenizer_type

 def test_fast_bert_tokenizer_strip_accents(caplog):

From 9b021ff36e86cb7e4b12c4fd45a112e27599e3f4 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sun, 2 Aug 2020 14:44:58 +0200
Subject: [PATCH 09/30] test_fast_tokenizer for more model types - electra - roberta

---
 test/test_tokenization.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index f17c46b04..2bfb8a3b9 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -93,9 +93,13 @@ def test_truncate_sequences(caplog):
     assert len(trunc_a) + len(trunc_b) + tokenizer.num_special_tokens_to_add(pair=True) == max_seq_len

-def test_fast_tokenizer(caplog):
-    fast_tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=True)
-    tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=False)
+@pytest.mark.parametrize("model_name", ["bert-base-german-cased",
+                                        "google/electra-small-discriminator",
+                                        "distilroberta-base",
+                                        ])
+def test_fast_tokenizer(caplog, model_name):
+    fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
+    tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
texts = [ "This is a sentence", From 86d7fd57558ee8aac0940615f99a949e9ab67ed1 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 15:19:55 +0200 Subject: [PATCH 10/30] Fix tokenize_with_metadata --- farm/modeling/tokenization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index c5c36df8d..c3a578f47 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -309,7 +309,7 @@ def _words_to_tokens(words, word_offsets, tokenizer): elif len(tokens) == 0: tokens_word = tokenizer.tokenize(w) else: - if type(tokenizer) == RobertaTokenizer: + if (type(tokenizer) == RobertaTokenizer) or (type(tokenizer) == RobertaTokenizerFast): tokens_word = tokenizer.tokenize(w, add_prefix_space=True) else: tokens_word = tokenizer.tokenize(w) From a8f4638984422a09410f13a37133b58dd9ba1a40 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 19:15:33 +0200 Subject: [PATCH 11/30] Split tokenizer tests --- test/test_tokenization.py | 51 ++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 2bfb8a3b9..14a1fbdcc 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -7,6 +7,27 @@ from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences +TEXTS = [ + "This is a sentence", + "Der entscheidende Pass", + "This is a sentence with multiple spaces", + "力加勝北区ᴵᴺᵀᵃছজটডণত", + "Thiso text is included tolod makelio sure Unicodeel is handled properly:", + "This is a sentence...", + "Let's see all on this text and. !23# neverseenwordspossible", + """This is a sentence. + With linebreak""", + """Sentence with multiple + + + newlines + """, + "and another one\n\n\nwithout space", + "This is a sentence with tab", + "This is a sentence with multiple tabs", +] + + def test_basic_loading(caplog): caplog.set_level(logging.CRITICAL) tokenizer = Tokenizer.load( @@ -97,37 +118,23 @@ def test_truncate_sequences(caplog): "google/electra-small-discriminator", "distilroberta-base", ]) -def test_fast_tokenizer(caplog, model_name): +def test_fast_tokenizer_with_examples(caplog, model_name): fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) - texts = [ - "This is a sentence", - "Der entscheidende Pass", - "This is a sentence with multiple spaces", - "力加勝北区ᴵᴺᵀᵃছজটডণত", - "Thiso text is included tolod makelio sure Unicodeel is handled properly:", - "This is a sentence...", - "Let's see all on this text and. !23# neverseenwordspossible", - """This is a sentence. 
- With linebreak""", - """Sentence with multiple - - - newlines - """, - "and another one\n\n\nwithout space", - "This is a sentence with tab", - "This is a sentence with multiple tabs", - ] - for text in texts: - + for text in TEXTS: # plain tokenize function tokenized = tokenizer.tokenize(text) fast_tokenized = fast_tokenizer.tokenize(text) assert tokenized == fast_tokenized + +def test_fast_tokenizer_with_metadata_with_examples_(caplog, model_name): + fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) + tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) + + for text in TEXTS: # our tokenizer with metadata on "whitespace tokenized words" tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer) fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer) From cdccafaf964c20d0cfd8bea6cc1a02bc4106afa4 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 19:25:29 +0200 Subject: [PATCH 12/30] Fix pytest params bug in test_tok --- test/test_tokenization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 14a1fbdcc..4685cde6e 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -130,7 +130,11 @@ def test_fast_tokenizer_with_examples(caplog, model_name): assert tokenized == fast_tokenized -def test_fast_tokenizer_with_metadata_with_examples_(caplog, model_name): +@pytest.mark.parametrize("model_name", ["bert-base-german-cased", + "google/electra-small-discriminator", + "distilroberta-base", + ]) +def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name): fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) From 47d4b6abfd266a56129f7c406d9ae79ac7b7b516 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Tue, 4 Aug 2020 09:53:54 +0200 Subject: [PATCH 13/30] Fix fast tokenizer usage --- farm/data_handler/input_features.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 0b8119b5b..aedf2ee0d 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -8,6 +8,8 @@ from dotmap import DotMap import numpy as np +from transformers.tokenization_utils_base import TruncationStrategy + from farm.data_handler.samples import Sample from farm.data_handler.utils import ( expand_labels, @@ -45,8 +47,10 @@ def sample_to_features_text( tokens_a, tokens_b, add_special_tokens=True, - truncation_strategy='do_not_truncate', - return_token_type_ids=True + truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, + return_token_type_ids=True, + max_length=max_seq_len, + is_pretokenized=True, ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -539,4 +543,4 @@ def _SQUAD_improve_answer_span( if text_span == tok_answer_text: return (new_start, new_end) - return (input_start, input_end) + return (input_start, input_end) \ No newline at end of file From 83180631f295362b39ad2875ebb73a1f072d05f7 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Tue, 4 Aug 2020 09:55:58 +0200 Subject: [PATCH 14/30] add missing newline eof --- farm/data_handler/input_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index aedf2ee0d..7afe9da6a 100644 --- a/farm/data_handler/input_features.py 
+++ b/farm/data_handler/input_features.py
@@ -543,4 +543,4 @@
     if text_span == tok_answer_text:
         return (new_start, new_end)

-    return (input_start, input_end)
\ No newline at end of file
+    return (input_start, input_end)

From 8c61e3b5c357b994be1c7b0eb472c03e538a1577 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 10:02:00 +0200
Subject: [PATCH 15/30] Add test fast tok. doc_classif.

---
 test/test_doc_classification.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_doc_classification.py b/test/test_doc_classification.py
index 23a4dbea0..ec54fe306 100644
--- a/test/test_doc_classification.py
+++ b/test/test_doc_classification.py
@@ -19,7 +19,8 @@
 @pytest.mark.parametrize("data_dir_path,text_column_name",
                          [("samples/doc_class", None),
                           ("samples/doc_class_other_text_column_name", "text_other")])
-def test_doc_classification(data_dir_path, text_column_name, caplog=None):
+@pytest.mark.parametrize("use_fast", [False, True])
+def test_doc_classification(data_dir_path, text_column_name, use_fast, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)

@@ -32,7 +33,9 @@
     tokenizer = Tokenizer.load(
         pretrained_model_name_or_path=lang_model,
-        do_lower_case=False)
+        do_lower_case=False,
+        use_fast=use_fast,
+    )

     tcp_params = dict(tokenizer=tokenizer,
                       max_seq_len=8,

From aec7d2d24a969155476649d0248ea5eac19b1eb8 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 13:29:12 +0200
Subject: [PATCH 16/30] Remove RobertaTokenizerFast

---
 farm/modeling/tokenization.py | 7 ++++---
 test/test_tokenization.py | 5 +----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index c3a578f47..8c1d646dc 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -27,7 +27,7 @@
 from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast, load_vocab
 from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
 from transformers.tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
-from transformers.tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
+from transformers.tokenization_roberta import RobertaTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
 from transformers.tokenization_xlnet import XLNetTokenizer
@@ -59,6 +59,7 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         :type tokenizer_class: str
         :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
             use the Python one (False).
+            TODO: Say which models support fast tokenizers.
         :type use_fast: bool
         :param kwargs:
         :return: Tokenizer
@@ -102,7 +103,7 @@
             ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "RobertaTokenizer":
             if use_fast:
-                ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+                raise ValueError('RobertaTokenizerFast is not supported!')
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "DistilBertTokenizer":
@@ -309,7 +310,7 @@ def _words_to_tokens(words, word_offsets, tokenizer):
         elif len(tokens) == 0:
             tokens_word = tokenizer.tokenize(w)
         else:
-            if (type(tokenizer) == RobertaTokenizer) or (type(tokenizer) == RobertaTokenizerFast):
+            if type(tokenizer) == RobertaTokenizer:
                 tokens_word = tokenizer.tokenize(w, add_prefix_space=True)
             else:
                 tokens_word = tokenizer.tokenize(w)
diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index 4685cde6e..c8959e8f1 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -2,7 +2,7 @@
 import pytest
 import re
 from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
-from transformers import ElectraTokenizerFast, RobertaTokenizerFast
+from transformers import ElectraTokenizerFast

 from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences

@@ -116,7 +116,6 @@
 @pytest.mark.parametrize("model_name", ["bert-base-german-cased",
                                         "google/electra-small-discriminator",
-                                        "distilroberta-base",
                                         ])
 def test_fast_tokenizer_with_examples(caplog, model_name):
     fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
     tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
@@ -132,7 +131,6 @@
 @pytest.mark.parametrize("model_name", ["bert-base-german-cased",
                                         "google/electra-small-discriminator",
-                                        "distilroberta-base",
                                         ])
 def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name):
     fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
     tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
@@ -259,7 +257,6 @@
 @pytest.mark.parametrize("model_name, tokenizer_type", [
     ("bert-base-german-cased", BertTokenizerFast),
     ("google/electra-small-discriminator", ElectraTokenizerFast),
-    ("distilroberta-base", RobertaTokenizerFast),
     ])
 def test_fast_tokenizer_type(caplog, model_name, tokenizer_type):
     caplog.set_level(logging.CRITICAL)

From 75ea9dd9d88d885b786ab25c28b958fce04d982a Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 21:14:08 +0200
Subject: [PATCH 17/30] Fix Tokenizer load and save.
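A rough sketch of the load/save round trip this patch repairs. Model name and
save path are illustrative only, not taken from the codebase:

    from pathlib import Path
    from farm.modeling.tokenization import Tokenizer

    tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True)
    save_dir = Path("testsave/tokenizer")
    # Fast tokenizers only accept plain strings as save targets, which is
    # why Processor.save() below wraps the Path in str() before delegating.
    tokenizer.save_pretrained(str(save_dir))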
---
 farm/data_handler/input_features.py | 4 +---
 farm/data_handler/processor.py | 6 +++++-
 farm/modeling/tokenization.py | 9 +++++----
 test/test_doc_classification.py | 2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 7afe9da6a..50e5ca4a5 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -8,8 +8,6 @@
 from dotmap import DotMap
 import numpy as np

-from transformers.tokenization_utils_base import TruncationStrategy
-
 from farm.data_handler.samples import Sample
 from farm.data_handler.utils import (
     expand_labels,
@@ -47,7 +45,7 @@ def sample_to_features_text(
         tokens_a,
         tokens_b,
         add_special_tokens=True,
-        truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
+        truncation=False,  # truncation_strategy is depricated
         return_token_type_ids=True,
         max_length=max_seq_len,
         is_pretokenized=True,
diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py
index c8aa46c29..6ff6116e1 100644
--- a/farm/data_handler/processor.py
+++ b/farm/data_handler/processor.py
@@ -231,7 +231,11 @@ def save(self, save_dir):
         config = self.generate_config()
         # save tokenizer incl. attributes
         config["tokenizer"] = self.tokenizer.__class__.__name__
-        self.tokenizer.save_pretrained(save_dir)
+
+        # Because the fast tokenizers expect a str and not Path
+        # always convert Path to str here.
+        self.tokenizer.save_pretrained(str(save_dir))
+
         # save processor
         config["processor"] = self.__class__.__name__
         output_config_file = Path(save_dir) / "processor_config.json"
diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index 8c1d646dc..084767df5 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -97,28 +97,29 @@
                          f"XLNetTokenizer.")
         logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
         # return appropriate tokenizer object
+        ret = None
         if tokenizer_class == "AlbertTokenizer":
             ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif tokenizer_class == "XLMRobertaTokenizer":
             ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "RobertaTokenizer":
+        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 raise ValueError('RobertaTokenizerFast is not supported!')
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "DistilBertTokenizer":
+        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "BertTokenizer":
+        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "XLNetTokenizer":
             ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
-        elif tokenizer_class == "ElectraTokenizer":
+        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
diff --git a/test/test_doc_classification.py b/test/test_doc_classification.py
index ec54fe306..dcfd31293 100644
--- a/test/test_doc_classification.py
+++ b/test/test_doc_classification.py
@@ -87,7 +87,7 @@
     trainer.train()

-    save_dir = Path("testsave/doc_class")
+    save_dir = Path("testsave/doc_class_bert")
     model.save(save_dir)
     processor.save(save_dir)

From 2d2cd00dc69638c932bcc8673db66874e00dd3b1 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 21:57:07 +0200
Subject: [PATCH 18/30] Fix typo

---
 farm/data_handler/input_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 50e5ca4a5..dde46e71e 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -45,7 +45,7 @@ def sample_to_features_text(
         tokens_a,
         tokens_b,
         add_special_tokens=True,
-        truncation=False,  # truncation_strategy is depricated
+        truncation=False,  # truncation_strategy is deprecated
         return_token_type_ids=True,
         max_length=max_seq_len,
         is_pretokenized=True,

From 8afa136c5015cfcae6996a181bf2c5de6a7755a7 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Wed, 5 Aug 2020 17:12:35 +0200
Subject: [PATCH 19/30] Improve test test_embeddings_extraction

- add shape assert
- fix embedding assert

---
 test/test_inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_inference.py b/test/test_inference.py
index c05c4211d..9815b1405 100644
--- a/test/test_inference.py
+++ b/test/test_inference.py
@@ -95,7 +95,8 @@
     # Get embeddings for input text (you can vary the strategy and layer)
     result = model.inference_from_dicts(dicts=basic_texts)
     assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist']
-    assert np.isclose(result[0]["vec"][0], 1.50174605e-02)
+    assert result[0]["vec"].shape == (768,)
+    assert np.isclose(result[0]["vec"][0], -0.032460204579613426)

From 042fde0c9ca5d621f7b4c4fa3b441adfdc289c88 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Wed, 5 Aug 2020 17:37:57 +0200
Subject: [PATCH 20/30] Docstring for fast tokenizers improved

---
 farm/modeling/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index 084767df5..82f8adeed 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -59,7 +59,7 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         :type tokenizer_class: str
         :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
             use the Python one (False).
-            TODO: Say which models support fast tokenizers.
+            Only DistilBERT, BERT and Electra fast tokenizers are supported.
:type use_fast: bool :param kwargs: :return: Tokenizer From 7ed385fe4b28b1dc3179afb7fbc5912f13f224ba Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:44:10 +0200 Subject: [PATCH 21/30] tokenizer_args docstring --- farm/infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/farm/infer.py b/farm/infer.py index 180e6a577..fcbef006e 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -198,6 +198,8 @@ def load( :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False). :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method. + See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation + on `Hugging Face Transformers `_. :type tokenizer_args: dict :type use_fast: bool :return: An instance of the Inferencer. From d4eb59ccca931d94e5a77023b21b40c2b6e2cffc Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:46:44 +0200 Subject: [PATCH 22/30] Extend test_embeddings_extraction to fast tok. --- test/test_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_inference.py b/test/test_inference.py index 9815b1405..9a219f82d 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -75,7 +75,8 @@ def test_qa_format_and_results(adaptive_model_qa, streaming, multiprocessing_chu @pytest.mark.parametrize("num_processes", [0], scope="session") -def test_embeddings_extraction(num_processes): +@pytest.mark.parametrize("use_fast", [False, True]) +def test_embeddings_extraction(num_processes, use_fast): # Input basic_texts = [ {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, @@ -90,6 +91,7 @@ def test_embeddings_extraction(num_processes): batch_size=5, extraction_strategy="reduce_mean", extraction_layer=-2, + use_fast=use_fast, num_processes=num_processes) # Get embeddings for input text (you can vary the strategy and layer) From 4f87604200bb14c50d0b0aabeeccdec249f596d5 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:49:47 +0200 Subject: [PATCH 23/30] extend test_ner with fast tok. 
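In sketch form, the fast/slow loading path this test now exercises. The model name
mirrors the test below; the helper name is hypothetical, not part of FARM:

    from farm.modeling.tokenization import Tokenizer

    def load_ner_tokenizer(use_fast: bool):
        # Identical call for the slow (Python) and fast (Rust-backed) variant;
        # only the use_fast flag differs.
        return Tokenizer.load(
            pretrained_model_name_or_path="distilbert-base-german-cased",
            do_lower_case=False,
            use_fast=use_fast,
        )

    slow = load_ner_tokenizer(use_fast=False)
    fast = load_ner_tokenizer(use_fast=True)
    assert slow.tokenize("Paris is a town in France.") == fast.tokenize("Paris is a town in France.")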
--- test/test_ner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_ner.py b/test/test_ner.py index fe69452f1..1285331c8 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -1,4 +1,5 @@ from pathlib import Path +import pytest import numpy as np @@ -16,7 +17,8 @@ import logging -def test_ner(caplog): +@pytest.mark.parametrize("use_fast", [False, True]) +def test_ner(caplog, use_fast): if caplog: caplog.set_level(logging.CRITICAL) @@ -28,7 +30,8 @@ def test_ner(caplog): lang_model = "distilbert-base-german-cased" tokenizer = Tokenizer.load( - pretrained_model_name_or_path=lang_model, do_lower_case=False + pretrained_model_name_or_path=lang_model, do_lower_case=False, + use_fast=use_fast, ) ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", From bc7abcace5063cced9a5a2e592948459f2633383 Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Thu, 6 Aug 2020 12:08:36 +0200 Subject: [PATCH 24/30] fix sample_to_features_ner for fast tokenizer --- farm/data_handler/input_features.py | 5 +++-- test/test_ner.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index dde46e71e..3500046a1 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -141,9 +141,10 @@ def samples_to_features_ner( inputs = tokenizer.encode_plus(text=tokens, text_pair=None, add_special_tokens=True, - truncation_strategy='do_not_truncate', # We've already truncated our tokens before + truncation=False, return_special_tokens_mask=True, - return_token_type_ids=True + return_token_type_ids=True, + is_pretokenized=True ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] diff --git a/test/test_ner.py b/test/test_ner.py index 1285331c8..9e34407a9 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -101,4 +101,4 @@ def test_ner(caplog, use_fast): if __name__ == "__main__": - test_ner(None) + test_ner(None, True) From da9c2f520d1f86c14fed4121204f616b950cb50f Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Thu, 6 Aug 2020 13:17:50 +0200 Subject: [PATCH 25/30] temp fix for is_pretokenized until fixed upstream --- farm/data_handler/input_features.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 3500046a1..f886a0200 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -41,6 +41,12 @@ def sample_to_features_text( tokens_a = sample.tokenized["tokens"] tokens_b = sample.tokenized.get("tokens_b", None) + # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it + # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) + if tokenizer.is_fast: + is_pretokenized = True + else: + is_pretokenized = False inputs = tokenizer.encode_plus( tokens_a, tokens_b, @@ -48,7 +54,7 @@ def sample_to_features_text( truncation=False, # truncation_strategy is deprecated return_token_type_ids=True, max_length=max_seq_len, - is_pretokenized=True, + is_pretokenized=is_pretokenized, ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -138,13 +144,20 @@ def samples_to_features_ner( """ tokens = sample.tokenized["tokens"] + + # is_pretokenized seems to be broken upstream for slow tokenizers, while fast 
tokenizers rely on it + # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) + if tokenizer.is_fast: + is_pretokenized = True + else: + is_pretokenized = False inputs = tokenizer.encode_plus(text=tokens, text_pair=None, add_special_tokens=True, truncation=False, return_special_tokens_mask=True, return_token_type_ids=True, - is_pretokenized=True + is_pretokenized=is_pretokenized ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] From 19cc2110e5b933a60616884684295ae3b4f3bf7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 25 Aug 2020 18:00:18 +0200 Subject: [PATCH 26/30] Make use of fast tokenizer possible + fix bug in offset calculation --- farm/data_handler/input_features.py | 35 +++++++++-------- farm/modeling/tokenization.py | 58 +++++++++++++++++++---------- test/test_inference.py | 2 +- test/test_ner.py | 2 +- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index f886a0200..fd45c6b37 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -36,26 +36,25 @@ def sample_to_features_text( :rtype: list """ - #TODO It might be cleaner to adjust the data structure in sample.tokenized - # Verify if this current quickfix really works for pairs - tokens_a = sample.tokenized["tokens"] - tokens_b = sample.tokenized.get("tokens_b", None) - - # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it - # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) if tokenizer.is_fast: - is_pretokenized = True + text = sample.clear_text["text"] + # Here, we tokenize the sample for the second time... 
+ inputs = tokenizer(text, return_token_type_ids=True, max_length=max_seq_len) else: - is_pretokenized = False - inputs = tokenizer.encode_plus( - tokens_a, - tokens_b, - add_special_tokens=True, - truncation=False, # truncation_strategy is deprecated - return_token_type_ids=True, - max_length=max_seq_len, - is_pretokenized=is_pretokenized, - ) + # TODO It might be cleaner to adjust the data structure in sample.tokenized + # Verify if this current quickfix really works for pairs + tokens_a = sample.tokenized["tokens"] + tokens_b = sample.tokenized.get("tokens_b", None) + + inputs = tokenizer.encode_plus( + tokens_a, + tokens_b, + add_special_tokens=True, + truncation=False, # truncation_strategy is deprecated + return_token_type_ids=True, + max_length=max_seq_len, + is_pretokenized=False, + ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index 82f8adeed..b78de0835 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -258,25 +258,41 @@ def tokenize_with_metadata(text, tokenizer): :rtype: dict """ + # Fast Tokenizers return offsets, so we don't need to calculate them ourselves + if tokenizer.is_fast: + tokenized = tokenizer(text, return_offsets_mapping=True, return_special_tokens_mask=True) + tokens = [] + offsets = [] + start_of_word = [] + previous_token_end = -1 + for token_id, is_special_token, offset in zip(tokenized["input_ids"], + tokenized["special_tokens_mask"], + tokenized["offset_mapping"]): + if is_special_token == 0: + tokens.append(tokenizer.decode([token_id])) + offsets.append(offset[0]) + start_of_word.append(True if offset[0] != previous_token_end else False) + previous_token_end = offset[1] + tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} + else: + # normalize all other whitespace characters to " " + # Note: using text.split() directly would destroy the offset, + # since \n\n\n would be treated similarly as a single \n + text = re.sub(r"\s", " ", text) + # split text into "words" (here: simple whitespace tokenizer). + words = text.split(" ") + word_offsets = [] + cumulated = 0 + for idx, word in enumerate(words): + word_offsets.append(cumulated) + cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer + + # split "words" into "subword tokens" + tokens, offsets, start_of_word = _words_to_tokens( + words, word_offsets, tokenizer + ) - # normalize all other whitespace characters to " " - # Note: using text.split() directly would destroy the offset, - # since \n\n\n would be treated similarly as a single \n - text = re.sub(r"\s", " ", text) - # split text into "words" (here: simple whitespace tokenizer). - words = text.split(" ") - word_offsets = [] - cumulated = 0 - for idx, word in enumerate(words): - word_offsets.append(cumulated) - cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer - - # split "words"into "subword tokens" - tokens, offsets, start_of_word = _words_to_tokens( - words, word_offsets, tokenizer - ) - - tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} + tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} return tokenized @@ -327,7 +343,11 @@ def _words_to_tokens(words, word_offsets, tokenizer): # Depending on the tokenizer type special chars are added to distinguish tokens with preceeding # whitespace (=> "start of a word"). 
We need to get rid of these to calculate the original length of the token orig_tok = re.sub(SPECIAL_TOKENIZER_CHARS, "", tok) - w_off += len(orig_tok) + # Don't use length of unk token for offset calculation + if orig_tok == tokenizer.special_tokens_map["unk_token"]: + w_off += 1 + else: + w_off += len(orig_tok) if first_tok: start_of_word.append(True) first_tok = False diff --git a/test/test_inference.py b/test/test_inference.py index 9a219f82d..eacfaa5bb 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -98,7 +98,7 @@ def test_embeddings_extraction(num_processes, use_fast): result = model.inference_from_dicts(dicts=basic_texts) assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist'] assert result[0]["vec"].shape == (768,) - assert np.isclose(result[0]["vec"][0], -0.032460204579613426) + assert np.isclose(result[0]["vec"][0], 0.01501756374325071) def test_inferencer_with_fast_bert_tokenizer(): diff --git a/test/test_ner.py b/test/test_ner.py index 9e34407a9..12aa9dbe7 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -89,7 +89,7 @@ def test_ner(caplog, use_fast): basic_texts = [ {"text": "Paris is a town in France."}, ] - model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner") + model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner", use_fast=use_fast) # labels arent correctly inserted from transformers # They are converted to LABEL_1 ... LABEL_N # For the inference result to contain predictions we need them in IOB NER format From 7e75de15ac2da4da1bbf443f1448ea67716552fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Mon, 31 Aug 2020 17:57:41 +0200 Subject: [PATCH 27/30] Make fast tokenization possible with NER, LM and QA --- farm/data_handler/input_features.py | 115 +++++++++++++++++++++------- 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index fd45c6b37..4bfa38cec 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -4,6 +4,7 @@ import logging +import re import collections from dotmap import DotMap import numpy as np @@ -39,7 +40,14 @@ def sample_to_features_text( if tokenizer.is_fast: text = sample.clear_text["text"] # Here, we tokenize the sample for the second time... - inputs = tokenizer(text, return_token_type_ids=True, max_length=max_seq_len) + inputs = tokenizer(text, + return_token_type_ids=True, + max_length=max_seq_len, + return_special_tokens_mask=True) + + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") else: # TODO It might be cleaner to adjust the data structure in sample.tokenized # Verify if this current quickfix really works for pairs @@ -147,17 +155,25 @@ def samples_to_features_ner( # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) if tokenizer.is_fast: - is_pretokenized = True + text = sample.clear_text["text"] + # Here, we tokenize the sample for the second time. 
+ inputs = tokenizer(text, + return_token_type_ids=True, + max_length=max_seq_len, + return_special_tokens_mask=True) + + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") else: - is_pretokenized = False - inputs = tokenizer.encode_plus(text=tokens, - text_pair=None, - add_special_tokens=True, - truncation=False, - return_special_tokens_mask=True, - return_token_type_ids=True, - is_pretokenized=is_pretokenized - ) + inputs = tokenizer.encode_plus(text=tokens, + text_pair=None, + add_special_tokens=True, + truncation=False, + return_special_tokens_mask=True, + return_token_type_ids=True, + is_pretokenized=False + ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] @@ -246,6 +262,14 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab, token_groups=sample.tokenized["text_b"]["start_of_word"]) + + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + tokens_a = " ".join(tokens_a) + tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_b = " ".join(tokens_b) + tokens_b = re.sub(r"^(##|Ġ|▁)", "", tokens_b) + # convert lm labels to ids t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] t2_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t2_label] @@ -261,18 +285,36 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T tokens_b = None tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab, token_groups=sample.tokenized["text_a"]["start_of_word"]) + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + tokens_a = " ".join(tokens_a) + tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + # convert lm labels to ids lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] - # encode string tokens to input_ids and add special tokens - inputs = tokenizer.encode_plus(text=tokens_a, - text_pair=tokens_b, - add_special_tokens=True, - truncation_strategy='do_not_truncate', - # We've already truncated our tokens before - return_special_tokens_mask=True, - return_token_type_ids=True - ) + if tokenizer.is_fast: + inputs = tokenizer(text=tokens_a, + text_pair=tokens_b, + add_special_tokens=True, + return_special_tokens_mask=True, + return_token_type_ids=True) + + seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0 + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \ + (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") + else: + # encode string tokens to input_ids and add special tokens + inputs = tokenizer.encode_plus(text=tokens_a, + text_pair=tokens_b, + add_special_tokens=True, + truncation_strategy='do_not_truncate', + # We've already truncated our tokens before + return_special_tokens_mask=True, + return_token_type_ids=True + ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs[ "special_tokens_mask"] @@ -373,12 +415,33 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks 
# (question_len_t + passage_len_t + n_special_tokens). This may be less than max_seq_len but will not be greater # than max_seq_len since truncation was already performed when the document was chunked into passages # (c.f. create_samples_squad() ) - encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], - text_pair=sample.tokenized["passage_tokens"], - add_special_tokens=True, - truncation_strategy='do_not_truncate', - return_token_type_ids=True, - return_tensors=None) + + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + question_tokens = " ".join(question_tokens) + question_tokens = re.sub(r"^(##|Ġ|▁)", "", question_tokens) + passage_tokens = " ".join(passage_tokens) + passage_tokens = re.sub(r"^(##|Ġ|▁)", "", passage_tokens) + + encoded = tokenizer(text=question_tokens, + text_pair=passage_tokens, + add_special_tokens=True, + return_special_tokens_mask=True, + return_token_type_ids=True) + + if (len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)) != \ + (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") + + else: + encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], + text_pair=sample.tokenized["passage_tokens"], + add_special_tokens=True, + truncation_strategy='do_not_truncate', + return_token_type_ids=True, + return_tensors=None) + input_ids = encoded["input_ids"] segment_ids = encoded["token_type_ids"] From eb466292ebae086d1248339a4f05e2bffe72d3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 1 Sep 2020 11:10:35 +0200 Subject: [PATCH 28/30] Change error messages --- farm/data_handler/input_features.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 4bfa38cec..84308c3cd 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -46,8 +46,10 @@ def sample_to_features_text( return_special_tokens_mask=True) if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: # TODO It might be cleaner to adjust the data structure in sample.tokenized # Verify if this current quickfix really works for pairs @@ -163,8 +165,10 @@ def samples_to_features_ner( return_special_tokens_mask=True) if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: inputs = tokenizer.encode_plus(text=tokens, text_pair=None, @@ -303,8 +307,10 
@@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0 if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \ (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: # encode string tokens to input_ids and add special tokens inputs = tokenizer.encode_plus(text=tokens_a, @@ -431,8 +437,10 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks if (len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)) != \ (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(encoded['input_ids']) - encoded['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], From 06d51c026e643ce436506bea0ec0362dcf364289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 1 Sep 2020 15:35:57 +0200 Subject: [PATCH 29/30] Add tests --- farm/data_handler/input_features.py | 18 ++++++++++------ farm/modeling/tokenization.py | 33 +++++++++++++++++++++++------ test/conftest.py | 16 ++++++++------ test/test_question_answering.py | 6 +++++- test/test_tokenization.py | 23 ++++++++++++++++++-- 5 files changed, 74 insertions(+), 22 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 84308c3cd..5b956d739 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -270,9 +270,9 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input tokens_a = " ".join(tokens_a) - tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a) tokens_b = " ".join(tokens_b) - tokens_b = re.sub(r"^(##|Ġ|▁)", "", tokens_b) + tokens_b = re.sub(r"(^|\s)(##)", "", tokens_b) # convert lm labels to ids t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] @@ -292,7 +292,7 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input tokens_a = " ".join(tokens_a) - tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a) # convert lm labels to ids lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] @@ -425,9 +425,9 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input question_tokens = " ".join(question_tokens) - 
question_tokens = re.sub(r"^(##|Ġ|▁)", "", question_tokens)
+            question_tokens = re.sub(r"(^|\s)(##)", "", question_tokens)
             passage_tokens = " ".join(passage_tokens)
-            passage_tokens = re.sub(r"^(##|Ġ|▁)", "", passage_tokens)
+            passage_tokens = re.sub(r"(^|\s)(##)", "", passage_tokens)
 
             encoded = tokenizer(text=question_tokens,
                                 text_pair=passage_tokens,
                                 add_special_tokens=True,
                                 return_special_tokens_mask=True,
                                 return_token_type_ids=True)
@@ -553,8 +553,12 @@ def combine_vecs(question_vec, passage_vec, tokenizer, spec_tok_val=-1):
 
     # Join question_label_vec and passage_label_vec and add slots for special tokens
     vec = tokenizer.build_inputs_with_special_tokens(token_ids_0=question_vec,
                                                      token_ids_1=passage_vec)
-    spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=question_vec,
-                                                       token_ids_1=passage_vec)
+    if tokenizer.is_fast:
+        spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=vec,
+                                                           already_has_special_tokens=True)
+    else:
+        spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=question_vec,
+                                                           token_ids_1=passage_vec)
 
     # If a value in vec corresponds to a special token, it will be replaced with spec_tok_val
     combined = [v if not special_token else spec_tok_val for v, special_token in zip(vec, spec_toks_mask)]
diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index b78de0835..89e051d98 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -99,12 +99,21 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         # return appropriate tokenizer object
         ret = None
         if tokenizer_class == "AlbertTokenizer":
-            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            if use_fast:
+                logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
+                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            else:
+                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif tokenizer_class == "XLMRobertaTokenizer":
-            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
+                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif "RobertaTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
             if use_fast:
-                raise ValueError('RobertaTokenizerFast is not supportet!')
+                logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
+                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif "DistilBertTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
@@ -118,16 +127,28 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
             else:
                 ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "XLNetTokenizer":
-            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            if use_fast:
+                logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
+                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            else:
+                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif "ElectraTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
             if use_fast:
                 ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "EmbeddingTokenizer":
-            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
+                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "CamembertTokenizer":
-            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
+                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
         if ret is None:
             raise Exception("Unable to load tokenizer")
         else:
diff --git a/test/conftest.py b/test/conftest.py
index 74d2b6c2f..8063b717b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -61,18 +61,20 @@ def adaptive_model_qa(use_gpu, num_processes):
     assert len(children) == 0
 
 
-@pytest.fixture()
-def bert_base_squad2():
+@pytest.fixture(params=[True, False])
+def bert_base_squad2(request):
     model = QAInferencer.load(
         "deepset/bert-base-cased-squad2",
         task_type="question_answering",
         batch_size=16,
-        num_processes=0)
+        num_processes=0,
+        use_fast=request.param
+    )
     return model
 
 
-@pytest.fixture()
-def distilbert_squad():
+@pytest.fixture(params=[True, False])
+def distilbert_squad(request):
     set_all_seeds(seed=42)
     device, n_gpu = initialize_device_settings(use_cuda=False)
     batch_size = 2
@@ -81,7 +83,9 @@ def distilbert_squad():
     base_LM_model = "distilbert-base-uncased"
     tokenizer = Tokenizer.load(
-        pretrained_model_name_or_path=base_LM_model, do_lower_case=True
+        pretrained_model_name_or_path=base_LM_model,
+        do_lower_case=True,
+        use_fast=request.param
     )
     label_list = ["start_token", "end_token"]
     processor = SquadProcessor(
diff --git a/test/test_question_answering.py b/test/test_question_answering.py
index 9832453aa..3b640e799 100644
--- a/test/test_question_answering.py
+++ b/test/test_question_answering.py
@@ -9,7 +9,7 @@
 from farm.infer import Inferencer, QAInferencer
 from farm.data_handler.inputs import QAInput, Question
 
-
+@pytest.mark.parametrize("distilbert_squad", [True, False], indirect=True)
 def test_training(distilbert_squad, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -19,6 +19,7 @@ def test_training(distilbert_squad, caplog=None):
     assert type(processor) == SquadProcessor
 
 
+@pytest.mark.parametrize("distilbert_squad", [True, False], indirect=True)
 def test_save_load(distilbert_squad, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -33,6 +34,7 @@ def test_save_load(distilbert_squad, caplog=None):
     assert inferencer is not None
 
 
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def test_inference_dicts(bert_base_squad2):
     qa_format_1 = [
         {
@@ -49,6 +51,7 @@ def test_inference_dicts(bert_base_squad2):
 
 
 @pytest.fixture()
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def span_inference_result(bert_base_squad2, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -59,6 +62,7 @@ def span_inference_result(bert_base_squad2, caplog=None):
 
 
 @pytest.fixture()
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def no_answer_inference_result(bert_base_squad2, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index c8959e8f1..4c9557eee 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -1,7 +1,7 @@
 import logging
 import pytest
 import re
-from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
+from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer, RobertaTokenizerFast
 from transformers import ElectraTokenizerFast
 
 from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences
@@ -284,5 +284,24 @@ def test_fast_electra_tokenizer(caplog):
     assert type(tokenizer) is ElectraTokenizerFast
 
 
+@pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"])
+def test_detokenization_in_fast_tokenizers(model_name):
+    tokenizer = Tokenizer.load(
+        pretrained_model_name_or_path=model_name,
+        use_fast=True
+    )
+    for text in TEXTS:
+        tokens_with_metadata = tokenize_with_metadata(text, tokenizer)
+        tokens = tokens_with_metadata["tokens"]
+
+        detokenized = " ".join(tokens)
+        detokenized = re.sub(r"(^|\s+)(##)", "", detokenized)
+
+        detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"]
+        detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids]
+
+        assert tokens == detokenized_tokens
+
+
 if __name__ == "__main__":
-    test_all_tokenizer_on_special_cases()
+    test_all_tokenizer_on_special_cases()
\ No newline at end of file

From 1acaff4ea7f6f3c19080ee468c05eabfe1b4229d Mon Sep 17 00:00:00 2001
From: Malte Pietsch
Date: Wed, 2 Sep 2020 10:34:57 +0200
Subject: [PATCH 30/30] update error messages, comments and truncation arg in tokenizer

---
 farm/data_handler/input_features.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 5b956d739..a6ccf4429 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -39,7 +39,8 @@ def sample_to_features_text(
 
     if tokenizer.is_fast:
         text = sample.clear_text["text"]
-        # Here, we tokenize the sample for the second time...
+        # Here, we tokenize the sample for the second time to get all relevant ids
+        # This should change once we get rid of FARM's tokenize_with_metadata()
         inputs = tokenizer(text,
                            return_token_type_ids=True,
                            max_length=max_seq_len,
                            return_special_tokens_mask=True)
 
         if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
     else:
         # TODO It might be cleaner to adjust the data structure in sample.tokenized
-        # Verify if this current quickfix really works for pairs
         tokens_a = sample.tokenized["tokens"]
         tokens_b = sample.tokenized.get("tokens_b", None)
@@ -154,11 +154,10 @@ def samples_to_features_ner(
 
     tokens = sample.tokenized["tokens"]
 
-    # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it
-    # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046)
     if tokenizer.is_fast:
         text = sample.clear_text["text"]
-        # Here, we tokenize the sample for the second time.
+        # Here, we tokenize the sample for the second time to get all relevant ids
+        # This should change once we get rid of FARM's tokenize_with_metadata()
         inputs = tokenizer(text,
                            return_token_type_ids=True,
                            max_length=max_seq_len,
@@ -167,8 +166,8 @@
         if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             inputs = tokenizer.encode_plus(text=tokens,
                                            text_pair=None,
@@ -309,13 +308,14 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T
             (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             # encode string tokens to input_ids and add special tokens
             inputs = tokenizer.encode_plus(text=tokens_a,
                                            text_pair=tokens_b,
                                            add_special_tokens=True,
+                                           truncation=False,
                                            truncation_strategy='do_not_truncate',
                                            # We've already truncated our tokens before
                                            return_special_tokens_mask=True,
@@ -439,13 +439,13 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks
             (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(encoded['input_ids']) - encoded['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"],
                                             text_pair=sample.tokenized["passage_tokens"],
                                             add_special_tokens=True,
+                                            truncation=False,
                                             truncation_strategy='do_not_truncate',
                                             return_token_type_ids=True,
                                             return_tensors=None)
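
The guard that these patches converge on compares the fast tokenizer's output against the token count that tokenize_with_metadata() produced earlier: the number of input ids minus the special tokens flagged in the mask must equal the number of plain tokens. A minimal sketch of that check in isolation; bert-base-cased and the sample sentence are illustrative and appear nowhere in the patches:

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    text = "Berlin is the capital of Germany."
    # Stand-in for sample.tokenized["tokens"] as produced by tokenize_with_metadata()
    expected_tokens = tokenizer.tokenize(text)

    inputs = tokenizer(text,
                       return_token_type_ids=True,
                       return_special_tokens_mask=True)

    # Subtract the special tokens ([CLS], [SEP]) flagged in the mask to count real tokens
    n_real_tokens = len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)
    assert n_real_tokens == len(expected_tokens)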
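The detokenization step in samples_to_features_bert_lm() and sample_to_features_qa() exists because the fast tokenizers cannot consume pre-tokenized input at this point, so the token list is glued back into a string and re-encoded. A sketch of the round trip for a WordPiece vocabulary, under the assumption of a BERT-style "##" continuation marker; the model name and sentence are illustrative:

    import re
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    tokens = tokenizer.tokenize("Subword tokenizers split rare words.")
    # e.g. ['Sub', '##word', 'token', '##izer', '##s', 'split', 'rare', 'words', '.']

    # Join on whitespace, then drop each "##" continuation marker together with the
    # space in front of it -- the same expression used in input_features.py
    detokenized = re.sub(r"(^|\s)(##)", "", " ".join(tokens))

    # Re-encoding the rebuilt string should reproduce the original tokens, which is
    # what test_detokenization_in_fast_tokenizers asserts via per-id decode()
    assert tokenizer.tokenize(detokenized) == tokens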
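combine_vecs() now builds the special-tokens mask differently per tokenizer flavour: the slow tokenizers accept the two raw sequences, while the fast ones want the already-combined sequence plus already_has_special_tokens=True. A sketch of the fast branch with dummy label values; 7 and 9 are arbitrary stand-ins and must not collide with the model's special token ids:

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    question_vec = [7, 7, 7]  # dummy per-token values for the question
    passage_vec = [9, 9]      # dummy per-token values for the passage

    # Insert slots where the special tokens will sit: [CLS] q q q [SEP] p p [SEP]
    vec = tokenizer.build_inputs_with_special_tokens(token_ids_0=question_vec,
                                                     token_ids_1=passage_vec)

    # Fast-tokenizer variant of the mask lookup used in combine_vecs()
    spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=vec,
                                                       already_has_special_tokens=True)

    # Replace the special-token slots with the sentinel value -1
    combined = [v if not special else -1 for v, special in zip(vec, spec_toks_mask)]
    # -> [-1, 7, 7, 7, -1, 9, 9, -1]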
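After this series, Tokenizer.load() returns a Rust-backed tokenizer for the classes that have one wired up (BERT, DistilBERT, Electra) and, for the rest, logs an error and falls back to the slow implementation instead of raising as the earlier RobertaTokenizerFast check did. A usage sketch, assuming FARM's name-based inference maps the two model names below to BertTokenizer and RobertaTokenizer respectively:

    from farm.modeling.tokenization import Tokenizer

    # BERT has a supported fast implementation
    fast_tokenizer = Tokenizer.load("bert-base-cased", use_fast=True)
    assert fast_tokenizer.is_fast

    # RoBERTa does not; load() logs an error and returns the slow tokenizer
    slow_fallback = Tokenizer.load("roberta-base", use_fast=True)
    assert not slow_fallback.is_fast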
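The conftest.py fixtures declare params=[True, False], so every test that requests them runs once with a fast tokenizer and once with a slow one; request.param carries the current value into the fixture body. The core of the pattern in isolation, with illustrative fixture and test names:

    import pytest

    @pytest.fixture(params=[True, False])
    def use_fast(request):
        # request.param delivers one entry of params per test invocation
        return request.param

    def test_runs_per_tokenizer_flavour(use_fast):
        assert isinstance(use_fast, bool)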