Add option to use fast HF tokenizer. #482

Merged
merged 32 commits on Sep 2, 2020
Changes from 7 commits

Commits (32)
651463a
Add option to use fast HF tokenizer
PhilipMay Aug 1, 2020
a433483
Hand merge tests from PR #205
PhilipMay Aug 1, 2020
c20c5db
test_inferencer_with_fast_bert_tokenizer
PhilipMay Aug 1, 2020
5f2b5ee
test_fast_bert_tokenizer
PhilipMay Aug 1, 2020
fa3bd67
test_fast_bert_tokenizer_strip_accents
PhilipMay Aug 1, 2020
cd7298c
test_fast_electra_tokenizer
PhilipMay Aug 1, 2020
01e5ffb
Fix OOM issue of CI
PhilipMay Aug 1, 2020
42f345f
Extend test for fast tokenizer
PhilipMay Aug 2, 2020
9b021ff
test_fast_tokenizer for more model types
PhilipMay Aug 2, 2020
86d7fd5
Fix tokenize_with_metadata
PhilipMay Aug 2, 2020
a8f4638
Split tokenizer tests
PhilipMay Aug 2, 2020
cdccafa
Fix pytest params bug in test_tok
PhilipMay Aug 2, 2020
47d4b6a
Fix fast tokenizer usage
PhilipMay Aug 4, 2020
8318063
add missing newline eof
PhilipMay Aug 4, 2020
8c61e3b
Add test fast tok. doc_classif.
PhilipMay Aug 4, 2020
aec7d2d
Remove RobertaTokenizerFast
PhilipMay Aug 4, 2020
75ea9dd
Fix Tokenizer load and save.
PhilipMay Aug 4, 2020
2d2cd00
Fix typo
PhilipMay Aug 4, 2020
8afa136
Improve test test_embeddings_extraction
PhilipMay Aug 5, 2020
042fde0
Docstring for fast tokenizers improved
PhilipMay Aug 5, 2020
7ed385f
tokenizer_args docstring
PhilipMay Aug 5, 2020
d4eb59c
Extend test_embeddings_extraction to fast tok.
PhilipMay Aug 5, 2020
4f87604
extend test_ner with fast tok.
PhilipMay Aug 5, 2020
bc7abca
fix sample_to_features_ner for fast tokenizer
tholor Aug 6, 2020
da9c2f5
temp fix for is_pretokenized until fixed upstream
tholor Aug 6, 2020
19cc211
Make use of fast tokenizer possible + fix bug in offset calculation
bogdankostic Aug 25, 2020
6d0a3c1
Merge branch 'master' into add_fast_tokenizer
bogdankostic Aug 25, 2020
7e75de1
Make fast tokenization possible with NER, LM and QA
bogdankostic Aug 31, 2020
0e4b1b0
Merge remote-tracking branch 'origin/add_fast_tokenizer' into add_fas…
bogdankostic Aug 31, 2020
eb46629
Change error messages
bogdankostic Sep 1, 2020
06d51c0
Add tests
bogdankostic Sep 1, 2020
1acaff4
update error messages, comments and truncation arg in tokenizer
tholor Sep 2, 2020
21 changes: 18 additions & 3 deletions farm/infer.py
@@ -147,8 +147,10 @@ def load(
extraction_strategy=None,
s3e_stats=None,
num_processes=None,
disable_tqdm=False

disable_tqdm=False,
tokenizer_class=None,
use_fast=False,
tokenizer_args=None,
):
"""
Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by
@@ -191,9 +193,18 @@ def load(
:type num_processes: int
:param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
:type disable_tqdm: bool
:param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
:type tokenizer_class: str
:param use_fast: (Optional, False by default) Indicates if FARM should try to load the fast version of the tokenizer (True) or
use the Python one (False).
:type use_fast: bool
:param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
:type tokenizer_args: dict
:return: An instance of the Inferencer.

"""
if tokenizer_args is None:
tokenizer_args = {}

device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
name = os.path.basename(model_name_or_path)
@@ -221,7 +232,11 @@ def load(

model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = Tokenizer.load(model_name_or_path)
tokenizer = Tokenizer.load(model_name_or_path,
tokenizer_class=tokenizer_class,
use_fast=use_fast,
**tokenizer_args,
)

# TODO infer task_type automatically from config (if possible)
if task_type == "question_answering":
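For context, a minimal usage sketch of the new arguments (not part of the diff; the model name mirrors the test below, and the tokenizer_args value is an illustrative assumption):

from farm.infer import Inferencer

# Sketch: load an Inferencer backed by the fast, Rust-based tokenizer.
# tokenizer_args is forwarded through Tokenizer.load to the tokenizer's __init__.
model = Inferencer.load(
    "bert-base-german-cased",        # any HF model name or local path
    task_type="text_classification",
    use_fast=True,                   # BertTokenizerFast instead of BertTokenizer
    tokenizer_args={},               # e.g. {"strip_accents": False} for uncased models
    num_processes=0,
)
print(type(model.processor.tokenizer))  # BertTokenizerFast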
33 changes: 24 additions & 9 deletions farm/modeling/tokenization.py
@@ -24,10 +24,10 @@

import numpy as np
from transformers.tokenization_albert import AlbertTokenizer
from transformers.tokenization_bert import BertTokenizer, load_vocab
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_electra import ElectraTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast, load_vocab
from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
from transformers.tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
from transformers.tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
@@ -48,7 +48,7 @@ class Tokenizer:
"""

@classmethod
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
"""
Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
`pretrained_model_name_or_path` or define it manually via `tokenizer_class`.
@@ -57,6 +57,9 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
:type pretrained_model_name_or_path: str
:param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
:type tokenizer_class: str
:param use_fast: (Optional, False by default) Indicates if FARM should try to load the fast version of the tokenizer (True) or
use the Python one (False).
:type use_fast: bool
:param kwargs:
:return: Tokenizer
"""
@@ -98,15 +101,27 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
elif tokenizer_class == "XLMRobertaTokenizer":
ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "RobertaTokenizer":
ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "DistilBertTokenizer":
ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "BertTokenizer":
ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "XLNetTokenizer":
ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
elif tokenizer_class == "ElectraTokenizer":
ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "EmbeddingTokenizer":
ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "CamembertTokenizer":
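Since every branch forwards the same kwargs, switching between the slow and fast implementation is a one-flag change. A small sketch of the resulting behaviour (the tokenize equality is exactly what test_fast_tokenizer below verifies):

from farm.modeling.tokenization import Tokenizer

# Sketch: use_fast only changes which class from_pretrained is called on.
slow = Tokenizer.load("bert-base-cased", use_fast=False)  # -> BertTokenizer
fast = Tokenizer.load("bert-base-cased", use_fast=True)   # -> BertTokenizerFast

# Both implementations are expected to produce identical tokens.
assert slow.tokenize("Der entscheidende Pass") == fast.tokenize("Der entscheidende Pass")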
10 changes: 10 additions & 0 deletions test/test_inference.py
@@ -1,5 +1,7 @@
import pytest
import numpy as np
import transformers

from farm.infer import Inferencer


@@ -95,5 +97,13 @@ def test_embeddings_extraction(num_processes):
assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist']
assert np.isclose(result[0]["vec"][0], 1.50174605e-02)


def test_inferencer_with_fast_bert_tokenizer():
model = Inferencer.load("bert-base-german-cased", task_type='text_classification',
use_fast=True, num_processes=0)
tokenizer = model.processor.tokenizer
assert type(tokenizer) is transformers.tokenization_bert.BertTokenizerFast


if __name__ == "__main__":
test_embeddings_extraction()
95 changes: 94 additions & 1 deletion test/test_tokenization.py
@@ -1,6 +1,8 @@
import logging
from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences
from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer
from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
from transformers import ElectraTokenizerFast

import re


@@ -90,6 +92,45 @@ def test_truncate_sequences(caplog):
assert len(trunc_a) + len(trunc_b) + tokenizer.num_special_tokens_to_add(pair=True) == max_seq_len


def test_fast_tokenizer(caplog):
fast_tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=True)
tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=False)

texts = [
"This is a sentence",
"Der entscheidende Pass",
"This is a sentence with multiple spaces",
"力加勝北区ᴵᴺᵀᵃছজটডণত",
"Thiso text is included tolod makelio sure Unicodeel is handled properly:",
"This is a sentence...",
"Let's see all on this text and. !23# neverseenwordspossible",
"""This is a sentence.
With linebreak""",
"""Sentence with multiple


newlines
""",
"and another one\n\n\nwithout space",
"This is a sentence with tab",
"This is a sentence with multiple tabs",
]
for text in texts:

# plain tokenize function
tokenized = tokenizer.tokenize(text)
fast_tokenized = fast_tokenizer.tokenize(text)

assert tokenized == fast_tokenized

# our tokenizer with metadata on "whitespace tokenized words"
tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer)
fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer)

# verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
assert tokenized_meta == fast_tokenized_meta, f"Failed using {tokenizer.__class__.__name__}"


def test_all_tokenizer_on_special_cases(caplog):
caplog.set_level(logging.CRITICAL)

@@ -173,5 +214,57 @@ def test_bert_custom_vocab(caplog):
assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]


def test_fast_bert_custom_vocab(caplog):
caplog.set_level(logging.CRITICAL)

lang_model = "bert-base-cased"

tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=False, use_fast=True
)

#deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt")
tokenizer.add_tokens(new_tokens=["neverseentokens"])

basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

# original tokenizer from transformer repo
tokenized = tokenizer.tokenize(basic_text)
assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars']

# ours with metadata
tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer)
assert tokenized_meta["tokens"] == tokenized
assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]


def test_fast_bert_tokenizer(caplog):
caplog.set_level(logging.CRITICAL)

tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True)
assert type(tokenizer) is BertTokenizerFast


def test_fast_bert_tokenizer_strip_accents(caplog):
caplog.set_level(logging.CRITICAL)

tokenizer = Tokenizer.load("dbmdz/bert-base-german-uncased",
use_fast=True,
strip_accents=False)
assert type(tokenizer) is BertTokenizerFast
assert tokenizer._tokenizer._parameters['strip_accents'] is False
assert tokenizer._tokenizer._parameters['lowercase']


def test_fast_electra_tokenizer(caplog):
caplog.set_level(logging.CRITICAL)

tokenizer = Tokenizer.load("dbmdz/electra-base-german-europeana-cased-discriminator",
use_fast=True)
assert type(tokenizer) is ElectraTokenizerFast


if __name__ == "__main__":
test_all_tokenizer_on_special_cases()
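
To make the offset assertions above concrete, here is a sketch of the tokenize_with_metadata return value; the expected values are derived from the assertions in test_fast_bert_custom_vocab, on a shortened input:

from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata

# Sketch: tokenize_with_metadata returns, per token, its character start offset
# in the raw text and whether it begins a whitespace-separated word.
tokenizer = Tokenizer.load("bert-base-cased", do_lower_case=False, use_fast=True)
meta = tokenize_with_metadata(text="Some Text with chars", tokenizer=tokenizer)
# meta["tokens"]        -> ['Some', 'Text', 'with', 'ch', '##ars']
# meta["offsets"]       -> [0, 5, 10, 15, 17]
# meta["start_of_word"] -> [True, True, True, True, False]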