From 651463a3e865b2f34b94e45586653e09072d85d8 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 13:56:03 +0200 Subject: [PATCH 01/30] Add option to use fast HF tokenizer --- farm/infer.py | 21 ++++++++++++++++++--- farm/modeling/tokenization.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/farm/infer.py b/farm/infer.py index 6a4399ad6..180e6a577 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -147,8 +147,10 @@ def load( extraction_strategy=None, s3e_stats=None, num_processes=None, - disable_tqdm=False - + disable_tqdm=False, + tokenizer_class=None, + use_fast=False, + tokenizer_args=None, ): """ Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by @@ -191,9 +193,18 @@ def load( :type num_processes: int :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing) :type disable_tqdm: bool + :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`) + :type tokenizer_class: str + :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or + use the Python one (False). + :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method. + :type tokenizer_args: dict + :type use_fast: bool :return: An instance of the Inferencer. """ + if tokenizer_args is None: + tokenizer_args = {} device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None) name = os.path.basename(model_name_or_path) @@ -221,7 +232,11 @@ def load( model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type) config = AutoConfig.from_pretrained(model_name_or_path) - tokenizer = Tokenizer.load(model_name_or_path) + tokenizer = Tokenizer.load(model_name_or_path, + tokenizer_class=tokenizer_class, + use_fast=use_fast, + **tokenizer_args, + ) # TODO infer task_type automatically from config (if possible) if task_type == "question_answering": diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index e0b125cb1..c5c36df8d 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -24,10 +24,10 @@ import numpy as np from transformers.tokenization_albert import AlbertTokenizer -from transformers.tokenization_bert import BertTokenizer, load_vocab -from transformers.tokenization_distilbert import DistilBertTokenizer -from transformers.tokenization_electra import ElectraTokenizer -from transformers.tokenization_roberta import RobertaTokenizer +from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast, load_vocab +from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from transformers.tokenization_electra import ElectraTokenizer, ElectraTokenizerFast +from transformers.tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer @@ -48,7 +48,7 @@ class Tokenizer: """ @classmethod - def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): + def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs): """ Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from `pretrained_model_name_or_path` or define it manually via `tokenizer_class`. 
@@ -57,6 +57,9 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): :type pretrained_model_name_or_path: str :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`) :type tokenizer_class: str + :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or + use the Python one (False). + :type use_fast: bool :param kwargs: :return: Tokenizer """ @@ -98,15 +101,27 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs): elif tokenizer_class == "XLMRobertaTokenizer": ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "RobertaTokenizer": - ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "DistilBertTokenizer": - ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "BertTokenizer": - ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "XLNetTokenizer": ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs) elif tokenizer_class == "ElectraTokenizer": - ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) + if use_fast: + ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs) + else: + ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "EmbeddingTokenizer": ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs) elif tokenizer_class == "CamembertTokenizer": From a433483ed80817f0fea298d49d819e4db3833e16 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 14:14:54 +0200 Subject: [PATCH 02/30] Hand merge tests from PR #205 --- test/test_tokenization.py | 65 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index ec73773c1..29977577f 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -90,6 +90,45 @@ def test_truncate_sequences(caplog): assert len(trunc_a) + len(trunc_b) + tokenizer.num_special_tokens_to_add(pair=True) == max_seq_len +def test_fast_tokenizer(caplog): + fast_tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=True) + tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=False) + + texts = [ + "This is a sentence", + "Der entscheidende Pass", + "This is a sentence with multiple spaces", + "力加勝北区ᴵᴺᵀᵃছজটডণত", + "Thiso text is included tolod makelio sure Unicodeel is handled properly:", + "This is a sentence...", + "Let's see all on this text and. !23# neverseenwordspossible", + """This is a sentence. 
+ With linebreak""", + """Sentence with multiple + + + newlines + """, + "and another one\n\n\nwithout space", + "This is a sentence with tab", + "This is a sentence with multiple tabs", + ] + for text in texts: + + # plain tokenize function + tokenized = tokenizer.tokenize(text) + fast_tokenized = fast_tokenizer.tokenize(text) + + assert tokenized == fast_tokenized + + # our tokenizer with metadata on "whitespace tokenized words" + tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer) + fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer) + + # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words" + assert tokenized_meta == fast_tokenized_meta, f"Failed using {tokenizer.__class__.__name__}" + + def test_all_tokenizer_on_special_cases(caplog): caplog.set_level(logging.CRITICAL) @@ -173,5 +212,31 @@ def test_bert_custom_vocab(caplog): assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] +def test_fast_bert_custom_vocab(caplog): + caplog.set_level(logging.CRITICAL) + + lang_model = "bert-base-cased" + + tokenizer = Tokenizer.load( + pretrained_model_name_or_path=lang_model, + do_lower_case=False, use_fast=True + ) + + #deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") + tokenizer.add_tokens(new_tokens=["neverseentokens"]) + + basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" + + # original tokenizer from transformer repo + tokenized = tokenizer.tokenize(basic_text) + assert tokenized == ['Some', 'Text', 'with', 'neverseentokens', 'plus', '!', '215', '?', '#', '.', 'and', 'a', 'combined', '-', 'token', '_', 'with', '/', 'ch', '##ars'] + + # ours with metadata + tokenized_meta = tokenize_with_metadata(text=basic_text, tokenizer=tokenizer) + assert tokenized_meta["tokens"] == tokenized + assert tokenized_meta["offsets"] == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] + assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From c20c5db7f2e62a129f4ccca6f6822fb52aee7b44 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 17:44:18 +0200 Subject: [PATCH 03/30] test_inferencer_with_fast_bert_tokenizer --- test/test_inference.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_inference.py b/test/test_inference.py index afefc8644..39647cfb0 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -1,5 +1,7 @@ import pytest import numpy as np +import transformers + from farm.infer import Inferencer @@ -95,5 +97,12 @@ def test_embeddings_extraction(num_processes): assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist'] assert np.isclose(result[0]["vec"][0], 1.50174605e-02) + +def test_inferencer_with_fast_bert_tokenizer(): + model = Inferencer.load("bert-base-german-cased", task_type='text_classification', use_fast=True) + tokenizer = model.processor.tokenizer + assert type(tokenizer) is transformers.tokenization_bert.BertTokenizerFast + + if __name__ == "__main__": test_embeddings_extraction() From 5f2b5ee70a7a95300963ec02cd7ab12c65a4aec6 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: 
Sat, 1 Aug 2020 17:46:34 +0200 Subject: [PATCH 04/30] test_fast_bert_tokenizer --- test/test_tokenization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 29977577f..254c0b2a9 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -1,6 +1,6 @@ import logging from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences -from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer +from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer import re @@ -238,5 +238,12 @@ def test_fast_bert_custom_vocab(caplog): assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False] +def test_fast_bert_tokenizer(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True) + assert type(tokenizer) is BertTokenizerFast + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From fa3bd679713836b53823e6cbeae32eef59b24d2d Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 18:17:42 +0200 Subject: [PATCH 05/30] test_fast_bert_tokenizer_strip_accents --- test/test_tokenization.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 254c0b2a9..1ad4e41ac 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -245,5 +245,16 @@ def test_fast_bert_tokenizer(caplog): assert type(tokenizer) is BertTokenizerFast +def test_fast_bert_tokenizer_strip_accents(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("dbmdz/bert-base-german-uncased", + use_fast=True, + strip_accents=False) + assert type(tokenizer) is BertTokenizerFast + assert tokenizer._tokenizer._parameters['strip_accents'] is False + assert tokenizer._tokenizer._parameters['lowercase'] + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From cd7298ce27b708075a1d0e001dc2a5a816d6e16e Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 18:31:05 +0200 Subject: [PATCH 06/30] test_fast_electra_tokenizer --- test/test_tokenization.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 1ad4e41ac..4ca3b5a4d 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -1,6 +1,8 @@ import logging from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer +from transformers import ElectraTokenizerFast + import re @@ -256,5 +258,13 @@ def test_fast_bert_tokenizer_strip_accents(caplog): assert tokenizer._tokenizer._parameters['lowercase'] +def test_fast_electra_tokenizer(caplog): + caplog.set_level(logging.CRITICAL) + + tokenizer = Tokenizer.load("dbmdz/electra-base-german-europeana-cased-discriminator", + use_fast=True) + assert type(tokenizer) is ElectraTokenizerFast + + if __name__ == "__main__": test_all_tokenizer_on_special_cases() From 01e5ffb9c0d514e7ecdc2379ee1ebc66beb3e4d9 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sat, 1 Aug 2020 19:09:02 +0200 Subject: [PATCH 07/30] Fix OOM issue of CI - set num_processes=0 for Inferencer --- test/test_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_inference.py 
b/test/test_inference.py
index 39647cfb0..c05c4211d 100644
--- a/test/test_inference.py
+++ b/test/test_inference.py
@@ -99,7 +99,8 @@

 def test_inferencer_with_fast_bert_tokenizer():
-    model = Inferencer.load("bert-base-german-cased", task_type='text_classification', use_fast=True)
+    model = Inferencer.load("bert-base-german-cased", task_type='text_classification',
+                            use_fast=True, num_processes=0)
     tokenizer = model.processor.tokenizer
     assert type(tokenizer) is transformers.tokenization_bert.BertTokenizerFast

From 42f345f370f5b78836322baf192537b009e1c422 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sun, 2 Aug 2020 14:31:22 +0200
Subject: [PATCH 08/30] Extend test for fast tokenizer - electra - roberta

---
 test/test_tokenization.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index 4ca3b5a4d..f17c46b04 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -1,9 +1,10 @@
 import logging
-from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences
+import pytest
+import re
 from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
-from transformers import ElectraTokenizerFast
+from transformers import ElectraTokenizerFast, RobertaTokenizerFast

-import re
+from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences

 def test_basic_loading(caplog):
@@ -240,11 +241,16 @@ def test_fast_bert_custom_vocab(caplog):
     assert tokenized_meta["start_of_word"] == [True, True, True, True, True, True, False, False, False, False, True, True, True, False, False, False, False, False, False, False]

-def test_fast_bert_tokenizer(caplog):
+@pytest.mark.parametrize("model_name, tokenizer_type", [
+    ("bert-base-german-cased", BertTokenizerFast),
+    ("google/electra-small-discriminator", ElectraTokenizerFast),
+    ("distilroberta-base", RobertaTokenizerFast),
+    ])
+def test_fast_tokenizer_type(caplog, model_name, tokenizer_type):
     caplog.set_level(logging.CRITICAL)

-    tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True)
-    assert type(tokenizer) is BertTokenizerFast
+    tokenizer = Tokenizer.load(model_name, use_fast=True)
+    assert type(tokenizer) is tokenizer_type

 def test_fast_bert_tokenizer_strip_accents(caplog):

From 9b021ff36e86cb7e4b12c4fd45a112e27599e3f4 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sun, 2 Aug 2020 14:44:58 +0200
Subject: [PATCH 09/30] test_fast_tokenizer for more model types - electra - roberta

---
 test/test_tokenization.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index f17c46b04..2bfb8a3b9 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -93,9 +93,13 @@ def test_truncate_sequences(caplog):
     assert len(trunc_a) + len(trunc_b) + tokenizer.num_special_tokens_to_add(pair=True) == max_seq_len

-def test_fast_tokenizer(caplog):
-    fast_tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=True)
-    tokenizer = Tokenizer.load("bert-base-cased", lower_case=False, use_fast=False)
+@pytest.mark.parametrize("model_name", ["bert-base-german-cased",
+                                        "google/electra-small-discriminator",
+                                        "distilroberta-base",
+                                        ])
+def test_fast_tokenizer(caplog, model_name):
+    fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
+    tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
texts = [ "This is a sentence", From 86d7fd57558ee8aac0940615f99a949e9ab67ed1 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 15:19:55 +0200 Subject: [PATCH 10/30] Fix tokenize_with_metadata --- farm/modeling/tokenization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index c5c36df8d..c3a578f47 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -309,7 +309,7 @@ def _words_to_tokens(words, word_offsets, tokenizer): elif len(tokens) == 0: tokens_word = tokenizer.tokenize(w) else: - if type(tokenizer) == RobertaTokenizer: + if (type(tokenizer) == RobertaTokenizer) or (type(tokenizer) == RobertaTokenizerFast): tokens_word = tokenizer.tokenize(w, add_prefix_space=True) else: tokens_word = tokenizer.tokenize(w) From a8f4638984422a09410f13a37133b58dd9ba1a40 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 19:15:33 +0200 Subject: [PATCH 11/30] Split tokenizer tests --- test/test_tokenization.py | 51 ++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 2bfb8a3b9..14a1fbdcc 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -7,6 +7,27 @@ from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences +TEXTS = [ + "This is a sentence", + "Der entscheidende Pass", + "This is a sentence with multiple spaces", + "力加勝北区ᴵᴺᵀᵃছজটডণত", + "Thiso text is included tolod makelio sure Unicodeel is handled properly:", + "This is a sentence...", + "Let's see all on this text and. !23# neverseenwordspossible", + """This is a sentence. + With linebreak""", + """Sentence with multiple + + + newlines + """, + "and another one\n\n\nwithout space", + "This is a sentence with tab", + "This is a sentence with multiple tabs", +] + + def test_basic_loading(caplog): caplog.set_level(logging.CRITICAL) tokenizer = Tokenizer.load( @@ -97,37 +118,23 @@ def test_truncate_sequences(caplog): "google/electra-small-discriminator", "distilroberta-base", ]) -def test_fast_tokenizer(caplog, model_name): +def test_fast_tokenizer_with_examples(caplog, model_name): fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) - texts = [ - "This is a sentence", - "Der entscheidende Pass", - "This is a sentence with multiple spaces", - "力加勝北区ᴵᴺᵀᵃছজটডণত", - "Thiso text is included tolod makelio sure Unicodeel is handled properly:", - "This is a sentence...", - "Let's see all on this text and. !23# neverseenwordspossible", - """This is a sentence. 
- With linebreak""", - """Sentence with multiple - - - newlines - """, - "and another one\n\n\nwithout space", - "This is a sentence with tab", - "This is a sentence with multiple tabs", - ] - for text in texts: - + for text in TEXTS: # plain tokenize function tokenized = tokenizer.tokenize(text) fast_tokenized = fast_tokenizer.tokenize(text) assert tokenized == fast_tokenized + +def test_fast_tokenizer_with_metadata_with_examples_(caplog, model_name): + fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) + tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) + + for text in TEXTS: # our tokenizer with metadata on "whitespace tokenized words" tokenized_meta = tokenize_with_metadata(text=text, tokenizer=tokenizer) fast_tokenized_meta = tokenize_with_metadata(text=text, tokenizer=fast_tokenizer) From cdccafaf964c20d0cfd8bea6cc1a02bc4106afa4 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 2 Aug 2020 19:25:29 +0200 Subject: [PATCH 12/30] Fix pytest params bug in test_tok --- test/test_tokenization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_tokenization.py b/test/test_tokenization.py index 14a1fbdcc..4685cde6e 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -130,7 +130,11 @@ def test_fast_tokenizer_with_examples(caplog, model_name): assert tokenized == fast_tokenized -def test_fast_tokenizer_with_metadata_with_examples_(caplog, model_name): +@pytest.mark.parametrize("model_name", ["bert-base-german-cased", + "google/electra-small-discriminator", + "distilroberta-base", + ]) +def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name): fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) From 47d4b6abfd266a56129f7c406d9ae79ac7b7b516 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Tue, 4 Aug 2020 09:53:54 +0200 Subject: [PATCH 13/30] Fix fast tokenizer usage --- farm/data_handler/input_features.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 0b8119b5b..aedf2ee0d 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -8,6 +8,8 @@ from dotmap import DotMap import numpy as np +from transformers.tokenization_utils_base import TruncationStrategy + from farm.data_handler.samples import Sample from farm.data_handler.utils import ( expand_labels, @@ -45,8 +47,10 @@ def sample_to_features_text( tokens_a, tokens_b, add_special_tokens=True, - truncation_strategy='do_not_truncate', - return_token_type_ids=True + truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, + return_token_type_ids=True, + max_length=max_seq_len, + is_pretokenized=True, ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -539,4 +543,4 @@ def _SQUAD_improve_answer_span( if text_span == tok_answer_text: return (new_start, new_end) - return (input_start, input_end) + return (input_start, input_end) \ No newline at end of file From 83180631f295362b39ad2875ebb73a1f072d05f7 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Tue, 4 Aug 2020 09:55:58 +0200 Subject: [PATCH 14/30] add missing newline eof --- farm/data_handler/input_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index aedf2ee0d..7afe9da6a 100644 --- a/farm/data_handler/input_features.py 
+++ b/farm/data_handler/input_features.py
@@ -543,4 +543,4 @@
     if text_span == tok_answer_text:
         return (new_start, new_end)

-    return (input_start, input_end)
\ No newline at end of file
+    return (input_start, input_end)

From 8c61e3b5c357b994be1c7b0eb472c03e538a1577 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 10:02:00 +0200
Subject: [PATCH 15/30] Add test fast tok. doc_classif.

---
 test/test_doc_classification.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_doc_classification.py b/test/test_doc_classification.py
index 23a4dbea0..ec54fe306 100644
--- a/test/test_doc_classification.py
+++ b/test/test_doc_classification.py
@@ -19,7 +19,8 @@
 @pytest.mark.parametrize("data_dir_path,text_column_name",
                          [("samples/doc_class", None),
                           ("samples/doc_class_other_text_column_name", "text_other")])
-def test_doc_classification(data_dir_path, text_column_name, caplog=None):
+@pytest.mark.parametrize("use_fast", [False, True])
+def test_doc_classification(data_dir_path, text_column_name, use_fast, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)

@@ -32,7 +33,9 @@
     tokenizer = Tokenizer.load(
         pretrained_model_name_or_path=lang_model,
-        do_lower_case=False)
+        do_lower_case=False,
+        use_fast=use_fast,
+    )

     tcp_params = dict(tokenizer=tokenizer,
                       max_seq_len=8,

From aec7d2d24a969155476649d0248ea5eac19b1eb8 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 13:29:12 +0200
Subject: [PATCH 16/30] Remove RobertaTokenizerFast

---
 farm/modeling/tokenization.py | 7 ++++---
 test/test_tokenization.py | 5 +----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index c3a578f47..8c1d646dc 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -27,7 +27,7 @@
 from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast, load_vocab
 from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
 from transformers.tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
-from transformers.tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
+from transformers.tokenization_roberta import RobertaTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
 from transformers.tokenization_xlnet import XLNetTokenizer
@@ -59,6 +59,7 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         :type tokenizer_class: str
         :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
             use the Python one (False).
+            TODO: Say which models support fast tokenizers.
         :type use_fast: bool
         :param kwargs:
         :return: Tokenizer
@@ -102,7 +103,7 @@
             ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "RobertaTokenizer":
             if use_fast:
-                ret = RobertaTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+                raise ValueError('RobertaTokenizerFast is not supported!')
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "DistilBertTokenizer":
@@ -309,7 +310,7 @@ def _words_to_tokens(words, word_offsets, tokenizer):
         elif len(tokens) == 0:
             tokens_word = tokenizer.tokenize(w)
         else:
-            if (type(tokenizer) == RobertaTokenizer) or (type(tokenizer) == RobertaTokenizerFast):
+            if type(tokenizer) == RobertaTokenizer:
                 tokens_word = tokenizer.tokenize(w, add_prefix_space=True)
             else:
                 tokens_word = tokenizer.tokenize(w)
diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index 4685cde6e..c8959e8f1 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -2,7 +2,7 @@
 import pytest
 import re
 from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
-from transformers import ElectraTokenizerFast, RobertaTokenizerFast
+from transformers import ElectraTokenizerFast

 from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences

@@ -116,7 +116,6 @@
 @pytest.mark.parametrize("model_name", ["bert-base-german-cased",
                                         "google/electra-small-discriminator",
-                                        "distilroberta-base",
                                         ])
 def test_fast_tokenizer_with_examples(caplog, model_name):
     fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
     tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
@@ -132,7 +131,6 @@
 @pytest.mark.parametrize("model_name", ["bert-base-german-cased",
                                         "google/electra-small-discriminator",
-                                        "distilroberta-base",
                                         ])
 def test_fast_tokenizer_with_metadata_with_examples(caplog, model_name):
     fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True)
     tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False)
@@ -259,7 +257,6 @@
 @pytest.mark.parametrize("model_name, tokenizer_type", [
     ("bert-base-german-cased", BertTokenizerFast),
     ("google/electra-small-discriminator", ElectraTokenizerFast),
-    ("distilroberta-base", RobertaTokenizerFast),
     ])
 def test_fast_tokenizer_type(caplog, model_name, tokenizer_type):
     caplog.set_level(logging.CRITICAL)

From 75ea9dd9d88d885b786ab25c28b958fce04d982a Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 21:14:08 +0200
Subject: [PATCH 17/30] Fix Tokenizer load and save.
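A rough sketch of the load/save round trip this patch repairs. Model name and
save path are illustrative only, not taken from the codebase:

    from pathlib import Path
    from farm.modeling.tokenization import Tokenizer

    tokenizer = Tokenizer.load("bert-base-german-cased", use_fast=True)
    save_dir = Path("testsave/tokenizer")
    # Fast tokenizers only accept plain strings as save targets, which is
    # why Processor.save() below wraps the Path in str() before delegating.
    tokenizer.save_pretrained(str(save_dir))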
---
 farm/data_handler/input_features.py | 4 +---
 farm/data_handler/processor.py | 6 +++++-
 farm/modeling/tokenization.py | 9 +++++----
 test/test_doc_classification.py | 2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 7afe9da6a..50e5ca4a5 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -8,8 +8,6 @@
 from dotmap import DotMap
 import numpy as np

-from transformers.tokenization_utils_base import TruncationStrategy
-
 from farm.data_handler.samples import Sample
 from farm.data_handler.utils import (
     expand_labels,
@@ -47,7 +45,7 @@ def sample_to_features_text(
         tokens_a,
         tokens_b,
         add_special_tokens=True,
-        truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
+        truncation=False,  # truncation_strategy is depricated
         return_token_type_ids=True,
         max_length=max_seq_len,
         is_pretokenized=True,
diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py
index c8aa46c29..6ff6116e1 100644
--- a/farm/data_handler/processor.py
+++ b/farm/data_handler/processor.py
@@ -231,7 +231,11 @@ def save(self, save_dir):
         config = self.generate_config()
         # save tokenizer incl. attributes
         config["tokenizer"] = self.tokenizer.__class__.__name__
-        self.tokenizer.save_pretrained(save_dir)
+
+        # Because the fast tokenizers expect a str and not Path
+        # always convert Path to str here.
+        self.tokenizer.save_pretrained(str(save_dir))
+
         # save processor
         config["processor"] = self.__class__.__name__
         output_config_file = Path(save_dir) / "processor_config.json"
diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index 8c1d646dc..084767df5 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -97,28 +97,29 @@
                          f"XLNetTokenizer.")
         logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
         # return appropriate tokenizer object
+        ret = None
         if tokenizer_class == "AlbertTokenizer":
             ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif tokenizer_class == "XLMRobertaTokenizer":
             ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "RobertaTokenizer":
+        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 raise ValueError('RobertaTokenizerFast is not supported!')
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "DistilBertTokenizer":
+        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif tokenizer_class == "BertTokenizer":
+        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "XLNetTokenizer":
             ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
-        elif tokenizer_class == "ElectraTokenizer":
+        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokenizer we use "in"
             if use_fast:
                 ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
diff --git a/test/test_doc_classification.py b/test/test_doc_classification.py
index ec54fe306..dcfd31293 100644
--- a/test/test_doc_classification.py
+++ b/test/test_doc_classification.py
@@ -87,7 +87,7 @@
     trainer.train()

-    save_dir = Path("testsave/doc_class")
+    save_dir = Path("testsave/doc_class_bert")
     model.save(save_dir)
     processor.save(save_dir)

From 2d2cd00dc69638c932bcc8673db66874e00dd3b1 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Tue, 4 Aug 2020 21:57:07 +0200
Subject: [PATCH 18/30] Fix typo

---
 farm/data_handler/input_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 50e5ca4a5..dde46e71e 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -45,7 +45,7 @@ def sample_to_features_text(
         tokens_a,
         tokens_b,
         add_special_tokens=True,
-        truncation=False,  # truncation_strategy is depricated
+        truncation=False,  # truncation_strategy is deprecated
         return_token_type_ids=True,
         max_length=max_seq_len,
         is_pretokenized=True,

From 8afa136c5015cfcae6996a181bf2c5de6a7755a7 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Wed, 5 Aug 2020 17:12:35 +0200
Subject: [PATCH 19/30] Improve test test_embeddings_extraction

- add shape assert
- fix embedding assert

---
 test/test_inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_inference.py b/test/test_inference.py
index c05c4211d..9815b1405 100644
--- a/test/test_inference.py
+++ b/test/test_inference.py
@@ -95,7 +95,8 @@
     # Get embeddings for input text (you can vary the strategy and layer)
     result = model.inference_from_dicts(dicts=basic_texts)
     assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist']
-    assert np.isclose(result[0]["vec"][0], 1.50174605e-02)
+    assert result[0]["vec"].shape == (768,)
+    assert np.isclose(result[0]["vec"][0], -0.032460204579613426)

From 042fde0c9ca5d621f7b4c4fa3b441adfdc289c88 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Wed, 5 Aug 2020 17:37:57 +0200
Subject: [PATCH 20/30] Docstring for fast tokenizers improved

---
 farm/modeling/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index 084767df5..82f8adeed 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -59,7 +59,7 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         :type tokenizer_class: str
         :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
             use the Python one (False).
-            TODO: Say which models support fast tokenizers.
+            Only DistilBERT, BERT and Electra fast tokenizers are supported.
:type use_fast: bool :param kwargs: :return: Tokenizer From 7ed385fe4b28b1dc3179afb7fbc5912f13f224ba Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:44:10 +0200 Subject: [PATCH 21/30] tokenizer_args docstring --- farm/infer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/farm/infer.py b/farm/infer.py index 180e6a577..fcbef006e 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -198,6 +198,8 @@ def load( :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False). :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method. + See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation + on `Hugging Face Transformers `_. :type tokenizer_args: dict :type use_fast: bool :return: An instance of the Inferencer. From d4eb59ccca931d94e5a77023b21b40c2b6e2cffc Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:46:44 +0200 Subject: [PATCH 22/30] Extend test_embeddings_extraction to fast tok. --- test/test_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_inference.py b/test/test_inference.py index 9815b1405..9a219f82d 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -75,7 +75,8 @@ def test_qa_format_and_results(adaptive_model_qa, streaming, multiprocessing_chu @pytest.mark.parametrize("num_processes", [0], scope="session") -def test_embeddings_extraction(num_processes): +@pytest.mark.parametrize("use_fast", [False, True]) +def test_embeddings_extraction(num_processes, use_fast): # Input basic_texts = [ {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, @@ -90,6 +91,7 @@ def test_embeddings_extraction(num_processes): batch_size=5, extraction_strategy="reduce_mean", extraction_layer=-2, + use_fast=use_fast, num_processes=num_processes) # Get embeddings for input text (you can vary the strategy and layer) From 4f87604200bb14c50d0b0aabeeccdec249f596d5 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Wed, 5 Aug 2020 17:49:47 +0200 Subject: [PATCH 23/30] extend test_ner with fast tok. 
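In sketch form, the fast/slow loading path this test now exercises. The model name
mirrors the test below; the helper name is hypothetical, not part of FARM:

    from farm.modeling.tokenization import Tokenizer

    def load_ner_tokenizer(use_fast: bool):
        # Identical call for the slow (Python) and fast (Rust-backed) variant;
        # only the use_fast flag differs.
        return Tokenizer.load(
            pretrained_model_name_or_path="distilbert-base-german-cased",
            do_lower_case=False,
            use_fast=use_fast,
        )

    slow = load_ner_tokenizer(use_fast=False)
    fast = load_ner_tokenizer(use_fast=True)
    assert slow.tokenize("Paris is a town in France.") == fast.tokenize("Paris is a town in France.")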
--- test/test_ner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_ner.py b/test/test_ner.py index fe69452f1..1285331c8 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -1,4 +1,5 @@ from pathlib import Path +import pytest import numpy as np @@ -16,7 +17,8 @@ import logging -def test_ner(caplog): +@pytest.mark.parametrize("use_fast", [False, True]) +def test_ner(caplog, use_fast): if caplog: caplog.set_level(logging.CRITICAL) @@ -28,7 +30,8 @@ def test_ner(caplog): lang_model = "distilbert-base-german-cased" tokenizer = Tokenizer.load( - pretrained_model_name_or_path=lang_model, do_lower_case=False + pretrained_model_name_or_path=lang_model, do_lower_case=False, + use_fast=use_fast, ) ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", From bc7abcace5063cced9a5a2e592948459f2633383 Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Thu, 6 Aug 2020 12:08:36 +0200 Subject: [PATCH 24/30] fix sample_to_features_ner for fast tokenizer --- farm/data_handler/input_features.py | 5 +++-- test/test_ner.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index dde46e71e..3500046a1 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -141,9 +141,10 @@ def samples_to_features_ner( inputs = tokenizer.encode_plus(text=tokens, text_pair=None, add_special_tokens=True, - truncation_strategy='do_not_truncate', # We've already truncated our tokens before + truncation=False, return_special_tokens_mask=True, - return_token_type_ids=True + return_token_type_ids=True, + is_pretokenized=True ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] diff --git a/test/test_ner.py b/test/test_ner.py index 1285331c8..9e34407a9 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -101,4 +101,4 @@ def test_ner(caplog, use_fast): if __name__ == "__main__": - test_ner(None) + test_ner(None, True) From da9c2f520d1f86c14fed4121204f616b950cb50f Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Thu, 6 Aug 2020 13:17:50 +0200 Subject: [PATCH 25/30] temp fix for is_pretokenized until fixed upstream --- farm/data_handler/input_features.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 3500046a1..f886a0200 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -41,6 +41,12 @@ def sample_to_features_text( tokens_a = sample.tokenized["tokens"] tokens_b = sample.tokenized.get("tokens_b", None) + # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it + # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) + if tokenizer.is_fast: + is_pretokenized = True + else: + is_pretokenized = False inputs = tokenizer.encode_plus( tokens_a, tokens_b, @@ -48,7 +54,7 @@ def sample_to_features_text( truncation=False, # truncation_strategy is deprecated return_token_type_ids=True, max_length=max_seq_len, - is_pretokenized=True, + is_pretokenized=is_pretokenized, ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -138,13 +144,20 @@ def samples_to_features_ner( """ tokens = sample.tokenized["tokens"] + + # is_pretokenized seems to be broken upstream for slow tokenizers, while fast 
tokenizers rely on it + # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) + if tokenizer.is_fast: + is_pretokenized = True + else: + is_pretokenized = False inputs = tokenizer.encode_plus(text=tokens, text_pair=None, add_special_tokens=True, truncation=False, return_special_tokens_mask=True, return_token_type_ids=True, - is_pretokenized=True + is_pretokenized=is_pretokenized ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] From 19cc2110e5b933a60616884684295ae3b4f3bf7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 25 Aug 2020 18:00:18 +0200 Subject: [PATCH 26/30] Make use of fast tokenizer possible + fix bug in offset calculation --- farm/data_handler/input_features.py | 35 +++++++++-------- farm/modeling/tokenization.py | 58 +++++++++++++++++++---------- test/test_inference.py | 2 +- test/test_ner.py | 2 +- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index f886a0200..fd45c6b37 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -36,26 +36,25 @@ def sample_to_features_text( :rtype: list """ - #TODO It might be cleaner to adjust the data structure in sample.tokenized - # Verify if this current quickfix really works for pairs - tokens_a = sample.tokenized["tokens"] - tokens_b = sample.tokenized.get("tokens_b", None) - - # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it - # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) if tokenizer.is_fast: - is_pretokenized = True + text = sample.clear_text["text"] + # Here, we tokenize the sample for the second time... 
+ inputs = tokenizer(text, return_token_type_ids=True, max_length=max_seq_len) else: - is_pretokenized = False - inputs = tokenizer.encode_plus( - tokens_a, - tokens_b, - add_special_tokens=True, - truncation=False, # truncation_strategy is deprecated - return_token_type_ids=True, - max_length=max_seq_len, - is_pretokenized=is_pretokenized, - ) + # TODO It might be cleaner to adjust the data structure in sample.tokenized + # Verify if this current quickfix really works for pairs + tokens_a = sample.tokenized["tokens"] + tokens_b = sample.tokenized.get("tokens_b", None) + + inputs = tokenizer.encode_plus( + tokens_a, + tokens_b, + add_special_tokens=True, + truncation=False, # truncation_strategy is deprecated + return_token_type_ids=True, + max_length=max_seq_len, + is_pretokenized=False, + ) input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"] diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py index 82f8adeed..b78de0835 100644 --- a/farm/modeling/tokenization.py +++ b/farm/modeling/tokenization.py @@ -258,25 +258,41 @@ def tokenize_with_metadata(text, tokenizer): :rtype: dict """ + # Fast Tokenizers return offsets, so we don't need to calculate them ourselves + if tokenizer.is_fast: + tokenized = tokenizer(text, return_offsets_mapping=True, return_special_tokens_mask=True) + tokens = [] + offsets = [] + start_of_word = [] + previous_token_end = -1 + for token_id, is_special_token, offset in zip(tokenized["input_ids"], + tokenized["special_tokens_mask"], + tokenized["offset_mapping"]): + if is_special_token == 0: + tokens.append(tokenizer.decode([token_id])) + offsets.append(offset[0]) + start_of_word.append(True if offset[0] != previous_token_end else False) + previous_token_end = offset[1] + tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} + else: + # normalize all other whitespace characters to " " + # Note: using text.split() directly would destroy the offset, + # since \n\n\n would be treated similarly as a single \n + text = re.sub(r"\s", " ", text) + # split text into "words" (here: simple whitespace tokenizer). + words = text.split(" ") + word_offsets = [] + cumulated = 0 + for idx, word in enumerate(words): + word_offsets.append(cumulated) + cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer + + # split "words" into "subword tokens" + tokens, offsets, start_of_word = _words_to_tokens( + words, word_offsets, tokenizer + ) - # normalize all other whitespace characters to " " - # Note: using text.split() directly would destroy the offset, - # since \n\n\n would be treated similarly as a single \n - text = re.sub(r"\s", " ", text) - # split text into "words" (here: simple whitespace tokenizer). - words = text.split(" ") - word_offsets = [] - cumulated = 0 - for idx, word in enumerate(words): - word_offsets.append(cumulated) - cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer - - # split "words"into "subword tokens" - tokens, offsets, start_of_word = _words_to_tokens( - words, word_offsets, tokenizer - ) - - tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} + tokenized = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} return tokenized @@ -327,7 +343,11 @@ def _words_to_tokens(words, word_offsets, tokenizer): # Depending on the tokenizer type special chars are added to distinguish tokens with preceeding # whitespace (=> "start of a word"). 
We need to get rid of these to calculate the original length of the token orig_tok = re.sub(SPECIAL_TOKENIZER_CHARS, "", tok) - w_off += len(orig_tok) + # Don't use length of unk token for offset calculation + if orig_tok == tokenizer.special_tokens_map["unk_token"]: + w_off += 1 + else: + w_off += len(orig_tok) if first_tok: start_of_word.append(True) first_tok = False diff --git a/test/test_inference.py b/test/test_inference.py index 9a219f82d..eacfaa5bb 100644 --- a/test/test_inference.py +++ b/test/test_inference.py @@ -98,7 +98,7 @@ def test_embeddings_extraction(num_processes, use_fast): result = model.inference_from_dicts(dicts=basic_texts) assert result[0]["context"] == ['Schar', '##tau', 'sagte', 'dem', 'Tages', '##spiegel', ',', 'dass', 'Fischer', 'ein', 'Id', '##iot', 'ist'] assert result[0]["vec"].shape == (768,) - assert np.isclose(result[0]["vec"][0], -0.032460204579613426) + assert np.isclose(result[0]["vec"][0], 0.01501756374325071) def test_inferencer_with_fast_bert_tokenizer(): diff --git a/test/test_ner.py b/test/test_ner.py index 9e34407a9..12aa9dbe7 100644 --- a/test/test_ner.py +++ b/test/test_ner.py @@ -89,7 +89,7 @@ def test_ner(caplog, use_fast): basic_texts = [ {"text": "Paris is a town in France."}, ] - model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner") + model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner", use_fast=use_fast) # labels arent correctly inserted from transformers # They are converted to LABEL_1 ... LABEL_N # For the inference result to contain predictions we need them in IOB NER format From 7e75de15ac2da4da1bbf443f1448ea67716552fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Mon, 31 Aug 2020 17:57:41 +0200 Subject: [PATCH 27/30] Make fast tokenization possible with NER, LM and QA --- farm/data_handler/input_features.py | 115 +++++++++++++++++++++------- 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index fd45c6b37..4bfa38cec 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -4,6 +4,7 @@ import logging +import re import collections from dotmap import DotMap import numpy as np @@ -39,7 +40,14 @@ def sample_to_features_text( if tokenizer.is_fast: text = sample.clear_text["text"] # Here, we tokenize the sample for the second time... - inputs = tokenizer(text, return_token_type_ids=True, max_length=max_seq_len) + inputs = tokenizer(text, + return_token_type_ids=True, + max_length=max_seq_len, + return_special_tokens_mask=True) + + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") else: # TODO It might be cleaner to adjust the data structure in sample.tokenized # Verify if this current quickfix really works for pairs @@ -147,17 +155,25 @@ def samples_to_features_ner( # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046) if tokenizer.is_fast: - is_pretokenized = True + text = sample.clear_text["text"] + # Here, we tokenize the sample for the second time. 
+ inputs = tokenizer(text, + return_token_type_ids=True, + max_length=max_seq_len, + return_special_tokens_mask=True) + + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") else: - is_pretokenized = False - inputs = tokenizer.encode_plus(text=tokens, - text_pair=None, - add_special_tokens=True, - truncation=False, - return_special_tokens_mask=True, - return_token_type_ids=True, - is_pretokenized=is_pretokenized - ) + inputs = tokenizer.encode_plus(text=tokens, + text_pair=None, + add_special_tokens=True, + truncation=False, + return_special_tokens_mask=True, + return_token_type_ids=True, + is_pretokenized=False + ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"] @@ -246,6 +262,14 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab, token_groups=sample.tokenized["text_b"]["start_of_word"]) + + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + tokens_a = " ".join(tokens_a) + tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_b = " ".join(tokens_b) + tokens_b = re.sub(r"^(##|Ġ|▁)", "", tokens_b) + # convert lm labels to ids t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] t2_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t2_label] @@ -261,18 +285,36 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T tokens_b = None tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab, token_groups=sample.tokenized["text_a"]["start_of_word"]) + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + tokens_a = " ".join(tokens_a) + tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + # convert lm labels to ids lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] - # encode string tokens to input_ids and add special tokens - inputs = tokenizer.encode_plus(text=tokens_a, - text_pair=tokens_b, - add_special_tokens=True, - truncation_strategy='do_not_truncate', - # We've already truncated our tokens before - return_special_tokens_mask=True, - return_token_type_ids=True - ) + if tokenizer.is_fast: + inputs = tokenizer(text=tokens_a, + text_pair=tokens_b, + add_special_tokens=True, + return_special_tokens_mask=True, + return_token_type_ids=True) + + seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0 + if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \ + (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") + else: + # encode string tokens to input_ids and add special tokens + inputs = tokenizer.encode_plus(text=tokens_a, + text_pair=tokens_b, + add_special_tokens=True, + truncation_strategy='do_not_truncate', + # We've already truncated our tokens before + return_special_tokens_mask=True, + return_token_type_ids=True + ) input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs[ "special_tokens_mask"] @@ -373,12 +415,33 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks 
# (question_len_t + passage_len_t + n_special_tokens). This may be less than max_seq_len but will not be greater # than max_seq_len since truncation was already performed when the document was chunked into passages # (c.f. create_samples_squad() ) - encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], - text_pair=sample.tokenized["passage_tokens"], - add_special_tokens=True, - truncation_strategy='do_not_truncate', - return_token_type_ids=True, - return_tensors=None) + + if tokenizer.is_fast: + # Detokenize input as fast tokenizer can't handle tokenized input + question_tokens = " ".join(question_tokens) + question_tokens = re.sub(r"^(##|Ġ|▁)", "", question_tokens) + passage_tokens = " ".join(passage_tokens) + passage_tokens = re.sub(r"^(##|Ġ|▁)", "", passage_tokens) + + encoded = tokenizer(text=question_tokens, + text_pair=passage_tokens, + add_special_tokens=True, + return_special_tokens_mask=True, + return_token_type_ids=True) + + if (len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)) != \ + (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])): + logger.error("FastTokenizer produced different number of tokens in input_features.py and" + "tokenize_with_metadata.py") + + else: + encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], + text_pair=sample.tokenized["passage_tokens"], + add_special_tokens=True, + truncation_strategy='do_not_truncate', + return_token_type_ids=True, + return_tensors=None) + input_ids = encoded["input_ids"] segment_ids = encoded["token_type_ids"] From eb466292ebae086d1248339a4f05e2bffe72d3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 1 Sep 2020 11:10:35 +0200 Subject: [PATCH 28/30] Change error messages --- farm/data_handler/input_features.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 4bfa38cec..84308c3cd 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -46,8 +46,10 @@ def sample_to_features_text( return_special_tokens_mask=True) if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: # TODO It might be cleaner to adjust the data structure in sample.tokenized # Verify if this current quickfix really works for pairs @@ -163,8 +165,10 @@ def samples_to_features_ner( return_special_tokens_mask=True) if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: inputs = tokenizer.encode_plus(text=tokens, text_pair=None, @@ -303,8 +307,10 
@@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0 if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \ (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: # encode string tokens to input_ids and add special tokens inputs = tokenizer.encode_plus(text=tokens_a, @@ -431,8 +437,10 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks if (len(encoded["input_ids"]) - encoded["special_tokens_mask"].count(1)) != \ (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])): - logger.error("FastTokenizer produced different number of tokens in input_features.py and" - "tokenize_with_metadata.py") + logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to " + f"{len(encoded['input_ids']) - encoded['special_tokens_mask'].count(1)} tokens, which differs " + f"from number of tokens produced in tokenize_with_metadata.py") + logger.error("Further processing is likely to be wrong.") else: encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"], From 06d51c026e643ce436506bea0ec0362dcf364289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogdan=20Kosti=C4=87?= Date: Tue, 1 Sep 2020 15:35:57 +0200 Subject: [PATCH 29/30] Add tests --- farm/data_handler/input_features.py | 18 ++++++++++------ farm/modeling/tokenization.py | 33 +++++++++++++++++++++++------ test/conftest.py | 16 ++++++++------ test/test_question_answering.py | 6 +++++- test/test_tokenization.py | 23 ++++++++++++++++++-- 5 files changed, 74 insertions(+), 22 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 84308c3cd..5b956d739 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -270,9 +270,9 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input tokens_a = " ".join(tokens_a) - tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a) tokens_b = " ".join(tokens_b) - tokens_b = re.sub(r"^(##|Ġ|▁)", "", tokens_b) + tokens_b = re.sub(r"(^|\s)(##)", "", tokens_b) # convert lm labels to ids t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] @@ -292,7 +292,7 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input tokens_a = " ".join(tokens_a) - tokens_a = re.sub(r"^(##|Ġ|▁)", "", tokens_a) + tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a) # convert lm labels to ids lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label] @@ -425,9 +425,9 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks if tokenizer.is_fast: # Detokenize input as fast tokenizer can't handle tokenized input question_tokens = " ".join(question_tokens) - 
question_tokens = re.sub(r"^(##|Ġ|▁)", "", question_tokens)
+            question_tokens = re.sub(r"(^|\s)(##)", "", question_tokens)
             passage_tokens = " ".join(passage_tokens)
-            passage_tokens = re.sub(r"^(##|Ġ|▁)", "", passage_tokens)
+            passage_tokens = re.sub(r"(^|\s)(##)", "", passage_tokens)
 
             encoded = tokenizer(text=question_tokens,
                                 text_pair=passage_tokens,
                                 add_special_tokens=True,
                                 return_special_tokens_mask=True,
                                 return_token_type_ids=True)
@@ -553,8 +553,12 @@ def combine_vecs(question_vec, passage_vec, tokenizer, spec_tok_val=-1):
 
     # Join question_label_vec and passage_label_vec and add slots for special tokens
     vec = tokenizer.build_inputs_with_special_tokens(token_ids_0=question_vec,
                                                      token_ids_1=passage_vec)
-    spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=question_vec,
-                                                       token_ids_1=passage_vec)
+    if tokenizer.is_fast:
+        spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=vec,
+                                                           already_has_special_tokens=True)
+    else:
+        spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=question_vec,
+                                                           token_ids_1=passage_vec)
 
     # If a value in vec corresponds to a special token, it will be replaced with spec_tok_val
     combined = [v if not special_token else spec_tok_val for v, special_token in zip(vec, spec_toks_mask)]
diff --git a/farm/modeling/tokenization.py b/farm/modeling/tokenization.py
index b78de0835..89e051d98 100644
--- a/farm/modeling/tokenization.py
+++ b/farm/modeling/tokenization.py
@@ -99,12 +99,21 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
         # return appropriate tokenizer object
         ret = None
         if tokenizer_class == "AlbertTokenizer":
-            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            if use_fast:
+                logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
+                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            else:
+                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif tokenizer_class == "XLMRobertaTokenizer":
-            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
+                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif "RobertaTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
             if use_fast:
-                raise ValueError('RobertaTokenizerFast is not supportet!')
+                logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
+                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif "DistilBertTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
@@ -118,16 +127,28 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=Fals
             else:
                 ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "XLNetTokenizer":
-            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            if use_fast:
+                logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
+                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
+            else:
+                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
         elif "ElectraTokenizer" in tokenizer_class: # because it also might be a fast tokenizer we use "in"
             if use_fast:
                 ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
             else:
                 ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "EmbeddingTokenizer":
-            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
+                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif tokenizer_class == "CamembertTokenizer":
-            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_fast:
+                logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
+                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
         if ret is None:
             raise Exception("Unable to load tokenizer")
         else:
diff --git a/test/conftest.py b/test/conftest.py
index 74d2b6c2f..8063b717b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -61,18 +61,20 @@ def adaptive_model_qa(use_gpu, num_processes):
     assert len(children) == 0
 
 
-@pytest.fixture()
-def bert_base_squad2():
+@pytest.fixture(params=[True, False])
+def bert_base_squad2(request):
     model = QAInferencer.load(
         "deepset/bert-base-cased-squad2",
         task_type="question_answering",
         batch_size=16,
-        num_processes=0)
+        num_processes=0,
+        use_fast=request.param
+    )
     return model
 
 
-@pytest.fixture()
-def distilbert_squad():
+@pytest.fixture(params=[True, False])
+def distilbert_squad(request):
     set_all_seeds(seed=42)
     device, n_gpu = initialize_device_settings(use_cuda=False)
     batch_size = 2
@@ -81,7 +83,9 @@ def distilbert_squad():
     base_LM_model = "distilbert-base-uncased"
     tokenizer = Tokenizer.load(
-        pretrained_model_name_or_path=base_LM_model, do_lower_case=True
+        pretrained_model_name_or_path=base_LM_model,
+        do_lower_case=True,
+        use_fast=request.param
     )
     label_list = ["start_token", "end_token"]
     processor = SquadProcessor(
diff --git a/test/test_question_answering.py b/test/test_question_answering.py
index 9832453aa..3b640e799 100644
--- a/test/test_question_answering.py
+++ b/test/test_question_answering.py
@@ -9,7 +9,7 @@
 from farm.infer import Inferencer, QAInferencer
 from farm.data_handler.inputs import QAInput, Question
 
-
+@pytest.mark.parametrize("distilbert_squad", [True, False], indirect=True)
 def test_training(distilbert_squad, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -19,6 +19,7 @@ def test_training(distilbert_squad, caplog=None):
     assert type(processor) == SquadProcessor
 
 
+@pytest.mark.parametrize("distilbert_squad", [True, False], indirect=True)
 def test_save_load(distilbert_squad, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -33,6 +34,7 @@ def test_save_load(distilbert_squad, caplog=None):
     assert inferencer is not None
 
 
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def test_inference_dicts(bert_base_squad2):
     qa_format_1 = [
         {
@@ -49,6 +51,7 @@ def test_inference_dicts(bert_base_squad2):
 
 
 @pytest.fixture()
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def span_inference_result(bert_base_squad2, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
@@ -59,6 +62,7 @@ def span_inference_result(bert_base_squad2, caplog=None):
 
 
 @pytest.fixture()
+@pytest.mark.parametrize("bert_base_squad2", [True, False], indirect=True)
 def no_answer_inference_result(bert_base_squad2, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
diff --git a/test/test_tokenization.py b/test/test_tokenization.py
index c8959e8f1..4c9557eee 100644
--- a/test/test_tokenization.py
+++ b/test/test_tokenization.py
@@ -1,7 +1,7 @@
 import logging
 import pytest
 import re
-from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer
+from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, XLNetTokenizer, RobertaTokenizerFast
 from transformers import ElectraTokenizerFast
 
 from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences
@@ -284,5 +284,24 @@ def test_fast_electra_tokenizer(caplog):
     assert type(tokenizer) is ElectraTokenizerFast
 
 
+@pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"])
+def test_detokenization_in_fast_tokenizers(model_name):
+    tokenizer = Tokenizer.load(
+        pretrained_model_name_or_path=model_name,
+        use_fast=True
+    )
+    for text in TEXTS:
+        tokens_with_metadata = tokenize_with_metadata(text, tokenizer)
+        tokens = tokens_with_metadata["tokens"]
+
+        detokenized = " ".join(tokens)
+        detokenized = re.sub(r"(^|\s+)(##)", "", detokenized)
+
+        detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"]
+        detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids]
+
+        assert tokens == detokenized_tokens
+
+
 if __name__ == "__main__":
-    test_all_tokenizer_on_special_cases()
+    test_all_tokenizer_on_special_cases()
\ No newline at end of file

From 1acaff4ea7f6f3c19080ee468c05eabfe1b4229d Mon Sep 17 00:00:00 2001
From: Malte Pietsch
Date: Wed, 2 Sep 2020 10:34:57 +0200
Subject: [PATCH 30/30] update error messages, comments and truncation arg in tokenizer

---
 farm/data_handler/input_features.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py
index 5b956d739..a6ccf4429 100644
--- a/farm/data_handler/input_features.py
+++ b/farm/data_handler/input_features.py
@@ -39,7 +39,8 @@ def sample_to_features_text(
 
     if tokenizer.is_fast:
         text = sample.clear_text["text"]
-        # Here, we tokenize the sample for the second time...
+        # Here, we tokenize the sample for the second time to get all relevant ids
+        # This should change once we get rid of FARM's tokenize_with_metadata()
         inputs = tokenizer(text,
                            return_token_type_ids=True,
                            max_length=max_seq_len,
                            return_special_tokens_mask=True)
 
         if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
     else:
         # TODO It might be cleaner to adjust the data structure in sample.tokenized
-        # Verify if this current quickfix really works for pairs
         tokens_a = sample.tokenized["tokens"]
         tokens_b = sample.tokenized.get("tokens_b", None)
@@ -154,11 +154,10 @@ def samples_to_features_ner(
 
     tokens = sample.tokenized["tokens"]
 
-    # is_pretokenized seems to be broken upstream for slow tokenizers, while fast tokenizers rely on it
-    # temp fix until fixed upstream (see https://github.com/huggingface/transformers/issues/6046)
     if tokenizer.is_fast:
         text = sample.clear_text["text"]
-        # Here, we tokenize the sample for the second time.
+        # Here, we tokenize the sample for the second time to get all relevant ids
+        # This should change once we get rid of FARM's tokenize_with_metadata()
         inputs = tokenizer(text,
                            return_token_type_ids=True,
                            max_length=max_seq_len,
@@ -167,8 +166,8 @@
         if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             inputs = tokenizer.encode_plus(text=tokens,
                                            text_pair=None,
@@ -309,13 +308,14 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T
             (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             # encode string tokens to input_ids and add special tokens
             inputs = tokenizer.encode_plus(text=tokens_a,
                                            text_pair=tokens_b,
                                            add_special_tokens=True,
+                                           truncation=False,
                                            truncation_strategy='do_not_truncate',
                                            # We've already truncated our tokens before
                                            return_special_tokens_mask=True,
@@ -439,13 +439,13 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks
             (len(sample.tokenized["question_tokens"]) + len(sample.tokenized["passage_tokens"])):
             logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                          f"{len(encoded['input_ids']) - encoded['special_tokens_mask'].count(1)} tokens, which differs "
-                         f"from number of tokens produced in tokenize_with_metadata.py")
-            logger.error("Further processing is likely to be wrong.")
+                         f"from the number of tokens produced in tokenize_with_metadata().\n"
+                         f"Further processing is likely to be wrong.")
         else:
             encoded = tokenizer.encode_plus(text=sample.tokenized["question_tokens"],
                                             text_pair=sample.tokenized["passage_tokens"],
                                             add_special_tokens=True,
+                                            truncation=False,
                                             truncation_strategy='do_not_truncate',
                                             return_token_type_ids=True,
                                             return_tensors=None)
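
The guard that these patches converge on compares the fast tokenizer's output against the token count that tokenize_with_metadata() produced earlier: the number of input ids minus the special tokens flagged in the mask must equal the number of plain tokens. A minimal sketch of that check in isolation; bert-base-cased and the sample sentence are illustrative and appear nowhere in the patches:

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    text = "Berlin is the capital of Germany."
    # Stand-in for sample.tokenized["tokens"] as produced by tokenize_with_metadata()
    expected_tokens = tokenizer.tokenize(text)

    inputs = tokenizer(text,
                       return_token_type_ids=True,
                       return_special_tokens_mask=True)

    # Subtract the special tokens ([CLS], [SEP]) flagged in the mask to count real tokens
    n_real_tokens = len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)
    assert n_real_tokens == len(expected_tokens)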
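The detokenization step in samples_to_features_bert_lm() and sample_to_features_qa() exists because the fast tokenizers cannot consume pre-tokenized input at this point, so the token list is glued back into a string and re-encoded. A sketch of the round trip for a WordPiece vocabulary, under the assumption of a BERT-style "##" continuation marker; the model name and sentence are illustrative:

    import re
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    tokens = tokenizer.tokenize("Subword tokenizers split rare words.")
    # e.g. ['Sub', '##word', 'token', '##izer', '##s', 'split', 'rare', 'words', '.']

    # Join on whitespace, then drop each "##" continuation marker together with the
    # space in front of it -- the same expression used in input_features.py
    detokenized = re.sub(r"(^|\s)(##)", "", " ".join(tokens))

    # Re-encoding the rebuilt string should reproduce the original tokens, which is
    # what test_detokenization_in_fast_tokenizers asserts via per-id decode()
    assert tokenizer.tokenize(detokenized) == tokens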
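combine_vecs() now builds the special-tokens mask differently per tokenizer flavour: the slow tokenizers accept the two raw sequences, while the fast ones want the already-combined sequence plus already_has_special_tokens=True. A sketch of the fast branch with dummy label values; 7 and 9 are arbitrary stand-ins and must not collide with the model's special token ids:

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    question_vec = [7, 7, 7]  # dummy per-token values for the question
    passage_vec = [9, 9]      # dummy per-token values for the passage

    # Insert slots where the special tokens will sit: [CLS] q q q [SEP] p p [SEP]
    vec = tokenizer.build_inputs_with_special_tokens(token_ids_0=question_vec,
                                                     token_ids_1=passage_vec)

    # Fast-tokenizer variant of the mask lookup used in combine_vecs()
    spec_toks_mask = tokenizer.get_special_tokens_mask(token_ids_0=vec,
                                                       already_has_special_tokens=True)

    # Replace the special-token slots with the sentinel value -1
    combined = [v if not special else -1 for v, special in zip(vec, spec_toks_mask)]
    # -> [-1, 7, 7, 7, -1, 9, 9, -1]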
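After this series, Tokenizer.load() returns a Rust-backed tokenizer for the classes that have one wired up (BERT, DistilBERT, Electra) and, for the rest, logs an error and falls back to the slow implementation instead of raising as the earlier RobertaTokenizerFast check did. A usage sketch, assuming FARM's name-based inference maps the two model names below to BertTokenizer and RobertaTokenizer respectively:

    from farm.modeling.tokenization import Tokenizer

    # BERT has a supported fast implementation
    fast_tokenizer = Tokenizer.load("bert-base-cased", use_fast=True)
    assert fast_tokenizer.is_fast

    # RoBERTa does not; load() logs an error and returns the slow tokenizer
    slow_fallback = Tokenizer.load("roberta-base", use_fast=True)
    assert not slow_fallback.is_fast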
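The conftest.py fixtures declare params=[True, False], so every test that requests them runs once with a fast tokenizer and once with a slow one; request.param carries the current value into the fixture body. The core of the pattern in isolation, with illustrative fixture and test names:

    import pytest

    @pytest.fixture(params=[True, False])
    def use_fast(request):
        # request.param delivers one entry of params per test invocation
        return request.param

    def test_runs_per_tokenizer_flavour(use_fast):
        assert isinstance(use_fast, bool)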