Simplify language_modeling.py and tokenization.py (#2703)
* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
ZanSara and vblagoje committed Jul 22, 2022
1 parent 8ee2b6b · commit 4e45062
Showing 28 changed files with 1,533 additions and 2,461 deletions.
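The change that recurs across the diffs below is the removal of the `Tokenizer` factory class in favor of a plain `get_tokenizer` function in `haystack.modeling.model.tokenization`. A minimal before/after sketch of a call site, with the model name chosen only for illustration and the keyword arguments taken from the processor.py hunks:

```python
# Before this commit: class-based factory (now removed)
# from haystack.modeling.model.tokenization import Tokenizer
# tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

# After this commit: plain function, same keyword arguments
from haystack.modeling.model.tokenization import get_tokenizer

tokenizer = get_tokenizer("bert-base-uncased", do_lower_case=True)
```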
13 changes: 5 additions & 8 deletions docs/_src/api/api/retriever.md
@@ -519,7 +519,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
#### DensePassageRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
@@ -561,8 +561,6 @@ The title is expected to be present in doc.meta["name"] and can be supplied in t
before writing them to the DocumentStore like this:
{"text": "my text", "meta": {"name": "my title"}}.
- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
@@ -871,7 +869,7 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", infer_tokenizer_classes: bool = False)
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
```

Load DensePassageRetriever from the specified directory.
@@ -895,7 +893,7 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using
#### TableTextRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True)
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
@@ -923,8 +921,6 @@ This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
@@ -942,6 +938,7 @@ Additional information can be found here https://huggingface.co/transformers/mai
- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
- `use_fast`: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
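For illustration, a hypothetical instantiation exercising the parameters documented above; the embedding dimension is an assumption and must match the output size of the chosen encoder models:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TableTextRetriever

# embedding_dim is assumed here; it has to match the encoders' output size
document_store = InMemoryDocumentStore(embedding_dim=512)

retriever = TableTextRetriever(
    document_store=document_store,
    embed_meta_fields=["name", "section_title", "caption"],
    use_fast=True,     # new flag: use the fast DPR tokenizers (default)
    scale_score=True,  # scale similarity scores to the [0, 1] range
)
```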

<a id="dense.TableTextRetriever.retrieve_batch"></a>

@@ -1153,7 +1150,7 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", infer_tokenizer_classes: bool = False)
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
```

Load TableTextRetriever from the specified directory.
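Taken together, these hunks drop `infer_tokenizer_classes` from both `DensePassageRetriever.__init__` and `DensePassageRetriever.load`; the tokenizer class is now always derived from the model config or name. A hedged sketch of the updated call sites, using the default model names from the signature above:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever

document_store = InMemoryDocumentStore(embedding_dim=768)  # DPR encoders produce 768-dim vectors

# No infer_tokenizer_classes argument anymore; the rest of the signature is unchanged
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_fast_tokenizers=True,
)

# Reloading a saved retriever follows the same simplified signature
# ("saved_retriever" is a hypothetical directory produced by retriever.save())
# retriever = DensePassageRetriever.load(load_dir="saved_retriever", document_store=document_store)
```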
9 changes: 6 additions & 3 deletions haystack/document_stores/memory.py
@@ -10,7 +10,7 @@
from tqdm import tqdm

from haystack.schema import Document, Label
from haystack.errors import DuplicateDocumentError
from haystack.errors import DuplicateDocumentError, DocumentStoreError
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores.base import get_batches_from_generator
from haystack.modeling.utils import initialize_device_settings
@@ -448,8 +448,11 @@ def update_embeddings(
) as progress_bar:
for document_batch in batched_documents:
embeddings = retriever.embed_documents(document_batch) # type: ignore
assert len(document_batch) == len(embeddings)

if not len(document_batch) == len(embeddings):
raise DocumentStoreError(
"The number of embeddings does not match the number of documents in the batch "
f"({len(embeddings)} != {len(document_batch)})"
)
if embeddings[0].shape[0] != self.embedding_dim:
raise RuntimeError(
f"Embedding dim. of model ({embeddings[0].shape[0]})"
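The `assert` in `update_embeddings` becomes a typed `DocumentStoreError`, so a retriever that returns the wrong number of embeddings surfaces as a catchable exception (and, unlike an assert, the check is not stripped when Python runs with `-O`). A usage sketch, reusing the document store and retriever from the examples above:

```python
from haystack.errors import DocumentStoreError

try:
    document_store.update_embeddings(retriever=retriever)
except DocumentStoreError as err:
    # e.g. "The number of embeddings does not match the number of documents in the batch (8 != 16)"
    print(f"Embedding update failed: {err}")
```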
7 changes: 7 additions & 0 deletions haystack/errors.py
@@ -35,6 +35,13 @@ def __repr__(self):
return str(self)


class ModelingError(HaystackError):
"""Exception for issues raised by the modeling module"""

def __init__(self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/"):
super().__init__(message=message, docs_link=docs_link)


class PipelineError(HaystackError):
"""Exception for issues raised within a pipeline"""

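`ModelingError` gives the modeling module its own exception type under the common `HaystackError` base, carrying the default docs link shown above. A hypothetical sketch of how it might be raised and caught; the vocabulary check itself is invented for illustration:

```python
from haystack.errors import ModelingError

def check_vocab_sizes(model_vocab_size: int, tokenizer_vocab_size: int) -> None:
    # Invented check, only to show the new exception in use
    if model_vocab_size != tokenizer_vocab_size:
        raise ModelingError(
            f"Vocabulary size mismatch: model has {model_vocab_size}, tokenizer has {tokenizer_vocab_size}"
        )

try:
    check_vocab_sizes(30522, 30000)
except ModelingError as err:
    print(err)
```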
15 changes: 5 additions & 10 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -2116,11 +2116,6 @@
"default": true,
"type": "boolean"
},
"infer_tokenizer_classes": {
"title": "Infer Tokenizer Classes",
"default": false,
"type": "boolean"
},
"similarity_function": {
"title": "Similarity Function",
"default": "dot_product",
@@ -4338,11 +4333,6 @@
"default": true,
"type": "boolean"
},
"infer_tokenizer_classes": {
"title": "Infer Tokenizer Classes",
"default": false,
"type": "boolean"
},
"similarity_function": {
"title": "Similarity Function",
"default": "dot_product",
@@ -4387,6 +4377,11 @@
"title": "Scale Score",
"default": true,
"type": "boolean"
},
"use_fast": {
"title": "Use Fast",
"default": true,
"type": "boolean"
}
},
"required": [
11 changes: 10 additions & 1 deletion haystack/modeling/data_handler/data_silo.py
@@ -812,7 +812,16 @@ def _run_teacher(self, batch: dict) -> List[torch.Tensor]:
"""
Run the teacher model on the given batch.
"""
return self.teacher.inferencer.model(**batch)
params = {
"input_ids": batch["input_ids"],
"segment_ids": batch["segment_ids"],
"padding_mask": batch["padding_mask"],
}
if "output_hidden_states" in batch.keys():
params["output_hidden_states"] = batch["output_hidden_states"]
if "output_attentions" in batch.keys():
params["output_attentions"] = batch["output_attentions"]
return self.teacher.inferencer.model(**params)

def _pass_batches(
self,
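The rewritten `_run_teacher` forwards only a whitelist of inputs instead of unpacking the whole batch, presumably because the batch can carry extra entries (such as labels) that the teacher's forward signature does not accept. A small illustration of the same whitelist idea; the helper itself is not part of the commit and the key names follow the hunk above:

```python
from typing import Any, Dict

TEACHER_INPUT_KEYS = ("input_ids", "segment_ids", "padding_mask", "output_hidden_states", "output_attentions")

def select_teacher_inputs(batch: Dict[str, Any]) -> Dict[str, Any]:
    # Keep only the keys the teacher's forward() understands; anything else in the batch is dropped
    return {key: batch[key] for key in TEACHER_INPUT_KEYS if key in batch}
```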
64 changes: 42 additions & 22 deletions haystack/modeling/data_handler/processor.py
@@ -1,4 +1,4 @@
from typing import Optional, Dict, List, Union, Any, Iterable
from typing import Optional, Dict, List, Union, Any, Iterable, Type

import os
import json
@@ -16,9 +16,11 @@
import requests
from tqdm import tqdm
from torch.utils.data import TensorDataset
import transformers
from transformers import PreTrainedTokenizer

from haystack.modeling.model.tokenization import (
Tokenizer,
get_tokenizer,
tokenize_batch_question_answering,
tokenize_with_metadata,
truncate_sequences,
@@ -176,11 +178,9 @@ def load_from_dir(cls, load_dir: str):
"Loading tokenizer from deprecated config. "
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
)
tokenizer = Tokenizer.load(
load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"]
)
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
else:
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"])

# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]
@@ -216,7 +216,7 @@ def convert_from_transformers(
**kwargs,
):
tokenizer_args = tokenizer_args or {}
tokenizer = Tokenizer.load(
tokenizer = get_tokenizer(
tokenizer_name_or_path,
tokenizer_class=tokenizer_class,
use_fast=use_fast,
@@ -308,7 +308,9 @@ def file_to_dicts(self, file: str) -> List[dict]:
raise NotImplementedError()

@abstractmethod
def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
raise NotImplementedError()

@abstractmethod
@@ -445,7 +447,9 @@ def __init__(
"using the default task or add a custom task later via processor.add_task()"
)

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for Question Answering.
For this we have an internal representation called "baskets".
@@ -492,7 +496,7 @@ def file_to_dicts(self, file: str) -> List[dict]:
return dicts

# TODO use Input Objects instead of this function, remove Natural Questions (NQ) related code
def convert_qa_input_dict(self, infer_dict: dict):
def convert_qa_input_dict(self, infer_dict: dict) -> Dict[str, Any]:
"""Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or
["text", "questions"] (api format). This function converts the latter into the former. It also converts the
is_impossible field to answer_type so that NQ and SQuAD dicts have the same format.
@@ -929,9 +933,15 @@ def load_from_dir(cls, load_dir: str):
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
# init tokenizers
query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
query_tokenizer = query_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="query"
)
passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"])
passage_tokenizer = passage_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="passage"
)

# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
@@ -978,7 +988,9 @@ def save(self, save_dir: Union[str, Path]):
with open(output_config_file, "w") as file:
json.dump(config, file)

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
For conversion we have an internal representation called "baskets".
@@ -1334,9 +1346,9 @@ def load_from_dir(cls, load_dir: str):
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
table_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table")
query_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
table_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table")

# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
@@ -1488,7 +1500,9 @@ def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None
standard_dicts.append(sample)
return standard_dicts

def dataset_from_dicts(self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity.
For conversion we have an internal representation called "baskets".
@@ -1836,7 +1850,9 @@ def __init__(
def file_to_dicts(self, file: str) -> List[Dict]:
raise NotImplementedError

def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
self.baskets = []
# Tokenize in batches
texts = [x["text"] for x in dicts]
@@ -1958,7 +1974,7 @@ def load_from_dir(cls, load_dir: str):
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"])
# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]

@@ -1979,7 +1995,9 @@ def convert_labels(self, dictionary: Dict):
ret: Dict = {}
return ret

def dataset_from_dicts(self, dicts: List[Dict], indices=None, return_baskets: bool = False, debug: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Function to convert input dictionaries containing text into a torch dataset.
For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method.
@@ -2067,7 +2085,9 @@ def file_to_dicts(self, file: str) -> List[dict]:
dicts.append({"text": line})
return dicts

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
if return_baskets:
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
texts = [dict_["text"] for dict_ in dicts]
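The DPR processor now resolves tokenizer classes by name directly from `transformers` instead of going through the removed `Tokenizer` factory. A minimal sketch of that `getattr` pattern, assuming the processor config stores a class name such as `DPRQuestionEncoderTokenizerFast`; the model name is only an example:

```python
from typing import Type

import transformers
from transformers import PreTrainedTokenizer

config = {"query_tokenizer": "DPRQuestionEncoderTokenizerFast"}  # assumed config content

query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
query_tokenizer = query_tokenizer_class.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"  # any model name or saved directory
)
```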
