Simplify language_modeling.py and tokenization.py (#2703)
* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
ZanSara and vblagoje committed Jul 22, 2022
1 parent 8ee2b6b · commit 4e45062
Showing 28 changed files with 1,533 additions and 2,461 deletions.
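The change that recurs across the diffs below is the removal of the `Tokenizer` factory class in favor of a plain `get_tokenizer` function in `haystack.modeling.model.tokenization`. A minimal before/after sketch of a call site, with the model name chosen only for illustration and the keyword arguments taken from the processor.py hunks:

```python
# Before this commit: class-based factory (now removed)
# from haystack.modeling.model.tokenization import Tokenizer
# tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

# After this commit: plain function, same keyword arguments
from haystack.modeling.model.tokenization import get_tokenizer

tokenizer = get_tokenizer("bert-base-uncased", do_lower_case=True)
```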
13 changes: 5 additions & 8 deletions docs/_src/api/api/retriever.md
@@ -519,7 +519,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
#### DensePassageRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
@@ -561,8 +561,6 @@ The title is expected to be present in doc.meta["name"] and can be supplied in t
before writing them to the DocumentStore like this:
{"text": "my text", "meta": {"name": "my title"}}.
- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
@@ -871,7 +869,7 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", infer_tokenizer_classes: bool = False)
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
```

Load DensePassageRetriever from the specified directory.
@@ -895,7 +893,7 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using
#### TableTextRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True)
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
@@ -923,8 +921,6 @@ This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
- `use_fast_tokenizers`: Whether to use fast Rust tokenizers
- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name.
If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
@@ -942,6 +938,7 @@ Additional information can be found here https://huggingface.co/transformers/mai
- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
- `use_fast`: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
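For illustration, a hypothetical instantiation exercising the parameters documented above; the embedding dimension is an assumption and must match the output size of the chosen encoder models:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TableTextRetriever

# embedding_dim is assumed here; it has to match the encoders' output size
document_store = InMemoryDocumentStore(embedding_dim=512)

retriever = TableTextRetriever(
    document_store=document_store,
    embed_meta_fields=["name", "section_title", "caption"],
    use_fast=True,     # new flag: use the fast DPR tokenizers (default)
    scale_score=True,  # scale similarity scores to the [0, 1] range
)
```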

<a id="dense.TableTextRetriever.retrieve_batch"></a>

@@ -1153,7 +1150,7 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", infer_tokenizer_classes: bool = False)
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
```

Load TableTextRetriever from the specified directory.
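Taken together, these hunks drop `infer_tokenizer_classes` from both `DensePassageRetriever.__init__` and `DensePassageRetriever.load`; the tokenizer class is now always derived from the model config or name. A hedged sketch of the updated call sites, using the default model names from the signature above:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever

document_store = InMemoryDocumentStore(embedding_dim=768)  # DPR encoders produce 768-dim vectors

# No infer_tokenizer_classes argument anymore; the rest of the signature is unchanged
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_fast_tokenizers=True,
)

# Reloading a saved retriever follows the same simplified signature
# ("saved_retriever" is a hypothetical directory produced by retriever.save())
# retriever = DensePassageRetriever.load(load_dir="saved_retriever", document_store=document_store)
```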
9 changes: 6 additions & 3 deletions haystack/document_stores/memory.py
@@ -10,7 +10,7 @@
from tqdm import tqdm

from haystack.schema import Document, Label
from haystack.errors import DuplicateDocumentError
from haystack.errors import DuplicateDocumentError, DocumentStoreError
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores.base import get_batches_from_generator
from haystack.modeling.utils import initialize_device_settings
@@ -448,8 +448,11 @@ def update_embeddings(
) as progress_bar:
for document_batch in batched_documents:
embeddings = retriever.embed_documents(document_batch) # type: ignore
assert len(document_batch) == len(embeddings)

if not len(document_batch) == len(embeddings):
raise DocumentStoreError(
"The number of embeddings does not match the number of documents in the batch "
f"({len(embeddings)} != {len(document_batch)})"
)
if embeddings[0].shape[0] != self.embedding_dim:
raise RuntimeError(
f"Embedding dim. of model ({embeddings[0].shape[0]})"
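The `assert` in `update_embeddings` becomes a typed `DocumentStoreError`, so a retriever that returns the wrong number of embeddings surfaces as a catchable exception (and, unlike an assert, the check is not stripped when Python runs with `-O`). A usage sketch, reusing the document store and retriever from the examples above:

```python
from haystack.errors import DocumentStoreError

try:
    document_store.update_embeddings(retriever=retriever)
except DocumentStoreError as err:
    # e.g. "The number of embeddings does not match the number of documents in the batch (8 != 16)"
    print(f"Embedding update failed: {err}")
```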
7 changes: 7 additions & 0 deletions haystack/errors.py
@@ -35,6 +35,13 @@ def __repr__(self):
return str(self)


class ModelingError(HaystackError):
"""Exception for issues raised by the modeling module"""

def __init__(self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/"):
super().__init__(message=message, docs_link=docs_link)


class PipelineError(HaystackError):
"""Exception for issues raised within a pipeline"""

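`ModelingError` gives the modeling module its own exception type under the common `HaystackError` base, carrying the default docs link shown above. A hypothetical sketch of how it might be raised and caught; the vocabulary check itself is invented for illustration:

```python
from haystack.errors import ModelingError

def check_vocab_sizes(model_vocab_size: int, tokenizer_vocab_size: int) -> None:
    # Invented check, only to show the new exception in use
    if model_vocab_size != tokenizer_vocab_size:
        raise ModelingError(
            f"Vocabulary size mismatch: model has {model_vocab_size}, tokenizer has {tokenizer_vocab_size}"
        )

try:
    check_vocab_sizes(30522, 30000)
except ModelingError as err:
    print(err)
```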
15 changes: 5 additions & 10 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -2116,11 +2116,6 @@
"default": true,
"type": "boolean"
},
"infer_tokenizer_classes": {
"title": "Infer Tokenizer Classes",
"default": false,
"type": "boolean"
},
"similarity_function": {
"title": "Similarity Function",
"default": "dot_product",
@@ -4338,11 +4333,6 @@
"default": true,
"type": "boolean"
},
"infer_tokenizer_classes": {
"title": "Infer Tokenizer Classes",
"default": false,
"type": "boolean"
},
"similarity_function": {
"title": "Similarity Function",
"default": "dot_product",
@@ -4387,6 +4377,11 @@
"title": "Scale Score",
"default": true,
"type": "boolean"
},
"use_fast": {
"title": "Use Fast",
"default": true,
"type": "boolean"
}
},
"required": [
11 changes: 10 additions & 1 deletion haystack/modeling/data_handler/data_silo.py
@@ -812,7 +812,16 @@ def _run_teacher(self, batch: dict) -> List[torch.Tensor]:
"""
Run the teacher model on the given batch.
"""
return self.teacher.inferencer.model(**batch)
params = {
"input_ids": batch["input_ids"],
"segment_ids": batch["segment_ids"],
"padding_mask": batch["padding_mask"],
}
if "output_hidden_states" in batch.keys():
params["output_hidden_states"] = batch["output_hidden_states"]
if "output_attentions" in batch.keys():
params["output_attentions"] = batch["output_attentions"]
return self.teacher.inferencer.model(**params)

def _pass_batches(
self,
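The rewritten `_run_teacher` forwards only a whitelist of inputs instead of unpacking the whole batch, presumably because the batch can carry extra entries (such as labels) that the teacher's forward signature does not accept. A small illustration of the same whitelist idea; the helper itself is not part of the commit and the key names follow the hunk above:

```python
from typing import Any, Dict

TEACHER_INPUT_KEYS = ("input_ids", "segment_ids", "padding_mask", "output_hidden_states", "output_attentions")

def select_teacher_inputs(batch: Dict[str, Any]) -> Dict[str, Any]:
    # Keep only the keys the teacher's forward() understands; anything else in the batch is dropped
    return {key: batch[key] for key in TEACHER_INPUT_KEYS if key in batch}
```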
64 changes: 42 additions & 22 deletions haystack/modeling/data_handler/processor.py
@@ -1,4 +1,4 @@
from typing import Optional, Dict, List, Union, Any, Iterable
from typing import Optional, Dict, List, Union, Any, Iterable, Type

import os
import json
@@ -16,9 +16,11 @@
import requests
from tqdm import tqdm
from torch.utils.data import TensorDataset
import transformers
from transformers import PreTrainedTokenizer

from haystack.modeling.model.tokenization import (
Tokenizer,
get_tokenizer,
tokenize_batch_question_answering,
tokenize_with_metadata,
truncate_sequences,
@@ -176,11 +178,9 @@ def load_from_dir(cls, load_dir: str):
"Loading tokenizer from deprecated config. "
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
)
tokenizer = Tokenizer.load(
load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"]
)
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"])
else:
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"])

# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]
@@ -216,7 +216,7 @@ def convert_from_transformers(
**kwargs,
):
tokenizer_args = tokenizer_args or {}
tokenizer = Tokenizer.load(
tokenizer = get_tokenizer(
tokenizer_name_or_path,
tokenizer_class=tokenizer_class,
use_fast=use_fast,
@@ -308,7 +308,9 @@ def file_to_dicts(self, file: str) -> List[dict]:
raise NotImplementedError()

@abstractmethod
def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
raise NotImplementedError()

@abstractmethod
@@ -445,7 +447,9 @@ def __init__(
"using the default task or add a custom task later via processor.add_task()"
)

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for Question Answering.
For this we have an internal representation called "baskets".
@@ -492,7 +496,7 @@ def file_to_dicts(self, file: str) -> List[dict]:
return dicts

# TODO use Input Objects instead of this function, remove Natural Questions (NQ) related code
def convert_qa_input_dict(self, infer_dict: dict):
def convert_qa_input_dict(self, infer_dict: dict) -> Dict[str, Any]:
"""Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or
["text", "questions"] (api format). This function converts the latter into the former. It also converts the
is_impossible field to answer_type so that NQ and SQuAD dicts have the same format.
@@ -929,9 +933,15 @@ def load_from_dir(cls, load_dir: str):
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
# init tokenizers
query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
query_tokenizer = query_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="query"
)
passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"])
passage_tokenizer = passage_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="passage"
)

# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
@@ -978,7 +988,9 @@ def save(self, save_dir: Union[str, Path]):
with open(output_config_file, "w") as file:
json.dump(config, file)

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
For conversion we have an internal representation called "baskets".
@@ -1334,9 +1346,9 @@ def load_from_dir(cls, load_dir: str):
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
table_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table")
query_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query")
passage_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage")
table_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table")

# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
@@ -1488,7 +1500,9 @@ def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None
standard_dicts.append(sample)
return standard_dicts

def dataset_from_dicts(self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity.
For conversion we have an internal representation called "baskets".
@@ -1836,7 +1850,9 @@ def __init__(
def file_to_dicts(self, file: str) -> List[Dict]:
raise NotImplementedError

def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
self.baskets = []
# Tokenize in batches
texts = [x["text"] for x in dicts]
@@ -1958,7 +1974,7 @@ def load_from_dir(cls, load_dir: str):
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"])
tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"])
# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]

@@ -1979,7 +1995,9 @@ def convert_labels(self, dictionary: Dict):
ret: Dict = {}
return ret

def dataset_from_dicts(self, dicts: List[Dict], indices=None, return_baskets: bool = False, debug: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Function to convert input dictionaries containing text into a torch dataset.
For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method.
@@ -2067,7 +2085,9 @@ def file_to_dicts(self, file: str) -> List[dict]:
dicts.append({"text": line})
return dicts

def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False):
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
if return_baskets:
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
texts = [dict_["text"] for dict_ in dicts]
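The DPR processor now resolves tokenizer classes by name directly from `transformers` instead of going through the removed `Tokenizer` factory. A minimal sketch of that `getattr` pattern, assuming the processor config stores a class name such as `DPRQuestionEncoderTokenizerFast`; the model name is only an example:

```python
from typing import Type

import transformers
from transformers import PreTrainedTokenizer

config = {"query_tokenizer": "DPRQuestionEncoderTokenizerFast"}  # assumed config content

query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
query_tokenizer = query_tokenizer_class.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"  # any model name or saved directory
)
```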
