Docs: Update Reader's doc strings (#6312)
* Update doc strings

* Add warm_up docs, fix margins

---------

Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
agnieszka-m and vblagoje committed Nov 24, 2023
1 parent b850b36 commit 27cf8ee
Showing 1 changed file with 57 additions and 23 deletions.
80 changes: 57 additions & 23 deletions haystack/preview/components/readers/extractive.py
@@ -20,8 +20,10 @@
 @component
 class ExtractiveReader:
     """
-    A component for performing extractive QA.
-    Every possible answer span is assigned a confidence score independent of other answer spans. This fixes a common issue of other implementations which make comparisons across documents harder by normalising each document's answers independently.
+    A component that locates and extracts answers to a given query from Documents. It's used for performing
+    extractive QA. The Reader assigns a probability score to every possible answer span independently of other
+    answer spans. This fixes a common issue of other implementations which make comparisons across documents
+    harder by normalizing each document's answers independently.
 
     Example usage:
     ```python
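(The usage example inside the docstring is collapsed in this diff view. For orientation, here is a minimal, hedged sketch of how the component is used; the import paths, the `Document(content=...)` constructor, and the `answers` output key are assumptions based on this file's location and on the fields visible elsewhere in the diff.)

```python
from haystack.preview import Document
from haystack.preview.components.readers import ExtractiveReader

docs = [
    Document(content="Python is a popular programming language."),
    Document(content="Paris is the capital of France."),
]

reader = ExtractiveReader()  # defaults to deepset/roberta-base-squad2-distilled
reader.warm_up()             # loads the model and tokenizer (see warm_up below)
result = reader.run(query="What is Python?", documents=docs)
for answer in result["answers"]:  # "answers" key assumed
    print(answer.data, answer.probability)
```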
@@ -51,29 +53,30 @@ def __init__(
     ) -> None:
         """
         Creates an ExtractiveReader
-        :param model_name_or_path: A HuggingFace transformers question answering model.
-            Can either be a path to a folder containing the model files or an identifier for the HF hub
+        :param model_name_or_path: A Hugging Face transformers question answering model.
+            Can either be a path to a folder containing the model files or an identifier for the Hugging Face hub.
             Default: `'deepset/roberta-base-squad2-distilled'`
-        :param device: Pytorch device string. Uses GPU by default if available
+        :param device: PyTorch device string. Uses GPU by default if available.
         :param token: The API token used to download private models from Hugging Face.
             If this parameter is set to `True`, then the token generated when running
-            `transformers-cli login` (stored in ~/.huggingface) will be used.
+            `transformers-cli login` (stored in ~/.huggingface) is used.
         :param top_k: Number of answers to return per query.
             It is required even if confidence_threshold is set. Defaults to 20.
-            An additional answer is returned if no_answer is set to True (default).
-        :param confidence_threshold: Answers with a confidence score below this value will not be returned
+            An additional answer with no text is returned if no_answer is set to True (default).
+        :param confidence_threshold: Returns only answers with a probability score above this threshold.
         :param max_seq_length: Maximum number of tokens.
-            If exceeded by a sequence, the sequence will be split.
+            If a sequence exceeds it, the sequence is split.
             Default: 384
-        :param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length
+        :param stride: Number of tokens that overlap when a sequence is split because it exceeds max_seq_length.
             Default: 128
-        :param max_batch_size: Maximum number of samples that are fed through the model at the same time
+        :param max_batch_size: Maximum number of samples that are fed through the model at the same time.
         :param answers_per_seq: Number of answer candidates to consider per sequence.
-            This is relevant when a document has been split into multiple sequence due to max_seq_length.
-        :param no_answer: Whether to return no answer scores
-        :param calibration_factor: Factor used for calibrating confidence scores
+            This is relevant when a Document was split into multiple sequences because of max_seq_length.
+        :param no_answer: Whether to return no answer scores.
+        :param calibration_factor: Factor used for calibrating probability scores.
         :param model_kwargs: Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained`
-            when loading the model specified in `model_name_or_path`.
+            when loading the model specified in `model_name_or_path`. For details on what kwargs you can pass,
+            see the model's documentation.
         """
         torch_and_transformers_import.check()
         self.model_name_or_path = str(model_name_or_path)
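Taken together, the constructor parameters above might be exercised like this. A sketch with illustrative values only, using just the parameter names documented in this hunk; the defaults shown as `None` are assumptions:

```python
reader = ExtractiveReader(
    model_name_or_path="deepset/roberta-base-squad2-distilled",  # the documented default
    device=None,                # PyTorch device string; GPU is picked by default if available
    top_k=20,                   # answers returned per query (plus one no-text answer if no_answer=True)
    confidence_threshold=None,  # set e.g. 0.3 to drop low-probability answers
    max_seq_length=384,         # sequences longer than this are split
    stride=128,                 # token overlap between consecutive splits
    max_batch_size=None,        # cap on samples fed through the model at once
    answers_per_seq=None,       # answer candidates considered per sequence
    no_answer=True,             # also compute a no-answer probability
    calibration_factor=0.1,     # scales logits before converting them to probabilities
)
```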
@@ -117,6 +120,9 @@ def to_dict(self) -> Dict[str, Any]:
         )
 
     def warm_up(self):
+        """
+        Loads the model and tokenizer.
+        """
         if self.model is None:
             if torch.cuda.is_available():
                 self.device = self.device or "cuda:0"
@@ -138,7 +144,7 @@ def _flatten_documents(
         self, queries: List[str], documents: List[List[Document]]
     ) -> Tuple[List[str], List[Document], List[int]]:
         """
-        Flattens queries and documents so all query-document pairs are arranged along one batch axis.
+        Flattens queries and Documents so all query-document pairs are arranged along one batch axis.
         """
         flattened_queries = [query for documents_, query in zip(documents, queries) for _ in documents_]
         flattened_documents = [document for documents_ in documents for document in documents_]
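A worked example of the flattening, using plain strings in place of `Document` objects (the `query_ids` comprehension is not visible in this hunk and is an assumption consistent with the documented return value):

```python
queries = ["who?", "when?"]
documents = [["doc-a", "doc-b"], ["doc-c"]]  # one list of Documents per query

# The same comprehensions as above, applied to the toy inputs:
flattened_queries = [query for documents_, query in zip(documents, queries) for _ in documents_]
flattened_documents = [document for documents_ in documents for document in documents_]
query_ids = [i for i, documents_ in enumerate(documents) for _ in documents_]  # assumed

print(flattened_queries)    # ['who?', 'who?', 'when?']
print(flattened_documents)  # ['doc-a', 'doc-b', 'doc-c']
print(query_ids)            # [0, 0, 1]
```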
@@ -149,14 +155,15 @@ def _preprocess(
         self, queries: List[str], documents: List[Document], max_seq_length: int, query_ids: List[int], stride: int
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[Encoding], List[int], List[int]]:
         """
-        Split and tokenise documents and preserve structures by returning mappings to query and document ids.
+        Splits and tokenizes Documents and preserves structures by returning mappings to query and Document IDs.
         """
         texts = []
         document_ids = []
         for i, doc in enumerate(documents):
             if doc.content is None:
                 warnings.warn(
-                    f"Document with id {doc.id} was passed to ExtractiveReader, but does not contain any text. It will be ignored."
+                    f"Document with id {doc.id} was passed to ExtractiveReader. The Document doesn't "
+                    f"contain any text and it will be ignored."
                 )
                 continue
             texts.append(doc.content)
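The splitting controlled by `max_seq_length` and `stride` is a sliding window over tokens. The real work is done by the tokenizer (Hugging Face tokenizers support this via `return_overflowing_tokens=True` and `stride=...`), but the idea can be sketched in a few lines; this simplified version is not the implementation:

```python
def sliding_windows(token_ids, max_seq_length=384, stride=128):
    """Yield windows of up to max_seq_length tokens; consecutive windows overlap by stride tokens."""
    step = max_seq_length - stride
    for start in range(0, max(len(token_ids) - stride, 1), step):
        yield token_ids[start:start + max_seq_length]

windows = list(sliding_windows(list(range(1000))))
print([len(w) for w in windows])  # [384, 384, 384, 232] -- each window shares 128 tokens with the next
```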
@@ -195,7 +202,9 @@ def _postprocess(
         encodings: List[Encoding],
     ) -> Tuple[List[List[int]], List[List[int]], torch.Tensor]:
         """
-        Turn start and end logits into confidence scores for each answer span. Unlike most other implementations, there is no normalisation in each split to make the scores more comparable across different splits. The top k answer spans are returned.
+        Turns start and end logits into probability scores for each answer span. Unlike most other
+        implementations, it doesn't normalize the scores within each split; this keeps them comparable
+        across different splits. Returns the top k answer spans.
         """
         mask = sequence_ids == 1
         mask = torch.logical_and(mask, attention_mask == 1)
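The diff doesn't show the scoring code itself, but the description admits a compact reading: score each (start, end) pair by its summed logits and squash with an independent sigmoid, so no softmax normalizes scores within a split. A hedged sketch; the exact formula and the role of `calibration_factor` here are assumptions:

```python
import torch

def span_probabilities(start_logits: torch.Tensor, end_logits: torch.Tensor,
                       calibration_factor: float = 0.1) -> torch.Tensor:
    # scores[i, j] = start_logits[i] + end_logits[j] for the span starting at i, ending at j
    scores = start_logits[:, None] + end_logits[None, :]
    # Only spans whose end comes at or after their start are valid.
    valid = torch.triu(torch.ones_like(scores, dtype=torch.bool))
    scores = scores.masked_fill(~valid, float("-inf"))
    # Independent sigmoid instead of softmax: nothing is normalized within the split,
    # so probabilities remain comparable across splits of the same Document.
    return torch.sigmoid(scores * calibration_factor)
```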
@@ -269,15 +278,29 @@ def _nest_answers(
         no_answer: bool,
     ) -> List[List[ExtractedAnswer]]:
         """
-        Reconstructs the nested structure that existed before flattening. Also computes a no answer probability. This probability is different from most other implementations because it does not consider the no answer logit introduced with SQuAD 2. Instead, it just computes the probability that the answer does not exist in the top k or top p.
+        Reconstructs the nested structure that existed before flattening. Also computes a no answer probability.
+        This probability is different from most other implementations because it does not consider the no answer
+        logit introduced with SQuAD 2. Instead, it just computes the probability that the answer does not exist
+        in the top k or top p.
         """
         flat_answers_without_queries = []
         for document_id, start_candidates_, end_candidates_, probabilities_ in zip(
             document_ids, start, end, probabilities
         ):
             for start_, end_, probability in zip(start_candidates_, end_candidates_, probabilities_):
                 doc = flattened_documents[document_id]
-                flat_answers_without_queries.append({"data": doc.content[start_:end_], "document": doc, "probability": probability.item(), "start": start_, "end": end_, "metadata": {}})  # type: ignore # doc.content cannot be None, because those documents are filtered when preprocessing. However, mypy doesn't know that.
+                # doc.content cannot be None, because those documents are filtered when preprocessing.
+                # However, mypy doesn't know that.
+                flat_answers_without_queries.append(
+                    {
+                        "data": doc.content[start_:end_],  # type: ignore
+                        "document": doc,
+                        "probability": probability.item(),
+                        "start": start_,
+                        "end": end_,
+                        "metadata": {},
+                    }
+                )
         i = 0
         nested_answers = []
         for query_id in range(query_ids[-1] + 1):
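The no answer probability described in this docstring follows directly from the answer probabilities: if the top candidates have probabilities p_i and are treated as independent, the chance that none of them is a correct answer is the product of (1 - p_i). A small sketch; the independence treatment is the simplification the docstring describes:

```python
def no_answer_probability(candidate_probabilities):
    """P(no answer) = product of (1 - p_i) over the returned candidates."""
    prob = 1.0
    for p in candidate_probabilities:
        prob *= 1.0 - p
    return prob

print(no_answer_probability([0.8, 0.5, 0.2]))  # 0.08 -- some candidate is very likely correct
```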
@@ -316,13 +339,24 @@ def run(
         no_answer: Optional[bool] = None,
     ):
         """
-        Performs extractive QA on the given documents using the given query.
+        Locates and extracts answers from the given Documents using the given query.
         :param query: Query string.
-        :param documents: List of Documents to search for an answer to the query.
+        :param documents: List of Documents in which you want to search for an answer to the query.
         :param top_k: The maximum number of answers to return.
             An additional answer is returned if no_answer is set to True (default).
-        :param confidence_threshold:
-        :return: List of ExtractedAnswers sorted by (desc.) answer score.
+        :param confidence_threshold: Returns only answers with a probability score above this threshold.
+        :param max_seq_length: Maximum number of tokens.
+            If a sequence exceeds it, the sequence is split.
+            Default: 384
+        :param stride: Number of tokens that overlap when a sequence is split because it exceeds max_seq_length.
+            Default: 128
+        :param max_batch_size: Maximum number of samples that are fed through the model at the same time.
+        :param answers_per_seq: Number of answer candidates to consider per sequence.
+            This is relevant when a Document was split into multiple sequences because of max_seq_length.
+        :param no_answer: Whether to return no answer scores.
         """
         queries = [query]  # Temporary solution until we have decided what batching should look like in v2
         nested_documents = [documents]
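As the signature and parameters above suggest, the constructor settings can be overridden per call. An illustrative call, reusing the reader and documents from the earlier sketch:

```python
result = reader.run(
    query="What is Python?",
    documents=docs,
    top_k=5,                   # override the constructor's top_k for this query only
    confidence_threshold=0.5,  # keep only answers with probability above 0.5
)
```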