Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add options for what to do with missing metadata fields in MetaFieldRanker #7700

Merged
merged 15 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 62 additions & 8 deletions haystack/components/rankers/meta_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
top_k: Optional[int] = None,
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
sort_order: Literal["ascending", "descending"] = "descending",
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I'd want to convert the Literal init parameters to follow the enum pattern seen in other parts of the library (cf. HFGenerationAPIType and HuggingFaceAPIGenerator).

Would you be up to fixing that in a follow-up PR? This would also mean that the validation code gets changed/moved around.

meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
Expand All @@ -65,6 +66,14 @@ def __init__(
:param sort_order:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
Comment on lines +70 to +76
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we introduce the enum, the bulk of this docstring can be moved to the enum's own docstrings.

:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
Expand All @@ -82,11 +91,13 @@ def __init__(
self.top_k = top_k
self.ranking_mode = ranking_mode
self.sort_order = sort_order
self.missing_meta = missing_meta
self._validate_params(
weight=self.weight,
top_k=self.top_k,
ranking_mode=self.ranking_mode,
sort_order=self.sort_order,
missing_meta=self.missing_meta,
meta_value_type=meta_value_type,
)
self.meta_value_type = meta_value_type
Expand All @@ -97,6 +108,7 @@ def _validate_params(
top_k: Optional[int],
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
sort_order: Literal["ascending", "descending"],
missing_meta: Literal["drop", "top", "bottom"],
meta_value_type: Optional[Literal["float", "int", "date"]],
):
if top_k is not None and top_k <= 0:
Expand Down Expand Up @@ -125,6 +137,14 @@ def _validate_params(
"MetaFieldRanker." % sort_order
)

if missing_meta not in ["drop", "top", "bottom"]:
raise ValueError(
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
"but is currently set to '%s'.\n"
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
"MetaFieldRanker." % missing_meta
)

if meta_value_type not in ["float", "int", "date", None]:
raise ValueError(
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
Expand All @@ -141,6 +161,7 @@ def run(
weight: Optional[float] = None,
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
sort_order: Optional[Literal["ascending", "descending"]] = None,
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
Expand Down Expand Up @@ -171,6 +192,15 @@ def run(
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
If not provided, the missing_meta provided at initialization time is used.
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
Expand Down Expand Up @@ -199,12 +229,14 @@ def run(
weight = weight if weight is not None else self.weight
ranking_mode = ranking_mode or self.ranking_mode
sort_order = sort_order or self.sort_order
missing_meta = missing_meta or self.missing_meta
meta_value_type = meta_value_type or self.meta_value_type
self._validate_params(
weight=weight,
top_k=top_k,
ranking_mode=ranking_mode,
sort_order=sort_order,
missing_meta=missing_meta,
meta_value_type=meta_value_type,
)

Expand All @@ -227,13 +259,27 @@ def run(
return {"documents": documents[:top_k]}

if len(docs_missing_meta_field) > 0:
logger.warning(
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n"
"These Documents will be placed at the end of the sorting order.",
meta_field=self.meta_field,
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
warning_start = (
f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
)

if missing_meta == "bottom":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
warning_start=warning_start,
)
elif missing_meta == "top":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
warning_start=warning_start,
)
else:
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
warning_start=warning_start,
)

# If meta_value_type is provided try to parse the meta values
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
Expand All @@ -252,10 +298,18 @@ def run(
)
return {"documents": documents[:top_k]}

# Add the docs missing the meta_field back on the end
# Merge rankings and handle missing meta fields as specified in the missing_meta parameter
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
if missing_meta == "bottom":
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
shadeMe marked this conversation as resolved.
Show resolved Hide resolved
elif missing_meta == "top":
sorted_documents = docs_missing_meta_field + sorted_by_meta
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
else:
sorted_documents = sorted_by_meta
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)

return {"documents": sorted_documents[:top_k]}

def _parse_meta(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
features:
- |
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
documents that lack the ranked meta field. Supported values are `"bottom"` (which
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
at the top), and `"drop"` (which removes them from the results entirely).
40 changes: 40 additions & 0 deletions test/components/rankers/test_metafield.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ def test_raises_value_error_if_wrong_sort_order(self):
with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", sort_order="wrong_order")

def test_raises_value_error_if_wrong_missing_meta(self):
    """Constructing the ranker with an unsupported `missing_meta` value raises ValueError."""
    invalid_missing_meta = "wrong_missing_meta"
    with pytest.raises(ValueError):
        MetaFieldRanker(meta_field="rating", missing_meta=invalid_missing_meta)

def test_raises_value_error_if_wrong_meta_value_type(self):
    """Constructing the ranker with an unsupported `meta_value_type` raises ValueError."""
    invalid_value_type = "wrong_type"
    with pytest.raises(ValueError):
        MetaFieldRanker(meta_field="rating", meta_value_type=invalid_value_type)
Expand Down Expand Up @@ -239,3 +243,39 @@ def test_different_ranking_mode_for_init_vs_run(self):
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
docs_after = output["documents"]
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)

def test_missing_meta_bottom(self):
    """With missing_meta="bottom", the document lacking the `rating` meta key is ranked last."""
    # Document "2" is the only one without the `rating` meta key.
    input_docs = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.4),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom")

    ranked_docs = ranker.run(documents=input_docs)["documents"]

    # All three documents are kept.
    assert len(ranked_docs) == 3
    assert ranked_docs[2].id == "2"

def test_missing_meta_top(self):
    """With missing_meta="top", the document lacking the `rating` meta key is ranked first."""
    # Document "2" is the only one without the `rating` meta key.
    input_docs = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top")

    ranked_docs = ranker.run(documents=input_docs)["documents"]

    # All three documents are kept.
    assert len(ranked_docs) == 3
    assert ranked_docs[0].id == "2"

def test_missing_meta_drop(self):
    """With missing_meta="drop", the document lacking the `rating` meta key is absent from the output."""
    # Document "2" is the only one without the `rating` meta key.
    input_docs = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop")

    ranked_docs = ranker.run(documents=input_docs)["documents"]

    # Only the two documents that carry the meta field remain.
    assert len(ranked_docs) == 2
    assert all(doc.id != "2" for doc in ranked_docs)