Skip to content

Commit

Permalink
feat: Add options for what to do with missing metadata fields in `MetaFieldRanker` (#7700)
Browse files Browse the repository at this point in the history

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add release notes for new `missing_meta` param of `MetaFieldRanker`

* Move part of docs_missing_meta_field warning string outside of `if...elif...else`.
  • Loading branch information
robpasternak committed Jun 12, 2024
1 parent 14c7b02 commit 28dd0f5
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 8 deletions.
70 changes: 62 additions & 8 deletions haystack/components/rankers/meta_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
top_k: Optional[int] = None,
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
sort_order: Literal["ascending", "descending"] = "descending",
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
Expand All @@ -65,6 +66,14 @@ def __init__(
:param sort_order:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
Expand All @@ -82,11 +91,13 @@ def __init__(
self.top_k = top_k
self.ranking_mode = ranking_mode
self.sort_order = sort_order
self.missing_meta = missing_meta
self._validate_params(
weight=self.weight,
top_k=self.top_k,
ranking_mode=self.ranking_mode,
sort_order=self.sort_order,
missing_meta=self.missing_meta,
meta_value_type=meta_value_type,
)
self.meta_value_type = meta_value_type
Expand All @@ -97,6 +108,7 @@ def _validate_params(
top_k: Optional[int],
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
sort_order: Literal["ascending", "descending"],
missing_meta: Literal["drop", "top", "bottom"],
meta_value_type: Optional[Literal["float", "int", "date"]],
):
if top_k is not None and top_k <= 0:
Expand Down Expand Up @@ -125,6 +137,14 @@ def _validate_params(
"MetaFieldRanker." % sort_order
)

if missing_meta not in ["drop", "top", "bottom"]:
raise ValueError(
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
"but is currently set to '%s'.\n"
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
"MetaFieldRanker." % missing_meta
)

if meta_value_type not in ["float", "int", "date", None]:
raise ValueError(
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
Expand All @@ -141,6 +161,7 @@ def run(
weight: Optional[float] = None,
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
sort_order: Optional[Literal["ascending", "descending"]] = None,
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
Expand Down Expand Up @@ -171,6 +192,15 @@ def run(
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
If not provided, the missing_meta provided at initialization time is used.
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
Expand Down Expand Up @@ -199,12 +229,14 @@ def run(
weight = weight if weight is not None else self.weight
ranking_mode = ranking_mode or self.ranking_mode
sort_order = sort_order or self.sort_order
missing_meta = missing_meta or self.missing_meta
meta_value_type = meta_value_type or self.meta_value_type
self._validate_params(
weight=weight,
top_k=top_k,
ranking_mode=ranking_mode,
sort_order=sort_order,
missing_meta=missing_meta,
meta_value_type=meta_value_type,
)

Expand All @@ -227,13 +259,27 @@ def run(
return {"documents": documents[:top_k]}

if len(docs_missing_meta_field) > 0:
logger.warning(
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n"
"These Documents will be placed at the end of the sorting order.",
meta_field=self.meta_field,
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
warning_start = (
f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
)

if missing_meta == "bottom":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
warning_start=warning_start,
)
elif missing_meta == "top":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
warning_start=warning_start,
)
else:
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
warning_start=warning_start,
)

# If meta_value_type is provided try to parse the meta values
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
Expand All @@ -252,10 +298,18 @@ def run(
)
return {"documents": documents[:top_k]}

# Add the docs missing the meta_field back on the end
# Merge rankings and handle missing meta fields as specified in the missing_meta parameter
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
if missing_meta == "bottom":
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
elif missing_meta == "top":
sorted_documents = docs_missing_meta_field + sorted_by_meta
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
else:
sorted_documents = sorted_by_meta
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)

return {"documents": sorted_documents[:top_k]}

def _parse_meta(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
features:
- |
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
documents that lack the ranked meta field. Supported values are `"bottom"` (which
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
at the top), and `"drop"` (which removes them from the results entirely).
40 changes: 40 additions & 0 deletions test/components/rankers/test_metafield.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ def test_raises_value_error_if_wrong_sort_order(self):
with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", sort_order="wrong_order")

def test_raises_value_error_if_wrong_missing_meta(self):
    """An unsupported `missing_meta` value must make the ranker's constructor raise `ValueError`."""
    unsupported_option = "wrong_missing_meta"
    with pytest.raises(ValueError):
        MetaFieldRanker(meta_field="rating", missing_meta=unsupported_option)

def test_raises_value_error_if_wrong_meta_value_type(self):
with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type")
Expand Down Expand Up @@ -239,3 +243,39 @@ def test_different_ranking_mode_for_init_vs_run(self):
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
docs_after = output["documents"]
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)

def test_missing_meta_bottom(self):
    """With `missing_meta="bottom"`, the Document lacking `rating` is kept and ends up last."""
    # Document "2" is the only one without a `rating` entry in its meta.
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.4),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom")

    ranked = ranker.run(documents=candidates)["documents"]

    # Nothing is dropped, and the Document without the meta field closes the list.
    assert len(ranked) == 3
    assert ranked[2].id == "2"

def test_missing_meta_top(self):
    """With `missing_meta="top"`, the Document lacking `rating` is kept and ends up first."""
    # Document "2" is the only one without a `rating` entry in its meta.
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top")

    ranked = ranker.run(documents=candidates)["documents"]

    # Nothing is dropped, and the Document without the meta field leads the list.
    assert len(ranked) == 3
    assert ranked[0].id == "2"

def test_missing_meta_drop(self):
    """With `missing_meta="drop"`, the Document lacking `rating` is absent from the output."""
    # Document "2" is the only one without a `rating` entry in its meta.
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop")

    ranked = ranker.run(documents=candidates)["documents"]

    # Only the two Documents carrying the meta field survive.
    assert len(ranked) == 2
    surviving_ids = [doc.id for doc in ranked]
    assert "2" not in surviving_ids

0 comments on commit 28dd0f5

Please sign in to comment.