Skip to content

Commit

Permalink
make value list a distr field (#1157)
Browse files Browse the repository at this point in the history
* make value list a distr field

* ColumnCategoryMetric counts field

* fix numpy 2.0

* limit numpy

* fix tests

* fix tests

* fix tests
  • Loading branch information
mike0sv committed Jun 17, 2024
1 parent 0f05b07 commit 580dd97
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 9 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"statsmodels>=0.12.2",
"scikit-learn>=1.0.1",
"pandas[parquet]>=1.3.5",
"numpy>=1.22.0",
"numpy>=1.22.0,<2",
"nltk>=3.6.7",
"scipy>=1.10.0",
"requests>=2.32.0",
Expand Down
34 changes: 31 additions & 3 deletions src/evidently/metrics/data_quality/column_category_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from evidently.base_metric import Metric
from evidently.base_metric import MetricResult
from evidently.core import IncludeTags
from evidently.metric_results import HistogramData
from evidently.model.widget import BaseWidgetInfo
from evidently.options.base import AnyOptions
from evidently.renderers.base_renderer import MetricRenderer
Expand All @@ -28,21 +29,48 @@ class Config:
category_ratio: float


class CountOfValues(MetricResult):
    """Histogram-style value counts for the current and, optionally, the reference data."""

    # Counts for the current dataset (always present).
    current: HistogramData
    # Counts for the reference dataset; stays None when no reference is supplied.
    reference: Optional[HistogramData] = None


class ColumnCategoryMetricResult(MetricResult):
    """Result of the column category metric.

    Value counts are stored in the ``counts`` field as ``HistogramData``.
    The former ``counts_of_values`` mapping of DataFrames is still accepted
    as a constructor keyword and is still readable through the
    ``counts_of_values`` property, so older callers keep working.
    """

    class Config:
        pd_exclude_fields = {"counts"}
        field_tags = {
            "current": {IncludeTags.Current},
            "reference": {IncludeTags.Reference},
            "column_name": {IncludeTags.Parameter},
            "counts": {IncludeTags.Extra},
        }

    def __init__(self, **data):
        """Build the result, converting a legacy ``counts_of_values`` keyword if given.

        Args:
            **data: Field values. When ``counts_of_values`` is present it must
                map ``"current"`` (and optionally ``"reference"``) to an object
                indexable by ``"x"`` and ``"count"``; it is removed from
                ``data`` and replaced by an equivalent ``counts`` value.

        Raises:
            KeyError: If a legacy mapping lacks ``"current"``, ``"x"`` or ``"count"``.
        """
        if "counts_of_values" in data:
            legacy: Dict[str, pd.DataFrame] = data.pop("counts_of_values")
            # Collect the histograms first so CountOfValues is constructed in one
            # step instead of being mutated after creation.
            histograms = {
                "current": HistogramData(x=legacy["current"]["x"], count=legacy["current"]["count"]),
            }
            if "reference" in legacy:
                histograms["reference"] = HistogramData(
                    x=legacy["reference"]["x"], count=legacy["reference"]["count"]
                )
            data["counts"] = CountOfValues(**histograms)
        super().__init__(**data)

    # Name of the column the metric was computed for.
    column_name: str
    # Category value the metric looks at.
    category: Union[int, float, str]
    # Category statistics for the current data.
    current: CategoryStat
    # Category statistics for the reference data, when available.
    reference: Optional[CategoryStat] = None
    # Value counts; replaces the former ``counts_of_values`` field.
    counts: CountOfValues

    @property
    def counts_of_values(self) -> Dict[str, pd.DataFrame]:
        """Return ``counts`` in the legacy layout, for backward compatibility.

        Returns:
            A dict with a ``"current"`` DataFrame (columns ``x`` and ``count``)
            and, only when reference counts exist, a ``"reference"`` DataFrame
            with the same columns.
        """
        frames = {"current": pd.DataFrame({"x": self.counts.current.x, "count": self.counts.current.count})}
        reference = self.counts.reference
        if reference is not None:
            frames["reference"] = pd.DataFrame({"x": reference.x, "count": reference.count})
        return frames


class ColumnCategoryMetric(Metric[ColumnCategoryMetricResult]):
Expand Down
31 changes: 27 additions & 4 deletions src/evidently/metrics/data_quality/column_value_list_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from evidently.base_metric import MetricResult
from evidently.calculations.data_quality import get_rows_count
from evidently.core import IncludeTags
from evidently.metric_results import DistributionIncluded
from evidently.model.widget import BaseWidgetInfo
from evidently.options.base import AnyOptions
from evidently.renderers.base_renderer import MetricRenderer
Expand All @@ -25,18 +26,40 @@
class ValueListStat(MetricResult):
    """Statistics on how many column values fall inside or outside a value list.

    Per-value counts are stored as ``DistributionIncluded`` objects in
    ``values_in_list_dist`` / ``values_not_in_list_dist``. The former
    ``values_in_list`` / ``values_not_in_list`` lists of ``(value, count)``
    pairs are still accepted as constructor keywords and are still readable
    through properties of the same names.
    """

    class Config:
        field_tags = {
            "values_in_list_dist": {IncludeTags.Extra},
            "values_not_in_list_dist": {IncludeTags.Extra},
            "rows_count": {IncludeTags.Extra},
        }

    def __init__(self, **data: Any):
        """Build the stat, converting legacy pair-list keywords if given.

        Args:
            **data: Field values. A ``values_in_list`` or ``values_not_in_list``
                keyword (sequence of ``(value, count)`` pairs) is removed from
                ``data`` and replaced by the matching ``*_dist`` field, with
                the values as ``x`` and the counts as ``y``.
        """
        for legacy_key in ("values_in_list", "values_not_in_list"):
            if legacy_key not in data:
                continue
            pairs: List[Tuple[Any, int]] = data.pop(legacy_key)
            # Index instead of unpacking so list-shaped pairs (e.g. parsed from
            # JSON) are handled exactly like tuples.
            data[f"{legacy_key}_dist"] = DistributionIncluded(
                x=[pair[0] for pair in pairs], y=[pair[1] for pair in pairs]
            )

        super().__init__(**data)

    number_in_list: int
    number_not_in_list: int
    share_in_list: float
    share_not_in_list: float
    rows_count: int
    # Per-value counts; replace the former ``values_in_list`` / ``values_not_in_list`` fields.
    values_in_list_dist: DistributionIncluded
    values_not_in_list_dist: DistributionIncluded

    @property
    def values_in_list(self) -> List[Tuple[Any, int]]:
        """Return ``values_in_list_dist`` as ``(value, count)`` pairs (legacy layout)."""
        return list(zip(self.values_in_list_dist.x, self.values_in_list_dist.y))

    @property
    def values_not_in_list(self) -> List[Tuple[Any, int]]:
        """Return ``values_not_in_list_dist`` as ``(value, count)`` pairs (legacy layout)."""
        return list(zip(self.values_not_in_list_dist.x, self.values_not_in_list_dist.y))


class ColumnValueListMetricResult(MetricResult):
Expand Down
69 changes: 68 additions & 1 deletion tests/metrics/data_quality/test_column_value_list_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
import pytest

from evidently._pydantic_compat import parse_obj_as
from evidently.metrics import ColumnValueListMetric
from evidently.metrics.data_quality.column_value_list_metric import ColumnValueListMetricResult
from evidently.metrics.data_quality.column_value_list_metric import ValueListStat
Expand Down Expand Up @@ -202,7 +203,7 @@ def test_data_quality_value_list_metric_value_errors(


@pytest.mark.parametrize(
"current_data, reference_data, metric, expected_json",
"current_data, reference_data, metric, old_json",
(
(
pd.DataFrame({"col": [1, 2, 3]}),
Expand Down Expand Up @@ -257,6 +258,72 @@ def test_data_quality_value_list_metric_value_errors(
),
),
)
def test_data_quality_value_list_metric_with_report_compat(
    current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnValueListMetric, old_json: dict
) -> None:
    """Check that a result parsed from the old JSON layout equals a freshly computed one."""
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())

    # old_json uses the legacy field names; parsing it must yield the same result object.
    result = parse_obj_as(ColumnValueListMetricResult, old_json)
    assert metric.get_result() == result


@pytest.mark.parametrize(
"current_data, reference_data, metric, expected_json",
(
(
pd.DataFrame({"col": [1, 2, 3]}),
None,
ColumnValueListMetric(column_name="col", values=[1]),
{
"column_name": "col",
"current": {
"number_in_list": 1,
"number_not_in_list": 2,
"rows_count": 3,
"share_in_list": 0.3333333333333333,
"share_not_in_list": 0.6666666666666666,
"values_in_list_dist": {"x": [1], "y": [1]},
"values_not_in_list_dist": {"x": [2, 3], "y": [1, 1]},
},
"reference": None,
"values": [1],
},
),
(
pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}),
pd.DataFrame(
{
"col1": [10, 20, 3.5],
"col2": [1, 2, 3],
}
),
ColumnValueListMetric(column_name="col1"),
{
"column_name": "col1",
"current": {
"number_in_list": 0,
"number_not_in_list": 3,
"rows_count": 3,
"share_in_list": 0.0,
"share_not_in_list": 1.0,
"values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [0, 0, 0]},
"values_not_in_list_dist": {"x": [1, 2, 3], "y": [1, 1, 1]},
},
"reference": {
"number_in_list": 3,
"number_not_in_list": 0,
"rows_count": 3,
"share_in_list": 1.0,
"share_not_in_list": 0.0,
"values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [1, 1, 1]},
"values_not_in_list_dist": {"x": [], "y": []},
},
"values": [10.0, 20.0, 3.5],
},
),
),
)
def test_data_quality_value_list_metric_with_report(
current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnValueListMetric, expected_json: dict
) -> None:
Expand Down

0 comments on commit 580dd97

Please sign in to comment.