Skip to content

Commit

Permalink
ignore min_category_size_ratio for balance_classes=True (#2336)
Browse files Browse the repository at this point in the history
  • Loading branch information
nirhutnik committed Feb 16, 2023
1 parent a04a354 commit 5c5b20a
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,10 @@ class TrainTestPredictionDrift(TrainTestCheck, ReduceMixin):
disproportionally. This filter is applied to both distributions, in both margins.
min_category_size_ratio: float, default: 0.01
Minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
Only relevant if drift is calculated for classification predictions. Max number of allowed categories.
If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots.
If there are more, they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ class TrainTestLabelDrift(TrainTestCheck, ReduceLabelMixin):
disproportionally. This filter is applied to both distributions, in both margins.
min_category_size_ratio: float, default: 0.01
Minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
Only for classification. Max number of allowed categories. If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots
they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
Expand Down
3 changes: 3 additions & 0 deletions deepchecks/utils/distribution/drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ def cramers_v(dist1: Union[np.ndarray, pd.Series], dist2: Union[np.ndarray, pd.S
the bias-corrected Cramer's V value of the 2 distributions.
"""
# If balance_classes is True, min_category_size_ratio should not affect results:
min_category_size_ratio = min_category_size_ratio if balance_classes is False else 0

dist1_counts, dist2_counts, _ = preprocess_2_cat_cols_to_same_bins(dist1, dist2, min_category_size_ratio,
max_num_categories, sort_by)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,15 @@ class TrainTestPredictionDrift(TrainTestCheck, ReducePropertyMixin):
disproportionally. This filter is applied to both distributions, in both margins.
min_category_size_ratio: float, default: 0.01
Minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift: int, default: None
Only for discrete properties. Max number of allowed categories. If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots.
they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by: str, default: 'largest_difference'
Specify which categories to show for categorical features' graphs, as the number of shown categories is limited
by max_num_categories_for_display. Possible values:
- 'train_largest': Show the largest train categories.
- 'test_largest': Show the largest test categories.
- 'largest_difference': Show the largest difference between categories.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ class TrainTestLabelDrift(TrainTestCheck, ReducePropertyMixin, ReduceLabelMixin)
disproportionally. This filter is applied to both distributions, in both margins.
min_category_size_ratio : float, default: 0.01
Minimum size ratio for categories. Categories with size ratio lower than this number are binned
into an "Other" category.
into an "Other" category. Ignored if balance_classes=True.
max_num_categories_for_drift : int, default: None
Only for discrete properties. Max number of allowed categories. If there are more,
they are binned into an "Other" category. This limit applies for both drift calculation and distribution plots.
they are binned into an "Other" category.
max_num_categories_for_display: int, default: 10
Max number of categories to show in plot.
show_categories_by : str, default: 'largest_difference'
Expand Down
12 changes: 12 additions & 0 deletions tests/utils/drift_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,18 @@ def test_cramers_v_min_category_ratio():
res_min_cat_ratio = cramers_v(dist1=dist1, dist2=dist2, min_category_size_ratio=0.1)
assert_that(res_min_cat_ratio, close_to(0.208, 0.01))

def test_cramers_v_imbalanced():
    """Check Cramer's V with ``balance_classes=True`` on two imbalanced binary samples."""
    # First sample: 9900 zeros and 100 ones; second sample: 9950 zeros and 50 ones.
    first_sample = np.repeat([0, 1], [9900, 100])
    second_sample = np.repeat([0, 1], [9950, 50])

    drift_score = cramers_v(dist1=first_sample, dist2=second_sample, balance_classes=True)

    assert_that(drift_score, close_to(0.17, 0.01))

def test_cramers_v_imbalanced_ignore_min_category_size():
    """Check that the balanced Cramer's V score is the same when ``min_category_size_ratio`` is passed.

    The minority class is below the 0.1 ratio passed here in both samples, yet the
    expected score is the same value asserted without ``min_category_size_ratio``.
    """
    # First sample: 9900 zeros and 100 ones; second sample: 9950 zeros and 50 ones.
    first_sample = np.repeat([0, 1], [9900, 100])
    second_sample = np.repeat([0, 1], [9950, 50])

    drift_score = cramers_v(
        dist1=first_sample,
        dist2=second_sample,
        balance_classes=True,
        min_category_size_ratio=0.1,
    )

    assert_that(drift_score, close_to(0.17, 0.01))


def test_ks_no_drift():
dist1 = np.zeros(100)
Expand Down

0 comments on commit 5c5b20a

Please sign in to comment.