Skip to content

Commit

Permalink
fix over 255 bug in whole dataset drift (#1278)
Browse files Browse the repository at this point in the history
* fix over 255 bug in whole dataset drift
  • Loading branch information
benisraeldan committed Apr 17, 2022
1 parent 442b87c commit ff79ce0
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 6 deletions.
4 changes: 3 additions & 1 deletion deepchecks/core/check_utils/whole_dataset_drift_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

from deepchecks.tabular import Dataset
from deepchecks.utils.distribution.plot import feature_distribution_traces, drift_score_bar_traces
from deepchecks.utils.distribution.rare_category_encoder import RareCategoryEncoder
from deepchecks.utils.features import N_TOP_MESSAGE, calculate_feature_importance_or_none
from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.strings import format_percent
Expand Down Expand Up @@ -114,7 +115,8 @@ def generate_model(numerical_columns: List[Hashable], categorical_columns: List[
random_state: int = 42) -> Pipeline:
"""Generate the unfitted Domain Classifier model."""
categorical_transformer = Pipeline(
steps=[('encoder', run_available_kwargs(OrdinalEncoder, handle_unknown='use_encoded_value',
steps=[('rare', RareCategoryEncoder(254)),
('encoder', run_available_kwargs(OrdinalEncoder, handle_unknown='use_encoded_value',
unknown_value=np.nan,
dtype=np.float64))]
)
Expand Down
19 changes: 14 additions & 5 deletions deepchecks/utils/distribution/rare_category_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,24 @@ def __init__(
self.cols = cols
self._col_mapping = None

def fit(self, data: pd.DataFrame):
def fit(self, data: pd.DataFrame, y=None): # noqa # pylint: disable=unused-argument
"""Fit the encoder using given dataframe.
Parameters
----------
data : pd.DataFrame
data to fit from
y :
Unused, but needed for sklearn pipeline
"""
self._col_mapping = {}

if self.cols is not None:
self._col_mapping = data[self.cols].apply(self._fit_for_series, axis=0)
for col in self.cols:
self._col_mapping[col] = self._fit_for_series(data[col])
else:
self._col_mapping = data.apply(self._fit_for_series, axis=0)
for col in data.columns:
self._col_mapping[col] = self._fit_for_series(data[col])

def transform(self, data: pd.DataFrame):
"""Transform given data according to columns processed in `fit`.
Expand All @@ -78,15 +84,18 @@ def transform(self, data: pd.DataFrame):
data[self.cols] = data[self.cols].apply(lambda s: s.map(self._col_mapping[s.name]))
else:
data = data.apply(lambda s: s.map(self._col_mapping[s.name]))

return data

def fit_transform(self, data: pd.DataFrame):
def fit_transform(self, data: pd.DataFrame, y=None): # noqa # pylint: disable=unused-argument
"""Run `fit` and `transform` on given data.
Parameters
----------
data : pd.DataFrame
data to fit on and transform
y :
Unused, but needed for sklearn pipeline
Returns
-------
DataFrame
Expand All @@ -98,7 +107,7 @@ def fit_transform(self, data: pd.DataFrame):
def _fit_for_series(self, series: pd.Series):
top_values = list(series.value_counts().head(self.max_num_categories).index)
other_value = self._get_unique_other_value(series)
mapper = pd.Series(defaultdict(lambda: other_value, {k: k for k in top_values}), name=series.name)
mapper = defaultdict(lambda: other_value, {k: k for k in top_values})
return mapper

def _get_unique_other_value(self, series: pd.Series):
Expand Down
45 changes: 45 additions & 0 deletions tests/checks/distribution/whole_dataset_drift_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
# ----------------------------------------------------------------------------
#
"""Test functions of the whole dataset drift check."""
import string
import random

import numpy as np
import pandas as pd
from hamcrest import assert_that, has_entries, close_to

from deepchecks.tabular.dataset import Dataset
Expand Down Expand Up @@ -98,3 +103,43 @@ def test_max_drift_score_condition_fail(drifted_data):
name='Drift value is not greater than 0.25',
details='Found drift value of: 0.86, corresponding to a domain classifier AUC of: 0.93'
))


def test_over_255_categories_in_column():
    """Check that WholeDatasetDrift runs on a categorical column with >255 categories.

    Regression test for the "over 255" bug: the check must complete and yield
    a truthy ``domain_classifier_auc`` when a categorical feature is drawn
    from a pool of 300 distinct category names.
    """
    # Arrange
    # Seed BOTH generators: the category names come from the stdlib `random`
    # module, which is independent of NumPy's global RNG. Seeding only NumPy
    # would leave the generated category names different on every run.
    np.random.seed(42)
    random.seed(42)

    letters = string.ascii_letters
    categories = [''.join(random.choice(letters) for _ in range(5)) for _ in range(300)]

    # One numeric column and one high-cardinality categorical column per dataset.
    train_data = np.concatenate([np.random.randn(1000, 1),
                                 np.random.choice(a=categories, size=(1000, 1))],
                                axis=1)
    test_data = np.concatenate([np.random.randn(1000, 1),
                                np.random.choice(a=categories, size=(1000, 1))],
                               axis=1)

    df_train = pd.DataFrame(train_data,
                            columns=['numeric_without_drift', 'categorical_with_many_categories'])
    df_test = pd.DataFrame(test_data, columns=df_train.columns)

    # Redraw the test categorical column from a 260-category slice of the pool
    # (still above 255 possible values).
    df_test['categorical_with_many_categories'] = np.random.choice(a=categories[20:280], size=(1000, 1))

    # Concatenating with string data produced a string array; restore the float dtype.
    df_train = df_train.astype({'numeric_without_drift': 'float'})
    df_test = df_test.astype({'numeric_without_drift': 'float'})

    label = np.random.randint(0, 2, size=(df_train.shape[0],))
    df_train['target'] = label
    train_ds = Dataset(df_train, cat_features=['categorical_with_many_categories'], label='target')

    label = np.random.randint(0, 2, size=(df_test.shape[0],))
    df_test['target'] = label
    test_ds = Dataset(df_test, cat_features=['categorical_with_many_categories'], label='target')

    check = WholeDatasetDrift()

    # Act
    result = check.run(train_ds, test_ds)

    # Assert
    # we only care that it runs
    assert_that(result.value['domain_classifier_auc'])

0 comments on commit ff79ce0

Please sign in to comment.