
Commit

Trust Score Comparison (#85)
* First commit confidence_change.py

* update docstrings

* Finish update confidence change

* add notebook

* Update docs and used normalized trust scores

* Start multiclass support

* update confidence to new structure

* Change confidence change to trust score comparison

* Add to suite

* Rerun iris

* Update trust score to show data before encoding

* Update plot to probability density

* fix lint

* Move from confidence to drift

* Fix __init__

* Fix __init__

* Fix lint

* commit new line just to test github action

* Update tests/checks/drift/trust_score_comparison_test.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update deepchecks/checks/drift/trust_score_comparison.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* PR fixes

* Update module string

* Update headnote

* Moved notebook folder

Co-authored-by: Matan Perlmutter <matan@deepchecks.com>
Co-authored-by: Noam Bressler <noamzbr@gmail.com>
3 people committed Nov 24, 2021
1 parent d5e970b commit 792fae1
Showing 14 changed files with 1,914 additions and 779 deletions.
5 changes: 0 additions & 5 deletions deepchecks/__init__.py
@@ -4,11 +4,6 @@
import matplotlib.pyplot as plt

from .utils import is_notebook
# This is a TEMPORARY solution: we currently use matplotlib, which does not let us control the output
# of the graphs, so if the user is in an interactive mode, graphs may be drawn twice. In the near future, we should
# drop matplotlib and start using plotly for our charts.
plt.ioff()

# Matplotlib has multiple backends. If we are in a context that does not support a GUI (for example, during unit
# tests) we can't use a GUI backend, so we must use a non-GUI backend.
if not is_notebook():
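The hunk is truncated here, so the body of the `if not is_notebook():` branch is not shown; presumably it selects a non-GUI matplotlib backend. A minimal sketch of that pattern, with the `Agg` backend as an assumption rather than something taken from this diff:

import matplotlib
# Assumption: outside a notebook (e.g. during unit tests) there may be no display,
# so switch to a non-interactive backend before any figure is created.
matplotlib.use('Agg')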
1 change: 1 addition & 0 deletions deepchecks/base/check.py
@@ -41,6 +41,7 @@ def __call__(self, *args, **kwargs) -> 'ConditionResult':
result.set_name(self.name)
return result


class ConditionCategory(enum.Enum):
"""Condition result category. indicates whether the result should fail the suite."""

1 change: 1 addition & 0 deletions deepchecks/checks/distribution/__init__.py
@@ -1 +1,2 @@
"""Module containing all data distribution checks."""
from .trust_score_comparison import *
70 changes: 70 additions & 0 deletions deepchecks/checks/distribution/preprocessing.py
@@ -0,0 +1,70 @@
"""Module of preprocessing functions."""
from typing import List, Tuple

import numpy as np
import pandas as pd

from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from deepchecks.checks.distribution.rare_category_encoder import RareCategoryEncoder

__all__ = ['preprocess_dataset_to_scaled_numerics']


def preprocess_dataset_to_scaled_numerics(baseline_features: pd.DataFrame, test_features: pd.DataFrame,
                                          categorical_columns: List[str], max_num_categories: int) \
        -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Preprocess given features to scaled numerics.
Args:
baseline_features (DataFrame): Will be used for fit and also transformed. Expect to get only features
test_features (DataFrame): Will be transformed according to baseline_data. Expect to get only features
categorical_columns (List[str]): Indicates names of categorical columns in both DataFrames.
max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
(rare categories will be changed to a form of "other")
Returns:
(DataFrame, DataFrame): returns both datasets transformed.
"""
x_baseline = baseline_features.copy()
x_test = test_features.copy()
non_categorical_columns = list(set(test_features.columns) - set(categorical_columns))

# Impute all-nan cols to all-zero:
    def impute_whole_series_to_zero(s: pd.Series):
        if s.isna().all():
            # Keep the original index and name so the zero column aligns with the frame
            return pd.Series(np.zeros(s.shape), index=s.index, name=s.name)
        else:
            return s

x_baseline = x_baseline.apply(impute_whole_series_to_zero, axis=0)
x_test = x_test.apply(impute_whole_series_to_zero, axis=0)

    # Impute NaN values:
if x_baseline.isna().any().any():
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_imputer = SimpleImputer(strategy='mean')
if categorical_columns:
x_baseline[categorical_columns] = categorical_imputer.fit_transform(x_baseline[categorical_columns])
x_test[categorical_columns] = categorical_imputer.transform(x_test[categorical_columns])
if non_categorical_columns:
x_baseline[non_categorical_columns] = numeric_imputer.fit_transform(x_baseline[non_categorical_columns])
x_test[non_categorical_columns] = numeric_imputer.transform(x_test[non_categorical_columns])

# Scale numeric features between 0 and 1:
scaler = MinMaxScaler()
if non_categorical_columns:
x_baseline[non_categorical_columns] = scaler.fit_transform(x_baseline[non_categorical_columns])
x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

# Replace non-common categories with special value:
rare_category_encoder = RareCategoryEncoder(max_num_categories=max_num_categories, cols=categorical_columns)
x_baseline = rare_category_encoder.fit_transform(x_baseline)
x_test = rare_category_encoder.transform(x_test)

# One-hot encode categorical features:
one_hot_encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)
x_baseline = one_hot_encoder.fit_transform(x_baseline)
x_test = one_hot_encoder.transform(x_test)

return x_baseline, x_test
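A minimal usage sketch of the new helper (the frames and column names below are illustrative, not taken from this commit):

import pandas as pd

from deepchecks.checks.distribution.preprocessing import preprocess_dataset_to_scaled_numerics

baseline = pd.DataFrame({'age': [22.0, 35.0, None, 41.0], 'city': ['NY', 'NY', 'LA', 'SF']})
test = pd.DataFrame({'age': [30.0, None, 55.0, 28.0], 'city': ['NY', 'SF', 'Paris', 'LA']})

scaled_baseline, scaled_test = preprocess_dataset_to_scaled_numerics(
    baseline_features=baseline,
    test_features=test,
    categorical_columns=['city'],
    max_num_categories=2,
)
# 'age' is mean-imputed and min-max scaled using the baseline's range; 'city' keeps
# its two most frequent baseline categories, maps the rest to a dedicated "other"
# value, and is then one-hot encoded, so both frames share the same numeric columns.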
85 changes: 85 additions & 0 deletions deepchecks/checks/distribution/rare_category_encoder.py
@@ -0,0 +1,85 @@
"""Module of RareCategoryEncoder."""
from collections import defaultdict
from typing import List

import pandas as pd

__all__ = ['RareCategoryEncoder']


class RareCategoryEncoder:
"""Encodes rare categories into an "other" parameter.
Note that this encoder assumes data is received as a DataFrame.
"""

DEFAULT_OTHER_VALUE = 'OTHER_RARE_CATEGORY'

def __init__(self, max_num_categories: int = 10, cols: List[str] = None):
"""Initialize RareCategoryEncoder.
Args:
max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
(rare categories will be changed to a form of "other")
cols (List[str]): Columns to limit the encoder to work on. If non are given will work on all columns given
in `fit`
"""
self.max_num_categories = max_num_categories
self.cols = cols
self._col_mapping = None

def fit(self, data: pd.DataFrame):
"""Fit the encoder using given dataframe.
Args:
data (DataFrame): data to fit from
"""
if self.cols is not None:
self._col_mapping = data[self.cols].apply(self._fit_for_series, axis=0)
else:
self._col_mapping = data.apply(self._fit_for_series, axis=0)

def transform(self, data: pd.DataFrame):
"""Transform given data according to columns processed in `fit`.
Args:
data (DataFrame): data to transform
Returns:
(DataFrame): transformed data
"""
if self._col_mapping is None:
raise RuntimeError('Cannot transform without fitting first')

if self.cols is not None:
data = data.copy()
data[self.cols] = data[self.cols].apply(lambda s: s.map(self._col_mapping[s.name]))
else:
data = data.apply(lambda s: s.map(self._col_mapping[s.name]))
return data

def fit_transform(self, data: pd.DataFrame):
"""Run `fit` and `transform` on given data.
Args:
data (DataFrame): data to fit on and transform
Returns:
(DataFrame): transformed data
"""
self.fit(data)
return self.transform(data)

def _fit_for_series(self, series: pd.Series):
top_values = list(series.value_counts().head(self.max_num_categories).index)
other_value = self._get_unique_other_value(series)
mapper = defaultdict(lambda: other_value, {k: k for k in top_values})
return mapper

def _get_unique_other_value(self, series: pd.Series):
unique_values = list(series.unique())
other = self.DEFAULT_OTHER_VALUE
i = 0
while other in unique_values:
other = self.DEFAULT_OTHER_VALUE + str(i)
i += 1
return other
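A quick sketch of the encoder on its own (illustrative data only). The mapping built in `fit` is a defaultdict whose default is the "other" value, which is what lets `Series.map` re-map categories unseen at fit time:

import pandas as pd

from deepchecks.checks.distribution.rare_category_encoder import RareCategoryEncoder

df = pd.DataFrame({'color': ['red'] * 5 + ['blue'] * 3 + ['green', 'teal']})
encoder = RareCategoryEncoder(max_num_categories=2, cols=['color'])
encoded = encoder.fit_transform(df)
# 'red' and 'blue' are frequent enough to survive; 'green' and 'teal' become
# 'OTHER_RARE_CATEGORY'.

# Categories never seen during fit fall back to the same "other" value:
print(encoder.transform(pd.DataFrame({'color': ['red', 'purple']}))['color'].tolist())
# ['red', 'OTHER_RARE_CATEGORY']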
