-
Notifications
You must be signed in to change notification settings - Fork 246
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* First commit confidence_change.py * update docstrings * Finish update confidence change * add notebook * Update docs and used normalized trust scores * Start multiclass support * update confidence to new structure * Change confidence change to trust score comparison * Add to suite * Rerun iris * Update trust score to show data before encoding * Update plot to probability density * fix lint * Move from confidence to drift * Fix __init__ * Fix __init__ * Fix lint * commit new line just to test github action * Update tests/checks/drift/trust_score_comparison_test.py Co-authored-by: Noam Bressler <noamzbr@gmail.com> * Update deepchecks/checks/drift/trust_score_comparison.py Co-authored-by: Noam Bressler <noamzbr@gmail.com> * PR fixes * Update module string * Update headnote * Moved notebook folder Co-authored-by: Matan Perlmutter <matan@deepchecks.com> Co-authored-by: Noam Bressler <noamzbr@gmail.com>
- Loading branch information
1 parent
d5e970b
commit 792fae1
Showing
14 changed files
with
1,914 additions
and
779 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
"""Module containing all data distribution checks.""" | ||
from .trust_score_comparison import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
"""Module of preprocessing functions.""" | ||
import numpy as np | ||
import pandas as pd | ||
from typing import List, Tuple | ||
|
||
from category_encoders import OneHotEncoder | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import MinMaxScaler | ||
from deepchecks.checks.distribution.rare_category_encoder import RareCategoryEncoder | ||
|
||
__all__ = ['preprocess_dataset_to_scaled_numerics'] | ||
|
||
|
||
def preprocess_dataset_to_scaled_numerics(baseline_features: pd.DataFrame, test_features: pd.DataFrame,
                                          categorical_columns: List[str], max_num_categories: int) \
        -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Preprocess given features to scaled numerics.

    Every fitted step (imputation, scaling, rare-category grouping, one-hot encoding) is fitted on the
    baseline features only and then applied to both DataFrames. The inputs are copied, not modified.

    Args:
        baseline_features (DataFrame): Will be used for fit and also transformed. Expect to get only features
        test_features (DataFrame): Will be transformed according to baseline_data. Expect to get only features
        categorical_columns (List[str]): Indicates names of categorical columns in both DataFrames.
        max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
                                  (rare categories will be changed to a form of "other")
    Returns:
        (DataFrame, DataFrame): returns both datasets transformed.
    """
    x_baseline = baseline_features.copy()
    x_test = test_features.copy()

    # Follow the frame's own column order instead of arbitrary set ordering.
    categorical_set = set(categorical_columns)
    non_categorical_columns = [col for col in test_features.columns if col not in categorical_set]

    # Impute all-nan cols to all-zero:
    def impute_whole_series_to_zero(s: pd.Series) -> pd.Series:
        if s.isna().all():
            # Reuse the original index: a fresh RangeIndex would be aligned by `apply` against the
            # other columns and corrupt any frame that does not have a default index.
            return pd.Series(np.zeros(s.shape), index=s.index, name=s.name)
        return s

    x_baseline = x_baseline.apply(impute_whole_series_to_zero, axis=0)
    x_test = x_test.apply(impute_whole_series_to_zero, axis=0)

    # Impute nan values. The imputers are always fitted on the baseline, but they must also run when
    # only the test data has missing values, otherwise those NaNs leak into the returned frame.
    if x_baseline.isna().any().any() or x_test.isna().any().any():
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        numeric_imputer = SimpleImputer(strategy='mean')
        if categorical_columns:
            x_baseline[categorical_columns] = categorical_imputer.fit_transform(x_baseline[categorical_columns])
            x_test[categorical_columns] = categorical_imputer.transform(x_test[categorical_columns])
        if non_categorical_columns:
            x_baseline[non_categorical_columns] = numeric_imputer.fit_transform(x_baseline[non_categorical_columns])
            x_test[non_categorical_columns] = numeric_imputer.transform(x_test[non_categorical_columns])

    # Scale numeric features between 0 and 1 (range is taken from the baseline):
    if non_categorical_columns:
        scaler = MinMaxScaler()
        x_baseline[non_categorical_columns] = scaler.fit_transform(x_baseline[non_categorical_columns])
        x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

    # Replace non-common categories with special value:
    rare_category_encoder = RareCategoryEncoder(max_num_categories=max_num_categories, cols=categorical_columns)
    x_baseline = rare_category_encoder.fit_transform(x_baseline)
    x_test = rare_category_encoder.transform(x_test)

    # One-hot encode categorical features:
    one_hot_encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)
    x_baseline = one_hot_encoder.fit_transform(x_baseline)
    x_test = one_hot_encoder.transform(x_test)

    return x_baseline, x_test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
"""Module of RareCategoryEncoder.""" | ||
from collections import defaultdict | ||
import pandas as pd | ||
from typing import List | ||
|
||
__all__ = ['RareCategoryEncoder'] | ||
|
||
|
||
class RareCategoryEncoder:
    """Encodes rare categories into an "other" parameter.

    For every fitted column the `max_num_categories` most frequent values are kept as they are, and
    every other value (including values first seen in `transform`) is replaced by a single "other"
    value that is guaranteed not to collide with a value seen in that column during `fit`.

    Note that this encoder assumes data is received as a DataFrame.
    """

    DEFAULT_OTHER_VALUE = 'OTHER_RARE_CATEGORY'

    def __init__(self, max_num_categories: int = 10, cols: List[str] = None):
        """Initialize RareCategoryEncoder.

        Args:
            max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
                                      (rare categories will be changed to a form of "other")
            cols (List[str]): Columns to limit the encoder to work on. If none are given will work on all columns
                              given in `fit`
        """
        self.max_num_categories = max_num_categories
        self.cols = cols
        # Maps column name -> value mapper; stays None until `fit` is called.
        self._col_mapping = None

    def fit(self, data: pd.DataFrame):
        """Fit the encoder using given dataframe.

        Args:
            data (DataFrame): data to fit from
        Returns:
            (RareCategoryEncoder): the fitted encoder itself, to allow call chaining
        """
        frame = data[self.cols] if self.cols is not None else data
        # Build a plain dict instead of relying on how `DataFrame.apply` wraps dict results.
        self._col_mapping = {name: self._fit_for_series(column) for name, column in frame.items()}
        return self

    def transform(self, data: pd.DataFrame):
        """Transform given data according to columns processed in `fit`.

        Args:
            data (DataFrame): data to transform
        Returns:
            (DataFrame): transformed data
        Raises:
            RuntimeError: if called before `fit`
        """
        if self._col_mapping is None:
            raise RuntimeError('Cannot transform without fitting first')

        if self.cols is not None:
            # Work on a copy so the caller's frame is left untouched.
            data = data.copy()
            for col in self.cols:
                data[col] = data[col].map(self._col_mapping[col])
        else:
            data = data.apply(lambda s: s.map(self._col_mapping[s.name]))
        return data

    def fit_transform(self, data: pd.DataFrame):
        """Run `fit` and `transform` on given data.

        Args:
            data (DataFrame): data to fit on and transform
        Returns:
            (DataFrame): transformed data
        """
        self.fit(data)
        return self.transform(data)

    def _fit_for_series(self, series: pd.Series):
        """Build the mapper of a single column: top values map to themselves, the rest to "other"."""
        top_values = list(series.value_counts().head(self.max_num_categories).index)
        other_value = self._get_unique_other_value(series)
        # Any key missing from the mapper falls back to the "other" value.
        return defaultdict(lambda: other_value, {k: k for k in top_values})

    def _get_unique_other_value(self, series: pd.Series):
        """Return an "other" value that does not appear among the values of the given series."""
        existing_values = set(series.unique())
        other = self.DEFAULT_OTHER_VALUE
        i = 0
        # Append a running number until the candidate no longer collides with a real category.
        while other in existing_values:
            other = self.DEFAULT_OTHER_VALUE + str(i)
            i += 1
        return other
Oops, something went wrong.