Trust Score Comparison (#85)

* First commit confidence_change.py * update docstrings * Finish update confidence change * add notebook * Update docs and used normalized trust scores * Start multiclass support * update confidence to new structure * Change confidence change to trust score comparison * Add to suite * Rerun iris * Update trust score to show data before encoding * Update plot to probability density * fix lint * Move from confidence to drift * Fix __init__ * Fix __init__ * Fix lint * commit new line just to test github action * Update tests/checks/drift/trust_score_comparison_test.py Co-authored-by: Noam Bressler <noamzbr@gmail.com> * Update deepchecks/checks/drift/trust_score_comparison.py Co-authored-by: Noam Bressler <noamzbr@gmail.com> * PR fixes * Update module string * Update headnote * Moved notebook folder Co-authored-by: Matan Perlmutter <matan@deepchecks.com> Co-authored-by: Noam Bressler <noamzbr@gmail.com>
deepchecks · Nov 24, 2021 · 792fae1 · 792fae1
1 parent d5e970b
commit 792fae1
Show file tree

Hide file tree

Showing 14 changed files with 1,914 additions and 779 deletions.
diff --git a/deepchecks/__init__.py b/deepchecks/__init__.py
@@ -4,11 +4,6 @@
 import matplotlib.pyplot as plt
 
 from .utils import is_notebook
-# This is a TEMPORARY solution because currently we use matplotlib, which does not allow us to control the output
-# of the graphs, so if the user is in an interactive mode, graphs may be drawed twice. In the near future, we should
-# drop matplotlib and start use plotly for our charts.
-plt.ioff()
-
 # Matplotlib has multiple backends. If we are in a context that does not support GUI (For example, during unit tests)
 # we can't use a GUI backend. Thus we must use a non-GUI backend.
 if not is_notebook():

diff --git a/deepchecks/base/check.py b/deepchecks/base/check.py
@@ -41,6 +41,7 @@ def __call__(self, *args, **kwargs) -> 'ConditionResult':
         result.set_name(self.name)
         return result
 
+
 class ConditionCategory(enum.Enum):
     """Condition result category. indicates whether the result should fail the suite."""
 

diff --git a/deepchecks/checks/distribution/__init__.py b/deepchecks/checks/distribution/__init__.py
@@ -1 +1,2 @@
 """Module containing all data distribution checks."""
+from .trust_score_comparison import *
diff --git a/deepchecks/checks/distribution/preprocessing.py b/deepchecks/checks/distribution/preprocessing.py
@@ -0,0 +1,70 @@
+"""Module of preprocessing functions."""
+import numpy as np
+import pandas as pd
+from typing import List, Tuple
+
+from category_encoders import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import MinMaxScaler
+from deepchecks.checks.distribution.rare_category_encoder import RareCategoryEncoder
+
+__all__ = ['preprocess_dataset_to_scaled_numerics']
+
+
+def preprocess_dataset_to_scaled_numerics(baseline_features: pd.DataFrame, test_features: pd.DataFrame,
+                                          categorical_columns: List[str], max_num_categories: int) \
+        -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Preprocess given features to scaled numerics.
+
+    The steps are: replace fully-missing columns with zeros, impute the remaining missing values, min-max scale
+    the non-categorical columns, replace rare categories with an "other" value and one-hot encode the
+    categorical columns. Every transformer is fitted on the baseline data and then applied to the test data.
+    The given DataFrames are copied and are not modified.
+
+    Args:
+        baseline_features (DataFrame): Will be used for fit and also transformed. Expect to get only features
+        test_features (DataFrame): Will be transformed according to baseline_data. Expect to get only features
+        categorical_columns (List[str]): Indicates names of categorical columns in both DataFrames.
+        max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
+                                  (rare categories will be changed to a form of "other")
+
+    Returns:
+        (DataFrame, DataFrame): returns both datasets transformed.
+    """
+    x_baseline = baseline_features.copy()
+    x_test = test_features.copy()
+    # Keep the original column order so the result does not depend on set iteration order.
+    categorical_set = set(categorical_columns)
+    non_categorical_columns = [col for col in test_features.columns if col not in categorical_set]
+
+    # Impute all-nan cols to all-zero:
+    def impute_whole_series_to_zero(s: pd.Series) -> pd.Series:
+        if s.isna().all():
+            # Reuse the original index and name: DataFrame.apply aligns the returned series on their index, so a
+            # fresh default index would misalign this column whenever the frame is not indexed 0..n-1.
+            return pd.Series(np.zeros(s.shape), index=s.index, name=s.name)
+        return s
+
+    x_baseline = x_baseline.apply(impute_whole_series_to_zero, axis=0)
+    x_test = x_test.apply(impute_whole_series_to_zero, axis=0)
+
+    # impute nan values (the imputers are fitted on the baseline, but missing values that appear only in the
+    # test data must be filled as well, so both frames are checked):
+    if x_baseline.isna().any().any() or x_test.isna().any().any():
+        categorical_imputer = SimpleImputer(strategy='most_frequent')
+        numeric_imputer = SimpleImputer(strategy='mean')
+        if categorical_columns:
+            x_baseline[categorical_columns] = categorical_imputer.fit_transform(x_baseline[categorical_columns])
+            x_test[categorical_columns] = categorical_imputer.transform(x_test[categorical_columns])
+        if non_categorical_columns:
+            x_baseline[non_categorical_columns] = numeric_imputer.fit_transform(x_baseline[non_categorical_columns])
+            x_test[non_categorical_columns] = numeric_imputer.transform(x_test[non_categorical_columns])
+
+    # Scale numeric features between 0 and 1 (range is learned from the baseline only):
+    scaler = MinMaxScaler()
+    if non_categorical_columns:
+        x_baseline[non_categorical_columns] = scaler.fit_transform(x_baseline[non_categorical_columns])
+        x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])
+
+    # Replace non-common categories with special value:
+    rare_category_encoder = RareCategoryEncoder(max_num_categories=max_num_categories, cols=categorical_columns)
+    x_baseline = rare_category_encoder.fit_transform(x_baseline)
+    x_test = rare_category_encoder.transform(x_test)
+
+    # One-hot encode categorical features:
+    one_hot_encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)
+    x_baseline = one_hot_encoder.fit_transform(x_baseline)
+    x_test = one_hot_encoder.transform(x_test)
+
+    return x_baseline, x_test
diff --git a/deepchecks/checks/distribution/rare_category_encoder.py b/deepchecks/checks/distribution/rare_category_encoder.py
@@ -0,0 +1,85 @@
+"""Module of RareCategoryEncoder."""
+from collections import defaultdict
+import pandas as pd
+from typing import List
+
+__all__ = ['RareCategoryEncoder']
+
+
+class RareCategoryEncoder:
+    """Encodes rare categories into an "other" parameter.
+
+    For every fitted column the `max_num_categories` most frequent values are kept as they are, and any other
+    value is replaced with a placeholder that does not collide with the values seen in that column during `fit`.
+
+    Note that this encoder assumes data is received as a DataFrame.
+    """
+
+    DEFAULT_OTHER_VALUE = 'OTHER_RARE_CATEGORY'
+
+    def __init__(self, max_num_categories: int = 10, cols: List[str] = None):
+        """Initialize RareCategoryEncoder.
+
+        Args:
+            max_num_categories (int): Indicates the maximum number of unique categories in a single categorical column
+                                      (rare categories will be changed to a form of "other")
+            cols (List[str]): Columns to limit the encoder to work on. If none are given will work on all columns
+                              given in `fit`
+        """
+        self.max_num_categories = max_num_categories
+        self.cols = cols
+        # Maps column name -> value mapper; stays None until `fit` is called.
+        self._col_mapping = None
+
+    def fit(self, data: pd.DataFrame):
+        """Fit the encoder using given dataframe.
+
+        Args:
+            data (DataFrame): data to fit from
+
+        Returns:
+            (RareCategoryEncoder): the fitted encoder itself, to allow call chaining
+        """
+        columns = data.columns if self.cols is None else self.cols
+        # Build the per-column mapping explicitly instead of relying on how DataFrame.apply wraps dict results.
+        self._col_mapping = {col: self._fit_for_series(data[col]) for col in columns}
+        return self
+
+    def transform(self, data: pd.DataFrame):
+        """Transform given data according to columns processed in `fit`.
+
+        Args:
+            data (DataFrame): data to transform
+
+        Returns:
+            (DataFrame): transformed data
+
+        Raises:
+            RuntimeError: if called before `fit`
+        """
+        if self._col_mapping is None:
+            raise RuntimeError('Cannot transform without fitting first')
+
+        if self.cols is not None:
+            # Work on a copy so the caller's DataFrame is left untouched.
+            data = data.copy()
+            data[self.cols] = data[self.cols].apply(lambda s: s.map(self._col_mapping[s.name]))
+        else:
+            data = data.apply(lambda s: s.map(self._col_mapping[s.name]))
+        return data
+
+    def fit_transform(self, data: pd.DataFrame):
+        """Run `fit` and `transform` on given data.
+
+        Args:
+            data (DataFrame): data to fit on and transform
+
+        Returns:
+            (DataFrame): transformed data
+        """
+        return self.fit(data).transform(data)
+
+    def _fit_for_series(self, series: pd.Series):
+        """Build the mapper of a single column: top values map to themselves, anything else to "other"."""
+        top_values = list(series.value_counts().head(self.max_num_categories).index)
+        other_value = self._get_unique_other_value(series)
+        # The defaultdict supplies `other_value` for every key that is not one of the top values.
+        return defaultdict(lambda: other_value, {k: k for k in top_values})
+
+    def _get_unique_other_value(self, series: pd.Series):
+        """Return an "other" placeholder that is not already one of the values of the given series."""
+        unique_values = set(series.unique())
+        other = self.DEFAULT_OTHER_VALUE
+        i = 0
+        # Append a running number until the placeholder no longer clashes with a real value.
+        while other in unique_values:
+            other = self.DEFAULT_OTHER_VALUE + str(i)
+            i += 1
+        return other