make numerical type explicit + other edge case fixes (#1114)
* v0

* v0

* v0

* linting

* added_tests

* linting

* fix_more_split_bugs

* fix_more_split_bugs

* some_fixes

* typo
JKL98ISR committed Mar 28, 2022
1 parent 2e630e8 commit 535d0cb
Showing 17 changed files with 140 additions and 50 deletions.
@@ -116,6 +116,12 @@ def run_logic(self, context: Context) -> CheckResult:
values_dict = OrderedDict()
displays_dict = OrderedDict()
for column in train_dataset.features:
if column in train_dataset.numerical_features:
column_type = 'numerical'
elif column in train_dataset.cat_features:
column_type = 'categorical'
else:
continue # we only support categorical or numerical features
if features_importance is not None:
fi_rank_series = features_importance.rank(method='first', ascending=False)
fi_rank = fi_rank_series[column]
@@ -127,7 +133,7 @@ def run_logic(self, context: Context) -> CheckResult:
train_column=train_dataset.data[column],
test_column=test_dataset.data[column],
value_name=column,
column_type='categorical' if column in train_dataset.cat_features else 'numerical',
column_type=column_type,
plot_title=plot_title,
max_num_categories=self.max_num_categories
)
@@ -153,7 +159,8 @@ def run_logic(self, context: Context) -> CheckResult:
<br>If available, the plot titles also show the feature importance (FI) rank.
</span>"""

displays = [headnote] + [displays_dict[col] for col in columns_order]
displays = [headnote] + [displays_dict[col] for col in columns_order
if col in train_dataset.cat_features + train_dataset.numerical_features]

return CheckResult(value=values_dict, display=displays, header='Train Test Drift')
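
As an aside, a minimal sketch of why the explicit dispatch above matters: the previous one-liner labelled every non-categorical column as numerical, so an unsupported column (free text, datetimes, etc.) was routed into the numerical drift computation, while the new code simply skips it. The feature lists below are hypothetical.

cat_features = ['sex']
numerical_features = ['age']
features = ['age', 'sex', 'free_text']  # 'free_text' is neither categorical nor numerical

# old behaviour: anything not categorical was assumed to be numerical
old = {c: ('categorical' if c in cat_features else 'numerical') for c in features}
print(old['free_text'])    # 'numerical' -- would be sent to the numerical drift computation

# new behaviour: columns of unknown type are skipped
new = {}
for c in features:
    if c in numerical_features:
        new[c] = 'numerical'
    elif c in cat_features:
        new[c] = 'categorical'
print('free_text' in new)  # False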

@@ -93,11 +93,10 @@ def run_logic(self, context: Context) -> CheckResult:
test_dataset = context.test
features = train_dataset.features
cat_features = train_dataset.cat_features
numerical_features = train_dataset.numerical_features

sample_size = min(self.sample_size, train_dataset.n_samples, test_dataset.n_samples)

numerical_features = list(set(features) - set(cat_features))

headnote = """
<span>
The shown features are the features that are most important for the domain classifier - the
6 changes: 2 additions & 4 deletions deepchecks/tabular/checks/methodology/unused_features.py
@@ -254,15 +254,13 @@ def naive_encoder(dataset: Dataset) -> Tuple[TransformerMixin, list]:
Tuple[TransformerMixin, list]
A transformer object, a list of columns returned
"""
numeric_features = [col for col in dataset.features if col not in dataset.cat_features]

return ColumnTransformer(
transformers=[
('num', Pipeline([
('nan_handling', SimpleImputer()),
('norm', RobustScaler())
]),
numeric_features),
dataset.numerical_features),
('cat',
Pipeline([
('nan_handling', SimpleImputer(strategy='most_frequent')),
Expand All @@ -271,4 +269,4 @@ def naive_encoder(dataset: Dataset) -> Tuple[TransformerMixin, list]:
]),
dataset.cat_features)
]
), numeric_features + dataset.cat_features
), dataset.numerical_features + dataset.cat_features
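
For context, a self-contained sketch of the encoder layout above, with hypothetical column lists; the collapsed categorical encoding step is assumed here to be an ordinal encoder.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder

numerical_features = ['age', 'fare']   # hypothetical stand-ins for dataset.numerical_features
cat_features = ['sex']                 # ...and dataset.cat_features

encoder = ColumnTransformer(transformers=[
    ('num', Pipeline([('nan_handling', SimpleImputer()),
                      ('norm', RobustScaler())]), numerical_features),
    ('cat', Pipeline([('nan_handling', SimpleImputer(strategy='most_frequent')),
                      ('encode', OrdinalEncoder())]), cat_features),  # encoding step assumed
])

df = pd.DataFrame({'age': [22, np.nan, 35], 'fare': [7.3, 8.1, np.nan], 'sex': ['m', 'f', np.nan]})
print(encoder.fit_transform(df))
print(numerical_features + cat_features)  # column order of the transformed output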
@@ -9,6 +9,7 @@
# ----------------------------------------------------------------------------
#
"""The confusion_matrix_report check module."""
import pandas as pd
import sklearn
import plotly.express as px

@@ -46,10 +47,11 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResult:
model = context.model

y_pred = model.predict(ds_x)
total_classes = sorted(list(set(pd.concat([ds_y, pd.Series(y_pred)]).to_list())))
confusion_matrix = sklearn.metrics.confusion_matrix(ds_y, y_pred)

# Figure
fig = px.imshow(confusion_matrix, x=dataset.classes, y=dataset.classes, text_auto=True)
fig = px.imshow(confusion_matrix, x=total_classes, y=total_classes, text_auto=True)
fig.update_layout(width=600, height=600)
fig.update_xaxes(title='Predicted Value', type='category')
fig.update_yaxes(title='True value', type='category')
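
A toy illustration of the edge case handled here: when a class never appears in the predictions (or in the labels), the axis ticks must be the sorted union of both, which is also the row/column order sklearn.metrics.confusion_matrix uses when no explicit labels are given. The values below are made up.

import pandas as pd
import sklearn.metrics

ds_y = pd.Series([0, 1, 2, 2])   # class 2 exists in the labels...
y_pred = [0, 1, 1, 1]            # ...but is never predicted

total_classes = sorted(list(set(pd.concat([ds_y, pd.Series(y_pred)]).to_list())))
confusion_matrix = sklearn.metrics.confusion_matrix(ds_y, y_pred)

print(total_classes)             # [0, 1, 2]
print(confusion_matrix.shape)    # (3, 3) -- rows and columns follow the same sorted union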
14 changes: 9 additions & 5 deletions deepchecks/tabular/checks/performance/model_error_analysis.py
@@ -64,7 +64,12 @@ class ModelErrorAnalysis(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
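
As an aside, a hypothetical illustration of the labels note above using log_loss, another metric that consumes predict_proba output:

from sklearn.metrics import log_loss, make_scorer

# Suppose the model was trained on classes [0, 1, 2] but a given test split only contains [0, 1].
# Without labels=, log_loss infers two classes from y_true and rejects the three-column
# predict_proba output; passing the full training label set keeps the scorer working on such splits.
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])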
@@ -121,10 +126,9 @@ def run_logic(self, context: Context) -> CheckResult:
def scoring_func(dataset: Dataset):
return per_sample_mse(dataset.label_col, model.predict(dataset.features_columns))
else:
le = preprocessing.LabelEncoder()
le.fit(train_dataset.classes)

def scoring_func(dataset: Dataset):
le = preprocessing.LabelEncoder()
le.fit(dataset.classes)
encoded_label = le.transform(dataset.label_col)
return per_sample_cross_entropy(encoded_label,
model.predict_proba(dataset.features_columns))
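
A toy demonstration of why the encoder is now fitted once on the training classes rather than refitted per dataset: refitting on a split that is missing a class silently shifts the integer codes. The class names are made up.

from sklearn.preprocessing import LabelEncoder

train_classes = ['cat', 'dog', 'fish']  # hypothetical training classes
test_labels = ['dog', 'fish']           # 'cat' happens to be absent from the test split

shared = LabelEncoder().fit(train_classes)
print(shared.transform(test_labels))        # [1 2] -- codes consistent with training

per_dataset = LabelEncoder().fit(test_labels)
print(per_dataset.transform(test_labels))   # [0 1] -- 'dog' becomes 0, misaligning with predict_proba columns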
@@ -133,7 +137,7 @@ def scoring_func(dataset: Dataset):
test_scores = scoring_func(test_dataset)

cat_features = train_dataset.cat_features
numeric_features = [num_feature for num_feature in train_dataset.features if num_feature not in cat_features]
numeric_features = train_dataset.numerical_features

error_fi, error_model_predicted = model_error_contribution(train_dataset.features_columns,
train_scores,
15 changes: 10 additions & 5 deletions deepchecks/tabular/checks/performance/performance_report.py
@@ -48,7 +48,12 @@ class PerformanceReport(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
@@ -83,7 +88,6 @@ def run_logic(self, context: Context) -> CheckResult:

model = context.model
task_type = context.task_type
classes = train_dataset.classes

scorers = context.get_scorers(self.user_scorers, class_avg=False)
datasets = {'Train': train_dataset, 'Test': test_dataset}
@@ -93,6 +97,7 @@
results = []

for dataset_name, dataset in datasets.items():
classes = dataset.classes
label = cast(pd.Series, dataset.label_col)
n_samples = label.groupby(label).count()
results.extend(
@@ -197,14 +202,14 @@ def condition(check_result: pd.DataFrame) -> ConditionResult:
test_scores_dict = dict(zip(test_scores_class['Metric'], test_scores_class['Value']))
train_scores_dict = dict(zip(train_scores_class['Metric'], train_scores_class['Value']))
# Calculate percentage of change from train to test
diff = {score_name: _ratio_of_change_calc(score, test_scores_dict[score_name])
diff = {score_name: _ratio_of_change_calc(score, test_scores_dict.get(score_name, 0))
for score_name, score in train_scores_dict.items()}
failed_scores = [k for k, v in diff.items() if v > threshold]
if failed_scores:
for score_name in failed_scores:
explained_failures.append(f'{score_name} for class {class_name} '
f'(train={format_number(train_scores_dict[score_name])} '
f'test={format_number(test_scores_dict[score_name])})')
f'(train={format_number(train_scores_dict.get(score_name, 0))} '
f'test={format_number(test_scores_dict.get(score_name, 0))})')
else:
test_scores_dict = dict(zip(test_scores['Metric'], test_scores['Value']))
train_scores_dict = dict(zip(train_scores['Metric'], train_scores['Value']))
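
A small sketch of the edge case that the .get(..., 0) default covers: a metric that exists for a class on the train split but is missing for that class on the test split. The numbers are made up and the ratio function is only a stand-in for _ratio_of_change_calc.

train_scores_dict = {'Precision': 0.90, 'Recall': 0.80}
test_scores_dict = {'Precision': 0.85}   # 'Recall' is missing for this class on the test split

def ratio_of_change(train_score, test_score):  # stand-in for _ratio_of_change_calc
    return abs(train_score - test_score) / abs(train_score)

# test_scores_dict['Recall'] would raise KeyError; .get(..., 0) treats the missing score as 0,
# so the condition reports maximal degradation for that class instead of crashing
diff = {name: ratio_of_change(score, test_scores_dict.get(name, 0))
        for name, score in train_scores_dict.items()}
print(diff)   # {'Precision': 0.055..., 'Recall': 1.0}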
6 changes: 2 additions & 4 deletions deepchecks/tabular/checks/performance/segment_performance.py
@@ -14,7 +14,7 @@
import numpy as np
import plotly.figure_factory as ff

from deepchecks.tabular import Context, SingleDatasetCheck, Dataset
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError, DatasetValidationError
from deepchecks.utils.performance.partition import partition_column
@@ -111,9 +111,7 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResult:
if feature_2_df.empty:
score = np.NaN
else:
score = scorer(model,
Dataset(feature_2_df, features=dataset.features,
label=dataset.label_name, cat_features=dataset.cat_features))
score = scorer(model, dataset.copy(feature_2_df))
scores[i, j] = score
counts[i, j] = len(feature_2_df)

@@ -64,7 +64,12 @@ class SimpleModelComparison(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
@@ -135,7 +140,7 @@ def run_logic(self, context: Context) -> CheckResult:
# Multiclass have different return type from the scorer, list of score per class instead of single score
if task_type in [ModelType.MULTICLASS, ModelType.BINARY]:
n_samples = test_label.groupby(test_label).count()
classes = train_dataset.classes
classes = test_dataset.classes

results_array = []
# Dict in format { Scorer : Dict { Class : Dict { Origin/Simple : score } } }
40 changes: 28 additions & 12 deletions deepchecks/tabular/dataset.py
@@ -20,7 +20,7 @@
from sklearn.model_selection import train_test_split

from deepchecks.utils.dataframes import select_from_dataframe
from deepchecks.utils.features import is_categorical, infer_categorical_features
from deepchecks.utils.features import infer_numerical_features, is_categorical, infer_categorical_features
from deepchecks.utils.typing import Hashable
from deepchecks.core.errors import DeepchecksValueError, DatasetValidationError, DeepchecksNotSupportedError

@@ -285,6 +285,9 @@ def __init__(
else:
self._label_type = None

unassigned_cols = [col for col in self._features if col not in self._cat_features]
self._numerical_features = infer_numerical_features(self._data[unassigned_cols])

@classmethod
def from_numpy(
cls: t.Type[TDataset],
@@ -463,16 +466,6 @@ def n_samples(self) -> int:
"""
return self.data.shape[0]

def __len__(self) -> int:
"""Return number of samples in the member dataframe.
Returns
-------
int
"""
return self.n_samples

@property
def label_type(self) -> t.Optional[str]:
"""Return the label type.
@@ -736,6 +729,17 @@ def cat_features(self) -> t.List[Hashable]:
"""
return list(self._cat_features)

@property
def numerical_features(self) -> t.List[Hashable]:
"""Return list of numerical feature names.
Returns
-------
t.List[Hashable]
List of numerical feature names.
"""
return list(self._numerical_features)

@property
@lru_cache(maxsize=128)
def classes(self) -> t.Tuple[str, ...]:
@@ -770,8 +774,10 @@ def columns_info(self) -> t.Dict[Hashable, str]:
elif column in self._features:
if column in self.cat_features:
value = 'categorical feature'
else:
elif column in self.numerical_features:
value = 'numerical feature'
else:
value = 'other feature'
else:
value = 'other'
columns[column] = value
@@ -1043,3 +1049,13 @@ def datasets_share_date(cls, *datasets: 'Dataset') -> bool:
return False

return True

def __len__(self) -> int:
"""Return number of samples in the member dataframe.
Returns
-------
int
"""
return self.n_samples
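
A hedged end-to-end sketch of the new property, assuming the usual Dataset(df, label=..., cat_features=...) construction and hypothetical columns: a feature that is neither categorical nor numerical stays out of both lists and is reported by columns_info as 'other feature'.

import pandas as pd
from deepchecks.tabular import Dataset

df = pd.DataFrame({
    'age': [23, 41, 35, 52],
    'city': ['TLV', 'NYC', 'LDN', 'TLV'],
    'note': ['first', 'second', 'third', 'fourth'],  # free text: neither categorical nor numerical
    'label': [0, 1, 0, 1],
})
ds = Dataset(df, label='label', cat_features=['city'])

print(ds.cat_features)        # ['city']
print(ds.numerical_features)  # ['age'] -- 'note' stays out of both lists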
28 changes: 27 additions & 1 deletion deepchecks/utils/features.py
@@ -21,7 +21,7 @@

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype
from sklearn.inspection import permutation_importance

from deepchecks import tabular
@@ -38,6 +38,7 @@
'column_importance_sorter_dict',
'column_importance_sorter_df',
'infer_categorical_features',
'infer_numerical_features',
'is_categorical',
'N_TOP_MESSAGE'
]
@@ -380,6 +381,31 @@ def column_importance_sorter_df(
return df


def infer_numerical_features(df: pd.DataFrame) -> t.List[Hashable]:
"""Infers which features are numerical.
Parameters
----------
df : pd.DataFrame
dataframe for which to infer numerical features
Returns
-------
List[Hashable]
list of numerical features
"""
columns = df.columns
numerical_columns = []
for col in columns:
col_data = df[col]
if col_data.dtype == 'object':
# object might still be only floats, so we reset the dtype
col_data = pd.Series(col_data.to_list())
if is_numeric_dtype(col_data):
numerical_columns.append(col)
return numerical_columns
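
A quick usage sketch of the helper above, on a hypothetical frame: an object-dtype column that actually holds floats is still picked up thanks to the to_list() round-trip, while a plain string column is not.

import pandas as pd
from deepchecks.utils.features import infer_numerical_features

df = pd.DataFrame({
    'age': [23, 41, 35],
    'score': pd.Series([1.7, None, 2.5], dtype='object'),  # floats hidden behind an object dtype
    'city': ['TLV', 'NYC', 'LDN'],
})
print(infer_numerical_features(df))  # expected: ['age', 'score']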


def infer_categorical_features(
df: pd.DataFrame,
max_categorical_ratio: float = 0.01,
2 changes: 1 addition & 1 deletion deepchecks/utils/performance/error_model.py
@@ -216,7 +216,7 @@ def error_model_display(error_fi: pd.Series,
labels={error_col_name: 'model error'}, color=color_col,
color_discrete_map=color_map
))
else:
elif feature in dataset.numerical_features:
# sample data for display
np.random.seed(random_state)
sampling_idx = np.random.choice(range(len(data)), size=n_samples_display, replace=False)
4 changes: 2 additions & 2 deletions deepchecks/utils/performance/partition.py
@@ -111,7 +111,7 @@ def partition_column(
List[DeepchecksFilter]
"""
column = dataset.data[column_name]
if column_name not in dataset.cat_features:
if column_name in dataset.numerical_features:
percentile_values = numeric_segmentation_edges(column, max_segments)
# If for some reason only single value in the column (and column not categorical) we will get single item
if len(percentile_values) == 1:
@@ -131,7 +131,7 @@

filters.append(DeepchecksFilter(f, label))
return filters
else:
elif column_name in dataset.cat_features:
# Get sorted histogram
cat_hist_dict = column.value_counts()
# Get index of last value in histogram to show
@@ -31,12 +31,6 @@
# Classification Model
# ====================

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

model = adult.load_fitted_model()

#%%
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
@@ -70,4 +70,5 @@ quantiles
toplevel
ious
renormalization
jsonpickle
jsonpickle
dtype
