From d4d7ec95078a674bc284fbdf1559dac176de4142 Mon Sep 17 00:00:00 2001 From: matanper Date: Mon, 2 May 2022 10:42:16 +0300 Subject: [PATCH] Update plot model error analysis (#1361) * Update plot model error analysis * Minor fix to Segment Performance --- .../checks/performance/segment_performance.py | 5 ++ .../performance/plot_model_error_analysis.py | 86 +++++++++++++------ .../performance/segment_performance_test.py | 11 +++ 3 files changed, 75 insertions(+), 27 deletions(-) diff --git a/deepchecks/tabular/checks/performance/segment_performance.py b/deepchecks/tabular/checks/performance/segment_performance.py index 5862849e10..2dc71631bf 100644 --- a/deepchecks/tabular/checks/performance/segment_performance.py +++ b/deepchecks/tabular/checks/performance/segment_performance.py @@ -93,6 +93,11 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul elif self.feature_1 is None or self.feature_2 is None: raise DeepchecksValueError('Must define both "feature_1" and "feature_2" or none of them') + else: + # If both are defined, must be in dataset columns + columns = dataset.data.columns + if self.feature_1 not in columns or self.feature_2 not in columns: + raise DeepchecksValueError('"feature_1" and "feature_2" must be in dataset columns') feature_1_filters = partition_column(dataset, self.feature_1, max_segments=self.max_segments) feature_2_filters = partition_column(dataset, self.feature_2, max_segments=self.max_segments) diff --git a/docs/source/checks/tabular/performance/plot_model_error_analysis.py b/docs/source/checks/tabular/performance/plot_model_error_analysis.py index 67be96fb4c..b8c036d013 100644 --- a/docs/source/checks/tabular/performance/plot_model_error_analysis.py +++ b/docs/source/checks/tabular/performance/plot_model_error_analysis.py @@ -2,47 +2,79 @@ """ Model Error Analysis ******************** -""" -#%% -# Load Data -# ========= -# The dataset is the adult dataset which can be downloaded from the UCI machine learning 
repository. -# -# Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. -# Irvine, CA: University of California, School of Information and Computer Science. +This notebook provides an overview for using and understanding the model error analysis check. -from urllib.request import urlopen +**Structure:** -import pandas as pd +* `What is Model Error Analysis? <#what-is-model-error-analysis>`__ +* `Run the check <#run-the-check>`__ +* `Define a condition <#define-a-condition>`__ -from deepchecks.tabular.datasets.classification import adult +What is Model Error Analysis? +============================= +Evaluating the model's overall performance metrics gives a good high-level overview and can be useful for tracking model progress during training or for comparing models. However, when it's time to fully evaluate if a model is fit for production, or when you're interested in a deeper understanding of your model's performance in order to improve it or to be aware of its weaknesses, it's recommended +to look deeper at how the model performs on various segments of the data. The model error analysis check searches for data segments in which the model error is significantly lower than the model error of the dataset as a whole. -#%% -# Create Dataset -# ============== +Algorithm: +---------- -train_ds, test_ds = adult.load_data() +1. Computes the per-sample loss (log-loss for classification, MSE for regression). +2. Trains a regression model to predict the error of the user's model, based on the input features. +3. Repeats stage 2 several times with various tree parameters and random states to ensure that the most relevant partitions for model error are selected. +4. The features scoring the highest feature importance for the error regression model are selected and the distribution of the error vs the feature values is plotted. 
-#%% -# Classification Model -# ==================== +The check results are shown only if the error regression model manages +to predict the error well enough (above a given r squared performance threshold, defined by the min_error_model_score parameter and set by default to 0.5). The resulting plots show the distribution of the error for the features that are +most effective at segmenting the error into high and low values, without the need for manual selection of segmentation +features. -model = adult.load_fitted_model() +Related Checks: +--------------- +When the important segments of the data are known in advance (when we know that some population segments have +different behaviours and business importance, for example income levels or state of residence) it is possible to just +have a look at the performance at various pre-defined segments. In deepchecks, this can be done using the +:doc:`Segment Performance `_ check, which shows the +performance for segments defined by a combination of values from two pre-defined columns. -#%% -# Run Check -# ========= +Run the check +============= +We will run the check on the adult dataset which can be downloaded from the +`UCI machine learning repository <http://archive.ics.uci.edu/ml>`_ and is also available in +`deepchecks.tabular.datasets`. +""" + +from deepchecks.tabular.datasets.classification import adult from deepchecks.tabular.checks import ModelErrorAnalysis -#%% +train_ds, test_ds = adult.load_data(data_format='Dataset', as_train_test=True) +model = adult.load_fitted_model() +# We create the check with a slightly lower r squared threshold to ensure that the check can run on the example dataset. 
check = ModelErrorAnalysis(min_error_model_score=0.3) -check = check.add_condition_segments_performance_relative_difference_not_greater_than() -res = check.run(train_ds, test_ds, model) -res +result = check.run(train_ds, test_ds, model) +result + +#%% +# The check has found that the features 'hours-per-week', 'age' and 'relationship' are the most predictive of +# differences in the model error. We can further investigate the model performance by passing two of these columns +# to the :doc:`Segment Performance `_ check: + +from deepchecks.tabular.checks import SegmentPerformance + +SegmentPerformance(feature_1='age', feature_2='relationship').run(test_ds, model) + +#%% +# From which we learn that the model error is exceptionally higher for people in the "Husband" or "Other" status, +# except for the lower age groups for which the error is lower. #%% +# Define a condition +# ================== +# We can define a condition that enforces that the relative difference between the weak and strong segments is not +# greater than a certain ratio, for example ratio of 0.05 -res.value +check = check.add_condition_segments_performance_relative_difference_not_greater_than(0.05) +result = check.run(train_ds, test_ds, model) +result.show(show_additional_outputs=False) diff --git a/tests/checks/performance/segment_performance_test.py b/tests/checks/performance/segment_performance_test.py index c91ccd18de..f363e7cbde 100644 --- a/tests/checks/performance/segment_performance_test.py +++ b/tests/checks/performance/segment_performance_test.py @@ -56,6 +56,17 @@ def test_segment_performance_diabetes(diabetes_split_dataset_and_model): assert_that(result['counts'].sum(), equal_to(146)) +def test_segment_performance_illegal_features(diabetes_split_dataset_and_model): + # Arrange + _, val, model = diabetes_split_dataset_and_model + + # Act & Assert + assert_that( + calling(SegmentPerformance(feature_1='AGE', feature_2='sex').run).with_args(val, model), + raises(DeepchecksValueError, 
r'\"feature_1\" and \"feature_2\" must be in dataset columns') + ) + + def test_segment_top_features(diabetes_split_dataset_and_model): # Arrange _, val, model = diabetes_split_dataset_and_model