diff --git a/deepchecks/utils/abstracts/weak_segment_abstract.py b/deepchecks/utils/abstracts/weak_segment_abstract.py index 1b7f26d73f..b752978f25 100644 --- a/deepchecks/utils/abstracts/weak_segment_abstract.py +++ b/deepchecks/utils/abstracts/weak_segment_abstract.py @@ -164,7 +164,7 @@ def _weak_segments_search(self, dummy_model, encoded_dataset, feature_rank_for_s tuple(filters[feature1]), feature2, tuple(filters[feature2]), data_size] - return weak_segments.drop_duplicates().sort_values(f'{scorer.name} score') + return weak_segments.drop_duplicates().sort_values(f'{scorer.name} score').reset_index(drop=True) def _find_weak_segment(self, dummy_model, dataset, features_for_segment, scorer: DeepcheckScorer, loss_per_sample): """Find weak segment based on scorer for specified features.""" diff --git a/docs/source/checks/nlp/model_evaluation/plot_confusion_matrix_report.py b/docs/source/checks/nlp/model_evaluation/plot_confusion_matrix_report.py new file mode 100644 index 0000000000..85c70134e5 --- /dev/null +++ b/docs/source/checks/nlp/model_evaluation/plot_confusion_matrix_report.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +.. _plot_tabular_confusion_matrix_report: + +Confusion Matrix Report +*********************** +This notebook provides an overview for using and understanding the Confusion Matrix Report check for NLP tasks. + + +**Structure:** + +* `What is the Confusion Matrix Report? <#what-is-the-confusion-matrix-report>`__ +* `Generate data & model <#generate-data-model>`__ +* `Run the check <#run-the-check>`__ + + +What is the Confusion Matrix Report? +====================================== +The ``ConfusionMatrixReport`` produces a confusion matrix visualization which summarizes the +performance of the model. The confusion matrix contains the TP (true positive), FP (false positive), +TN (true negative) and FN (false negative), from which we can derive the relevant metrics, +such as accuracy, precision, recall etc. (`confusion matrix `__). +""" + +#%% +# Generate data & model +# ======================= +from deepchecks.nlp import TextData +from deepchecks.nlp.checks import ConfusionMatrixReport +from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions + +tweets_data = load_data(data_format='DataFrame', as_train_test=False) +tweets_dataset = TextData(tweets_data.text, label=tweets_data['label'], + task_type='text_classification') + +predictions = load_precalculated_predictions(as_train_test=False) + + +#%% +# Run the check +# =============== + +check = ConfusionMatrixReport() +result = check.run(tweets_dataset, predictions=predictions) +result.show() + +#%% diff --git a/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py new file mode 100644 index 0000000000..d5aa067c61 --- /dev/null +++ b/docs/source/checks/nlp/model_evaluation/plot_metadata_segments_performance.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +""" +Metadata Segments Performance +************************* + +This notebook provides an overview for using and understanding the metadata segment performance check. + +**Structure:** + +* `What is the purpose of the check? 
<#what-is-the-purpose-of-the-check>`__
+* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
+* `Generate data & model <#generate-data-model>`__
+* `Run the check <#run-the-check>`__
+* `Define a condition <#define-a-condition>`__
+
+What is the purpose of the check?
+==================================
+
+The check is designed to help you easily identify the model's weakest segments based on the provided
+:func:`metadata `. In addition,
+it enables you to provide a sublist of the metadata columns, thus limiting the check to search in
+interesting subspaces.
+
+Automatically detecting weak segments
+=====================================
+
+The check contains several steps:
+
+#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according
+   to the task type.
+
+#. We select a subset of features for the weak segment search. This is done by selecting the features with the
+   highest feature importance to the model provided (within the features selected for the check, if limited).
+
+#. We train multiple simple tree based models, each one is trained using exactly two
+   features (out of the ones selected above) to predict the per sample error calculated before.
+
+#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate
+   the model performance on them. For the weakest data segments detected we also calculate the model's
+   performance on data segments surrounding them.
+"""
+#%%
+# Generate data & model
+# =====================
+
+from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions
+
+_, test_dataset = load_data(data_format='TextData')
+_, test_probas = load_precalculated_predictions(pred_format='probabilities')
+
+test_dataset.metadata.head(3)
+
+#%%
+# Run the check
+# =============
+#
+# The check has several key parameters (that are all optional) that affect the behavior of the
+# check and especially its output.
+#
+# ``columns / ignore_columns``: Controls which columns should be searched for weak segments. By default,
+# uses all columns.
+#
+# ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different
+# segments. It is important to select a metric that is relevant to the data domain and task you are performing.
+# For additional information on scorers and how to use them see
+# :doc:`Metrics Guide `.
+#
+# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will
+# return data segments that contain at least this fraction of the total data samples. It is recommended to
+# try different configurations of this parameter, as larger segments can be of interest even if the model
+# performance on them is superior.
+#
+# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called
+# "Other". This parameter determines the frequency threshold for categories to be mapped into the "Other" category.
+#
+# See :class:`API reference ` for more details.
+
+from deepchecks.nlp.checks import MetadataSegmentsPerformance
+from sklearn.metrics import make_scorer, f1_score
+
+scorer = {'f1': make_scorer(f1_score, average='micro')}
+check = MetadataSegmentsPerformance(alternative_scorer=scorer,
+                                    segment_minimum_size_ratio=0.03)
+result = check.run(test_dataset, probabilities=test_probas)
+result.show()
+
+#%%
+# Observe the check's output
+# --------------------------
+#
+# We see in the results that the check indeed found several segments on which the model performance is below average.
+# In the heatmap display we can see model performance on the weakest segments and their environment with respect to the
+# two metadata columns that are relevant to the segment. In order to get the full list of weak segments found we will
+# inspect the ``result.value`` attribute. Shown below are the 3 segments with the worst performance.
+
+
+result.value['weak_segments_list'].head(3)
+
+#%%
+# Define a condition
+# ==================
+#
+# We can add a condition that will validate that the model's performance on the weakest segment detected is above a
+# certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
+# underperforming on a subset of the data that is of interest to us, for example for specific age or gender groups.
+
+# Let's add a condition and re-run the check:
+
+check = MetadataSegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03)
+check.add_condition_segments_relative_performance_greater_than(0.1)
+result = check.run(test_dataset, probabilities=test_probas)
+result.show(show_additional_outputs=False)
diff --git a/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py
new file mode 100644
index 0000000000..b4324a6c60
--- /dev/null
+++ b/docs/source/checks/nlp/model_evaluation/plot_property_segments_performance.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+"""
+Property Segments Performance
+*****************************
+
+This notebook provides an overview for using and understanding the property segment performance check.
+
+**Structure:**
+
+* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__
+* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
+* `Generate data & model <#generate-data-model>`__
+* `Run the check <#run-the-check>`__
+* `Define a condition <#define-a-condition>`__
+
+What is the purpose of the check?
+==================================
+
+The check is designed to help you easily identify the model's weakest segments based on the provided
+:func:`properties `. In addition,
+it enables you to provide a sublist of the properties, thus limiting the check to search in
+interesting subspaces.
+
+Automatically detecting weak segments
+=====================================
+
+The check contains several steps:
+
+#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according
+   to the task type.
+
+#. We select a subset of features for the weak segment search. This is done by selecting the features with the
+   highest feature importance to the model provided (within the features selected for the check, if limited).
+
+#. We train multiple simple tree based models, each one is trained using exactly two
+   features (out of the ones selected above) to predict the per sample error calculated before.
+
+#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate
+   the model performance on them. For the weakest data segments detected we also calculate the model's
+   performance on data segments surrounding them.
+"""
+#%%
+# Generate data & model
+# =====================
+
+from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions
+
+_, test_dataset = load_data(data_format='TextData')
+_, test_probas = load_precalculated_predictions(pred_format='probabilities')
+
+test_dataset.properties.head(3)
+
+#%%
+# Run the check
+# =============
+#
+# The check has several key parameters (that are all optional) that affect the behavior of the
+# check and especially its output.
+#
+# ``properties / ignore_properties``: Controls which properties should be searched for weak segments. By default,
+# uses all the properties provided.
+#
+# ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different
+# segments. It is important to select a metric that is relevant to the data domain and task you are performing.
+# For additional information on scorers and how to use them see
+# :doc:`Metrics Guide `.
+#
+# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will
+# return data segments that contain at least this fraction of the total data samples. It is recommended to
+# try different configurations of this parameter, as larger segments can be of interest even if the model
+# performance on them is superior.
+#
+# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called
+# "Other". This parameter determines the frequency threshold for categories to be mapped into the "Other" category.
+#
+# See :class:`API reference ` for more details.
+
+from deepchecks.nlp.checks import PropertySegmentsPerformance
+from sklearn.metrics import make_scorer, f1_score
+
+scorer = {'f1': make_scorer(f1_score, average='micro')}
+check = PropertySegmentsPerformance(alternative_scorer=scorer,
+                                    segment_minimum_size_ratio=0.03)
+result = check.run(test_dataset, probabilities=test_probas)
+result.show()
+
+#%%
+# Observe the check's output
+# --------------------------
+#
+# We see in the results that the check indeed found several segments on which the model performance is below average.
+# In the heatmap display we can see model performance on the weakest segments and their environment with respect to the
+# two properties that are relevant to the segment. In order to get the full list of weak segments found we will inspect
+# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance.
+
+
+result.value['weak_segments_list'].head(3)
+
+#%%
+# Define a condition
+# ==================
+#
+# We can add a condition that will validate that the model's performance on the weakest segment detected is above a
+# certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
+# underperforming on a subset of the data that is of interest to us.
+ +# Let's add a condition and re-run the check: + +check = PropertySegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03) +check.add_condition_segments_relative_performance_greater_than(0.1) +result = check.run(test_dataset, probabilities=test_probas) +result.show(show_additional_outputs=False) diff --git a/docs/source/checks/tabular/model_evaluation/plot_confusion_matrix_report.py b/docs/source/checks/tabular/model_evaluation/plot_confusion_matrix_report.py index 5fcdfdaab6..543dbca569 100644 --- a/docs/source/checks/tabular/model_evaluation/plot_confusion_matrix_report.py +++ b/docs/source/checks/tabular/model_evaluation/plot_confusion_matrix_report.py @@ -23,9 +23,8 @@ """ #%% -# Imports -# ========= - +# Generate data & model +# ======================= import pandas as pd from sklearn.datasets import load_iris from sklearn.ensemble import AdaBoostClassifier @@ -34,10 +33,6 @@ from deepchecks.tabular import Dataset from deepchecks.tabular.checks import ConfusionMatrixReport -#%% -# Generate data & model -# ======================= - iris = load_iris(as_frame=True) clf = AdaBoostClassifier() frame = iris.frame @@ -54,4 +49,7 @@ # =============== check = ConfusionMatrixReport() -check.run(ds, clf) +result = check.run(ds, clf) +result.show() + +#%% diff --git a/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py b/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py index 67daa71e90..cca73241f8 100644 --- a/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py +++ b/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py @@ -34,8 +34,9 @@ #. We train multiple simple tree based models, each one is trained using exactly two features (out of the ones selected above) to predict the per sample error calculated before. -#. We convert each of the leafs in each of the trees into a segment and calculate the segment's performance. For the - weakest segments detected we also calculate the model's performance on data segments surrounding them. +#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate + the model performance on them. For the weakest data segments detected we also calculate the model's + performance on data segments surrounding them. """ #%% # Generate data & model @@ -44,7 +45,7 @@ from deepchecks.tabular.datasets.classification.phishing import ( load_data, load_fitted_model) -train_dataset, test_dataset = load_data() +_, test_ds = load_data() model = load_fitted_model() #%% @@ -60,9 +61,12 @@ # ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different # segments. It is important to select a metric that is relevant to the data domain and task you are performing. # By default, the check uses Neg RMSE for regression tasks and Accuracy for classification tasks. +# For additional information on scorers and how to use them see +# :doc:`Metrics Guide `. # -# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check is tuned -# to find the weakest segment regardless of the segment size and so it is recommended to try different configurations +# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will +# return data segments that contain at least this fraction of the total data samples. 
It is recommended to +# try different configurations # of this parameter as larger segments can be of interest even the model performance on them is superior. # # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called @@ -70,13 +74,10 @@ # # see :class:`API reference ` for more details. -from deepchecks.tabular.datasets.classification import phishing from deepchecks.tabular.checks import WeakSegmentsPerformance from sklearn.metrics import make_scorer, f1_score scorer = {'f1': make_scorer(f1_score, average='micro')} -_, test_ds = phishing.load_data() -model = phishing.load_fitted_model() check = WeakSegmentsPerformance(columns=['urlLength', 'numTitles', 'ext', 'entropy'], alternative_scorer=scorer, segment_minimum_size_ratio=0.03, @@ -91,17 +92,18 @@ # We see in the results that the check indeed found several segments on which the model performance is below average. # In the heatmap display we can see model performance on the weakest segments and their environment with respect to the # two features that are relevant to the segment. In order to get the full list of weak segments found we will inspect -# the result.value attribute. +# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance. -result.value['weak_segments_list'] +result.value['weak_segments_list'].head(3) #%% # Define a condition # ================== # -# We can define on our check a condition that will validate that the model performance on the weakest segment detected -# is greater than a specified ratio of the average model performance of the entire dataset. +# We can add a condition that will validate the model's performance on the weakest segment detected is above a certain +# threshold. A scenario where this can be useful is when we want to make sure that the model is not under performing +# on a subset of the data that is of interest to us, for example for specific age or gender groups. # Let's add a condition and re-run the check: diff --git a/docs/source/checks/vision/model_evaluation/plot_weak_segments_performance.py b/docs/source/checks/vision/model_evaluation/plot_weak_segments_performance.py index 043d07ee13..8733784c0a 100644 --- a/docs/source/checks/vision/model_evaluation/plot_weak_segments_performance.py +++ b/docs/source/checks/vision/model_evaluation/plot_weak_segments_performance.py @@ -34,8 +34,9 @@ #. We train multiple simple tree based models, each one is trained using two properties to predict the per sample error calculated before. -#. We convert each of the leafs in each of the trees into a segment and calculate the segment's performance. For the - weakest segments detected we also calculate the model's performance on the data segments surrounding them. +#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate + the model performance on them. For the weakest data segments detected we also calculate the model's + performance on data segments surrounding them. """ #%% @@ -75,10 +76,10 @@ # We see in the results that the check indeed found several segments on which the model performance is below average. # In the heatmap display we can see the model's performance on the weakest segments and their environment with respect # to the two segmentation features. In order to get the full list of weak segments found we can look at -# the result.value attribute. +# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance. 
-result.value['weak_segments_list']
+result.value['weak_segments_list'].head(3)
 
 #%%
 # Now we will run a check with properties and minimum segment size ratio (the minimal fraction of the data to be
@@ -95,8 +96,9 @@
 # Define a condition
 # ==================
 #
-# We can define on our check a condition that will validate that the ratio of the model performance on the
-# weakest segment to the average model performance is less than a specified ratio.
+# We can add a condition that will validate that the model's performance on the weakest segment detected is above a
+# certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
+# underperforming on a subset of the data that is of interest to us.
 
 # Let's add a condition and re-run the check:
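
As a rough illustration of the "train a two-feature tree on the per-sample loss and treat its leaves as segments" steps the new docs above describe, here is a minimal, self-contained sketch using plain scikit-learn on synthetic data. The column names, the depth limit, the leaf-size floor and the accuracy scorer are illustrative assumptions rather than the deepchecks internals (those live in deepchecks/utils/abstracts/weak_segment_abstract.py); the closing sort_values(...).reset_index(drop=True) mirrors the fix in the first hunk, so the weakest segment always lands at index 0.

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
n = 1000
data = pd.DataFrame({'feat_a': rng.normal(size=n), 'feat_b': rng.normal(size=n)})
y_true = rng.integers(0, 2, size=n)
# Hypothetical model that mislabels samples whenever feat_a > 1, creating a weak segment.
y_pred = np.where(data['feat_a'] > 1, 1 - y_true, y_true)
loss_per_sample = (y_true != y_pred).astype(float)  # stand-in for the per-sample log-loss / MSE step

# Fit a shallow tree on exactly two features to predict the per-sample error.
tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=50, random_state=0)
tree.fit(data[['feat_a', 'feat_b']], loss_per_sample)

# Treat each leaf as a candidate segment and score the model on the samples that fall into it.
leaf_ids = tree.apply(data[['feat_a', 'feat_b']])
segments = []
for leaf in np.unique(leaf_ids):
    mask = leaf_ids == leaf
    segments.append({'leaf': leaf,
                     'size_ratio': mask.mean(),
                     'accuracy': accuracy_score(y_true[mask], y_pred[mask])})

weak_segments = pd.DataFrame(segments).sort_values('accuracy').reset_index(drop=True)
print(weak_segments.head(3))  # weakest segments first, with a clean 0-based index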