performance docs NLP (#2453)
Nadav-Barak committed Apr 18, 2023
1 parent 0d105c8 commit 76d5957
Showing 7 changed files with 300 additions and 27 deletions.
2 changes: 1 addition & 1 deletion deepchecks/utils/abstracts/weak_segment_abstract.py
@@ -164,7 +164,7 @@ def _weak_segments_search(self, dummy_model, encoded_dataset, feature_rank_for_s
tuple(filters[feature1]), feature2,
tuple(filters[feature2]), data_size]

-        return weak_segments.drop_duplicates().sort_values(f'{scorer.name} score')
+        return weak_segments.drop_duplicates().sort_values(f'{scorer.name} score').reset_index(drop=True)

def _find_weak_segment(self, dummy_model, dataset, features_for_segment, scorer: DeepcheckScorer, loss_per_sample):
"""Find weak segment based on scorer for specified features."""
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
"""
.. _plot_nlp_confusion_matrix_report:

Confusion Matrix Report
***********************

This notebook provides an overview of using and understanding the Confusion Matrix Report check for NLP tasks.

**Structure:**

* `What is the Confusion Matrix Report? <#what-is-the-confusion-matrix-report>`__
* `Generate data & model <#generate-data-model>`__
* `Run the check <#run-the-check>`__

What is the Confusion Matrix Report?
======================================

The ``ConfusionMatrixReport`` produces a confusion matrix visualization which summarizes the
performance of the model. The confusion matrix contains the TP (true positive), FP (false positive),
TN (true negative) and FN (false negative) counts, from which we can derive the relevant metrics,
such as accuracy, precision and recall (`confusion matrix <https://en.wikipedia.org/wiki/Confusion_matrix>`__).
"""

#%%
# Generate data & model
# =======================
from deepchecks.nlp import TextData
from deepchecks.nlp.checks import ConfusionMatrixReport
from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions

tweets_data = load_data(data_format='DataFrame', as_train_test=False)
tweets_dataset = TextData(tweets_data.text, label=tweets_data['label'],
task_type='text_classification')

predictions = load_precalculated_predictions(as_train_test=False)


#%%
# Run the check
# ===============

check = ConfusionMatrixReport()
result = check.run(tweets_dataset, predictions=predictions)
result.show()

#%%
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
"""
Metadata Segments Performance
*****************************

This notebook provides an overview for using and understanding the metadata segment performance check.

**Structure:**

* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__
* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
* `Generate data & model <#generate-data-model>`__
* `Run the check <#run-the-check>`__
* `Define a condition <#define-a-condition>`__

What is the purpose of the check?
==================================

The check is designed to help you easily identify the model's weakest segments based on the provided
:func:`metadata <deepchecks.nlp.text_data.TextData.set_metadata>`. In addition,
it enables you to provide a sublist of the metadata columns, thus limiting the check's search to
interesting subspaces.

Automatically detecting weak segments
=====================================

The check contains several steps:

#. We calculate the loss for each sample in the dataset using the provided model, via either log-loss or MSE
   according to the task type.
#. We select a subset of features for the weak segment search. This is done by selecting the features with the
   highest feature importance for the provided model (within the features selected for the check, if limited).
#. We train multiple simple tree based models, each one trained on exactly two
   features (out of the ones selected above) to predict the per-sample error calculated before.
#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and
   calculate the model's performance on them. For the weakest data segments detected we also calculate the
   model's performance on data segments surrounding them.

A simplified sketch of this procedure is shown in the cell right after this introduction.
"""
#%%
# Generate data & model
# =====================

from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions

_, test_dataset = load_data(data_format='TextData')
_, test_probas = load_precalculated_predictions(pred_format='probabilities')

test_dataset.metadata.head(3)

#%%
# Run the check
# =============
#
# The check has several key parameters (that are all optional) that affect the behavior of the
# check and especially its output.
#
# ``columns / ignore_columns``: Controls which columns should be searched for weak segments. By default,
# uses all columns.
#
# ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different
# segments. It is important to select a metric that is relevant to the data domain and task you are performing.
# For additional information on scorers and how to use them see
# :doc:`Metrics Guide </user-guide/general/metrics_guide>`.
#
# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will
# return data segments that contain at least this fraction of the total data samples. It is recommended to
# try different configurations of this parameter, as larger segments can be of interest even if the model's
# performance on them is superior.
#
# ``categorical_aggregation_threshold``: By default, the check will combine rare categories into a single category
# called "Other". This parameter determines the frequency threshold below which categories are mapped into the
# "Other" category (see the short sketch below for intuition).
#
# See the :class:`API reference <deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance>` for more details.
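
#%%
# For intuition, the rare-category aggregation described above behaves roughly
# like the following pandas sketch (illustrative only, not the check's internal
# code; the 0.05 value stands in for a hypothetical
# ``categorical_aggregation_threshold``):

import pandas as pd

languages = pd.Series(['en'] * 90 + ['fr'] * 7 + ['de'] * 2 + ['it'] * 1)
frequencies = languages.value_counts(normalize=True)
aggregated = languages.where(languages.map(frequencies) >= 0.05, other='Other')
print(aggregated.value_counts())

#%%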

from deepchecks.nlp.checks import MetadataSegmentsPerformance
from sklearn.metrics import make_scorer, f1_score

scorer = {'f1': make_scorer(f1_score, average='micro')}
check = MetadataSegmentsPerformance(alternative_scorer=scorer,
segment_minimum_size_ratio=0.03)
result = check.run(test_dataset, probabilities=test_probas)
result.show()

#%%
# Observe the check's output
# --------------------------
#
# We can see in the results that the check indeed found several segments on which the model's performance is
# below average. The heatmap display shows the model's performance on the weakest segments and on their
# surroundings, with respect to the two features that define each segment. To get the full list of weak
# segments found, we inspect the ``result.value`` attribute. Shown below are the 3 segments with the worst
# performance.


result.value['weak_segments_list'].head(3)

#%%
# Define a condition
# ==================
#
# We can add a condition that will validate that the model's performance on the weakest segment detected is above
# a certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
# underperforming on a subset of the data that is of interest to us, for example specific age or gender groups.

# Let's add a condition and re-run the check:

check = MetadataSegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03)
check.add_condition_segments_relative_performance_greater_than(0.1)
result = check.run(test_dataset, probabilities=test_probas)
result.show(show_additional_outputs=False)
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
"""
Property Segments Performance
*****************************

This notebook provides an overview for using and understanding the property segment performance check.

**Structure:**

* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__
* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
* `Generate data & model <#generate-data-model>`__
* `Run the check <#run-the-check>`__
* `Define a condition <#define-a-condition>`__

What is the purpose of the check?
==================================

The check is designed to help you easily identify the model's weakest segments based on the provided
:func:`properties <deepchecks.nlp.text_data.TextData.set_properties>`. In addition,
it enables you to provide a sublist of the properties, thus limiting the check's search to
interesting subspaces.

Automatically detecting weak segments
=====================================

The check contains several steps:

#. We calculate the loss for each sample in the dataset using the provided model, via either log-loss or MSE
   according to the task type.
#. We select a subset of features for the weak segment search. This is done by selecting the features with the
   highest feature importance for the provided model (within the features selected for the check, if limited).
#. We train multiple simple tree based models, each one trained on exactly two
   features (out of the ones selected above) to predict the per-sample error calculated before.
#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and
   calculate the model's performance on them. For the weakest data segments detected we also calculate the
   model's performance on data segments surrounding them.
"""
#%%
# Generate data & model
# =====================

from deepchecks.nlp.datasets.classification.tweet_emotion import load_data, load_precalculated_predictions

_, test_dataset = load_data(data_format='TextData')
_, test_probas = load_precalculated_predictions(pred_format='probabilities')

test_dataset.properties.head(3)
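
#%%
# Properties are per-sample scalar characteristics of the text. As a rough
# illustration (a hypothetical, hand-rolled computation, separate from
# deepchecks' built-in properties), such values could be derived directly
# from the raw text:

texts = ['I love this!', 'Terrible experience, never again.']
toy_properties = {
    'Text Length': [len(t) for t in texts],
    '% Special Characters': [sum(not c.isalnum() and not c.isspace() for c in t) / len(t)
                             for t in texts],
}
print(toy_properties)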

#%%
# Run the check
# =============
#
# The check has several key parameters (that are all optional) that affect the behavior of the
# check and especially its output.
#
# ``properties / ignore_properties``: Controls which properties should be searched for weak segments. By default,
# uses all the provided properties.
#
# ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different
# segments. It is important to select a metric that is relevant to the data domain and task you are performing.
# For additional information on scorers and how to use them see
# :doc:`Metrics Guide </user-guide/general/metrics_guide>`.
#
# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will
# return data segments that contain at least this fraction of the total data samples. It is recommended to
# try different configurations of this parameter, as larger segments can be of interest even if the model's
# performance on them is superior.
#
# ``categorical_aggregation_threshold``: By default, the check will combine rare categories into a single category
# called "Other". This parameter determines the frequency threshold below which categories are mapped into the
# "Other" category.
#
# See the :class:`API reference <deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance>` for more details.

from deepchecks.nlp.checks import PropertySegmentsPerformance
from sklearn.metrics import make_scorer, f1_score

scorer = {'f1': make_scorer(f1_score, average='micro')}
check = PropertySegmentsPerformance(alternative_scorer=scorer,
segment_minimum_size_ratio=0.03)
result = check.run(test_dataset, probabilities=test_probas)
result.show()

#%%
# Observe the check's output
# --------------------------
#
# We can see in the results that the check indeed found several segments on which the model's performance is
# below average. The heatmap display shows the model's performance on the weakest segments and on their
# surroundings, with respect to the two features that define each segment. To get the full list of weak
# segments found, we inspect the ``result.value`` attribute. Shown below are the 3 segments with the worst
# performance.


result.value['weak_segments_list'].head(3)

#%%
# Define a condition
# ==================
#
# We can add a condition that will validate that the model's performance on the weakest segment detected is above
# a certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
# underperforming on a subset of the data that is of interest to us.

# Let's add a condition and re-run the check:

check = PropertySegmentsPerformance(alternative_scorer=scorer, segment_minimum_size_ratio=0.03)
check.add_condition_segments_relative_performance_greater_than(0.1)
result = check.run(test_dataset, probabilities=test_probas)
result.show(show_additional_outputs=False)
@@ -23,9 +23,8 @@
"""

#%%
-# Imports
-# =========
-
+# Generate data & model
+# =======================
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
@@ -34,10 +33,6 @@
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import ConfusionMatrixReport

-#%%
-# Generate data & model
-# =======================
-
iris = load_iris(as_frame=True)
clf = AdaBoostClassifier()
frame = iris.frame
@@ -54,4 +49,7 @@
# ===============

check = ConfusionMatrixReport()
-check.run(ds, clf)
+result = check.run(ds, clf)
+result.show()
+
+#%%
@@ -34,8 +34,9 @@
#. We train multiple simple tree based models, each one is trained using exactly two
features (out of the ones selected above) to predict the per sample error calculated before.
-#. We convert each of the leafs in each of the trees into a segment and calculate the segment's performance. For the
-   weakest segments detected we also calculate the model's performance on data segments surrounding them.
+#. We extract the corresponding data samples for each of the leaves in each of the trees (data segments) and calculate
+   the model performance on them. For the weakest data segments detected we also calculate the model's
+   performance on data segments surrounding them.
"""
#%%
# Generate data & model
@@ -44,7 +45,7 @@
from deepchecks.tabular.datasets.classification.phishing import (
load_data, load_fitted_model)

-train_dataset, test_dataset = load_data()
+_, test_ds = load_data()
model = load_fitted_model()

#%%
@@ -60,23 +61,23 @@
# ``alternative_scorer``: Determines the metric to be used as the performance measurement of the model on different
# segments. It is important to select a metric that is relevant to the data domain and task you are performing.
# By default, the check uses Neg RMSE for regression tasks and Accuracy for classification tasks.
+# For additional information on scorers and how to use them see
+# :doc:`Metrics Guide </user-guide/general/metrics_guide>`.
#
-# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check is tuned
-# to find the weakest segment regardless of the segment size and so it is recommended to try different configurations
+# ``segment_minimum_size_ratio``: Determines the minimum size of segments that are of interest. The check will
+# return data segments that contain at least this fraction of the total data samples. It is recommended to
+# try different configurations
# of this parameter as larger segments can be of interest even if the model performance on them is superior.
#
# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called
# "Other". This parameter determines the frequency threshold below which categories are mapped into the "Other" category.
#
# See the :class:`API reference <deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance>` for more details.

-from deepchecks.tabular.datasets.classification import phishing
from deepchecks.tabular.checks import WeakSegmentsPerformance
from sklearn.metrics import make_scorer, f1_score

scorer = {'f1': make_scorer(f1_score, average='micro')}
-_, test_ds = phishing.load_data()
-model = phishing.load_fitted_model()
check = WeakSegmentsPerformance(columns=['urlLength', 'numTitles', 'ext', 'entropy'],
alternative_scorer=scorer,
segment_minimum_size_ratio=0.03,
Expand All @@ -91,17 +92,18 @@
# We see in the results that the check indeed found several segments on which the model performance is below average.
# In the heatmap display we can see model performance on the weakest segments and their environment with respect to the
# two features that are relevant to the segment. In order to get the full list of weak segments found we will inspect
-# the result.value attribute.
+# the ``result.value`` attribute. Shown below are the 3 segments with the worst performance.


-result.value['weak_segments_list']
+result.value['weak_segments_list'].head(3)

#%%
# Define a condition
# ==================
#
-# We can define on our check a condition that will validate that the model performance on the weakest segment detected
-# is greater than a specified ratio of the average model performance of the entire dataset.
+# We can add a condition that will validate that the model's performance on the weakest segment detected is above a
+# certain threshold. A scenario where this can be useful is when we want to make sure that the model is not
+# underperforming on a subset of the data that is of interest to us, for example specific age or gender groups.

# Let's add a condition and re-run the check:


