Added properties and metadata guides (#2468)
* Added properties and metadata guides

* Update deepchecks/nlp/text_data.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/user-guide/nlp/nlp_metadata.rst

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/user-guide/nlp/nlp_properties.rst

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Fixed CR comments

* Fixed CR comments

* Fixed CR comments

* pylint

* cr comments

* Apply suggestions from code review

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>

* Nir/dee 482 create plot files for nlp drift (#2472)

* prediction drift plot file

* Added label drift

* property label correlation

* Outliers check

* Outliers check

* small changes

* Changes

* Update deepchecks/vision/suites/default_suites.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Changes

* Update docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/model_evaluation/plot_prediction_drift.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/model_evaluation/plot_prediction_drift.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* CR changes

* CR changes

* CR changes

---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Moved files

* Updated links

---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
3 people committed Apr 27, 2023
1 parent e4429e3 commit 0b39ef1
Showing 12 changed files with 705 additions and 33 deletions.
40 changes: 23 additions & 17 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -34,7 +34,7 @@ class TextPropertyOutliers(SingleDatasetCheck):
Parameters
----------
n_show_top : int , default : 5
number of outliers to show from each direction (upper limit and bottom limit)
number of graphs to show (ordered from the property with the most outliers to the least)
iqr_percentiles : Tuple[int, int] , default : (25, 75)
Two percentiles which define the IQR range
iqr_scale : float , default : 1.5
@@ -131,34 +131,40 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
display = []
no_outliers = pd.Series([], dtype='object')

for property_name, info in result.items():
sorted_result_items = sorted(result.items(), key=lambda x: len(x[1]['indices']), reverse=True)

for property_name, info in sorted_result_items:
# If info is string it means there was error
if isinstance(info, str):
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[info])])
elif len(info['indices']) == 0:
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=['No outliers found.'])])
else:
dist = df_properties[property_name]
lower_limit = info['lower_limit']
upper_limit = info['upper_limit']

fig = get_text_outliers_graph(
dist=dist,
data=dataset.text,
lower_limit=lower_limit,
upper_limit=upper_limit,
dist_name=property_name,
is_categorical=property_name in cat_properties
)

display.append(fig)
if len(display) < self.n_show_top:
dist = df_properties[property_name]
lower_limit = info['lower_limit']
upper_limit = info['upper_limit']

fig = get_text_outliers_graph(
dist=dist,
data=dataset.text,
lower_limit=lower_limit,
upper_limit=upper_limit,
dist_name=property_name,
is_categorical=property_name in cat_properties
)

display.append(fig)
else:
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[
f'Outliers found but not shown in graphs (n_show_top={self.n_show_top}).'])])

if not no_outliers.empty:
grouped = no_outliers.groupby(level=0).unique().str.join(', ')
grouped_df = pd.DataFrame(grouped, columns=['Properties'])
grouped_df['More Info'] = grouped_df.index
grouped_df = grouped_df[['More Info', 'Properties']]
display.append('<h5><b>Properties With No Outliers Found</b></h5>')
display.append('<h5><b>Properties Not Shown:</b></h5>')
display.append(grouped_df.style.hide(axis='index') if hasattr(grouped_df.style, 'hide') else
grouped_df.style.hide_index())

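The docstring change above redefines n_show_top as the number of property graphs displayed, ordered from the property with the most outliers down. A minimal, hedged sketch of how the updated check might be invoked follows; the example texts and the no-argument call to calculate_default_properties are illustrative assumptions, not part of this commit:

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import TextPropertyOutliers

# Illustrative dataset; any TextData with calculated properties would do.
texts = [
    "a short tweet",
    "another ordinary example",
    "averylongwordwithoutanyspacesbetweenitsparts " * 10,
]
dataset = TextData(texts)
dataset.calculate_default_properties()  # the check needs text properties to look for outliers

# n_show_top now caps the number of graphs shown, starting from the property with the most outliers.
check = TextPropertyOutliers(n_show_top=3, iqr_percentiles=(25, 75), iqr_scale=1.5)
result = check.run(dataset)
result.show()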
63 changes: 52 additions & 11 deletions deepchecks/nlp/text_data.py
@@ -65,22 +65,39 @@ class TextData:
label is provided.
name : t.Optional[str] , default: None
The name of the dataset. If None, the dataset name will be defined when running it within a check.
metadata : t.Optional[pd.DataFrame] , default: None
Metadata for the samples. If None, no metadata is set. If a DataFrame is given, it must contain
the same number of samples as the raw_text and identical index.
properties : t.Optional[Union[pd.DataFrame, str]] , default: None
The text properties for the samples. If None, no properties are set. If 'auto', the properties are calculated
using the default properties. If a DataFrame is given, it must contain the properties for each sample as the raw
text and identical index.
metadata : t.Optional[t.Union[pd.DataFrame, str]] , default: None
Metadata for the samples. Metadata must be given as a pandas DataFrame or a path to a pandas
DataFrame compatible csv file, with the rows representing each sample
and columns representing the different metadata columns. If None, no metadata is set.
The number of rows in the metadata DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
For more on metadata, see the `NLP Metadata Guide
<https://docs.deepchecks.com/en/latest/nlp/nlp-metadata.html>`_.
categorical_metadata : t.Optional[t.List[str]] , default: None
The names of the categorical metadata columns. If None, categorical metadata columns are automatically inferred.
Only relevant if metadata is not None.
properties : t.Optional[t.Union[pd.DataFrame, str]] , default: None
The text properties for the samples. Properties must be given as either a pandas DataFrame or a path to a pandas
DataFrame compatible csv file, with the rows representing each sample and columns representing the different
properties. If None, no properties are set.
The number of rows in the properties DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
In order to calculate the default properties, use the `TextData.calculate_default_properties` function after
the creation of the TextData object.
For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/en/latest/nlp/nlp-properties.html>`_.
categorical_properties : t.Optional[t.List[str]] , default: None
The names of the categorical properties columns. If None, categorical properties columns are automatically
inferred. Only relevant if properties is not None.
"""

_text: np.ndarray
_label: TTextLabel
task_type: t.Optional[TaskType]
_tokenized_text: t.Optional[t.Sequence[t.Sequence[str]]] = None # Outer sequence is np array
name: t.Optional[str] = None
_metadata: t.Optional[pd.DataFrame] = None
_properties: t.Optional[pd.DataFrame] = None
_metadata: t.Optional[t.Union[pd.DataFrame, str]] = None
_properties: t.Optional[t.Union[pd.DataFrame, str]] = None
_cat_properties: t.Optional[t.List[str]] = None
_cat_metadata: t.Optional[t.List[str]] = None
_original_text_index: t.Optional[t.Sequence[int]] = None # Sequence is np array
@@ -93,7 +110,9 @@ def __init__(
task_type: str = 'other',
name: t.Optional[str] = None,
metadata: t.Optional[pd.DataFrame] = None,
categorical_metadata: t.Optional[t.List[str]] = None,
properties: t.Optional[pd.DataFrame] = None,
categorical_properties: t.Optional[t.List[str]] = None,
):
# Require explicitly setting task type if label is provided
if task_type in [None, 'other']:
@@ -130,9 +149,9 @@ def __init__(
self.name = name

if metadata is not None:
self.set_metadata(metadata)
self.set_metadata(metadata, categorical_metadata)
if properties is not None:
self.set_properties(properties)
self.set_properties(properties, categorical_properties)

# Used for display purposes
self._original_text_index = np.arange(len(self))
@@ -275,6 +294,9 @@ def set_metadata(
if self._metadata is not None:
warnings.warn('Metadata already exist, overwriting it', UserWarning)

if isinstance(metadata, str):
metadata = pd.read_csv(metadata)

column_types = validate_length_and_calculate_column_types(
data_table=metadata,
data_table_name='Metadata',
@@ -331,6 +353,9 @@ def set_properties(
if self._properties is not None:
warnings.warn('Properties already exist, overwriting them', UserWarning)

if isinstance(properties, str):
properties = pd.read_csv(properties)

column_types = validate_length_and_calculate_column_types(
data_table=properties,
data_table_name='Properties',
@@ -341,6 +366,22 @@
self._properties = properties.reset_index(drop=True)
self._cat_properties = column_types.categorical_columns

def save_properties(self, path: str):
"""Save the dataset properties to csv.
Parameters
----------
path : str
Path to save the properties to.
"""
if self._properties is None:
raise DeepchecksNotSupportedError(
'TextData does not contain properties, add them by using '
'"calculate_default_properties" or "set_properties" functions'
)

self._properties.to_csv(path, index=False)

@property
def properties(self) -> pd.DataFrame:
"""Return the properties of the dataset."""
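Taken together, the new constructor arguments, the CSV-path support in set_metadata/set_properties, and the new save_properties method allow a simple round trip. The snippet below is a hedged sketch; the column names and the 'my_properties.csv' file name are illustrative assumptions, not part of this commit:

import pandas as pd
from deepchecks.nlp import TextData

texts = ["I love this product", "Worst purchase ever", "It was okay, I guess"]

# Metadata can now be passed straight to the constructor, along with the names of its
# categorical columns (otherwise the categorical columns are inferred automatically).
metadata = pd.DataFrame({'source': ['web', 'app', 'web'], 'user_age': [31, 22, 47]})
dataset = TextData(texts, metadata=metadata, categorical_metadata=['source'])

# Properties can be calculated once, saved to csv with the new save_properties, and later
# re-attached from the csv path (set_properties and the constructor accept either a
# DataFrame or a path).
dataset.calculate_default_properties()
dataset.save_properties('my_properties.csv')

same_texts_again = TextData(texts, properties='my_properties.csv')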
2 changes: 1 addition & 1 deletion deepchecks/vision/suites/default_suites.py
@@ -48,7 +48,7 @@ def train_test_validation(label_properties: List[Dict[str, Any]] = None, image_p
- :class:`~deepchecks.vision.checks.train_test_validation.ImagePropertyDrift`
* - :ref:`plot_vision_image_dataset_drift`
- :class:`~deepchecks.vision.checks.train_test_validation.ImageDatasetDrift`
* - :ref:`plot_vision_feature_label_correlation_change`
* - :ref:`nlp__property_label_correlation`
- :class:`~deepchecks.vision.checks.train_test_validation.PropertyLabelCorrelationChange`
Parameters
71 changes: 71 additions & 0 deletions docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
.. _nlp__property_label_correlation:
Property Label Correlation
**************************
This notebook provides an overview for using and understanding the "Property Label Correlation" check.
**Structure:**
* `What Is The Purpose of the Check? <#what-is-the-purpose-of-the-check>`__
* `Run the Check <#run-the-check>`__
What Is The Purpose of the Check?
=================================
The check estimates for every :ref:`text property <nlp__properties_guide>`
(such as text length, language etc.) its ability to predict the label by itself.
This check can help find a potential bias in the dataset - the labels being strongly correlated with simple text
properties such as percentage of special characters, sentiment, toxicity and more.
This is a critical problem that can result in a phenomenon called "shortcut learning", where the model is likely to
learn this property instead of the actual textual characteristics of each class, as it's easier to do so.
In this case, the model will show high performance on text collected under similar conditions (e.g. same source),
but will fail to generalize on other data (for example, when production receives new data from another source).
This kind of correlation will likely stay hidden without this check until tested on the actual problem data.
For example, in a classification dataset of true and false statements, if only true facts are written in detail,
and false facts are written in a short and vague manner, the model might learn to predict the label by the length
of the statement, and not by the actual content. In this case, the model will perform well on the training data,
and may even perform well on the test data, but will fail to generalize to new data.
The check is based on calculating the predictive power score (PPS) of each text
property. In simple terms, the PPS is a metric that measures how well one feature can predict another (in our case,
how well one property can predict the label).
For further information about PPS you can visit the `ppscore github <https://github.com/8080labs/ppscore>`__
or the following blog post: `RIP correlation. Introducing the Predictive Power Score
<https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598>`__
"""

#%%
# Run the Check
# =============

from deepchecks.nlp.checks import PropertyLabelCorrelation
from deepchecks.nlp.datasets.classification import tweet_emotion

# For this example, we'll use the tweet emotion dataset, which is a dataset of tweets labeled by one of four emotions:
# happiness, anger, sadness and optimism.

# Load Data:
dataset = tweet_emotion.load_data(as_train_test=False)

#%%
# Let's see what our data looks like:
dataset.head()

#%%
# Now let's run the check:
result = PropertyLabelCorrelation().run(dataset)
result

#%%
# We can see that in our tweet emotion dataset example, the label is correlated with the "sentiment" property,
# which makes sense, as the label is the emotion of the tweet, and the sentiment expresses whether the tweet is
# positive or negative.
# Also, there's some correlation with the "toxicity" property, which is a measure of how toxic the tweet is.
# This is also reasonable, as some emotions are more likely to be expressed in a toxic way.
# However, these correlations may indicate that a model could learn to predict the label from curse words, for instance,
# instead of the actual content of the tweet, which could lead it to fail on new tweets that don't contain curse words.
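#%%
# As a hedged aside (deepchecks computes the PPS internally, so this is only for intuition),
# a score like the one reported above can be reproduced with the open-source ppscore package,
# assuming it is installed and that a property and the label are available as DataFrame columns.
# The data below is synthetic and deliberately leaks the label through the property:

import numpy as np
import pandas as pd
import ppscore as pps

rng = np.random.default_rng(0)
text_length = rng.integers(10, 300, size=200)
label = np.where(text_length > 150, 'detailed', 'vague')  # label fully determined by length
leaky_df = pd.DataFrame({'text_length': text_length, 'label': label})

pps.score(leaky_df, 'text_length', 'label')['ppscore']  # close to 1 for such a strong leak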
75 changes: 75 additions & 0 deletions docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
"""
.. _nlp__text_property_outliers:
Text Property Outliers
=======================
This notebook provides an overview for using and understanding the text property
outliers check, used to detect outliers in simple text properties in a dataset.
**Structure:**
* `Why Check for Outliers? <#why-check-for-outliers>`__
* `How Does the Check Work? <#how-does-the-check-work>`__
* `Which Text Properties Are Used? <#which-text-properties-are-used>`__
* `Run the Check <#run-the-check>`__
Why Check for Outliers?
-----------------------
Examining outliers may help you gain insights that you couldn't have reached by taking an aggregate look or by
inspecting random samples. For example, it may help you realize that you have some corrupt samples (e.g.
texts without spaces between words), or samples you didn't expect to have (e.g. texts in Norwegian instead of English).
In some cases, these outliers may help debug performance discrepancies (the model can be excused for failing on
a totally blank text). In more extreme cases, the outlier samples may be interfering with the model's training by
teaching the model to fit "irrelevant" samples.
How Does the Check Work?
------------------------
Ideally we would like to directly find text samples which are outliers, but this is computationally expensive and does
not produce clear and explainable results. Therefore, we use text properties (such as text length,
average word length, language etc.) to find outliers; these are much more efficient to compute, and each outlier is easily explained.
* For numeric properties (such as "percent of special characters"), we use the
`Interquartile Range <https://en.wikipedia.org/wiki/Interquartile_range#Outliers>`_ to define our upper and lower
limits for the properties' values (a short numeric sketch of this follows right after this introduction).
* For categorical properties (such as "language"), we look for a "sharp drop" in the category distribution to
define our lower limit for the properties' values. This method is based on the assumption that the distribution of
categories in the dataset is "smooth" and differences in the commonality of categories are gradual.
For example, in a clean dataset, if the distribution of English texts is 80%, the distribution of the next most
common language would be of similar scale (e.g. 10%) and so forth. If we find a category that has a much lower
distribution than the rest, we assume that this category and even smaller categories are outliers.
Which Text Properties Are Used?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default the check uses the built-in text properties, and it's also possible to replace the default properties
with custom ones. For the list of built-in text properties and an explanation of custom properties, refer to
:ref:`NLP properties <nlp__properties_guide>`.
"""

#%%
# Run the Check
# -------------
# For this example, we'll use the tweet emotion dataset, which is a dataset of tweets labeled by one of four emotions:
# happiness, anger, sadness and optimism.

from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.nlp.datasets.classification import tweet_emotion

dataset = tweet_emotion.load_data(as_train_test=False)

check = TextPropertyOutliers()
result = check.run(dataset)
result

#%%
# Observe Graphic Result
# ^^^^^^^^^^^^^^^^^^^^^^
# In this example, we can find many tweets that are outliers. For example, in the "average word length" property,
# we can see that there are tweets with a very large average word length, which is usually caused by missing spaces
# in the tweet itself, or by Twitter hashtags that remained in the data and don't contain spaces. This
# could be problematic for the model, as it cannot comprehend the hashtags as words, and it may cause the model to
# fail on these tweets.
