Added properties and metadata guides (#2468)
* Added properties and metadata guides

* Update deepchecks/nlp/text_data.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/user-guide/nlp/nlp_metadata.rst

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/user-guide/nlp/nlp_properties.rst

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Fixed CR comments

* Fixed CR comments

* Fixed CR comments

* pylint

* cr comments

* Apply suggestions from code review

Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>

* Nir/dee 482 create plot files for nlp drift (#2472)

* prediction drift plot file

* Added label drift

* property label correlation

* Outliers check

* Outliers check

* small changes

* Changes

* Update deepchecks/vision/suites/default_suites.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Changes

* Update docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/model_evaluation/plot_prediction_drift.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Update docs/source/checks/nlp/model_evaluation/plot_prediction_drift.py

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* CR changes

* CR changes

* CR changes

---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>

* Moved files

* Updated links

---------

Co-authored-by: Noam Bressler <noamzbr@gmail.com>
Co-authored-by: Nadav Barak <67195469+Nadav-Barak@users.noreply.github.com>
3 people committed Apr 27, 2023
1 parent e4429e3 commit 0b39ef1
Showing 12 changed files with 705 additions and 33 deletions.
40 changes: 23 additions & 17 deletions deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -34,7 +34,7 @@ class TextPropertyOutliers(SingleDatasetCheck):
Parameters
----------
n_show_top : int , default : 5
number of outliers to show from each direction (upper limit and bottom limit)
number of graphs to show (ordered from the property with the most outliers to the least)
iqr_percentiles : Tuple[int, int] , default : (25, 75)
Two percentiles which define the IQR range
iqr_scale : float , default : 1.5
@@ -131,34 +131,40 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
display = []
no_outliers = pd.Series([], dtype='object')

for property_name, info in result.items():
sorted_result_items = sorted(result.items(), key=lambda x: len(x[1]['indices']), reverse=True)

for property_name, info in sorted_result_items:
# If info is string it means there was error
if isinstance(info, str):
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[info])])
elif len(info['indices']) == 0:
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=['No outliers found.'])])
else:
dist = df_properties[property_name]
lower_limit = info['lower_limit']
upper_limit = info['upper_limit']

fig = get_text_outliers_graph(
dist=dist,
data=dataset.text,
lower_limit=lower_limit,
upper_limit=upper_limit,
dist_name=property_name,
is_categorical=property_name in cat_properties
)

display.append(fig)
if len(display) < self.n_show_top:
dist = df_properties[property_name]
lower_limit = info['lower_limit']
upper_limit = info['upper_limit']

fig = get_text_outliers_graph(
dist=dist,
data=dataset.text,
lower_limit=lower_limit,
upper_limit=upper_limit,
dist_name=property_name,
is_categorical=property_name in cat_properties
)

display.append(fig)
else:
no_outliers = pd.concat([no_outliers, pd.Series(property_name, index=[
f'Outliers found but not shown in graphs (n_show_top={self.n_show_top}).'])])

if not no_outliers.empty:
grouped = no_outliers.groupby(level=0).unique().str.join(', ')
grouped_df = pd.DataFrame(grouped, columns=['Properties'])
grouped_df['More Info'] = grouped_df.index
grouped_df = grouped_df[['More Info', 'Properties']]
display.append('<h5><b>Properties With No Outliers Found</b></h5>')
display.append('<h5><b>Properties Not Shown:</b></h5>')
display.append(grouped_df.style.hide(axis='index') if hasattr(grouped_df.style, 'hide') else
grouped_df.style.hide_index())

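The docstring change above redefines n_show_top as the number of property graphs displayed, ordered from the property with the most outliers down. A minimal, hedged sketch of how the updated check might be invoked follows; the example texts and the no-argument call to calculate_default_properties are illustrative assumptions, not part of this commit:

from deepchecks.nlp import TextData
from deepchecks.nlp.checks import TextPropertyOutliers

# Illustrative dataset; any TextData with calculated properties would do.
texts = [
    "a short tweet",
    "another ordinary example",
    "averylongwordwithoutanyspacesbetweenitsparts " * 10,
]
dataset = TextData(texts)
dataset.calculate_default_properties()  # the check needs text properties to look for outliers

# n_show_top now caps the number of graphs shown, starting from the property with the most outliers.
check = TextPropertyOutliers(n_show_top=3, iqr_percentiles=(25, 75), iqr_scale=1.5)
result = check.run(dataset)
result.show()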
63 changes: 52 additions & 11 deletions deepchecks/nlp/text_data.py
@@ -65,22 +65,39 @@ class TextData:
label is provided.
name : t.Optional[str] , default: None
The name of the dataset. If None, the dataset name will be defined when running it within a check.
metadata : t.Optional[pd.DataFrame] , default: None
Metadata for the samples. If None, no metadata is set. If a DataFrame is given, it must contain
the same number of samples as the raw_text and identical index.
properties : t.Optional[Union[pd.DataFrame, str]] , default: None
The text properties for the samples. If None, no properties are set. If 'auto', the properties are calculated
using the default properties. If a DataFrame is given, it must contain the properties for each sample as the raw
text and identical index.
metadata : t.Optional[t.Union[pd.DataFrame, str]] , default: None
Metadata for the samples. Metadata must be given as a pandas DataFrame or a path to a pandas
DataFrame compatible csv file, with the rows representing each sample
and columns representing the different metadata columns. If None, no metadata is set.
The number of rows in the metadata DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
For more on metadata, see the `NLP Metadata Guide
<https://docs.deepchecks.com/en/latest/nlp/nlp-metadata.html>`_.
categorical_metadata : t.Optional[t.List[str]] , default: None
The names of the categorical metadata columns. If None, categorical metadata columns are automatically inferred.
Only relevant if metadata is not None.
properties : t.Optional[t.Union[pd.DataFrame, str]] , default: None
The text properties for the samples. Properties must be given as either a pandas DataFrame or a path to a pandas
DataFrame compatible csv file, with the rows representing each sample and columns representing the different
properties. If None, no properties are set.
The number of rows in the properties DataFrame must be equal to the number of samples in the dataset, and the
order of the rows must be the same as the order of the samples in the dataset.
In order to calculate the default properties, use the `TextData.calculate_default_properties` function after
the creation of the TextData object.
For more on properties, see the `NLP Properties Guide
<https://docs.deepchecks.com/en/latest/nlp/nlp-properties.html>`_.
categorical_properties : t.Optional[t.List[str]] , default: None
The names of the categorical properties columns. If None, categorical properties columns are automatically
inferred. Only relevant if properties is not None.
"""

_text: np.ndarray
_label: TTextLabel
task_type: t.Optional[TaskType]
_tokenized_text: t.Optional[t.Sequence[t.Sequence[str]]] = None # Outer sequence is np array
name: t.Optional[str] = None
_metadata: t.Optional[pd.DataFrame] = None
_properties: t.Optional[pd.DataFrame] = None
_metadata: t.Optional[t.Union[pd.DataFrame, str]] = None
_properties: t.Optional[t.Union[pd.DataFrame, str]] = None
_cat_properties: t.Optional[t.List[str]] = None
_cat_metadata: t.Optional[t.List[str]] = None
_original_text_index: t.Optional[t.Sequence[int]] = None # Sequence is np array
@@ -93,7 +110,9 @@ def __init__(
task_type: str = 'other',
name: t.Optional[str] = None,
metadata: t.Optional[pd.DataFrame] = None,
categorical_metadata: t.Optional[t.List[str]] = None,
properties: t.Optional[pd.DataFrame] = None,
categorical_properties: t.Optional[t.List[str]] = None,
):
# Require explicitly setting task type if label is provided
if task_type in [None, 'other']:
@@ -130,9 +149,9 @@ def __init__(
self.name = name

if metadata is not None:
self.set_metadata(metadata)
self.set_metadata(metadata, categorical_metadata)
if properties is not None:
self.set_properties(properties)
self.set_properties(properties, categorical_properties)

# Used for display purposes
self._original_text_index = np.arange(len(self))
@@ -275,6 +294,9 @@ def set_metadata(
if self._metadata is not None:
warnings.warn('Metadata already exist, overwriting it', UserWarning)

if isinstance(metadata, str):
metadata = pd.read_csv(metadata)

column_types = validate_length_and_calculate_column_types(
data_table=metadata,
data_table_name='Metadata',
@@ -331,6 +353,9 @@ def set_properties(
if self._properties is not None:
warnings.warn('Properties already exist, overwriting them', UserWarning)

if isinstance(properties, str):
properties = pd.read_csv(properties)

column_types = validate_length_and_calculate_column_types(
data_table=properties,
data_table_name='Properties',
@@ -341,6 +366,22 @@
self._properties = properties.reset_index(drop=True)
self._cat_properties = column_types.categorical_columns

def save_properties(self, path: str):
"""Save the dataset properties to csv.
Parameters
----------
path : str
Path to save the properties to.
"""
if self._properties is None:
raise DeepchecksNotSupportedError(
'TextData does not contain properties, add them by using '
'"calculate_default_properties" or "set_properties" functions'
)

self._properties.to_csv(path, index=False)

@property
def properties(self) -> pd.DataFrame:
"""Return the properties of the dataset."""
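Taken together, the new constructor arguments, the CSV-path support in set_metadata/set_properties, and the new save_properties method allow a simple round trip. The snippet below is a hedged sketch; the column names and the 'my_properties.csv' file name are illustrative assumptions, not part of this commit:

import pandas as pd
from deepchecks.nlp import TextData

texts = ["I love this product", "Worst purchase ever", "It was okay, I guess"]

# Metadata can now be passed straight to the constructor, along with the names of its
# categorical columns (otherwise the categorical columns are inferred automatically).
metadata = pd.DataFrame({'source': ['web', 'app', 'web'], 'user_age': [31, 22, 47]})
dataset = TextData(texts, metadata=metadata, categorical_metadata=['source'])

# Properties can be calculated once, saved to csv with the new save_properties, and later
# re-attached from the csv path (set_properties and the constructor accept either a
# DataFrame or a path).
dataset.calculate_default_properties()
dataset.save_properties('my_properties.csv')

same_texts_again = TextData(texts, properties='my_properties.csv')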
2 changes: 1 addition & 1 deletion deepchecks/vision/suites/default_suites.py
@@ -48,7 +48,7 @@ def train_test_validation(label_properties: List[Dict[str, Any]] = None, image_p
- :class:`~deepchecks.vision.checks.train_test_validation.ImagePropertyDrift`
* - :ref:`plot_vision_image_dataset_drift`
- :class:`~deepchecks.vision.checks.train_test_validation.ImageDatasetDrift`
* - :ref:`plot_vision_feature_label_correlation_change`
* - :ref:`nlp__property_label_correlation`
- :class:`~deepchecks.vision.checks.train_test_validation.PropertyLabelCorrelationChange`
Parameters
71 changes: 71 additions & 0 deletions docs/source/checks/nlp/data_integrity/plot_property_label_correlation.py
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
.. _nlp__property_label_correlation:
Property Label Correlation
**************************
This notebook provides an overview for using and understanding the "Property Label Correlation" check.
**Structure:**
* `What Is The Purpose of the Check? <#what-is-the-purpose-of-the-check>`__
* `Run the Check <#run-the-check>`__
What Is The Purpose of the Check?
=================================
The check estimates for every :ref:`text property <nlp__properties_guide>`
(such as text length, language etc.) its ability to predict the label by itself.
This check can help find a potential bias in the dataset - the labels being strongly correlated with simple text
properties such as percentage of special characters, sentiment, toxicity and more.
This is a critical problem that can result in a phenomenon called "shortcut learning", where the model is likely to
learn this property instead of the actual textual characteristics of each class, as it's easier to do so.
In this case, the model will show high performance on text collected under similar conditions (e.g. same source),
but will fail to generalize on other data (for example, when production receives new data from another source).
This kind of correlation will likely stay hidden without this check until tested on the actual problem data.
For example, in a classification dataset of true and false statements, if only true facts are written in detail,
and false facts are written in a short and vague manner, the model might learn to predict the label by the length
of the statement, and not by the actual content. In this case, the model will perform well on the training data,
and may even perform well on the test data, but will fail to generalize to new data.
The check is based on calculating the predictive power score (PPS) of each text
property. In simple terms, the PPS is a metric that measures how well one feature can predict another (in our case,
how well one property can predict the label).
For further information about PPS you can visit the `ppscore github <https://github.com/8080labs/ppscore>`__
or the following blog post: `RIP correlation. Introducing the Predictive Power Score
<https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598>`__
"""

#%%
# Run the Check
# =============

from deepchecks.nlp.checks import PropertyLabelCorrelation
from deepchecks.nlp.datasets.classification import tweet_emotion

# For this example, we'll use the tweet emotion dataset, which is a dataset of tweets labeled by one of four emotions:
# happiness, anger, sadness and optimism.

# Load Data:
dataset = tweet_emotion.load_data(as_train_test=False)

#%%
# Let's see what our data looks like:
dataset.head()

#%%
# Now let's run the check:
result = PropertyLabelCorrelation().run(dataset)
result

#%%
# We can see that in our tweet emotion dataset example, the label is correlated with the "sentiment" property,
# which makes sense, as the label is the emotion of the tweet, and the sentiment expresses whether the tweet is
# positive or negative.
# Also, there's some correlation with the "toxicity" property, which is a measure of how toxic the tweet is.
# This is also reasonable, as some emotions are more likely to be expressed in a toxic way.
# However, these correlations may indicate that a model could learn to predict the label from curse words, for instance,
# instead of the actual content of the tweet, which could lead it to fail on new tweets that don't contain curse words.
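#%%
# As a hedged aside (deepchecks computes the PPS internally, so this is only for intuition),
# a score like the one reported above can be reproduced with the open-source ppscore package,
# assuming it is installed and that a property and the label are available as DataFrame columns.
# The data below is synthetic and deliberately leaks the label through the property:

import numpy as np
import pandas as pd
import ppscore as pps

rng = np.random.default_rng(0)
text_length = rng.integers(10, 300, size=200)
label = np.where(text_length > 150, 'detailed', 'vague')  # label fully determined by length
leaky_df = pd.DataFrame({'text_length': text_length, 'label': label})

pps.score(leaky_df, 'text_length', 'label')['ppscore']  # close to 1 for such a strong leak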
75 changes: 75 additions & 0 deletions docs/source/checks/nlp/data_integrity/plot_text_property_outliers.py
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
"""
.. _nlp__text_property_outliers:
Text Property Outliers
=======================
This notebook provides an overview for using and understanding the text property
outliers check, used to detect outliers in simple text properties in a dataset.
**Structure:**
* `Why Check for Outliers? <#why-check-for-outliers>`__
* `How Does the Check Work? <#how-does-the-check-work>`__
* `Which Text Properties Are Used? <#which-text-properties-are-used>`__
* `Run the Check <#run-the-check>`__
Why Check for Outliers?
-----------------------
Examining outliers may help you gain insights that you couldn't have reached by taking an aggregate look or by
inspecting random samples. For example, it may help you realize that you have some corrupt samples (e.g.
texts without spaces between words), or samples you didn't expect to have (e.g. texts in Norwegian instead of English).
In some cases, these outliers may help debug performance discrepancies (the model can be excused for failing on
a totally blank text). In more extreme cases, the outlier samples may be interfering with the model's training by
teaching the model to fit "irrelevant" samples.
How Does the Check Work?
------------------------
Ideally we would like to directly find text samples which are outliers, but this is computationally expensive and does
not produce clear and explainable results. Therefore, we use text properties (such as text length,
average word length, language etc.) to find outliers; these are much more efficient to compute, and each outlier is easily explained.
* For numeric properties (such as "percent of special characters"), we use the
`Interquartile Range <https://en.wikipedia.org/wiki/Interquartile_range#Outliers>`_ to define our upper and lower
limits for the properties' values (a short numeric sketch of this follows right after this introduction).
* For categorical properties (such as "language"), we look for a "sharp drop" in the category distribution to
define our lower limit for the properties' values. This method is based on the assumption that the distribution of
categories in the dataset is "smooth" and differences in the commonality of categories are gradual.
For example, in a clean dataset, if the distribution of English texts is 80%, the distribution of the next most
common language would be of similar scale (e.g. 10%) and so forth. If we find a category that has a much lower
distribution than the rest, we assume that this category and even smaller categories are outliers.
Which Text Properties Are Used?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default the check uses the built-in text properties, and it's also possible to replace the default properties
with custom ones. For the list of built-in text properties and an explanation of custom properties, refer to
:ref:`NLP properties <nlp__properties_guide>`.
"""

#%%
# Run the Check
# -------------
# For this example, we'll use the tweet emotion dataset, which is a dataset of tweets labeled by one of four emotions:
# happiness, anger, sadness and optimism.

from deepchecks.nlp.checks import TextPropertyOutliers
from deepchecks.nlp.datasets.classification import tweet_emotion

dataset = tweet_emotion.load_data(as_train_test=False)

check = TextPropertyOutliers()
result = check.run(dataset)
result

#%%
# Observe Graphic Result
# ^^^^^^^^^^^^^^^^^^^^^^
# In this example, we can find many tweets that are outliers. For example, in the "average word length" property,
# we can see that there are tweets with a very large average word length, which is usually caused by missing spaces
# in the tweet itself, or by Twitter hashtags that remained in the data and don't contain spaces. This
# could be problematic for the model, as it cannot comprehend the hashtags as words, and it may cause the model to
# fail on these tweets.
