Add min_samples and n_top_classes to TrainTestPerformance in NLP (#2558)
* Fixed the bug with 110% degradation 

* Filtered classes without enough samples

* Show top classes
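
A minimal usage sketch of the new parameters (the train/test datasets and prediction variables below are placeholders, not part of this commit):

from deepchecks.nlp.checks.model_evaluation.train_test_performance import TrainTestPerformance

check = TrainTestPerformance(
    min_samples=30,                   # per-class scores are nullified below this sample count
    n_top_classes=10,                 # show only the 10 largest classes in the label graphs
    show_classes_by='test_largest',   # rank classes by their size in the test set
)
# result = check.run(train_dataset=train, test_dataset=test,
#                    train_predictions=train_preds, test_predictions=test_preds)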
nirhutnik committed May 27, 2023
1 parent ce15432 commit 7c031aa
Showing 3 changed files with 157 additions and 24 deletions.
74 changes: 59 additions & 15 deletions deepchecks/nlp/checks/model_evaluation/train_test_performance.py
@@ -10,16 +10,19 @@
#
"""Module containing the Train-Test Performance check."""
import typing as t
from collections import Counter
from numbers import Number

import numpy as np
import pandas as pd

from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.nlp.metric_utils.scorers import infer_on_text_data
from deepchecks.nlp.task_type import TaskType
from deepchecks.nlp.text_data import TextData
from deepchecks.nlp.utils.token_classification_utils import clean_iob_prefixes
from deepchecks.utils.abstracts.train_test_performace import TrainTestPerformanceAbstract

__all__ = ['TrainTestPerformance']
@@ -33,6 +36,17 @@ class TrainTestPerformance(TrainTestPerformanceAbstract, TrainTestCheck):
scorers: Union[Mapping[str, Union[str, Callable]], List[str]], default: None
Scorers to override the default scorers, find more about the supported formats at
https://docs.deepchecks.com/stable/user-guide/general/metrics_guide.html
min_samples: int, default: 30
Minimum number of samples required to calculate model performance. If the scorer is per-class,
this is the minimum number of samples per class.
n_top_classes: int, default: 10
Number of top classes to show in the label graphs. The top classes are determined by the show_classes_by
parameter. If None, all classes are shown.
show_classes_by: str, default: 'test_largest'
Specify which classes to show for label graphs, as the number of shown classes is limited
by n_top_classes. Possible values:
- 'train_largest': Show the n_top_classes largest classes in the train dataset.
- 'test_largest': Show the n_top_classes largest classes in the test dataset.
n_samples : int , default: 1_000_000
number of samples to use for this check.
random_state : int, default: 42
@@ -74,18 +88,27 @@ def my_mse(y_true, y_pred):
"""

def __init__(
self,
scorers: t.Union[
t.Mapping[str, t.Union[str, t.Callable]],
t.List[str],
None
] = None,
n_samples: int = 1_000_000,
random_state: int = 42,
**kwargs
self,
scorers: t.Union[
t.Mapping[str, t.Union[str, t.Callable]],
t.List[str],
None
] = None,
min_samples: int = 30,
n_top_classes: int = 10,
show_classes_by: str = 'test_largest',
n_samples: int = 1_000_000,
random_state: int = 42,
**kwargs
):
super().__init__(**kwargs)
self.scorers = scorers
self.min_samples = min_samples
self.n_top_classes = n_top_classes
if n_top_classes and show_classes_by not in ['train_largest', 'test_largest']:
raise DeepchecksValueError(f'Invalid value for show_classes_by: {show_classes_by}. Allowed values are '
'"train_largest" and "test_largest".')
self.show_classes_by = show_classes_by
self.n_samples = n_samples
self.random_state = random_state

@@ -111,9 +134,8 @@ def run_logic(self, context: Context) -> CheckResult:
n_of_labels = len(label)

elif context.task_type is TaskType.TOKEN_CLASSIFICATION:
# TODO:
n_samples_per_class = {}
n_of_labels = 0
n_samples_per_class = Counter(clean_iob_prefixes(np.concatenate(dataset.label)))
n_of_labels = sum(n_samples_per_class.values())

else:
raise NotImplementedError()
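
A rough, self-contained illustration of the per-class counting added above for token classification; the tag-splitting below is a stand-in for the clean_iob_prefixes utility imported at the top of the file:

from collections import Counter

import numpy as np

# Toy token-classification labels in IOB format (one list of tags per sample).
labels = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'O', 'O']]
flat = np.concatenate(labels)                    # flatten all token tags into one array
cleaned = [tag.split('-')[-1] for tag in flat]   # stand-in for clean_iob_prefixes
print(Counter(cleaned))                          # Counter({'O': 3, 'PER': 2, 'ORG': 1})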
@@ -155,13 +177,35 @@ def run_logic(self, context: Context) -> CheckResult:
]
)

# Nullify rows with less than min_samples:
results_df.loc[results_df['Number of samples'] < self.min_samples, 'Value'] = None
classes_without_enough_samples = results_df[results_df['Class'].notna() & results_df['Value'].isna()]['Class'] \
.unique().tolist()

# Show only top n classes:
if self.n_top_classes:
samples_per_class = results_df[results_df['Class'].notna()][['Class', 'Dataset', 'Number of samples']] \
.drop_duplicates()
samples_per_class = samples_per_class[~samples_per_class['Class'].isin(classes_without_enough_samples)]

if self.show_classes_by == 'train_largest':
top_classes_to_show = samples_per_class[samples_per_class['Dataset'] == 'Train'] \
.sort_values('Number of samples', ascending=False).head(self.n_top_classes)['Class'].tolist()
else: # self.show_classes_by == 'test_largest':
top_classes_to_show = samples_per_class[samples_per_class['Dataset'] == 'Test'] \
.sort_values('Number of samples', ascending=False).head(self.n_top_classes)['Class'].tolist()
else:
top_classes_to_show = None

if context.with_display is False:
figures = None
else:
figures = self._prepare_display(
results_df,
train_dataset.name or 'Train',
test_dataset.name or 'Test'
results=results_df,
train_dataset_name=train_dataset.name or 'Train',
test_dataset_name=test_dataset.name or 'Test',
classes_without_enough_samples=classes_without_enough_samples,
top_classes_to_show=top_classes_to_show
)

return CheckResult(
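A self-contained sketch of the filtering run_logic now applies to the results table, with made-up numbers: per-class scores below min_samples are nullified, and only the largest remaining classes of the chosen dataset are kept for display.

import pandas as pd

min_samples, n_top_classes, show_classes_by = 30, 2, 'test_largest'

# Made-up results table mirroring the columns the check produces.
results_df = pd.DataFrame({
    'Dataset': ['Train', 'Test'] * 3,
    'Class': ['a', 'a', 'b', 'b', 'c', 'c'],
    'Number of samples': [100, 90, 40, 35, 10, 8],
    'Metric': ['F1'] * 6,
    'Value': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
})

# Nullify scores for classes with too few samples.
results_df.loc[results_df['Number of samples'] < min_samples, 'Value'] = None
dropped = results_df[results_df['Class'].notna() & results_df['Value'].isna()]['Class'].unique().tolist()

# Rank the remaining classes by their size in the requested dataset and keep the top n.
per_class = results_df[['Class', 'Dataset', 'Number of samples']].drop_duplicates()
per_class = per_class[~per_class['Class'].isin(dropped)]
rank_by = 'Train' if show_classes_by == 'train_largest' else 'Test'
top = (per_class[per_class['Dataset'] == rank_by]
       .sort_values('Number of samples', ascending=False)
       .head(n_top_classes)['Class']
       .tolist())
print(dropped, top)  # ['c'] ['a', 'b']
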
40 changes: 33 additions & 7 deletions deepchecks/utils/abstracts/train_test_performace.py
@@ -31,10 +31,12 @@ class TrainTestPerformanceAbstract(abc.ABC):
add_condition: t.Callable[..., t.Any]

def _prepare_display(
self,
results: pd.DataFrame,
train_dataset_name: str,
test_dataset_name: str,
self,
results: pd.DataFrame,
train_dataset_name: str,
test_dataset_name: str,
classes_without_enough_samples: t.Optional[t.List[str]] = None,
top_classes_to_show: t.Optional[t.List[str]] = None
):
display_df = results.replace({
'Dataset': {
@@ -47,6 +49,19 @@ def _prepare_display(
data_scorers_per_class = display_df[results['Class'].notna()]
data_scorers_per_dataset = display_df[results['Class'].isna()].drop(columns=['Class'])

# Filter classes without enough samples and get display comment for them:
if classes_without_enough_samples:
data_scorers_per_class = \
data_scorers_per_class.loc[~data_scorers_per_class['Class'].isin(classes_without_enough_samples)]

# Filter top classes to show:
if top_classes_to_show:
not_shown_classes = list(set(data_scorers_per_class['Class'].unique()) - set(top_classes_to_show))
data_scorers_per_class = \
data_scorers_per_class.loc[data_scorers_per_class['Class'].isin(top_classes_to_show)]
else:
not_shown_classes = None

for data in (data_scorers_per_dataset, data_scorers_per_class):
if data.shape[0] == 0:
continue
@@ -85,6 +100,17 @@ def _prepare_display(
)
)

# Add comments about not shown classes:
df = pd.DataFrame({}, columns=['Reason', 'Classes']).set_index('Reason')
if not_shown_classes:
df.loc[f'Not shown classes (showing only top {len(top_classes_to_show)})'] = str(not_shown_classes)
if classes_without_enough_samples:
df.loc[f'Classes without enough samples in either {train_dataset_name} or {test_dataset_name}'] = \
str(classes_without_enough_samples)

if not df.empty:
figures.append(df)

return figures

def add_condition_test_performance_greater_than(self: Self, min_score: float) -> Self:
@@ -111,9 +137,9 @@ def add_condition_train_test_relative_degradation_less_than(self: Self, threshol
return self.add_condition(name, condition)

def add_condition_class_performance_imbalance_ratio_less_than(
self: Self,
score: str,
threshold: float = 0.3,
self: Self,
score: str,
threshold: float = 0.3,
) -> Self:
"""Add condition - relative ratio difference between highest-class and lowest-class is less than threshold.
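_prepare_display also appends a small summary table listing which classes were hidden and why; a rough sketch of its content, with made-up class names:

import pandas as pd

# Sketch of the summary table appended to the display (class names are made up).
df = pd.DataFrame({}, columns=['Reason', 'Classes']).set_index('Reason')
df.loc['Not shown classes (showing only top 2)'] = str(['c'])
df.loc['Classes without enough samples in either Train or Test'] = str(['d'])
print(df['Classes'].to_dict())
# {'Not shown classes (showing only top 2)': "['c']",
#  'Classes without enough samples in either Train or Test': "['d']"}
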
67 changes: 65 additions & 2 deletions tests/nlp/checks/model_evaluation/train_test_performance_test.py
@@ -68,7 +68,7 @@ def test_check_execution(self):
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()
check = TrainTestPerformance(min_samples=0)
# Act
result = check.run(
train_dataset=train,
@@ -153,10 +153,55 @@ def test_check_execution_with_wrong_model_classes(self):
raises(DeepchecksValueError, 'Received model_classes of length 4, but data indicates labels of length 3')
)

def test_display_params(self):
# Arrange
train = TextData(
raw_text=['I think therefore I am' for _ in range(100)],
label=[
*([0, 0, 1] for _ in range(50)),
*([0, 1, 1] for _ in range(50))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance(min_samples=101)
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert result.value["Value"].notna().sum() == 0 # all values are NaNs

check = TrainTestPerformance(n_top_classes=1, show_classes_by='test_largest')
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert isinstance(result.display[1], pd.DataFrame)
assert result.display[1]['Classes'].loc['Not shown classes (showing only top 1)'] == '[1]'
assert result.display[1]['Classes'].loc['Classes without enough samples in either Train or Test'] == '[0]'
assert result.display[0].data[0]['x'].shape == (1,) # Make sure x-axis has only 1 class

assert_that(calling(TrainTestPerformance).with_args(show_classes_by='blabla'),
raises(DeepchecksValueError))


class TestTokenClassification:

def test_check_execution(self, small_wikiann_train_test_text_data):
def test_check_execution_macro(self, small_wikiann_train_test_text_data):
# Arrange
train, test = small_wikiann_train_test_text_data
scorers = ["recall_macro", "f1_macro"]
@@ -173,3 +218,21 @@ def test_check_execution(self, small_wikiann_train_test_text_data):
assert set(result.value["Metric"]) == set(scorers)
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert set(result.value["Value"]) == {1.0}

def test_check_execution_micro(self, small_wikiann_train_test_text_data):
# Arrange
train, test = small_wikiann_train_test_text_data
check = TrainTestPerformance(min_samples=50)
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Dataset"]) == {"Train", "Test"}
assert set(result.value[result.value["Value"].notna()]["Value"]) == {1.0}
assert set(result.value[(result.value["Value"].notna()) & (result.value["Dataset"] == 'Train')]["Class"])\
== {'PER', 'ORG'} # LOC has only 49 samples
