Noam/dee 505 multilabel with many classes usecase #2494

Merged: 8 commits, May 7, 2023
Changes from all commits
6 changes: 5 additions & 1 deletion deepchecks/nlp/base_checks.py
@@ -54,7 +54,7 @@ def run(
predictions on dataset
probabilities: Union[TTextProba, None] , default: None
probabilities on dataset
model_classes: Optional[List, List[List], default: None
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.
@@ -99,6 +99,7 @@ def run(
test_predictions: Optional[TTextPred] = None,
train_probabilities: Optional[TTextProba] = None,
test_probabilities: Optional[TTextProba] = None,
model_classes: Optional[List] = None,
random_state: int = 42,
) -> CheckResult:
"""Run check.
@@ -121,6 +122,8 @@ def run(
probabilities on train dataset
test_probabilities: Union[TTextProba, None] , default: None
probabilities on test_dataset dataset
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.

@@ -134,6 +137,7 @@ def run(
test_pred=test_predictions,
train_proba=train_probabilities,
test_proba=test_probabilities,
model_classes=model_classes,
random_state=random_state,
with_display=with_display,
)
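As a usage illustration (not part of the diff): a minimal sketch of passing the new model_classes argument through a train-test check run, following the TextData construction and placeholder class names used in the test added further below.

from deepchecks.nlp.checks import TrainTestPerformance
from deepchecks.nlp.text_data import TextData

# Illustrative multilabel data: each label is a binary vector over three classes.
train = TextData(
    raw_text=['I think therefore I am' for _ in range(20)],
    label=[[0, 0, 1]] * 10 + [[0, 1, 1]] * 10,
    task_type='text_classification',
)
test = train.copy()

# model_classes names the label columns, in the order the model knows them.
result = TrainTestPerformance().run(
    train_dataset=train,
    test_dataset=test,
    train_predictions=list(train.label),
    test_predictions=list(test.label),
    model_classes=['a', 'b', 'c'],
)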
@@ -131,7 +131,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
'conflicting_samples': result_df,
}

if context.with_display is False:
if context.with_display is False or num_of_ambiguous_samples == 0:
return CheckResult(value=result_value)

ambiguous_samples['Text'] = ambiguous_samples['Text'].apply(self._truncate_text)
6 changes: 5 additions & 1 deletion deepchecks/nlp/checks/data_integrity/unknown_tokens.py
@@ -155,6 +155,8 @@ def create_pie_chart(self, all_unknown_words_counter, total_words):

# Truncate labels for display
labels = [truncate_string(label, self.max_text_length_for_display) for label in labels]
# round percentages to 2 decimal places after the percent
percentages = [round(percent, 2) for percent in percentages]

# Create pie chart with hover text and custom hover template
fig = go.Figure(data=[go.Pie(
@@ -170,7 +172,9 @@
)])

# Customize chart appearance
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer',
fig.update_layout(title=f'Words containing Unknown Tokens - {self.tokenizer.name_or_path} Tokenizer<br>'
f'({format_percent(sum(percentages) / 100.)} of all words)',
title_x=0.5,
legend_title='Words with Unknown Tokens')

return fig
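For clarity (illustrative, not from the diff): the rounding and the new title suffix combine roughly as below. This assumes format_percent is imported from deepchecks.utils.strings, as used elsewhere in the package, and the percentage values are made up.

from deepchecks.utils.strings import format_percent

percentages = [3.14159, 1.98765]                               # hypothetical per-word percentages
percentages = [round(percent, 2) for percent in percentages]   # -> [3.14, 1.99]
title_suffix = f'({format_percent(sum(percentages) / 100.)} of all words)'  # e.g. '(5.13% of all words)'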
@@ -31,16 +31,20 @@ class SingleDatasetPerformance(SingleDatasetCheck, BaseSingleDatasetPerformance)
scorers : Union[List[str], Dict[str, Union[str, Callable]]], default: None
List of scorers to use. If None, use default scorers.
Scorers can be supplied as a list of scorer names or as a dictionary of names and functions.
max_rows_to_display : int, default: 15
Maximum number of rows to display in the check result.
n_samples : int , default: 10_000
Maximum number of samples to use for this check.
"""

def __init__(self,
scorers: Union[List[str], Dict[str, Union[str, Callable]]] = None,
max_rows_to_display: int = 15,
n_samples: int = 10_000,
**kwargs):
super().__init__(**kwargs)
self.scorers = scorers
self.max_rows_to_display = max_rows_to_display
self.n_samples = n_samples

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
@@ -62,7 +66,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
results_df = pd.DataFrame(results, columns=['Class', 'Metric', 'Value'])

if context.with_display:
display = [results_df]
if len(results_df) > self.max_rows_to_display:
display = [results_df.iloc[:self.max_rows_to_display, :],
'<p style="font-size:0.9em;line-height:1;"><i>'
f'* Only showing first {self.max_rows_to_display} rows.']
else:
display = [results_df]
else:
display = []
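A hedged usage sketch of the new max_rows_to_display option; the dataset and predictions are made up, and the keyword names follow the single-dataset run docstring above. Only the displayed table is truncated; the full frame stays in the check's return value.

from deepchecks.nlp.checks import SingleDatasetPerformance
from deepchecks.nlp.text_data import TextData

dataset = TextData(
    raw_text=['great product', 'terrible product'] * 10,
    label=[1, 0] * 10,
    task_type='text_classification',
)

# Cap the displayed per-class/metric table at 10 rows.
check = SingleDatasetPerformance(max_rows_to_display=10)
result = check.run(dataset, predictions=[1, 0] * 10)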

@@ -102,7 +102,7 @@ def run_logic(self, context: Context) -> CheckResult:
for dataset_name, dataset in datasets.items():

if context.task_type is TaskType.TEXT_CLASSIFICATION and dataset.is_multi_label_classification():
n_samples_per_class = dict(enumerate(np.array(dataset.label).sum(axis=0)))
n_samples_per_class = dict(zip(context.model_classes, np.array(dataset.label).sum(axis=0)))
n_of_labels = sum(n_samples_per_class.values())

elif context.task_type is TaskType.TEXT_CLASSIFICATION:
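A small worked example of the changed line (class names assumed): with model_classes available, the per-class counts of a multilabel label matrix are keyed by class name instead of by column index.

import numpy as np

model_classes = ['anger', 'happiness', 'optimism']       # assumed order known to the model
labels = [[0, 0, 1],
          [0, 1, 1],
          [1, 1, 1]]                                      # one binary vector per sample

counts = np.array(labels).sum(axis=0)                     # samples per class column: [1 2 3]
n_samples_per_class = dict(zip(model_classes, counts))    # {'anger': 1, 'happiness': 2, 'optimism': 3}
n_of_labels = sum(n_samples_per_class.values())           # 6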
2 changes: 1 addition & 1 deletion deepchecks/nlp/context.py
@@ -191,7 +191,7 @@ def _validate_prediction(dataset: TextData, prediction: TTextPred, n_classes: in
@staticmethod
def _validate_classification_prediction(dataset: TextData, prediction: TTextPred, n_classes: int):
"""Validate prediction for given text classification dataset."""
classification_format_error = f'Check requires classification for {dataset.name} to be ' \
classification_format_error = f'Check requires classification predictions for {dataset.name} to be ' \
f'either a sequence that can be cast to a 1D numpy array of shape' \
f' (n_samples,), or a sequence of sequences that can be cast to a 2D ' \
f'numpy array of shape (n_samples, n_classes) for the multilabel case.'
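As an illustration of the two prediction formats this message refers to (values made up):

# Single-label classification: one prediction per sample -> castable to shape (n_samples,)
single_label_predictions = ['anger', 'optimism', 'anger']

# Multilabel classification: one binary vector per sample -> castable to shape (n_samples, n_classes)
multilabel_predictions = [[0, 0, 1],
                          [0, 1, 1],
                          [1, 0, 0]]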
8 changes: 6 additions & 2 deletions deepchecks/nlp/suite.py
@@ -8,9 +8,9 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module for base tabular abstractions."""
"""Module for base nlp suite."""
# pylint: disable=broad-except
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

from deepchecks.core import DatasetKind
from deepchecks.core.check_result import CheckFailure
@@ -42,6 +42,7 @@ def run(
test_predictions: Optional[TTextPred] = None,
train_probabilities: Optional[TTextProba] = None,
test_probabilities: Optional[TTextProba] = None,
model_classes: Optional[List] = None,
random_state: int = 42,
) -> SuiteResult:
"""Run all checks.
@@ -62,6 +63,8 @@
probabilities on train dataset
test_probabilities: Union[TTextProba, None] , default: None
probabilities on test_dataset dataset
model_classes: Optional[List], default: None
For classification: list of classes known to the model
random_state : int, default 42
A seed to set for pseudo-random functions, primarily sampling.

@@ -79,6 +82,7 @@
test_pred=test_predictions,
train_proba=train_probabilities,
test_proba=test_probabilities,
model_classes=model_classes,
with_display=with_display,
random_state=random_state
)
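A hedged sketch of passing model_classes through a suite run; the data and class list are placeholders, and the model_evaluation import mirrors the test added below.

from deepchecks.nlp.suites import model_evaluation
from deepchecks.nlp.text_data import TextData

train = TextData(raw_text=['so happy today', 'really angry now'] * 5,
                 label=['happiness', 'anger'] * 5,
                 task_type='text_classification')
test = train.copy()

suite = model_evaluation()
result = suite.run(train_dataset=train, test_dataset=test,
                   train_predictions=['happiness', 'anger'] * 5,
                   test_predictions=['happiness', 'anger'] * 5,
                   model_classes=['anger', 'happiness'])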
@@ -14,6 +14,7 @@
import pandas as pd
from hamcrest import *

from deepchecks.core.errors import DeepchecksValueError
from deepchecks.nlp.checks import TrainTestPerformance
from deepchecks.nlp.text_data import TextData

@@ -93,6 +94,65 @@ def test_check_execution(self):

assert n_of_samples_per_class == expected_n_of_samples_per_class

def test_check_execution_with_model_classes(self):
train = TextData(
raw_text=['I think therefore I am' for _ in range(20)],
label=[
*([0, 0, 1] for _ in range(10)),
*([0, 1, 1] for _ in range(10))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()
# Act
result = check.run(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
model_classes=['a', 'b', 'c']
)
# Assert
assert isinstance(result.value, pd.DataFrame), type(result.value)
assert set(result.value["Metric"]) == {"F1", "Precision", "Recall"}
assert set(result.value["Dataset"]) == {"Train", "Test"}

n_of_samples_per_class = (
result.value[(result.value["Metric"] == "F1") & (result.value["Dataset"] == "Train")]
.loc[:, ['Class', 'Number of samples']]
.groupby('Class')
.sum()
.to_dict()
)
expected_n_of_samples_per_class = {
'Number of samples': {'a': 0, 'b': 10, 'c': 20}
}

assert n_of_samples_per_class == expected_n_of_samples_per_class

def test_check_execution_with_wrong_model_classes(self):
train = TextData(
raw_text=['I think therefore I am' for _ in range(20)],
label=[
*([0, 0, 1] for _ in range(10)),
*([0, 1, 1] for _ in range(10))
],
task_type='text_classification'
)
test = train.copy()
check = TrainTestPerformance()

# Act & Assert
assert_that(calling(check.run).with_args(
train_dataset=train,
test_dataset=test,
train_predictions=list(train.label),
test_predictions=list(test.label),
model_classes=['a', 'b', 'c', 'd']),
raises(DeepchecksValueError, 'Received model_classes of length 4, but data indicates labels of length 3')
)


class TestTokenClassification:

2 changes: 1 addition & 1 deletion tests/nlp/test_context.py
@@ -15,7 +15,7 @@
from deepchecks.nlp import Suite
from deepchecks.nlp.checks import LabelDrift, SingleDatasetPerformance

CLASSIFICATION_ERROR_FORMAT = r'Check requires classification for Train to be ' \
CLASSIFICATION_ERROR_FORMAT = r'Check requires classification predictions for Train to be ' \
r'either a sequence that can be cast to a 1D numpy array of shape' \
r' \(n_samples,\), or a sequence of sequences that can be cast to a 2D ' \
r'numpy array of shape \(n_samples, n_classes\) for the multilabel case.'
23 changes: 22 additions & 1 deletion tests/nlp/test_suites.py
@@ -9,7 +9,7 @@
# ----------------------------------------------------------------------------
#
"""Test for the default suites"""
from deepchecks.nlp.suites import full_suite
from deepchecks.nlp.suites import model_evaluation, full_suite
from tests.common import get_expected_results_length, validate_suite_result


@@ -30,3 +30,24 @@ def test_full_suite(tweet_emotion_train_test_textdata, tweet_emotion_train_test_
# Assert
length = get_expected_results_length(suite, kwargs)
validate_suite_result(result, length)


def test_model_eval_suite_with_model_classes_argument(tweet_emotion_train_test_textdata,
tweet_emotion_train_test_predictions,
tweet_emotion_train_test_probabilities):
# Arrange
train_data, test_data = tweet_emotion_train_test_textdata
train_preds, test_preds = tweet_emotion_train_test_predictions
train_probas, test_probas = tweet_emotion_train_test_probabilities

kwargs = dict(train_dataset=train_data, test_dataset=test_data, train_predictions=train_preds,
test_predictions=test_preds, train_probabilities=train_probas, test_probabilities=test_probas,
model_classes=['anger', 'happiness', 'optimism', 'sadness'])

# Act
suite = model_evaluation()
result = suite.run(**kwargs)

# Assert
length = get_expected_results_length(suite, kwargs)
validate_suite_result(result, length)