Noam/dee 480 support weak segments in multilabel #2507

Merged · 13 commits · May 9, 2023
@@ -13,6 +13,7 @@

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

from deepchecks.core import CheckResult
from deepchecks.core.check_result import DisplayMap
@@ -51,7 +52,6 @@ def __init__(self, segment_by: str, columns: Union[Hashable, List[Hashable], Non
def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
context.raise_if_token_classification_task(self)
context.raise_if_multi_label_task(self)

text_data = context.get_data_by_kind(dataset_kind)
text_data = text_data.sample(self.n_samples, random_state=context.random_state)
@@ -61,8 +61,19 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
n_top_features=self.n_top_features)

# Decide which scorer and score_per_sample to use in the algorithm run
encoded_dataset = self._target_encode_categorical_features_fill_na(features, text_data.label,
cat_features, is_cat_label=True)
is_multilabel = text_data.is_multi_label_classification()
if is_multilabel:
label = TruncatedSVD(1).fit_transform(text_data.label).squeeze()
is_cat_label = False
else:
label = text_data.label
is_cat_label = True
encoded_dataset = self._target_encode_categorical_features_fill_na(features, label,
cat_features, is_cat_label=is_cat_label)
# Replacing the label with the original label for multilabel
if is_multilabel:
listed_label = [list(x) for x in text_data.label]
encoded_dataset._data[encoded_dataset.label_name] = listed_label # pylint: disable=protected-access
if self.score_per_sample is not None:
score_per_sample = self.score_per_sample[list(features.index)]
scorer, dummy_model = None, None
@@ -77,7 +88,8 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
'rather than only predicted classes.')
y_proba = context.model.predict_proba(text_data)
score_per_sample = calculate_neg_cross_entropy_per_sample(text_data.label, np.asarray(y_proba),
context.model_classes)
is_multilabel=is_multilabel,
model_classes=context.model_classes)
else:
raise DeepchecksNotSupportedError('Weak segments performance check is not supported for '
f'{context.task_type}.')
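For orientation, a minimal standalone sketch (not part of the diff; data and shapes are assumed) of the label-reduction step used above: in the multilabel case the (n_samples, n_classes) label matrix is collapsed by TruncatedSVD into a single continuous value per sample, which the segment search then treats as a non-categorical label.

import numpy as np
from sklearn.decomposition import TruncatedSVD

# Hypothetical multi-hot label matrix: 6 samples, 3 classes.
y_multilabel = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [1, 1, 0],
                         [0, 0, 1],
                         [1, 0, 0],
                         [0, 1, 0]])

# Project each row onto the first singular component; squeeze() turns the
# (n_samples, 1) result into a 1-D array usable as a numeric pseudo-label.
reduced_label = TruncatedSVD(n_components=1).fit_transform(y_multilabel).squeeze()
print(reduced_label.shape)  # (6,)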
1 change: 0 additions & 1 deletion deepchecks/nlp/context.py
@@ -113,7 +113,6 @@ def __init__(self,
if (y_pred is None) and (y_proba is not None):
if dataset.is_multi_label_classification():
y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold
y_pred = [np.array(model_classes)[pred] for pred in y_pred]
else:
y_pred = np.argmax(np.array(y_proba), axis=-1)
y_pred = np.array(model_classes, dtype='str')[y_pred]
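A hedged illustration (values and class names are assumed) of the prediction derivation this hunk adjusts: multilabel probabilities are binarized per class at the fixed 0.5 threshold and kept as an indicator matrix, while single-label probabilities are arg-maxed and mapped to class names.

import numpy as np

model_classes = ['anger', 'joy', 'sadness']          # assumed class names
y_proba = np.array([[0.9, 0.2, 0.6],
                    [0.1, 0.7, 0.4]])

# Multilabel: one independent 0/1 decision per class column.
y_pred_multilabel = np.array(y_proba) > 0.5          # [[ True, False,  True], [False,  True, False]]

# Single-label: pick the highest-probability class and map it to its name.
y_pred_single = np.array(model_classes, dtype='str')[np.argmax(y_proba, axis=-1)]  # ['anger', 'joy']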
49 changes: 29 additions & 20 deletions deepchecks/tabular/context.py
@@ -63,32 +63,41 @@ class _DummyModel:
predictions: pd.DataFrame
proba: pd.DataFrame

def __init__(self,
test: Dataset,
y_proba_test: t.Optional[np.ndarray] = None,
y_pred_test: t.Optional[np.ndarray] = None,
train: t.Union[Dataset, None] = None,
y_pred_train: t.Optional[np.ndarray] = None,
y_proba_train: t.Optional[np.ndarray] = None,
validate_data_on_predict: bool = True,
model_classes: t.Optional[t.List] = None):

def __init__(
self,
test: Dataset,
y_proba_test: t.Optional[np.ndarray] = None,
y_pred_test: t.Optional[np.ndarray] = None,
train: t.Optional[Dataset] = None,
y_pred_train: t.Optional[np.ndarray] = None,
y_proba_train: t.Optional[np.ndarray] = None,
validate_data_on_predict: bool = True,
model_classes: t.Optional[t.List[t.Any]] = None
):
# TODO:
# constructor signature says that test cannot be `None`
# why do we check it then?
if train is not None and test is not None:
# check if datasets have same indexes
if set(train.data.index) & set(test.data.index):
train.data.index = map(lambda x: f'train-{x}', list(train.data.index))
test.data.index = map(lambda x: f'test-{x}', list(test.data.index))
get_logger().warning('train and test datasets have common index - adding "train"/"test"'
' prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.')
train_index = train.data.index
test_index = test.data.index
if set(train_index) & set(test_index):
train.data.index = [f'train-{it}' for it in train_index]
test.data.index = [f'test-{it}' for it in test_index]
get_logger().warning(
'train and test datasets have common index - adding "train"/"test" '
'prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.'
)

feature_df_list = []
predictions = []
probas = []

for dataset, y_pred, y_proba in zip([train, test],
[y_pred_train, y_pred_test],
[y_proba_train, y_proba_test]):
for dataset, y_pred, y_proba in (
(train, y_pred_train, y_proba_train),
(test, y_pred_test, y_proba_test),
):
if y_pred is not None and not isinstance(y_pred, np.ndarray):
y_pred = np.array(y_pred)
if y_proba is not None and not isinstance(y_proba, np.ndarray):
@@ -103,7 +112,7 @@ def __init__(self,
if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
y_pred = y_pred[:, 0]
ensure_predictions_shape(y_pred, dataset.data)
y_pred_ser = pd.Series(y_pred, index=dataset.data.index)
y_pred_ser = pd.Series(list(y_pred), index=dataset.data.index)
predictions.append(y_pred_ser)
if y_proba is not None:
ensure_predictions_proba(y_proba, y_pred)
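A small standalone illustration (data is assumed) of why the last change wraps y_pred in list(...): a 2-D multilabel prediction array cannot be passed to pd.Series directly, but a list of its rows can, so each sample keeps its full prediction vector.

import numpy as np
import pandas as pd

y_pred = np.array([[1, 0, 1],
                   [0, 1, 0]])                     # multilabel predictions, one row per sample
idx = ['test-0', 'test-1']

# pd.Series(y_pred, index=idx) raises "Data must be 1-dimensional" for a 2-D array.
y_pred_ser = pd.Series(list(y_pred), index=idx)    # each cell holds a length-3 array
print(y_pred_ser.loc['test-0'])                    # [1 0 1]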
9 changes: 9 additions & 0 deletions deepchecks/tabular/metric_utils/scorers.py
@@ -41,6 +41,7 @@
from deepchecks.utils.metrics import get_scorer_name
from deepchecks.utils.simple_models import PerfectModel
from deepchecks.utils.typing import BasicModel
from deepchecks.utils.validation import is_sequence_not_str

if TYPE_CHECKING:
from deepchecks import tabular # pylint: disable=unused-import; it is used for type annotations
@@ -267,6 +268,10 @@ def predict(self, data: pd.DataFrame) -> np.ndarray:
predictions = transfer_func(predictions)
# In case of multiclass with single label, convert into multi-label
elif self.model_classes:
# if multilabel convert from numpy array of lists to 2d numpy array
if len(predictions) != 0:
if is_sequence_not_str(next(iter(predictions))):
predictions = np.array([np.array(x) for x in predictions])
predictions = _transform_to_multi_label_format(predictions, self.model_classes)
return predictions

@@ -309,6 +314,10 @@ def _run_score(self, model, data: pd.DataFrame, label_col: pd.Series):
f'{label_col.unique()}')
label_col = label_col.map({self.model_classes[0]: 0, self.model_classes[1]: 1})
else:
# if multilabel convert from series of lists to 2d numpy array
if len(label_col) != 0:
if is_sequence_not_str(next(iter(label_col))):
label_col = np.array([np.array(x) for x in label_col])
label_col = _transform_to_multi_label_format(np.array(label_col), self.model_classes)

try:
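Both hunks above perform the same conversion; here is a hedged sketch with an assumed stand-in for the is_sequence_not_str utility and illustrative data: a column whose cells are per-sample label lists is stacked into an (n_samples, n_classes) matrix before the multilabel transform.

import numpy as np
import pandas as pd

def _is_sequence_not_str(value) -> bool:
    """Rough stand-in for deepchecks' is_sequence_not_str (assumption)."""
    return hasattr(value, '__len__') and not isinstance(value, (str, bytes))

label_col = pd.Series([[1, 0, 1], [0, 1, 0], [1, 1, 0]])       # per-sample label lists

if len(label_col) != 0 and _is_sequence_not_str(next(iter(label_col))):
    label_col = np.array([np.array(x) for x in label_col])

print(label_col.shape)  # (3, 3)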
44 changes: 24 additions & 20 deletions deepchecks/utils/single_sample_metrics.py
@@ -24,26 +24,30 @@ def calculate_neg_mse_per_sample(labels, predictions, index=None) -> pd.Series:
return pd.Series([-(y - y_pred) ** 2 for y, y_pred in zip(labels, predictions)], index=index)


def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray, model_classes: Optional[List] = None,
index=None, eps=1e-15) -> pd.Series:
def calculate_neg_cross_entropy_per_sample(labels, probas: np.ndarray,
model_classes: Optional[List] = None,
index=None, is_multilabel: bool = False, eps=1e-15) -> pd.Series:
"""Calculate negative cross entropy per sample."""
if index is None and isinstance(labels, pd.Series):
index = labels.index

# transform categorical labels into integers
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
f'Label observed values {sorted(labels.unique())} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
f'Predicted probabilities shape {probas.shape} does not match the number of classes found in'
f' the labels: {model_classes}.')
labels = pd.Series(labels).apply(list(model_classes).index)

num_samples, num_classes = probas.shape
one_hot_labels = np.zeros((num_samples, num_classes))
one_hot_labels[list(np.arange(num_samples)), list(labels)] = 1
if not is_multilabel:
if index is None and isinstance(labels, pd.Series):
index = labels.index

# transform categorical labels into integers
if model_classes is not None:
if any(x not in model_classes for x in labels):
raise DeepchecksValueError(
f'Label observed values {sorted(labels.unique())} contain values '
f'that are not found in the model classes: {model_classes}.')
if probas.shape[1] != len(model_classes):
raise DeepchecksValueError(
f'Predicted probabilities shape {probas.shape} does not match the number of classes found in'
f' the labels: {model_classes}.')
labels = pd.Series(labels).apply(list(model_classes).index)

num_samples, num_classes = probas.shape
one_hot_labels = np.zeros((num_samples, num_classes))
one_hot_labels[list(np.arange(num_samples)), list(labels)] = 1
else:
one_hot_labels = labels

return pd.Series(np.sum(one_hot_labels * np.log(probas + eps), axis=1), index=index)
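A worked example (numbers are assumed) of the multilabel branch above: the labels already form a multi-hot matrix, so they are used directly as one_hot_labels and the per-sample score is the row-wise sum of label * log(proba + eps).

import numpy as np
import pandas as pd

eps = 1e-15
labels = np.array([[1, 0, 1],
                   [0, 1, 0]])               # multi-hot labels
probas = np.array([[0.8, 0.3, 0.6],
                   [0.2, 0.9, 0.1]])         # per-class probabilities

neg_cross_entropy = pd.Series(np.sum(labels * np.log(probas + eps), axis=1))
# sample 0: log(0.8) + log(0.6) ≈ -0.73;  sample 1: log(0.9) ≈ -0.11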
@@ -9,13 +9,33 @@
# ----------------------------------------------------------------------------
#
"""Test for the NLP WeakSegmentsPerformance check"""
import numpy as np
import pytest
from hamcrest import assert_that, close_to, equal_to, has_items

from deepchecks.nlp.checks import MetadataSegmentsPerformance, PropertySegmentsPerformance
from tests.base.utils import equal_condition_result


@pytest.fixture
def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata):
"""Mock dataset and probabilities for multilabel classification"""
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = make_multilabel_classification(n_samples=3_000, n_features=10, n_classes=3, n_labels=2,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
probabilities = np.zeros(y_test.shape)
for label_dim in range(y.shape[1]):
clf = LogisticRegression(random_state=42).fit(X_train, y_train[:, label_dim])
probabilities[:, label_dim] = clf.predict_proba(X_test)[:, 1]
data = tweet_emotion_train_test_textdata[1].sample(len(probabilities), random_state=42)
data._label = y_test
return data, probabilities


def test_tweet_emotion(tweet_emotion_train_test_textdata, tweet_emotion_train_test_probabilities):
# Arrange
_, test = tweet_emotion_train_test_textdata
@@ -79,3 +99,26 @@ def test_warning_of_n_top_columns(tweet_emotion_train_test_textdata, tweet_emoti
_ = property_check.run(test, probabilities=test_probas)
with pytest.warns(UserWarning, match=metadata_warning):
_ = metadata_check.run(test, probabilities=test_probas)


def test_multilabel_dataset(multilabel_mock_dataset_and_probabilities):
# Arrange
data, probabilities = multilabel_mock_dataset_and_probabilities
assert_that(data.is_multi_label_classification(), equal_to(True))
check = MetadataSegmentsPerformance().add_condition_segments_relative_performance_greater_than()
# Act
result = check.run(data, probabilities=probabilities)
condition_result = check.conditions_decision(result)

# Assert
assert_that(condition_result, has_items(
equal_condition_result(is_pass=False,
details='Found a segment with accuracy score of 0.395 in comparison to an average '
'score of 0.624 in sampled data.',
name='The relative performance of weakest segment is greater than 80% of average model '
'performance.')
))

assert_that(result.value['avg_score'], close_to(0.624, 0.001))
assert_that(len(result.value['weak_segments_list']), equal_to(5))
assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.395, 0.01))