binary classification support nlp #2571

Merged: 1 commit, May 31, 2023
deepchecks/nlp/checks/model_evaluation/prediction_drift.py (4 changes: 2 additions & 2 deletions)
@@ -162,8 +162,8 @@ def run_logic(self, context: Context) -> CheckResult:
             proba_drift = False
         else:
             # Flag for computing drift on the probabilities rather than the predicted labels
-            proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \
-                          (self.drift_mode == 'proba')
+            proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto') and
+                           hasattr(model, 'predict_proba')) or (self.drift_mode == 'proba')
 
         if proba_drift:
             train_prediction = np.array(model.predict_proba(train_dataset))
@@ -180,7 +180,7 @@ def run_logic(self, context: Context) -> CheckResult:
 
         # Flag for computing drift on the probabilities rather than the predicted labels
         proba_drift = \
-            ((context.task_type == TaskType.BINARY and self.drift_mode == 'auto')
+            ((context.task_type == TaskType.BINARY and self.drift_mode == 'auto' and hasattr(model, 'predict_proba'))
             or (self.drift_mode == 'proba')) \
             and not (self.balance_classes is True and self.drift_mode == 'auto')
 
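The effect of the two changes above: with drift_mode set to 'auto' on a binary task, drift is now computed on predict_proba output only when the model actually exposes predict_proba, and otherwise falls back to comparing predicted labels. A minimal illustrative sketch of that decision, not taken from the PR (DummyLabelOnlyModel and use_probability_drift are hypothetical names made up for the example):

# Hypothetical sketch of the guard added above; illustration only, not deepchecks code.
class DummyLabelOnlyModel:
    """A stand-in model that can return hard labels but has no predict_proba."""

    def predict(self, dataset):
        return ['a'] * len(dataset)


def use_probability_drift(model, n_classes, drift_mode='auto'):
    # Mirrors the updated condition: binary task + 'auto' mode + predict_proba available,
    # or an explicit 'proba' request.
    return ((n_classes == 2 and drift_mode == 'auto' and hasattr(model, 'predict_proba'))
            or drift_mode == 'proba')


print(use_probability_drift(DummyLabelOnlyModel(), n_classes=2))  # False: falls back to label drift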
deepchecks/tabular/metric_utils/scorers.py (2 changes: 2 additions & 0 deletions)
@@ -304,13 +304,15 @@ def _run_score(self, model, data: pd.DataFrame, label_col: pd.Series):
                             f'manually provide predicted probabilities to the check. '
                             f'{SUPPORTED_MODELS_DOCLINK}')
 
+        label_col = pd.Series(label_col)
         original_label_col = label_col
         if self.model_classes is not None:
             model = self._wrap_classification_model(model, data)
             if model.is_binary:
                 if len(label_col.unique()) > 2:
                     raise errors.DeepchecksValueError('Model is binary but the label column has more than 2 classes: '
                                                       f'{label_col.unique()}')
+
                 label_col = label_col.map({self.model_classes[0]: 0, self.model_classes[1]: 1})
         else:
             # if multilabel convert from series of lists to 2d numpy array
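The scorers change first coerces the label column to a pandas Series and, for a binary model, maps the two model classes to 0 and 1 before scoring. A rough standalone illustration of that mapping, with made-up labels and class names (not code from the repository):

import pandas as pd

# Made-up inputs for the illustration: string labels and the model's class order.
model_classes = ['negative', 'positive']
label_col = ['negative', 'positive', 'positive', 'negative']

# Same idea as the added lines: coerce to a Series, then map the two classes to 0/1
# so binary scorers receive numeric labels.
label_col = pd.Series(label_col)
binary_labels = label_col.map({model_classes[0]: 0, model_classes[1]: 1})
print(binary_labels.tolist())  # [0, 1, 1, 0]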
tests/nlp/checks/model_evaluation/prediction_drift_test.py (26 changes: 26 additions & 0 deletions)
@@ -147,3 +147,29 @@ def test_drift_mode_proba_warnings(small_wikiann_train_test_text_data):
 
     assert_that(record, has_length(0))
 
+
+def test_binary_classification_probabilities(binary_mock_dataset_and_probabilities):
+    # Arrange
+    text_data, proba_train, proba_test = binary_mock_dataset_and_probabilities
+    check = PredictionDrift()
+
+    # Act
+    result = check.run(text_data, text_data, train_probabilities=proba_train, test_probabilities=proba_test)
+
+    # Assert
+    assert_that(result.value['Drift score'], close_to(0.666, 0.01))
+
+
+def test_binary_classification(binary_mock_dataset_and_probabilities):
+    # Arrange
+    text_data, proba_train, proba_test = binary_mock_dataset_and_probabilities
+    model_classes = {0: 'negative', 1: 'positive'}
+    preds_train = [model_classes[x] for x in np.argmax(proba_train, axis=1)]
+    preds_test = [model_classes[x] for x in np.argmax(proba_test, axis=1)]
+    check = PredictionDrift()
+
+    # Act
+    result = check.run(text_data, text_data, train_predictions=preds_train, test_predictions=preds_test)
+
+    # Assert
+    assert_that(result.value['Drift score'], close_to(0.327, 0.01))
@@ -170,3 +170,17 @@ def test_multilabel_just_dance(just_dance_train_test_textdata, just_dance_train_
     assert_that(result.value['avg_score'], close_to(0.615, 0.001))
     assert_that(len(result.value['weak_segments_list']), is_in([79, 80]))  # TODO: check why it's not always 80
     assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.401, 0.01))
+
+
+def test_binary_classification(binary_mock_dataset_and_probabilities):
+    # Arrange
+    text_data, _, proba_test = binary_mock_dataset_and_probabilities
+    check = PropertySegmentsPerformance()
+
+    # Act
+    result = check.run(text_data, probabilities=proba_test)
+
+    # Assert
+    assert_that(result.value['avg_score'], close_to(0.447, 0.001))
+    assert_that(len(result.value['weak_segments_list']), equal_to(6))
+    assert_that(result.value['weak_segments_list'].iloc[0, 0], close_to(0.34, 0.01))
tests/nlp/conftest.py (16 changes: 16 additions & 0 deletions)
@@ -179,6 +179,22 @@ def multilabel_mock_dataset_and_probabilities(tweet_emotion_train_test_textdata)
     return data, probabilities
 
 
+@pytest.fixture(scope='session')
+def binary_mock_dataset_and_probabilities():
+    """Mock dataset and probabilities for binary classification"""
+
+    text = ['I think therefore I am', 'I am therefore I think', 'I am'] * 100
+    label = ["a", "b", "b"] * 100
+    probabilities_train = np.array([[0.1,0.9]] * 200 + [[0.9,0.1]] * 100)
+    probabilities_test = np.array([[0.6,0.4]] * 200 + [[0.4,0.6]] * 100)
+
+    text_data = TextData(raw_text=text, label=label, task_type='text_classification')
+    text_data.calculate_builtin_properties()
+    return text_data, probabilities_train, probabilities_test
+
+
+
+
 # Token Classification
 @pytest.fixture(scope='session')
 def original_wikiann():
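For context, the new fixture and tests compose roughly as follows. This is an illustrative sketch, not code from the PR; the import paths are assumed from the deepchecks nlp package rather than stated in this diff:

import numpy as np
from deepchecks.nlp import TextData
from deepchecks.nlp.checks import PredictionDrift

# A tiny binary text dataset, mirroring the new conftest fixture.
text = ['I think therefore I am', 'I am therefore I think', 'I am'] * 100
label = ['a', 'b', 'b'] * 100
text_data = TextData(raw_text=text, label=label, task_type='text_classification')

# Per-sample probabilities over the two classes; train and test deliberately differ.
proba_train = np.array([[0.1, 0.9]] * 200 + [[0.9, 0.1]] * 100)
proba_test = np.array([[0.6, 0.4]] * 200 + [[0.4, 0.6]] * 100)

# Run the drift check directly on probabilities, as the new test does.
result = PredictionDrift().run(text_data, text_data,
                               train_probabilities=proba_train,
                               test_probabilities=proba_test)
print(result.value['Drift score'])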