Nb/feat/support multi label (#2531)
Nadav-Barak committed May 15, 2023
1 parent d6bab6a commit 78124a9
Showing 16 changed files with 284 additions and 62 deletions.
10 changes: 7 additions & 3 deletions deepchecks/nlp/checks/data_integrity/conflicting_labels.py
@@ -11,6 +11,7 @@
"""Module contains Conflicting Labels check."""
import typing as t

import numpy as np
import pandas as pd

from deepchecks.core import CheckResult
@@ -83,7 +84,8 @@ def _truncate_text(self, x: str) -> str:

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state)
dataset = context.get_data_by_kind(dataset_kind)
dataset = dataset.sample(self.n_samples, random_state=self.random_state, drop_na_label=True)
dataset = t.cast(TextData, dataset)
samples = dataset.text
n_of_samples = len(samples)
@@ -96,12 +98,14 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
**self._text_normalization_kwargs
))

if dataset.task_type is TaskType.TOKEN_CLASSIFICATION or dataset.is_multi_label_classification():
if dataset.task_type is TaskType.TOKEN_CLASSIFICATION:
labels = [tuple(t.cast(t.Sequence[t.Any], it)) for it in dataset.label]
elif dataset.is_multi_label_classification():
labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
labels = dataset.label
else:
raise DeepchecksValueError(f'Unknow task type - {dataset.task_type}')
raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')

df = pd.DataFrame({
'hash': samples_hashes,
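For context on the new multi-label branch above: each label row is a binary indicator vector, and tuple(np.where(row == 1)[0]) turns it into a hashable tuple of active class indices so identical texts carrying different label sets can be grouped and flagged. A minimal sketch of that conversion (illustrative, not part of the diff):

import numpy as np

labels = np.array([
    [1, 0, 1],   # sample 0 belongs to classes 0 and 2
    [0, 1, 0],   # sample 1 belongs to class 1
    [1, 0, 1],   # sample 2 carries the same label set as sample 0
])

# Reduce each binary row to a hashable tuple of active class indices,
# mirroring the branch added for is_multi_label_classification() above.
as_tuples = [tuple(np.where(row == 1)[0]) for row in labels]
print(as_tuples[0] == as_tuples[2])  # True -- samples 0 and 2 share the exact same label set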
8 changes: 8 additions & 0 deletions deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -19,6 +19,8 @@

__all__ = ['PredictionDrift']

from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class


class PredictionDrift(PredictionDriftAbstract, TrainTestCheck):
"""
@@ -150,10 +152,16 @@ def run_logic(self, context: Context) -> CheckResult:
# Flag for computing drift on the probabilities rather than the predicted labels
proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \
(self.drift_mode == 'proba')
model_classes = context.model_classes

if proba_drift:
if context.is_multi_label_task():
raise DeepchecksValueError('Cannot use proba drift mode for multi-label tasks')
train_prediction = np.array(model.predict_proba(train_dataset))
test_prediction = np.array(model.predict_proba(test_dataset))
elif context.is_multi_label_task():
train_prediction = convert_multi_label_to_multi_class(model.predict(train_dataset), model_classes)
test_prediction = convert_multi_label_to_multi_class(model.predict(test_dataset), model_classes)
else:
train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1))
test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1))
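The multi-label branch above routes predictions through convert_multi_label_to_multi_class, whose body is not shown in this diff. As a hedged sketch only -- assuming it expands each binary multi-label row into one entry per active class name so drift can be measured on a single categorical column -- a plausible stand-in (hypothetical helper name) would be:

import numpy as np

def multi_label_to_class_names(binary_rows, class_names):
    """Flatten an (n_samples, n_classes) 0/1 matrix into a flat array of class names."""
    binary_rows = np.asarray(binary_rows)
    return np.array([class_names[idx]
                     for row in binary_rows
                     for idx in np.where(row == 1)[0]])

# Two samples, three classes: the first sample is tagged 'anger' and 'fear', the second 'joy'.
print(multi_label_to_class_names([[1, 0, 1], [0, 1, 0]], ['anger', 'joy', 'fear']))
# ['anger' 'fear' 'joy']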
10 changes: 9 additions & 1 deletion deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -14,6 +14,7 @@
from deepchecks.core import CheckResult
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract
from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class

__all__ = ['LabelDrift']

@@ -113,5 +114,12 @@ def run_logic(self, context: Context) -> CheckResult:
train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)

return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label',
if context.is_multi_label_task():
train_labels = convert_multi_label_to_multi_class(train_dataset.label, context.model_classes).flatten()
test_labels = convert_multi_label_to_multi_class(test_dataset.label, context.model_classes).flatten()
else:
train_labels = train_dataset.label
test_labels = test_dataset.label

return self._calculate_label_drift(train_labels, test_labels, 'Label',
'categorical', context.with_display, (train_dataset.name, test_dataset.name))
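With that conversion in place, LabelDrift can treat a multi-label dataset as a multi-class one: the train and test label matrices are expanded to per-class occurrences and flattened before the categorical drift computation. A hedged usage sketch, assuming the standard deepchecks check entry point, the just_dance loader added in this commit, and that the class list can be inferred from the labels alone (none of which is verified here):

from deepchecks.nlp.checks import LabelDrift
from deepchecks.nlp.datasets.classification import just_dance_comment_analysis

# Multi-label TextData objects; labels are (n_samples, 42) binary matrices.
train_ds, test_ds = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True)

result = LabelDrift().run(train_ds, test_ds)  # labels are expanded per class before measuring drift
result.show()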
14 changes: 10 additions & 4 deletions deepchecks/nlp/context.py
@@ -29,7 +29,7 @@
from deepchecks.tabular.utils.task_type import TaskType as TabularTaskType
from deepchecks.utils.docref import doclink
from deepchecks.utils.logger import get_logger
from deepchecks.utils.typing import BasicModel
from deepchecks.utils.typing import ClassificationModel
from deepchecks.utils.validation import is_sequence_not_str

__all__ = [
@@ -54,7 +54,7 @@
TTextProba = t.Sequence[t.Sequence[float]]


class _DummyModel(BasicModel):
class _DummyModel(ClassificationModel):
"""Dummy model class used for inference with static predictions from the user.
Parameters
@@ -398,11 +398,17 @@ def raise_if_token_classification_task(self, check=None):
f'"{check_name}" is not supported for the "{task_type_name}" tasks'
)

def is_multi_label_task(self):
"""Return whether the task is multi-label classification."""
if self.task_type == TaskType.TEXT_CLASSIFICATION:
dataset = t.cast(TextData, self._train if self._train is not None else self._test)
return dataset.is_multi_label_classification()
return False

def raise_if_multi_label_task(self, check=None):
"""Raise an exception if it is a multi-label classification task."""
dataset = t.cast(TextData, self._train if self._train is not None else self._test)
check_name = type(check).__name__ if check else 'Check'
if dataset.is_multi_label_classification():
if self.is_multi_label_task():
raise DeepchecksNotSupportedError(
f'"{check_name}" is not supported for the multilable classification tasks'
)
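is_multi_label_task() first checks the task type and then delegates to the dataset's is_multi_label_classification(), and the existing raise_if_multi_label_task guard now reuses it. A self-contained toy mirroring that helper-plus-guard pattern (simplified names, not the real deepchecks classes):

class ToyContext:
    def __init__(self, task_type, multi_label):
        self.task_type = task_type
        self._multi_label = multi_label

    def is_multi_label_task(self):
        # Only text classification can be multi-label; token classification never is.
        return self.task_type == 'text_classification' and self._multi_label

    def raise_if_multi_label_task(self, check=None):
        check_name = type(check).__name__ if check else 'Check'
        if self.is_multi_label_task():
            raise ValueError(f'"{check_name}" is not supported for multi-label classification tasks')

ToyContext('text_classification', multi_label=False).raise_if_multi_label_task()  # passes silently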
4 changes: 2 additions & 2 deletions deepchecks/nlp/datasets/classification/__init__.py
@@ -9,6 +9,6 @@
# ----------------------------------------------------------------------------
#
"""Module for working with pre-built classification datasets."""
from . import tweet_emotion
from . import just_dance_comment_analysis, tweet_emotion

__all__ = ['tweet_emotion']
__all__ = ['tweet_emotion', 'just_dance_comment_analysis']
112 changes: 112 additions & 0 deletions deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py
@@ -0,0 +1,112 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Dataset containing comments and metadata information for multilabel predictions for different properties of comments.
The data has 216193 comments make on the just dance YouTube videos. It has metadata information about the date the
comment was written and the number of "likes" it got. It also has
42 multilabel binary target label columns,
referring to the category classification of the comment.
This dataset is a modification of Just Dance @ YouTube dataset curated by the COIMBRA university,
For additional details about the dataset, please refer to the original source:
https://www.kaggle.com/datasets/renatojmsantos/just-dance-on-youtube.
Dataset used under the following license: https://creativecommons.org/licenses/by/4.0/
Original publication:
R. Santos, J. P. Arrais and P. A. Silva, "Analysing Games for Health through Users' Opinion Mining,"
2021 IEEE 34th International Symposium on Computer-Based Medical Systems (CBMS), Aveiro, Portugal, 2021, pp. 319-323,
doi: 10.1109/CBMS52027.2021.00035.
"""
import pathlib
import typing as t

import pandas as pd

from deepchecks.nlp import TextData
from deepchecks.utils.builtin_datasets_utils import read_and_save_data

__all__ = ['load_data']


_FULL_DATA_URL = 'https://figshare.com/ndownloader/files/40564895'


ASSETS_DIR = pathlib.Path(__file__).absolute().parent.parent / 'assets' / 'just_dance_comment_analysis'

_METADATA_COLS = ['likes', 'dateComment']
_CAT_METADATA = []
_CAT_PROPERTIES = ['Language']
_TEXT_COL = 'originalText'


def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_full_size: bool = False) -> \
t.Union[t.Tuple, t.Union[TextData, pd.DataFrame]]:
"""Load and returns the Just Dance Comment Analysis dataset (multi-label classification).
Parameters
----------
data_format : str, default: 'TextData'
Represent the format of the returned value. Can be 'TextData'|'DataFrame'
'TextData' will return the data as a TextData object
'Dataframe' will return the data as a pandas DataFrame object
as_train_test : bool, default: True
If True, the returned data is split into train and test exactly like the toy model
was trained. The first return value is the train data and the second is the test data.
In order to get this model, call the load_fitted_model() function.
Otherwise, returns a single object.
use_full_size : bool, default: False
If True, the returned data will be the full dataset, otherwise returns a subset of the data.
Returns
-------
dataset : Union[TextData, pd.DataFrame]
the data object, corresponding to the data_format attribute.
train, test : Tuple[Union[TextData, pd.DataFrame], Union[TextData, pd.DataFrame]]
tuple if as_train_test = True. Tuple of two objects represents the dataset split to train and test sets.
"""
if data_format.lower() not in ['textdata', 'dataframe']:
raise ValueError('data_format must be either "TextData" or "DataFrame"')

data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False)
data['dateComment'] = pd.to_datetime(data['dateComment'])

if not as_train_test:
if not use_full_size:
data = data[(data['dateComment'] < '2013-01-01') | (data['dateComment'] >= '2021-01-01')]
if data_format.lower() != 'textdata':
return data

label = data.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)
dataset = TextData(data[_TEXT_COL], label=label, task_type='text_classification',
metadata=data[_METADATA_COLS], categorical_metadata=_CAT_METADATA)
return dataset

else:
if use_full_size:
train = data[data['dateComment'] < '2015-01-01']
test = data[data['dateComment'] >= '2015-01-01']
else:
train = data[data['dateComment'] < '2013-01-01']
test = data[data['dateComment'] >= '2021-01-01']

if data_format.lower() != 'textdata':
return train, test

train_metadata, test_metadata = train[_METADATA_COLS], test[_METADATA_COLS]
label_train = train.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)
label_test = test.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)

train_ds = TextData(train[_TEXT_COL], label=label_train, task_type='text_classification',
metadata=train_metadata, categorical_metadata=_CAT_METADATA)
test_ds = TextData(test[_TEXT_COL], label=label_test, task_type='text_classification',
metadata=test_metadata, categorical_metadata=_CAT_METADATA)

return train_ds, test_ds
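A hedged usage sketch of the new loader; the shape noted in the comment follows the docstring above (42 label columns) and is not re-verified here:

from deepchecks.nlp.datasets.classification import just_dance_comment_analysis

# Train/test split as TextData objects (subset of the data unless use_full_size=True).
train_ds, test_ds = just_dance_comment_analysis.load_data(data_format='TextData',
                                                          as_train_test=True,
                                                          use_full_size=False)
print(len(train_ds.text), train_ds.label.shape)  # label is an (n_samples, 42) binary matrix

# The raw frame is also available for inspection:
df = just_dance_comment_analysis.load_data(data_format='DataFrame', as_train_test=False)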
46 changes: 8 additions & 38 deletions deepchecks/nlp/datasets/classification/tweet_emotion.py
@@ -17,16 +17,14 @@
Dataset originally published in "Semeval-2018 task 1: Affect in tweets" by Mohammad et al. (2018):
https://aclanthology.org/S18-1001/.
"""
import os
import pathlib
import typing as t
from io import BytesIO

import numpy as np
import pandas as pd
import requests

from deepchecks.nlp import TextData
from deepchecks.utils.builtin_datasets_utils import read_and_save_data

__all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions']

@@ -58,7 +56,8 @@ def load_embeddings(as_train_test: bool = True) -> t.Union[np.array, t.Tuple[np.
embeddings : np.ndarray
Embeddings for the tweet_emotion dataset.
"""
all_embeddings = _read_and_save('tweet_emotion_embeddings.npy', _EMBEDDINGS_URL, file_type='npy')
all_embeddings = read_and_save_data(ASSETS_DIR, 'tweet_emotion_embeddings.npy', _EMBEDDINGS_URL,
file_type='npy', to_numpy=True)

if as_train_test:
train_indexes, test_indexes = _get_train_test_indexes()
@@ -83,12 +82,7 @@ def load_properties(as_train_test: bool = True) -> t.Union[pd.DataFrame, t.Tuple
properties : pd.DataFrame
Properties for the tweet_emotion dataset.
"""
if (ASSETS_DIR / 'tweet_emotion_properties.csv').exists():
properties = pd.read_csv(ASSETS_DIR / 'tweet_emotion_properties.csv', index_col=0)
else:
properties = pd.read_csv(_PROPERTIES_URL, index_col=0)
properties.to_csv(ASSETS_DIR / 'tweet_emotion_properties.csv')

properties = read_and_save_data(ASSETS_DIR, 'tweet_emotion_properties.csv', _PROPERTIES_URL, to_numpy=False)
if as_train_test:
train = properties[properties['train_test_split'] == 'Train'].drop(columns=['train_test_split'])
test = properties[properties['train_test_split'] == 'Test'].drop(columns=['train_test_split'])
@@ -128,7 +122,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
if data_format.lower() not in ['textdata', 'dataframe']:
raise ValueError('data_format must be either "Dataset" or "Dataframe"')

data = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL)
data = read_and_save_data(ASSETS_DIR, 'tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False)
if not as_train_test:
data.drop(columns=['train_test_split'], inplace=True)
if data_format.lower() != 'textdata':
@@ -165,7 +159,8 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
return train_ds, test_ds


def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> np.array:
def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> \
t.Union[np.array, t.Tuple[np.array, np.array]]:
"""Load and return a precalculated predictions for the dataset.
Parameters
@@ -185,7 +180,7 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te
The prediction of the data elements in the dataset.
"""
all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True)
all_preds = read_and_save_data(ASSETS_DIR, 'tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True)
if pred_format == 'predictions':
all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)])
elif pred_format != 'probabilities':
@@ -198,31 +193,6 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te
return all_preds


def _read_and_save(file_name, url_to_file, file_type='csv', to_numpy=False):
"""Read a file from a url and save it to the assets' directory."""
os.makedirs(ASSETS_DIR, exist_ok=True)
if (ASSETS_DIR / file_name).exists():
if file_type == 'csv':
data = pd.read_csv(ASSETS_DIR / file_name, index_col=0)
elif file_type == 'npy':
data = np.load(ASSETS_DIR / file_name)
else:
raise ValueError('file_type must be either "csv" or "npy"')
else:
if file_type == 'csv':
data = pd.read_csv(url_to_file, index_col=0)
data.to_csv(ASSETS_DIR / file_name)
elif file_type == 'npy':
data = np.load(BytesIO(requests.get(url_to_file).content))
np.save(ASSETS_DIR / file_name, data)
else:
raise ValueError('file_type must be either "csv" or "npy"')

if to_numpy:
data = data.to_numpy()
return data


def _get_train_test_indexes() -> t.Tuple[np.array, np.array]:
"""Get the indexes of the train and test sets."""
if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
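The module-local _read_and_save helper removed above has been replaced by the shared read_and_save_data utility from deepchecks.utils.builtin_datasets_utils; the new call sites fix its signature (assets dir, file name, URL, optional file_type and to_numpy). As a hedged sketch, the shared utility presumably mirrors the removed helper's cache-then-download logic, roughly:

import os
import pathlib
from io import BytesIO

import numpy as np
import pandas as pd
import requests

def read_and_save_data(assets_dir: pathlib.Path, file_name: str, url: str,
                       file_type: str = 'csv', to_numpy: bool = False):
    """Return the locally cached file, downloading it into assets_dir on first use."""
    os.makedirs(assets_dir, exist_ok=True)
    path = assets_dir / file_name
    if path.exists():
        data = pd.read_csv(path, index_col=0) if file_type == 'csv' else np.load(path)
    elif file_type == 'csv':
        data = pd.read_csv(url, index_col=0)
        data.to_csv(path)
    else:  # 'npy'
        data = np.load(BytesIO(requests.get(url, timeout=30).content))
        np.save(path, data)
    if to_numpy and isinstance(data, pd.DataFrame):
        data = data.to_numpy()
    return data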
11 changes: 7 additions & 4 deletions deepchecks/nlp/utils/text_properties.py
@@ -313,10 +313,12 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]:
for text in raw_text:
if not pd.isna(text):
all_words = textblob.TextBlob(text).words
total_words = len(all_words)
total_unique_words = len(set(all_words))
text_lexical_density = round(total_unique_words * 100 / total_words, 2)
result.append(text_lexical_density)
if len(all_words) == 0:
result.append(np.nan)
else:
total_unique_words = len(set(all_words))
text_lexical_density = round(total_unique_words * 100 / len(all_words), 2)
result.append(text_lexical_density)
else:
result.append(np.nan)
return result
@@ -482,6 +484,7 @@ def calculate_default_properties(
Dict[str, str]
A dictionary with the property name as key and the property's type as value.
"""
raw_text = list(raw_text)
default_text_properties = _get_default_properties(
include_properties=include_properties,
ignore_properties=ignore_properties
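The lexical_density change above guards against texts that contain no words (empty or punctuation-only strings), returning NaN instead of raising ZeroDivisionError. A minimal sketch of the guarded computation, using the same textblob tokenization as the property:

import numpy as np
import textblob

def lexical_density_one(text: str) -> float:
    """Percentage of unique words in the text, or NaN if it has no words."""
    words = textblob.TextBlob(text).words
    if len(words) == 0:            # empty or punctuation-only text
        return np.nan
    return round(len(set(words)) * 100 / len(words), 2)

print(lexical_density_one('the cat sat on the mat'))  # 83.33 (5 unique words out of 6)
print(lexical_density_one('...'))                     # nan (previously a ZeroDivisionError)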
4 changes: 2 additions & 2 deletions deepchecks/utils/abstracts/prediction_drift.py
@@ -51,8 +51,8 @@ def _prediction_drift(self, train_prediction, test_prediction, model_classes, wi
train prediction or probabilities
test_prediction : np.ndarray
test prediction or probabilities
model_classes : list
list of model classes
model_classes : List[str]
List of model class names
with_display : bool
flag for displaying the prediction distribution graph
proba_drift : bool
