Nb/feat/support multi label (#2531)
Nadav-Barak committed May 15, 2023
1 parent d6bab6a commit 78124a9
Showing 16 changed files with 284 additions and 62 deletions.
10 changes: 7 additions & 3 deletions deepchecks/nlp/checks/data_integrity/conflicting_labels.py
@@ -11,6 +11,7 @@
"""Module contains Conflicting Labels check."""
import typing as t

import numpy as np
import pandas as pd

from deepchecks.core import CheckResult
@@ -83,7 +84,8 @@ def _truncate_text(self, x: str) -> str:

def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""Run check."""
dataset = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=self.random_state)
dataset = context.get_data_by_kind(dataset_kind)
dataset = dataset.sample(self.n_samples, random_state=self.random_state, drop_na_label=True)
dataset = t.cast(TextData, dataset)
samples = dataset.text
n_of_samples = len(samples)
@@ -96,12 +98,14 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
**self._text_normalization_kwargs
))

if dataset.task_type is TaskType.TOKEN_CLASSIFICATION or dataset.is_multi_label_classification():
if dataset.task_type is TaskType.TOKEN_CLASSIFICATION:
labels = [tuple(t.cast(t.Sequence[t.Any], it)) for it in dataset.label]
elif dataset.is_multi_label_classification():
labels = [tuple(np.where(row == 1)[0]) for row in dataset.label]
elif dataset.task_type is TaskType.TEXT_CLASSIFICATION:
labels = dataset.label
else:
raise DeepchecksValueError(f'Unknow task type - {dataset.task_type}')
raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')

df = pd.DataFrame({
'hash': samples_hashes,
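For context on the new multi-label branch above: each label row is a binary indicator vector, and tuple(np.where(row == 1)[0]) turns it into a hashable tuple of active class indices so identical texts carrying different label sets can be grouped and flagged. A minimal sketch of that conversion (illustrative, not part of the diff):

import numpy as np

labels = np.array([
    [1, 0, 1],   # sample 0 belongs to classes 0 and 2
    [0, 1, 0],   # sample 1 belongs to class 1
    [1, 0, 1],   # sample 2 carries the same label set as sample 0
])

# Reduce each binary row to a hashable tuple of active class indices,
# mirroring the branch added for is_multi_label_classification() above.
as_tuples = [tuple(np.where(row == 1)[0]) for row in labels]
print(as_tuples[0] == as_tuples[2])  # True -- samples 0 and 2 share the exact same label set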
8 changes: 8 additions & 0 deletions deepchecks/nlp/checks/model_evaluation/prediction_drift.py
@@ -19,6 +19,8 @@

__all__ = ['PredictionDrift']

from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class


class PredictionDrift(PredictionDriftAbstract, TrainTestCheck):
"""
@@ -150,10 +152,16 @@ def run_logic(self, context: Context) -> CheckResult:
# Flag for computing drift on the probabilities rather than the predicted labels
proba_drift = ((len(context.model_classes) == 2) and (self.drift_mode == 'auto')) or \
(self.drift_mode == 'proba')
model_classes = context.model_classes

if proba_drift:
if context.is_multi_label_task():
raise DeepchecksValueError('Cannot use proba drift mode for multi-label tasks')
train_prediction = np.array(model.predict_proba(train_dataset))
test_prediction = np.array(model.predict_proba(test_dataset))
elif context.is_multi_label_task():
train_prediction = convert_multi_label_to_multi_class(model.predict(train_dataset), model_classes)
test_prediction = convert_multi_label_to_multi_class(model.predict(test_dataset), model_classes)
else:
train_prediction = np.array(model.predict(train_dataset)).reshape((-1, 1))
test_prediction = np.array(model.predict(test_dataset)).reshape((-1, 1))
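The multi-label branch above routes predictions through convert_multi_label_to_multi_class, whose body is not shown in this diff. As a hedged sketch only -- assuming it expands each binary multi-label row into one entry per active class name so drift can be measured on a single categorical column -- a plausible stand-in (hypothetical helper name) would be:

import numpy as np

def multi_label_to_class_names(binary_rows, class_names):
    """Flatten an (n_samples, n_classes) 0/1 matrix into a flat array of class names."""
    binary_rows = np.asarray(binary_rows)
    return np.array([class_names[idx]
                     for row in binary_rows
                     for idx in np.where(row == 1)[0]])

# Two samples, three classes: the first sample is tagged 'anger' and 'fear', the second 'joy'.
print(multi_label_to_class_names([[1, 0, 1], [0, 1, 0]], ['anger', 'joy', 'fear']))
# ['anger' 'fear' 'joy']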
10 changes: 9 additions & 1 deletion deepchecks/nlp/checks/train_test_validation/label_drift.py
@@ -14,6 +14,7 @@
from deepchecks.core import CheckResult
from deepchecks.nlp import Context, TrainTestCheck
from deepchecks.utils.abstracts.label_drift import LabelDriftAbstract
from deepchecks.utils.distribution.preprocessing import convert_multi_label_to_multi_class

__all__ = ['LabelDrift']

@@ -113,5 +114,12 @@ def run_logic(self, context: Context) -> CheckResult:
train_dataset = context.train.sample(self.n_samples, random_state=self.random_state)
test_dataset = context.test.sample(self.n_samples, random_state=self.random_state)

return self._calculate_label_drift(train_dataset.label.flatten(), test_dataset.label.flatten(), 'Label',
if context.is_multi_label_task():
train_labels = convert_multi_label_to_multi_class(train_dataset.label, context.model_classes).flatten()
test_labels = convert_multi_label_to_multi_class(test_dataset.label, context.model_classes).flatten()
else:
train_labels = train_dataset.label
test_labels = test_dataset.label

return self._calculate_label_drift(train_labels, test_labels, 'Label',
'categorical', context.with_display, (train_dataset.name, test_dataset.name))
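With that conversion in place, LabelDrift can treat a multi-label dataset as a multi-class one: the train and test label matrices are expanded to per-class occurrences and flattened before the categorical drift computation. A hedged usage sketch, assuming the standard deepchecks check entry point, the just_dance loader added in this commit, and that the class list can be inferred from the labels alone (none of which is verified here):

from deepchecks.nlp.checks import LabelDrift
from deepchecks.nlp.datasets.classification import just_dance_comment_analysis

# Multi-label TextData objects; labels are (n_samples, 42) binary matrices.
train_ds, test_ds = just_dance_comment_analysis.load_data(data_format='TextData', as_train_test=True)

result = LabelDrift().run(train_ds, test_ds)  # labels are expanded per class before measuring drift
result.show()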
14 changes: 10 additions & 4 deletions deepchecks/nlp/context.py
@@ -29,7 +29,7 @@
from deepchecks.tabular.utils.task_type import TaskType as TabularTaskType
from deepchecks.utils.docref import doclink
from deepchecks.utils.logger import get_logger
from deepchecks.utils.typing import BasicModel
from deepchecks.utils.typing import ClassificationModel
from deepchecks.utils.validation import is_sequence_not_str

__all__ = [
@@ -54,7 +54,7 @@
TTextProba = t.Sequence[t.Sequence[float]]


class _DummyModel(BasicModel):
class _DummyModel(ClassificationModel):
"""Dummy model class used for inference with static predictions from the user.
Parameters
@@ -398,11 +398,17 @@ def raise_if_token_classification_task(self, check=None):
f'"{check_name}" is not supported for the "{task_type_name}" tasks'
)

def is_multi_label_task(self):
"""Return whether the task is multi-label classification."""
if self.task_type == TaskType.TEXT_CLASSIFICATION:
dataset = t.cast(TextData, self._train if self._train is not None else self._test)
return dataset.is_multi_label_classification()
return False

def raise_if_multi_label_task(self, check=None):
"""Raise an exception if it is a multi-label classification task."""
dataset = t.cast(TextData, self._train if self._train is not None else self._test)
check_name = type(check).__name__ if check else 'Check'
if dataset.is_multi_label_classification():
if self.is_multi_label_task():
raise DeepchecksNotSupportedError(
f'"{check_name}" is not supported for the multilable classification tasks'
)
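is_multi_label_task() first checks the task type and then delegates to the dataset's is_multi_label_classification(), and the existing raise_if_multi_label_task guard now reuses it. A self-contained toy mirroring that helper-plus-guard pattern (simplified names, not the real deepchecks classes):

class ToyContext:
    def __init__(self, task_type, multi_label):
        self.task_type = task_type
        self._multi_label = multi_label

    def is_multi_label_task(self):
        # Only text classification can be multi-label; token classification never is.
        return self.task_type == 'text_classification' and self._multi_label

    def raise_if_multi_label_task(self, check=None):
        check_name = type(check).__name__ if check else 'Check'
        if self.is_multi_label_task():
            raise ValueError(f'"{check_name}" is not supported for multi-label classification tasks')

ToyContext('text_classification', multi_label=False).raise_if_multi_label_task()  # passes silently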
4 changes: 2 additions & 2 deletions deepchecks/nlp/datasets/classification/__init__.py
@@ -9,6 +9,6 @@
# ----------------------------------------------------------------------------
#
"""Module for working with pre-built classification datasets."""
from . import tweet_emotion
from . import just_dance_comment_analysis, tweet_emotion

__all__ = ['tweet_emotion']
__all__ = ['tweet_emotion', 'just_dance_comment_analysis']
112 changes: 112 additions & 0 deletions deepchecks/nlp/datasets/classification/just_dance_comment_analysis.py
@@ -0,0 +1,112 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Dataset containing comments and metadata information for multilabel predictions for different properties of comments.
The data has 216193 comments make on the just dance YouTube videos. It has metadata information about the date the
comment was written and the number of "likes" it got. It also has
42 multilabel binary target label columns,
referring to the category classification of the comment.
This dataset is a modification of Just Dance @ YouTube dataset curated by the COIMBRA university,
For additional details about the dataset, please refer to the original source:
https://www.kaggle.com/datasets/renatojmsantos/just-dance-on-youtube.
Dataset used under the following license: https://creativecommons.org/licenses/by/4.0/
Original publication:
R. Santos, J. P. Arrais and P. A. Silva, "Analysing Games for Health through Users' Opinion Mining,"
2021 IEEE 34th International Symposium on Computer-Based Medical Systems (CBMS), Aveiro, Portugal, 2021, pp. 319-323,
doi: 10.1109/CBMS52027.2021.00035.
"""
import pathlib
import typing as t

import pandas as pd

from deepchecks.nlp import TextData
from deepchecks.utils.builtin_datasets_utils import read_and_save_data

__all__ = ['load_data']


_FULL_DATA_URL = 'https://figshare.com/ndownloader/files/40564895'


ASSETS_DIR = pathlib.Path(__file__).absolute().parent.parent / 'assets' / 'just_dance_comment_analysis'

_METADATA_COLS = ['likes', 'dateComment']
_CAT_METADATA = []
_CAT_PROPERTIES = ['Language']
_TEXT_COL = 'originalText'


def load_data(data_format: str = 'TextData', as_train_test: bool = True, use_full_size: bool = False) -> \
t.Union[t.Tuple, t.Union[TextData, pd.DataFrame]]:
"""Load and returns the Just Dance Comment Analysis dataset (multi-label classification).
Parameters
----------
data_format : str, default: 'TextData'
Represent the format of the returned value. Can be 'TextData'|'DataFrame'
'TextData' will return the data as a TextData object
'Dataframe' will return the data as a pandas DataFrame object
as_train_test : bool, default: True
If True, the returned data is split into train and test exactly like the toy model
was trained. The first return value is the train data and the second is the test data.
In order to get this model, call the load_fitted_model() function.
Otherwise, returns a single object.
use_full_size : bool, default: False
If True, the returned data will be the full dataset, otherwise returns a subset of the data.
Returns
-------
dataset : Union[TextData, pd.DataFrame]
the data object, corresponding to the data_format attribute.
train, test : Tuple[Union[TextData, pd.DataFrame], Union[TextData, pd.DataFrame]]
tuple if as_train_test = True. Tuple of two objects represents the dataset split to train and test sets.
"""
if data_format.lower() not in ['textdata', 'dataframe']:
raise ValueError('data_format must be either "TextData" or "DataFrame"')

data = read_and_save_data(ASSETS_DIR, 'just_dance_data.csv', _FULL_DATA_URL, to_numpy=False)
data['dateComment'] = pd.to_datetime(data['dateComment'])

if not as_train_test:
if not use_full_size:
data = data[(data['dateComment'] < '2013-01-01') | (data['dateComment'] >= '2021-01-01')]
if data_format.lower() != 'textdata':
return data

label = data.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)
dataset = TextData(data[_TEXT_COL], label=label, task_type='text_classification',
metadata=data[_METADATA_COLS], categorical_metadata=_CAT_METADATA)
return dataset

else:
if use_full_size:
train = data[data['dateComment'] < '2015-01-01']
test = data[data['dateComment'] >= '2015-01-01']
else:
train = data[data['dateComment'] < '2013-01-01']
test = data[data['dateComment'] >= '2021-01-01']

if data_format.lower() != 'textdata':
return train, test

train_metadata, test_metadata = train[_METADATA_COLS], test[_METADATA_COLS]
label_train = train.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)
label_test = test.drop(columns=[_TEXT_COL] + _METADATA_COLS).to_numpy().astype(int)

train_ds = TextData(train[_TEXT_COL], label=label_train, task_type='text_classification',
metadata=train_metadata, categorical_metadata=_CAT_METADATA)
test_ds = TextData(test[_TEXT_COL], label=label_test, task_type='text_classification',
metadata=test_metadata, categorical_metadata=_CAT_METADATA)

return train_ds, test_ds
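A hedged usage sketch of the new loader; the shape noted in the comment follows the docstring above (42 label columns) and is not re-verified here:

from deepchecks.nlp.datasets.classification import just_dance_comment_analysis

# Train/test split as TextData objects (subset of the data unless use_full_size=True).
train_ds, test_ds = just_dance_comment_analysis.load_data(data_format='TextData',
                                                          as_train_test=True,
                                                          use_full_size=False)
print(len(train_ds.text), train_ds.label.shape)  # label is an (n_samples, 42) binary matrix

# The raw frame is also available for inspection:
df = just_dance_comment_analysis.load_data(data_format='DataFrame', as_train_test=False)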
46 changes: 8 additions & 38 deletions deepchecks/nlp/datasets/classification/tweet_emotion.py
@@ -17,16 +17,14 @@
Dataset originally published in "Semeval-2018 task 1: Affect in tweets" by Mohammad et al. (2018):
https://aclanthology.org/S18-1001/.
"""
import os
import pathlib
import typing as t
from io import BytesIO

import numpy as np
import pandas as pd
import requests

from deepchecks.nlp import TextData
from deepchecks.utils.builtin_datasets_utils import read_and_save_data

__all__ = ['load_data', 'load_embeddings', 'load_precalculated_predictions']

@@ -58,7 +56,8 @@ def load_embeddings(as_train_test: bool = True) -> t.Union[np.array, t.Tuple[np.
embeddings : np.ndarray
Embeddings for the tweet_emotion dataset.
"""
all_embeddings = _read_and_save('tweet_emotion_embeddings.npy', _EMBEDDINGS_URL, file_type='npy')
all_embeddings = read_and_save_data(ASSETS_DIR, 'tweet_emotion_embeddings.npy', _EMBEDDINGS_URL,
file_type='npy', to_numpy=True)

if as_train_test:
train_indexes, test_indexes = _get_train_test_indexes()
@@ -83,12 +82,7 @@ def load_properties(as_train_test: bool = True) -> t.Union[pd.DataFrame, t.Tuple
properties : pd.DataFrame
Properties for the tweet_emotion dataset.
"""
if (ASSETS_DIR / 'tweet_emotion_properties.csv').exists():
properties = pd.read_csv(ASSETS_DIR / 'tweet_emotion_properties.csv', index_col=0)
else:
properties = pd.read_csv(_PROPERTIES_URL, index_col=0)
properties.to_csv(ASSETS_DIR / 'tweet_emotion_properties.csv')

properties = read_and_save_data(ASSETS_DIR, 'tweet_emotion_properties.csv', _PROPERTIES_URL, to_numpy=False)
if as_train_test:
train = properties[properties['train_test_split'] == 'Train'].drop(columns=['train_test_split'])
test = properties[properties['train_test_split'] == 'Test'].drop(columns=['train_test_split'])
@@ -128,7 +122,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
if data_format.lower() not in ['textdata', 'dataframe']:
raise ValueError('data_format must be either "Dataset" or "Dataframe"')

data = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL)
data = read_and_save_data(ASSETS_DIR, 'tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False)
if not as_train_test:
data.drop(columns=['train_test_split'], inplace=True)
if data_format.lower() != 'textdata':
@@ -165,7 +159,8 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
return train_ds, test_ds


def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> np.array:
def load_precalculated_predictions(pred_format: str = 'predictions', as_train_test: bool = True) -> \
t.Union[np.array, t.Tuple[np.array, np.array]]:
"""Load and return a precalculated predictions for the dataset.
Parameters
@@ -185,7 +180,7 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te
The prediction of the data elements in the dataset.
"""
all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True)
all_preds = read_and_save_data(ASSETS_DIR, 'tweet_emotion_probabilities.csv', _PREDICTIONS_URL, to_numpy=True)
if pred_format == 'predictions':
all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)])
elif pred_format != 'probabilities':
@@ -198,31 +193,6 @@ def load_precalculated_predictions(pred_format: str = 'predictions', as_train_te
return all_preds


def _read_and_save(file_name, url_to_file, file_type='csv', to_numpy=False):
"""Read a file from a url and save it to the assets' directory."""
os.makedirs(ASSETS_DIR, exist_ok=True)
if (ASSETS_DIR / file_name).exists():
if file_type == 'csv':
data = pd.read_csv(ASSETS_DIR / file_name, index_col=0)
elif file_type == 'npy':
data = np.load(ASSETS_DIR / file_name)
else:
raise ValueError('file_type must be either "csv" or "npy"')
else:
if file_type == 'csv':
data = pd.read_csv(url_to_file, index_col=0)
data.to_csv(ASSETS_DIR / file_name)
elif file_type == 'npy':
data = np.load(BytesIO(requests.get(url_to_file).content))
np.save(ASSETS_DIR / file_name, data)
else:
raise ValueError('file_type must be either "csv" or "npy"')

if to_numpy:
data = data.to_numpy()
return data


def _get_train_test_indexes() -> t.Tuple[np.array, np.array]:
"""Get the indexes of the train and test sets."""
if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
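The module-local _read_and_save helper removed above has been replaced by the shared read_and_save_data utility from deepchecks.utils.builtin_datasets_utils; the new call sites fix its signature (assets dir, file name, URL, optional file_type and to_numpy). As a hedged sketch, the shared utility presumably mirrors the removed helper's cache-then-download logic, roughly:

import os
import pathlib
from io import BytesIO

import numpy as np
import pandas as pd
import requests

def read_and_save_data(assets_dir: pathlib.Path, file_name: str, url: str,
                       file_type: str = 'csv', to_numpy: bool = False):
    """Return the locally cached file, downloading it into assets_dir on first use."""
    os.makedirs(assets_dir, exist_ok=True)
    path = assets_dir / file_name
    if path.exists():
        data = pd.read_csv(path, index_col=0) if file_type == 'csv' else np.load(path)
    elif file_type == 'csv':
        data = pd.read_csv(url, index_col=0)
        data.to_csv(path)
    else:  # 'npy'
        data = np.load(BytesIO(requests.get(url, timeout=30).content))
        np.save(path, data)
    if to_numpy and isinstance(data, pd.DataFrame):
        data = data.to_numpy()
    return data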
11 changes: 7 additions & 4 deletions deepchecks/nlp/utils/text_properties.py
@@ -313,10 +313,12 @@ def lexical_density(raw_text: Sequence[str]) -> List[str]:
for text in raw_text:
if not pd.isna(text):
all_words = textblob.TextBlob(text).words
total_words = len(all_words)
total_unique_words = len(set(all_words))
text_lexical_density = round(total_unique_words * 100 / total_words, 2)
result.append(text_lexical_density)
if len(all_words) == 0:
result.append(np.nan)
else:
total_unique_words = len(set(all_words))
text_lexical_density = round(total_unique_words * 100 / len(all_words), 2)
result.append(text_lexical_density)
else:
result.append(np.nan)
return result
@@ -482,6 +484,7 @@ def calculate_default_properties(
Dict[str, str]
A dictionary with the property name as key and the property's type as value.
"""
raw_text = list(raw_text)
default_text_properties = _get_default_properties(
include_properties=include_properties,
ignore_properties=ignore_properties
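The lexical_density change above guards against texts that contain no words (empty or punctuation-only strings), returning NaN instead of raising ZeroDivisionError. A minimal sketch of the guarded computation, using the same textblob tokenization as the property:

import numpy as np
import textblob

def lexical_density_one(text: str) -> float:
    """Percentage of unique words in the text, or NaN if it has no words."""
    words = textblob.TextBlob(text).words
    if len(words) == 0:            # empty or punctuation-only text
        return np.nan
    return round(len(set(words)) * 100 / len(words), 2)

print(lexical_density_one('the cat sat on the mat'))  # 83.33 (5 unique words out of 6)
print(lexical_density_one('...'))                     # nan (previously a ZeroDivisionError)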
4 changes: 2 additions & 2 deletions deepchecks/utils/abstracts/prediction_drift.py
@@ -51,8 +51,8 @@ def _prediction_drift(self, train_prediction, test_prediction, model_classes, wi
train prediction or probabilities
test_prediction : np.ndarray
test prediction or probabilities
model_classes : list
list of model classes
model_classes : List[str]
List of model class names
with_display : bool
flag for displaying the prediction distribution graph
proba_drift : bool
