refactor text data (#2432)

deepchecks · Apr 4, 2023 · 656d5fc · 656d5fc
1 parent 2a24338
commit 656d5fc
Show file tree

Hide file tree

Showing 17 changed files with 418 additions and 400 deletions.
diff --git a/deepchecks/nlp/checks/data_integrity/property_label_correlation.py b/deepchecks/nlp/checks/data_integrity/property_label_correlation.py
@@ -94,7 +94,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         """
         text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state)
 
-        label = pd.Series(text_data.label, name='label', index=text_data.index)
+        label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes())
 
         # Classification labels should be of type object (and not int, for example)
         if context.task_type in [TaskType.TEXT_CLASSIFICATION, TaskType.TOKEN_CLASSIFICATION]:

diff --git a/deepchecks/nlp/checks/data_integrity/text_property_outliers.py b/deepchecks/nlp/checks/data_integrity/text_property_outliers.py
@@ -119,7 +119,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
             text_outliers = np.concatenate([bottom_outliers, top_outliers])
 
             result[name] = {
-                'indices': [dataset.index[i] for i in text_outliers],
+                'indices': [dataset.get_original_text_indexes()[i] for i in text_outliers],
                 # For the upper and lower limits doesn't show values that are smaller/larger than the actual values
                 # we have in the data
                 'lower_limit': max(lower_limit, min(values_arr)),

diff --git a/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py b/deepchecks/nlp/checks/model_evaluation/weak_segments_performance.py
@@ -73,7 +73,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         predictions = context.model.predict(text_data)
 
         if self.loss_per_sample is not None:
-            loss_per_sample = self.loss_per_sample[list(text_data.index)]
+            loss_per_sample = self.loss_per_sample[text_data.get_original_text_indexes()]
             proba_values = None
         elif not hasattr(context.model, 'predict_proba'):
             raise DeepchecksNotSupportedError('Predicted probabilities not supplied. The weak segment checks relies'
@@ -87,7 +87,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         if features.shape[1] < 2:
             raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.')
         # label is not used in the check, just here to avoid errors
-        dataset = Dataset(features, label=pd.Series(text_data.label, index=text_data.index), cat_features=cat_features)
+        dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features)
         encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label)))
 
         dummy_model = _DummyModel(test=encoded_dataset, y_pred_test=np.asarray(predictions),

diff --git a/deepchecks/nlp/context.py b/deepchecks/nlp/context.py
@@ -90,9 +90,9 @@ def __init__(self,
 
         if train is not None and test is not None:
             # check if datasets have same indexes
-            if set(train.index) & set(test.index):
-                train.reindex(list(map(lambda x: f'train-{x}', list(train.index))))
-                test.reindex(list(map(lambda x: f'test-{x}', list(test.index))))
+            if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
+                train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
+                test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
                 get_logger().warning('train and test datasets have common index - adding "train"/"test"'
                                      ' prefixes. To avoid that provide datasets with no common indexes '
                                      'or pass the model object instead of the predictions.')
@@ -108,26 +108,29 @@ def __init__(self,
 
                 if dataset.task_type == TaskType.TEXT_CLASSIFICATION:
                     if (y_pred is None) and (y_proba is not None):
-                        if dataset.is_multilabel:
+                        if dataset.is_multi_label_classification():
                             y_pred = (np.array(y_proba) > 0.5)  # TODO: Replace with user-configurable threshold
                             y_pred = [np.array(model_classes)[pred] for pred in y_pred]
                         else:
                             y_pred = np.argmax(np.array(y_proba), axis=-1)
-                            y_pred = np.array(model_classes)[y_pred]
+                            y_pred = np.array(model_classes, dtype='str')[y_pred]
 
                     if y_pred is not None:
-                        y_pred = np.array(y_pred)
+                        if dataset.is_multi_label_classification():
+                            y_pred = np.array(y_pred)
+                        else:
+                            y_pred = np.array(y_pred, dtype='str')
                         if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
                             y_pred = y_pred[:, 0]
                         ensure_predictions_shape(y_pred, dataset.text)
 
                     if y_proba is not None:
                         ensure_predictions_proba(y_proba, y_pred)
-                        y_proba_dict = dict(zip(dataset.index, y_proba))
+                        y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba))
                         probas.update({dataset.name: y_proba_dict})
 
                 if y_pred is not None:
-                    y_pred_dict = dict(zip(dataset.index, y_pred))
+                    y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred))
                     predictions.update({dataset.name: y_pred_dict})
 
         self.predictions = predictions
@@ -148,20 +151,22 @@ def __init__(self,
     def _predict(self, data: TextData) -> TTextPred:  # TODO: Needs to receive list of strings, not TextData
         """Predict on given data by the data indexes."""
         if self.validate_data_on_predict:
-            data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
+            data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
             if not data_indices.issubset(self._prediction_indices[data.name]):
                 raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
                                            'predictions.')
-        return list(itemgetter(*data.index)(self.predictions[data.name]))  # pylint: disable=unsubscriptable-object
+        return list(itemgetter(*data.get_original_text_indexes())(
+            self.predictions[data.name]))  # pylint: disable=unsubscriptable-object
 
     def _predict_proba(self, data: TextData) -> TTextProba:  # TODO: Needs to receive list of strings, not TextData
         """Predict probabilities on given data by the data indexes."""
         if self.validate_data_on_predict:
-            data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
+            data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
             if not data_indices.issubset(self._proba_indices[data.name]):
                 raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
                                            'probabilities.')
-        return list(itemgetter(*data.index)(self.probas[data.name]))  # pylint: disable=unsubscriptable-object
+        return list(itemgetter(*data.get_original_text_indexes())(
+            self.probas[data.name]))  # pylint: disable=unsubscriptable-object
 
     def fit(self, *args, **kwargs):
         """Just for python 3.6 (sklearn validates fit method)."""
@@ -192,7 +197,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
 
         try:
             prediction = np.array(prediction)
-            if dataset.is_multilabel:
+            if dataset.is_multi_label_classification():
                 prediction = prediction.astype(float)  # Multilabel prediction is a binary matrix
             else:
                 prediction = prediction.reshape((-1, 1))  # Multiclass (not multilabel) Prediction can be a string
@@ -201,7 +206,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
         except ValueError as e:
             raise ValidationError(classification_format_error) from e
         pred_shape = prediction.shape
-        if dataset.is_multilabel:
+        if dataset.is_multi_label_classification():
             if len(pred_shape) == 1 or pred_shape[1] != n_classes:
                 raise ValidationError(classification_format_error)
             if not np.array_equal(prediction, prediction.astype(bool)):
@@ -247,7 +252,7 @@ def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int
             if proba_shape[1] != n_classes:
                 raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset '
                                       f'to have {n_classes} columns, same as the number of classes')
-            if dataset.is_multilabel:
+            if dataset.is_multi_label_classification():
                 if (probabilities > 1).any() or (probabilities < 0).any():
                     raise ValidationError(f'Check requires classification probabilities for {dataset.name} '
                                           f'dataset to be between 0 and 1')
@@ -308,18 +313,19 @@ def __init__(
         # If both dataset, validate they fit each other
         if train_dataset and test_dataset:
             if test_dataset.has_label() and train_dataset.has_label() and not \
-                    TextData.datasets_share_task_type(train_dataset, test_dataset):
+                    train_dataset.validate_textdata_compatibility(test_dataset):
                 raise DatasetValidationError('train_dataset and test_dataset must share the same label and task type')
         if test_dataset and not train_dataset:
             raise DatasetValidationError('Can\'t initialize context with only test_dataset. if you have single '
                                          'dataset, initialize it as train_dataset')
-        if model_classes and len(model_classes) == 0:
-            raise DeepchecksValueError('Received empty model_classes')
-        if model_classes and sorted(model_classes) != model_classes:
-            supported_models_link = doclink(
-                'nlp-supported-predictions-format',
-                template='For more information please refer to the Supported Tasks guide {link}')
-            raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')
+        if model_classes is not None:
+            if (not is_sequence_not_str(model_classes)) or len(model_classes) == 0:
+                raise DeepchecksValueError('model_classes must be a non-empty sequence')
+            if sorted(model_classes) != model_classes:
+                supported_models_link = doclink(
+                    'nlp-supported-predictions-format',
+                    template='For more information please refer to the Supported Tasks guide {link}')
+                raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')
 
         self._task_type = self.infer_task_type(train_dataset, test_dataset)
 

diff --git a/deepchecks/nlp/datasets/classification/tweet_emotion.py b/deepchecks/nlp/datasets/classification/tweet_emotion.py
@@ -98,13 +98,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
     if data_format.lower() not in ['textdata', 'dataframe']:
         raise ValueError('data_format must be either "Dataset" or "Dataframe"')
 
-    os.makedirs(ASSETS_DIR, exist_ok=True)
-    if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
-        dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0)
-    else:
-        dataset = pd.read_csv(_FULL_DATA_URL, index_col=0)
-        dataset.to_csv(ASSETS_DIR / 'tweet_emotion_data.csv')
-
+    dataset = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False)
     if not as_train_test:
         dataset.drop(columns=['train_test_split'], inplace=True)
         if data_format.lower() == 'textdata':
@@ -114,7 +108,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
                 properties = None
             dataset = TextData(dataset.text, label=dataset[_target], task_type='text_classification',
                                metadata=dataset.drop(columns=[_target, 'text']),
-                               properties=properties, index=dataset.index)
+                               properties=properties)
         return dataset
     else:
         # train has more sport and Customer Complains but less Terror and Optimism
@@ -128,26 +122,39 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
                 train_properties, test_properties = None, None
 
             train = TextData(train.text, label=train[_target], task_type='text_classification',
-                             index=train.index, metadata=train.drop(columns=[_target, 'text']),
+                             metadata=train.drop(columns=[_target, 'text']),
                              properties=train_properties)
             test = TextData(test.text, label=test[_target], task_type='text_classification',
-                            index=test.index, metadata=test.drop(columns=[_target, 'text']),
+                            metadata=test.drop(columns=[_target, 'text']),
                             properties=test_properties)
         return train, test
 
 
-def load_embeddings() -> np.ndarray:
+def load_embeddings(as_train_test: bool = False) -> np.ndarray:
     """Load and return the embeddings of the tweet_emotion dataset calculated by OpenAI.
 
+    Parameters
+    ----------
+    as_train_test : bool, default: False
+        If True, the returned data is split into train and test exactly like the toy model
+        was trained. The first return value is the train data and the second is the test data.
+        Otherwise, returns a single object.
+
     Returns
     -------
     embeddings : np.ndarray
         Embeddings for the tweet_emotion dataset.
     """
-    return pd.read_csv(_EMBEDDINGS_URL, index_col=0).to_numpy()
+    all_embeddings = _read_and_save('tweet_emotion_embeddings.csv', _EMBEDDINGS_URL)
+    if as_train_test:
+        train_indexes, test_indexes = _get_train_test_indexes()
+        return all_embeddings[train_indexes], all_embeddings[test_indexes]
+    else:
+        return all_embeddings
 
 
-def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array:
+def load_precalculated_predictions(pred_format: str = 'predictions',
+                                   as_train_test: bool = False) -> np.array:
     """Load and return a precalculated predictions for the dataset.
 
     Parameters
@@ -156,25 +163,52 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array
         Represent the format of the returned value. Can be 'predictions' or 'probabilities'.
         'predictions' will return the predicted class for each sample.
         'probabilities' will return the predicted probabilities for each sample.
+    as_train_test : bool, default: False
+        If True, the returned data is split into train and test exactly like the toy model
+        was trained. The first return value is the train data and the second is the test data.
+        Otherwise, returns a single object.
 
     Returns
     -------
     predictions : np.ndarray
         The prediction of the data elements in the dataset.
 
     """
+    all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL)
+    if pred_format == 'predictions':
+        all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)])
+    elif pred_format != 'probabilities':
+        raise ValueError('pred_format must be either "predictions" or "probabilities"')
+
+    if as_train_test:
+        train_indexes, test_indexes = _get_train_test_indexes()
+        return all_preds[train_indexes], all_preds[test_indexes]
+    else:
+        return all_preds
+
+
+def _read_and_save(file_name, url_to_file, to_numpy=True):
+    """Read a csv file from the assets directory if cached, otherwise download it from the url and cache it."""
     os.makedirs(ASSETS_DIR, exist_ok=True)
-    if (ASSETS_DIR / 'tweet_emotion_probabilities.csv').exists():
-        preds = pd.read_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv', index_col=0)
+    if (ASSETS_DIR / file_name).exists():
+        data = pd.read_csv(ASSETS_DIR / file_name, index_col=0)
     else:
-        preds = pd.read_csv(_PREDICTIONS_URL, index_col=0)
-        preds.to_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv')
+        data = pd.read_csv(url_to_file, index_col=0)
+        data.to_csv(ASSETS_DIR / file_name)
 
-    preds = preds.to_numpy()
+    if to_numpy:
+        data = data.to_numpy()
+    return data
 
-    if pred_format == 'predictions':
-        return np.array([_LABEL_MAP[x] for x in np.argmax(preds, axis=1)])
-    elif pred_format == 'probabilities':
-        return preds
+
+def _get_train_test_indexes() -> t.Tuple[np.array, np.array]:
+    """Get the indexes of the train and test sets."""
+    if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
+        dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0,
+                              usecols=['Unnamed: 0', 'train_test_split'])
     else:
-        raise ValueError('pred_format must be either "predictions" or "probabilities"')
+        dataset = pd.read_csv(_FULL_DATA_URL, index_col=0, usecols=['Unnamed: 0', 'train_test_split'])
+
+    train_indexes = dataset[dataset['train_test_split'] == 'Train'].index
+    test_indexes = dataset[dataset['train_test_split'] == 'Test'].index
+    return train_indexes, test_indexes