Skip to content

Commit

Permalink
refactor text data (#2432)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nadav-Barak committed Apr 4, 2023
1 parent 2a24338 commit 656d5fc
Show file tree
Hide file tree
Showing 17 changed files with 418 additions and 400 deletions.
Expand Up @@ -94,7 +94,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
"""
text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state)

label = pd.Series(text_data.label, name='label', index=text_data.index)
label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes())

# Classification labels should be of type object (and not int, for example)
if context.task_type in [TaskType.TEXT_CLASSIFICATION, TaskType.TOKEN_CLASSIFICATION]:
Expand Down
Expand Up @@ -119,7 +119,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
text_outliers = np.concatenate([bottom_outliers, top_outliers])

result[name] = {
'indices': [dataset.index[i] for i in text_outliers],
'indices': [dataset.get_original_text_indexes()[i] for i in text_outliers],
# For the upper and lower limits, don't show values that are smaller/larger than the actual values
# we have in the data
'lower_limit': max(lower_limit, min(values_arr)),
Expand Down
Expand Up @@ -73,7 +73,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
predictions = context.model.predict(text_data)

if self.loss_per_sample is not None:
loss_per_sample = self.loss_per_sample[list(text_data.index)]
loss_per_sample = self.loss_per_sample[text_data.get_original_text_indexes()]
proba_values = None
elif not hasattr(context.model, 'predict_proba'):
raise DeepchecksNotSupportedError('Predicted probabilities not supplied. The weak segment checks relies'
Expand All @@ -87,7 +87,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
if features.shape[1] < 2:
raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.')
# label is not used in the check, just here to avoid errors
dataset = Dataset(features, label=pd.Series(text_data.label, index=text_data.index), cat_features=cat_features)
dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features)
encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label)))

dummy_model = _DummyModel(test=encoded_dataset, y_pred_test=np.asarray(predictions),
Expand Down
52 changes: 29 additions & 23 deletions deepchecks/nlp/context.py
Expand Up @@ -90,9 +90,9 @@ def __init__(self,

if train is not None and test is not None:
# check if datasets have same indexes
if set(train.index) & set(test.index):
train.reindex(list(map(lambda x: f'train-{x}', list(train.index))))
test.reindex(list(map(lambda x: f'test-{x}', list(test.index))))
if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
get_logger().warning('train and test datasets have common index - adding "train"/"test"'
' prefixes. To avoid that provide datasets with no common indexes '
'or pass the model object instead of the predictions.')
Expand All @@ -108,26 +108,29 @@ def __init__(self,

if dataset.task_type == TaskType.TEXT_CLASSIFICATION:
if (y_pred is None) and (y_proba is not None):
if dataset.is_multilabel:
if dataset.is_multi_label_classification():
y_pred = (np.array(y_proba) > 0.5) # TODO: Replace with user-configurable threshold
y_pred = [np.array(model_classes)[pred] for pred in y_pred]
else:
y_pred = np.argmax(np.array(y_proba), axis=-1)
y_pred = np.array(model_classes)[y_pred]
y_pred = np.array(model_classes, dtype='str')[y_pred]

if y_pred is not None:
y_pred = np.array(y_pred)
if dataset.is_multi_label_classification():
y_pred = np.array(y_pred)
else:
y_pred = np.array(y_pred, dtype='str')
if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
y_pred = y_pred[:, 0]
ensure_predictions_shape(y_pred, dataset.text)

if y_proba is not None:
ensure_predictions_proba(y_proba, y_pred)
y_proba_dict = dict(zip(dataset.index, y_proba))
y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba))
probas.update({dataset.name: y_proba_dict})

if y_pred is not None:
y_pred_dict = dict(zip(dataset.index, y_pred))
y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred))
predictions.update({dataset.name: y_pred_dict})

self.predictions = predictions
Expand All @@ -148,20 +151,22 @@ def __init__(self,
def _predict(self, data: TextData) -> TTextPred: # TODO: Needs to receive list of strings, not TextData
"""Predict on given data by the data indexes."""
if self.validate_data_on_predict:
data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
if not data_indices.issubset(self._prediction_indices[data.name]):
raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
'predictions.')
return list(itemgetter(*data.index)(self.predictions[data.name])) # pylint: disable=unsubscriptable-object
return list(itemgetter(*data.get_original_text_indexes())(
self.predictions[data.name])) # pylint: disable=unsubscriptable-object

def _predict_proba(self, data: TextData) -> TTextProba: # TODO: Needs to receive list of strings, not TextData
"""Predict probabilities on given data by the data indexes."""
if self.validate_data_on_predict:
data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
if not data_indices.issubset(self._proba_indices[data.name]):
raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
'probabilities.')
return list(itemgetter(*data.index)(self.probas[data.name])) # pylint: disable=unsubscriptable-object
return list(itemgetter(*data.get_original_text_indexes())(
self.probas[data.name])) # pylint: disable=unsubscriptable-object

def fit(self, *args, **kwargs):
"""Just for python 3.6 (sklearn validates fit method)."""
Expand Down Expand Up @@ -192,7 +197,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred

try:
prediction = np.array(prediction)
if dataset.is_multilabel:
if dataset.is_multi_label_classification():
prediction = prediction.astype(float) # Multilabel prediction is a binary matrix
else:
prediction = prediction.reshape((-1, 1)) # Multiclass (not multilabel) Prediction can be a string
Expand All @@ -201,7 +206,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred
except ValueError as e:
raise ValidationError(classification_format_error) from e
pred_shape = prediction.shape
if dataset.is_multilabel:
if dataset.is_multi_label_classification():
if len(pred_shape) == 1 or pred_shape[1] != n_classes:
raise ValidationError(classification_format_error)
if not np.array_equal(prediction, prediction.astype(bool)):
Expand Down Expand Up @@ -247,7 +252,7 @@ def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int
if proba_shape[1] != n_classes:
raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset '
f'to have {n_classes} columns, same as the number of classes')
if dataset.is_multilabel:
if dataset.is_multi_label_classification():
if (probabilities > 1).any() or (probabilities < 0).any():
raise ValidationError(f'Check requires classification probabilities for {dataset.name} '
f'dataset to be between 0 and 1')
Expand Down Expand Up @@ -308,18 +313,19 @@ def __init__(
# If both dataset, validate they fit each other
if train_dataset and test_dataset:
if test_dataset.has_label() and train_dataset.has_label() and not \
TextData.datasets_share_task_type(train_dataset, test_dataset):
train_dataset.validate_textdata_compatibility(test_dataset):
raise DatasetValidationError('train_dataset and test_dataset must share the same label and task type')
if test_dataset and not train_dataset:
raise DatasetValidationError('Can\'t initialize context with only test_dataset. if you have single '
'dataset, initialize it as train_dataset')
if model_classes and len(model_classes) == 0:
raise DeepchecksValueError('Received empty model_classes')
if model_classes and sorted(model_classes) != model_classes:
supported_models_link = doclink(
'nlp-supported-predictions-format',
template='For more information please refer to the Supported Tasks guide {link}')
raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')
if model_classes is not None:
if (not is_sequence_not_str(model_classes)) or len(model_classes) == 0:
raise DeepchecksValueError('model_classes must be a non-empty sequence')
if sorted(model_classes) != model_classes:
supported_models_link = doclink(
'nlp-supported-predictions-format',
template='For more information please refer to the Supported Tasks guide {link}')
raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')

self._task_type = self.infer_task_type(train_dataset, test_dataset)

Expand Down
80 changes: 57 additions & 23 deletions deepchecks/nlp/datasets/classification/tweet_emotion.py
Expand Up @@ -98,13 +98,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
if data_format.lower() not in ['textdata', 'dataframe']:
raise ValueError('data_format must be either "Dataset" or "Dataframe"')

os.makedirs(ASSETS_DIR, exist_ok=True)
if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0)
else:
dataset = pd.read_csv(_FULL_DATA_URL, index_col=0)
dataset.to_csv(ASSETS_DIR / 'tweet_emotion_data.csv')

dataset = _read_and_save('tweet_emotion_data.csv', _FULL_DATA_URL, to_numpy=False)
if not as_train_test:
dataset.drop(columns=['train_test_split'], inplace=True)
if data_format.lower() == 'textdata':
Expand All @@ -114,7 +108,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
properties = None
dataset = TextData(dataset.text, label=dataset[_target], task_type='text_classification',
metadata=dataset.drop(columns=[_target, 'text']),
properties=properties, index=dataset.index)
properties=properties)
return dataset
else:
# train has more sport and Customer Complains but less Terror and Optimism
Expand All @@ -128,26 +122,39 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,
train_properties, test_properties = None, None

train = TextData(train.text, label=train[_target], task_type='text_classification',
index=train.index, metadata=train.drop(columns=[_target, 'text']),
metadata=train.drop(columns=[_target, 'text']),
properties=train_properties)
test = TextData(test.text, label=test[_target], task_type='text_classification',
index=test.index, metadata=test.drop(columns=[_target, 'text']),
metadata=test.drop(columns=[_target, 'text']),
properties=test_properties)
return train, test


def load_embeddings() -> np.ndarray:
def load_embeddings(as_train_test: bool = False) -> np.ndarray:
"""Load and return the embeddings of the tweet_emotion dataset calculated by OpenAI.
Parameters
----------
as_train_test : bool, default: False
If True, the returned data is split into train and test exactly like the toy model
was trained. The first return value is the train data and the second is the test data.
Otherwise, returns a single object.
Returns
-------
embeddings : np.ndarray
Embeddings for the tweet_emotion dataset.
"""
return pd.read_csv(_EMBEDDINGS_URL, index_col=0).to_numpy()
all_embeddings = _read_and_save('tweet_emotion_embeddings.csv', _EMBEDDINGS_URL)
if as_train_test:
train_indexes, test_indexes = _get_train_test_indexes()
return all_embeddings[train_indexes], all_embeddings[test_indexes]
else:
return all_embeddings


def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array:
def load_precalculated_predictions(pred_format: str = 'predictions',
as_train_test: bool = False) -> np.array:
"""Load and return a precalculated predictions for the dataset.
Parameters
Expand All @@ -156,25 +163,52 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array
Represent the format of the returned value. Can be 'predictions' or 'probabilities'.
'predictions' will return the predicted class for each sample.
'probabilities' will return the predicted probabilities for each sample.
as_train_test : bool, default: False
If True, the returned data is split into train and test exactly like the toy model
was trained. The first return value is the train data and the second is the test data.
Otherwise, returns a single object.
Returns
-------
predictions : np.ndarray
The prediction of the data elements in the dataset.
"""
all_preds = _read_and_save('tweet_emotion_probabilities.csv', _PREDICTIONS_URL)
if pred_format == 'predictions':
all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)])
elif pred_format != 'probabilities':
raise ValueError('pred_format must be either "predictions" or "probabilities"')

if as_train_test:
train_indexes, test_indexes = _get_train_test_indexes()
return all_preds[train_indexes], all_preds[test_indexes]
else:
return all_preds


def _read_and_save(file_name, url_to_file, to_numpy=True):
"""Read a file from a url and save it to the assets directory."""
os.makedirs(ASSETS_DIR, exist_ok=True)
if (ASSETS_DIR / 'tweet_emotion_probabilities.csv').exists():
preds = pd.read_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv', index_col=0)
if (ASSETS_DIR / file_name).exists():
data = pd.read_csv(ASSETS_DIR / file_name, index_col=0)
else:
preds = pd.read_csv(_PREDICTIONS_URL, index_col=0)
preds.to_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv')
data = pd.read_csv(url_to_file, index_col=0)
data.to_csv(ASSETS_DIR / file_name)

preds = preds.to_numpy()
if to_numpy:
data = data.to_numpy()
return data

if pred_format == 'predictions':
return np.array([_LABEL_MAP[x] for x in np.argmax(preds, axis=1)])
elif pred_format == 'probabilities':
return preds

def _get_train_test_indexes() -> t.Tuple[np.array, np.array]:
"""Get the indexes of the train and test sets."""
if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0,
usecols=['Unnamed: 0', 'train_test_split'])
else:
raise ValueError('pred_format must be either "predictions" or "probabilities"')
dataset = pd.read_csv(_FULL_DATA_URL, index_col=0, usecols=['Unnamed: 0', 'train_test_split'])

train_indexes = dataset[dataset['train_test_split'] == 'Train'].index
test_indexes = dataset[dataset['train_test_split'] == 'Test'].index
return train_indexes, test_indexes

0 comments on commit 656d5fc

Please sign in to comment.