refactor text data #2432

Merged: 8 commits, merged on Apr 4, 2023
Changes from 3 commits
@@ -94,7 +94,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         """
         text_data = context.get_data_by_kind(dataset_kind).sample(self.n_samples, random_state=context.random_state)

-        label = pd.Series(text_data.label, name='label', index=text_data.index)
+        label = pd.Series(text_data.label, name='label', index=text_data.get_original_text_indexes())

         # Classification labels should be of type object (and not int, for example)
         if context.task_type in [TaskType.TEXT_CLASSIFICATION, TaskType.TOKEN_CLASSIFICATION]:
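Across all the check files touched here the change is the same: the pandas-style `TextData.index` property gives way to an explicit `get_original_text_indexes()` accessor backed by a plain numpy array. A minimal sketch of how such an accessor could be wired up (only the attribute and method names are taken from this diff; the real class is far larger):

    import numpy as np

    class TextData:
        """Simplified stand-in for deepchecks' TextData, for illustration only."""

        def __init__(self, raw_text, label=None):
            self._text = np.asarray(raw_text)
            self._label = label
            # By default, samples are indexed by their position in the input.
            self._original_text_index = np.arange(len(self._text))

        def get_original_text_indexes(self) -> np.ndarray:
            """Return the indexes the samples carried when the object was built."""
            return self._original_text_index

        def __len__(self) -> int:
            return len(self._text)

Returning a plain array lets sampled checks map positional offsets back to original samples, as the hunks below do.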
@@ -119,7 +119,7 @@ def run_logic(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
             text_outliers = np.concatenate([bottom_outliers, top_outliers])

             result[name] = {
-                'indices': [dataset.index[i] for i in text_outliers],
+                'indices': [dataset.get_original_text_indexes()[i] for i in text_outliers],
                 # For the upper and lower limits, don't show values that are smaller/larger
                 # than the actual values we have in the data
                 'lower_limit': max(lower_limit, min(values_arr)),
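`text_outliers` holds positional offsets into the sampled values, so the reported indices are recovered by looking each offset up in the original index array. A toy illustration (all values invented):

    import numpy as np

    original_indexes = np.array(['s0', 's1', 's2', 's3', 's4'])
    text_outliers = np.array([1, 4])  # positions flagged as outliers

    print([original_indexes[i] for i in text_outliers])  # ['s1', 's4']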
@@ -87,7 +87,7 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         if features.shape[1] < 2:
             raise DeepchecksNotSupportedError('Check requires meta data to have at least two columns in order to run.')
         # label is not used in the check, just here to avoid errors
-        dataset = Dataset(features, label=pd.Series(text_data.label, index=text_data.index), cat_features=cat_features)
+        dataset = Dataset(features, label=pd.Series(text_data.label), cat_features=cat_features)
         encoded_dataset = self._target_encode_categorical_features_fill_na(dataset, list(np.unique(text_data.label)))

         dummy_model = _DummyModel(test=encoded_dataset, y_pred_test=np.asarray(predictions),
deepchecks/nlp/context.py (52 changes: 29 additions & 23 deletions)
@@ -90,9 +90,9 @@ def __init__(self,

         if train is not None and test is not None:
             # check if datasets have same indexes
-            if set(train.index) & set(test.index):
-                train.reindex(list(map(lambda x: f'train-{x}', list(train.index))))
-                test.reindex(list(map(lambda x: f'test-{x}', list(test.index))))
+            if set(train.get_original_text_indexes()) & set(test.get_original_text_indexes()):
+                train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
+                test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
                 get_logger().warning('train and test datasets have common index - adding "train"/"test"'
                                      ' prefixes. To avoid that provide datasets with no common indexes '
                                      'or pass the model object instead of the predictions.')
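Note that the old code called `train.reindex(...)` without assigning the result; if that method behaved like pandas' non-mutating reindex, the prefixes were silently lost. Writing to `_original_text_index` makes the disambiguation stick. Continuing the simplified `TextData` sketch from above (data invented):

    train = TextData(['good film', 'bad film'])
    test = TextData(['great plot', 'weak plot'])

    # Both default to positional indexes [0, 1], so the sets intersect:
    assert set(train.get_original_text_indexes()) & set(test.get_original_text_indexes())

    # Same resolution as in the diff: prefix each side and store the result.
    train._original_text_index = np.asarray([f'train-{i}' for i in train.get_original_text_indexes()])
    test._original_text_index = np.asarray([f'test-{i}' for i in test.get_original_text_indexes()])
    print(train.get_original_text_indexes())  # ['train-0' 'train-1']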
@@ -108,26 +108,29 @@ def __init__(self,

             if dataset.task_type == TaskType.TEXT_CLASSIFICATION:
                 if (y_pred is None) and (y_proba is not None):
-                    if dataset.is_multilabel:
+                    if dataset.is_multi_label_classification():
                         y_pred = (np.array(y_proba) > 0.5)  # TODO: Replace with user-configurable threshold
                         y_pred = [np.array(model_classes)[pred] for pred in y_pred]
                     else:
                         y_pred = np.argmax(np.array(y_proba), axis=-1)
-                        y_pred = np.array(model_classes)[y_pred]
+                        y_pred = np.array(model_classes, dtype='str')[y_pred]

                 if y_pred is not None:
-                    y_pred = np.array(y_pred)
+                    if dataset.is_multi_label_classification():
+                        y_pred = np.array(y_pred)
+                    else:
+                        y_pred = np.array(y_pred, dtype='str')
                     if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
                         y_pred = y_pred[:, 0]
                     ensure_predictions_shape(y_pred, dataset.text)

                 if y_proba is not None:
                     ensure_predictions_proba(y_proba, y_pred)
-                    y_proba_dict = dict(zip(dataset.index, y_proba))
+                    y_proba_dict = dict(zip(dataset.get_original_text_indexes(), y_proba))
                     probas.update({dataset.name: y_proba_dict})

                 if y_pred is not None:
-                    y_pred_dict = dict(zip(dataset.index, y_pred))
+                    y_pred_dict = dict(zip(dataset.get_original_text_indexes(), y_pred))
                     predictions.update({dataset.name: y_pred_dict})

         self.predictions = predictions
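The two branches above derive hard predictions from probabilities differently: multilabel output is thresholded per class at 0.5, while multiclass output takes the row-wise argmax. A self-contained illustration (class names and numbers invented):

    import numpy as np

    model_classes = ['anger', 'fear', 'joy']

    # Multilabel: columns are independent per-class probabilities.
    y_proba = np.array([[0.9, 0.2, 0.7],
                        [0.1, 0.6, 0.3]])
    y_pred = y_proba > 0.5  # binary matrix, one row per sample
    print([list(np.array(model_classes)[row]) for row in y_pred])  # [['anger', 'joy'], ['fear']]

    # Multiclass: each row is a distribution; keep the most probable class.
    y_proba = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.3, 0.6]])
    print(np.array(model_classes, dtype='str')[np.argmax(y_proba, axis=-1)])  # ['anger' 'joy']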
@@ -148,20 +151,22 @@ def __init__(self,

     def _predict(self, data: TextData) -> TTextPred:  # TODO: Needs to receive list of strings, not TextData
         """Predict on given data by the data indexes."""
         if self.validate_data_on_predict:
-            data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
+            data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
             if not data_indices.issubset(self._prediction_indices[data.name]):
                 raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
                                            'predictions.')
-        return list(itemgetter(*data.index)(self.predictions[data.name]))  # pylint: disable=unsubscriptable-object
+        return list(itemgetter(*data.get_original_text_indexes())(
+            self.predictions[data.name]))  # pylint: disable=unsubscriptable-object

     def _predict_proba(self, data: TextData) -> TTextProba:  # TODO: Needs to receive list of strings, not TextData
         """Predict probabilities on given data by the data indexes."""
         if self.validate_data_on_predict:
-            data_indices = set(np.random.choice(data.index, min(100, len(data.index)), replace=False))
+            data_indices = set(np.random.choice(data.get_original_text_indexes(), min(100, len(data)), replace=False))
             if not data_indices.issubset(self._proba_indices[data.name]):
                 raise DeepchecksValueError('Data that has not been seen before passed for inference with pre computed '
                                            'probabilities.')
-        return list(itemgetter(*data.index)(self.probas[data.name]))  # pylint: disable=unsubscriptable-object
+        return list(itemgetter(*data.get_original_text_indexes())(
+            self.probas[data.name]))  # pylint: disable=unsubscriptable-object

     def fit(self, *args, **kwargs):
         """Just for python 3.6 (sklearn validates fit method)."""
@@ -192,7 +197,7 @@ def _validate_classification_prediction(dataset: TextData, prediction: TTextPred

     try:
         prediction = np.array(prediction)
-        if dataset.is_multilabel:
+        if dataset.is_multi_label_classification():
             prediction = prediction.astype(float)  # Multilabel prediction is a binary matrix
         else:
             prediction = prediction.reshape((-1, 1))  # Multiclass (not multilabel) prediction can be a string
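A small demonstration of the two normalizations above (sample arrays invented):

    import numpy as np

    # Multiclass: a flat list of string labels becomes a single-column matrix.
    prediction = np.array(['joy', 'anger', 'fear']).reshape((-1, 1))
    print(prediction.shape)  # (3, 1)

    # Multilabel: a binary matrix cast to float, so the later astype(bool)
    # round-trip can verify it contains only zeros and ones.
    prediction = np.array([[1, 0, 1], [0, 1, 0]]).astype(float)
    print(np.array_equal(prediction, prediction.astype(bool)))  # True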
@@ -201,7 +206,7 @@

     except ValueError as e:
         raise ValidationError(classification_format_error) from e
     pred_shape = prediction.shape
-    if dataset.is_multilabel:
+    if dataset.is_multi_label_classification():
         if len(pred_shape) == 1 or pred_shape[1] != n_classes:
             raise ValidationError(classification_format_error)
         if not np.array_equal(prediction, prediction.astype(bool)):
@@ -247,7 +252,7 @@ def _validate_proba(dataset: TextData, probabilities: TTextProba, n_classes: int

     if proba_shape[1] != n_classes:
         raise ValidationError(f'Check requires classification probabilities for {dataset.name} dataset '
                               f'to have {n_classes} columns, same as the number of classes')
-    if dataset.is_multilabel:
+    if dataset.is_multi_label_classification():
         if (probabilities > 1).any() or (probabilities < 0).any():
             raise ValidationError(f'Check requires classification probabilities for {dataset.name} '
                                   f'dataset to be between 0 and 1')
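The multilabel branch only range-checks each score because per-class probabilities come from independent sigmoid-style outputs and need not sum to anything; multiclass rows, by contrast, form a distribution (presumably validated in the else branch, outside this hunk). A tiny illustration (numbers invented):

    import numpy as np

    proba_multilabel = np.array([[0.9, 0.8, 0.1]])  # rows need not sum to 1
    proba_multiclass = np.array([[0.7, 0.2, 0.1]])  # rows form a distribution

    assert ((proba_multilabel >= 0) & (proba_multilabel <= 1)).all()
    assert np.allclose(proba_multiclass.sum(axis=1), 1.0)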
@@ -308,18 +313,19 @@ def __init__(

         # If both dataset, validate they fit each other
         if train_dataset and test_dataset:
             if test_dataset.has_label() and train_dataset.has_label() and not \
-                    TextData.datasets_share_task_type(train_dataset, test_dataset):
+                    train_dataset.validate_textdata_compatibility(test_dataset):
                 raise DatasetValidationError('train_dataset and test_dataset must share the same label and task type')
         if test_dataset and not train_dataset:
             raise DatasetValidationError('Can\'t initialize context with only test_dataset. if you have single '
                                          'dataset, initialize it as train_dataset')
-        if model_classes and len(model_classes) == 0:
-            raise DeepchecksValueError('Received empty model_classes')
-        if model_classes and sorted(model_classes) != model_classes:
-            supported_models_link = doclink(
-                'nlp-supported-predictions-format',
-                template='For more information please refer to the Supported Tasks guide {link}')
-            raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')
+        if model_classes is not None:
+            if (not is_sequence_not_str(model_classes)) or len(model_classes) == 0:
+                raise DeepchecksValueError('model_classes must be a non-empty sequence')
+            if sorted(model_classes) != model_classes:
+                supported_models_link = doclink(
+                    'nlp-supported-predictions-format',
+                    template='For more information please refer to the Supported Tasks guide {link}')
+                raise DeepchecksValueError(f'Received unsorted model_classes. {supported_models_link}')

         self._task_type = self.infer_task_type(train_dataset, test_dataset)
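The rewrite also fixes a dead check: an empty sequence is falsy, so `model_classes and len(model_classes) == 0` could never be True and empty input slipped through. The `is not None` guard reaches the length test even for an empty list:

    model_classes = []

    # Old guard: the empty list short-circuits the `and`, so this never fires.
    assert not (model_classes and len(model_classes) == 0)

    # New guard: an explicitly passed empty sequence is now rejected.
    if model_classes is not None and len(model_classes) == 0:
        print('rejected: model_classes must be a non-empty sequence')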
deepchecks/nlp/datasets/classification/tweet_emotion.py (44 changes: 30 additions & 14 deletions)
@@ -114,7 +114,7 @@ def load_data(data_format: str = 'TextData', as_train_test: bool = True,

             properties = None
         dataset = TextData(dataset.text, label=dataset[_target], task_type='text_classification',
                            metadata=dataset.drop(columns=[_target, 'text']),
-                           properties=properties, index=dataset.index)
+                           properties=properties)
         return dataset
     else:
         # train has more Sport and Customer Complaints but fewer Terror and Optimism samples
@@ -128,10 +128,10 @@

             train_properties, test_properties = None, None

         train = TextData(train.text, label=train[_target], task_type='text_classification',
-                         index=train.index, metadata=train.drop(columns=[_target, 'text']),
+                         metadata=train.drop(columns=[_target, 'text']),
                          properties=train_properties)
         test = TextData(test.text, label=test[_target], task_type='text_classification',
-                        index=test.index, metadata=test.drop(columns=[_target, 'text']),
+                        metadata=test.drop(columns=[_target, 'text']),
                         properties=test_properties)
         return train, test
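With the `index` keyword gone from the `TextData` constructor, callers get positional indexes by default. A usage sketch under the new signature (printed values depend on the hosted data):

    from deepchecks.nlp.datasets.classification import tweet_emotion

    train, test = tweet_emotion.load_data(data_format='TextData', as_train_test=True)
    print(train.get_original_text_indexes()[:3])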

@@ -147,7 +147,8 @@ def load_embeddings() -> np.ndarray:

     return pd.read_csv(_EMBEDDINGS_URL, index_col=0).to_numpy()


-def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array:
+def load_precalculated_predictions(pred_format: str = 'predictions',
+                                   as_train_test: bool = False) -> np.array:
     """Load and return precalculated predictions for the dataset.

     Parameters
@@ -156,7 +157,11 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array
         Represent the format of the returned value. Can be 'predictions' or 'probabilities'.
         'predictions' will return the predicted class for each sample.
         'probabilities' will return the predicted probabilities for each sample.
+    as_train_test : bool, default: False
+        If True, the returned data is split into train and test exactly like the toy model
+        was trained (call the load_fitted_model() function to get that model). The first
+        return value is the train data and the second is the test data.
+        Otherwise, returns a single object.

     Returns
     -------
     predictions : np.ndarray
@@ -165,16 +170,27 @@ def load_precalculated_predictions(pred_format: str = 'predictions') -> np.array
     """
     os.makedirs(ASSETS_DIR, exist_ok=True)
     if (ASSETS_DIR / 'tweet_emotion_probabilities.csv').exists():
-        preds = pd.read_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv', index_col=0)
+        all_preds = pd.read_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv', index_col=0)
     else:
-        preds = pd.read_csv(_PREDICTIONS_URL, index_col=0)
-        preds.to_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv')
-
-    preds = preds.to_numpy()
+        all_preds = pd.read_csv(_PREDICTIONS_URL, index_col=0)
+        all_preds.to_csv(ASSETS_DIR / 'tweet_emotion_probabilities.csv')
+    all_preds = all_preds.to_numpy()

     if pred_format == 'predictions':
-        return np.array([_LABEL_MAP[x] for x in np.argmax(preds, axis=1)])
-    elif pred_format == 'probabilities':
-        return preds
-    else:
+        all_preds = np.array([_LABEL_MAP[x] for x in np.argmax(all_preds, axis=1)])
+    elif pred_format != 'probabilities':
         raise ValueError('pred_format must be either "predictions" or "probabilities"')
+
+    if not as_train_test:
+        return all_preds
+
+    # Load indexes of train and test
+    if (ASSETS_DIR / 'tweet_emotion_data.csv').exists():
+        dataset = pd.read_csv(ASSETS_DIR / 'tweet_emotion_data.csv', index_col=0,
+                              usecols=['Unnamed: 0', 'train_test_split'])
+    else:
+        dataset = pd.read_csv(_FULL_DATA_URL, index_col=0, usecols=['Unnamed: 0', 'train_test_split'])
+
+    train_indexes = dataset[dataset['train_test_split'] == 'Train'].index
+    test_indexes = dataset[dataset['train_test_split'] == 'Test'].index
+    return all_preds[train_indexes], all_preds[test_indexes]
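A short usage sketch of the extended loader (array contents depend on the hosted CSVs):

    from deepchecks.nlp.datasets.classification.tweet_emotion import load_precalculated_predictions

    # Default behavior is unchanged: one array covering the full dataset.
    all_probas = load_precalculated_predictions(pred_format='probabilities')

    # New: recover the same train/test split the toy model was trained with.
    train_preds, test_preds = load_precalculated_predictions(pred_format='predictions',
                                                             as_train_test=True)
    print(len(train_preds), len(test_preds))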