remove label_name from dataset constructor (#456)

* remove label_name from dataset constructor * Add tests for dataset label * Update tests * Update docstring and from df takes name of column * Fix test * Change handle of numpy arrays and add tests * update identifier leakage notebook * change Hashable to typing.Hashable * Change order of if/else to prevent bug in python 3.6 with hashable
deepchecks · Jan 4, 2022 · b95dc48 · b95dc48
1 parent c917845
commit b95dc48
Show file tree

Hide file tree

Showing 55 changed files with 6,024 additions and 986 deletions.
diff --git a/deepchecks/base/dataset.py b/deepchecks/base/dataset.py
@@ -43,17 +43,18 @@ class Dataset:
     Args:
         df (pandas.DataFrame):
             A pandas DataFrame containing data relevant for the training or validating of a ML models.
-        label (pandas.Series)
-            A pandas series containing data of the labels. Will be joined to the data dataframe with the name
-            given by `label_name` parameter or 'target' by default.
+        label (t.Union[Hashable, pd.Series, pd.DataFrame, np.ndarray])
+            label column provided either as a string with the name of an existing column in the DataFrame or a label
+            object including the label data (pandas Series/DataFrame or a numpy array) that will be concatenated to the
+            data in the DataFrame. in case of label data the following logic is applied to set the label name:
+                - Series: takes the series name or 'target' if name is empty
+                - DataFrame: expect single column in the dataframe and use its name
+                - numpy: use 'target'
         features (Optional[Sequence[Hashable]]):
             List of names for the feature columns in the DataFrame.
         cat_features (Optional[Sequence[Hashable]]):
             List of names for the categorical features in the DataFrame. In order to disable categorical.
             features inference, pass cat_features=[]
-        label_name (Optional[Hashable]):
-            If `label` is given, then this name is used as the column name for the labels.
-            If `label` is none, then looks for this name in the data dataframe.
         index_name (Optional[Hashable]):
             Name of the index column in the dataframe. If set_index_from_dataframe_index is True and index_name
             is not None, index will be created from the dataframe index level with the given name. If index levels
@@ -103,10 +104,9 @@ class Dataset:
     def __init__(
             self,
             df: pd.DataFrame,
-            label: pd.Series = None,
+            label: t.Union[Hashable, pd.Series, pd.DataFrame, np.array] = None,
             features: t.Optional[t.Sequence[Hashable]] = None,
             cat_features: t.Optional[t.Sequence[Hashable]] = None,
-            label_name: t.Optional[Hashable] = None,
             index_name: t.Optional[Hashable] = None,
             set_index_from_dataframe_index: bool = False,
             datetime_name: t.Optional[Hashable] = None,
@@ -122,28 +122,59 @@ def __init__(
         self._data = df.copy()
 
         # Validations
-        if label is not None:
+        if label is None:
+            label_name = None
+        elif isinstance(label, (pd.Series, pd.DataFrame, np.ndarray)):
+            label_name = None
+            if isinstance(label, pd.Series):
+                # Set label name if exists
+                if label.name is not None:
+                    label_name = label.name
+                    if label_name in self._data.columns:
+                        raise DeepchecksValueError(f'Data has column with name "{label_name}", use pandas rename to'
+                                                   f' change label name or remove the column from the dataframe')
+            elif isinstance(label, pd.DataFrame):
+                # Validate shape
+                if label.shape[1] > 1:
+                    raise DeepchecksValueError('Label must have a single column')
+                # Set label name
+                label_name = label.columns[0]
+                label = label[label_name]
+                if label_name in self._data.columns:
+                    raise DeepchecksValueError(f'Data has column with name "{label_name}", change label column '
+                                               f'or remove the column from the data dataframe')
+            elif isinstance(label, np.ndarray):
+                # Validate label shape
+                if len(label.shape) > 2:
+                    raise DeepchecksValueError('Label must be either column vector or row vector')
+                elif len(label.shape) == 2:
+                    if all(x != 1 for x in label.shape):
+                        raise DeepchecksValueError('Label must be either column vector or row vector')
+                    label = np.squeeze(label)
+
+            # Validate length of label
             if label.shape[0] != self._data.shape[0]:
                 raise DeepchecksValueError('Number of samples of label and data must be equal')
-            if len(label.shape) > 1 and label.shape[1] != 1:
-                raise DeepchecksValueError('Label must be a column vector')
-            # Make tests to prevent overriding user column
+
+            # If no label found to set, then set default name
             if label_name is None:
                 label_name = 'target'
                 if label_name in self._data.columns:
-                    raise DeepchecksValueError(f'Data has column with name "{label_name}", use label_name parameter'
-                                               ' to set column name for label which does\'t exists in the data')
-            else:
-                if label_name in self._data.columns:
-                    raise DeepchecksValueError('Can\'t pass label with label_name that exists in the data. change '
-                                               'the label_name parameter')
-
-            # If passed label is a pandas object, check that indexes match, else set column as is with provided values
-            if isinstance(label, (pd.Series, pd.DataFrame)):
+                    raise DeepchecksValueError('Can\'t set default label name "target" since it already exists in '
+                                               'the dataframe. use pandas name parameter to give the label a '
+                                               'unique name')
+            # Set label data in dataframe
+            if isinstance(label, pd.Series):
                 pd.testing.assert_index_equal(self._data.index, label.index)
                 self._data[label_name] = label
             else:
                 self._data[label_name] = np.array(label).reshape(-1, 1)
+        elif isinstance(label, t.Hashable):
+            label_name = label
+            if label_name not in self._data.columns:
+                raise DeepchecksValueError(f'label column {label_name} not found in dataset columns')
+        else:
+            raise DeepchecksValueError(f'Unsupported type for label: {type(label).__name__}')
 
         # Assert that the requested index can be found
         if not set_index_from_dataframe_index:
@@ -188,9 +219,6 @@ def __init__(
                                                f' int or str, but found {type(index_name)}')
             self._datetime_column = self.get_datetime_column_from_index(datetime_name)
 
-        if label_name is not None and label_name not in self._data.columns:
-            raise DeepchecksValueError(f'label column {label_name} not found in dataset columns')
-
         if features:
             difference = set(features) - set(self._data.columns)
             if len(difference) > 0:
@@ -268,6 +296,7 @@ def from_numpy(
             cls: t.Type[TDataset],
             *args: np.ndarray,
             columns: t.Sequence[Hashable] = None,
+            label_name: t.Hashable = None,
             **kwargs
     ) -> TDataset:
         """Create Dataset instance from numpy arrays.
@@ -359,6 +388,8 @@ def from_numpy(
                 )
 
             labels_array = pd.Series(labels_array)
+            if label_name:
+                labels_array = labels_array.rename(label_name)
 
         return cls(
             df=pd.DataFrame(data=columns_array, columns=columns),
@@ -382,7 +413,7 @@ def copy(self: TDataset, new_data) -> TDataset:
 
         cls = type(self)
 
-        return cls(new_data, features=features, cat_features=cat_features, label_name=label_name,
+        return cls(new_data, features=features, cat_features=cat_features, label=label_name,
                    index_name=index, set_index_from_dataframe_index=self._set_index_from_dataframe_index,
                    datetime_name=date, set_datetime_from_dataframe_index=self._set_datetime_from_dataframe_index,
                    convert_datetime=False, max_categorical_ratio=self._max_categorical_ratio,

diff --git a/deepchecks/checks/distribution/whole_dataset_drift.py b/deepchecks/checks/distribution/whole_dataset_drift.py
@@ -132,7 +132,7 @@ def run(self, train_dataset, test_dataset, model=None) -> CheckResult:
 
         y_test.name = 'belongs_to_test'
         domain_test_dataset = Dataset(pd.concat([x_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1),
-                                      cat_features=cat_features, label_name='belongs_to_test')
+                                      cat_features=cat_features, label='belongs_to_test')
 
         # calculate feature importance of domain_classifier, containing the information which features separate
         # the dataset best.

diff --git a/deepchecks/checks/performance/segment_performance.py b/deepchecks/checks/performance/segment_performance.py
@@ -115,7 +115,7 @@ def run(self, dataset: Dataset, model: Any) -> CheckResult:
                 else:
                     score = scorer(model,
                                    Dataset(feature_2_df, features=dataset.features,
-                                           label_name=dataset.label_name, cat_features=dataset.cat_features))
+                                           label=dataset.label_name, cat_features=dataset.cat_features))
                 scores[i, j] = score
                 counts[i, j] = len(feature_2_df)
 

diff --git a/deepchecks/datasets/regression/avocado.py b/deepchecks/datasets/regression/avocado.py
@@ -123,16 +123,16 @@ def load_data(data_format: str = 'Dataset', as_train_test: bool = True) -> \
         dataset = pd.read_csv(_FULL_DATA_URL)
 
         if data_format == 'Dataset':
-            dataset = Dataset(dataset, label_name='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
+            dataset = Dataset(dataset, label='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
 
         return dataset
     else:
         train = pd.read_csv(_TRAIN_DATA_URL)
         test = pd.read_csv(_TEST_DATA_URL)
 
         if data_format == 'Dataset':
-            train = Dataset(train, label_name='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
-            test = Dataset(test, label_name='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
+            train = Dataset(train, label='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
+            test = Dataset(test, label='AveragePrice', cat_features=_CAT_FEATURES, datetime_name='Date')
 
         return train, test