make numerical type explicit + other edge case fixes (#1114)
* v0

* v0

* v0

* linting

* added_tests

* linting

* fix_more_split_bugs

* fix_more_split_bugs

* some_fixes

* typo
JKL98ISR committed Mar 28, 2022
1 parent 2e630e8 commit 535d0cb
Showing 17 changed files with 140 additions and 50 deletions.
@@ -116,6 +116,12 @@ def run_logic(self, context: Context) -> CheckResult:
values_dict = OrderedDict()
displays_dict = OrderedDict()
for column in train_dataset.features:
if column in train_dataset.numerical_features:
column_type = 'numerical'
elif column in train_dataset.cat_features:
column_type = 'categorical'
else:
continue # we only support categorical or numerical features
if features_importance is not None:
fi_rank_series = features_importance.rank(method='first', ascending=False)
fi_rank = fi_rank_series[column]
@@ -127,7 +133,7 @@ def run_logic(self, context: Context) -> CheckResult:
train_column=train_dataset.data[column],
test_column=test_dataset.data[column],
value_name=column,
column_type='categorical' if column in train_dataset.cat_features else 'numerical',
column_type=column_type,
plot_title=plot_title,
max_num_categories=self.max_num_categories
)
@@ -153,7 +159,8 @@ def run_logic(self, context: Context) -> CheckResult:
<br>If available, the plot titles also show the feature importance (FI) rank.
</span>"""

displays = [headnote] + [displays_dict[col] for col in columns_order]
displays = [headnote] + [displays_dict[col] for col in columns_order
if col in train_dataset.cat_features + train_dataset.numerical_features]

return CheckResult(value=values_dict, display=displays, header='Train Test Drift')
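
As an aside, a minimal sketch of why the explicit dispatch above matters: the previous one-liner labelled every non-categorical column as numerical, so an unsupported column (free text, datetimes, etc.) was routed into the numerical drift computation, while the new code simply skips it. The feature lists below are hypothetical.

cat_features = ['sex']
numerical_features = ['age']
features = ['age', 'sex', 'free_text']  # 'free_text' is neither categorical nor numerical

# old behaviour: anything not categorical was assumed to be numerical
old = {c: ('categorical' if c in cat_features else 'numerical') for c in features}
print(old['free_text'])    # 'numerical' -- would be sent to the numerical drift computation

# new behaviour: columns of unknown type are skipped
new = {}
for c in features:
    if c in numerical_features:
        new[c] = 'numerical'
    elif c in cat_features:
        new[c] = 'categorical'
print('free_text' in new)  # False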

@@ -93,11 +93,10 @@ def run_logic(self, context: Context) -> CheckResult:
test_dataset = context.test
features = train_dataset.features
cat_features = train_dataset.cat_features
numerical_features = train_dataset.numerical_features

sample_size = min(self.sample_size, train_dataset.n_samples, test_dataset.n_samples)

numerical_features = list(set(features) - set(cat_features))

headnote = """
<span>
The shown features are the features that are most important for the domain classifier - the
6 changes: 2 additions & 4 deletions deepchecks/tabular/checks/methodology/unused_features.py
@@ -254,15 +254,13 @@ def naive_encoder(dataset: Dataset) -> Tuple[TransformerMixin, list]:
Tuple[TransformerMixin, list]
A transformer object, a list of columns returned
"""
numeric_features = [col for col in dataset.features if col not in dataset.cat_features]

return ColumnTransformer(
transformers=[
('num', Pipeline([
('nan_handling', SimpleImputer()),
('norm', RobustScaler())
]),
numeric_features),
dataset.numerical_features),
('cat',
Pipeline([
('nan_handling', SimpleImputer(strategy='most_frequent')),
Expand All @@ -271,4 +269,4 @@ def naive_encoder(dataset: Dataset) -> Tuple[TransformerMixin, list]:
]),
dataset.cat_features)
]
), numeric_features + dataset.cat_features
), dataset.numerical_features + dataset.cat_features
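
For context, a self-contained sketch of the encoder layout above, with hypothetical column lists; the collapsed categorical encoding step is assumed here to be an ordinal encoder.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder

numerical_features = ['age', 'fare']   # hypothetical stand-ins for dataset.numerical_features
cat_features = ['sex']                 # ...and dataset.cat_features

encoder = ColumnTransformer(transformers=[
    ('num', Pipeline([('nan_handling', SimpleImputer()),
                      ('norm', RobustScaler())]), numerical_features),
    ('cat', Pipeline([('nan_handling', SimpleImputer(strategy='most_frequent')),
                      ('encode', OrdinalEncoder())]), cat_features),  # encoding step assumed
])

df = pd.DataFrame({'age': [22, np.nan, 35], 'fare': [7.3, 8.1, np.nan], 'sex': ['m', 'f', np.nan]})
print(encoder.fit_transform(df))
print(numerical_features + cat_features)  # column order of the transformed output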
@@ -9,6 +9,7 @@
# ----------------------------------------------------------------------------
#
"""The confusion_matrix_report check module."""
import pandas as pd
import sklearn
import plotly.express as px

@@ -46,10 +47,11 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResult:
model = context.model

y_pred = model.predict(ds_x)
total_classes = sorted(list(set(pd.concat([ds_y, pd.Series(y_pred)]).to_list())))
confusion_matrix = sklearn.metrics.confusion_matrix(ds_y, y_pred)

# Figure
fig = px.imshow(confusion_matrix, x=dataset.classes, y=dataset.classes, text_auto=True)
fig = px.imshow(confusion_matrix, x=total_classes, y=total_classes, text_auto=True)
fig.update_layout(width=600, height=600)
fig.update_xaxes(title='Predicted Value', type='category')
fig.update_yaxes(title='True value', type='category')
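
A toy illustration of the edge case handled here: when a class never appears in the predictions (or in the labels), the axis ticks must be the sorted union of both, which is also the row/column order sklearn.metrics.confusion_matrix uses when no explicit labels are given. The values below are made up.

import pandas as pd
import sklearn.metrics

ds_y = pd.Series([0, 1, 2, 2])   # class 2 exists in the labels...
y_pred = [0, 1, 1, 1]            # ...but is never predicted

total_classes = sorted(list(set(pd.concat([ds_y, pd.Series(y_pred)]).to_list())))
confusion_matrix = sklearn.metrics.confusion_matrix(ds_y, y_pred)

print(total_classes)             # [0, 1, 2]
print(confusion_matrix.shape)    # (3, 3) -- rows and columns follow the same sorted union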
14 changes: 9 additions & 5 deletions deepchecks/tabular/checks/performance/model_error_analysis.py
@@ -64,7 +64,12 @@ class ModelErrorAnalysis(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
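
As an aside, a hypothetical illustration of the labels note above using log_loss, another metric that consumes predict_proba output:

from sklearn.metrics import log_loss, make_scorer

# Suppose the model was trained on classes [0, 1, 2] but a given test split only contains [0, 1].
# Without labels=, log_loss infers two classes from y_true and rejects the three-column
# predict_proba output; passing the full training label set keeps the scorer working on such splits.
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])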
@@ -121,10 +126,9 @@ def run_logic(self, context: Context) -> CheckResult:
def scoring_func(dataset: Dataset):
return per_sample_mse(dataset.label_col, model.predict(dataset.features_columns))
else:
le = preprocessing.LabelEncoder()
le.fit(train_dataset.classes)

def scoring_func(dataset: Dataset):
le = preprocessing.LabelEncoder()
le.fit(dataset.classes)
encoded_label = le.transform(dataset.label_col)
return per_sample_cross_entropy(encoded_label,
model.predict_proba(dataset.features_columns))
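
A toy demonstration of why the encoder is now fitted once on the training classes rather than refitted per dataset: refitting on a split that is missing a class silently shifts the integer codes. The class names are made up.

from sklearn.preprocessing import LabelEncoder

train_classes = ['cat', 'dog', 'fish']  # hypothetical training classes
test_labels = ['dog', 'fish']           # 'cat' happens to be absent from the test split

shared = LabelEncoder().fit(train_classes)
print(shared.transform(test_labels))        # [1 2] -- codes consistent with training

per_dataset = LabelEncoder().fit(test_labels)
print(per_dataset.transform(test_labels))   # [0 1] -- 'dog' becomes 0, misaligning with predict_proba columns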
@@ -133,7 +137,7 @@ def scoring_func(dataset: Dataset):
test_scores = scoring_func(test_dataset)

cat_features = train_dataset.cat_features
numeric_features = [num_feature for num_feature in train_dataset.features if num_feature not in cat_features]
numeric_features = train_dataset.numerical_features

error_fi, error_model_predicted = model_error_contribution(train_dataset.features_columns,
train_scores,
15 changes: 10 additions & 5 deletions deepchecks/tabular/checks/performance/performance_report.py
@@ -48,7 +48,12 @@ class PerformanceReport(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
@@ -83,7 +88,6 @@ def run_logic(self, context: Context) -> CheckResult:

model = context.model
task_type = context.task_type
classes = train_dataset.classes

scorers = context.get_scorers(self.user_scorers, class_avg=False)
datasets = {'Train': train_dataset, 'Test': test_dataset}
@@ -93,6 +97,7 @@
results = []

for dataset_name, dataset in datasets.items():
classes = dataset.classes
label = cast(pd.Series, dataset.label_col)
n_samples = label.groupby(label).count()
results.extend(
@@ -197,14 +202,14 @@ def condition(check_result: pd.DataFrame) -> ConditionResult:
test_scores_dict = dict(zip(test_scores_class['Metric'], test_scores_class['Value']))
train_scores_dict = dict(zip(train_scores_class['Metric'], train_scores_class['Value']))
# Calculate percentage of change from train to test
diff = {score_name: _ratio_of_change_calc(score, test_scores_dict[score_name])
diff = {score_name: _ratio_of_change_calc(score, test_scores_dict.get(score_name, 0))
for score_name, score in train_scores_dict.items()}
failed_scores = [k for k, v in diff.items() if v > threshold]
if failed_scores:
for score_name in failed_scores:
explained_failures.append(f'{score_name} for class {class_name} '
f'(train={format_number(train_scores_dict[score_name])} '
f'test={format_number(test_scores_dict[score_name])})')
f'(train={format_number(train_scores_dict.get(score_name, 0))} '
f'test={format_number(test_scores_dict.get(score_name, 0))})')
else:
test_scores_dict = dict(zip(test_scores['Metric'], test_scores['Value']))
train_scores_dict = dict(zip(train_scores['Metric'], train_scores['Value']))
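
A small sketch of the edge case that the .get(..., 0) default covers: a metric that exists for a class on the train split but is missing for that class on the test split. The numbers are made up and the ratio function is only a stand-in for _ratio_of_change_calc.

train_scores_dict = {'Precision': 0.90, 'Recall': 0.80}
test_scores_dict = {'Precision': 0.85}   # 'Recall' is missing for this class on the test split

def ratio_of_change(train_score, test_score):  # stand-in for _ratio_of_change_calc
    return abs(train_score - test_score) / abs(train_score)

# test_scores_dict['Recall'] would raise KeyError; .get(..., 0) treats the missing score as 0,
# so the condition reports maximal degradation for that class instead of crashing
diff = {name: ratio_of_change(score, test_scores_dict.get(name, 0))
        for name, score in train_scores_dict.items()}
print(diff)   # {'Precision': 0.055..., 'Recall': 1.0}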
6 changes: 2 additions & 4 deletions deepchecks/tabular/checks/performance/segment_performance.py
@@ -14,7 +14,7 @@
import numpy as np
import plotly.figure_factory as ff

from deepchecks.tabular import Context, SingleDatasetCheck, Dataset
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.core import CheckResult
from deepchecks.core.errors import DeepchecksValueError, DatasetValidationError
from deepchecks.utils.performance.partition import partition_column
@@ -111,9 +111,7 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResult:
if feature_2_df.empty:
score = np.NaN
else:
score = scorer(model,
Dataset(feature_2_df, features=dataset.features,
label=dataset.label_name, cat_features=dataset.cat_features))
score = scorer(model, dataset.copy(feature_2_df))
scores[i, j] = score
counts[i, j] = len(feature_2_df)

@@ -64,7 +64,12 @@ class SimpleModelComparison(TrainTestCheck):
.. code-block:: python
from sklearn.metrics import roc_auc_score, make_scorer
auc_scorer = make_scorer(roc_auc_score)
training_labels = [1, 2, 3]
auc_scorer = make_scorer(roc_auc_score, labels=training_labels, multi_class='ovr')
# Note that the labels parameter is required for multi-class classification in metrics like roc_auc_score or
# log_loss that use the predict_proba function of the model, in case that not all labels are present in the test
# set.
Or you can implement your own:
@@ -135,7 +140,7 @@ def run_logic(self, context: Context) -> CheckResult:
# Multiclass have different return type from the scorer, list of score per class instead of single score
if task_type in [ModelType.MULTICLASS, ModelType.BINARY]:
n_samples = test_label.groupby(test_label).count()
classes = train_dataset.classes
classes = test_dataset.classes

results_array = []
# Dict in format { Scorer : Dict { Class : Dict { Origin/Simple : score } } }
40 changes: 28 additions & 12 deletions deepchecks/tabular/dataset.py
@@ -20,7 +20,7 @@
from sklearn.model_selection import train_test_split

from deepchecks.utils.dataframes import select_from_dataframe
from deepchecks.utils.features import is_categorical, infer_categorical_features
from deepchecks.utils.features import infer_numerical_features, is_categorical, infer_categorical_features
from deepchecks.utils.typing import Hashable
from deepchecks.core.errors import DeepchecksValueError, DatasetValidationError, DeepchecksNotSupportedError

@@ -285,6 +285,9 @@ def __init__(
else:
self._label_type = None

unassigned_cols = [col for col in self._features if col not in self._cat_features]
self._numerical_features = infer_numerical_features(self._data[unassigned_cols])

@classmethod
def from_numpy(
cls: t.Type[TDataset],
@@ -463,16 +466,6 @@ def n_samples(self) -> int:
"""
return self.data.shape[0]

def __len__(self) -> int:
"""Return number of samples in the member dataframe.
Returns
-------
int
"""
return self.n_samples

@property
def label_type(self) -> t.Optional[str]:
"""Return the label type.
@@ -736,6 +729,17 @@ def cat_features(self) -> t.List[Hashable]:
"""
return list(self._cat_features)

@property
def numerical_features(self) -> t.List[Hashable]:
"""Return list of numerical feature names.
Returns
-------
t.List[Hashable]
List of numerical feature names.
"""
return list(self._numerical_features)

@property
@lru_cache(maxsize=128)
def classes(self) -> t.Tuple[str, ...]:
@@ -770,8 +774,10 @@ def columns_info(self) -> t.Dict[Hashable, str]:
elif column in self._features:
if column in self.cat_features:
value = 'categorical feature'
else:
elif column in self.numerical_features:
value = 'numerical feature'
else:
value = 'other feature'
else:
value = 'other'
columns[column] = value
@@ -1043,3 +1049,13 @@ def datasets_share_date(cls, *datasets: 'Dataset') -> bool:
return False

return True

def __len__(self) -> int:
"""Return number of samples in the member dataframe.
Returns
-------
int
"""
return self.n_samples
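
A hedged end-to-end sketch of the new property, assuming the usual Dataset(df, label=..., cat_features=...) construction and hypothetical columns: a feature that is neither categorical nor numerical stays out of both lists and is reported by columns_info as 'other feature'.

import pandas as pd
from deepchecks.tabular import Dataset

df = pd.DataFrame({
    'age': [23, 41, 35, 52],
    'city': ['TLV', 'NYC', 'LDN', 'TLV'],
    'note': ['first', 'second', 'third', 'fourth'],  # free text: neither categorical nor numerical
    'label': [0, 1, 0, 1],
})
ds = Dataset(df, label='label', cat_features=['city'])

print(ds.cat_features)        # ['city']
print(ds.numerical_features)  # ['age'] -- 'note' stays out of both lists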
28 changes: 27 additions & 1 deletion deepchecks/utils/features.py
@@ -21,7 +21,7 @@

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.dtypes.common import is_float_dtype, is_numeric_dtype
from sklearn.inspection import permutation_importance

from deepchecks import tabular
@@ -38,6 +38,7 @@
'column_importance_sorter_dict',
'column_importance_sorter_df',
'infer_categorical_features',
'infer_numerical_features',
'is_categorical',
'N_TOP_MESSAGE'
]
@@ -380,6 +381,31 @@ def column_importance_sorter_df(
return df


def infer_numerical_features(df: pd.DataFrame) -> t.List[Hashable]:
"""Infers which features are numerical.
Parameters
----------
df : pd.DataFrame
dataframe for which to infer numerical features
Returns
-------
List[Hashable]
list of numerical features
"""
columns = df.columns
numerical_columns = []
for col in columns:
col_data = df[col]
if col_data.dtype == 'object':
# object might still be only floats, so we reset the dtype
col_data = pd.Series(col_data.to_list())
if is_numeric_dtype(col_data):
numerical_columns.append(col)
return numerical_columns
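
A quick usage sketch of the helper above, on a hypothetical frame: an object-dtype column that actually holds floats is still picked up thanks to the to_list() round-trip, while a plain string column is not.

import pandas as pd
from deepchecks.utils.features import infer_numerical_features

df = pd.DataFrame({
    'age': [23, 41, 35],
    'score': pd.Series([1.7, None, 2.5], dtype='object'),  # floats hidden behind an object dtype
    'city': ['TLV', 'NYC', 'LDN'],
})
print(infer_numerical_features(df))  # expected: ['age', 'score']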


def infer_categorical_features(
df: pd.DataFrame,
max_categorical_ratio: float = 0.01,
2 changes: 1 addition & 1 deletion deepchecks/utils/performance/error_model.py
@@ -216,7 +216,7 @@ def error_model_display(error_fi: pd.Series,
labels={error_col_name: 'model error'}, color=color_col,
color_discrete_map=color_map
))
else:
elif feature in dataset.numerical_features:
# sample data for display
np.random.seed(random_state)
sampling_idx = np.random.choice(range(len(data)), size=n_samples_display, replace=False)
4 changes: 2 additions & 2 deletions deepchecks/utils/performance/partition.py
@@ -111,7 +111,7 @@ def partition_column(
List[DeepchecksFilter]
"""
column = dataset.data[column_name]
if column_name not in dataset.cat_features:
if column_name in dataset.numerical_features:
percentile_values = numeric_segmentation_edges(column, max_segments)
# If for some reason only single value in the column (and column not categorical) we will get single item
if len(percentile_values) == 1:
@@ -131,7 +131,7 @@

filters.append(DeepchecksFilter(f, label))
return filters
else:
elif column_name in dataset.cat_features:
# Get sorted histogram
cat_hist_dict = column.value_counts()
# Get index of last value in histogram to show
@@ -31,12 +31,6 @@
# Classification Model
# ====================

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

model = adult.load_fitted_model()

#%%
3 changes: 2 additions & 1 deletion spelling-allowlist.txt
@@ -70,4 +70,5 @@ quantiles
toplevel
ious
renormalization
jsonpickle
jsonpickle
dtype
