Commit

[Issue-1560] fix effect of display control parameters on `CheckResult.value` (#1680)

* fix effect of display control parameters on `CheckResult.value`
yromanyshyn committed Jun 29, 2022
1 parent f2ef42b commit 7cf93ae
Showing 14 changed files with 554 additions and 205 deletions.
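Across the changed checks the fix follows one recurring pattern: display-control parameters such as `n_top_columns` or `n_to_show` no longer truncate the returned `CheckResult.value`. The full result is always returned, and truncation, sorting and figure building are applied only to a separate display copy, usually gated behind `context.with_display`. A minimal runnable sketch of that pattern (the `_Context`/`_CheckResult` stand-ins and the `run_columns_info` helper are illustrative, not the library's actual classes):

import pandas as pd


class _Context:
    """Stand-in for deepchecks' Context, carrying only the display flag."""
    def __init__(self, with_display=True):
        self.with_display = with_display


class _CheckResult:
    """Stand-in for deepchecks' CheckResult: `value` is the full result, `display` is cosmetic."""
    def __init__(self, value, display=None):
        self.value = value
        self.display = display or []


def run_columns_info(columns_info, context, n_top_columns):
    """Return every column in the value; cap only the displayed table."""
    display = []
    if context.with_display:
        to_display = dict(list(columns_info.items())[:n_top_columns])
        display.append(pd.DataFrame.from_dict(to_display, orient='index', columns=['role']).transpose())
    return _CheckResult(columns_info, display=display)


result = run_columns_info(
    {'target': 'label', 'a': 'numerical feature', 'b': 'numerical feature', 'c': 'categorical feature'},
    _Context(with_display=True),
    n_top_columns=2,
)
assert len(result.value) == 4    # the value keeps all four columns
assert len(result.display) == 1  # but only a two-column table is built for display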
19 changes: 15 additions & 4 deletions deepchecks/tabular/checks/data_integrity/columns_info.py
@@ -41,9 +41,20 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
display a table of the dictionary.
"""
dataset = context.get_data_by_kind(dataset_kind)
value = dataset.columns_info
value = column_importance_sorter_dict(value, dataset, context.feature_importance, self.n_top_columns)
df = pd.DataFrame.from_dict(value, orient='index', columns=['role'])
columns_info = dataset.columns_info
columns_info = column_importance_sorter_dict(columns_info, dataset, context.feature_importance)

columns_info_to_display = (
columns_info
if len(columns_info) <= self.n_top_columns
else dict(list(columns_info.items())[:self.n_top_columns])
)

df = pd.DataFrame.from_dict(columns_info_to_display, orient='index', columns=['role'])
df = df.transpose()

return CheckResult(value, header='Columns Info', display=[N_TOP_MESSAGE % self.n_top_columns, df])
return CheckResult(
columns_info,
header='Columns Info',
display=[N_TOP_MESSAGE % self.n_top_columns, df]
)
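With the change above, the dictionary returned as the value is no longer cut down to `n_top_columns`; only the rendered table is. For example (a sketch assuming an iris-style dataset loaded through scikit-learn, mirroring the updated tests further down):

from sklearn.datasets import load_iris

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import ColumnsInfo

iris = load_iris(as_frame=True).frame
dataset = Dataset(iris, label='target')

result = ColumnsInfo(n_top_columns=4).run(dataset)
# The value now holds all five columns (the label plus four features)...
assert len(result.value) == 5
# ...while the displayed table is still limited to the top four columns.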
9 changes: 5 additions & 4 deletions deepchecks/tabular/checks/data_integrity/special_chars.py
@@ -77,11 +77,12 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
special_samples = _get_special_samples(column_data)
if special_samples:
result[column_name] = sum(special_samples.values()) / column_data.size
percent = format_percent(sum(special_samples.values()) / column_data.size)
top_n_samples_items = \
sorted(special_samples.items(), key=lambda x: x[1], reverse=True)[:self.n_most_common]
top_n_samples_values = [item[0] for item in top_n_samples_items]
if context.with_display:
percent = format_percent(sum(special_samples.values()) / column_data.size)
sortkey = lambda x: x[1]
top_n_samples_items = sorted(special_samples.items(), key=sortkey, reverse=True)
top_n_samples_items = top_n_samples_items[:self.n_most_common]
top_n_samples_values = [item[0] for item in top_n_samples_items]
display_array.append([column_name, percent, top_n_samples_values])
else:
result[column_name] = 0
@@ -39,6 +39,7 @@ class PerformanceReport(TrainTestCheck, ReduceMixin):
reduce: Union[Callable, str], default: 'mean'
An optional argument only used for the reduce_output function when using
non-average scorers.
Notes
-----
Scorers are a convention of sklearn to evaluate a model.
@@ -49,7 +49,7 @@ def __init__(
super().__init__(**kwargs)
self.columns = columns
self.ignore_columns = ignore_columns
self.max_features_to_show = max_features_to_show
self.max_features_to_show = max_features_to_show # TODO: attr is not used, remove it
self.max_new_categories_to_show = max_new_categories_to_show

def run_logic(self, context: Context) -> CheckResult:
45 changes: 23 additions & 22 deletions deepchecks/vision/checks/model_evaluation/class_performance.py
@@ -111,33 +111,34 @@ def compute(self, context: Context) -> CheckResult:

results_df = pd.concat(results)
results_df = results_df[['Dataset', 'Metric', 'Class', 'Class Name', 'Number of samples', 'Value']]
if self.class_list_to_show is not None:
results_df = results_df.loc[results_df['Class'].isin(self.class_list_to_show)]
elif self.n_to_show is not None:
classes_to_show = filter_classes_for_display(results_df,
self.metric_to_show_by,
self.n_to_show,
self.show_only)
results_df = results_df.loc[results_df['Class'].isin(classes_to_show)]

results_df = results_df.sort_values(by=['Dataset', 'Value'], ascending=False)

if context.with_display:
fig = px.histogram(
results_df,
x='Class Name',
y='Value',
color='Dataset',
color_discrete_sequence=(plot.colors['Train'], plot.colors['Test']),
barmode='group',
facet_col='Metric',
facet_col_spacing=0.05,
hover_data=['Number of samples'],

)
if self.class_list_to_show is not None:
display_df = results_df.loc[results_df['Class'].isin(self.class_list_to_show)]
elif self.n_to_show is not None:
rows = results_df['Class'].isin(filter_classes_for_display(
results_df,
self.metric_to_show_by,
self.n_to_show,
self.show_only
))
display_df = results_df.loc[rows]
else:
display_df = results_df

fig = (
fig.update_xaxes(title='Class', type='category')
px.histogram(
display_df,
x='Class Name',
y='Value',
color='Dataset',
color_discrete_sequence=(plot.colors['Train'], plot.colors['Test']),
barmode='group',
facet_col='Metric',
facet_col_spacing=0.05,
hover_data=['Number of samples'])
.update_xaxes(title='Class', type='category')
.update_yaxes(title='Value', matches=None)
.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
@@ -162,16 +162,24 @@ def compute(self, context: Context, dataset_kind: DatasetKind) -> CheckResult:
result_value[property_name].append(single_bin)

display_df = pd.DataFrame(display_data)

if display_df.empty:
return CheckResult(value=dict(result_value))

first_metric = display_df['Metric'][0]

if self.alternative_metrics is None:
display_df = display_df[display_df['Metric'] == first_metric]
top_properties = display_df[display_df['Metric'] == first_metric] \
.groupby('Property')[['Value']] \
.agg(np.ptp).sort_values('Value', ascending=False).head(self.n_to_show) \

top_properties = (
display_df[display_df['Metric'] == first_metric]
.groupby('Property')[['Value']]
.agg(np.ptp).sort_values('Value', ascending=False).head(self.n_to_show)
.reset_index()['Property']
)

display_df = display_df[display_df['Property'].isin(top_properties)]

fig = px.bar(
display_df,
x='Range',
@@ -160,39 +160,41 @@ def compute(self, context: Context) -> CheckResult:
results_df = pd.concat(results)
results_df = results_df[['Model', 'Metric', 'Class', 'Class Name', 'Number of samples', 'Value']]

if not self.metric_to_show_by:
self.metric_to_show_by = list(self._test_metrics.keys())[0]

if self.class_list_to_show is not None:
results_df = results_df.loc[results_df['Class'].isin(self.class_list_to_show)]
elif self.n_to_show is not None:
classes_to_show = filter_classes_for_display(results_df.loc[results_df['Model'] != 'Perfect Model'],
self.metric_to_show_by,
self.n_to_show,
self.show_only,
column_to_filter_by='Model',
column_filter_value='Given Model')
results_df = results_df.loc[results_df['Class'].isin(classes_to_show)]

results_df = results_df.dropna()
results_df = results_df.sort_values(by=['Model', 'Value'], ascending=False).reset_index(drop=True)
results_df.dropna(inplace=True)
results_df.sort_values(by=['Model', 'Value'], ascending=False, inplace=True)
results_df.reset_index(drop=True, inplace=True)

if context.with_display:
fig = px.histogram(
results_df.loc[results_df['Model'] != 'Perfect Model'],
x='Class Name',
y='Value',
color='Model',
color_discrete_sequence=(plot.colors['Generated'], plot.colors['Baseline']),
barmode='group',
facet_col='Metric',
facet_col_spacing=0.05,
hover_data=['Number of samples'],
title=f'Simple Model (Strategy: {self.strategy}) vs. Given Model',
)
if not self.metric_to_show_by:
self.metric_to_show_by = list(self._test_metrics.keys())[0]
if self.class_list_to_show is not None:
display_df = results_df.loc[results_df['Class'].isin(self.class_list_to_show)]
elif self.n_to_show is not None:
rows = results_df['Class'].isin(filter_classes_for_display(
results_df.loc[results_df['Model'] != 'Perfect Model'],
self.metric_to_show_by,
self.n_to_show,
self.show_only,
column_to_filter_by='Model',
column_filter_value='Given Model'
))
display_df = results_df.loc[rows]
else:
display_df = results_df

fig = (
fig.update_xaxes(title=None, type='category')
px.histogram(
display_df.loc[results_df['Model'] != 'Perfect Model'],
x='Class Name',
y='Value',
color='Model',
color_discrete_sequence=(plot.colors['Generated'], plot.colors['Baseline']),
barmode='group',
facet_col='Metric',
facet_col_spacing=0.05,
hover_data=['Number of samples'],
title=f'Simple Model (Strategy: {self.strategy}) vs. Given Model')
.update_xaxes(title=None, type='category')
.update_yaxes(title=None, matches=None)
.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
@@ -123,6 +123,8 @@ def compute(self, context: Context) -> CheckResult:
display = []
similar_pairs = []
if similar_indices['test']:

# TODO: this for loop should be below `if context.with_display:` branch
for similar_index in display_indices:
for dataset in ('train', 'test'):
image = data_obj[dataset].batch_to_images(
11 changes: 7 additions & 4 deletions tests/base/to_json_test.py
@@ -38,9 +38,12 @@ def test_check_full_suite_not_failing(iris_split_dataset_and_model):
def test_check_metadata(iris_dataset):
check_res = ColumnsInfo(n_top_columns=4).run(iris_dataset)
json_res = jsonpickle.loads(check_res.to_json())
assert_that(json_res['value'], equal_to({'target': 'label',
'sepal length (cm)': 'numerical feature',
'sepal width (cm)': 'numerical feature',
'petal length (cm)': 'numerical feature'}))
assert_that(json_res['value'], equal_to({
'target': 'label',
'sepal length (cm)': 'numerical feature',
'sepal width (cm)': 'numerical feature',
'petal length (cm)': 'numerical feature',
'petal width (cm)': 'numerical feature'
}))
assert_that(json_res['check']['name'], equal_to('Columns Info'))
assert_that(json_res['check']['params'], equal_to({'n_top_columns': 4}))
25 changes: 24 additions & 1 deletion tests/common.py
@@ -14,7 +14,8 @@
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from hamcrest import any_of, instance_of
from hamcrest import all_of, any_of, contains_exactly, contains_string, equal_to, has_property, instance_of
from plotly.graph_objects import Histogram

from deepchecks.core.check_result import CheckFailure, CheckResult, DisplayMap
from deepchecks.core.checks import BaseCheck
@@ -139,3 +140,25 @@ def instance_of_ipython_formatter():
instance_of(IPythonDisplayFormatter),
instance_of(MimeBundleFormatter),
)


def assert_class_performance_display(
xaxis, # classes
yaxis, # values
metrics=('Recall', 'Precision'),
):
pairs = (
(dataset, metric)
for dataset in ('Train', 'Test')
for metric in metrics
)
return contains_exactly(*[
all_of(
instance_of(Histogram),
has_property('name', equal_to(dataset)),
has_property('hovertemplate', contains_string(f'Metric={metric}')),
has_property('x', xaxis[index]),
has_property('y', yaxis[index]),
)
for index, (dataset, metric) in enumerate(pairs)
])
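A hedged example of how this new matcher could be constructed (the class names and metric values are placeholders, not taken from this commit); the matcher is then asserted against the tuple of Histogram traces in the figure a class-performance check displays:

from tests.common import assert_class_performance_display

# One entry per (dataset, metric) trace, in the order the helper enumerates them:
# Train/Recall, Train/Precision, Test/Recall, Test/Precision.
matcher = assert_class_performance_display(
    xaxis=[['cat', 'dog']] * 4,            # class names on the x axis of each trace
    yaxis=[[0.9, 0.8], [0.85, 0.75],       # Train: Recall, Precision (placeholder values)
           [0.7, 0.6], [0.65, 0.55]],      # Test: Recall, Precision (placeholder values)
)
# e.g. assert_that(result.display[0].data, matcher) inside a check test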
2 changes: 1 addition & 1 deletion tests/tabular/checks/integrity/columns_info_test.py
@@ -67,7 +67,7 @@ def test_fi_n_top(diabetes_split_dataset_and_model):
# Act
result_ds = check.run(train, clf).value
# Assert
assert_that(result_ds, has_length(3))
assert_that(result_ds, has_length(10))

def test_other_feature(kiss_dataset_and_model):
train, _, clf = kiss_dataset_and_model