Skip to content

Commit

Permalink
[Issue-1542] MixedNull display table order (#1679)
Browse files Browse the repository at this point in the history
* changed MixedNulls display table order

Co-authored-by: Itay Gabbay <itay@deepchecks.com>
  • Loading branch information
yromanyshyn and ItayGabbay committed Jun 27, 2022
1 parent fbfcaa0 commit d1ac880
Show file tree
Hide file tree
Showing 10 changed files with 28 additions and 40 deletions.
13 changes: 8 additions & 5 deletions deepchecks/tabular/checks/data_integrity/mixed_nulls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.tabular.utils.messages import get_condition_passed_message
from deepchecks.utils.dataframes import select_from_dataframe
from deepchecks.utils.features import N_TOP_MESSAGE, column_importance_sorter_df
from deepchecks.utils.features import N_TOP_MESSAGE
from deepchecks.utils.strings import format_percent, string_baseform
from deepchecks.utils.typing import Hashable

Expand Down Expand Up @@ -86,8 +86,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
for column_name in list(df.columns):
column_data = df[column_name]

string_null_counts = {value: count for value, count in column_data.value_counts(dropna=True).iteritems()
if string_baseform(value) in null_string_list}
string_null_counts = {
repr(value).replace('\'', '"'): count
for value, count in column_data.value_counts(dropna=True).iteritems()
if string_baseform(value) in null_string_list
}
nan_data_counts = column_data[column_data.isna()].apply(nan_type).value_counts().to_dict()
null_counts = {**string_null_counts, **nan_data_counts}

Expand All @@ -101,9 +104,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
# Create dataframe to display table
if context.with_display and display_array:
df_graph = pd.DataFrame(display_array, columns=['Column Name', 'Value', 'Count', 'Percent of data'])
order = df_graph['Column Name'].value_counts(ascending=False).index[:self.n_top_columns]
df_graph = df_graph.set_index(['Column Name', 'Value'])
df_graph = column_importance_sorter_df(df_graph, dataset, context.feature_importance,
self.n_top_columns, col='Column Name')
df_graph = df_graph.loc[order, :]
display = [N_TOP_MESSAGE % self.n_top_columns, df_graph]
else:
display = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# ----------------------------------------------------------------------------
#
"""Tests for Feature Feature Correlation check"""
from hamcrest import assert_that, calling, contains_exactly, contains_inanyorder, equal_to, has_items, has_length, raises
from hamcrest import (assert_that, calling, contains_exactly, contains_inanyorder, equal_to, has_items, has_length,
raises)

from deepchecks.tabular.checks.data_integrity.feature_feature_correlation import FeatureFeatureCorrelation
from deepchecks.tabular.dataset import Dataset
Expand Down
3 changes: 2 additions & 1 deletion tests/tabular/checks/integrity/mixed_data_types_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import numpy as np
import pandas as pd
# Disable wildcard import check for hamcrest
from hamcrest import assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items, has_length, raises
from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items,
has_length, raises)

from deepchecks.core import ConditionCategory
from deepchecks.core.errors import DeepchecksValueError
Expand Down
27 changes: 4 additions & 23 deletions tests/tabular/checks/integrity/mixed_nulls_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"""Tests for Mixed Nulls check"""
import numpy as np
import pandas as pd
from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items, has_length, is_,
raises)
from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items,
has_length, is_, raises)

from deepchecks.core.errors import DatasetValidationError, DeepchecksValueError
from deepchecks.tabular.checks.data_integrity.mixed_nulls import MixedNulls
Expand All @@ -36,7 +36,7 @@ def test_single_column_one_null_type():
dataframe = pd.DataFrame(data=data)
# Act
result = MixedNulls().run(dataframe)
assert_that(result.value, equal_to({'col1': {'null': {'count': 2, 'percent': 0.5}}}))
assert_that(result.value, equal_to({'col1': {'"null"': {'count': 2, 'percent': 0.5}}}))
assert_that(result.display, has_length(greater_than(0)))


Expand All @@ -46,7 +46,7 @@ def test_single_column_one_null_type_without_display():
dataframe = pd.DataFrame(data=data)
# Act
result = MixedNulls().run(dataframe, with_display=False)
assert_that(result.value, equal_to({'col1': {'null': {'count': 2, 'percent': 0.5}}}))
assert_that(result.value, equal_to({'col1': {'"null"': {'count': 2, 'percent': 0.5}}}))
assert_that(result.display, has_length(0))


Expand Down Expand Up @@ -241,22 +241,3 @@ def test_condition_max_nulls_passed():
details='Passed for 1 relevant column',
name='Number of different null types is less or equal to 10')
))


def test_fi_n_top(diabetes_split_dataset_and_model):
    """Check that the display table is limited by ``n_top_columns``.

    Four columns are overwritten with two different null-like strings each,
    then the check is run with ``n_top_columns=3`` and the displayed table
    is asserted to have length 3.
    """
    # Arrange - the fixture is unpacked as a 3-tuple; the middle element is unused here
    train, _, clf = diabetes_split_dataset_and_model
    train = Dataset(train.data.copy(), label='target', cat_features=['sex'])
    for polluted_column in ('age', 'bmi', 'bp', 's1'):
        # Rows whose index is 0 (mod 4) get 'Nan', rows whose index is 1 (mod 4) get 'null'
        train.data.loc[train.data.index % 4 == 0, polluted_column] = 'Nan'
        train.data.loc[train.data.index % 4 == 1, polluted_column] = 'null'
    check = MixedNulls(n_top_columns=3)

    # Act
    result = check.run(train, clf)

    # Assert - the second display item (the table) has length 3
    displayed_table = result.display[1]
    assert_that(displayed_table, has_length(3))
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_binary_model_info_object(iris_dataset_single_class_labeled, iris_random
# Act X
result = check.run(iris_dataset_single_class_labeled, iris_random_forest_single_class).value
# Assert
assert_that(result, has_length(1))
assert_that(result, has_length(1))

assert_that(result, has_entries({
0: close_to(0.0002, 0.0005)
Expand All @@ -103,7 +103,7 @@ def test_binary_string_model_info_object(iris_binary_string_split_dataset_and_mo
# Act X
result = check.run(test_ds, clf).value
# Assert
assert_that(result, has_length(1))
assert_that(result, has_length(1))

assert_that(result, has_entries({
0: close_to(0.04, 0.001)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# ----------------------------------------------------------------------------
#
"""Tests for segment performance check."""
from hamcrest import assert_that, calling, close_to, equal_to, greater_than, has_entries, has_length, has_property, raises
from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_length, has_property,
raises)

from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksValueError
from deepchecks.tabular.checks.model_evaluation.segment_performance import SegmentPerformance
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# ----------------------------------------------------------------------------
#
"""Contains unit tests for the confusion_matrix_report check."""
from hamcrest import assert_that, calling, close_to, greater_than, has_entries, has_entry, has_items, has_length, is_, raises
from hamcrest import (assert_that, calling, close_to, greater_than, has_entries, has_entry, has_items, has_length, is_,
raises)
from sklearn.metrics import f1_score, make_scorer, recall_score

from deepchecks.core.errors import DeepchecksValueError
Expand Down
4 changes: 2 additions & 2 deletions tests/utils/partition_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from hamcrest import assert_that, equal_to
from sklearn.tree import DecisionTreeRegressor

from deepchecks.utils.performance.partition import convert_tree_leaves_into_filters, DeepchecksFilter, \
intersect_two_filters
from deepchecks.utils.performance.partition import (DeepchecksFilter, convert_tree_leaves_into_filters,
intersect_two_filters)


def test_iris_tree_to_filters(iris_dataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
from hamcrest import (all_of, any_of, assert_that, calling, close_to, contains_exactly, equal_to, greater_than, has_entries, has_key,
has_length, has_properties, instance_of, is_, raises)
from hamcrest import (all_of, any_of, assert_that, calling, close_to, contains_exactly, equal_to, greater_than,
has_entries, has_key, has_length, has_properties, instance_of, is_, raises)
from hamcrest.core.matcher import Matcher

from deepchecks import CheckResult
Expand All @@ -33,7 +33,7 @@ def is_correct_image_property_outliers_result(with_display: bool = True) -> Matc
display_assertion = all_of(
instance_of(list),
has_length(0),
)
)

return all_of(
instance_of(CheckResult),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def is_correct_label_property_outliers_result(props, with_display: bool = True)
display_assertion = all_of(
instance_of(list),
has_length(0),
)
)

return all_of(
instance_of(CheckResult),
Expand Down

0 comments on commit d1ac880

Please sign in to comment.