[Issue-1542] MixedNull display table order (#1679)

* changed MixedNulls display table order

Co-authored-by: Itay Gabbay <itay@deepchecks.com>
deepchecks · Jun 27, 2022 · d1ac880
1 parent fbfcaa0
commit d1ac880
Show file tree

Hide file tree

Showing 10 changed files with 28 additions and 40 deletions.
diff --git a/deepchecks/tabular/checks/data_integrity/mixed_nulls.py b/deepchecks/tabular/checks/data_integrity/mixed_nulls.py
@@ -19,7 +19,7 @@
 from deepchecks.tabular import Context, SingleDatasetCheck
 from deepchecks.tabular.utils.messages import get_condition_passed_message
 from deepchecks.utils.dataframes import select_from_dataframe
-from deepchecks.utils.features import N_TOP_MESSAGE, column_importance_sorter_df
+from deepchecks.utils.features import N_TOP_MESSAGE
 from deepchecks.utils.strings import format_percent, string_baseform
 from deepchecks.utils.typing import Hashable
 
@@ -86,8 +86,11 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         for column_name in list(df.columns):
             column_data = df[column_name]
 
-            string_null_counts = {value: count for value, count in column_data.value_counts(dropna=True).iteritems()
-                                  if string_baseform(value) in null_string_list}
+            string_null_counts = {
+                repr(value).replace('\'', '"'): count
+                for value, count in column_data.value_counts(dropna=True).iteritems()
+                if string_baseform(value) in null_string_list
+            }
             nan_data_counts = column_data[column_data.isna()].apply(nan_type).value_counts().to_dict()
             null_counts = {**string_null_counts, **nan_data_counts}
 
@@ -101,9 +104,9 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:
         # Create dataframe to display table
         if context.with_display and display_array:
             df_graph = pd.DataFrame(display_array, columns=['Column Name', 'Value', 'Count', 'Percent of data'])
+            order = df_graph['Column Name'].value_counts(ascending=False).index[:self.n_top_columns]
             df_graph = df_graph.set_index(['Column Name', 'Value'])
-            df_graph = column_importance_sorter_df(df_graph, dataset, context.feature_importance,
-                                                   self.n_top_columns, col='Column Name')
+            df_graph = df_graph.loc[order, :]
             display = [N_TOP_MESSAGE % self.n_top_columns, df_graph]
         else:
             display = None

diff --git a/tests/tabular/checks/integrity/feature_feature_correlation_test.py b/tests/tabular/checks/integrity/feature_feature_correlation_test.py
@@ -9,7 +9,8 @@
 # ----------------------------------------------------------------------------
 #
 """Tests for Feature Feature Correlation check"""
-from hamcrest import assert_that, calling, contains_exactly, contains_inanyorder, equal_to, has_items, has_length, raises
+from hamcrest import (assert_that, calling, contains_exactly, contains_inanyorder, equal_to, has_items, has_length,
+                      raises)
 
 from deepchecks.tabular.checks.data_integrity.feature_feature_correlation import FeatureFeatureCorrelation
 from deepchecks.tabular.dataset import Dataset

diff --git a/tests/tabular/checks/integrity/mixed_data_types_test.py b/tests/tabular/checks/integrity/mixed_data_types_test.py
@@ -12,7 +12,8 @@
 import numpy as np
 import pandas as pd
 # Disable wildcard import check for hamcrest
-from hamcrest import assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items, has_length, raises
+from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items,
+                      has_length, raises)
 
 from deepchecks.core import ConditionCategory
 from deepchecks.core.errors import DeepchecksValueError

diff --git a/tests/tabular/checks/integrity/mixed_nulls_test.py b/tests/tabular/checks/integrity/mixed_nulls_test.py
@@ -11,8 +11,8 @@
 """Tests for Mixed Nulls check"""
 import numpy as np
 import pandas as pd
-from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items, has_length, is_,
-                      raises)
+from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_entry, has_items,
+                      has_length, is_, raises)
 
 from deepchecks.core.errors import DatasetValidationError, DeepchecksValueError
 from deepchecks.tabular.checks.data_integrity.mixed_nulls import MixedNulls
@@ -36,7 +36,7 @@ def test_single_column_one_null_type():
     dataframe = pd.DataFrame(data=data)
     # Act
     result = MixedNulls().run(dataframe)
-    assert_that(result.value, equal_to({'col1': {'null': {'count': 2, 'percent': 0.5}}}))
+    assert_that(result.value, equal_to({'col1': {'"null"': {'count': 2, 'percent': 0.5}}}))
     assert_that(result.display, has_length(greater_than(0)))
 
 
@@ -46,7 +46,7 @@ def test_single_column_one_null_type_without_display():
     dataframe = pd.DataFrame(data=data)
     # Act
     result = MixedNulls().run(dataframe, with_display=False)
-    assert_that(result.value, equal_to({'col1': {'null': {'count': 2, 'percent': 0.5}}}))
+    assert_that(result.value, equal_to({'col1': {'"null"': {'count': 2, 'percent': 0.5}}}))
     assert_that(result.display, has_length(0))
 
 
@@ -241,22 +241,3 @@ def test_condition_max_nulls_passed():
                                details='Passed for 1 relevant column',
                                name='Number of different null types is less or equal to 10')
     ))
-
-
-def test_fi_n_top(diabetes_split_dataset_and_model):
-    train, _, clf = diabetes_split_dataset_and_model
-    train = Dataset(train.data.copy(), label='target', cat_features=['sex'])
-    train.data.loc[train.data.index % 4 == 0, 'age'] = 'Nan'
-    train.data.loc[train.data.index % 4 == 1, 'age'] = 'null'
-    train.data.loc[train.data.index % 4 == 0, 'bmi'] = 'Nan'
-    train.data.loc[train.data.index % 4 == 1, 'bmi'] = 'null'
-    train.data.loc[train.data.index % 4 == 0, 'bp'] = 'Nan'
-    train.data.loc[train.data.index % 4 == 1, 'bp'] = 'null'
-    train.data.loc[train.data.index % 4 == 0, 's1'] = 'Nan'
-    train.data.loc[train.data.index % 4 == 1, 's1'] = 'null'
-    # Arrange
-    check = MixedNulls(n_top_columns=3)
-    # Act
-    result = check.run(train, clf)
-    # Assert - Display dataframe have only 3
-    assert_that(result.display[1], has_length(3))
diff --git a/tests/tabular/checks/model_evaluation/calibration_score_test.py b/tests/tabular/checks/model_evaluation/calibration_score_test.py
@@ -89,7 +89,7 @@ def test_binary_model_info_object(iris_dataset_single_class_labeled, iris_random
     # Act X
     result = check.run(iris_dataset_single_class_labeled, iris_random_forest_single_class).value
     # Assert
-    assert_that(result, has_length(1)) 
+    assert_that(result, has_length(1))
 
     assert_that(result, has_entries({
         0: close_to(0.0002, 0.0005)
@@ -103,7 +103,7 @@ def test_binary_string_model_info_object(iris_binary_string_split_dataset_and_mo
     # Act X
     result = check.run(test_ds, clf).value
     # Assert
-    assert_that(result, has_length(1)) 
+    assert_that(result, has_length(1))
 
     assert_that(result, has_entries({
         0: close_to(0.04, 0.001)

diff --git a/tests/tabular/checks/model_evaluation/segment_performance_test.py b/tests/tabular/checks/model_evaluation/segment_performance_test.py
@@ -9,7 +9,8 @@
 # ----------------------------------------------------------------------------
 #
 """Tests for segment performance check."""
-from hamcrest import assert_that, calling, close_to, equal_to, greater_than, has_entries, has_length, has_property, raises
+from hamcrest import (assert_that, calling, close_to, equal_to, greater_than, has_entries, has_length, has_property,
+                      raises)
 
 from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksValueError
 from deepchecks.tabular.checks.model_evaluation.segment_performance import SegmentPerformance

diff --git a/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py b/tests/tabular/checks/model_evaluation/simple_model_comparison_test.py
@@ -9,7 +9,8 @@
 # ----------------------------------------------------------------------------
 #
 """Contains unit tests for the confusion_matrix_report check."""
-from hamcrest import assert_that, calling, close_to, greater_than, has_entries, has_entry, has_items, has_length, is_, raises
+from hamcrest import (assert_that, calling, close_to, greater_than, has_entries, has_entry, has_items, has_length, is_,
+                      raises)
 from sklearn.metrics import f1_score, make_scorer, recall_score
 
 from deepchecks.core.errors import DeepchecksValueError

diff --git a/tests/utils/partition_tests.py b/tests/utils/partition_tests.py
@@ -13,8 +13,8 @@
 from hamcrest import assert_that, equal_to
 from sklearn.tree import DecisionTreeRegressor
 
-from deepchecks.utils.performance.partition import convert_tree_leaves_into_filters, DeepchecksFilter, \
-    intersect_two_filters
+from deepchecks.utils.performance.partition import (DeepchecksFilter, convert_tree_leaves_into_filters,
+                                                    intersect_two_filters)
 
 
 def test_iris_tree_to_filters(iris_dataset):

diff --git a/tests/vision/checks/data_integrity/image_property_outliers_test.py b/tests/vision/checks/data_integrity/image_property_outliers_test.py
@@ -8,8 +8,8 @@
 # along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------------
 #
-from hamcrest import (all_of, any_of, assert_that, calling, close_to, contains_exactly, equal_to, greater_than, has_entries, has_key,
-                      has_length, has_properties, instance_of, is_, raises)
+from hamcrest import (all_of, any_of, assert_that, calling, close_to, contains_exactly, equal_to, greater_than,
+                      has_entries, has_key, has_length, has_properties, instance_of, is_, raises)
 from hamcrest.core.matcher import Matcher
 
 from deepchecks import CheckResult
@@ -33,7 +33,7 @@ def is_correct_image_property_outliers_result(with_display: bool = True) -> Matc
         display_assertion = all_of(
             instance_of(list),
             has_length(0),
-        )  
+        )
 
     return all_of(
         instance_of(CheckResult),

diff --git a/tests/vision/checks/data_integrity/label_property_outliers_test.py b/tests/vision/checks/data_integrity/label_property_outliers_test.py
@@ -34,7 +34,7 @@ def is_correct_label_property_outliers_result(props, with_display: bool = True)
         display_assertion = all_of(
             instance_of(list),
             has_length(0),
-        ) 
+        )
 
     return all_of(
         instance_of(CheckResult),