Added the n_to_show parameter for the TrainTestSamplesMix test (#2337)

deepchecks · Feb 19, 2023 · 7dae2bb · 7dae2bb
1 parent 5c5b20a
commit 7dae2bb
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 3 deletions.
diff --git a/deepchecks/tabular/checks/train_test_validation/train_test_samples_mix.py b/deepchecks/tabular/checks/train_test_validation/train_test_samples_mix.py
@@ -33,18 +33,22 @@ class TrainTestSamplesMix(TrainTestCheck):
     ----------
     n_samples : int , default: 10_000_000
         number of samples to use for this check.
+    n_to_show : int , default: 10
+        maximum number of samples that appear in both the train and test data to show in the display.
     random_state : int, default: 42
         random seed for all check internals.
     """
 
     def __init__(
         self,
         n_samples: int = 10_000_000,
+        n_to_show: int = 10,
         random_state: int = 42,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.n_samples = n_samples
+        self.n_to_show = n_to_show  # row cap for the duplicates table shown in the display
         self.random_state = random_state
 
     def run_logic(self, context: Context) -> CheckResult:
@@ -82,7 +86,7 @@ def run_logic(self, context: Context) -> CheckResult:
         dup_ratio = test_dup_count / test_dataset.n_samples
         user_msg = f'{format_percent(dup_ratio)} ({test_dup_count} / {test_dataset.n_samples}) \
                      of test data samples appear in train data'
-        display = [user_msg, duplicates_df.head(10)] if context.with_display and dup_ratio else None
+        display = [user_msg, duplicates_df.head(self.n_to_show)] if context.with_display and dup_ratio else None
         result = {'ratio': dup_ratio, 'data': duplicates_df}
         return CheckResult(result, header='Train Test Samples Mix', display=display)
 

diff --git a/tests/tabular/checks/train_test_validation/train_test_samples_mix_test.py b/tests/tabular/checks/train_test_validation/train_test_samples_mix_test.py
@@ -75,6 +75,28 @@ def test_leakage(iris_clean):
     assert_that(result.display, has_length(greater_than(0)))
 
 
+def test_train_test_samples_mix_n_to_show(iris_clean):
+    """Check that the displayed duplicates table is limited to n_to_show rows."""
+    # Arrange: leak five train rows into the test data (more than n_to_show)
+    x_train, x_test, y_train, y_test = train_test_split(iris_clean.data, iris_clean.target,
+                                                        test_size=0.3, random_state=55)
+    train_dataset = Dataset(pd.concat([x_train, y_train], axis=1),
+                            features=iris_clean.feature_names,
+                            label='target')
+    test_df = pd.concat([x_test, y_test], axis=1)
+    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
+    bad_test = pd.concat([test_df, train_dataset.data.iloc[[0, 1, 2, 3, 4]]], ignore_index=True)
+    test_dataset = Dataset(bad_test,
+                           features=iris_clean.feature_names,
+                           label='target')
+    check = TrainTestSamplesMix(n_to_show=2)
+
+    # Act
+    result = check.run(test_dataset=test_dataset, train_dataset=train_dataset)
+    # Assert
+    assert len(result.display[1]) == 2
+
+
 def test_leakage_without_display(iris_clean):
     x = iris_clean.data
     y = iris_clean.target
@@ -175,9 +197,8 @@ def test_train_test_simple_mix_with_categorical_data(iris_clean):
         features=iris_clean.feature_names + ["cat_column"],
         label='target'
     )
-
     # Run
     TrainTestSamplesMix().run(
         test_dataset=test_dataset,
         train_dataset=train_dataset
-    )
+    )