Skip to content

Commit

Permalink
Added the n_to_show parameter for the TrainTestSamplesMix test (#2337)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelMarien committed Feb 19, 2023
1 parent 5c5b20a commit 7dae2bb
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,22 @@ class TrainTestSamplesMix(TrainTestCheck):
----------
n_samples : int , default: 10_000_000
number of samples to use for this check.
n_to_show : int , default: 10
maximum number of samples that appear in both the test and train data to show in the display.
random_state : int, default: 42
random seed for all check internals.
"""

def __init__(
    self,
    n_samples: int = 10_000_000,
    n_to_show: int = 10,
    random_state: int = 42,
    **kwargs
) -> None:
    """Store the configuration of the check on the instance.

    Parameters
    ----------
    n_samples : int , default: 10_000_000
        number of samples to use for this check.
    n_to_show : int , default: 10
        number of samples that appear in test and training data to show.
    random_state : int, default: 42
        random seed for all check internals.
    **kwargs
        forwarded unchanged to the parent class initializer.
    """
    # Let the parent class consume its own options before storing ours.
    super().__init__(**kwargs)
    self.random_state = random_state
    self.n_to_show = n_to_show
    self.n_samples = n_samples

def run_logic(self, context: Context) -> CheckResult:
Expand Down Expand Up @@ -82,7 +86,7 @@ def run_logic(self, context: Context) -> CheckResult:
dup_ratio = test_dup_count / test_dataset.n_samples
user_msg = f'{format_percent(dup_ratio)} ({test_dup_count} / {test_dataset.n_samples}) \
of test data samples appear in train data'
display = [user_msg, duplicates_df.head(10)] if context.with_display and dup_ratio else None
display = [user_msg, duplicates_df.head(self.n_to_show)] if context.with_display and dup_ratio else None
result = {'ratio': dup_ratio, 'data': duplicates_df}
return CheckResult(result, header='Train Test Samples Mix', display=display)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,28 @@ def test_leakage(iris_clean):
assert_that(result.display, has_length(greater_than(0)))


def test_train_test_samples_mix_n_to_show(iris_clean):
    """Check that ``n_to_show`` limits how many duplicated rows are displayed.

    Five train rows are appended to the test split so that the test data is
    guaranteed to contain samples from the train data; with ``n_to_show=2``
    the displayed dataframe must hold exactly two rows.
    """
    # Arrange
    x = iris_clean.data
    y = iris_clean.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=55)
    train_dataset = Dataset(pd.concat([x_train, y_train], axis=1),
                            features=iris_clean.feature_names,
                            label='target')

    test_df = pd.concat([x_test, y_test], axis=1)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and behaves the same on older pandas versions.
    leaked_rows = train_dataset.data.iloc[[0, 1, 2, 3, 4]]
    bad_test = pd.concat([test_df, leaked_rows], ignore_index=True)

    test_dataset = Dataset(bad_test,
                           features=iris_clean.feature_names,
                           label='target')
    check = TrainTestSamplesMix(n_to_show=2)
    # Act
    result = check.run(test_dataset=test_dataset, train_dataset=train_dataset)
    # Assert
    # display[0] is the summary message, display[1] the truncated duplicates table.
    assert len(result.display[1]) == 2


def test_leakage_without_display(iris_clean):
x = iris_clean.data
y = iris_clean.target
Expand Down Expand Up @@ -175,9 +197,8 @@ def test_train_test_simple_mix_with_categorical_data(iris_clean):
features=iris_clean.feature_names + ["cat_column"],
label='target'
)

# Run
TrainTestSamplesMix().run(
test_dataset=test_dataset,
train_dataset=train_dataset
)
)

0 comments on commit 7dae2bb

Please sign in to comment.