## Test `load_and_filter_corpus`

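This notebook tests `load_and_filter_corpus` interactively: it writes a small corpus to a temporary file, runs the function on it, and asserts on the filtered lines and the returned summary counts.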

In [None]:
import tempfile
import tensorflow as tf
from src.word2gm_fast.dataprep.load_and_filter_corpus import load_and_filter_corpus

# Prepare test data: five newline-terminated lines of five whitespace-separated
# tokens each. The trailing comments record the verdict this notebook expects
# from load_and_filter_corpus; the filter rule itself is not defined here.
lines = [
    "the quick brown fox jumps\n",      # valid
    # NOTE(review): UNK is the FIRST token of the next line and its middle token
    # is "brown", so the "center is UNK" label only holds if the filter treats
    # token 0 as the center -- confirm against load_and_filter_corpus.
    "UNK quick brown fox jumps\n",      # invalid (center is UNK)
    "the quick brown UNK jumps\n",      # valid (context has at least one non-UNK)
    "the quick brown fox UNK\n",        # valid (context has at least one non-UNK)
    "UNK UNK UNK UNK UNK\n"             # invalid (all UNK)
]

# The lines expected to survive filtering, without their trailing newline.
valid_lines = [
    "the quick brown fox jumps",
    "the quick brown UNK jumps",
    "the quick brown fox UNK"
]

# Write to a temporary file.
# - delete=False keeps the file on disk after the `with` block closes it, so a
#   later cell can open it by path. Nothing in this notebook removes it again;
#   delete temp_path manually once the checks are done.
# - encoding="utf-8" matches the UTF-8 decoding applied to the results later,
#   instead of relying on the platform's default encoding.
# - newline="\n" disables newline translation so the bytes on disk are exactly
#   the strings above on every platform.
with tempfile.NamedTemporaryFile(
    mode="w", encoding="utf-8", newline="\n", delete=False
) as temp_file:
    temp_file.writelines(lines)
    # Leaving the `with` block flushes and closes the handle.
    temp_path = temp_file.name

: 

In [None]:
# Run the function on the temporary corpus; it hands back a pair that is
# unpacked here into the filtered dataset and a summary object.
dataset, summary = load_and_filter_corpus(temp_path)

# Walk the dataset once and turn each element into a Python string:
# .numpy() yields the raw value, which is then decoded as UTF-8.
result_lines = []
for record in dataset:
    result_lines.append(record.numpy().decode("utf-8"))

print("Filtered lines:", result_lines)
print("Summary:", summary)

In [None]:
# Assertions (will raise AssertionError if test fails)

# Compare sorted lists rather than sets: still independent of output order,
# but a duplicated line in the result is no longer silently collapsed.
assert sorted(result_lines) == sorted(valid_lines), (
    "Filtered lines do not match expected valid lines."
    f" Got {result_lines!r}; expected {valid_lines!r}."
)

# Check the type before indexing so a wrong return type fails with a clear
# message instead of a TypeError/KeyError further down.
assert isinstance(summary, dict), (
    f"summary should be a dict, got {type(summary).__name__}"
)

# Expected counts are derived from the fixtures (3 retained, 2 rejected,
# 5 total) so they cannot drift if the test data is edited.
expected_counts = {
    "retained": len(valid_lines),
    "rejected": len(lines) - len(valid_lines),
    "total": len(lines),
}
for key, expected in expected_counts.items():
    assert summary[key] == expected, (
        f"summary[{key!r}] is {summary[key]!r}, expected {expected!r}"
    )

print("All assertions passed!")