Skip to content

Commit

Permalink
Add threshold for file validations (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
haritha-ravi committed Nov 2, 2023
1 parent 7c3e3d4 commit 1a8e901
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.rst
Expand Up @@ -354,6 +354,11 @@ Running Vlads Programatically
Whether to disable log output generated by validations.
Optional, defaults to `False`.

:``file_validation_failure_threshold=None``:
Stops validating the file as soon as any single validator's failure rate
(its failure count divided by the total number of lines) exceeds this threshold.
Must be a value between `0.0` and `1.0`; `1.0` (100%) validates the entire file.
Optional, defaults to `None`.

For example:

.. code:: python
Expand Down
19 changes: 19 additions & 0 deletions tests/test_vlads.py
Expand Up @@ -155,3 +155,22 @@ class TestVlad(Vlad):
assert vlad.validators["Column A"][0].bad
assert vlad.validators["Column B"][0].fail_count == 0
assert not vlad.validators["Column B"][0].bad


def test_stop_file_validation_at_invalid_threshold():
    """Check that validation halts once the failure threshold is exceeded.

    A ``Vlad`` subclass is run over the example CSV with
    ``file_validation_failure_threshold=0.1``. The run is expected to
    fail, to have recorded exactly one failure (on "Column A", line 1),
    and to have recorded no failures for the other two columns.
    """

    class TestVlad(Vlad):
        validators = {
            "Column A": [EmptyValidator()],
            "Column B": [EmptyValidator()],
            "Column C": [UniqueValidator()],
        }

    vlad = TestVlad(
        source=LocalFile("vladiate/examples/real_vampires.csv"),
        file_validation_failure_threshold=0.1,
    )

    outcome = vlad.validate()
    assert not outcome

    # Only the first validator of each column is inspected, mirroring the
    # one-validator-per-column setup declared above.
    expected_fail_counts = (
        ("Column A", 1),
        ("Column B", 0),
        ("Column C", 0),
    )
    for column, expected in expected_fail_counts:
        assert vlad.validators[column][0].fail_count == expected

    assert vlad.invalid_lines == {1}
25 changes: 25 additions & 0 deletions vladiate/vlad.py
Expand Up @@ -14,6 +14,7 @@ def __init__(
default_validator=EmptyValidator,
delimiter=None,
ignore_missing_validators=False,
file_validation_failure_threshold=None,
quiet=False,
row_validators=[],
):
Expand All @@ -30,6 +31,8 @@ def __init__(
self.ignore_missing_validators = ignore_missing_validators
self.logger.disabled = quiet
self.invalid_lines = set()
self.file_validation_failure_threshold = file_validation_failure_threshold
self.total_lines = 0

self.validators.update(
{
Expand Down Expand Up @@ -120,6 +123,11 @@ def _log_missing(self, missing_items):
)
)

def _get_total_lines(self):
    """Count the rows of the source and remember the result.

    Calls ``self.source.open()``, wraps the result in a
    ``csv.DictReader`` configured with ``self.delimiter``, and counts
    every row the reader yields.

    Returns:
        The number of rows counted; the same value is stored on
        ``self.total_lines``.
    """
    rows = csv.DictReader(self.source.open(), delimiter=self.delimiter)

    # NOTE(review): the object returned by ``self.source.open()`` is read to
    # the end and never explicitly closed here -- confirm that is acceptable
    # for every input type before relying on it for large files.
    count = 0
    for _ in rows:
        count += 1

    self.total_lines = count
    return count

def validate(self):
self.logger.info(
"\nValidating {}(source={})".format(self.__class__.__name__, self.source)
Expand All @@ -146,6 +154,9 @@ def validate(self):
self._log_missing_fields()
return False

if self.file_validation_failure_threshold:
self.total_lines = self._get_total_lines()

for line, row in enumerate(reader):
self.line_count += 1

Expand All @@ -166,6 +177,20 @@ def validate(self):
self.failures[field_name][line].append(e)
self.invalid_lines.add(self.line_count)
validator.fail_count += 1
if (
self.file_validation_failure_threshold
and self.total_lines > 0
and validator.fail_count / self.total_lines
> self.file_validation_failure_threshold
):
self.logger.error(
" {} failed {} time(s) ({:.1%})".format(
validator.__class__.__name__,
validator.fail_count,
validator.fail_count / self.total_lines,
)
)
return False

if self.failures or self.row_failures:
self.logger.info("\033[0;31m" + "Failed :(" + "\033[0m")
Expand Down

0 comments on commit 1a8e901

Please sign in to comment.