# Top Data Validation Tools for Machine Learning

## `pydantic`

In [2]:
from typing_extensions import Annotated
from pydantic import BaseModel, Field

class JobCandidate(BaseModel):
    """Job candidate record validated by pydantic.

    ``age`` is declared with ``strict=True`` and the bounds ``ge=18`` /
    ``le=45``, so it must be an integer between 18 and 45 inclusive.
    """

    id: int
    name: str
    surname: str
    age: Annotated[int, Field(strict=True, ge=18, le=45)]

# Candidate record that deliberately violates the age limit (46 > 45).
# Named `input_data` (as in the other sections) rather than `input`, so the
# built-in `input()` is not shadowed for the rest of the kernel session.
input_data = {
    "id": 1,
    "name": "Alan",
    "surname": "Poe",
    "age": 46,
}

# Building the model runs validation; this raises pydantic's ValidationError
# because `age` exceeds the `le=45` bound.
JobCandidate(**input_data)

ValidationError: 1 validation error for JobCandidate
age
  Input should be less than or equal to 45 [type=less_than_equal, input_value=46, input_type=int]
    For further information visit https://errors.pydantic.dev/2.7/v/less_than_equal

## `marshmallow`

In [3]:
from marshmallow import Schema, fields, validate


class JobCandidate(Schema):
    """Marshmallow schema for a job candidate record.

    ``age`` is checked with ``validate.Range(min=18, max=45)``; values
    outside 18-45 are rejected when the data is loaded.
    """

    id = fields.Integer()
    name = fields.String()
    surname = fields.String()
    age = fields.Integer(validate=validate.Range(min=18, max=45))


# Candidate record that deliberately violates the age limit (46 > 45).
input_data = {"id": 1, "name": "Alan", "surname": "Poe", "age": 46}

# Instantiate the schema, then load (deserialize and validate) the record.
# load() raises marshmallow's ValidationError here because age 46 is above
# the allowed maximum of 45.
job_candidate_schema = JobCandidate()
job_candidate_schema.load(input_data)

ValidationError: {'age': ['Must be greater than or equal to 18 and less than or equal to 45.']}

## `jsonschema`

In [4]:
from jsonschema import validate

# JSON Schema for a job candidate: typed id/name/surname plus an integer
# age bounded by minimum 18 and maximum 45.
job_candidate_schema = dict(
    properties=dict(
        id={"type": "integer"},
        name={"type": "string"},
        surname={"type": "string"},
        age={"type": "integer", "minimum": 18, "maximum": 45},
    )
)

# Candidate record that deliberately violates the age limit (46 > 45).
input_data = {"id": 1, "name": "Alan", "surname": "Poe", "age": 46}

# `validate` is jsonschema's function here; this cell's import replaces the
# marshmallow `validate` module imported in the previous section.
# Raises jsonschema's ValidationError because age 46 exceeds the maximum of 45.
validate(input_data, job_candidate_schema)

ValidationError: 46 is greater than the maximum of 45

Failed validating 'maximum' in schema['properties']['age']:
    {'maximum': 45, 'minimum': 18, 'type': 'integer'}

On instance['age']:
    46

## `pandera`

In [5]:
import pandas as pd
import pandera as pa

# DataFrame-level schema: one pandera Column per expected field.
# `age` carries two separate checks (>= 18 and <= 45), so a failure report
# names the specific bound that was violated.
job_candidate_schema = pa.DataFrameSchema(
    columns={
        "id": pa.Column(pa.Int),
        "name": pa.Column(pa.String),
        "surname": pa.Column(pa.String),
        "age": pa.Column(
            pa.Int,
            checks=[
                pa.Check.ge(18),
                pa.Check.le(45),
            ],
        ),
    },
)

# Two candidates, one row each; the first (age 46) violates the age limit.
test_df = pd.DataFrame(
    [
        {"id": 1, "name": "Alan", "surname": "Poe", "age": 46},
        {"id": 2, "name": "John", "surname": "Doe", "age": 19},
    ]
)

# Run the schema against the frame. This raises pandera's SchemaError
# because the first row's age (46) fails the <= 45 check.
validated_df = job_candidate_schema.validate(test_df)

SchemaError: Column 'age' failed element-wise validator number 1: less_than_or_equal_to(45) failure cases: 46

## `great_expectations`

In [9]:
import pandas as pd
import great_expectations as ge

# Two candidates, one row each; the first (age 46) violates the age limit.
test_df = pd.DataFrame(
    [
        {"id": 1, "name": "Alan", "surname": "Poe", "age": 46},
        {"id": 2, "name": "John", "surname": "Doe", "age": 19},
    ]
)

# Wrap the DataFrame in a Great Expectations PandasDataset so expectations
# can be declared directly on it. Despite the variable name, this object is
# the dataset wrapper itself, not a separately defined suite.
# NOTE(review): `ge.dataset` looks like the legacy Great Expectations API;
# confirm the installed great_expectations version still provides it.
expectation_suite = ge.dataset.PandasDataset(test_df)

# Every expected column must be present (declared in this order so the
# results list keeps the same ordering).
for required_column in ("id", "name", "surname"):
    expectation_suite.expect_column_to_exist(required_column)

# Ages must fall between 18 and 45.
expectation_suite.expect_column_values_to_be_between(
    column="age", min_value=18, max_value=45
)

# Evaluate all expectations declared above against the wrapped DataFrame.
validation_result = expectation_suite.validate()

In [11]:
# Display the validation report: an overall "success" flag plus one entry
# per expectation under "results".
validation_result

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_to_exist",
        "kwargs": {
          "column": "id",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_to_exist",
        "kwargs": {
          "column": "name",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_to_exist",
        "kwa