In [None]:
# !pip install pandera
# !pip install pandas
# !pip install hypothesis

# Optional extras (uncomment the ones you need):
# !pip install pandera[hypotheses]  # hypothesis checks
# !pip install pandera[io]          # yaml/script schema io utilities
# !pip install pandera[strategies]  # data synthesis strategies
# !pip install pandera[mypy]        # enable static type-linting of pandas
# !pip install pandera[fastapi]     # fastapi integration
# !pip install pandera[dask]        # validate dask dataframes
# !pip install pandera[pyspark]     # validate pyspark dataframes
# !pip install pandera[modin]       # validate modin dataframes
# !pip install pandera[modin-ray]   # validate modin dataframes with ray
# !pip install pandera[modin-dask]  # validate modin dataframes with dask
# !pip install pandera[geopandas]   # validate geopandas geodataframes


In [1]:
import pandas as pd
import pandera as pa

# DataFrameSchema PARAMETERS
- **columns** (mapping of column names and column schema component.) – a dict where keys are column names and values are Column objects specifying the datatypes and properties of a particular column.
- **checks** (Optional[CheckList]) – dataframe-wide checks.
- **index** – specify the datatypes and properties of the index.
- **dtype** (PandasDtypeInputTypes) – datatype of the dataframe. This overrides the data types specified in any of the columns. If a string is specified, then assumes one of the valid pandas string values: http://pandas.pydata.org/pandas-docs/stable/basics.html#dtypes.
- **coerce** (bool) – whether or not to coerce all of the columns on validation. This overrides any coerce setting at the column or index level. This has no effect on columns where dtype=None.
- **strict** (StrictType) – ensure that all and only the columns defined in the schema are present in the dataframe. If set to ‘filter’, only the columns in the schema will be passed to the validated dataframe. If set to filter and columns defined in the schema are not present in the dataframe, will throw an error.
- **name** (Optional[str]) – name of the schema.
- **ordered** (bool) – whether or not to validate the columns order.
- **unique** (Optional[Union[str, List[str]]]) – a list of columns that should be jointly unique.
- **report_duplicates** (UniqueSettings) – how to report unique errors: `exclude_first` reports all duplicates except the first occurrence, `exclude_last` reports all duplicates except the last occurrence, and `all` (default) reports all duplicates.
- **unique_column_names** (bool) – whether or not column names must be unique.
- **add_missing_columns** (bool) – add missing column names with either default value, if specified in column schema, or NaN if column is nullable.
- **title** (Optional[str]) – A human-readable label for the schema.
- **description** (Optional[str]) – An arbitrary textual description of the schema.
- **metadata** (Optional[dict]) – An optional key-value data.
- **drop_invalid_rows** (bool) – if True, drop invalid rows on validation.

In [None]:
# Sample frame that satisfies every rule declared in the schema below.
df = pd.DataFrame(
    data={
        "column1": [1, 4, 0, 10, 9],
        "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
        "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
    }
)

# Schema whose checks are declared inline on each column.
schema = pa.DataFrameSchema(
    columns={
        "column1": pa.Column(int, checks=pa.Check.le(10)),
        "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
        "column3": pa.Column(
            str,
            checks=[
                pa.Check.str_startswith("value_"),
                # A custom check is any callable that receives the column as a
                # Series and returns a boolean or a boolean Series.
                pa.Check(lambda col: col.str.split("_", expand=True).shape[1] == 2),
            ],
        ),
    }
)

# Every column passes its checks, so validation succeeds.
validated_df = schema.validate(df)
print(validated_df)


In [None]:

# Invalid data: column2 contains 6 and 10.1, which break its `< -1.2` check.
# column1 and column3 still satisfy their checks.
df_invalid = pd.DataFrame(
    data={
        "column1": [1, 4, 0, 10, 9],
        "column2": [6, -1.4, -2.9, 10.1, -20.4],
        "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
    }
)

# Catch the SchemaError and print only its message, which is easier to read
# than an uncaught traceback.
try:
    schema.validate(df_invalid)
except pa.errors.SchemaError as exc:
    print(exc)


In [None]:
# These schemas, and their checks, can be serialized to and from JSON:
# https://pandera.readthedocs.io/en/stable/schema_inference.html?highlight=schema%20json

# Reads "schema.json" from the current working directory.
schema_from_json = pa.DataFrameSchema.from_json("schema.json")

# Print explicitly: a bare expression is only rendered when it is the last
# statement of a cell, so it would show nothing here.
print(schema_from_json)

# Add a new column, with checks, to the schema.
schema_from_json_fourth_col = schema_from_json.add_columns({
    "column4": pa.Column(int, checks=pa.Check.ge(0))
})

# This could serve as a powerful way to dynamically manage read and write schemas in a data pipeline.
schema_from_json_fourth_col.to_json("schema_with_column4.json")

# You could then validate the dataframe with the new schema, and write on validation success only.
# Validation is expected to fail here because `df` has no "column4". The error is
# caught and printed so the notebook still runs top to bottom.
try:
    schema_from_json_fourth_col.validate(df)
except pa.errors.SchemaError as e:
    print(e)


# YAML Schema Example with defined reader and writer schemas

- In this example, we will set a reader and a writer schema, defined in YAML files.
- We validate the pre-defined schemas against the dataframes before reading and writing.
- This is a powerful way to ensure that the data is always in the expected format
    and that sudden changes in the data won't break the pipeline.

In this fictional example, we are collecting data from colleagues at a company, and we want to ensure:

- Each employee has a unique ID and can only appear once in the dataset
- Their age is over 18 (they've been hired legally)

This questionnaire was designed to find out who has the longest tenure at the company as a % of their total life, as we have no record of that.
During the process, we will materialise a new column, "tenure_percentage" which is the tenure at the company as a % of their total life.
We don't want cheating! So we will check that the tenure_percentage is less than 100%.


In [None]:
# Load the reader schema from its YAML definition and show it.
reader_schema_from_yaml = pa.DataFrameSchema.from_yaml("reader_schema.yaml")
# Printed explicitly: a bare expression in the middle of a cell is not rendered.
print(reader_schema_from_yaml)

input_df = pd.DataFrame({
    "name": ['Alice', 'Bob', 'Charlie', 'Dennis', 'Edith'],
    "age": [25, 32, 29, 19, 22],
    "tenure": [3, 5, 2, 1, 4]
})

# Validate the raw input against the reader schema before doing any work on it.
# An uncaught validation error stops the cell here.
reader_schema_from_yaml.validate(input_df)

# Materialise the tenure_percentage column: tenure as a percentage of age.
input_df["tenure_percentage"] = (input_df["tenure"] / input_df["age"] * 100)
print(input_df)

# To ensure that we are writing the expected data, validate the dataframe against the writer schema.
writer_schema_from_yaml = pa.DataFrameSchema.from_yaml("writer_schema.yaml")

# The CSV below is only written if this validation does not raise.
writer_schema_from_yaml.validate(input_df)

input_df.to_csv("employee_tenure.csv", index=False)

# Edith wins, with 18.18% of her life spent at the company.

# Data Synthesis

https://pandera.readthedocs.io/en/stable/data_synthesis_strategies.html

pandera provides a utility for generating synthetic data purely from pandera schema or schema component objects. Under the hood, the schema metadata is collected to create a data-generating strategy using hypothesis, which is a property-based testing library.

In [2]:
# Read "schema.json" into a DataFrameSchema once more.
schema_from_json = pa.DataFrameSchema.from_json("schema.json")

# Ask pandera for a synthetic dataframe of three rows that conforms to the schema.
synthetic_df = schema_from_json.example(size=3)
synthetic_df

Unnamed: 0,column1,column2,column3
0,5,,2010-01-01
0,5,,2010-01-01
0,5,,2010-01-01
