In [91]:
# Using DataFrameSchema
import pandas as pd
import pandera as pa

# Demo product table built from row tuples.  Some rows carry missing values
# (None) in the productName, description and priority columns.
df = pd.DataFrame(
    data=[
        (1, "Thingy A", "awesome thing.", "high", 0),
        (2, "Thingy B", "available at http://thingb.com", None, 0),
        (3, None, None, "low", 5),
        (4, "Thingy D", "checkout https://thingd.ca", "low", 10),
        (5, "Thingy E", None, "high", 12),
    ],
    columns=[
        "id",
        "productName",
        "description",
        "priority",
        "numViews",
    ],
)

# Column-level contract for the product frame.
schema = pa.DataFrameSchema(
    columns={
        # Integer identifier: no nulls, no repeated values.
        "id": pa.Column(int, nullable=False, allow_duplicates=False),
        # Free-text description: must be present on every row.
        "description": pa.Column(str, nullable=False),
        # Priority may be missing; when present it must be "high" or "low".
        "priority": pa.Column(
            str,
            checks=pa.Check.isin(["high", "low"]),
            nullable=True,
        ),
        # View counter: every value >= 0, and the column median within [0, 10].
        "numViews": pa.Column(
            int,
            checks=[
                pa.Check.greater_than_or_equal_to(0),
                pa.Check(lambda views: 0 <= views.median() <= 10),
            ],
        ),
        # Product name: must be present on every row.
        "productName": pa.Column(str, nullable=False),
    }
)

# The demo frame holds nulls in columns the schema declares non-nullable, so
# validation fails here.  Catch the error and show its message instead of
# letting the cell raise: an uncaught exception would stop a
# "Restart Kernel -> Run All" at this cell and the later cells would never run.
try:
    validated_df = schema.validate(df)
except pa.errors.SchemaError as err:
    print(f"SchemaError: {err}")
else:
    # Only reached when the frame satisfies every column constraint.
    print(validated_df)

SchemaError: non-nullable series 'description' contains null values:
2    None
4    None
Name: description, dtype: object

In [104]:
# Using Schema Model
from pandera.typing import Series
import pandera as pa
from typing import Dict

# Small employee table used to exercise the schema model defined below.
df = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 4, 5],
        name=[
            "Elsayed Rashed",
            "Rania Rashed",
            "Rawan Rashed",
            "Amal Rashed",
            "Jumana Elsayed",
        ],
        department=["IT", "Sales", "IT", "Marketing", "HR"],
        salary=[30.5, 27.3, 29.1, 28.4, 20.6],
        salary_increase=[5.1, 2.3, 2.6, 3.4, 1.6],
    )
)

class EmployeeSchema(pa.SchemaModel):
    """Validation model for the employee frame.

    Field-level constraints are declared on the annotated attributes; the
    decorated methods add custom column-, group- and frame-level checks.
    """

    # Identifier: greater than 0, non-null, no repeated values.
    id: Series[int] = pa.Field(gt=0, nullable=False, allow_duplicates=False)
    # Employee name: must be present on every row.
    name: Series[str] = pa.Field(nullable=False)
    # Department: one of the four listed values, never null.
    department: Series[str] = pa.Field(isin=["IT", "HR", "Sales", "Marketing"], nullable=False)
    # Salary: constrained to the 10-35 range, never null.
    salary: Series[float] = pa.Field(in_range={"min_value": 10, "max_value": 35}, nullable=False)
    # Salary increase: constrained to the 0-10 range, never null.
    salary_increase: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 10}, nullable=False)

    # NOTE: the registered check name keeps its original spelling ("fisrt")
    # so anything matching on that name keeps working.
    @pa.check("name", name="name_check_fisrt_family")
    def name_custom_check_1(cls, names: Series[str]) -> Series[bool]:
        """Check, row by row, that each name has exactly two space-separated parts.

        Returns a boolean Series aligned with ``names`` so that every
        offending row is reported individually.
        """
        # Count the parts per row.  Looking at the width of an expanded split
        # would only test the *maximum* number of parts in the whole column,
        # letting a one-word name pass whenever another row has two parts.
        return names.str.split(" ").str.len() == 2

    @pa.check("salary", name="salary_check_sum")
    def salary_custom_check_1(cls, salary: Series[float]) -> bool:
        """Check that the salaries in the column add up to less than 150."""
        return sum(salary) < 150

    @pa.check("salary", groupby="department", name="salary_check_group_means")
    def salary_custom_check_2(cls, grouped_value: Dict[str, Series[float]]) -> bool:
        """Check that the mean HR salary is below the mean IT salary.

        ``grouped_value`` maps each department to the salaries of that
        department; both the "HR" and "IT" keys are read.
        """
        return grouped_value["HR"].mean() < grouped_value["IT"].mean()

    @pa.dataframe_check(name="salary_increase_check")
    def salary_increase_custom_check_1(cls, df: pd.DataFrame) -> Series[bool]:
        """Check, row by row, that salary plus salary_increase stays below 50."""
        return df["salary"] + df["salary_increase"] < 50


# Last expression of the cell: displays the validated frame on success.
EmployeeSchema.validate(df)

Unnamed: 0,id,name,department,salary,salary_increase
0,1,Elsayed Rashed,IT,30.5,5.1
1,2,Rania Rashed,Sales,27.3,2.3
2,3,Rawan Rashed,IT,29.1,2.6
3,4,Amal Rashed,Marketing,28.4,3.4
4,5,Jumana Elsayed,HR,20.6,1.6


In [88]:
# DataFrame & Group level Checks
import numpy as np
import pandas as pd
import pandera as pa

# Demo fruit table; the last row has no store (NaN).
fruits = pd.DataFrame(
    data=dict(
        name=["apple", "banana", "apple"],
        store=["Aldi", "Walmart", np.nan],
        price=[5.5, 4.2, 5.1],
        tax=[0.5, 0.3, 0.4],
    )
)

# Allowed values referenced by the isin checks of the schema.
available_fruits = [
    "apple",
    "banana",
    "orange",
]
nearby_stores = [
    "Aldi",
    "Walmart",
]

# Schema combining per-column checks, a grouped check and a whole-frame check.
schema = pa.DataFrameSchema(
    columns={
        # Fruit name must be one of the available fruits; repeats are allowed.
        "name": pa.Column(
            str,
            checks=pa.Check.isin(available_fruits),
            allow_duplicates=True,
        ),
        # Store may be missing; when present it must be a nearby store.
        "store": pa.Column(
            str,
            checks=pa.Check.isin(nearby_stores),
            nullable=True,
        ),
        # Each price is below 6 and the prices add up to less than 20.
        "price": pa.Column(
            float,
            checks=[
                pa.Check.less_than(6),
                pa.Check(lambda prices: sum(prices) < 20),
            ],
        ),
        # Grouped by store: mean tax at Aldi must exceed mean tax at Walmart.
        "tax": pa.Column(
            float,
            checks=[
                pa.Check(
                    lambda tax_by_store: tax_by_store["Aldi"].mean() > tax_by_store["Walmart"].mean(),
                    groupby="store",
                ),
            ],
        ),
    },
    # Frame-level: the mean price must exceed the mean tax.
    checks=pa.Check(lambda frame: frame["price"].mean() > frame["tax"].mean()),
)

# Last expression of the cell: displays the validated frame on success.
schema.validate(fruits)

Unnamed: 0,name,store,price,tax
0,apple,Aldi,5.5,0.5
1,banana,Walmart,4.2,0.3
2,apple,,5.1,0.4
