In [1]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version so the notebook is reproducible.
%pip install pandera==0.14.4

Defaulting to user installation because normal site-packages is not writeable
Collecting pandera
  Downloading pandera-0.14.4-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.5/148.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pydantic
  Downloading pydantic-1.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting wrapt
  Downloading wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting multimethod
  Downloading multimethod-1.9.1-py3-none-any.whl (10 kB)
Collecting typing-inspect>=0.6.0
  Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)
Coll

In [6]:
import pandas as pd
import pandera as pa
from pandera import Column, Check

Imagine we have a DataFrame whose columns must satisfy a set of conditions:

In [4]:
# Values each categorical column is allowed to take.
available_fruits = ["apple", "banana", "orange"]  # permitted entries for "name"
nearby_stores = ["Aldi", "Walmart"]  # permitted entries for "store"

# Sample data, written one record per row: (name, store, price).
fruit_rows = [
    ("apple", "Aldi", 2),
    ("banana", "Walmart", 1),
    ("apple", "Walmart", 3),
    ("orange", "Aldi", 4),
]
fruits = pd.DataFrame(fruit_rows, columns=["name", "store", "price"])

We can now check whether our DataFrame satisfies these conditions with `pandera`:

In [7]:
# Describe the expected data: one Column entry per DataFrame column, each
# pairing a dtype with the Check its values must satisfy.
schema = pa.DataFrameSchema(
    {
        "name": Column(str, Check.isin(available_fruits)),  # only known fruit names
        "store": Column(str, Check.isin(nearby_stores)),  # only nearby stores
        "price": Column(int, Check.less_than(4)),  # strictly below 4
    }
)

# The price column contains a value equal to 4, so validation is expected to
# fail. Catch the error and show it, so the notebook still runs top to bottom.
try:
    schema.validate(fruits)
except pa.errors.SchemaError as exc:
    print(f"SchemaError: {exc}")

SchemaError: <Schema Column(name=price, type=DataType(int64))> failed element-wise validator 0:
<Check less_than: less_than(4)>
failure cases:
   index  failure_case
0      3             4

In [9]:
# Same rules as the previous schema, except the price bound is 5 instead of 4.
schema2 = pa.DataFrameSchema(
    columns={
        "name": Column(str, checks=Check.isin(available_fruits)),
        "store": Column(str, checks=Check.isin(nearby_stores)),
        "price": Column(int, checks=Check.less_than(5)),
    }
)

# Every price is below 5, so validation passes and the DataFrame is returned.
schema2.validate(fruits)

Unnamed: 0,name,store,price
0,apple,Aldi,2
1,banana,Walmart,1
2,apple,Walmart,3
3,orange,Aldi,4


We can also apply multiple checks to the same column at the same time:

In [10]:
# A column can carry several checks at once: pass them as a list.
schema3 = pa.DataFrameSchema(
    {
        "name": Column(str, Check.isin(available_fruits)),  # only known fruit names
        "store": Column(str, Check.isin(nearby_stores)),  # only nearby stores
        "price": Column(
            int,
            [
                Check.less_than(5),
                Check.greater_than(4),
            ],
        ),
    }
)

# No integer is both < 5 and > 4, so the greater_than(4) check is expected to
# fail for every row. Catch the error and show it so later cells still run.
try:
    schema3.validate(fruits)
except pa.errors.SchemaError as exc:
    print(f"SchemaError: {exc}")

SchemaError: <Schema Column(name=price, type=DataType(int64))> failed element-wise validator 1:
<Check greater_than: greater_than(4)>
failure cases:
   index  failure_case
0      0             2
1      1             1
2      2             3
3      3             4

We can create a custom check using `lambda` functions:

In [12]:
# Built-in checks can be combined with custom ones written as plain callables.
schema4 = pa.DataFrameSchema(
    {
        "name": Column(str, Check.isin(available_fruits)),  # only known fruit names
        "store": Column(str, Check.isin(nearby_stores)),  # only nearby stores
        "price": Column(
            int,
            [
                Check.less_than(5),
                # Custom check on the column as a whole: the sum of all prices
                # must be lower than 20.
                Check(lambda price: price.sum() < 20),
            ],
        ),
    }
)

# Passes: every price is below 5 and the prices sum to 10, which is below 20.
schema4.validate(fruits)

Unnamed: 0,name,store,price
0,apple,Aldi,2
1,banana,Walmart,1
2,apple,Walmart,3
3,orange,Aldi,4


## References

- [GitHub](https://github.com/unionai-oss/pandera)
- [Documentation](https://pandera.readthedocs.io/en/stable/index.html)
