## Pydantic Tutorial

> `Pydantic` can be applied to, but is not limited to:

1. Data modelling (Type checking).
2. User-friendly error messages
3. Field customization
4. Data validation (custom validators)

#### [Source](https://towardsdatascience.com/8-reasons-to-start-using-pydantic-to-improve-data-parsing-and-validation-4f437eae7678)

In [1]:
import numpy as np
import pandas as pd
from pydantic import BaseModel, validator, ValidationError
from typing import Optional
from pprint import pprint

# Custom import
from src.data_manager import load_data

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600
%load_ext lab_black

%load_ext autoreload
%autoreload 2

### 1. Data modelling (Type checking).

In [2]:
# Create a model (class) that will be used for validating data
class Person(BaseModel):
    """Data modelling a person object."""

    # All five fields are required: none of them declares a default value.
    name: str
    age: int
    gender: str
    role: str
    # Declared as float, so an int input such as 45_000 is stored as 45000.0.
    salary: float

In [3]:
# Create a Person object
person_1 = Person(
    name="John Doe", age=20, gender="Male", role="ML Engineer", salary=45_000
)

# Pydantic coerces each input to the declared field type,
# e.g. salary=45_000 (an int) was passed in, but the stored value is 45000.0 (a float).
person_1

Person(name='John Doe', age=20, gender='Male', role='ML Engineer', salary=45000.0)

In [4]:
# You can access the attributes
person_1.name

'John Doe'

In [5]:
# You can get the available attributes and methods by running: dir(person_1)
# Some popular methods: copy, dict, json, schema, schema_json

# Convert to json
person_1.json()

'{"name": "John Doe", "age": 20, "gender": "Male", "role": "ML Engineer", "salary": 45000.0}'

In [6]:
# Convert to dict
person_1.dict()

{'name': 'John Doe',
 'age': 20,
 'gender': 'Male',
 'role': 'ML Engineer',
 'salary': 45000.0}

In [7]:
# Get the schema
person_1.schema()

{'title': 'Person',
 'description': 'Data modelling a person object.',
 'type': 'object',
 'properties': {'name': {'title': 'Name', 'type': 'string'},
  'age': {'title': 'Age', 'type': 'integer'},
  'gender': {'title': 'Gender', 'type': 'string'},
  'role': {'title': 'Role', 'type': 'string'},
  'salary': {'title': 'Salary', 'type': 'number'}},
 'required': ['name', 'age', 'gender', 'role', 'salary']}

In [8]:
#### Combine Data Models
class Department(BaseModel):
    # Required.
    role: str
    # Optional: the default is spelled out instead of relying on pydantic v1's
    # implicit "Optional without a default means None" rule, so omitting the
    # salary keeps working regardless of how the library treats bare Optionals.
    salary: Optional[float] = None
    # Required.
    department: str


class Person(BaseModel):
    """Data modelling a person object."""

    name: str
    age: int
    gender: str
    # Nested model: the value of this field is itself a Department object.
    department: Department

In [9]:
person_1 = Person(
    name="John Doe",
    age="30",  # passed as a str; the output below shows it stored as the int 30
    gender="Male",
    # `salary` is omitted here and shows up as None in the resulting object.
    department=Department(role="Data Engineer", department="Data Team"),
)

person_1

Person(name='John Doe', age=30, gender='Male', department=Department(role='Data Engineer', salary=None, department='Data Team'))

### 2. User-friendly error messages

In [10]:
class Person(BaseModel):
    """Data modelling a person object."""

    name: str
    # Must be a valid integer; the call below passes "hello" to trigger an error.
    age: int
    gender: str
    role: str
    salary: float


# age="hello" is not a valid integer, so a ValidationError is expected here.
try:
    person_2 = Person(
        name="Jane Doe",
        age="hello",
        gender="Female",
        role="ML Engineer",
        salary=45_000,
    )
except ValidationError as exc:
    # Print the structured (JSON) form of the validation error.
    print(exc.json())

[
  {
    "loc": [
      "age"
    ],
    "msg": "value is not a valid integer",
    "type": "type_error.integer"
  }
]


### 3. Field customization

In [11]:
from pydantic import Field


class Person(BaseModel):
    """Data modelling a person object."""

    name: str
    # `ge` / `le` constrain the accepted range of the field.
    age: int = Field(ge=1, le=100)  # inclusive bounds: 1 <= age <= 100
    gender: str
    role: str
    salary: float

In [12]:
person_2 = Person(
    name="John Doe", age=5, gender="Male", role="ML Engineer", salary=45_000
)

person_2

Person(name='John Doe', age=5, gender='Male', role='ML Engineer', salary=45000.0)

In [13]:
(person_2.salary / 7)

6428.571428571428

### 4. Data validation (custom validators)

* Useful when you receive user inputs

In [14]:
import string


class Person(BaseModel):
    """Data modelling a person object."""

    name: str
    password: str
    gender: str

    @validator("password")
    def validate_password(cls, value: str) -> str:
        """Validate the strength of the `password` field.

        The password must be at least 8 characters long and contain at least
        one ASCII upper-case letter, one ASCII lower-case letter, one digit
        and one punctuation character (as defined by the `string` module).

        Params:
            value (str): The candidate password.

        Returns:
            value (str): The unchanged password when it passes every check.

        Raises:
            ValueError: If the password is too short or misses one of the
                required character classes.
        """
        min_length = 8
        # The error messages deliberately do NOT echo the password: validation
        # errors are routinely printed, logged or sent back to a client, and a
        # secret must never end up there.
        if len(value) < min_length:
            raise ValueError(f"password must have a minimum length of {min_length}")

        required_classes = (
            string.ascii_uppercase,
            string.ascii_lowercase,
            string.digits,
            string.punctuation,
        )
        # Every character class must be represented by at least one character.
        if all(any(char in chars for char in value) for chars in required_classes):
            return value

        raise ValueError(
            "password must contain at least one upper case, "
            "one lower case, one digit and one punctuation"
        )

In [15]:
# Start from None so that a failed validation cannot leave `person_3` undefined
# (NameError) or bound to a stale object from an earlier run of this cell.
person_3 = None
try:
    person_3 = Person(name="Jane Doe", password="abcDefghi2@", gender="Female")
except ValidationError as e:
    print(e)

person_3

Person(name='Jane Doe', password='abcDefghi2@', gender='Female')

### Use Case

In [16]:
class InputSchema(BaseModel):
    # Schema for a single record (row) of the input data.
    # Only PassengerId and Survived are non-Optional.
    PassengerId: int
    Survived: int
    # NOTE(review): the loaded DataFrame reports Pclass, SibSp and Parch as int64
    # while they are declared as str here — confirm this is intentional.
    Pclass: Optional[str]
    Name: Optional[str]
    Sex: Optional[str]
    # NOTE(review): the loaded DataFrame reports Age as float64; confirm that
    # validating it as an int is intended.
    Age: Optional[int]
    SibSp: Optional[str]
    Parch: Optional[str]
    Ticket: Optional[str]
    Fare: Optional[float]
    Cabin: Optional[str]
    Embarked: Optional[str]


class ValidateInputSchema(BaseModel):
    # Wrapper model: `inputs` is a list of records, each validated as an InputSchema.
    inputs: list[InputSchema]


def validate_input(
    *,
    data: pd.DataFrame,
) -> tuple[Optional[pd.DataFrame], Optional[str]]:
    """Validate a DataFrame against `ValidateInputSchema`.

    Params:
        data (Pandas DF): DF to validate; it is copied, not modified.

    Returns:
        data (Pandas DF or None): DF rebuilt from the validated records,
            or None when validation fails.
        error (str or None): JSON string describing the validation errors,
            or None when validation succeeds.
    """
    # Work on a copy and swap NaNs for None before turning rows into records.
    records = data.copy().replace({np.nan: None}).to_dict(orient="records")

    try:
        model = ValidateInputSchema(inputs=records)
    except ValidationError as exc:
        # Validation failed: no data, only the JSON error report.
        return (None, exc.json())

    # Validation passed: rebuild the DF from the validated records.
    validated_rows = model.dict().get("inputs")
    return (pd.DataFrame(data=validated_rows), None)

In [17]:
# Load Data
data = load_data(filename="../../data/titanic_train.csv")

data.head(2)

Shape of data: (891, 12)

Duration: 0.005 seconds


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C


In [18]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
# Validate the data and fail loudly if it does not match the schema: on failure
# `validated_data` is None, so calling `.head()` on it would otherwise raise an
# unrelated AttributeError and hide the real validation error.
validated_data, error = validate_input(data=data)
if error is not None:
    raise ValueError(f"Input validation failed:\n{error}")
validated_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
validated_data.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp           object
Parch           object
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

#### Create Data With The Wrong Schema!

In [21]:
# Two passenger records; the first one carries a deliberately malformed age.
new_data = pd.DataFrame(
    data=[
        {
            "PassengerId": 35,
            "Survived": 0,
            "Pclass": 1,
            "Name": "Meyer, Mr. Edgar Joseph",
            "Sex": "male",
            "Age": "25.IO",  # invalid age!
            "SibSp": 1,
            "Parch": 0,
            "Ticket": "PC 17604",
            "Fare": 82.1708,
            "Cabin": None,
            "Embarked": "C",
        },
        {
            "PassengerId": 36,
            "Survived": 0,
            "Pclass": 1,
            "Name": "Holverson, Mr. Alexander Oskar",
            "Sex": "male",
            "Age": 42.0,
            "SibSp": 1,
            "Parch": 0,
            "Ticket": "113789",
            "Fare": 52.0,
            "Cabin": None,
            "Embarked": "S",
        },
    ]
)

new_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,35,0,1,"Meyer, Mr. Edgar Joseph",male,25.IO,1,0,PC 17604,82.1708,,C
1,36,0,1,"Holverson, Mr. Alexander Oskar",male,42.0,1,0,113789,52.0,,S


In [22]:
pprint(validate_input(data=new_data))

(None,
 '[\n'
 '  {\n'
 '    "loc": [\n'
 '      "inputs",\n'
 '      0,\n'
 '      "Age"\n'
 '    ],\n'
 '    "msg": "value is not a valid integer",\n'
 '    "type": "type_error.integer"\n'
 '  }\n'
 ']')
