<img src="../img/emagine-logo.png" width=300 height=200/>

### Pydantic

Data validation and settings management using Python type annotations.

pydantic enforces type hints at runtime and provides user-friendly errors when data is invalid.

Docs: https://pydantic-docs.helpmanual.io/

Code: https://github.com/samuelcolvin/pydantic

PyPI: https://pypi.org/project/pydantic/

In [1]:
# API Introduction

from pydantic import BaseModel

# Minimal model: a single field annotated as int, with no default value.
class MyModel(BaseModel):
    value: int


# Active example passes a native int; uncomment one alternative at a time to try it.
model = MyModel(value=123)
# model = MyModel(value='123')  # implicit int conversion
# model = MyModel(value='aaa')  # failure, not valid int

# Show both the stored value and its runtime type.
print('Value: {}, Type: {}'.format(model.value, type(model.value)))

Value: 123, Type: <class 'int'>


In [2]:
# Single value, multiple types
from typing import Union


class MyModel(BaseModel):
    # Union members are tried in declaration order, from leftmost to rightmost:
    # the value is converted to int first, and to str only if that fails.
    value: Union[int, str]
    
    # Alternative declaration: try str first, int second.
    # However anything can be converted to str, so the int conversion would never happen.
    # value: Union[str, int]
    

# Uncomment one line at a time to see how each input is resolved against Union[int, str].
# model = MyModel(value=123)  # use int
model = MyModel(value='123')  # str -> int
# model = MyModel(value='aaa')  # str -> int FAIL, fallback to str

# Show both the stored value and its runtime type.
print('Value: {}, Type: {}'.format(model.value, type(model.value)))


Value: 123, Type: <class 'int'>


In [3]:
# Single value, multiple types, custom error msg

from pydantic import validator

# Inclusive lower/upper bounds applied to integer values by the range validator.
MIN = 10
MAX = 50

class MyModel(BaseModel):
    # Union members are attempted left to right: int first, str as the fallback.
    value: Union[int, str]

    # Reversed order would attempt str first and int second.
    # However anything can be converted to str so int conversion will never happen.
    # value: Union[str, int]

    @validator('value')
    def test_range(cls, v):
        """Reject integers outside [MIN, MAX]; non-integer values pass through unchanged."""
        # Strings (the Union fallback) are deliberately exempt from the range check.
        out_of_range = isinstance(v, int) and (v < MIN or v > MAX)
        if out_of_range:
            raise ValueError("Value ({}) should be between {} - {}".format(v, MIN, MAX))
        return v
    

# Uncomment one alternative at a time to exercise the range validator.
model = MyModel(value=12)  # range valid
# model = MyModel(value='12')  # convert to int -> range valid
# model = MyModel(value=105)  # range invalid
# model = MyModel(value='205')  # convert to int -> range invalid
# model = MyModel(value='aaa')  # fallback to str type, range check skipped


# Show both the stored value and its runtime type.
print('Value: {}, Type: {}'.format(model.value, type(model.value)))


Value: 12, Type: <class 'int'>


In [4]:
# List validation (for each item)

from typing import List


# Inclusive bounds checked against every list item.
MIN = 10
MAX = 50

# Maximum number of items the list may contain.
MAX_LENGTH = 5


class MyModel(BaseModel):
    values: List[int] = []  # allow list of integers, with default empty list value

    @validator('values', each_item=True)
    def check_item_range(cls, v):
        """Validate a single list item: it must lie within [MIN, MAX] (inclusive)."""
        assert MIN <= v <= MAX, "List item ({}) should be between {} - {}".format(v, MIN, MAX)
        return v

    @validator('values')
    def check_list_length(cls, v):
        """Validate the list as a whole: it may hold at most MAX_LENGTH items."""
        # The message reports the configured limit, plus the actual size for context.
        assert len(v) <= MAX_LENGTH, "List cannot have more than {} items (got {})".format(
            MAX_LENGTH, len(v))
        return v
        
        
# Uncomment one alternative at a time to exercise the per-item and whole-list validators.
model = MyModel()  # no value given, using empty list default
# model = MyModel(values=[10, 20, 30])  # valid
# model = MyModel(values=[10, 20, 200])  # check_item_range failure
# model = MyModel(values=[10, 20, 25, 30, 35, 50])  # check_list_length failure


# Show the validated list.
print('Value: {}'.format(model.values))


Value: []


In [5]:
# Dictionary validation
import datetime



# Flat record model: two string fields and two date fields, none with a default value.
class MyModel(BaseModel):
    
    first_name: str
    last_name: str
    birth_date: datetime.date
    job_start_date: datetime.date



# Input dictionary whose keys match the model's field names one-to-one.
DATA = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}


# Unpack the dictionary into keyword arguments to validate it.
model = MyModel(**DATA)

# Export the validated model back to a plain dictionary.
model.dict()

{'first_name': 'John',
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1970, 5, 12)}

In [6]:
# Default behaviour for missing keys is ERROR,
# i.e. any key that is missing and has no default value causes a validation error

from pydantic import ValidationError

# 'last_name' is deliberately commented out to trigger the "field required" error.
DATA = {
    'first_name': 'John',
    #'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

# The failure is the point of this cell, so catch and display it instead of
# letting the exception abort a "Restart & Run All".
try:
    model = MyModel(**DATA)
except ValidationError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))
else:
    print(model.dict())

ValidationError: 1 validation error for MyModel
last_name
  field required (type=value_error.missing)

In [7]:
# Optional keys

from typing import Optional


class MyModel(BaseModel):
    
    first_name: str
    # Optional with a None default, so the key may be left out of the input entirely.
    middle_name: Optional[str] = None
    last_name: str
    birth_date: datetime.date
    job_start_date: datetime.date
        

# Input without 'middle_name': the field falls back to its None default.
DATA_1 = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}


# Same input with 'middle_name' supplied explicitly.
DATA_2 = {
    'first_name': 'John',
    'middle_name': 'Edward',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),    
}

        
        
# Swap the active line to compare both inputs.
model = MyModel(**DATA_1)
# model = MyModel(**DATA_2)
model.dict()

{'first_name': 'John',
 'middle_name': None,
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1970, 5, 12)}

In [8]:
# Extra keys

# Default behaviour for extra keys is IGNORE


# Base model for the extra-key demo; it declares no Config, so the default
# behaviour for extra keys (ignore) applies.
class MyModel(BaseModel):
    
    first_name: str
    middle_name: Optional[str] = None
    last_name: str
    birth_date: datetime.date
    job_start_date: datetime.date
        
    
# 'is_active' is not a field declared on the model: it is the "extra" key of this demo.
DATA = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
    'is_active': True
}

# Extra key behaviour can be configured via Config class: 
# https://pydantic-docs.helpmanual.io/usage/model_config/

# Same fields as MyModel, but undeclared keys such as 'is_active' are allowed.
class MyModel2(MyModel):
    
    class Config:
        extra = 'allow'
        

# Same fields as MyModel, but undeclared keys such as 'is_active' cause a validation error.
class MyModel3(MyModel):

    class Config:
        extra = 'forbid'


# Swap the active line to compare the three extra-key policies on the same input.
model = MyModel(**DATA)  # ignore is_active
# model = MyModel2(**DATA)  # allow is_active
# model = MyModel3(**DATA)  # validation error

model.dict()

{'first_name': 'John',
 'middle_name': None,
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1970, 5, 12)}

In [9]:
# Interdependent field validation

# Some validation rules need to take multiple keys into account,
# e.g. job_start_date must be strictly later than birth_date

# More details on validators: https://pydantic-docs.helpmanual.io/usage/validators/


class MyModel(BaseModel):

    first_name: str
    middle_name: Optional[str] = None
    last_name: str
    # birth_date must be declared before job_start_date so that it is already
    # available in `values` when the job_start_date validator runs.
    birth_date: datetime.date
    job_start_date: datetime.date

    @validator('job_start_date')
    def validate_job_start(cls, v, values, **kwargs):
        """Ensure job_start_date is strictly later than birth_date.

        `values` holds the previously validated fields. If birth_date is
        missing or failed its own validation it is absent from `values`; the
        cross-field check is then skipped, since that error is already
        reported for birth_date itself.

        Raises:
            ValueError: if job_start_date is earlier than or equal to birth_date.
        """
        birth_date = values.get('birth_date')
        if birth_date is None:
            # Avoid a KeyError, which would escape as a crash instead of a validation error.
            return v
        if v <= birth_date:
            raise ValueError(
                'birth_date ({}) must be earlier than job_start_date ({})'.format(
                    birth_date, v))
        return v
    
    
# Invalid input: job_start_date (1965) precedes birth_date (1970).
DATA = {
    'first_name': 'John',
    'middle_name': 'Robert',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1965, 5, 12),
}


# Copy of DATA with job_start_date overridden by a date after birth_date.
DATA_2 = dict(DATA, job_start_date=datetime.date(1985, 1, 12))
        
    
# model = MyModel(**DATA)  # fail
model = MyModel(**DATA_2)  # pass

model.dict()

{'first_name': 'John',
 'middle_name': 'Robert',
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1985, 1, 12)}

In [10]:
# More thorough validation

# middle_name -> Optional; defaults to an explicit None when omitted

# Strings ->
#     Allow only alphabetical characters plus a few special characters (space, apostrophe, hyphen), e.g. O'Neill, Day-Lewis
#     Ensure single spaces between words (some names contain spaces, e.g. 'van der Bellen')
#     Normalize to all caps
#     Trim leading and trailing whitespace
#     Must have a length greater than 0 after trimming

# Dates -> Accept both YYYY-MM-DD formatted strings and Python date objects

In [11]:
class CleanString(str):
    """Custom pydantic field type that normalizes and validates name-like strings.

    Normalization collapses runs of spaces, trims the ends and upper-cases the
    text. The result may contain only alphabetical characters plus the
    characters listed in SPECIAL_CHARACTERS.
    """

    # Non-alphabetical characters that are still allowed (e.g. "O'Neill", "Day-Lewis").
    SPECIAL_CHARACTERS = (' ', "'", "-")

    @classmethod
    def __get_validators__(cls):
        """Yield the validators pydantic should run for this field type."""
        yield cls.clean_string

    @classmethod
    def clean_string(cls, v):
        """Return the normalized, upper-cased form of `v`.

        Raises:
            TypeError: if `v` is not a string.
            ValueError: if nothing is left after trimming, or if a character is
                neither alphabetical nor in SPECIAL_CHARACTERS.
        """
        if not isinstance(v, str):
            # Without this guard a non-string input fails with AttributeError on .split().
            raise TypeError('string required')

        # Dropping the empty chunks produced by split(' ') collapses repeated spaces.
        cleaned = ' '.join(w.upper() for w in v.split(' ') if w).strip()

        if not cleaned:
            raise ValueError('Invalid value: {}'.format(v))

        for idx, char in enumerate(cleaned):
            # Look the allow-list up on the class; the bare name is not defined.
            if not (char.isalpha() or char in cls.SPECIAL_CHARACTERS):
                raise ValueError('Character ({}) at index {} is not alphabetical'.format(char, idx))

        return cleaned

    
class Date(object):
    """Custom pydantic field type accepting date objects or 'YYYY-MM-DD' strings."""

    @classmethod
    def __get_validators__(cls):
        """Yield the validators pydantic should run for this field type."""
        yield cls.parse_date

    @classmethod
    def parse_date(cls, v):
        """Convert `v` to a datetime.date.

        Accepts a datetime.date (returned unchanged), a datetime.datetime
        (reduced to its date part) or a string in YYYY-MM-DD format.

        Raises:
            ValueError: if `v` is not a date and cannot be parsed as YYYY-MM-DD.
        """
        # datetime.datetime is a subclass of datetime.date, so check it first:
        # otherwise a datetime would be returned with its time component.
        if isinstance(v, datetime.datetime):
            return v.date()
        if isinstance(v, datetime.date):
            return v
        try:
            return datetime.datetime.strptime(v, '%Y-%m-%d').date()
        except (TypeError, ValueError) as exc:
            # strptime raises TypeError for non-string input; report it the same way.
            raise ValueError("Incorrect data format, should be YYYY-MM-DD") from exc
    


# Employee record built from the custom field types: CleanString for the names, Date for the dates.
class Employee(BaseModel):
    
    first_name: CleanString
    # Optional with a None default, so the key may be left out of the input.
    middle_name: Optional[CleanString] = None
    last_name: CleanString
    birth_date: Date
    job_start_date: Date
    
    
# Mixed-case names plus one date given as a string and one as a date object,
# to exercise both custom field types.
DATA = {
    'first_name': 'JOHN',
    'middle_name': 'robert',
    'last_name': 'SMiTh',
    'birth_date': '1970-5-2',
    'job_start_date': datetime.date(1985, 5, 12),
}


model = Employee(**DATA)  # pass
model.dict()

{'first_name': 'JOHN',
 'middle_name': 'ROBERT',
 'last_name': 'SMITH',
 'birth_date': datetime.date(1970, 5, 2),
 'job_start_date': datetime.date(1985, 5, 12)}

In [12]:
# Nested validation
# We can use previously defined models for nested validation


# Nested model: a department name plus a list whose items are Employee models.
class Department(BaseModel):
    
    name: CleanString
    employees: List[Employee]


# Nested input: each entry under 'employees' is a plain dict with Employee's field names.
# Dates are given both as date objects and as strings.
SALES_DEPT = {
    'name': 'Sales',
    'employees': [
        {
            'first_name': 'Barbara',
            'last_name': 'Brown',
            'birth_date': datetime.date(1985, 2, 5),
            'job_start_date': datetime.date(1995, 7, 2),
        },
        {
            'first_name': 'martin',
            'last_name': 'anderson',
            'birth_date': '1979-9-3',
            'job_start_date': '1982-1-1',
        },
    ]
}


model = Department(**SALES_DEPT)
model.dict()

{'name': 'SALES',
 'employees': [{'first_name': 'BARBARA',
   'middle_name': None,
   'last_name': 'BROWN',
   'birth_date': datetime.date(1985, 2, 5),
   'job_start_date': datetime.date(1995, 7, 2)},
  {'first_name': 'MARTIN',
   'middle_name': None,
   'last_name': 'ANDERSON',
   'birth_date': datetime.date(1979, 9, 3),
   'job_start_date': datetime.date(1982, 1, 1)}]}