<img src="../img/emagine-logo.png" width="300" height="200" />

### schema

From docs:

schema is a library for validating Python data structures, such as those obtained from config-files, forms, external services or command-line parsing, converted from JSON/YAML (or something else) to Python data-types.

Highlights:

* Single-file library (can be copy-pasted into your repo if pip etc. is not available / allowed)
* Simple API
* (New) Support for validation rule exports via JSON schema.

Docs: https://github.com/keleshev/schema/blob/master/README.rst

Code: https://github.com/keleshev/schema

PyPI: https://pypi.org/project/schema/

In [56]:
# API Introduction
from schema import Schema

# A Schema built from a type checks that the value is an instance of that type.
sch = Schema(int)

# Single value
# sch.validate('a') # invalid
# validate() hands back the validated value, which the notebook echoes below.
sch.validate(5)  # valid

5

In [57]:
# Single value, multiple types, custom error msg

from schema import Or

# Or accepts the value when at least one alternative matches; `error`
# is the message reported when none of them do.
sch = Or(int, str, error='Please provide either int or str type')

# sch.validate('a') # valid (str)
# sch.validate(5)  # valid (int)
# sch.validate(2.5)  # invalid (float is neither int nor str)

In [58]:
# Single value, multiple rules

from schema import And, SchemaError

# Inclusive bounds used by check_interval.
MIN = 10
MAX = 50


def check_interval(value):
    """Validator: accept ``value`` only when MIN <= value <= MAX.

    Returns True for a value inside the inclusive range; otherwise raises
    SchemaError with a message naming the value and both bounds.
    """
    in_range = MIN <= value <= MAX
    if in_range:
        return True
    message = 'Value ({}) must be between {} - {} (inclusive)'.format(value, MIN, MAX)
    raise SchemaError(message)

# And requires every validator to accept the value: here the int type
# check plus the inclusive range check.
sch = And(int, check_interval)

# sch.validate('a')  # invalid (not an int)
# sch.validate(9)  # invalid (below MIN)
# sch.validate(10.5)  # invalid (float, not an int)
# sch.validate(10)  # valid

In [59]:
# Single value, multiple types, multiple rules

# Or widens the accepted types to int or float; the range check still applies.
sch = And(Or(int, float), check_interval)

# sch.validate(10)  # valid
# sch.validate(10.5)  # valid

In [60]:
# List validation (for each item)

# Same rule as above, wrapped in list brackets: every item of the
# validated list has to satisfy it.
sch = Schema([And(Or(int, float), check_interval)])

# sch.validate([10, 11, 12])  # valid
# sch.validate([10, 11, 'foo'])  # invalid ('foo' is neither int nor float)

In [61]:
# Dictionary validation

import datetime

# Each key of the schema maps to the validator its value must satisfy.
sch = Schema({
    'first_name': str,
    'last_name': str,
    'birth_date': datetime.date,
    'job_start_date': datetime.date
})


# Sample record that contains exactly the keys the schema defines.
DATA = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

# Valid: the validated dictionary is returned and echoed below.
sch.validate(DATA)

{'first_name': 'John',
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1970, 5, 12)}

In [62]:
# Default behaviour for missing keys is ERROR,
# i.e. every key defined in a dictionary schema is mandatory

# Same record as before, with 'last_name' deliberately left out.
DATA = {
    'first_name': 'John',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

# sch.validate(DATA)  # invalid: 'last_name' is missing

In [63]:
# Optional keys

from schema import Optional

# Optional -> allows key to be left out

# Record without a 'middle_name' key at all.
DATA_MIDDLE_NAME_MISSING = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

# Record where 'middle_name' is present but explicitly None.
DATA_MIDDLE_NAME_NONE = {
    'first_name': 'John',
    'middle_name': None,
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

sch = Schema({
    'first_name': str,
    Optional('middle_name', default=None): Or(str, None), # If middle_name is missing use None as default value,
                                                          # while also allowing explicit None value
    'last_name': str,
    'birth_date': datetime.date,
    'job_start_date': datetime.date,
})


# sch.validate(DATA_MIDDLE_NAME_MISSING)  # pass (middle_name omitted)
sch.validate(DATA_MIDDLE_NAME_NONE)  # pass (explicit None is accepted)

{'first_name': 'John',
 'middle_name': None,
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1970, 5, 12)}

In [64]:
# Extra keys

# Default behaviour for extra keys is ERROR

sch = Schema({
    'first_name': str,
    'last_name': str,
    'birth_date': datetime.date,
    'job_start_date': datetime.date
})


# Allow extra keys with restrictions:
# any additional str key is accepted as long as its value is a str or a bool

sch2 = Schema({
    'first_name': str,
    'last_name': str,
    'birth_date': datetime.date,
    'job_start_date': datetime.date,
    str: Or(str, bool),
})


# Allow any type of key / value pair
# Very permissive (not recommended)

sch3 = Schema({
    'first_name': str,
    'last_name': str,
    'birth_date': datetime.date,
    'job_start_date': datetime.date,
    object: object,
})



# Record with two keys ('is_active', 'city') that the schemas do not name.
DATA  = {
    'first_name': 'John',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
    'is_active': True,
    'city': 'London',
}

# sch.validate(DATA)   # invalid: extra keys are rejected by default
# sch2.validate(DATA)  # valid: extra str keys with bool / str values
# sch3.validate(DATA)  # valid: any extra key / value pair

In [65]:
# Interdependent field validation

# Some validation rules have to take several keys into account at once,
# e.g. job_start_date must be later than birth_date (earlier or equal is rejected)

def check_job_start_date(data):
    """Cross-field validator: job_start_date must be later than birth_date.

    Reads ``data['job_start_date']`` and ``data['birth_date']``. Returns
    True when the start date is strictly greater than the birth date;
    otherwise raises SchemaError naming both values.
    """
    job_start_date = data['job_start_date']
    birth_date = data['birth_date']
    starts_too_early = job_start_date <= birth_date
    if not starts_too_early:
        return True
    raise SchemaError(
        'job_start_date ({}) must be greater than birth_date ({})'.format(job_start_date, birth_date))

        
# The dict schema checks the individual fields; check_job_start_date is
# listed after it inside And and compares two fields of the same record.
sch = Schema(
    And(
        {
            'first_name': str,
            'last_name': str,
            Optional('middle_name'): str,
            'birth_date': datetime.date,
            'job_start_date': datetime.date,
        },
        check_job_start_date
    )
)

# job_start_date (1965) lies before birth_date (1970).
DATA = {
    'first_name': 'John',
    'middle_name': 'Robert',
    'last_name': 'Smith',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1965, 5, 12),
}


# Copy of DATA with job_start_date moved after birth_date.
DATA_2 = dict(DATA, job_start_date=datetime.date(1985, 1, 12))

# sch.validate(DATA) # fail (job started before birth)
sch.validate(DATA_2)  # pass

{'first_name': 'John',
 'middle_name': 'Robert',
 'last_name': 'Smith',
 'birth_date': datetime.date(1970, 5, 12),
 'job_start_date': datetime.date(1985, 1, 12)}

In [66]:
# More thorough validation

# middle_name -> Optional, with explicit None value

# Strings -> 
#     Make sure each letter is alphabetical, with some special character allowance (O'Neill, Day-Lewis)
#     Ensure single spaces between words (some names may have spaces, e.g. 'van der Bellen')
#     Normalize to all caps
#     Trim whitespace
#     Should have length greater than 0 after trimming

# Dates -> Accept both isodate formatted string as well as python date objects

In [67]:
def clean_string(value, special_characters=(' ', "'", "-")):
    """Normalise a name-like string and check its characters.

    The value is split on the space character, empty pieces are dropped,
    every remaining piece is upper-cased, and the pieces are re-joined with
    single spaces and stripped. The result is returned when it is non-empty
    and every character is alphabetical or listed in ``special_characters``.

    Raises SchemaError when nothing is left after cleaning, or at the first
    character that is neither alphabetical nor an allowed special character.
    """
    non_empty_words = filter(None, value.split(' '))
    cleaned = ' '.join(word.upper() for word in non_empty_words).strip()

    if not cleaned:
        raise SchemaError('Invalid value: {}'.format(value))

    for position, letter in enumerate(cleaned):
        if letter.isalpha() or letter in special_characters:
            continue
        raise SchemaError('Character ({}) at index {} is not alphabetical'.format(letter, position))

    return cleaned


def parse_date(value):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Month and day may be given without zero padding (e.g. ``'1970-5-2'``).

    Raises:
        ValueError: if ``value`` is not a string or does not match the
            expected format. The underlying error is attached as the cause.
    """
    try:
        parsed = datetime.datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError) as err:
        # strptime raises TypeError for non-string input; report it with the
        # same message as a malformed string and keep the original as cause.
        raise ValueError("Incorrect data format, should be YYYY-MM-DD") from err
    return parsed.date()


# schema does not change the input value unless you use "Use" explicitly
# Read more (v 0.7.3): https://github.com/keleshev/schema/blob/master/schema.py#L226

from schema import Use


# Type check first, then replace the value with clean_string's result.
CLEANED_STRING = And(str, Use(clean_string))


# Accept either of:
#    a- A datetime.date value, used directly
#    b- A date string, which parse_date converts to a date value

DATE = Or(
    datetime.date,
    Use(parse_date),
)


# The inner dict schema cleans / parses the individual fields;
# check_job_start_date is listed after it and compares the two dates.
employee_schema = Schema(
    And(
        Schema({
            'first_name': CLEANED_STRING,
            'last_name': CLEANED_STRING,
            Optional('middle_name', default=None): Or(CLEANED_STRING, None),
            'birth_date': DATE,
            'job_start_date': DATE,
        }),
        check_job_start_date,
    ),
)

# Mixed-case names and a string birth date: the echoed result below shows
# upper-cased names and the string converted to a date.
employee_schema.validate({
    'first_name': 'JOHN',
    'middle_name': 'robert',
    'last_name': 'SMiTh',
    'birth_date': '1970-5-2',
    'job_start_date': datetime.date(1985, 5, 12),
})


{'first_name': 'JOHN',
 'middle_name': 'ROBERT',
 'last_name': 'SMITH',
 'birth_date': datetime.date(1970, 5, 2),
 'job_start_date': datetime.date(1985, 5, 12)}

In [68]:
# Nested validation
# We can use previously defined schemas for nested validation

# Every entry of 'employees' is validated with employee_schema.
department_schema = Schema({
    'name': CLEANED_STRING,
    'employees': [employee_schema]
})


# Two employees without 'middle_name'; the second uses string dates.
SALES_DEPT = {
    'name': 'Sales',
    'employees': [
        {
            'first_name': 'Barbara',
            'last_name': 'Brown',
            'birth_date': datetime.date(1985, 2, 5),
            'job_start_date': datetime.date(1995, 7, 2),
        },
        {
            'first_name': 'martin',
            'last_name': 'anderson',
            'birth_date': '1979-9-3',
            'job_start_date': '1982-1-1',
        },
    ]
}

# The echoed result below shows cleaned names, parsed dates and the
# middle_name default (None) filled in for both employees.
department_schema.validate(SALES_DEPT)

{'name': 'SALES',
 'employees': [{'first_name': 'BARBARA',
   'last_name': 'BROWN',
   'birth_date': datetime.date(1985, 2, 5),
   'job_start_date': datetime.date(1995, 7, 2),
   'middle_name': None},
  {'first_name': 'MARTIN',
   'last_name': 'ANDERSON',
   'birth_date': datetime.date(1979, 9, 3),
   'job_start_date': datetime.date(1982, 1, 1),
   'middle_name': None}]}