<img src="../img/emagine-logo.png" alt="Emagine logo" width="300" height="200" />

### Marshmallow

From docs:

marshmallow is an ORM/ODM/framework-agnostic library for converting complex datatypes, such as objects, to and from native Python datatypes.


This library offers object serialization along with validation / deserialization (unlike schema & voluptuous, which are used for validation only).


Docs: https://marshmallow.readthedocs.io/en/stable/

Code: https://github.com/marshmallow-code/marshmallow

PYPI: https://pypi.org/project/marshmallow/

In [2]:
# API Introduction
from marshmallow import Schema, fields

class MySchema(Schema):
    """Minimal schema with a single integer field named ``value``."""

    value = fields.Integer()


# One schema instance is enough; uncomment any call below to try that case.
sch = MySchema()


# sch.load({'value': 'a'}) # invalid -> Raise validation error
# sch.validate({'value': 'a'}) # invalid -> return error context
# sch.load({'value': 5})  # valid -> Return cleaned data

In [3]:
# Single value, multiple types, custom error msg

from marshmallow import ValidationError


class CustomField(fields.Field):
    """Pass-through field that only accepts ``str`` or ``int`` input."""

    def _serialize(self, value, attr, obj, **kwargs):
        # Serialization is deliberately not implemented: this workshop
        # focuses on validation (deserialization) only.
        return None

    def _deserialize(self, value, attr, data, **kwargs):
        """Return ``value`` unchanged if it is a str or int, else raise ValidationError."""
        if isinstance(value, (str, int)):
            return value
        raise ValidationError('Please provide either int or str type')


class MySchema(Schema):
    """Schema whose ``value`` is checked by the str-or-int CustomField."""

    value = CustomField()

    
# Rebind `sch` to an instance of the schema defined in this cell.
sch = MySchema()


# sch.load({'value': 'a'})  # valid (str)
# sch.load({'value': 5})  # valid (int)
# sch.load({'value': 2.5})  # invalid (float is neither str nor int)

In [4]:
# Single value, multiple rules

from marshmallow import validates

# Inclusive lower / upper bounds used by the range validators below.
MIN, MAX = 10, 50


class MySchema(Schema):
    """Schema whose ``value`` must be a strict integer within MIN..MAX (inclusive)."""

    # strict=True rejects non-integral input; strict=False would try to
    # convert to integer instead (10.5 -> 10).
    value = fields.Integer(strict=True)

    @validates('value')
    def validate_range(self, data, **kwargs):
        """Raise ValidationError when ``data`` lies outside MIN..MAX."""
        if MIN <= data <= MAX:
            return
        message = 'Value ({}) must be between {} - {} (inclusive)'.format(data, MIN, MAX)
        raise ValidationError(message)

    
# Rebind `sch` to the strict-integer, range-checked schema of this cell.
sch = MySchema()


# sch.load({'value': 'a'})  # invalid
# sch.load({'value': 9})  # invalid
# sch.load({'value': 10.5})  # invalid
# sch.load({'value': 10})  # valid

In [5]:
# Single value, multiple types, multiple rules


class CustomField(fields.Field):
    """Pass-through field that only accepts ``float`` or ``int`` input."""

    def _deserialize(self, value, attr, data, **kwargs):
        """Return ``value`` unchanged if it is a float or int, else raise ValidationError."""
        if isinstance(value, (float, int)):
            return value
        raise ValidationError('Please provide either int or float type')


def validate_range(value):
    """Raise ValidationError unless ``MIN <= value <= MAX``; otherwise return None."""
    within_bounds = MIN <= value <= MAX
    if within_bounds:
        return
    raise ValidationError(
        'Value ({}) must be between {} - {} (inclusive)'.format(value, MIN, MAX)
    )

    
class MySchema(Schema):
    """Schema combining the float-or-int CustomField with the range check."""

    # The type check lives in CustomField; validate_range is passed as the
    # field's `validate` callable.
    value = CustomField(validate=validate_range)

            
# Rebind `sch` to the float-or-int, range-checked schema of this cell.
sch = MySchema()


# sch.load({'value': 'a'})  # invalid
# sch.load({'value': 9})  # invalid
# sch.load({'value': 10.5})  # valid
# sch.load({'value': 10})  # valid

In [6]:
# List validation (for each item)

class MySchema(Schema):
    """Schema with a list field whose items each go through CustomField + validate_range."""

    values = fields.List(CustomField(validate=validate_range))


# Rebind `sch` to the list-validating schema of this cell.
sch = MySchema()


# sch.load({'values': [10, 15.2, ]}) # valid
# sch.load({'values': [10, 11, 'foo']})  # invalid

In [7]:
# Dictionary validation
import datetime


# Marshmallow's Date field accepts a formatted string as date input, not raw date objects.
# We use a custom field that also accepts date objects, for the sake of similarity to other libs.

class CustomDateField(fields.Date):
    """Date field that also accepts ready-made ``datetime.date`` objects.

    Date objects are returned as-is, ``datetime.datetime`` objects are
    reduced to their date part, and anything else is delegated to
    ``fields.Date`` for deserialization.
    """

    def _deserialize(self, value, attr, data, **kwargs):
        """Return a ``datetime.date`` for ``value``.

        Raises whatever ``fields.Date._deserialize`` raises for input that is
        neither a date nor a datetime object.
        """
        # datetime.datetime is a subclass of datetime.date, so it has to be
        # checked first. Returning a datetime from a Date field would make
        # later date-vs-datetime comparisons raise TypeError.
        if isinstance(value, datetime.datetime):
            return value.date()
        if isinstance(value, datetime.date):
            return value
        return super()._deserialize(value, attr, data, **kwargs)


class MySchema(Schema):
    """Person record: two string fields and two date fields, none marked required."""

    first_name = fields.String()
    last_name = fields.String()
    birth_date = CustomDateField()
    job_start_date = CustomDateField()


# Sample record with every field present; dates are given as date objects.
DATA = dict(
    first_name='John',
    last_name='Smith',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1970, 5, 12),
)


sch = MySchema()

# Load the complete record; the echoed result is shown right below.
sch.load(DATA)

{'birth_date': datetime.date(1970, 5, 12),
 'first_name': 'John',
 'last_name': 'Smith',
 'job_start_date': datetime.date(1970, 5, 12)}

In [8]:
# Default behaviour for missing keys is IGNORE, i.e. any key that's missing will just be skipped

# Same record as before, but with 'last_name' left out.
DATA = {
    'first_name': 'John',
    'birth_date': datetime.date(1970, 5, 12),
    'job_start_date': datetime.date(1970, 5, 12),
}

# `sch` has not been rebound in this cell, so the previous schema instance is reused.
sch.load(DATA)

{'birth_date': datetime.date(1970, 5, 12),
 'first_name': 'John',
 'job_start_date': datetime.date(1970, 5, 12)}

In [9]:
# Required keys

# We can use required=True while declaring the schema to enforce keys

class MySchema(Schema):
    """Person record in which every field is declared with ``required=True``."""

    first_name = fields.String(required=True)
    last_name = fields.String(required=True)
    birth_date = CustomDateField(required=True)
    job_start_date = CustomDateField(required=True)
    
# Record without the 'last_name' key.
DATA_MISSING = dict(
    first_name='John',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1970, 5, 12),
)


# Complete record: a copy of DATA_MISSING with 'last_name' appended.
DATA = {**DATA_MISSING, 'last_name': 'Smith'}

# Rebind `sch` to the all-required schema of this cell.
sch = MySchema()

# sch.load(DATA_MISSING)  # record without 'last_name'
# sch.load(DATA)  # complete record

In [10]:
# Optional keys

class MySchema(Schema):
    """Person record with an optional ``middle_name`` next to four required fields."""

    first_name = fields.String(required=True)

    # missing -> used during deserialization
    # default -> used during serialization
    # https://github.com/marshmallow-code/marshmallow/blob/dev/src/marshmallow/fields.py#L79
    # NOTE(review): newer marshmallow releases appear to rename these to
    # `load_default` / `dump_default` -- confirm against the installed version.
    middle_name = fields.String(missing=None, default=None)

    last_name = fields.String(required=True)
    birth_date = CustomDateField(required=True)
    job_start_date = CustomDateField(required=True)


# Record in which the 'middle_name' key is absent altogether.
DATA_MIDDLE_NAME_MISSING = dict(
    first_name='John',
    last_name='Smith',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1970, 5, 12),
)

# Record in which 'middle_name' is present but explicitly None.
DATA_MIDDLE_NAME_NONE = dict(
    first_name='John',
    middle_name=None,
    last_name='Smith',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1970, 5, 12),
)


# Rebind `sch` to the schema with the optional middle_name.
sch = MySchema()


# sch.load(DATA_MIDDLE_NAME_MISSING)  # key absent
# sch.load(DATA_MIDDLE_NAME_NONE)  # key present with explicit None

In [11]:
# Extra keys

# Default behaviour for extra keys is RAISE (a validation error is raised)
# This behaviour can be configured via:
# 1- Meta options on schema declaration
# 2- during schema instantiation
# 3- during deserialization
# https://marshmallow.readthedocs.io/en/stable/quickstart.html#handling-unknown-fields

from marshmallow import RAISE, INCLUDE, EXCLUDE


# Record carrying two keys ('is_active', 'city') beyond the four person fields.
DATA = dict(
    first_name='John',
    last_name='Smith',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1970, 5, 12),
    is_active=True,
    city='London',
)


class MySchema(Schema):
    """Person record schema that leaves handling of unknown keys unconfigured."""

    first_name = fields.String(required=True)
    # Optional: falls back to None when the key is absent.
    middle_name = fields.String(missing=None, default=None)
    last_name = fields.String(required=True)
    birth_date = CustomDateField(required=True)
    job_start_date = CustomDateField(required=True)

    
class MySchema2(MySchema):
    """Variant of MySchema that keeps unknown keys instead of rejecting them.

    Deriving from MySchema (rather than from a bare Schema) keeps the declared
    fields and their validation, so only the genuinely extra keys are passed
    through by ``unknown = INCLUDE``.
    """

    class Meta:
        # Configure unknown-key handling at declaration time.
        unknown = INCLUDE
            
    
# `sch` leaves unknown-key handling unconfigured; `sch2` sets it to INCLUDE via Meta.
sch = MySchema()
sch2 = MySchema2()

# sch.load(DATA)
# sch2.load(DATA)  # include extra keys
# sch.load(DATA, unknown=EXCLUDE)  # exclude extra keys

In [12]:
# Interdependent field validation

# Some validation rules can take multiple keys into account:
# E.g. job_start_date cannot be earlier than or equal to birth_date


from marshmallow import validates_schema


class MySchema(Schema):
    """Person record with a cross-field rule: the job must start after the birth date."""

    first_name = fields.String(required=True)
    middle_name = fields.String(missing=None, default=None)
    last_name = fields.String(required=True)
    birth_date = CustomDateField(required=True)
    job_start_date = CustomDateField(required=True)

    @validates_schema
    def check_job_start_date(self, data, **kwargs):
        """Raise ValidationError when job_start_date is not later than birth_date."""
        job_start_date = data['job_start_date']
        birth_date = data['birth_date']
        if job_start_date <= birth_date:
            message = 'job_start_date ({}) must be greater than birth_date ({})'.format(
                job_start_date, birth_date)
            raise ValidationError(message)


# Record whose job_start_date (1965) precedes its birth_date (1970).
DATA = dict(
    first_name='John',
    middle_name='Robert',
    last_name='Smith',
    birth_date=datetime.date(1970, 5, 12),
    job_start_date=datetime.date(1965, 5, 12),
)


# Same record with job_start_date moved to 1985, i.e. after the birth date.
DATA_2 = {**DATA, 'job_start_date': datetime.date(1985, 1, 12)}
        
    
# Rebind `sch` to the schema carrying the cross-field date rule.
sch = MySchema()

# sch.load(DATA)  # fail
# sch.load(DATA_2)  # pass

In [13]:
# More thorough validation

# middle_name -> Optional, with explicit None value

# Strings -> 
#     Make sure each letter is alphabetical, with some special character allowance (O'Neill, Day-Lewis)
#     Ensure single spaces between words (some names may have spaces, e.g. 'van der Bellen')
#     Normalize to all caps
#     Trim whitespace
#     Should have length greater than 0 after trimming

# Dates -> Accept both ISO-formatted date strings and Python date objects

In [14]:
class NameField(fields.String):
    """String field that normalizes and validates personal names.

    Deserialization upper-cases the value, collapses runs of spaces into a
    single space, trims surrounding whitespace, and then requires every
    remaining character to be alphabetic or one of SPECIAL_CHARACTERS.
    """

    # Non-alphabetic characters tolerated inside a name (O'Neill, Day-Lewis).
    SPECIAL_CHARACTERS = (' ', "'", "-")

    def _deserialize(self, value, attr, data, **kwargs):
        """Return the cleaned, upper-cased name.

        Raises:
            ValidationError: if the input is rejected by ``fields.String``,
                if nothing is left after cleaning, or if a character is
                neither alphabetic nor in SPECIAL_CHARACTERS.
        """
        # Run the parent String deserialization first so that non-string
        # input is reported as a ValidationError instead of crashing on
        # .split() below.
        value = super()._deserialize(value, attr, data, **kwargs)

        # Splitting on a single space and dropping empty chunks collapses
        # repeated spaces; strip() removes any other surrounding whitespace.
        cleaned = ' '.join(word.upper() for word in value.split(' ') if word).strip()

        if not cleaned:
            raise ValidationError('Invalid value: {}'.format(value))

        for idx, char in enumerate(cleaned):
            if not (char.isalpha() or char in self.SPECIAL_CHARACTERS):
                raise ValidationError('Character ({}) at index {} is not alphabetical'.format(char, idx))

        return cleaned


class EmployeeSchema(Schema):
    """Employee record: normalized names, two dates, and a job-after-birth rule."""

    first_name = NameField(required=True)
    middle_name = NameField(missing=None, default=None)
    last_name = NameField(required=True)
    birth_date = CustomDateField(required=True)
    job_start_date = CustomDateField(required=True)

    @validates_schema
    def check_job_start_date(self, data, **kwargs):
        """Raise ValidationError when job_start_date is not later than birth_date."""
        job_start_date = data['job_start_date']
        birth_date = data['birth_date']
        if job_start_date <= birth_date:
            message = 'job_start_date ({}) must be greater than birth_date ({})'.format(
                job_start_date, birth_date)
            raise ValidationError(message)

            
employee_schema = EmployeeSchema()

# Input mixes letter case, trailing spaces, a date string and a date object;
# the echoed result is shown below.
employee_schema.load({
    'first_name': 'JOHN  ',
    'middle_name': 'robert',
    'last_name': 'SMiTh',
    'birth_date': '1970-5-2',
    'job_start_date': datetime.date(1985, 5, 12),
})


{'birth_date': datetime.date(1970, 5, 2),
 'last_name': 'SMITH',
 'middle_name': 'ROBERT',
 'job_start_date': datetime.date(1985, 5, 12),
 'first_name': 'JOHN'}

In [15]:
# Nested validation
# We can use previously defined schemas for nested validation


class DepartmentSchema(Schema):
    """Department with a required name and a nested list of employees."""

    name = NameField(required=True)
    # many=True: `employees` is a list, each item handled by EmployeeSchema.
    employees = fields.Nested(EmployeeSchema, many=True)


department_schema = DepartmentSchema()

# Two employees: one with date objects, one with date strings.
# Neither supplies a 'middle_name'.
SALES_DEPT = {
    'name': 'Sales',
    'employees': [
        {
            'first_name': 'Barbara',
            'last_name': 'Brown',
            'birth_date': datetime.date(1985, 2, 5),
            'job_start_date': datetime.date(1995, 7, 2),
        },
        {
            'first_name': 'martin',
            'last_name': 'anderson',
            'birth_date': '1979-9-3',
            'job_start_date': '1982-1-1',
        },
    ]
}

# Load the whole department; the echoed result is shown below.
department_schema.load(SALES_DEPT)

{'employees': [{'birth_date': datetime.date(1985, 2, 5),
   'last_name': 'BROWN',
   'middle_name': None,
   'job_start_date': datetime.date(1995, 7, 2),
   'first_name': 'BARBARA'},
  {'birth_date': datetime.date(1979, 9, 3),
   'last_name': 'ANDERSON',
   'middle_name': None,
   'job_start_date': datetime.date(1982, 1, 1),
   'first_name': 'MARTIN'}],
 'name': 'SALES'}