# Use Cerberus to validate the schema of CSV files

Install Cerberus first (run in a terminal): `conda install -c conda-forge cerberus`

In [1]:
# import pandas and cerberus Validator
import pandas as pd
from cerberus import Validator

In [2]:
# GL_Detail sample CSV from the AICPA Audit Data Analytics Python example repository
GL_DETAIL_URL = 'https://raw.githubusercontent.com/AICPA-AuditDataAnalytics2018/ADS---Python-Example-/master/samples/data/GL_Detail_YYYYMMDD_YYYYMMDD.csv'

# Download the CSV into a DataFrame and preview the first rows
df = pd.read_csv(GL_DETAIL_URL)
df.head()

Unnamed: 0,Journal_ID,Journal_ID_Line_Number,JE_Line_Description,Business_Unit_Code,Effective_Date,Fiscal_Year,GL_Account_Number,Amount,Amount_Credit_Debit_Indicator,Amount_Currency,JE_Header_ Description,Source,Entered_By,Document_Date,Entered_Date,Entered_Time,Period
0,100000000,1,Postkosten ohne Tel.,9900.0,19000101,2007,473000,9770.52,S,USD,,SA,STEINER,20070101,20070122,101205,1
1,100000000,2,,,19000101,2007,113100,9770.52,H,USD,,SA,STEINER,20070101,20070122,101205,1
2,100000001,1,Reisekst./Unterkunft,9900.0,19000101,2007,474210,5875.2,S,USD,,SA,STEINER,20070101,20070122,101206,1
3,100000001,2,,,19000101,2007,113100,5875.2,H,USD,,SA,STEINER,20070101,20070122,101206,1
4,100000002,1,,9900.0,19000101,2007,474211,244.8,S,USD,,SA,STEINER,20070101,20070122,101206,1


In [21]:
# Validation rules (Cerberus schema) for the columns we want to check:
#   Amount            - float
#   Journal_ID        - integer, at least 1,000,000
#   Effective_Date    - integer written as YYYYMMDD, from 19000101 up to today
#   GL_Account_Number - integer between 1000 and 999999
# NOTE(review): the earlier note on this cell gave 10000 as the
# GL_Account_Number minimum while the code used 1000. The code value is kept
# here - confirm which bound is intended.

# The Effective_Date upper bound is computed when the cell runs, so it does not
# go stale the way the previously hard-coded 20200101 did.
today_yyyymmdd = int(pd.Timestamp.today().strftime('%Y%m%d'))

schema = {
    'Amount': {'type': 'float'},
    'Journal_ID': {'type': 'integer', 'min': 1000000},
    'Effective_Date': {'type': 'integer', 'min': 19000101, 'max': today_yyyymmdd},
    'GL_Account_Number': {'type': 'integer', 'min': 1000, 'max': 999999}
}

In [22]:
# Build the validator from the schema. Fields that are not listed in the
# schema are accepted (allow_unknown), and every field that is listed must be
# present in each document (require_all).
v = Validator(schema, allow_unknown=True, require_all=True)

In [23]:
# Quick check: a record that only carries Amount should be reported as
# missing the other schema fields.
partial_record = {'Amount': 1.5}
v.validate(partial_record)
v.errors

{'Effective_Date': ['required field'],
 'GL_Account_Number': ['required field'],
 'Journal_ID': ['required field']}

In [24]:
# Turn the DataFrame into a list holding one {column: value} dict per row
df_dict = df.to_dict('records')

In [25]:
# Check every row against the schema; rows that pass are skipped and the
# validator's error report is printed for each row that fails.
for idx, row in enumerate(df_dict):
    if v.validate(row):
        continue
    print(f'Item {idx}: {v.errors}')