In [2]:
import re
from functools import partial

def _semantic_validity_factory(value, semantic_name, null_values=frozenset(('Unspecified', 'N/A', '')), **kwargs):
    """Classify ``value`` against one semantic type and judge its validity.

    Intended to be specialised with ``functools.partial``: bind
    ``semantic_name`` and the check callables once, then call the resulting
    function with just ``value``.

    Parameters
    ----------
    value
        The raw cell value to classify. ``None`` is treated as a null value.
    semantic_name
        Label returned as the first tuple element when ``value`` belongs to
        this semantic type.
    null_values
        Container of placeholder values that mean "no data". Membership is
        tested with ``in``; the default is an immutable ``frozenset`` so the
        shared default can never be mutated by accident.
    **kwargs
        Named predicates taking ``value``. Every keyword whose name contains
        ``'semantic'`` decides whether ``value`` is of this type; every
        keyword whose name contains ``'valid'`` decides whether it is
        well-formed. Results are only tested for truthiness, so regex
        ``match`` methods work directly.

    Returns
    -------
    tuple
        ``(None, None)`` when ``value`` is null or fails any semantic check,
        ``(semantic_name, 'VALID')`` when all semantic and validity checks
        pass, and ``(semantic_name, 'INVALID')`` when the semantic checks
        pass but at least one validity check fails.
    """
    # Missing cells must short-circuit here: handing None to a regex matcher
    # or to len() would raise TypeError instead of reporting "no type".
    if value is None or value in null_values:
        return (None, None)

    semantic_checks = (check for name, check in kwargs.items() if 'semantic' in name)
    if not all(check(value) for check in semantic_checks):
        # Not this semantic type at all, so validity is never evaluated.
        return (None, None)

    validity_checks = (check for name, check in kwargs.items() if 'valid' in name)
    if all(check(value) for check in validity_checks):
        return (semantic_name, 'VALID')
    return (semantic_name, 'INVALID')

In [70]:
# Semantic type: 'school_region' (the original note left its index blank).
# Semantic check: the value starts with "Alternative", "Region" or "District".
# Validity check: at least one character, then whitespace, then one of
# 1-9, 10, 75 or "Superintendency" at the end of the value.
school_region_semantic = re.compile(r'(Alternative|Region|District)')
school_region_valid = re.compile(r'.+\s[1-9]$|.+\s10$|.+\s75$|.+\sSuperintendency$')
school_region_args = dict(
    semantic_match=school_region_semantic.match,
    valid_check=school_region_valid.match,
)

is_school_region = partial(
    _semantic_validity_factory,
    semantic_name='school_region',
    **school_region_args
)

In [9]:
# Semantic type: 'school_num' (original note: index 29 -- presumably the
# source column index; confirm against the dataset).
# Semantic check: an optional leading letter from B/M/Q/R/X, 1-3 digits, then
# either a word boundary, a capital letter plus up to two digits, or a hyphen
# plus 2-3 digits followed by a word boundary or capital letter.
# Validity check: the whole value is at most 8 characters long.
school_num_semantic = re.compile(r'([BMQRX]|\b)\d{1,3}(\b|[A-Z]\d{0,2}|\-\d{2,3}(\b|[A-Z]))')
school_num_args = dict(
    semantic_match=school_num_semantic.match,
    valid_check=lambda x: len(x) <= 8,
)

is_school_number = partial(
    _semantic_validity_factory,
    semantic_name='school_num',
    **school_num_args
)

In [11]:
# Semantic type: 'phone_num' (the original note left its index blank).
# Semantic check: the value starts with one of the prefixes 212, 718 or 917.
# Validity check: the value is exactly ten digits.
phone_num_semantic = re.compile(r'(212|718|917)')
# `.match` only anchors at the start, so the pattern must anchor the end
# itself; otherwise any value that merely begins with ten digits (extra
# digits, trailing junk) would be reported as VALID. `\Z` is used rather than
# `$` so a trailing newline is rejected as well.
phone_num_valid = re.compile(r'\d{10}\Z')
phone_num_args = {'semantic_match': phone_num_semantic.match,
                  'valid_check': phone_num_valid.match}

is_phone_number = partial(_semantic_validity_factory, semantic_name='phone_num', **phone_num_args)

In [4]:
# Semantic type: 'address_name'.
# Semantic check: one or more runs of word characters, commas, periods or
# hyphens, each optionally followed by spaces. Because `.match` only anchors
# at the start, a value passes as soon as its first character is in that set.
# NOTE(review): trailing characters outside the set are therefore accepted --
# confirm whether a full-string match was intended.
# Validity check: always passes; any value of this type is considered valid.
address_name_semantic = re.compile(r'([\w\,\.\-]+ *)+')
address_name_args = dict(
    semantic_match=address_name_semantic.match,
    valid_check=lambda x: True,
)

is_address_name = partial(
    _semantic_validity_factory,
    semantic_name='address_name',
    **address_name_args
)

In [5]:
# Starts with word characters, so the semantic check passes; the validity check always passes.
is_address_name('Abe Lincoln')

('address_name', 'VALID')

In [7]:
# Still VALID: the semantic regex is applied with `.match`, so the leading "Other" is enough
# and the trailing ':' is never examined.
is_address_name('Other:')

('address_name', 'VALID')

In [12]:
# Matches the school-number pattern, but is 9 characters long and the validity limit is 8.
is_school_number('M058-0189')

('school_num', 'INVALID')

In [74]:
# Starts with "Region" (semantic match), but "16" is not one of the accepted endings
# (1-9, 10, 75, "Superintendency").
is_school_region('Region 16')

('school_region', 'INVALID')

In [72]:
# Starts with "Alternative" and ends with " Superintendency", so both checks pass.
is_school_region('Alternative Superintendency')

('school_region', 'VALID')

In [73]:
# Starts with "District" and ends with " 75", one of the accepted endings.
is_school_region('District 75')

('school_region', 'VALID')

In [12]:
# Starts with the accepted prefix 212 and consists of ten digits.
is_phone_number('2126541236')

('phone_num', 'VALID')

In [13]:
# Prefix 314 is not one of 212/718/917, so the semantic check fails and no type is assigned.
is_phone_number('3146541236')

(None, None)

In [14]:
# Starts with 212 (semantic match), but the space interrupts the run of ten digits.
is_phone_number('212654123 6')

('phone_num', 'INVALID')