In [277]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus
import schema
from collections import defaultdict

In [296]:
# Input files: the OSM extract and a sample file used for draft runs
# (presumably a smaller cut of the same extract - confirm).
OSM_FILE = "./dbprep/redmond_bell_kirk.xml"
SAMPLE_FILE = "./dbprep/rbk_samp.xml"
# FILE is the input handed to process_map(). Point it at SAMPLE_FILE for a
# draft run or at OSM_FILE for the production cut.
# FILE = SAMPLE_FILE
FILE = OSM_FILE
# Validation schema from the project-local `schema` module; it is the default
# schema argument of validate_element().
SCHEMA = schema.schema
# Output files: one CSV per table, all (re)written by process_map().
NODES_PATH = "./dbprep/nodes.csv"
NODE_TAGS_PATH = "./dbprep/nodes_tags.csv"
WAYS_PATH = "./dbprep/ways.csv"
WAY_NODES_PATH = "./dbprep/ways_nodes.csv"
WAY_TAGS_PATH = "./dbprep/ways_tags.csv"

In [279]:
# Pre-compiled regular expressions (pattern objects, not functions)

# Whole string consists only of lowercase letters and underscores (may be empty).
lower_re = re.compile(r'^([a-z]|_)*$')
# Two such lowercase/underscore parts separated by exactly one colon.
lower_colon_re = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character treated as a problem when it appears in a tag key.
problemchars_re = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Phone number in four groups: three digits starting with 2-9, three digits,
# four digits and an optional trailing run of digits, each separated by any
# non-digits. re.VERBOSE makes the literal spaces in the pattern insignificant.
# The match is anchored at the end of the string only.
phone_re = re.compile(r''' ([2-9][0-9][0-9])\D*(\d{3})\D* (\d{4})\D*(\d*)$''', re.VERBOSE)
# Five consecutive digits; used with search(), so the first such run anywhere
# in the value is captured.
zcode_re = re.compile(r'''(\d{5})''')

In [280]:
# Column lists for the five output CSVs. process_map() passes them to the CSV
# writers as field names and shape_element() uses them as its default lists of
# attributes/fields to extract.
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Scratch containers. NOTE(review): none of the functions visible in this
# notebook reference them - presumably left over from exploration; confirm
# before removing.
tag_freq = defaultdict(int)
tag_key = defaultdict(int)
elem_tag = {}
elem_tags = []
street_types = defaultdict(set)

# Words that standard_street_name() accepts unchanged (default `expst`).
expected = ["Redmond","Kirkland","Woodinville","Willows","Sammamish","Juanita",
            "Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway","Park", "Commons","Way","Circle",
            "North","East","West","Northeast","Center","South","Central","Northwest",
           "Parkplace","Totem","Lake","Bear","Creek","Leary","Forbes","Railroad","Hill","Union","River",
           "State","Cedar","Ohde","Slater","Lakeview","Avondale","Observation","Bridlewood","Old","Cleveland","Alexander"]

# Abbreviation / misspelling -> standard word, consulted by
# standard_street_name() (default `mapst`). Extend it when that function
# reports a word with "Please handle".
mapping = { "St": "Street",
            "St.": "Street",
            "st": "Street",
            "ST": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "AVE": "Avenue",
            "ave": "Avenue",
            "Rd": "Road",
            "Rd.": "Road",
           "NE" : "Northeast",
           "ne" : "Northeast",
           "Ln" : "Lane",
           "LN" : "Lane",
           "WY" : "Way",
           "Pl" : "Place",
           "PL" : "Place",
           "Ct" :"Court",
           "Dr" :"Drive",
           "Remond" : "Redmond",
           "Northest":"Northeast"           
            }

In [281]:
def standard_street_name(name, mapst=mapping,expst=expected):
    '''Return `name` with abbreviated or misspelled words standardised.

    The street name is split on whitespace and every word is handled in turn:

      * a word listed in `expst` is kept as it is;
      * otherwise a word that is a key of `mapst` is replaced by its mapped
        value. The lookup is done for every word, so keys that contain
        punctuation such as "St." or "Ave." are honoured too;
      * any other purely alphabetic word is reported on stdout with
        "Please handle" so that it can be added to `mapst` or `expst`, and is
        kept unchanged rather than being dropped from the result;
      * everything else (house numbers, "148th", ...) is kept unchanged.

    Args:
        name:  street name as found in the data file.
        mapst: dict mapping unconventional words to their standard form.
        expst: collection of words that are already acceptable.

    Returns:
        The cleaned words joined by single spaces.
    '''
    cleaned_words = []
    for word in name.split():
        if word in expst:
            cleaned_words.append(word)
        elif word in mapst:
            cleaned_words.append(mapst[word])
        else:
            if word.isalpha():
                # Unknown word: flag it for review, but never lose data.
                print("%s  Please handle ->  %s" % (name, word))
            cleaned_words.append(word)
    return ' '.join(cleaned_words).strip()

In [282]:
def standard_phone_num(phnum, pphone_re =phone_re):
    '''Return `phnum` formatted as NNN-NNN-NNNN[-extension], or 'Error'.

    Args:
        phnum:     phone number text from the data file.
        pphone_re: compiled pattern whose first three groups are the parts of
                   the number and whose fourth group is the (possibly empty)
                   extension.

    Returns:
        The groups joined with '-', with the extension appended only when it
        is not empty; the string 'Error' when the pattern does not match.
    '''
    found = pphone_re.search(phnum)
    if not found:
        return 'Error'

    groups = found.groups()
    pieces = [groups[0], groups[1], groups[2]]
    if groups[3] != '':  # an extension follows the number
        pieces.append(groups[3])
    return '-'.join(pieces)

In [283]:
def standard_zip(pcode, pzcode_re =zcode_re):
    '''Return the zip code captured from `pcode`, or 'Error'.

    Args:
        pcode:     postal code text from the data file.
        pzcode_re: compiled pattern whose first group captures the zip code.

    Returns:
        The first group of the first match found by searching `pcode`, or the
        string 'Error' when the pattern does not match at all.
    '''
    found = pzcode_re.search(pcode)
    return found.groups()[0] if found else 'Error'
    

In [305]:
def any_foreign_char(s):
    '''Return True when the string `s` contains any non-ASCII character.'''
    # reference https://stackoverflow.com/questions/27084617/detect-strings-with-non-english-characters-in-python
    try:
        # Decoding the UTF-8 bytes as ASCII only succeeds for pure-ASCII text.
        s.encode(encoding='utf-8').decode('ascii')
        return False
    except UnicodeDecodeError:
        return True

In [304]:
def skip_tag_parse(tag_key,tag_value,
                   pproblem_chars_re=problemchars_re,pphone_re=phone_re):
    '''Decide whether a tag must be left out of the CSV output, and why.

    Four independent checks are made:

      1. the key contains a character matched by `pproblem_chars_re`;
      2. the key contains "name:" and the value has non-ASCII characters
         (a plain "name" key is not checked);
      3. the key is exactly 'phone' (so e.g. "source:phone" is not checked)
         and the value cannot be standardised with `pphone_re`;
      4. the key contains "postcode" and no zip code can be extracted from
         the value.

    Args:
        tag_key:           the tag's 'k' attribute.
        tag_value:         the tag's 'v' attribute.
        pproblem_chars_re: compiled pattern for problem characters in keys.
        pphone_re:         compiled phone pattern, forwarded to
                           standard_phone_num().

    Returns:
        A list of five booleans:
        [skip_tag, is_tag_key_problem, is_foreign_code_problem,
         is_phone_problem, is_zip_problem]
        where skip_tag is True when at least one of the checks failed.
    '''
    is_tag_key_problem = bool(pproblem_chars_re.search(tag_key))

    is_foreign_code_problem = "name:" in tag_key and any_foreign_char(tag_value)

    # Forward the caller's pattern so that a custom pphone_re is honoured.
    is_phone_problem = (tag_key == 'phone' and
                        standard_phone_num(tag_value, pphone_re) == 'Error')

    is_zip_problem = ("postcode" in tag_key and
                      standard_zip(tag_value) == 'Error')

    skip_tag = (is_tag_key_problem or is_foreign_code_problem or
                is_phone_problem or is_zip_problem)

    # return the list of flags so callers can inspect which test flagged the tag
    return [skip_tag, is_tag_key_problem, is_foreign_code_problem,
            is_phone_problem, is_zip_problem]
    
    
    

In [286]:
def _clean_tag_value(key, value):
    """Standardise the value of a street, phone or postcode tag; pass others through."""
    if "street" in key:
        return standard_street_name(value)
    if key == 'phone':
        return standard_phone_num(value)
    if "postcode" in key:
        return standard_zip(value)
    return value


def _shape_tag(tag, parent_id, tag_fields, default_tag_type):
    """Build the row dict for one <tag> child, or return None when it must be skipped."""
    full_key = tag.attrib['k']
    raw_value = tag.attrib['v']

    # skip_tag_parse()[0] is the overall "do not load this tag" flag.
    if skip_tag_parse(full_key, raw_value)[0]:
        return None

    # "addr:street" -> type "addr", key "street"; keys without a colon (or with
    # nothing before it) get the default type.
    if ":" in full_key:
        tag_type, short_key = full_key.split(":", 1)
    else:
        tag_type, short_key = '', full_key

    row = {'id': parent_id, 'key': short_key, 'type': tag_type or default_tag_type}
    if 'value' in tag_fields:
        row['value'] = _clean_tag_value(full_key, raw_value)

    # Only the requested fields are emitted; unknown field names are ignored.
    return {field: row[field] for field in tag_fields if field in row}


def _shape_way_node(nd, way_id, position, nodes_fields):
    """Build the row dict for one <nd> child of a way."""
    row = {'id': way_id, 'node_id': nd.attrib['ref'], 'position': position}
    return {field: row[field] for field in nodes_fields if field in row}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  node_tag_fields=NODE_TAGS_FIELDS,wy_tag_fields=WAY_TAGS_FIELDS,wy_nodes_fields=WAY_NODES_FIELDS,
                  default_tag_type='regular'):
    """Clean and shape a node or way XML element into a Python dict.

    While shaping, every <tag> child is processed as follows:
      a) tags whose key has problem characters are discarded;
      b) "name:" tags whose value has non-ASCII characters are discarded;
      c) 'phone' and postcode tags that cannot be standardised are discarded;
      d) street names, phone numbers and postcodes are massaged into their
         standard format;
      e) a key such as "addr:street" is split at the first colon into the tag
         type ("addr") and the key ("street"); tags without a type get
         `default_tag_type`.

    Args:
        element:          an ElementTree element.
        node_attr_fields: attributes copied from a node element.
        way_attr_fields:  attributes copied from a way element.
        node_tag_fields:  fields emitted for each tag of a node.
        wy_tag_fields:    fields emitted for each tag of a way.
        wy_nodes_fields:  fields emitted for each <nd> of a way.
        default_tag_type: type given to tags whose key has no "type:" prefix.

    Returns:
        {'node': {...}, 'node_tags': [...]} for a node,
        {'way': {...}, 'way_nodes': [...], 'way_tags': [...]} for a way,
        None for any other element.

    Raises:
        KeyError: if the element lacks one of the requested attributes.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        # The node id links every tag row back to its parent node.
        node_id = node_attribs.get('id', '')

        node_tags = []
        for child in element:
            if child.tag == 'tag':
                tag_row = _shape_tag(child, node_id, node_tag_fields, default_tag_type)
                if tag_row is not None:
                    node_tags.append(tag_row)
        return {'node': node_attribs, 'node_tags': node_tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        way_id = way_attribs.get('id', '')

        way_tags = []
        way_nodes = []
        for child in element:
            if child.tag == 'tag':
                tag_row = _shape_tag(child, way_id, wy_tag_fields, default_tag_type)
                if tag_row is not None:
                    way_tags.append(tag_row)
            elif child.tag == 'nd':
                # position is the 0-based index of the <nd> within its way
                way_nodes.append(
                    _shape_way_node(child, way_id, len(way_nodes), wy_nodes_fields))
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': way_tags}

    return None

In [287]:
def get_element(pfile, tags=('node', 'way', 'relation')):
    """Lazily yield every completely parsed element whose tag is in `tags`.

    The element is only guaranteed to be intact until the next one is
    requested: the document root is cleared after each yield so that already
    processed elements do not pile up under it.
    """
    parser_events = ET.iterparse(pfile, events=('start', 'end'))
    # The very first event is the 'start' of the document root.
    root = next(parser_events)[1]
    for kind, node in parser_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()

In [288]:

# ================================================== #
#               Helper Functions                     #
# ================================================== #


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception describing the first error if `element` does not match `schema`.

    `validator` is expected to offer validate(document, schema) and an
    `errors` mapping, as the cerberus Validator used by process_map() does.
    """
    if validator.validate(element, schema) is True:
        return

    # Only the first reported field is included in the message.
    field, errors = next(validator.errors.iteritems())
    raise Exception(
        "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter that accepts unicode values by encoding them as UTF-8.

    `object` is listed as a base so that super() can be used.
    """

    def writerow(self, row):
        """Write one row dict; unicode values are UTF-8 encoded, others pass through."""
        encoded_row = {}
        for column, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded_row[column] = value
        super(UnicodeDictWriter, self).writerow(encoded_row)

    def writerows(self, rows):
        """Write every row dict of `rows` through writerow()."""
        for single_row in rows:
            self.writerow(single_row)


In [276]:

# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write it to the csv(s).

    Every node and way of `file_in` is shaped with shape_element() and written
    to the five output files named by NODES_PATH, NODE_TAGS_PATH, WAYS_PATH,
    WAY_NODES_PATH and WAY_TAGS_PATH. Existing files are overwritten.

    Args:
        file_in:  the OSM XML input, handed to get_element().
        validate: when exactly True, every shaped element is checked with
                  validate_element(), which raises on the first mismatch.
    """
    # The files are opened in binary mode: the Python 2 csv module writes its
    # own '\r\n' line endings, and text mode would turn them into '\r\r\n'
    # (blank rows) on platforms that translate newlines.
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if not el:
                continue

            if validate is True:
                validate_element(el, validator)

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])
            

if __name__ == '__main__':
    # Validation makes the run roughly 10x slower, so FILE should name a small
    # sample of the map while validating during development.
    process_map(file_in=FILE, validate=True)

In [306]:
def element_counts(file_in):
    '''Print how many nodes, ways and child elements the OSM file contains.

    For nodes and for ways the report shows the number of elements, the number
    of distinct ids, the number of child elements per child tag ('tag' for
    nodes, 'tag' and 'nd' for ways) and how many of the 'tag' children
    skip_tag_parse() would discard. The figures are meant to be compared with
    the row counts that end up in the database.

    Args:
        file_in: the OSM XML input, handed to get_element().
    '''
    stats = {}
    for kind in ('node', 'way'):
        stats[kind] = {'freq': defaultdict(int), 'ids': set(),
                       'children': defaultdict(int), 'skipped': 0}

    for element in get_element(file_in, tags=('node', 'way')):
        counts = stats[element.tag]
        counts['freq'][element.tag] += 1
        counts['ids'].add(element.attrib['id'])
        for child in element:
            if child.tag == 'tag':
                counts['children'][child.tag] += 1
                if skip_tag_parse(child.attrib['k'], child.attrib['v'])[0]:
                    counts['skipped'] += 1
            elif child.tag == 'nd' and element.tag == 'way':
                counts['children'][child.tag] += 1

    rule = '******************************************************************************'
    node, way = stats['node'], stats['way']

    print(rule)
    print('********* Count of Node Elements based on the OSM File ************')
    print(rule)
    print("Node Frequeceny is : %s" % (node['freq'],))
    print("")
    print("Unique node count is : %s" % (len(node['ids']),))
    print("")
    print("Count of Tag elemnt under Node is : %s" % (node['children'],))
    print("")
    print("Count of Tag elements under Node that are filerted out is: %s" % (node['skipped'],))

    print("")
    print(rule)
    # This banner introduces the Way statistics (it used to repeat "Node").
    print('********* Count of Way Elements based on the OSM File ************')
    print(rule)
    print("Way Frequeceny is : %s" % (way['freq'],))
    print("")
    print("Unique Way count is : %s" % (len(way['ids']),))
    print("")
    print("Count of Tag/nd elemnt under Way is : %s" % (way['children'],))
    print("")
    print("Count of Tag elements under Way that are filerted out is: %s" % (way['skipped'],))


In [302]:
# Report element and tag counts for the OSM extract named by OSM_FILE.
element_counts(OSM_FILE)

******************************************************************************
********* Count of Node Elements based on the OSM File ************
******************************************************************************
Node Frequeceny is : defaultdict(<type 'int'>, {'node': 328395})

Unique node count is : 328395

Count of Tag elemnt under Node is : defaultdict(<type 'int'>, {'tag': 66620})

Count of Tag elements under Node that are filerted out is: 18

******************************************************************************
********* Count of Node Elements based on the OSM File ************
******************************************************************************
Way Frequeceny is : defaultdict(<type 'int'>, {'way': 35416})

Unique Way count is : 35416

Count of Tag/nd elemnt under Way is : defaultdict(<type 'int'>, {'tag': 130992, 'nd': 360231})

Count of Tag elements under Way that are filerted out is: 1


In [234]:
def test_street_phone_zip_anomaly(ifile):
    '''Tester: print the street, phone and postcode anomalies found in `ifile`.

    Every <tag> element of the file is inspected:
      * street values are run through standard_street_name(), which reports
        the words it does not know (these are ironed out by extending the
        mapping dictionary and the expected list);
      * phone values that cannot be standardised are printed;
      * postcode values that differ from their standardised form are printed.
    '''
    for event, elem in ET.iterparse(ifile):
        if elem.tag != 'tag':
            continue

        key = elem.attrib['k']
        if "street" in key:
            # called only for the anomalies it prints; the result is not needed
            standard_street_name(elem.attrib['v'])
        if "phone" in key:
            phone = standard_phone_num(elem.attrib['v'])
            if phone == 'Error':
                print("%s  :  %s  ->  %s" % (key, elem.attrib['v'], phone))
        if "postcode" in key:
            zipcode = standard_zip(elem.attrib['v'])
            if zipcode != elem.attrib['v']:
                print("%s  :  %s  ->  %s" % (key, elem.attrib['v'], zipcode))


In [None]:
# List street, phone and postcode anomalies in the OSM extract named by OSM_FILE.
test_street_phone_zip_anomaly(OSM_FILE)

In [303]:
def test_skip_parse(ifile):
    '''Tester: print every tag that would be discarded when writing the CSVs.

    For each <tag> child of a node or way, the flag list returned by
    skip_tag_parse() is printed together with the tag's key and value whenever
    the tag is flagged to be skipped. A <tag> lacking the 'k' or 'v' attribute
    is reported with its attributes and then ignored.
    '''
    for event, elem in ET.iterparse(ifile):
        if elem.tag not in ('node', 'way'):
            continue
        for child in elem:
            if child.tag != 'tag':
                continue
            try:
                skip_data = skip_tag_parse(child.attrib['k'], child.attrib['v'])
            except KeyError:
                print("%s %s" % (child.tag, child.attrib))
                # Nothing was computed for this tag, so do not fall through to
                # the check below with a stale (or undefined) skip_data.
                continue
            if skip_data[0]:
                print("%s %s  ->  %s" % (skip_data, child.attrib['k'], child.attrib['v']))
                        



In [236]:
# List the tags of the OSM extract named by OSM_FILE that are discarded on load.
test_skip_parse(OSM_FILE)

[True, False, True, False, False] name:ar  ->  ريدموند
[True, False, True, False, False] name:bg  ->  Редмънд
[True, False, True, False, False] name:fa  ->  ردموند
[True, False, True, False, False] name:he  ->  רדמונד
[True, False, True, False, False] name:ja  ->  レドモンド
[True, False, True, False, False] name:ko  ->  레드먼드
[True, False, True, False, False] name:ru  ->  Редмонд
[True, False, True, False, False] name:ta  ->  ரெட்மாண்ட்
[True, False, True, False, False] name:uk  ->  Редмонд
[True, False, True, False, False] name:zh  ->  雷德蒙德
[True, False, True, False, False] name:ar  ->  كيركلاند
[True, False, True, False, False] name:bg  ->  Къркланд
[True, False, True, False, False] name:fa  ->  کرکلند
[True, False, True, False, False] name:ja  ->  カークランド
[True, False, True, False, False] name:ko  ->  커클랜드
[True, False, True, False, False] name:zh  ->  柯克蘭
[True, False, False, False, True] addr:postcode  ->  W Lake Sammamish Pkwy NE
[True, False, False, True, False] phone  ->  +1-425-569-