In [27]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
from collections import defaultdict

import cerberus

import schema

OSM_PATH = "sample.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tag.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements
    
  
    #creating keys: values for dictionarys
    if element.tag == 'node':
        for item in NODE_FIELDS:
            node_attribs[item] = element.get(item)
        for child in element:
            tag_dict = {}
            colon = child.get('k').find(':')
            if (child.tag == 'tag'):
                tag_dict['id'] = element.get('id')
                
                if child.attrib['k'] == "addr:street":
                    if child.attrib['v'] == "NE 2nd Ave":
                        print child.attrib['v']
                    tag_dict['value'] = update_name(child.attrib['v'], mapping)
                    if child.attrib['v'] == "NE 2nd Ave":
                        print "correct: {0}".format(tag_dict['value'])       
                        tag_dict['value'] = tag_dict['value']
                        
                if child.attrib['k'] == "addr:postcode":
                    tag_dict['value'] = update_postcode(child.attrib['v'])
                
                else:
                    tag_dict['value'] = (child.attrib['v'])
                
                if (colon != -1):
                    type_value = child.get('k')[:colon]
                    key_value = child.get('k')[colon+1:]
                    tag_dict['type'] = type_value
                    tag_dict['key'] = key_value
                else:
                    tag_dict['key'] = child.get('k')
                    tag_dict['type'] = 'regular'
                tags.append(tag_dict)
        return {'node': node_attribs, 'node_tags': tags}
   
    elif element.tag == 'way':
        for field in WAY_FIELDS:
            # way holds top level way attributes
            way_attribs[field] = element.attrib[field]
            
        # index labeling what order the nd tag appears within the way element
        position = 0
            
        for child in element:
            if child.tag == 'tag':
                if PROBLEMCHARS.match(child.attrib["k"]):
                    continue
                
                second_tag_dict = {}
                second_tag_dict['id'] = element.attrib['id']
                
                if child.attrib['k'] == "addr:street":
                    second_tag_dict['value'] = update_name(child.attrib['v'], mapping)
                        
                if child.attrib['k'] == "addr:postcode":
                    second_tag_dict['value'] = update_postcode(child.attrib['v'])
                
                else:
                    second_tag_dict['value'] = (child.attrib['v'])
                
                if LOWER_COLON.match(child.attrib["k"]):
                    second_tag_dict['type'] = child.attrib["k"].split(":",1)[0]
                    second_tag_dict['key'] = child.attrib["k"].split(":",1)[1]
                else:
                    second_tag_dict["type"] = "regular"
                    second_tag_dict["key"] = child.attrib["k"]
                tags.append(second_tag_dict)
            
            elif child.tag == 'nd':
                way_nodes_dict = {}
                way_nodes_dict['id'] = element.attrib['id']
                way_nodes_dict['node_id'] = child.attrib['ref']
                way_nodes_dict['position'] = position
                position += 1
            
                way_nodes.append(way_nodes_dict) 
                
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

#replace street names not in "expected" by mapping key to value in mapping 
def update_name(name, mapping):
#courtesy stackoverflow http://stackoverflow.com/questions/2400504/easiest-way-to-replace-a-string-using-a-dictionary-of-replacements
#    pattern = re.compile(r'\b(' + '|'.join(mapping.keys()) + r')\b')
#    name = pattern.sub(lambda x: mapping[x.group()], name)
    m = street_type_re.search(name)
    street_type = m.group()
    if street_type not in expected: 
        if street_type in mapping.keys(): 
            new_street_type = mapping[street_type]
            name = name.replace(street_type, new_street_type)
    
    return name

#Ensure all postcode are 5 digits only
def update_postcode(postcode):
    postcode = postcode.upper()
    if ' ' not in postcode:
        if len(postcode) != 5:
            postcode = postcode[0:5]  
    return postcode
            
#Cleaning and updating Street Names Functions
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue", 
            "Rd.": "Road", 
            "Rd": "Road", 
            "N.": "North", 
            "N": "North", 
            "Blvd.": "Boulevard", 
            "Blvd": "Boulevard",
            "S.": "South",
            "S": "South",
            "W.": "West",
            "W": "West",
            "E.": "East",
            "E": "East",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Pl.": "Place",
            "Pl": "Place",
            "Sq": "Square",
            "Sq.": "Square",
            "Ln": "Lane",
            "Ln.": "Lane",
            "Trl": "Trail",
            "Trl.": "Trail",
            "Pkwy": "Parkway",
            "Pkwy.": "Parkway",
            "Crt": "Court",
            "Crt.": "Court",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            }

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':

    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)

NE 2nd Ave
correct: NE 2nd Avenue
