# In [7]:
"""Code based on quizzes and exercises from Data Wrangling with MongoDB"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Input: OpenStreetMap XML extract that is audited and then corrected.
OSMFILE = "vancouver_canada.osm"
# Output: corrected copy of the extract, written by update_osm().
MY_XML = "my_vancouver.xml"

# Captures the street type at the end of a street name: a word boundary, one
# or more non-whitespace characters, an optional period, then end of string.
# Because of the leading word boundary, a name ending in '#3305' yields '3305'.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types accepted as-is; any other trailing token is reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Alley", "Broadway", "Crescent", "Highway", "Mews", 
            "Walk", "Way", "Mall", "Kingsway"]

# 'mapping' only covers the unexpected street types that 'audit' reported for
# this data set; keys are the text to replace, values are the replacements.
mapping = {"Ave" : "Avenue",
           "Blvd" : "Boulevard",
           "Rd." : "Road",
           "St" : "Street",
           "St." : "Street",
           "Vancouver" : ""} # 'Vancouver' is appended to some street names and is not needed, so it is removed.


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` finds at the end of
    ``street_name``. Names with no match, or whose type is listed in
    ``expected``, are ignored.

    Args:
        street_types: mapping of street type -> set of street names; it is
            updated in place (``audit`` passes a ``defaultdict(set)``).
        street_name: the street name to inspect.
    """
    match = street_type_re.search(street_name)
    if not match:
        return

    suffix = match.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether ``elem`` is a street-address tag.

    Args:
        elem: an element exposing an ``attrib`` mapping that contains a 'k' key.

    Returns:
        True when the 'k' attribute is exactly "addr:street", otherwise False.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect street names whose trailing street type is not in ``expected``.

    Streams the OSM XML file and, for every ``node`` and ``way`` element,
    hands the value of each ``addr:street`` tag to ``audit_street_type``.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street type to the set
        of full street names in which it was found.
    """
    street_types = defaultdict(set)

    # Binary mode leaves decoding to the XML parser (it honours the encoding
    # declared in the file); the with-block guarantees the handle is closed.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: on "start" an element's children may not
        # have been parsed yet, so some <tag> children could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    """Return ``name`` with every abbreviation listed in ``mapping`` replaced.

    A key is replaced wherever it occurs as a stand-alone token, i.e. not
    directly preceded or followed by a letter, digit or underscore. So 'St'
    in 'Stanley St' only matches the final token, and 'Ave' never matches
    inside 'Avenue'. Matching is case-sensitive and keys are taken literally
    (the '.' in 'St.' is a real period, not a regex wildcard).

    After replacement, leading/trailing whitespace is removed and internal
    runs of whitespace are collapsed to one space, so deleting a token
    (mapping it to '') leaves no stray blanks.

    Args:
        name: the street name to clean.
        mapping: dict of text to find -> replacement text.

    Returns:
        The cleaned street name.
    """
    # Longest keys first so that 'St.' is tried before its prefix 'St';
    # empty keys are skipped because they would match everywhere.
    keys = sorted((key for key in mapping if key), key=len, reverse=True)
    if keys:
        alternatives = "|".join(re.escape(key) for key in keys)
        # Look-arounds instead of \b because keys may end in a non-word
        # character such as '.', where \b would demand a following letter.
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)',
                             re.UNICODE)
        # A function replacement keeps backslashes in the replacement text
        # literal, and all keys are handled in a single pass, so the result
        # does not depend on dict ordering.
        name = pattern.sub(lambda found: mapping[found.group(0)], name)

    return " ".join(name.split())


def update_osm():
    """Audit ``OSMFILE``, write a corrected copy to ``MY_XML`` and re-audit it.

    Steps:
      1. Audit the original file and pretty-print the unexpected street types.
      2. Compute a corrected name for every flagged street name with
         ``update_name`` and the module-level ``mapping``.
      3. Rewrite matching ``addr:street`` tag values on ``node``/``way``
         elements and save the tree to ``MY_XML``.
      4. Audit the written file and pretty-print what is still unexpected.
    """
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))

    # Original name -> corrected name for every street name the audit flagged.
    corrections = {}
    for names in st_types.values():
        for name in names:
            corrections[name] = update_name(name, mapping)

    # Create new XML file with corrections:
    # Ref: 19.7.1.4. Modifying an XML File
    #      https://docs.python.org/2/library/xml.etree.elementtree.html
    tree = ET.parse(OSMFILE)
    root = tree.getroot()

    # One pass over the tree with a dict lookup, instead of rescanning the
    # whole document once per flagged name.
    for child in root:
        if child.tag not in ("node", "way"):
            continue
        for tag in child.iter("tag"):
            if not is_street_name(tag):
                continue
            better_name = corrections.get(tag.attrib['v'])
            if better_name is not None:
                tag.set('v', better_name)

    tree.write(MY_XML)

    # Verify changes:
    st_types_after = audit(MY_XML)
    pprint.pprint(dict(st_types_after))

# Run the audit-and-correct workflow only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    update_osm()

{'3305': set(['Expo Blvd, #3305']),
 'Ave': set(['West 3rd Ave', 'West 41st Ave']),
 'Blvd': set(['University Blvd']),
 'Broughton': set(['Broughton']),
 'East': set(['29th Avenue East']),
 'Jarvis': set(['Jarvis']),
 'Jervis': set(['Jervis']),
 'Pender': set(['463 West Pender']),
 'Rd.': set(['Boundary Rd.']),
 'South': set(['East Kent Avenue South']),
 'St': set([' Beatty St', 'Robson St', 'Whitchurch St', 'Yew St']),
 'St.': set(['Mainland St.', 'Seymour St.']),
 'Terminal': set(['Station Terminal']),
 'Vancouver': set(['Howe St. Vancouver', 'W. Hastings St. Vancouver'])}
{'3305': set(['Expo Boulevard, #3305']),
 'Broughton': set(['Broughton']),
 'East': set(['29th Avenue East']),
 'Jarvis': set(['Jarvis']),
 'Jervis': set(['Jervis']),
 'Pender': set(['463 West Pender']),
 'South': set(['East Kent Avenue South']),
 'Terminal': set(['Station Terminal'])}
