### Map Area

San Antonio, TX United States

https://mapzen.com/data/metro-extracts/metro/san-antonio_texas/

This map is of San Antonio, the city where my grandparents lived and where I spent the holidays growing up.

### Data Overview

### Problems Encountered in the Map

I noticed one main problem with the data:

Numerous inconsistent spellings and abbreviations of the same street name (e.g. 'North US Highway 281', 'US Highway 281', 'United States Highway 281')

In [45]:
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input extract and the down-sampled file that the sampling script writes.
OSM_FILE = "san-antonio_texas.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

# Sampling stride: only elements whose index is a multiple of k are copied.
k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each completed element whose tag is listed in *tags*.

    After an element has been handed to the caller, the document root is
    cleared so already processed children do not pile up in memory.
    Consume each element before asking for the next one.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    root = next(events)[1]  # the first event is the 'start' of the root element
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build SAMPLE_FILE: an <osm> wrapper around every k-th top-level element
# of OSM_FILE, serialised as UTF-8.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k != 0:
            # Not on the sampling stride; skip it.
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

### Data Set
The `count_tags` function below reports how many nodes, members, tags, ways and other elements the file contains.

In [8]:
def count_tags(filename):
    """Count how often each element tag occurs in an XML document.

    Args:
        filename: Path or file object accepted by ``ET.iterparse``.

    Returns:
        A dict mapping each tag name to the number of elements with that tag.
    """
    tags = {}
    # iterparse reports only 'end' events by default, so every element is
    # seen exactly once, after it has been fully parsed.
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed; drop attributes and children so a
        # large extract does not stay fully resident in memory.
        elem.clear()

    return tags

count_tags(OSM_FILE)

{'bounds': 1,
 'member': 23537,
 'nd': 1479783,
 'node': 1244193,
 'osm': 1,
 'relation': 1718,
 'tag': 751039,
 'way': 144603}

In [52]:
SELECT value, COUNT(*) as num
FROM nodes_tags
WHERE key='amenity'
GROUP BY value
ORDER BY num DESC
LIMIT 10

SyntaxError: invalid syntax (<ipython-input-52-7ec54da9059d>, line 1)

In [9]:
import pprint
import re
# Key made up solely of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any character treated as problematic inside a tag key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a <tag> element and bump its counter.

    The key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first match decides the bucket, and a key matching
    none of them is counted as "other". Elements that are not <tag> leave
    the counters untouched.

    Args:
        element: Parsed XML element.
        keys: Dict holding the "lower", "lower_colon", "problemchars" and
            "other" counters; it is updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    buckets = (
        ("lower", lower),
        ("lower_colon", lower_colon),
        ("problemchars", problemchars),
    )
    for bucket, pattern in buckets:
        if pattern.search(key):
            keys[bucket] += 1
            break
    else:
        # No pattern matched.
        keys["other"] += 1

    return keys



def process_map(filename):
    """Tally the tag-key categories found in an OSM XML file.

    Every parsed element is passed through ``key_type``; the resulting
    counters for "lower", "lower_colon", "problemchars" and "other" are
    returned as a dict.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

In [24]:
process_map(OSM_FILE)

{'lower': 432074, 'lower_colon': 310783, 'other': 8182, 'problemchars': 0}

In [28]:
from collections import defaultdict

# Street types that are already spelled out and therefore not reported by
# the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Interstate Highway", "Farm-to-Market"]

# Abbreviation -> full spelling lookup used when normalising street names.
# NOTE(review): "Hwy"/"Hiwy" also occur in US and state highway names in the
# audit output (e.g. 'US Hwy 281 North', 'S State Hwy 123'); confirm that
# "Interstate Highway" is the intended expansion for them.
mapping = {
    "St": "Street",
    "St.": "Street",
    # "Ste" abbreviates Suite (see 'S Interstate 35 Ste 100'), not Street.
    "Ste": "Suite",
    "Rd.": "Road",
    "Rd": "Road",
    "Ave": "Avenue",
    "Blvd": "Boulevard",
    "Hwy": "Interstate Highway",
    "Hiwy": "Interstate Highway",
    "IH": "Interstate Highway",
    "I-": "Interstate Highway",
    "I-H": "Interstate Highway",
    "Interstate": "Interstate Highway",
    "Dr.": "Drive",
    "Dr": "Drive",
    "FM": "Farm-to-Market",
    "Plz": "Plaza",
}

def audit_street_type(street_types, street_name):
    """Record *street_name* under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` matches in the name.
    Names without a match, or whose type is listed in ``expected``, are
    ignored; otherwise the name is added to ``street_types[type]``.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    found_type = match.group()
    if found_type in expected:
        return
    street_types[found_type].add(street_name)
            

def is_street_name(elem):
    """Return True when the element's ``k`` attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"

In [29]:
def audit(osmfile):
    """Collect street names whose street type is not in ``expected``.

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street type to the set
        of ``addr:street`` values that use it.
    """
    street_types = defaultdict(set)
    # The context manager closes the file even if parsing raises; binary mode
    # hands the raw bytes to the XML parser.
    with open(osmfile, "rb") as osm_file:
        # Listen for 'end' events: on 'start' the children of an element are
        # not guaranteed to be attached yet, so <tag> children could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag not in ("node", "way"):
                continue
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [27]:
audit(OSM_FILE)

defaultdict(set,
            {'100': {'S Interstate 35 Ste 100', 'W Ave #100'},
             '101': {'Dolorosa St #101'},
             '102': {'North Loop 1604 East, Suite 102'},
             '103': {'Broadway St #103', 'Stone Oak Pkwy #103'},
             '105': {'SE Military Drive Ste 105'},
             '109': {'Huebner Rd #109'},
             '1101': {'FM 1101'},
             '111': {'W Loop 1604 N Ste 111'},
             '112': {'US Hwy 281 North, Suite 112'},
             '117': {'La Cantera Parkway Ste 117'},
             '12': {'Old Ranch Road 12', 'RM 12', 'Ranch Road 12'},
             '123': {'Business 123',
              'S State Hwy 123',
              'S. Hiwy 123',
              'South State Highway 123',
              'State Highway 123',
              'TX 123'},
             '1283': {'FM 1283'},
             '131': {'Fredericksburg Rd #131'},
             '132': {'State Highway 132'},
             '1346': {'FM 1346', 'Farm-to-Market Road 1346'},
             '151': {'S

In [30]:
def update_name(name, mapping):
    """Return *name* with its street type replaced by the spelling in *mapping*.

    The street type is the text matched by ``street_type_re``. The name is
    returned unchanged when nothing matches, when the type is already in
    ``expected``, or when *mapping* has no entry for it.

    Args:
        name: Street name to normalise.
        mapping: Dict from abbreviated street type to its replacement.

    Returns:
        The normalised street name.
    """
    m = street_type_re.search(name)
    if m is None:
        # No recognisable street type; nothing to rewrite.
        return name

    street_type = m.group()
    if street_type in expected or street_type not in mapping:
        return name

    # Splice the replacement over the matched span only. Feeding the street
    # type to re.sub would treat it as a pattern ("St." matches "Sta") and
    # would rewrite every occurrence in the name, not just the matched one.
    return name[:m.start()] + mapping[street_type] + name[m.end():]

In [41]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus
import schema

# Input OSM XML file that is converted to CSV.
OSM_PATH = "sample.osm"

# Output CSV files; each has a matching *_FIELDS list below.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Matches keys that begin with lowercase/underscore runs on both sides of a
# colon; only the start of the key is anchored.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Matches any single character treated as problematic in a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the imported `schema` module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [42]:
def _shape_tags(element, attribs, problem_chars, default_tag_type):
    """Return one row dict per usable <tag> child of *element*."""
    rows = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if LOWER_COLON.search(key):
            # "addr:street" -> type "addr", key "street"; only the first
            # colon splits, so later colons stay in the key.
            tag_type, key = key.split(':', 1)
        elif problem_chars.search(key):
            # Skip just this tag; the remaining tags are still wanted.
            continue
        else:
            tag_type = default_tag_type
        rows.append({
            'id': attribs['id'],
            'key': key,
            'value': tag.attrib['v'],
            'type': tag_type,
        })
    return rows


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: Parsed XML element.
        node_attr_fields: Attribute names copied from a <node>.
        way_attr_fields: Attribute names copied from a <way>.
        problem_chars: Compiled regex; a tag whose key matches it is skipped
            unless the key already matched ``LOWER_COLON``.
        default_tag_type: Tag type assigned to keys without a type prefix.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and ``None`` for any other element.
    """
    if element.tag == 'node':
        node_attribs = {name: value for name, value in element.attrib.items()
                        if name in node_attr_fields}
        tags = _shape_tags(element, node_attribs, problem_chars, default_tag_type)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {name: value for name, value in element.attrib.items()
                       if name in way_attr_fields}

        way_nodes = []
        for nd in element.iter('nd'):
            if 'ref' not in nd.attrib:
                continue
            way_nodes.append({
                'id': way_attribs['id'],
                'node_id': nd.attrib['ref'],
                # Zero-based position among the refs collected so far.
                'position': len(way_nodes),
            })

        tags = _shape_tags(element, way_attribs, problem_chars, default_tag_type)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    # Neither a node nor a way: nothing to shape.
    return None
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate every completed element whose tag appears in *tags*."""

    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the 'start' of the root; keep the root so it can be
    # emptied as parsing proceeds.
    _, root = next(parse_events)
    for event, elem in parse_events:
        finished_and_wanted = event == 'end' and elem.tag in tags
        if finished_and_wanted:
            yield elem
            # The caller is done with the element once it asks for the next.
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception naming the first error if element fails the schema.

    Returns None when ``validator.validate(element, schema)`` is True.
    """
    if validator.validate(element, schema) is True:
        return

    # Report only the first entry of the validator's error mapping.
    field, errors = next(validator.errors.iteritems())
    raise Exception(
        "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, pprint.pformat(errors)
        )
    )


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        """Write *row*, encoding each unicode value to UTF-8 first."""
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write each row through writerow() so every row gets encoded."""
        for single_row in rows:
            self.writerow(single_row)



In [44]:
# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)

    Args:
        file_in: OSM XML file to read.
        validate: When truthy, every shaped element is checked with
            ``validate_element`` before it is written.
    """

    # Python 2's csv module needs its files opened in binary mode; in text
    # mode the "\r\n" row terminator can be written as "\r\r\n" on platforms
    # that translate newlines.
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if not el:
                continue

            if validate:
                validate_element(el, validator)

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    # OSM_PATH is "sample.osm" here, and validation is switched on.
    process_map(OSM_PATH, validate=True)
