In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
- attributes in the CREATED array should be added under a key "created"
- attributes for latitude and longitude should be added to a "pos" array,
for use in geospacial indexing. Make sure the values inside "pos" array are floats
and not strings.
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
process it in a way that you feel is best. For example, you might split it into a two-level
dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
the tag should be ignored, for example:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
should be turned into:
{...
"address": {
"housenumber": 5158,
"street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}
- for "way" specifically:
<nd ref="305896090"/>
<nd ref="1719825889"/>
should be turned into
"node_refs": ["305896090", "1719825889"]

In [10]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

In [11]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
startswithaddr = re.compile(r'\Aaddr:')
afteraddr = re.compile(r':.+$')
afteraddr2 = re.compile(r'[a-zA-Z+$]')


CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
pos = []

mapping = { "Ave": "Avenue",
            "Pl": "Place",
            "St": "Street",
            "Steet": "Street",
            "ave": "Avenue"
            }

#Checks to see if the element is a street name
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

#This function will update the street name if necessary to the correct one.
def update_name(name, mapping):
    m = street_type_re.search(name)
    o = street_name_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping.keys():
            name = mapping[street_type]
            if o:
                firstname = o.group()
                
    return firstname + " " + name

#This processes the file
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [12]:
def shape_element(element):
    node = {}
    
    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag
        if 'lon' in element.attrib:
            # treat geo attribs values
            node.update({'pos' : [element.attrib['lon'], element.attrib['lat']]})
        
        for attr in element.attrib:
            
            if attr in ['lat', 'lon']:
                pass # already treated
            elif attr in CREATED:
                node.setdefault('created', {})[attr] = element.attrib[attr]
            else:   
                node[attr] = element.attrib[attr]
        
        for tag in element.iter("tag"):
            # treat child tags
            m = startswithaddr.search(tag.attrib['k'])
            if m:
                m = m.group()
                o = afteraddr.search(m)
                if o:
                    o = o.group()
                    if is_street_name(tag):
                        input1 = update.name(tag.attrib['v'], mapping)
                    else: 
                        input1 = tag.attrib['v']
                        node.update({"Address" : {o : input1}})
                else: node.update({tag.attrib['k'] : tag.attrib['v']})
        
        for tag in element.iter("nd"):
            #treat nd childs
            #node.update({'node_ref' : [element.attrib['nd']]})
            
        # here you can print your element to check if it is ok
        return node
    else:
        return None

In [13]:
process_map('wallawalla.osm')

KeyError: 'nd'