In [1]:
"""Code based on quizzes and exercises from Data Wrangling with MongoDB"""
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

# Attributes of a <node>/<way> element that shape_element groups under the
# "created" key of the resulting document.
CREATED = [
    "version",
    "changeset",
    "timestamp",
    "user",
    "uid",
]

# Matches a string made up solely of lowercase letters and underscores
# (the empty string matches too).
lower = re.compile(r'^([a-z]|_)*$')

# Matches two such lowercase/underscore runs joined by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

# Matches any character that disqualifies a tag key; shape_element ignores
# tags whose key contains one of these.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Keys that shape_element itself writes into a document.  A <tag> whose "k"
# equals one of them must not be copied to the top level: it would overwrite
# that field (e.g. k="type" replacing "node"/"way"), and a string stored under
# "address" or "node_refs" would make the later update()/append() call fail.
_STRUCTURAL_KEYS = frozenset(
    ['created', 'pos', 'id', 'type', 'visible', 'address', 'node_refs'])


def shape_element(element):
    """Convert an OSM XML <node> or <way> element into a JSON-ready dict.

    The returned dict has this layout:

    - "created":   the attributes listed in CREATED that the element has.
    - "pos":       [lat, lon] as floats; a missing coordinate is None.
    - "id":        the element's "id" attribute.
    - "type":      the element's tag name ("node" or "way").
    - "visible":   the "visible" attribute, or None when absent.
    - "address":   present when at least one child <tag> key starts with
                   "addr:"; holds the "addr:<field>" tags keyed by <field>.
    - "node_refs": present when the element has <nd> children; lists their
                   "ref" attributes in document order.
    - every other child <tag> is stored as a top-level key/value pair.

    Child tags are ignored when their key contains a character matched by
    problemchars, when an "addr:" key has more than one colon, or when the
    key collides with one of the structural keys above.

    Args:
        element: an ElementTree element.

    Returns:
        The shaped dict, or None when the element is neither a node nor a way.

    Raises:
        KeyError: if "id" is missing, a <tag> lacks "k" (or "v" when it is
            stored), or an <nd> lacks "ref".
        ValueError: if "lat" or "lon" cannot be converted to float.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    attrib = element.attrib
    node = {}
    node['created'] = {key: attrib[key] for key in CREATED if key in attrib}
    node['pos'] = [
        float(attrib[axis]) if axis in attrib else None
        for axis in ('lat', 'lon')
    ]
    node['id'] = attrib['id']
    node['type'] = element.tag
    node['visible'] = attrib.get('visible')

    for child in element:
        if child.tag == 'nd':
            # Node references of a way, kept in document order.
            node.setdefault('node_refs', []).append(child.attrib['ref'])
        elif child.tag == 'tag':
            key = child.attrib['k']
            if key.startswith('addr:'):
                # "address" is created as soon as any addr:* key is seen,
                # even if that particular key ends up ignored below, so the
                # dict may legitimately stay empty.
                address = node.setdefault('address', {})
                if problemchars.search(key):
                    continue
                parts = key.split(':')
                # A second ":" separates the type/direction of a street;
                # such keys are ignored.
                if len(parts) == 2:
                    address[parts[1]] = child.attrib['v']
            elif not problemchars.search(key) and key not in _STRUCTURAL_KEYS:
                # Any other key (with or without ":") becomes a plain field.
                node[key] = child.attrib['v']
    return node


def process_map(file_in):
    """Shape every node/way of an OSM XML file and dump the result as JSON.

    The file is parsed incrementally; each element is passed to
    shape_element and the non-None results are collected.  The whole list is
    then written as a single JSON array to "<file_in>.json".

    Args:
        file_in: path of the OSM XML file to read.

    Returns:
        The list of shaped documents that was written to the JSON file.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el is not None:
                data.append(el)
                # iterparse keeps every parsed element attached to the tree.
                # Once a node/way has been shaped, its attributes and
                # children are no longer needed, so release them to keep
                # memory from growing with the size of the input.  Only
                # shaped elements are cleared: clearing a child <tag>/<nd>
                # at its own end event would wipe it before its parent is
                # processed.
                element.clear()
        # Using json.dumps to create the json file with the entire data array:
        # Ref: http://stackoverflow.com/questions/21525328/python-converting-a-list-of-dictionaries-to-json
        fo.write(json.dumps(data))
    return data

def transform():
    """Convert 'my_vancouver.xml' to JSON and print a sample of the result.

    Prints the first two shaped documents, then pretty-prints the first ten
    documents that carry an 'address' field.
    """
    # This file has the street names corrections performed on
    # 'improve_street_names'.
    data = process_map('my_vancouver.xml')

    # Single-argument parenthesised form: prints the same on Python 2 and is
    # also valid syntax on Python 3.
    print(data[0:2])  # Print two first elements

    # Print 10 elements with 'address' field
    remaining = 10
    for doc in data:
        if 'address' not in doc:
            continue
        pprint.pprint(doc)
        remaining -= 1
        if remaining == 0:
            break
            
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    transform()

[{'id': '25250662', 'visible': None, 'type': 'node', 'pos': [49.1978055, -123.1026634], 'created': {'uid': '135851', 'changeset': '8895101', 'version': '17', 'user': 'z-dude', 'timestamp': '2011-08-01T20:35:13Z'}}, {'id': '25251429', 'visible': None, 'type': 'node', 'pos': [49.1950895, -123.1397932], 'created': {'uid': '1855067', 'changeset': '20096615', 'version': '13', 'user': 'Drdul', 'timestamp': '2014-01-20T02:17:23Z'}}]
{'address': {'housenumber': '6088', 'street': 'South Campus Road'},
 'barrier': 'gate',
 'bicycle': 'yes',
 'created': {'changeset': '20153277',
             'timestamp': '2014-01-23T03:19:05Z',
             'uid': '1891976',
             'user': 'AdamWill',
             'version': '3'},
 'foot': 'yes',
 'id': '263593340',
 'pos': [49.249748, -123.2345472],
 'type': 'node',
 'visible': None}
{'address': {'housenumber': '6182', 'street': 'South Campus Road'},
 'barrier': 'gate',
 'bicycle': 'yes',
 'created': {'changeset': '20153277',
             'timestamp': '201