Clean the data, create a dictionary object for each element, and save the result to a JSON file

In [2]:
import xml.etree.cElementTree as ET
import re
from collections import defaultdict
import pprint
import codecs
import json

In [1]:
# OpenStreetMap XML extract that is passed to process_map() at the end of this section.
osm_file = "berkeley.osm"

In [292]:
"""
Within a <node> or <way> element, a sub-element <tag>
has attributes 'k' and 'v' (for key and value).
If the 'k' has special characters (other than ':' or '_'),
do not save the tag to the final output.
"""
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Street names sometimes have the house number typed in front of them.
# starts_numeric_re detects a leading run of digits.
# numbered_street_re recognises ordinal street names (1st Street, 22nd Street,
# 3rd Street, 44th Street) whose leading digits are part of the name and must
# not be mistaken for a house number.
starts_numeric_re = re.compile(r'^[\d]+')
numbered_street_re = re.compile(r'^[\d]+(st|nd|rd|th)')

# Post codes, optionally preceded by a state abbreviation.
# post_re matches a code without an extension.
# post_ext_re matches "<post>-<ext>". The hyphen is mandatory: with an optional
# hyphen the pattern also matched plain codes by stealing the last digit of the
# post code for the extension (e.g. "94704" -> post "9470", ext "4").
post_re = re.compile(r'(?P<state>[a-zA-Z]*)[\s]*(?P<post>[\d]+)')
post_ext_re = re.compile(r'(?P<state>[a-zA-Z]*)[\s]*(?P<post>[\d]+)-(?P<ext>[\d]+)')

In [301]:
"""
If fields encountered happen to be the same as reserved field names
that are already being used, then give them a different field name
"""
reserved_field = dict(
    (name, name + '_') for name in ('type', 'pos', 'created', 'address')
)

# Element attributes that hold the position (longitude and latitude).
POS = ['lon', 'lat']

# Element attributes that describe how the data point was created.
# In the cleaned output they are grouped together under the 'created' field.
CREATED = ['version', 'changeset', 'timestamp', 'user', 'uid']

In [291]:
"""
The key is the street type abbreviation,
and the value is the full street type to replace it with
"""
# Every abbreviation is registered twice: bare ("St") and with a trailing dot ("St.").
# NOTE(review): USPS lists "Pl" as the abbreviation of "Place" rather than
# "Plaza" - confirm against the data before relying on this entry.
street_type_d = dict(
    (abbrev + dot, full_name)
    for abbrev, full_name in (
        ('St', 'Street'),
        ('Ct', 'Court'),
        ('Pl', 'Plaza'),
        ('Ave', 'Avenue'),
        ('Sq', 'Square'),
    )
    for dot in ('', '.')
)

In [293]:
# Maps an 'amenity' tag value to the value that replaces it in the cleaned output.
amenity_d = dict(
    car_share='car_sharing',
    parking_space='parking',
    parking_entrance='parking',
)

In [313]:
# Maps an 'operator' tag value to its replacement. Entries are grouped by the
# replacement value, followed by the spellings that map to it.
operator_d = dict(
    (alias, canonical)
    for canonical, aliases in (
        ('Bay Area Rapid Transit (BART)', ('BART', 'Bay Area Rapid Transit')),
        ('7-Eleven', ('7-11',)),
        ('AC Transit', ('ac transit',)),
        ('Berkeley Unified School District', ('BUSD',)),
        ('City of Oakland', ('city of oakland',)),
        ('East Bay Municipal Utility District', ('EBMUD',)),
        ('University of California, Berkeley (Cal)', (
            'UC Berkeley',
            'UCBerkeley',
            'University California Berkeley',
            'University of California, Berkeley',
            'UC Berkeley - Parking and Transportation',
        )),
        ('Zipcar', ('Zip Car',)),
        # A combined value is replaced by a list of operators.
        (['UPS', 'Fedex'], ('ups;fedex',)),
        ('Walgreens', ('walgreens',)),
    )
    for alias in aliases
)

In [290]:
"""
Input: file name of input open street map xml file.
Output: json file written to disk, 
representing the  data from the xml file, 
after it has been re-shaped and cleaned
"""

def process_map(file_in, pretty=True):
    """Convert an OpenStreetMap XML file into a file of JSON documents.

    Input:
    file_in: name of the input OpenStreetMap XML file.
    pretty: when True (the default) each document is written with an
    indent of 2; when False each document is written on a single line.

    Output: nothing is returned. The documents produced by shape_element
    are written, one after another, to '<file_in>.json'.
    """
    file_out = "{0}.json".format(file_in)
    # indent=None makes json.dumps emit the compact single-line form.
    indent = 2 if pretty else None
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            node = shape_element(element)
            if node:
                fo.write(json.dumps(node, indent=indent) + "\n")
                # The document has been written, so the parsed <node>/<way>
                # subtree is no longer needed. Clearing it keeps iterparse
                # from holding every processed element's attributes and
                # children in memory for the whole run.
                element.clear()

In [295]:
"""
Input: xml Element
Output: a dictionary containing the fields 
gathered from the input element
"""
def shape_element(element):
    """Build the output dictionary for one <node> or <way> element.

    Input: xml Element.
    Output: a dictionary with the fields gathered from the element's
    attributes and sub-elements, or None when the element is neither a
    'node' nor a 'way'.
    """
    if element.tag not in ("node", "way"):
        return None

    shaped = shape_attr(element, init_node(element))

    # Child elements are either <tag> (key/value data) or <nd> (node references).
    address_d, other_d, ndref_l = collect_sub(element)
    shaped['address'] = clean_addr(address_d)['address']
    shaped.update(clean_other(other_d))
    shaped['node_refs'] = ndref_l

    return remove_unused(shaped)

In [288]:
"""
Input: xml etree ElementTree Element

Output: node is a dictionary with the 'type' field set,
and also default fields that are also dictionaries or lists
"""

def init_node(element):
    """Create the skeleton dictionary for an element.

    Input: xml etree ElementTree Element.
    Output: a dictionary whose 'type' field is the element's tag, with
    empty 'created' and 'address' dictionaries and empty 'pos' and
    'node_refs' lists.
    """
    return dict(
        created={},
        pos=[],
        address={},
        node_refs=[],
        type=element.tag,
    )

In [287]:
"""
Input: xml etree ElementTree Element (element) and dictionary (node).
The node is already initialized to contain some default fields,
such as 'created', 'pos', 'address', 'node_refs'.

Output: the modified node has fields set for 'created', 'pos', and 'type'
"""

def shape_attr(element,node):
    """Copy the element's XML attributes into node.

    Input: xml Element (element) and a dictionary (node) that already
    contains a 'created' dictionary and a 'pos' list.
    Output: the same node. Attributes listed in CREATED are stored under
    node['created'], 'lat' and 'lon' are stored as floats in node['pos']
    (latitude first), and every other attribute becomes a top-level field.
    """
    for attr_name in element.keys():
        attr_val = element.get(attr_name)
        if attr_name in CREATED:
            node['created'][attr_name] = attr_val
        elif attr_name not in POS:
            node[attr_name] = attr_val
        elif attr_name == 'lat':
            # Latitude always goes first, whatever order the attributes arrive in.
            node['pos'].insert(0, float(attr_val))
        elif attr_name == 'lon':
            node['pos'].append(float(attr_val))
    return node

In [303]:
"""
Input: xml etree ElementTree Element such as a 'node' or 'way', which has
attributes and sub-elements such as 'tag' or 'nd'

Output: 
addr_d: dictionary with field 'address', 
which is a dictionary containing fields such as
street, housenumber, state, postcode

other_d: dictionary with fields. 
Fields may also be dictionaries and denoted with the suffix '_collection'.

ndref_l: list of node references
"""
def collect_sub(element):
    """Gather the data held in the sub-elements of a 'node' or 'way'.

    Input: xml Element whose children are <tag> and <nd> elements.

    Output: a tuple (addr_d, other_d, ndref_l).
    addr_d: dictionary with field 'address', a dictionary of the tags
    whose key starts with 'addr:' (the prefix is removed and any further
    ':' becomes '_').
    other_d: dictionary of the remaining tags. A key of the form 'a:b'
    is stored as other_d['a_collection']['b'] so that it cannot override
    a plain tag named 'a'.
    ndref_l: list of the 'ref' values of the <nd> children.

    Tags whose key is missing or contains a character matched by
    problemchars are skipped; children that are neither <tag> nor <nd>
    are ignored.
    """
    addr_d = defaultdict(dict)
    other_d = defaultdict(dict)
    ndref_l = []

    # Iterate over the element directly: Element.getchildren() was removed
    # in Python 3.9.
    for s in element:
        if s.tag == 'nd':
            ndref_l.append(s.get('ref'))
            continue
        if s.tag != 'tag':
            # not a 'tag' or 'nd': not expected, so it is not processed
            continue

        key = s.get('k')
        # A <tag> without a 'k' attribute has nothing to file the value under.
        if key is None or problemchars.search(key):
            continue
        # Rename keys that would collide with the fields this notebook creates.
        key = reserved_field.get(key, key)
        val = s.get('v')

        if key.startswith('addr:'):
            key_sub = key.split(':', 1)[1].replace(':', '_')
            addr_d['address'][key_sub] = val
        elif ':' in key:
            key_sup, key_sub = key.split(':', 1)
            # The super key may already be set as a plain string; the
            # '_collection' suffix avoids overriding that string.
            other_d[key_sup + "_collection"][key_sub.replace(':', '_')] = val
        else:
            other_d[key] = val

    return (addr_d,other_d,ndref_l)

In [285]:
"""
Given a dictionary containing field 'address',
whose value is a dictionary, clean the street and postcode fields.
Set the  housenumber, state, and postcode extension fields when the 
information can be gathered from the street and postcode fields
Return the modified dictionary
"""
def clean_addr(node_d):
    """Clean the street and postcode entries of node_d['address'].

    Input: a dictionary whose 'address' value is a dictionary of address
    parts (street, housenumber, state, postcode, ...).
    Output: the same dictionary, modified in place and returned.

    street: a trailing abbreviation listed in street_type_d is expanded.
    A house number typed in front of the street name (but not an ordinal
    name such as '22nd Street') is removed from the street and stored in
    'housenumber' unless that field is already set.
    postcode: reduced to the post code digits. A state abbreviation in
    front of it is stored upper-cased in 'state' unless that field is
    already set, and a hyphenated extension is stored in 'postcode_ext'.
    """
    addr_d = node_d['address']
    if not addr_d:
        return node_d

    if 'street' in addr_d:
        street = addr_d['street']

        # find shortened street types and replace them
        for abbrev, word in street_type_d.items():
            if street.endswith(abbrev):
                street = street[:-len(abbrev)] + word

        # NOTE(review): abbrev_word_d is not defined anywhere in the visible
        # notebook, so on a fresh kernel the original loop raised NameError
        # for every element that has a street. Use the mapping when it
        # exists and skip the step otherwise - restore the definition if
        # these replacements are still wanted.
        try:
            extra_abbrevs = abbrev_word_d
        except NameError:
            extra_abbrevs = {}
        for abbrev, word in extra_abbrevs.items():
            street = street.replace(abbrev, word)

        # if the street name is preceded by a house number (and is not a
        # 1st/2nd/3rd/4th street), move the number to the housenumber field
        m = starts_numeric_re.search(street)
        if m and not numbered_street_re.search(street):
            street = street[m.end():].strip()
            # 'housenumber' is the field produced by the addr:housenumber
            # tag; the original wrote to a separate 'housenum' key instead.
            if 'housenumber' not in addr_d:
                addr_d['housenumber'] = m.group()

        addr_d['street'] = street

    # clean postcode
    if 'postcode' in addr_d:
        raw_post = addr_d['postcode']
        # The extension pattern is only tried when a hyphen is present. If
        # it does not match, fall back to the plain pattern so the post
        # code is still cleaned.
        m = post_ext_re.search(raw_post) if '-' in raw_post else None
        if m is None:
            m = post_re.search(raw_post)
        if m:
            found = m.groupdict()
            if found.get('post'):
                addr_d['postcode'] = found['post']
            if found.get('state') and 'state' not in addr_d:
                addr_d['state'] = found['state'].upper()
            if found.get('ext'):
                addr_d['postcode_ext'] = found['ext']

    node_d['address'] = addr_d
    return node_d

In [315]:
def clean_other(other_d):
    """Clean the non-address fields gathered from an element's tags.

    Input: dictionary of fields.
    Output: the same dictionary, modified in place and returned.

    amenity / operator: replaced by the value from amenity_d / operator_d
    when the current value is a key of that mapping.
    lanes: converted to an integer; when that fails a message is printed
    and the original value is kept.
    """
    if 'amenity' in other_d:
        k = other_d['amenity']
        if k in amenity_d:
            other_d['amenity'] = amenity_d[k]
    if 'operator' in other_d:
        k = other_d['operator']
        if k in operator_d:
            other_d['operator'] = operator_d[k]
    if 'lanes' in other_d:
        try:
            other_d['lanes'] = int(other_d['lanes'])
        except ValueError:
            # Written as a call with a single argument so that it prints the
            # same text under Python 2 and Python 3 (the print statement is
            # a syntax error on Python 3).
            print('tried to convert {} to integer'.format(other_d['lanes']))
    return other_d

In [194]:
"""
Some default fields may not get used, so delete them if empty
"""
def remove_unused(d):
    """Delete the default fields of d that are still empty.

    Input: dictionary containing the fields 'created', 'pos', 'address'
    and 'node_refs'.
    Output: the same dictionary, without those of the four fields whose
    value is empty.
    """
    for field in ('created', 'pos', 'address', 'node_refs'):
        if not d[field]:
            del d[field]
    return d

In [284]:
"""
Use during debugging to find a specific element 
and exit iteration when found
"""
def seek_elem(element):
    """Debugging aid: return element when it is the <node> with id
    '53023690', otherwise return None.
    """
    is_target = element.tag == 'node' and element.get('id') == '53023690'
    return element if is_target else None

In [316]:
# Run the conversion of osm_file when this code is executed as the main module.
if __name__ == "__main__":
    process_map(file_in=osm_file)

Create a smaller sample file of the OSM data

In [317]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input extract (replace with your own osm file) and the sample file to create.
OSM_FILE = 'berkeley.osm'
SAMPLE_FILE = 'berkeley_sample.osm'

# Sampling parameter: every k-th top level element is kept.
k = 10

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element of osm_file whose tag is one of tags.

    An element is yielded when its closing tag has been parsed. Once the
    consumer asks for the next element, the root element is cleared.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event is the 'start' of the root element.
    root = next(parse_events)[1]
    for kind, current in parse_events:
        if kind != 'end' or current.tag not in tags:
            continue
        yield current
        root.clear()


# Write every k-th top level element of OSM_FILE to SAMPLE_FILE.
# The file is opened in binary mode, and on Python 3 ET.tostring(...,
# encoding='utf-8') returns bytes, so the literal fragments are written as
# bytes too (a str literal raises TypeError there). On Python 2 a b'...'
# literal is an ordinary str, so the output is unchanged.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')