In [1]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re
import codecs
import json

In [2]:
# Path of the XML file that is handed to audit() further down in this notebook.
OSMFILE = "washdc.osm"

In [3]:
# Matches the trailing token of a string: a run of non-whitespace characters
# that starts at a word boundary and reaches the end of the string (a final
# period is allowed). The pattern contains no letters, so re.IGNORECASE does
# not change what it matches.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street-type words treated as already correct. audit_street_type() skips
# names whose trailing token is listed here, and check_street_type() reports
# membership in this list as its first flag.
expected = ["Alley","Avenue", "Bend", "Bottom", "Boulevard", "Bridge", 
            "Bypass", "Cape", "Causeway", "Center", "Circle", "Common", 
            "Corner",  "Court", "Crossing", "Crossroad", "Curve", "Drive", 
            "Expressway", "Flat", "Fort", "Freeway", "Garden", "Gateway",  
            "Harbor", "Heights", "Highway", "Hill", "Hills", "Junction", 
            "Landing", "Lane", "Loop", "Mall", "Manor", "Motorway", "Overlook", 
            "Overpass", "Park", "Parkway", "Passage", "Pike", "Place", "Plaza", 
            "Point", "Port", "Ridge", "Road", "Route", "Row", "Run", "Spring", 
            "Springs", "Square", "Station", "Street", "Terrace", "Throughway", 
            "Trail", "Tunnel", "Turnpike", "Union", "View", "Way"]

# Fully spelled-out compass directions. check_if_direction() reports
# membership in this list as its first flag.
direction = ["North", "South", "East", "West","Northeast", "Northwest", "Southeast", "Southwest"]

# Abbreviated compass directions (lower and upper case, with and without
# periods) mapped to the spelled-out word. Lookups are by exact key.
direction_map = { "n.": "North", "n": "North", "N.": "North", "N": "North",
                  "s.": "South", "s": "South", "S.": "South", "S": "South",
                  "w.": "West", "w": "West", "W.": "West", "W": "West",
                  "e.": "East", "e": "East", "E.": "East", "E": "East",
                  "n.w.": "Northwest", "nw": "Northwest", "N.W.": "Northwest", "NW": "Northwest",
                  "n.e.": "Northeast", "ne": "Northeast", "N.E.": "Northeast", "NE": "Northeast",
                  "s.w.": "Southwest", "sw": "Southwest", "S.W.": "Southwest", "SW": "Southwest",
                  "s.e.": "Southeast", "se": "Southeast", "S.E.": "Southeast", "SE": "Southeast" }

# Abbreviated or lower-case street types mapped to the capitalised word used
# in `expected`. Lookups are by exact key, so only the spellings listed here
# are recognised.
mapping = { "ave": "Avenue",
            "avenue": "Avenue",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Blvd": "Boulevard",
            "Ct": "Court",
            "Ct.": "Court",
            "drive": "Drive",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Hwy": "Highway",
            "Hwy.": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "pike": "Pike",
            "Pkwy": "Parkway",
            "Pky": "Parkway",
            "Plz": "Plaza",
            "rd": "Road",
            "road": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "st": "Street",
            "St": "Street",
            "St.": "Street" }

In [4]:
def audit_street_type(street_types, street_name):
    """File ``street_name`` under its trailing token when that token is unexpected.

    Args:
        street_types: Mapping from trailing token to the set of street names
            that end with it; it is updated in place.
        street_name: The street name to inspect.

    Nothing is recorded when the name has no trailing token or when the
    token appears in ``expected``.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

In [5]:
def is_street_name(elem):
    """Tell whether ``elem`` carries the key ``addr:street``.

    Args:
        elem: An element whose ``attrib`` mapping has a ``k`` entry.

    Returns:
        True when the ``k`` attribute equals ``"addr:street"``, else False.

    Raises:
        KeyError: If the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [6]:
def audit(osmfile):
    """Collect street names whose trailing token is not an expected street type.

    Args:
        osmfile: Path of the XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected trailing token to the
        set of ``addr:street`` values (taken from ``node`` and ``way``
        elements) that end with it.
    """
    street_types = defaultdict(set)
    # The context manager closes the file even if parsing raises part-way.
    with open(osmfile, "r", encoding="utf8") as osm_file:
        # Listen for "end" events: on "start" only the element's own
        # attributes are guaranteed, its children may not have been parsed
        # yet, so child <tag> elements could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # The element is fully handled; drop its children and
                # attributes so the parsed tree does not grow with the file.
                elem.clear()
    return street_types

In [7]:
def check_street_type(street_name):
    """Classify the trailing token of ``street_name`` as a street type.

    Args:
        street_name: The street name to inspect; surrounding whitespace is
            ignored when locating the trailing token.

    Returns:
        A 3-tuple ``(is_expected, is_abbreviation, offset)``:
          * ``is_expected`` - the token appears in ``expected``;
          * ``is_abbreviation`` - the token is a key of ``mapping``;
          * ``offset`` - index within ``street_name`` *as passed in* at which
            the token starts.
        ``(False, False, 0)`` is returned when no trailing token is found.
    """
    m = street_type_re.search(street_name.strip())
    if m is None:
        return False, False, 0
    street_type = m.group()
    # m.start() is measured on the stripped text. Add back the leading
    # whitespace that was removed so callers can slice their own string.
    leading = len(street_name) - len(street_name.lstrip())
    return street_type in expected, street_type in mapping, m.start() + leading

def check_if_direction(street_name):
    """Classify the trailing token of ``street_name`` as a compass direction.

    Args:
        street_name: The street name to inspect; surrounding whitespace is
            ignored when locating the trailing token.

    Returns:
        A 3-tuple ``(is_full_word, is_abbreviation, offset)``:
          * ``is_full_word`` - the token appears in ``direction``;
          * ``is_abbreviation`` - the token is a key of ``direction_map``;
          * ``offset`` - index within ``street_name`` *as passed in* at which
            the token starts.
        ``(False, False, 0)`` is returned when no trailing token is found.
    """
    m = street_type_re.search(street_name.strip())
    if m is None:
        return False, False, 0
    street_type = m.group()
    # m.start() is measured on the stripped text. Add back the leading
    # whitespace that was removed so callers can slice their own string.
    leading = len(street_name) - len(street_name.lstrip())
    return street_type in direction, street_type in direction_map, m.start() + leading

def update_name(name, mapping, direction_map):
    """Return a cleaned street name with type and direction spelled out.

    The trailing compass direction (if any) is peeled off first, then a
    trailing abbreviated street type is expanded, and the pieces are joined
    as ``"<name> <street type> <direction>"`` and title-cased.

    Whether a token counts as a direction or street type is decided by
    check_if_direction() / check_street_type(); the ``mapping`` and
    ``direction_map`` arguments supply the replacement words.

    Args:
        name: Raw street name, e.g. ``"14th St NW"``.
        mapping: Street-type abbreviation -> full word.
        direction_map: Direction abbreviation -> full word.

    Returns:
        The normalised name, e.g. ``"14th Street Northwest"``.
    """
    # Work on the stripped text so the offsets returned by the check helpers
    # line up with the string being sliced, and so the trailing token compares
    # equal to the table keys even when the input had surrounding whitespace.
    name = name.strip()

    direct = ""
    is_full_word, is_abbreviation, index = check_if_direction(name)
    if is_full_word:
        direct = name[index:]
        name = name[:index]
    elif is_abbreviation:
        token = name[index:]
        if token in direction_map:
            direct = direction_map[token]
            name = name[:index]
    name = name.strip()

    street_suffix = ""
    is_expected, is_abbreviation, index = check_street_type(name)
    if not is_expected and is_abbreviation:
        token = name[index:].strip()
        if token in mapping:
            street_suffix = mapping[token]
            name = name[:index].strip()

    # Join only the non-empty pieces; formatting an empty street type into a
    # fixed "%s %s %s" template left a double space before the direction.
    parts = [part for part in (name, street_suffix, direct.strip()) if part]
    titled = " ".join(parts).title()
    # str.title() capitalises a letter that follows a digit ("14th" -> "14Th");
    # put ordinal suffixes back in lower case.
    # NOTE: title() also turns acronyms such as "US" into "Us"; that is kept.
    return re.sub(r"(?<=\d)(St|Nd|Rd|Th)\b", lambda m: m.group(1).lower(), titled)

In [8]:
# Matches a string made up only of lower-case ASCII letters and underscores
# (the empty string also matches). Not referenced in the visible part of this
# notebook.
lower = re.compile(r'^([a-z]|_)*$')
# Same character set, but with exactly one colon separating two such runs.
# Not referenced in the visible part of this notebook.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single one of the listed punctuation or whitespace characters;
# shape_element() drops tags whose key contains one.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element() groups under the 'created' sub-dict.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    """Convert a ``node`` or ``way`` element into a plain dict.

    Layout of the result:
      * attributes listed in ``CREATED`` go under ``'created'``;
      * ``lat``/``lon`` become ``'pos': [lat, lon]`` (floats);
      * every other attribute is copied to the top level;
      * ``<nd ref=...>`` children are collected in ``'node_refs'``;
      * ``<tag>`` children whose key starts with ``addr:`` go under
        ``'address'`` (with ``street`` passed through update_name());
      * other tags are stored under the part of the key after the colon, or
        under the whole key when it has no colon;
      * tags whose key contains a problem character or two colons are dropped;
      * ``'type'`` is the element's tag name; a ``type`` tag, if present, is
        kept as ``'place_type'``.

    Args:
        element: A parsed XML element.

    Returns:
        The dict described above, or None for any element that is not a
        ``node`` or a ``way``.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    node = {}
    created = {}
    pos = {}
    for attr_name, attr_value in element.attrib.items():
        if attr_name in CREATED:
            created[attr_name] = attr_value
        elif attr_name in ('lat', 'lon'):
            pos[attr_name] = float(attr_value)
        else:
            node[attr_name] = attr_value

    node['created'] = created
    # Require both coordinates: indexing pos['lat'] / pos['lon'] when only one
    # of them is present would raise KeyError.
    if 'lat' in pos and 'lon' in pos:
        node['pos'] = [pos['lat'], pos['lon']]

    nds = [nd.get('ref') for nd in element.findall('nd')]
    if nds:
        node['node_refs'] = nds

    address = {}
    for t in element.findall('tag'):
        key = t.get('k')
        value = t.get('v')
        if problemchars.search(key):
            continue
        if re.search(r'\w+:\w+:\w+', key):
            continue
        # Only keys that really begin with the "addr:" prefix are address
        # parts. A substring test would also catch unrelated keys that merely
        # contain "addr" and then chop off their first five characters.
        if key.startswith('addr:'):
            address[key[5:]] = value
        elif re.search(r'\w+:\w+', key):
            # NOTE(review): keeping only the part after the colon lets a tag
            # such as "gnis:id" overwrite the element's own 'id' - confirm
            # this is intended.
            node[re.search(r':\w+', key).group(0)[1:]] = value
        else:
            node[key] = value

    if address:
        if 'street' in address:
            address['street'] = update_name(address['street'], mapping, direction_map)
        node['address'] = address

    # Preserve a 'type' tag before 'type' is reused for the element kind.
    if 'type' in node:
        node['place_type'] = node['type']
    node['type'] = element.tag
    return node

In [9]:
def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and write the results as JSON lines.

    Each dict returned by shape_element() is appended to the output file
    ``"<file_in>.json"`` as one JSON document followed by a newline.

    Args:
        file_in: Path of the XML file to read.
        pretty: When true, each document is written with ``indent=2``.

    Returns:
        The list of all shaped dicts, in document order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    # json.dumps() escapes non-ASCII characters by default, so the explicit
    # encoding only pins down what was previously the platform default.
    with open(file_out, "w", encoding="utf-8") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The element has been converted; release its children and
                # attributes so the parsed tree does not grow with the file.
                element.clear()
    return data

In [10]:
# Run the street-type audit on OSMFILE; %time reports how long the call took.
%time audit(OSMFILE)

Wall time: 2min 15s


defaultdict(set,
            {'1': {'1', 'Priest Bridge Court Suite 1', 'US Route 1'},
             '101': {'Eastern Ave #101', 'Georgia Ave #101'},
             '1552': {'1552'},
             '2': {'Piney Branch Road Northwest, Suite #2',
              'South Glebe Road,  Bldg. 2'},
             '202': {'N. Washington Street, Suite 202'},
             '20903': {'10121 New Hampshire Ave, Silver Spring, MD 20903'},
             '3': {'MD 3', 'Spring Mall Dr #3', 'Wiehle Avenue - Floor 3'},
             '3456': {'3456'},
             '3500': {'University Research Court, Suite 3500'},
             '7200': {'7200'},
             '850': {'Connecticut Avenue Northwest Suite 850'},
             'A,': {'11215 NEW HAMPSHIRE AVESTE A,'},
             'Ave': {'Branch Ave',
              'Campbell Ave',
              'District Ave',
              'East Maple Ave',
              'Eastern Ave',
              'Fairhaven Ave',
              'Georgia Ave',
              'Kenilworth Ave',
              