In [1]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re
import codecs
import json

In [2]:
# Path of the .osm file that is handed to audit() further down.
OSMFILE = 'map.osm'

In [3]:
# Matches the last run of non-whitespace characters in a string (starting at a
# word boundary), i.e. the trailing token of a street name such as "St." or
# "NW". The pattern contains no letters, so re.IGNORECASE has no effect on it.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Trailing tokens that audit_street_type() accepts as well-formed street types.
expected = ["Alley","Avenue", "Bend", "Bottom", "Boulevard", "Bridge", 
            "Bypass", "Cape", "Causeway", "Center", "Circle", "Common", 
            "Corner",  "Court", "Crossing", "Crossroad", "Curve", "Drive", 
            "Expressway", "Flat", "Fort", "Freeway", "Garden", "Gateway",  
            "Harbor", "Heights", "Highway", "Hill", "Hills", "Junction", 
            "Landing", "Lane", "Loop", "Mall", "Manor", "Motorway", "Overlook", 
            "Overpass", "Park", "Parkway", "Passage", "Pike", "Place", "Plaza", 
            "Point", "Port", "Ridge", "Road", "Route", "Row", "Run", "Spring", 
            "Springs", "Square", "Station", "Street", "Terrace", "Throughway", 
            "Trail", "Tunnel", "Turnpike", "Union", "View", "Way"]

# Spelled-out compass directions. audit_street_type() also accepts these as a
# trailing token, and update_name() consults this list to skip directional names.
direction = ["North", "South", "East", "West", "Northeast", "Northwest", 
             "Southeast", "Southwest"]

# Abbreviated or lower-case street types (keys) and the full spelling each one
# stands for (values). Lookups against this dict are case-sensitive, which is
# why several spellings of the same abbreviation are listed.
mapping = { "ave": "Avenue",
            "avenue": "Avenue",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Blvd": "Boulevard",
            "Ct": "Court",
            "Ct.": "Court",
            "drive": "Drive",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Hwy": "Highway",
            "Hwy.": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "pike": "Pike",
            "Pkwy": "Parkway",
            "Pky": "Parkway",
            "Plz": "Plaza",
            "rd": "Road",
            "road": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "st": "Street",
            "St": "Street",
            "St.": "Street" }

In [4]:
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its trailing token if that token is unexpected.

    The trailing token is whatever ``street_type_re`` matches at the end of
    the name. Tokens listed in ``expected`` or ``direction`` are accepted and
    nothing is recorded for them.

    Args:
        street_types: Mapping from trailing token to a set of street names;
            missing keys must be created on access (e.g. ``defaultdict(set)``).
        street_name: The street name to inspect.

    Returns:
        None. ``street_types`` is updated in place.
    """
    match = street_type_re.search(street_name)
    if match is None:
        # No trailing token at all (e.g. an empty string): nothing to audit.
        return

    suffix = match.group()
    if suffix in expected or suffix in direction:
        return

    street_types[suffix].add(street_name)

def is_street_name(elem):
    """Return True when the element's ``k`` attribute is ``"addr:street"``.

    Args:
        elem: An element exposing an ``attrib`` mapping with a ``'k'`` entry.

    Raises:
        KeyError: If the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Collect street names whose trailing token is not an accepted street type.

    Every ``addr:street`` tag found under a ``node`` or ``way`` element of the
    file is passed to ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file to read (opened as UTF-8 text).

    Returns:
        A ``defaultdict(set)`` mapping each unexpected trailing token to the
        set of street names that end with it.
    """
    street_types = defaultdict(set)
    # The context manager closes the file even when parsing raises.
    with open(osmfile, encoding="utf8", mode="r") as osm_file:
        # Listen for "end" events: on "start" only the element's attributes are
        # guaranteed to be available, its children may not have been parsed
        # yet, so <tag> children could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [5]:
def update_name(name, mapping):
    """Return ``name`` with an abbreviated trailing street type spelled out.

    Only the final token of the name is considered: the token matched by
    ``street_type_re``, which is the same one ``audit_street_type`` inspects.
    When that token is a key of ``mapping`` it is replaced by the mapped
    value; otherwise the name is returned as it was passed in.

    Names that contain one of the words in ``direction`` (as a whole word)
    are returned unchanged.

    Args:
        name: Street name, e.g. ``"Connecticut Ave"``.
        mapping: Dict from abbreviated street type to its full spelling.

    Returns:
        The street name as a string; never ``None``.
    """
    # Compare whole words: a substring test would also treat
    # "Northampton St" as a "North" street and skip it.
    if any(word in direction for word in name.split()):
        return name

    # Replace the trailing token only. Replacing a key wherever it occurs in
    # the string corrupts good names: "Ave" is a substring of "Avenue", so
    # "Connecticut Avenue" would become "Connecticut Avenueenue".
    match = street_type_re.search(name)
    if match and match.group() in mapping:
        return name[:match.start()] + mapping[match.group()]

    # Nothing to fix: hand the name back instead of falling through to None.
    return name

In [6]:
# Tag-key patterns. `lower` and `lower_colon` are not referenced by the code
# visible in this part of the notebook.
# Whole string consists of lowercase ASCII letters and/or underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs separated by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single character from the listed set; shape_element() skips
# tags whose key contains one of them.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element() groups under the 'created' sub-dict.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

In [7]:
def shape_element(element):
    """Convert a ``node`` or ``way`` element into a JSON-serialisable dict.

    The resulting dict contains:

    * ``'type'``: the element's tag name;
    * ``'created'``: a sub-dict with the attributes listed in ``CREATED``;
    * ``'pos'``: ``[lat, lon]`` as floats, when both attributes are present;
    * every other element attribute under its own name;
    * ``'node_refs'``: the ``ref`` of each ``nd`` child, when there are any;
    * ``'address'``: a sub-dict built from ``addr:<field>`` tags, when there
      are any;
    * one entry per remaining ``tag`` child. A key of the form ``a:b`` is
      stored under ``b``; other keys are stored as they are.

    Tags whose key matches ``problemchars`` or has three or more
    colon-separated parts are dropped.

    Args:
        element: An ElementTree element.

    Returns:
        The dict described above, or ``None`` when the element is neither a
        ``node`` nor a ``way``.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    node = {'type': element.tag}
    created = {}
    pos = {}
    for attr, value in element.attrib.items():
        if attr in CREATED:
            created[attr] = value
        elif attr == 'lat' or attr == 'lon':
            pos[attr] = float(value)
        else:
            node[attr] = value

    node['created'] = created
    # Require both coordinates: an element carrying only one of lat/lon would
    # otherwise raise KeyError here.
    if 'lat' in pos and 'lon' in pos:
        node['pos'] = [pos['lat'], pos['lon']]

    node_refs = [nd.get('ref') for nd in element.findall('nd')]
    if node_refs:
        node['node_refs'] = node_refs

    address = {}
    for tag in element.findall('tag'):
        key = tag.get('k')
        value = tag.get('v')
        if problemchars.search(key):
            continue
        if re.search(r'\w+:\w+:\w+', key):
            continue
        if key.startswith('addr:'):
            # Test for the prefix, not for "addr" anywhere in the key: a key
            # such as "old_address" is not an address field, and slicing five
            # characters off it would produce a meaningless field name.
            address[key[len('addr:'):]] = value
        elif re.search(r'\w+:\w+', key):
            # Two-part key such as "name:en": keep only the word after the colon.
            node[re.search(r':\w+', key).group(0)[1:]] = value
        else:
            node[key] = value

    if address:
        node['address'] = address

    return node

In [8]:
def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and write the results as JSON lines.

    Each element yielded by ``ET.iterparse`` is passed to ``shape_element``;
    every truthy result is appended to the returned list and written, followed
    by a newline, to ``"<file_in>.json"``.

    Args:
        file_in: Source passed to ``ET.iterparse``; also used to build the
            output file name.
        pretty: When true, each JSON document is indented by two spaces.

    Returns:
        List of the shaped dicts, in the order they were produced.
    """
    out_path = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(out_path, "w") as out:
        for _event, elem in ET.iterparse(file_in):
            record = shape_element(elem)
            if not record:
                continue
            shaped.append(record)
            out.write(json.dumps(record, indent=indent) + "\n")
    return shaped

In [9]:
def avoid_direction(result):
    """Print the values stored under the abbreviated quadrant keys.

    The keys ``'NW'``, ``'NE'``, ``'SW'`` and ``'SE'`` are checked in that
    order; the value of each key present in ``result`` is printed on its own
    line. Keys that are absent are skipped and are not added to ``result``.

    Args:
        result: Mapping to look the quadrant keys up in.

    Returns:
        None.
    """
    quadrants = ('NW', 'NE', 'SW', 'SE')
    for key in quadrants:
        if key not in result:
            continue
        print(result[key])

In [10]:
# Run the audit: maps each unexpected trailing token to the set of street
# names that end with it.
audited = audit(OSMFILE)

In [11]:
# Show the street names collected under the NW/NE/SW/SE suffixes, if any.
avoid_direction(audited)

{'14th Street NW', 'Connecticut Avenue NW'}
{'14th Street SW', 'Raoul Wallenberg Place SW'}


In [12]:
# Dump the complete audit result (all unexpected trailing tokens).
print(audited)

defaultdict(<class 'set'>, {'NW': {'14th Street NW', 'Connecticut Avenue NW'}, 'SW': {'14th Street SW', 'Raoul Wallenberg Place SW'}})
