In [1]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re
import codecs
import json

In [2]:
def count_tags(filename):
    """Count how many elements of each tag name appear in an XML document.

    Args:
        filename: Path or file object handed to ``ET.iterparse``.

    Returns:
        dict mapping each tag name to the number of elements with that tag.
    """
    tags = {}
    for _event, node in ET.iterparse(filename):
        name = node.tag
        tags[name] = tags.get(name, 0) + 1
        # Drop the element's contents once it has been counted.
        node.clear()
    return tags

In [3]:
# OSM XML extract that the audit cells below read; the path is relative to the working directory.
OSMFILE = 'washdc.osm'

In [4]:
# Show how many elements of each tag name the extract contains.
print(count_tags(OSMFILE))

{'node': 3430696, 'member': 66586, 'meta': 1, 'way': 390404, 'osm': 1, 'tag': 2357777, 'bounds': 1, 'relation': 3546, 'nd': 4131280, 'note': 1}


In [5]:
# Matches strings made up only of lowercase ASCII letters and underscores
# (the empty string matches too).
lower = re.compile(r'^([a-z]|_)*$')
# Matches two such runs joined by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single character from the listed punctuation / whitespace set.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [6]:
def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    The key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first pattern that matches decides the bucket, and a
    key matching none of them is counted as ``'other'``. Elements whose tag
    is not ``"tag"`` leave the counters untouched.

    Args:
        element: Parsed XML element.
        keys: dict with integer counters under 'lower', 'lower_colon',
            'problemchars' and 'other'; updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    if lower.match(key):
        bucket = 'lower'
    elif lower_colon.search(key):
        bucket = 'lower_colon'
    elif problemchars.search(key):
        bucket = 'problemchars'
    else:
        bucket = 'other'

    keys[bucket] += 1
    return keys

In [7]:
def process_map(filename):
    """Tally the ``tag`` keys of an OSM XML file by naming category.

    Args:
        filename: Path or file object handed to ``ET.iterparse``.

    Returns:
        dict with the counts accumulated by ``key_type`` under the keys
        'lower', 'lower_colon', 'problemchars' and 'other'.
    """
    keys = {'lower': 0, 'lower_colon': 0, 'problemchars': 0, 'other': 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        # key_type only inspects the element itself, so its contents can be
        # released right away (as count_tags does) instead of keeping the
        # whole document tree in memory.
        element.clear()

    return keys

In [8]:
# Show how many tag keys fall into each naming category.
print(process_map(OSMFILE))

{'problemchars': 7, 'lower': 977491, 'other': 40177, 'lower_colon': 1340102}


In [9]:
def get_user(element):
    """Return the value of the element's ``uid`` attribute.

    Raises:
        KeyError: if the element has no ``uid`` attribute.
    """
    uid = element.attrib['uid']
    return uid

In [10]:
def fetch_users(filename):
    """Collect the distinct ``uid`` attribute values found in an XML file.

    Args:
        filename: Path or file object handed to ``ET.iterparse``.

    Returns:
        set of uid strings, one per distinct value seen on any element.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib:
            users.add(get_user(element))
        # The uid has been read by now; release the element's contents so the
        # parsed tree does not grow with the size of the file.
        element.clear()

    return users

In [11]:
# Print the full set of distinct uid values found in the extract.
print(fetch_users(OSMFILE))

{'6390', '2820500', '1501508', '55957', '3495552', '5565', '1198074', '3747851', '188467', '1748555', '772720', '2011413', '2529450', '2677802', '18776', '579545', '1538755', '129016', '154991', '550560', '2631622', '194076', '48176', '2842554', '2901357', '1780371', '2712018', '624323', '409731', '498757', '12267', '160181', '1609393', '3344442', '532807', '306048', '359504', '5265', '2867785', '454984', '2130371', '605519', '206785', '1246157', '1764244', '745841', '143643', '663559', '1901335', '25720', '152813', '638533', '179540', '1988955', '74705', '81983', '2981996', '1654741', '3031871', '371121', '289775', '23500', '85673', '115918', '160138', '1459958', '2538098', '2786597', '1433306', '132444', '1596312', '3011908', '832898', '307520', '2677805', '1828087', '1183440', '82807', '2551967', '1649645', '331075', '772728', '149412', '1773289', '3300831', '2623692', '1', '2173054', '533465', '21323', '2015929', '227096', '105454', '3674343', '1149057', '175643', '2442399', '77566

In [12]:
# Matches the last run of non-whitespace characters in a string (starting at a
# word boundary, optionally ending in a period). The pattern contains no
# letters, so re.IGNORECASE does not change what it matches.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street-name endings that audit_street_type accepts without reporting.
expected = ["Alley","Avenue", "Bend", "Bottom", "Boulevard", "Bridge", 
            "Bypass", "Cape", "Causeway", "Center", "Circle", "Common", 
            "Corner",  "Court", "Crossing", "Crossroad", "Curve", "Drive", 
            "Expressway", "Flat", "Fort", "Freeway", "Garden", "Gateway",  
            "Harbor", "Heights", "Highway", "Hill", "Hills", "Junction", 
            "Landing", "Lane", "Loop", "Mall", "Manor", "Motorway", "Overlook", 
            "Overpass", "Park", "Parkway", "Passage", "Pike", "Place", "Plaza", 
            "Point", "Port", "Ridge", "Road", "Route", "Row", "Run", "Spring", 
            "Springs", "Square", "Station", "Street", "Terrace", "Throughway", 
            "Trail", "Tunnel", "Turnpike", "Union", "View", "Way"]

In [14]:
def audit_street_type(street_types, street_name):
    """Record a street name whose final token is not an expected street type.

    Args:
        street_types: Mapping from ending token to a set of street names;
            updated in place (indexing a missing key must yield a set).
        street_name: Street name to inspect.

    Returns:
        None.
    """
    match = street_type_re.search(street_name)
    if not match:
        return

    ending = match.group()
    if ending in expected:
        return

    street_types[ending].add(street_name)

def is_street_name(elem):
    """Return True when the element's ``k`` attribute equals "addr:street".

    Raises:
        KeyError: if the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Group unexpected street-name endings found in an OSM XML file.

    Every ``tag`` child of a ``node`` or ``way`` element whose key is
    "addr:street" has its value passed to ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file; it is opened as UTF-8 text.

    Returns:
        defaultdict(set) mapping each reported ending to the set of street
        names that end with it.
    """
    street_types = defaultdict(set)
    # The with-block closes the file even if parsing raises part-way through.
    with open(osmfile, encoding="utf8", mode="r") as osm_file:
        # Use "end" events: on "start" the parser has only seen the opening
        # tag, so the element's <tag> children may not be attached yet and
        # street names could be skipped.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # All children of this node/way have been inspected; release them.
                elem.clear()
    return street_types

In [15]:
# Run the street-name audit; the cell displays the returned mapping of endings to street names.
audit(OSMFILE)

defaultdict(set,
            {'1': {'1', 'Priest Bridge Court Suite 1', 'US Route 1'},
             '101': {'Eastern Ave #101', 'Georgia Ave #101'},
             '1552': {'1552'},
             '2': {'Piney Branch Road Northwest, Suite #2',
              'South Glebe Road,  Bldg. 2'},
             '202': {'N. Washington Street, Suite 202'},
             '20903': {'10121 New Hampshire Ave, Silver Spring, MD 20903'},
             '3': {'MD 3', 'Spring Mall Dr #3', 'Wiehle Avenue - Floor 3'},
             '3456': {'3456'},
             '3500': {'University Research Court, Suite 3500'},
             '7200': {'7200'},
             '850': {'Connecticut Avenue Northwest Suite 850'},
             'A,': {'11215 NEW HAMPSHIRE AVESTE A,'},
             'Ave': {'Branch Ave',
              'Campbell Ave',
              'District Ave',
              'East Maple Ave',
              'Eastern Ave',
              'Fairhaven Ave',
              'Georgia Ave',
              'Kenilworth Ave',
              