In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "san-francisco_california.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k = 30 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [26]:
# work with sample file. Let's audit and look at the street names. 
import pprint

def count_tags(filename):
        tags={}
        for event, elem in ET.iterparse(filename):
            if(elem.tag == 'None'):
                pass
            elif(elem.tag not in tags.keys()):
                tags[elem.tag] = 1
            else:
                tags[elem.tag]+=1
        return tags
count_tags('san-francisco_california.osm')

{'bounds': 1,
 'member': 53772,
 'nd': 7448039,
 'node': 6278429,
 'osm': 1,
 'relation': 6012,
 'tag': 1994316,
 'way': 770438}

In [3]:
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag=="tag":
        if lower.search(element.attrib['k']):
            keys["lower"] +=1
        elif lower_colon.search(element.attrib['k']):
            keys["lower_colon"]+=1
        elif problemchars.search(element.attrib['k']):
            keys["problemchars"]+=1
        else:
            keys["other"]+=1
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys
process_map('san-francisco_california.osm')
# how to deal with problemchars

{'lower': 128299, 'lower_colon': 69108, 'other': 2547, 'problemchars': 13}

In [99]:
# return street names
import re
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types;
#audit('san-francisco_california.osm')

In [102]:
street_mapping = {"151": "",
           "15th": "15th Street",
           "2": "",
           "Avenie":"Avenue",
           "203": "",
           "302": "",
           "3500": "",
           "3658": "",
           "4": "",
           "404": "",
           "502": "",
           "AVE": "Avenue",
           "Airport": "San Francisco International Airport",
           "Alameda": "Alameda Street",
           "Alto": "Alto Route",
           "Ave": "Avenue",
           "Ave. ": "Avenue",
           "Blvd": "Boulevard",
           "Blvd, ": "Boulevard",
           "Blvd.": "Boulevard",
           "California": "California Street",
           "Cres": "Crescent",
           "Ctr": "Center",
               "Dr": "Drive",
           "Hwy": "Highway",
           "Ln.": "Lane",
           "North": "",
           "Rd": "Road",
           "Rd.": "Road",
           "St": "Street",
           "St.": "Street",
           "broadway": "Broadway",
           "square": "Square",
           "st": "Street",
           "street":"Street",
           "way":"Way",
            }

def update_name(name, mapping):
    m = street_type_re.search(name)
    other_street_types = []
    if m:
        street_type= m.group( )
        print m
        if street_type in mapping.keys( ):
            name = re.sub(street_type,mapping[street_type],name)
        else:
            other_street_types.append(street_type)
    return name


def clean_streets(osmfile):
    st_types = audit(osmfile)
    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, street_mapping)
            print name, "=>", better_name
#street_cleaning()

In [2]:
# audit city names
import re
from collections import defaultdict

def audit_city(invalid_city_names, city_name):
    invalid_city_names.add(city_name)
    return invalid_city_names
            
def is_city_name(elem):
    return (elem.attrib['k'] == "addr:city")

def auditCITY(osmfile): # executes city listings
    osm_file = open(osmfile, "r")
    city_types = set([])
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city_name(tag):
                    audit_city(city_types, tag.attrib['v'])
    osm_file.close()
    return city_types
auditCITY('san-francisco_california.osm')                
# Alameda incorrections to Alameda
# Berkeley, CA --> truncate Berkeley, CA to Berkeley
# Emeyville --> Emeryville
# All weird oaklands --> Oakland
# Pleasant Hill, CA to Pleasant Hill
# SAN CARLOS to San Carlos
# alameda to Alameda, berkeley, castro valley, 

{'11720',
 '155',
 '157',
 '952',
 'Alamda',
 'Alameda',
 'Alamo',
 'Albany',
 'Artherton',
 'Atherton',
 'Belmont',
 'Belvedere Tiburon',
 'Berkeley',
 'Berkeley, CA',
 'Brisbane',
 'Burlingame',
 'Canyon',
 'Castro Valley',
 'Colma',
 'Concord',
 'Daly City',
 'Danville',
 'East Palo Alto',
 'El Cerrito',
 'Emerald Hills',
 'Emeryville',
 'Emeyville',
 'Foster City',
 'Fremont',
 'Fremont ',
 'Greenbrae',
 'Half Moon Bay',
 'Hayward',
 'Hillsborough',
 'Kensington',
 'Kentfield',
 'Lafayette',
 'Larkspur',
 'Marin City',
 'Menlo Park',
 'Milbrae',
 'Mill Valley',
 'Millbrae',
 'Montara',
 'Moraga',
 'Moss Beach',
 'Muir Beach',
 'Newark',
 'OAKLAND',
 'Oakland',
 'Oakland ',
 'Oakland CA',
 'Oakland, CA',
 'Oakland, Ca',
 'Okaland',
 'Orinda',
 'PT RICHMOND',
 'Pacifica',
 'Palo Alto',
 'Piedmont',
 'Pleasant Hill',
 'Pleasant Hill, CA',
 'Point Richmond',
 'Redwood City',
 'Richmond',
 'SAN CARLOS',
 'San Bruno',
 'San Carlos',
 'San Francisco',
 'San Leandro',
 'San Lorenzo',
 'San

In [104]:
# Update city names based on audit & note 11720','155','157','952',
def update_name_city(name, citymapping):
    if name in citymapping: 
        name = citymapping[name]       
        return name
    else:
        return name
citymapping = {
    "Alamda": "Alameda",
    "Alamo": "Alameda",
    "Artherton": "Atherton",
    "Berkeley, CA": "Berkeley",
    "berkeley": "Berkeley",
    "Emeyville": "Emeryville",
    "Fremont ": "Fremont",
    'OAKLAND': "Oakland",
    'Oakland ': "Oakland",
    'Oakland CA': "Oakland",
    'Oakland, CA': "Oakland",
    'Oakland, Ca': "Oakland",
    "Okaland": "Oakland",
    "Pleasant Hill, CA": "Pleasant Hill",
    "PT RICHMOND": "Point Richmond",
    "SAN CARLOS": "San Carlos",
    "Sausalito ": "Sausalito",
    "alameda": "Alameda",
    "castro valley": "Castro Valley",
    "daly city":"Daly City",
    "hayward": "Hayward",
    "menlo park": "Menlo Park",
    "oakland": "Oakland",
    "richmond": "Richmond",
    "san Carlos": "San Carlos",
    "san Mateo": "San Mateo",
    "south San Francisco": "South San Francisco",
    "walnut Creek": "Walnut Creek"
    
}
def clean_cities(osmfile):
    for event, element in ET.iterparse(osmfile, events=("start","end")):
        if element.tag == "node" or element.tag == "way":
            for tag in element.iter("tag"):

                if tag.attrib['k'] == "addr:city":
                    tag.attrib['v'] = update_name_city(tag.attrib['v'], citymapping);
                    #print tag.attrib['v']

In [105]:
import re
from collections import defaultdict
amenitylist = []

def audit_amenity(amenity_names, amenity):
    amenity_names.add(amenity)
    return amenity_names;
            
def is_amenity(elem):
    return (elem.attrib['k'] == "amenity");

def auditAMMENITY(osmfile): # executes city listings
    osm_file = open(osmfile, "r")
    amenity_names = set([])
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_amenity(tag):
                    audit_amenity(amenity_names, tag.attrib['v'])
    osm_file.close()
    return amenity_names;
#auditAMMENITY('san-francisco_california.osm');
# takeaway: all amenities are good

In [106]:
def audit_post(postal_codes, code):
    postal_codes.add(code)
    return postal_codes
            
def is_code(elem):
    return (elem.attrib['k'] == "addr:postcode")

def auditCode(osmfile): # executes city listings
    osm_file = open(osmfile, "r")
    postal_codes = set([])
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_code(tag):
                    audit_post(postal_codes, tag.attrib['v'])
    osm_file.close()
    return postal_codes;

# takeaway: fix remove "CA" 
# truncate after five digits
# map CA ZIPCODE to just Zip Code

In [107]:
def clean_postal_code(post_code):   
    if(post_code[0] != '9' and post_code[0] != 'C'):
        post_code = 'Invalid Zip Code'
    elif(post_code == 'CA'):
        post_code = 'Invalid Zip Code'
    elif(len(post_code) > 5): #truncate zip codes that have secondary codes with a (-)
        if(post_code[:3] == 'CA ' or post_code[:3] == 'CA:'): # 'CA 94080' case
            post_code = post_code[3:]
        elif(post_code[:3] == 'CA9'):
            post_code = post_code[2:]
    return post_code

# execution code: clean the postal codes
def clean_postal(osmfile):
    for event, element in ET.iterparse(osmfile, events=("start","end")):
        if element.tag == "node" or element.tag == "way":
            for tag in element.iter("tag"):        
                if tag.attrib['k'] == "addr:postcode":
                    tag.attrib['v'] = clean_postal_code(tag.attrib['v'])
                    #print tag.attrib['v']


In [108]:
def clean_everything(osmfile):
    clean_streets(osmfile);
    clean_cities(osmfile);
    clean_postal(osmfile);
clean_everything("san-francisco_california.osm")    

<_sre.SRE_Match object at 0x00000004EECE2F38>
New Montgomery => New Montgomery
<_sre.SRE_Match object at 0x00000004EECE2F38>
St Marys Rd => St Marys Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Buena Vista Rd => Buena Vista Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Mecartney Rd => Mecartney Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Willow Rd => Willow Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Se Quad I-680 / Rudgear Rd => Se Quad I-680 / Rudgear Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
St Jude Rd => St Jude Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Ascot Rd => Ascot Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Ygnacio Valley Rd => Ygnacio Valley Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
San Mateo Rd => San Mateo Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Marshlands Rd => Marshlands Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Rollins Rd => Rollins Road
<_sre.SRE_Match object at 0x00000004EECE2F38>
Market S