In [36]:
#Generating sample for initial exploration

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input: the full OSM XML extract that the sampling loop below reads from.
OSM_FILE = "data/bergen.osm"  # Replace this with your osm file
# Output: the reduced sample document is written to this path.
SAMPLE_FILE = "data/sample.osm"

# Sampling stride used by the writer loop below (element i is kept when i % k == 0).
k = 20 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    The file is parsed incrementally. The first event supplies the document
    root, which is cleared each time the consumer asks for the next element,
    so a yielded element must be used (e.g. serialised) before advancing the
    generator.

    Args:
        osm_file: Filename or file object accepted by ET.iterparse.
        tags: Collection of tag names to yield.

    Yields:
        xml.etree.ElementTree.Element objects, in document order.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the 'start' of the document root.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs when the consumer resumes the generator: drop everything
        # parsed so far so memory use stays flat on large files.
        root.clear()


# Build the sample document: XML prologue, an <osm> wrapper, every k-th
# top-level element from OSM_FILE, then the closing tag.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n' b'<osm>\n  ')

    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k != 0:
            # Not on the sampling stride; skip it.
            continue
        # Serialise before advancing: get_element clears the tree when resumed.
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [37]:
# Parse directly from the path so the XML parser reads the raw bytes and
# honours the document's own UTF-8 declaration. Opening the file in text mode
# first would decode it with the platform's default encoding, which is not
# UTF-8 everywhere and would corrupt the Norwegian characters in the data.
tree = ET.parse('data/sample.osm')
root = tree.getroot()

# Count how often each element tag occurs anywhere in the sample document
# (the root element itself is included by root.iter()).
tag_dict = {}

for line in root.iter():
    tag = line.tag
    tag_dict[tag] = tag_dict.get(tag, 0) + 1

print(tag_dict)


{'osm': 1, 'tag': 25747, 'way': 2620, 'nd': 30939, 'relation': 130, 'member': 1055, 'node': 31439}


In [138]:
#Generating list of all Bergen street names as of 2005. Source: Wikipedia.

# `re` is imported in this cell because the regex below is compiled here;
# relying on an import from a later cell breaks Restart & Run All.
import re

import requests
from bs4 import BeautifulSoup

# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops us from scraping an HTTP error page as if it were data.
r = requests.get('https://no.wikipedia.org/wiki/Liste_over_Bergens_gater', timeout=30)
r.raise_for_status()

wiki_soup = BeautifulSoup(r.text, 'lxml')

street_div = wiki_soup.find(id='mw-content-text')

# Collect the text of every <li> in page order.
scraped_names = []
for name in street_div.find_all('li'):
    scraped_names.append(name.string)
    #Åstveitveien is the last name of the page, so stopping after that row to avoid bad data entries
    if name.string == 'Åstveitveien':
        break

#Defining search query for characters that should not be found in street names
problemchars = re.compile(r'[=\+/&<>;"\?%#$@\,:<>\t\r\n]')

#Removing incorrect items from street name list. Printing them out for transparency/QA.
# The clean list is built in a single pass: deleting from a list while
# iterating over it skips the entry after each deletion. Entries whose
# .string is None (an <li> with more than one child node) are rejected too,
# since they carry no usable name and cannot be searched by the regex.
bergen_street_names = []
for item in scraped_names:
    if item is None or problemchars.search(item):
        print('PROBLEM STREET NAME:', item)
        print('DELETING NAME', item, 'FROM STREET NAME LIST.')
    else:
        bergen_street_names.append(item)
        

Next I will audit the quality of the street names in the data set. One tricky part is that Norwegian street names are usually written as a single compound word, with the street type fused onto the end of the name rather than set apart as a separate word. For example, the equivalent of Main Street is Hovedgaten.

Because of this, the main audit criterion will be whether the name exists in the Bergen street name list generated above.

In [179]:
#Auditing street name quality

from collections import defaultdict
from pprint import pprint
import re

#Auditing street names

# Path of the OSM XML file that audit() parses.
OSMFILE = "data/bergen.osm"

def is_street_name(elem):
    """Return True when the element's 'k' attribute equals "addr:street".

    The attribute is read with .get() so that inspecting an element never
    modifies it; elements without a 'k' attribute simply yield False.

    Args:
        elem: An element exposing an `attrib` mapping.

    Returns:
        bool: True if elem.attrib['k'] == "addr:street", otherwise False.
    """
    return elem.attrib.get('k') == "addr:street"

# Audit result, filled in by audit_street_type: maps the last six characters
# of a rejected street name to the set of full names that ended that way.
street_types = defaultdict(set)

# Accepted street-type fragments: a name passes the audit when any of these
# occurs somewhere within its last six characters.
expected = ["gate", "gaten", "vei", "veien", "veg", "vegen", "lien", "neset", "smauet", "allé",
           "høgda", "plass", "dalen", "haugen", "myra"]
# Longer accepted endings: a name also passes when its last six characters
# occur inside one of these strings.
expected_long = ["allmenningen", "fjorden","Flagget","Smålonane","Tangen"]

def audit_street_type(street_types, street_name):
    """Record `street_name` in `street_types` unless it passes the audit.

    A name passes when any of the following holds:
      * it appears in the module-level `bergen_street_names` list;
      * its last six characters contain one of the fragments in `expected`;
      * its last six characters occur inside one of the `expected_long`
        strings (only checked when `expected` is non-empty).

    Args:
        street_types: Mapping from a six-character ending to a set of names;
            a failing name is added under its own ending.
        street_name: The street name to check.

    Returns:
        None. `street_types` is updated in place.
    """
    ending = street_name[-6:]

    if street_name in bergen_street_names:
        return

    if any(fragment in ending for fragment in expected):
        return

    # The long-ending check only applies when there is at least one entry
    # in `expected`.
    if expected and any(ending in long_ending for long_ending in expected_long):
        return

    street_types[ending].add(street_name)
            
def print_sorted_dict(d):
    """Print every `key: value` pair of `d`, one per line.

    Lines are ordered by the lower-cased key, so ordering ignores case.

    Args:
        d: Mapping whose keys provide a .lower() method.

    Returns:
        None. Output goes to standard output.
    """
    for key in sorted(d.keys(), key=lambda s: s.lower()):
        print('{0}: {1}'.format(key, d[key]))

def audit():
    """Audit the addr:street value of every way in OSMFILE and print the failures.

    Each <way> is inspected on its "end" event. On a "start" event the parser
    has only seen the opening tag, so the element's child <tag> entries may
    not have been parsed yet and street names could be missed.

    Failing names are collected in the module-level `street_types` mapping
    via audit_street_type, and that mapping is pretty-printed at the end.

    Returns:
        None.
    """
    for _, elem in ET.iterparse(OSMFILE, events=("end",)):
        if elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
        if elem.tag in ("node", "way", "relation"):
            # Finished with this top-level element: drop its children and
            # attributes so memory does not grow with the size of the file.
            elem.clear()
    pprint(dict(street_types))


audit()

{'eien 1': {'Laguneveien 1'},
 's Gate': {'Thormøhlens Gate'},
 'øgda 9': {'Smøråshøgda 9'}}


In [181]:
#Improving names

# Street-type corrections used by update_name: key is the spelling found in
# the data, value is the spelling it is replaced with.
mapping = { "Gate": "gate",
            }

def update_name(name, mapping):
    """Replace the trailing street type of `name` according to `mapping`.

    The street type is whatever the module-level `street_type_re` pattern
    matches in `name`. Only that matched span is replaced, and the
    replacement text is inserted literally, so other occurrences of the same
    text earlier in the name are left alone.

    NOTE(review): `street_type_re` is not defined in any visible cell of this
    notebook; it must be defined before this function is called - confirm.

    Args:
        name: Street name to correct.
        mapping: Dict from a street-type spelling to its replacement.

    Returns:
        The corrected name, or `name` unchanged when the pattern does not
        match or the matched street type has no entry in `mapping`.
    """
    m = street_type_re.search(name)
    if m is None:
        # No recognisable street type (e.g. empty string): nothing to fix.
        return name

    street_type = m.group()
    if street_type in mapping:
        name = name[:m.start()] + mapping[street_type] + name[m.end():]

    return name

def generate_new_names(street_types):
    """Build a mapping from each audited street name to its cleaned-up form.

    For every name, a trailing street number (a space followed by digits at
    the end of the string) is removed first, and the result is then passed
    through update_name with the module-level `mapping`. Doing both steps
    means a name such as "X Gate 5" loses its number and also gets its
    street type corrected.

    Args:
        street_types: Mapping whose values are collections of street names,
            as produced by the audit.

    Returns:
        dict: original name -> improved name.
    """
    better_names = {}
    for ways in street_types.values():
        for old_name in ways:
            #if old_name ends in street number, remove street number
            better_name = re.sub(' [0-9]*$', '', old_name)

            # update_name needs a trailing word to work on; skip it when
            # stripping left nothing, or left trailing whitespace.
            if better_name and not better_name[-1].isspace():
                better_name = update_name(better_name, mapping)

            better_names[old_name] = better_name

    return better_names

# Show the proposed corrections for the names collected in street_types.
generate_new_names(street_types)

{'Laguneveien 1': 'Laguneveien',
 'Smøråshøgda 9': 'Smøråshøgda',
 'Thormøhlens Gate': 'Thormøhlens gate'}

Sources:

For downloading map data: 
- https://mapzen.com/data/metro-extracts/ 
- https://mapzen.com/documentation/metro-extracts/file-format/ 

http://www.openstreetmap.org/relation/404159#map=10/60.3572/5.4163