In [36]:
#Generating sample for initial exploration. Code (minus path changes) provided by Udacity project description

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "data/bergen.osm"  # Replace this with your osm file
SAMPLE_FILE = "data/sample.osm"

k = 20 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [182]:
#tag count of dataset

with open('data/bergen.osm') as f:
    tree = ET.parse(f)
    root = tree.getroot()
    
    tag_dict = {}
    
    for line in root.iter():
        tag = line.tag
        
        tag_dict[tag] = tag_dict.get(tag, 0) + 1
        
    print(tag_dict)


{'osm': 1, 'tag': 515044, 'bounds': 1, 'nd': 636272, 'relation': 2595, 'member': 21241, 'way': 52393, 'node': 628779}


In [138]:
#Generating list of all Bergen street names as of 2005. Source: Wikipedia.

import requests
from bs4 import BeautifulSoup

bergen_street_names = []

r = requests.get('https://no.wikipedia.org/wiki/Liste_over_Bergens_gater')

wiki_soup = BeautifulSoup(r.text, 'lxml')

street_div = wiki_soup.find(id='mw-content-text')

for name in street_div.find_all('li'):
    #Åstveitveien is the last name of the page, so stopping after that row to avoid bad data entries
    if name.string == 'Åstveitveien':
        bergen_street_names.append(name.string)
        break
    else:
        bergen_street_names.append(name.string)
    

#Defining search query for characters that should not be found in street names
problemchars = re.compile(r'[=\+/&<>;"\?%#$@\,:<>\t\r\n]')

#Removing incorrect items from street name list. Printing them out for transparency/QA.
for item in bergen_street_names:
    if problemchars.search(item):
        print('PROBLEM STREET NAME:',item)
        print('DELETING NAME', bergen_street_names[count], 'FROM STREET NAME LIST.')
        del(bergen_street_names[count])
        

Next I will audit the quality of the street names in the data set. One tricky part is that Norwegian street names are concatenated with no clear distinction between the different words. For example, the equivalent of Main Street is Hovedgaten.

Due to this the main audit criteria will be whether the name exists in the Bergen street name list generated above.

In [179]:
#Auditing street name quiality

from collections import defaultdict
from pprint import pprint
import re

#Auditing street names

OSMFILE = "data/bergen.osm"

def is_street_name(elem):
    return (elem.attrib.setdefault('k',None) == "addr:street")

street_types = defaultdict(set)

expected = ["gate", "gaten", "vei", "veien", "veg", "vegen", "lien", "neset", "smauet", "allé",
           "høgda", "plass", "dalen", "haugen", "myra"]
expected_long = ["allmenningen", "fjorden","Flagget","Smålonane","Tangen"]

def audit_street_type(street_types, street_name):

    valid = 0
    street_type = street_name[-6::]
    
    if street_name in bergen_street_names:
        valid = 1
    
    else:
        for s_type in expected:

            if s_type in street_type:
                valid = 1
            else:
                for name in expected_long:
                    if street_type in name:
                        valid = 1                    

    if valid == 0:
        street_types[street_type].add(street_name)
            
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print('{0}: {1}'.format(k, v))

def audit():
    for event, elem in ET.iterparse(OSMFILE, events=("start",)):
        if elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    pprint(dict(street_types))
            
            
audit()

{'eien 1': {'Laguneveien 1'},
 's Gate': {'Thormøhlens Gate'},
 'øgda 9': {'Smøråshøgda 9'}}


In [181]:
#Improving names

mapping = { "Gate": "gate",
            }

def update_name(name, mapping):

    # YOUR CODE HERE
    m = street_type_re.search(name)
    street_type = m.group()
    
    if street_type in mapping.keys():
        name = re.sub(street_type,mapping[street_type],name)

    return name

def generate_new_names(street_types):
    better_names = {}
    for st_type, ways in street_types.items():

        for old_name in ways:
            
            #if old_name ends in street number, remove street number
            if re.search(' [0-9]*$',old_name):
                better_name = re.sub(' [0-9]*$','',old_name)
                
            else:
                better_name = update_name(old_name, mapping)
            better_names[old_name] = better_name
            
    return better_names

generate_new_names(street_types)

{'Laguneveien 1': 'Laguneveien',
 'Smøråshøgda 9': 'Smøråshøgda',
 'Thormøhlens Gate': 'Thormøhlens gate'}

In [18]:
with codecs.open("test.json", "w") as fo:
    fo.write(json.dumps("bradæ å æ", ensure_ascii=False))

In [20]:
#Quiz: Preparing for database - MongoDB

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import re
import codecs
import json
from pprint import pprint



lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
multi_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:(.)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


# el = shape_element(element)
def shape_element(element):
    node = {}
    
    if element.tag == "node" or element.tag == "way" :
        
        
        # YOUR CODE HERE
        
        node['type'] = element.tag

        created = dict()
        address = dict()
        colon_dict = dict()
        node_refs = list()
        
        for nod in element.iter():  
            if element.tag == 'way' and nod.tag == 'nd':
                node_refs.append(nod.attrib['ref'])
                
            for k,v in nod.attrib.items():
            
                if 'k' in nod.keys():
                    node_key = nod.attrib['k']
                    node_val = nod.attrib['v']
                else:
                    node_key = ''
                    node_val = ''
            
                node_keys = nod.keys()
                node_vals = nod.attrib

                if problemchars.search(node_key):
                    pass

                elif re.match('addr:',node_key):
                    k_list = node_key.rstrip(':').split(':')[1:]
                    k_len =  len(k_list)
         
                    if k_len == 1:   
                        address[k_list[0]] = node_val
                    elif k_len == 2 and k_list[0] == 'street':
                        pass
                    
                    elif k_len > 1:
                        print(('RECHECK ADDRESS, UNEXPECTED  VARIABLE COUNT: {0}, {1}').format(k_len,k_list))
                
    
                elif lower_colon.search(node_key):
                    colon_dict[node_key] = node_val
                
                                
                elif k in ('k','v'):
                    pass
                
                elif k in CREATED:
                    created[k] = v
                    
                elif k == 'lat':
                    lat = float(v)
                elif k == 'lon':
                    lon = float(v)

                else:    
                   
                    node[k] = v
                
                
            
            if 'lat' in nod.attrib.keys():
                node['pos'] = [lat,lon]

        node['created'] = created
        
        if len(node_refs) != 0:
            node['node_refs'] = node_refs

        if len(address) != 0:
                node['address'] = {}
                for key in address.keys():
                    node['address'][key] = address[key]

        if len(colon_dict) > 0:

            for k in colon_dict:
                k_list = k.rstrip(':').split(':')
                if len(k_list) ==  2:
                    node[k_list[0]] = {k_list[1] : colon_dict[k]} 
                    

        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        
        count = 0
        
        for _, element in ET.iterparse(file_in):
            
            
            count += 1
            
            if count == 100:
                print("above 100")
            elif count == 1000:
                print("above 1000")
            elif count == 10000:
                print("above 10000")
            elif count == 100000:
                print("above 100000")            
            
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    #Turning off ascii control to allow Norwegian letters æøå
                    fo.write(json.dumps(el, indent=2, ensure_ascii=False)+"\n")
                else:
                    #Turning off ascii control to allow Norwegian letters æøå
                    fo.write(json.dumps(el,ensure_ascii=False) + "\n")
    return data

def test():
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map('data/sample.osm', False)
#     pprint(data)   
    
    correct_first_elem = {
        "id": "261114295", 
        "visible": "true", 
        "type": "node", 
        "pos": [41.9730791, -87.6866303], 
        "created": {
            "changeset": "11129782", 
            "user": "bbmiller", 
            "version": "7", 
            "uid": "451048", 
            "timestamp": "2012-03-28T18:31:23Z"
        }
    }
    
    pprint(data[0])
#     pprint(data[-10:-1])
    
    for row in data:
        if row['id'] == "376535335":
            print(row)
    
    assert data[0] == correct_first_elem
    assert data[-1]["address"] == {
                                    "street": "West Lexington St.", 
                                    "housenumber": "1412"
                                      }
    assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                    "2199822370", "2199822284", "2199822281"]

if __name__ == "__main__":
    test()

above 100
above 1000
above 10000
{'created': {'changeset': '6007582',
             'timestamp': '2010-10-10T22:30:34Z',
             'uid': '114230',
             'user': 'danerikk',
             'version': '2'},
 'id': '358065',
 'pos': [60.5320227, 5.2557628],
 'type': 'node'}
{'node_refs': ['3799466707', '3799466661', '3799466708', '3799466731', '3799466735', '3799466733', '3799466740', '1850625657', '3799466869', '3799466751', '3799466707'], 'type': 'way', 'address': {'city': 'Bergen', 'housenumber': '13', 'postcode': '5003', 'street': 'Øvregaten'}, 'id': '376535335', 'ref': '3799466707', 'created': {'changeset': '34822543', 'timestamp': '2015-10-23T15:11:21Z', 'user': 'daviesp12', 'uid': '722193', 'version': '1'}}


AssertionError: 

Sources:

For downloading map data: 
- https://mapzen.com/data/metro-extracts/ 
- https://mapzen.com/documentation/metro-extracts/file-format/ 

http://www.openstreetmap.org/relation/404159#map=10/60.3572/5.4163

Steps taken:  
1. Clean street names  
2. Generate json file from sample file  
3. While reviewing the output file I noticed that the Norwegian characters æøå was not encoded correctly. After debugging for a while I figured out that the error happened in the json module when running json.dumps(). Setting ensure_ascii = False solved this.

Steps to go:
