# Wrangling  OpenStreetMap Data  with MongoDB
### Duc Vu

OpenStreetMap is an open project that lets eveyone use and create a free editable map of the world.

## 1. Chosen Map Area

In this project, I choose to analyze data from Boston, Massachusetts want to show you to fix one type of error, that is the address of the street. And not only that, I also will show you how to put the data that has been audited into MongoDB instance. We also use MongoDB's Agregation Framework to get overview and analysis of the data

In [1]:
from IPython.display import HTML
HTML('<iframe width="425" height="350" frameborder="0" scrolling="no" marginheight="0" marginwidth="0" \
src="http://www.openstreetmap.org/export/embed.html?bbox=-81.793212890625%2C28.75441649498853%2C-80.85113525390625%2C29.756032197482973\
&amp;layer=mapnik"></iframe><br/><small><a href="http://www.openstreetmap.org/#map=10/29.2565/-81.3222" \
target="_blank">View Larger Map</a></small>')

In [2]:
from IPython.display import HTML
HTML('<iframe width="425" height="350" frameborder="0" scrolling="no" marginheight="0" marginwidth="0" \
src="http://www.openstreetmap.org/export/embed.html?bbox=-71.442,42.1858,-70.6984,42.4918&amp;layer=mapnik"></iframe><br/>')

In [3]:
from IPython.display import HTML
HTML('<iframe width="425" height="350" frameborder="0" scrolling="no" marginheight="0" marginwidth="0" \
src="http://www.openstreetmap.org/export/embed.html?bbox=-88.0671,41.7314,-87.3235,42.0396&amp;layer=mapnik"></iframe><br/>')

The dataset is here https://s3.amazonaws.com/metro-extracts.mapzen.com/boston_massachusetts.osm.bz2

In [4]:
import requests

url = 'http://overpass-api.de/api/map?bbox=-81.5600,28.8400,-80.7400,29.6713'
#filename = 'volusia_flagler.osm'
#filename = 'chicago_illinois.osm'
filename = 'boston_massachusetts.osm'

In [5]:
def download_file(url, local_filename):
    # stream = True allows downloading of large files; prevents loading entire file into memory
    r = requests.get(url, stream = True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
                
download_file(url, filename)

## 2. Auditing the Data

In [6]:
import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
        tags = {}
        for event, elem in ET.iterparse(filename):
            if elem.tag in tags:
                tags[elem.tag] +=1
            else:
                tags[elem.tag]= 1
                
        return tags
#tags = count_tags('chicago_illinois.osm')
tags = count_tags(filename)
pprint.pprint(tags)

{'bounds': 1,
 'member': 27231,
 'meta': 1,
 'nd': 378367,
 'node': 322114,
 'note': 1,
 'osm': 1,
 'relation': 344,
 'tag': 217210,
 'way': 29774}


In [7]:
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":

        if lower.search(element.attrib['k']):
            keys["lower"] += 1
        elif lower_colon.search(element.attrib['k']):
            keys["lower_colon"] += 1
        elif problemchars.search(element.attrib['k']):
            keys["problemchars"] +=1
        else:
            keys["other"] +=1
        

    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


keys = process_map(filename)
pprint.pprint(keys)

{'lower': 83013, 'lower_colon': 127022, 'other': 7174, 'problemchars': 1}


In [8]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        #print element.attrib
        
        try:
            users.add(element.attrib['uid'])
        except KeyError:
            continue
        '''
        if "uid" in element.attrib:
            users.add(element.attrib['uid'])
        '''
    return users

users = process_map(filename)
#pprint.pprint(users)
print len(users)

305


## 3. Problems Encountered in the Map

In [9]:
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

In [10]:
def audit_street_type(street_types, street_name, rex):
    #m = street_type_re.search(street_name)
    m = rex.search(street_name)
    #print m
    #print m.group()
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


In [11]:
def audit(osmfile,rex):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'],rex)

    return street_types


In [12]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [13]:
st_types = audit(filename, rex = street_type_re)
pprint.pprint(dict(st_types))

{'1': set(['US 1']),
 '10': set(['Deltona Boulevard, #10']),
 '100': set(['W. Highway 100']),
 '13': set(['Saxon Boulevard Suite 13']),
 '201': set(['Saxon Boulevard, Suite 201']),
 '44': set(['State Road 44']),
 'A': set(['Saxon Boulevard, Suite A']),
 'Ave': set(['Central Ave',
             'E Arizona Ave',
             'E Euclid Ave',
             'E MIchigan Ave',
             'E Michigan Ave',
             'E Minnesota Ave',
             'E Pennsylania Ave',
             'E Pennsylvania Ave',
             'E Stetson Ave',
             'E University Ave',
             'N Amelia Ave',
             'Rhode Island Ave',
             'W Minnesota Ave',
             'W Pennsylvania Ave']),
 'Blvd': set(['Commerce Blvd',
              'Harley Strickland Blvd',
              'Howland Blvd',
              'Mahogany Blvd',
              'N Woodland Blvd',
              'S Clyde Morris Blvd',
              'S Woodland Blvd',
              'Seasame Blvd',
              'Town Center Blvd',
    

In [14]:
# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Rd." : "Road",
            "Ave" : "Avenue",
            "ave" : "Avenue",
            "Street." : "Street",
            "St,": "Street", 
            "ST": "Street", 
            "Sq." : "Square",
            "Rd" : "Road",
            "Pl": "Place",
            "HIghway": "Highway",
            "LEVEL": "Level",
            "Ct" : "Court",
            "Ave.": "Avenue"
            }

In [15]:
def update_name(name, mapping,rex):

    #m = street_type_re.search(name)
    m = rex.search(name)
    if m:
        street_type = m.group()
        new_street_type = mapping[street_type]
        name = re.sub(rex, new_street_type, name) # re.sub(old_pattern, new_pattern, file)
        #name = street_type_re.sub(new_street_type, name)
    return name

In [16]:
for st_type, ways in st_types.iteritems():
    if st_type in mapping:
        for name in ways:
            better_name = update_name(name, mapping, rex = street_type_re)
            print name, "=>", better_name

Third St. => Third Street
W Highbanks Rd => W Highbanks Road
North Nova Rd. => North Nova Road
10th St => 10th Street
E Minnesota Ave => E Minnesota Avenue
E Euclid Ave => E Euclid Avenue
E Arizona Ave => E Arizona Avenue
Central Ave => Central Avenue
E MIchigan Ave => E MIchigan Avenue
E Stetson Ave => E Stetson Avenue
W Pennsylvania Ave => W Pennsylvania Avenue
E Michigan Ave => E Michigan Avenue
W Minnesota Ave => W Minnesota Avenue
N Amelia Ave => N Amelia Avenue
E University Ave => E University Avenue
E Pennsylvania Ave => E Pennsylvania Avenue
Rhode Island Ave => Rhode Island Avenue
E Pennsylania Ave => E Pennsylania Avenue


In [17]:
cardinal_dir_re = re.compile(r'^[NSEW]\b\.?', re.IGNORECASE)

In [18]:
dir_st_types = audit(filename, rex = cardinal_dir_re)
pprint.pprint(dict(dir_st_types))

{'E': set(['E Arizona Ave',
           'E Bert Fish Dr',
           'E Euclid Ave',
           'E MIchigan Ave',
           'E Michigan Ave',
           'E Minnesota Ave',
           'E Pennsylania Ave',
           'E Pennsylvania Ave',
           'E Stetson Ave',
           'E University Ave']),
 'N': set(['N Amelia Ave',
           'N Bert FIsh Dr',
           'N Bert Fish Dr',
           'N Woodland Blvd']),
 'S': set(['S Clyde Morris Blvd', 'S Woodland Blvd']),
 'W': set(['W Granada Boulevard',
           'W Highbanks Rd',
           'W International Speedway Blvd',
           'W Minnesota Ave',
           'W Pennsylvania Ave']),
 'W.': set(['W. Highway 100'])}


In [19]:
cardinal_directions_mapping = \
    {
        "E" : "East",
        "N" : "North",
        "S" : "South",
        "W" : "West"
    }

In [20]:
for st_type, ways in dir_st_types.iteritems():
    if st_type in cardinal_directions_mapping:
        for name in ways:
            better_name = update_name(name, cardinal_directions_mapping, rex = cardinal_dir_re)
            print name, "=>", better_name

S Woodland Blvd => South Woodland Blvd
S Clyde Morris Blvd => South Clyde Morris Blvd
E Minnesota Ave => East Minnesota Ave
E Pennsylania Ave => East Pennsylania Ave
E Arizona Ave => East Arizona Ave
E MIchigan Ave => East MIchigan Ave
E Stetson Ave => East Stetson Ave
E Michigan Ave => East Michigan Ave
E University Ave => East University Ave
E Pennsylvania Ave => East Pennsylvania Ave
E Bert Fish Dr => East Bert Fish Dr
E Euclid Ave => East Euclid Ave
W International Speedway Blvd => West International Speedway Blvd
W Pennsylvania Ave => West Pennsylvania Ave
W Highbanks Rd => West Highbanks Rd
W Granada Boulevard => West Granada Boulevard
W Minnesota Ave => West Minnesota Ave
N Woodland Blvd => North Woodland Blvd
N Amelia Ave => North Amelia Ave
N Bert Fish Dr => North Bert Fish Dr
N Bert FIsh Dr => North Bert FIsh Dr


In [21]:
osm_file = open(filename, "r")
address_count = 0

for event, elem in ET.iterparse(osm_file, events=("start",)):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"): 
            if is_street_name(tag):
                address_count += 1

address_count

222

## 4. Preparing for MongoDB

In [33]:
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :

        node["type"] = element.tag
        
        #for key in element.attrib.keys()
        for key in element.attrib:
            #print key
            
            if key in CREATED:
                if "created" not in node:
                    node["created"] = {}
                node["created"][key] = element.attrib[key]
            
            elif key in ["lat","lon"]:
                if "pos" not in node:
                    node["pos"] = [None, None]
                if key == "lat":
                    node["pos"][0] = float(element.attrib[key])
                elif key == "lon":
                    node["pos"][1] = float(element.attrib[key])  
            else:
                node[key] = element.attrib[key]
           
            for tag in element.iter("tag"):
                tag_key = tag.attrib["k"]   # key
                tag_value = tag.attrib["v"] # value
                if not problemchars.match(tag_key):
                    if tag_key.startswith("addr:"):# Single colon beginning with addr
                        if "address" not in node:
                            node["address"] = {}
                        sub_addr = tag_key[len("addr:"):]
                        if not lower_colon.match(sub_addr): # Tags with no colon
                            node["address"][sub_addr] = tag_value
                    elif lower_colon.match(tag_key): # Single colon not beginnning with "addr:" 
                        node[tag_key] = tag_value 
                    else:
                        node[tag_key] = tag_value # Tags with no colon, not beginnning with "addr:" 
     
        for nd in element.iter("nd"):
            if "node_refs" not in node:
                node["node_refs"] = []
            node["node_refs"].append(nd.attrib["ref"]) 

        #print node
        return node
    else:
        return None

In [34]:
import codecs
import json

def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

process_map(filename)

[{'created': {'changeset': '10423645',
   'timestamp': '2012-01-18T00:10:09Z',
   'uid': '207745',
   'user': 'NE2',
   'version': '3'},
  'id': '26932333',
  'pos': [28.8386787, -81.3243472],
  'type': 'node'},
 {'created': {'changeset': '4233448',
   'timestamp': '2010-03-25T21:35:10Z',
   'uid': '207745',
   'user': 'NE2',
   'version': '2'},
  'id': '26932350',
  'pos': [28.8589616, -81.3236483],
  'type': 'node'},
 {'created': {'changeset': '4233448',
   'timestamp': '2010-03-25T21:35:10Z',
   'uid': '207745',
   'user': 'NE2',
   'version': '2'},
  'id': '26932351',
  'pos': [28.8602677, -81.3243886],
  'type': 'node'},
 {'created': {'changeset': '4233448',
   'timestamp': '2010-03-25T21:35:10Z',
   'uid': '207745',
   'user': 'NE2',
   'version': '2'},
  'id': '26932352',
  'pos': [28.8609912, -81.3250216],
  'type': 'node'},
 {'created': {'changeset': '3531266',
   'timestamp': '2010-01-03T18:35:11Z',
   'uid': '113450',
   'user': 'nfgusedautoparts',
   'version': '2'},
  'id'

## 5. Data Overview

In [35]:
import os
print "The downloaded file is {} MB".format(os.path.getsize(filename)/1.0e6) # convert from bytes to megabytes

The downloaded file is 74.445532 MB


In [36]:
print "The json file is {} MB".format(os.path.getsize(filename + ".json")/1.0e6) # convert from bytes to megabytes

The json file is 80.163839 MB


In [37]:
import signal
import subprocess

# The os.setsid() is passed in the argument preexec_fn so
# it's run after the fork() and before  exec() to run the shell.
pro = subprocess.Popen("mongod", preexec_fn = os.setsid) 

In [38]:
from pymongo import MongoClient

db_name = "osm"

client = MongoClient('localhost:27017')
db = client[db_name]

In [40]:
# Build mongoimport command
collection = filename[:filename.find(".")]
#working_directory = "/Users/jasondamiani/Developer/IPython/Notebooks/"
working_directory = "/Users/ducvu/Documents/ud032-master/final_project"

json_file = filename + ".json"

mongoimport_cmd = "mongoimport --db " + db_name + \
                  " --collection " + collection + \
                  " --file " + working_directory + json_file

# Before importing, drop collection if it exists
if collection in db.collection_names():
    print "dropping collection"
    db[collection].drop()

# Execute the command
print "Executing: " + mongoimport_cmd
subprocess.call(mongoimport_cmd.split())

ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused

## 6. Additional Ideas

### Additional data exploration using MongoDB queries

### Conclusion         