In [3]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# Define a function to count all types of tags in the xml file
def count_tags(filename):
    tags_dict = defaultdict(int)
    for event, elem in ET.iterparse(filename, events=("start",)):
        tags_dict[elem.tag] += 1
    return tags_dict

count_tags("toronto_canada.osm")

defaultdict(int,
            {'bounds': 1,
             'member': 85911,
             'nd': 6085327,
             'node': 5452721,
             'osm': 1,
             'relation': 5713,
             'tag': 4392551,
             'way': 649048})

In [4]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# Regular expressions for different types of values that can show up in the file
lower = re.compile(r'^([a-z]|_)*$') #for tags that contain only lowercase letters and are valid
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') #for otherwise valid tags with a colon in their names
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') #for tags with problematic characters

# Define the function to find out the type of "k" attribute in "tag" tag
def key_type(element, keys):
    if element.tag == "tag":
        if lower.match(element.get("k")):
            keys["lower"] += 1
        elif lower_colon.match(element.get("k")):
            keys["lower_colon"] += 1
        elif problemchars.match(element.get("k")):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
    return keys

# Iterate through the whole file to find out all types for "k" attribute in "tag" tag
def find_types(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

find_types("toronto_canada.osm")

{'lower': 2502530, 'lower_colon': 1760581, 'other': 129440, 'problemchars': 0}

In [None]:
# Now find out the unique users who have contributed to the map in this particular area
def get_user(element):
    return element.get("user")

def count_unique(filename):
    users = set()
    print(users)
    for _, element in ET.iterparse(filename):
        if "user" in element.attrib:
            users.add(get_user(element))
    print(len(users))

count_unique("toronto_canada.osm")

set([])
1713


In [2]:
from collections import defaultdict
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(filename):
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(filename, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types

# Find out all the street types that are not in expected
audit("toronto_canada.osm")

defaultdict(set,
            {'1': {'Concession 1',
              'Concession Road 1',
              'County Road 1',
              'Line 1',
              'Regional Road 1'},
             '10': {'Concession Road 10',
              'County Road  10',
              'County Road 10',
              'Highway 10',
              'Line 10',
              'Sideroad  10',
              'Sideroad 10'},
             '101': {'Lane 101'},
             '102': {'Lane 102'},
             '106': {'Lane 106'},
             '107': {'Lane 107'},
             '109': {'County Road 109'},
             '10a': {'Firelane 10a'},
             '11': {'Concession 11',
              'Concession Road 11',
              'County Road 11',
              'Highway 11',
              'Line 11'},
             '1134': {'1134'},
             '11a': {'Firelane 11a'},
             '11b': {'Firelane 11b'},
             '12': {'Concession Road 12',
              'Highway  7 & 12',
              'Highway 7 & 12',
              'H

In [1]:
from collections import defaultdict
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

mapping = {"St": "Street",
           "St.": "Street",
           "street": "Street",
           "Rd.": "Road",
           "Rd": "Road",
           "Ave": "Avenue",
           "Ave.": "Avenue",
           "Blvd": "Boulevard",
           "Blvd.": "Boulevard",
           "Boulevade": "Boulevard",
           "Cir": "Circle",
           "Cres": "Crescent",
           "Cressent": "Crescent",
           "Crt.": "Court",
           "Dr": "Drive",
           "Dr.": "Drive",
           "Driver": "Drive",
           "Terace": "Terrace"
          }

# Define a function to update the street name if it contains problem types in mapping
def update_name(name, mapping):
    m = street_type_re.search(name)
    if m and (m.group() in mapping.keys()):
        return re.sub(street_type_re, mapping[m.group()], name)
    else: # if problem in mapping not found, keep the same
        return name

# Define the function to audit street name, i.e., wheter problem in mapping is found in the name
def audit_street_name(street_name):
    m = street_type_re.search(street_name)
    if m:
        if m.group() in mapping.keys():
            return True
        else:
            return False

# Define a function to separate a string by the first found colon
def separate_by_colon(string, pos):
    if pos == "before":
        return string[:(string.index(":"))]
    elif pos == "after":
        return string[(string.index(":")+1):]

# Define the main function to reshape the element
def shape_element(element):
    element_reshaped = {}

    if element.tag == "node" or element.tag == "way" :

        element_reshaped["tag_type"] = element.tag
        element_reshaped["id"] = element.get("id")
        element_reshaped["visible"] = element.get("visible") 
        element_reshaped["created"] = {}
        
        for attr in ["version", "changeset", "timestamp", "user", "uid"]:
            element_reshaped["created"][attr] = element.get(attr) 

        if element.findall('nd'):
            element_reshaped["node_refs"] = []

        if element.get("lat"):
            try:
                element_reshaped["pos"] = [float(element.get("lat")), float(element.get("lon"))]
            except TypeError:
                element_reshaped["pos"] = [element.get("lat"), element.get("lon")]
        
        # For those k values with colons, create dictionaries for adding values later
        key_set = set()
        # Get all the keys
        for tag in element.iter("tag"): 
            k_val = tag.get("k")
            if ":" in k_val:
                key_set.add(separate_by_colon(k_val, "before")) 
        # Make the dictionaries
        for key in key_set: 
            element_reshaped[key] = {}

        # Iterate through all the tags in the element to add them into the "element_reshaped" dict
        for elem in element.iter():
            if elem.tag == "nd":
                element_reshaped["node_refs"].append(elem.get("ref"))
                            
            elif elem.tag == "tag":
                k_val = elem.get("k")
                v_val = elem.get("v")

                # update the v_val if it's a street name and problem is found
                if k_val == "addr:street":
                    if audit_street_name(v_val):
                        v_val = update_name(v_val, mapping)
                
                if problemchars.match(k_val):
                    continue
                elif ":" in k_val:
                    if len(re.findall(":", k_val)) > 1: # if there's more than 1 colon, ignore the tag.
                        continue
                    else: # when there's only one colon, add to the premade dict
                        element_reshaped[separate_by_colon(k_val, "before")][separate_by_colon(k_val, "after")] = v_val
                elif k_val in element_reshaped.keys(): # No colon, but same name key already made, add in sub_dict
                    element_reshaped[k_val][k_val] = v_val
                else: # for those without colon and no same name key exists, add directly
                    element_reshaped[k_val] = v_val

        return element_reshaped
    
    else:
        return None
        
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

process_map("toronto_canada.osm", False)

[{'created': {'changeset': '15661098',
   'timestamp': '2013-04-09T01:43:19Z',
   'uid': '1679',
   'user': 'andrewpmk',
   'version': '11'},
  'id': '699540',
  'pos': [43.6751621, -79.361332],
  'tag_type': 'node',
  'visible': None},
 {'created': {'changeset': '14075648',
   'timestamp': '2012-11-28T16:44:20Z',
   'uid': '425846',
   'user': 'WhiskyMacK',
   'version': '4'},
  'id': '699569',
  'pos': [43.7247576, -79.3302633],
  'tag_type': 'node',
  'visible': None},
 {'created': {'changeset': '24170472',
   'timestamp': '2014-07-15T21:31:59Z',
   'uid': '51600',
   'user': 'brandoncote',
   'version': '6'},
  'highway': 'traffic_signals',
  'id': '699620',
  'pos': [43.8886982, -78.9030497],
  'source': 'Bing',
  'tag_type': 'node',
  'visible': None},
 {'created': {'changeset': '2659290',
   'timestamp': '2009-09-27T23:17:56Z',
   'uid': '1679',
   'user': 'andrewpmk',
   'version': '2'},
  'id': '699622',
  'pos': [43.8881854, -78.9027903],
  'tag_type': 'node',
  'visible': No

In [1]:
# Get a small sample of the map data used that is 1/20 of the original dataset

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "toronto_canada.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every 10th top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % 20 == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [3]:
# Create a test jason file

import jason

x = {
    "layout" : "rear mid-engine rear-wheel-drive layout",
    "name" : "Porsche Boxster",
    "productionYears" : [ ],
    "modelYears" : [ ],
    "bodyStyle" : "roadster",
    "assembly" : [
        "Finland",
        "Germany",
        "Stuttgart",
        "Uusikaupunki"
    ],
    "class" : "sports car",
    "manufacturer" : "Porsche"
}

with open("car.json", "w") as f:
    json.dump(x, f)


In [12]:
# Do a test query using the imported test json file

from pymongo import MongoClient
client = MongoClient('localhost:27017')
db = client.auto

#db.car.find(query)

db.car.find_one()

{u'_id': ObjectId('5668703bc9b35aa1ee9a12e7'),
 u'assembly': [u'Finland', u'Germany', u'Stuttgart', u'Uusikaupunki'],
 u'bodyStyle': u'roadster',
 u'class': u'sports car',
 u'layout': u'rear mid-engine rear-wheel-drive layout',
 u'manufacturer': u'Porsche',
 u'modelYears': [],
 u'name': u'Porsche Boxster',
 u'productionYears': []}

In [3]:
# Test query on the imported osm database
from pymongo import MongoClient
client = MongoClient('localhost:27017')

db = client.osm
db.to.find_one()

{u'_id': ObjectId('566ce19a56c3f83762325ff2'),
 u'created': {u'changeset': u'14075648',
  u'timestamp': u'2012-11-28T16:44:20Z',
  u'uid': u'425846',
  u'user': u'WhiskyMacK',
  u'version': u'4'},
 u'id': u'699569',
 u'pos': [43.7247576, -79.3302633],
 u'tag_type': u'node',
 u'visible': None}

In [5]:
amenity = db.sample.aggregate([
        {"$match" : {"amenity" : {"$exists" : 1}}},
        {"$group" : {"_id" : "$amenity",
                     "count" : {"$sum" : 1}}},
        {"$sort" : {"count" : -1}},
        {"$limit" : 10}
    ])
pprint.pprint([doc for doc in amenity])

[{u'_id': u'parking', u'count': 1214},
 {u'_id': u'restaurant', u'count': 150},
 {u'_id': u'fast_food', u'count': 148},
 {u'_id': u'school', u'count': 131},
 {u'_id': u'place_of_worship', u'count': 107},
 {u'_id': u'bench', u'count': 106},
 {u'_id': u'post_box', u'count': 81},
 {u'_id': u'cafe', u'count': 68},
 {u'_id': u'fuel', u'count': 68},
 {u'_id': u'waste_basket', u'count': 57}]
