In [30]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections

In [None]:

"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""

https://docs.python.org/2/library/xml.etree.elementtree.html

attrib
A dictionary containing the element’s attributes. Note that while the attrib value is always a real mutable Python dictionary, an ElementTree implementation may choose to use another internal representation, and create the dictionary only if someone asks for it. To take advantage of such implementations, use the dictionary methods below whenever possible.

The following dictionary-like methods work on the element attributes.

clear()
Resets an element. This function removes all subelements, clears all attributes, and sets the text and tail attributes to None.

get(key, default=None)
Gets the element attribute named key.

Returns the attribute value, or default if the attribute was not found.

items()
Returns the element attributes as a sequence of (name, value) pairs. The attributes are returned in an arbitrary order.

keys()
Returns the elements attribute names as a list. The names are returned in an arbitrary order.

set(key, value)
Set the attribute key on the element to value.

In [40]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
POS = ["lon", "lat"]

In [94]:
cnt = 0
break_out = False
for event, element in ET.iterparse('example.osm'):
    #if element.tag == "node" or element.tag == "way":
    if element.tag == "node":
        for c in element.getchildren():
            if 'addr:city' in c.attrib.values():
                break_out = True
                break
        cnt = cnt + 1
        if break_out:
            break
    if cnt > 500:
        break  

In [96]:
element.attrib.items()

[('changeset', '17206049'),
 ('uid', '1219059'),
 ('timestamp', '2013-08-03T16:43:42Z'),
 ('lon', '-87.6921867'),
 ('visible', 'true'),
 ('version', '2'),
 ('user', 'linuxUser16'),
 ('lat', '41.9757030'),
 ('id', '2406124091')]

In [108]:
for c in element.getchildren():
    if c.tag == 'tag':
        key = c.get('k')
        val = c.get('v')
        if key.startswith('addr:'):
            print "{} : {}".format(key,val)
            key_list = key.split(':')
            if len(key_list) ==2:
                node['address'][key_list[1]] = val
            continue
        #check for special characters
        m = problemchars.search(key)
        if m:
            print 'problem char found {}'.format(m.group(0))
            continue
        #replace ':' with '_' to use as key
        m = lower_colon.search(key)
        if m:
            key = key.replace(':','_')
            node[key] = val
    elif c.tag == 'nd':
        node['reference'] = c.get['ref']

print node

addr:city : Chicago
addr:housenumber : 5157
addr:postcode : 60625
addr:street : North Lincoln Ave
{'addr_city': 'Chicago', 'created': {'changeset': '3061377', 'user': 'StellanL', 'version': '8', 'uid': '28775', 'timestamp': '2009-11-08T08:11:05Z'}, 'addr_postcode': '60625', 'addr_street': 'North Lincoln Ave', 'pos': ['-122.2948711', '37.5280244'], 'addr_housenumber': '5157', 'address': {'city': 'Chicago', 'street': 'North Lincoln Ave', 'housenumber': '5157', 'postcode': '60625'}, 'type': 'node'}


In [9]:
element.items() #returns a list

[('changeset', '3061377'),
 ('uid', '28775'),
 ('timestamp', '2009-11-08T08:11:05Z'),
 ('lon', '-122.2948711'),
 ('version', '8'),
 ('user', 'StellanL'),
 ('lat', '37.5280244'),
 ('id', '281393')]

In [16]:
element.attrib #returns dictionary

{'changeset': '3061377',
 'id': '281393',
 'lat': '37.5280244',
 'lon': '-122.2948711',
 'timestamp': '2009-11-08T08:11:05Z',
 'uid': '28775',
 'user': 'StellanL',
 'version': '8'}

In [73]:
for k,v in element.attrib.items():
    print k
    print v

changeset
6777070
uid
14293
timestamp
2010-12-27T12:28:53Z
lon
-122.0315503
version
18
user
KindredCoda
lat
37.3673202
id
26027702


In [21]:
element.get('id')

'281393'

In [41]:
node = {}

In [24]:
attrib_list = element.keys()
print attrib_list

['changeset', 'uid', 'timestamp', 'lon', 'version', 'user', 'lat', 'id']


In [43]:
node['created'] = dict()
node['pos'] = list()
node['address'] = dict()
print node

{'address': {}, 'pos': [], 'created': {}}


In [45]:
for a in attrib_list:
    if a in CREATED:
        node['created'][a] = element.get(a)
    elif a in POS:
        if a == 'lon':
            node['pos'].insert(0,element.get('lon'))
        elif a == 'lat':
            node['pos'].append(element.get('lat'))

print node

{'address': {}, 'pos': ['-122.2948711', '37.5280244'], 'created': {'changeset': '3061377', 'user': 'StellanL', 'version': '8', 'uid': '28775', 'timestamp': '2009-11-08T08:11:05Z'}}


In [46]:
node['type'] = element.tag
print node

{'address': {}, 'type': 'node', 'pos': ['-122.2948711', '37.5280244'], 'created': {'changeset': '3061377', 'user': 'StellanL', 'version': '8', 'uid': '28775', 'timestamp': '2009-11-08T08:11:05Z'}}


In [50]:
#get child tags
for c in element.getchildren():
    print c

In [152]:
def shape_element(element):
    POS = ["lon", "lat"]
    node = None
    if element.tag == "node" or element.tag == "way" :
        node = { 'created': dict(),
                 'pos': list(),
                 'address': dict(),
                 'node_refs': list()}
        # YOUR CODE HERE
        node['type'] = element.tag
        attrib_list = element.keys()
        #process attributes of the top level element (node or way)
        for a in attrib_list:
            if a in CREATED:
                node['created'][a] = element.get(a)
            elif a in POS:
                if a == 'lat':
                    node['pos'].insert(0,float(element.get('lat')))
                elif a == 'lon':
                    node['pos'].append(float(element.get('lon')))
            else:
                node[a] = element.get(a)

        #process child elements (may be tag, nd )
        for c in element.getchildren():
            if c.tag == 'tag':
                key = c.get('k')
                val = c.get('v')
                if key.startswith('addr:'):
                    #print "{} : {}".format(key,val)
                    key_list = key.split(':')
                    if len(key_list) ==2:
                        node['address'][key_list[1]] = val
                    continue
                #check for special characters
                m = problemchars.search(key)
                if m:
                    print 'problem char found {}'.format(m.group(0))
                    continue
                #replace ':' with '_' to use as key
                m = lower_colon.search(key)
                if m:
                    key = key.replace(':','_')
                    node[key] = val
            elif c.tag == 'nd':
                node['node_refs'].append(c.get('ref'))
        if not node['created']:
            del node['created']
        if not node['pos']:
            del node['pos']
        if not node['address']:
            del node['address']
        if not node['node_refs']:
            del node['node_refs']
    return node

In [148]:
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [143]:
def test():
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map('example.osm', True)
    pprint.pprint(data)
    
    correct_first_elem = {
        "id": "261114295", 
        "visible": "true", 
        "type": "node", 
        "pos": [41.9730791, -87.6866303], 
        "created": {
            "changeset": "11129782", 
            "user": "bbmiller", 
            "version": "7", 
            "uid": "451048", 
            "timestamp": "2012-03-28T18:31:23Z"
        }
    }
    assert data[0] == correct_first_elem
    assert data[-1]["address"] == {
                                    "street": "West Lexington St.", 
                                    "housenumber": "1412"
                                      }
    assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                    "2199822370", "2199822284", "2199822281"]

In [153]:
if __name__ == "__main__":
    #data = process_map('example.osm', True)
    #pprint.pprint(data)
    test()

[{'created': {'changeset': '11129782',
              'timestamp': '2012-03-28T18:31:23Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '7'},
  'id': '261114295',
  'pos': [41.9730791, -87.6866303],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8448766',
              'timestamp': '2011-06-15T17:04:54Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '6'},
  'id': '261114296',
  'pos': [41.9730416, -87.6878512],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8581395',
              'timestamp': '2011-06-29T14:14:14Z',
              'uid': '451048',
              'user': 'bbmiller',
              'version': '5'},
  'id': '261114299',
  'pos': [41.9729565, -87.6939548],
  'type': 'node',
  'visible': 'true'},
 {'created': {'changeset': '8581395',
              'timestamp': '2011-06-29T14:14:14Z',
              'uid': '451048',
              'user': 'bbmiller',