In [64]:
from IPython.display import display
import xml.etree.ElementTree as ET
import pprint
import re
from collections import defaultdict
import codecs
import json

DATA_FILE = 'data/buenos-aires_argentina.osm'

In [121]:
class XmlReader:
    """
    Streaming XML reader built on ``ElementTree.iterparse`` that adds limit
    and tag-filter options.
    See https://stackoverflow.com/a/42193997/3456290

    Completed passes of ``iterate`` are memoised per ``(filter_tag, limit)``
    pair when ``use_cache`` is true; the cache keeps every yielded element
    alive, so disable it for passes whose results are not needed again.
    """

    def __init__(self, filename):
        self.filename = filename
        # (filter_tag, limit) -> elements yielded by a completed pass
        self.cache = defaultdict(list)
        self.root = None

    def reset_doc(self, filename):
        """Start a fresh parse of ``filename`` and remember its root element."""
        self.doc = ET.iterparse(filename, events=('start', 'end'))
        _, self.root = next(self.doc)

    def root_tag(self):
        """Return the tag of the document root, opening the file if needed."""
        if self.root is None:
            self.reset_doc(self.filename)
        return self.root.tag

    def count_tags(self, limit=None, filter_tag=None):
        """
        Count the tags of the yielded elements and of their direct children.

        Args:
            limit(int): Limit of elements to visit
            filter_tag(string): Only visit elements with this tag

        Returns:
            defaultdict(int) mapping tag name to count; the root tag is
            always present with a count of 1.
        """
        tags = defaultdict(int)
        tags[self.root_tag()] = 1
        for e in self.iterate(limit=limit, filter_tag=filter_tag):
            tags[e.tag] += 1
            # Iterate the element directly: Element.getchildren() was removed
            # in Python 3.9.
            for child in e:
                tags[child.tag] += 1
        return tags

    def iterate(self, limit=None, filter_tag=None, use_cache=True):
        """
        Parse the XML file lazily, optionally limiting and filtering elements.

        Every call starts a fresh parse, so consecutive calls with different
        arguments each see the whole document.

        Args:
            limit(int): Limit of nodes to yield
            filter_tag(string): Tag name to apply as a filter
            use_cache(bool): Replay / record results for this
                ``(filter_tag, limit)`` pair

        Yields:
            Each matching element, once it has been completely parsed.
        """
        cache_key = (filter_tag, limit)
        # .get() avoids creating empty defaultdict entries just by looking.
        if use_cache and self.cache.get(cache_key):
            print('Using cache...')
            for cached in self.cache[cache_key]:
                yield cached
            return
        if limit == 0:
            return

        collected = []
        count = 0
        pending = None  # element whose 'end' event we are waiting for
        # Opening the file ourselves guarantees it is closed even when the
        # caller stops early or the limit is reached.
        with open(self.filename, 'rb') as source:
            doc = ET.iterparse(source, events=('start', 'end'))
            _, root = next(doc)
            self.root = root
            for event, element in doc:
                if event == 'start':
                    if pending is None and filter_tag in (None, element.tag):
                        pending = element
                elif element is pending:
                    # Identity (not tag equality) so a nested element with the
                    # same tag cannot end the pending one prematurely.
                    if use_cache:
                        collected.append(element)
                    yield element
                    count += 1
                    pending = None
                    # Drop already-processed siblings to bound memory use.
                    root.clear()
                    if count == limit:
                        break
        # Only a pass that ran to completion is cached, so an abandoned
        # generator never leaves a truncated result behind.
        if use_cache:
            self.cache[cache_key] = collected
    
# Shared reader over the OSM extract; the cells below reuse it.
data = XmlReader(DATA_FILE)
#display(data.count_tags(filter_tag='node'))

In [77]:
#test()
# Show how often each tag occurs among the yielded elements and their direct children.
display(dict(data.count_tags()))

{'member': 202263,
 'nd': 2006656,
 'node': 1553682,
 'osm': 1,
 'relation': 9559,
 'tag': 1736778,
 'way': 338006}

In [133]:
class KeyAuditor:
    """Tallies the ``k`` attribute of ``tag`` elements by naming pattern."""

    # Order in which the patterns are tried; the first one that matches wins.
    _CATEGORY_ORDER = ('lower_colon', 'problemchars', 'lower')

    def __init__(self, xml_reader):
        self.xml_reader = xml_reader
        self.regex = dict(
            lower=re.compile(r'^([a-z]|_)*$'),
            lower_colon=re.compile(r'^([a-z]|_)*:([a-z]|_)*$'),
            problemchars=re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]'),
        )

    def key_type(self, element, keys):
        """
        Increment the counter in ``keys`` that matches the element's key.

        Elements other than ``tag`` leave ``keys`` untouched. Keys matching
        none of the patterns are counted under ``'other'``.
        Returns the same ``keys`` mapping.
        """
        if element.tag != "tag":
            return keys
        key = element.attrib['k']
        for category in self._CATEGORY_ORDER:
            if self.regex[category].search(key):
                keys[category] += 1
                break
        else:
            keys['other'] += 1
        return keys

    def test(self, filter_tag=None, limit=None):
        """Classify every element produced by the reader and return the tallies."""
        counts = defaultdict(int)
        elements = self.xml_reader.iterate(filter_tag=filter_tag, limit=limit)
        for element in elements:
            counts = self.key_type(element, counts)
        return counts

In [136]:
# Classify the `k` attribute of every <tag> element by naming pattern.
key_auditor = KeyAuditor(data)
display(dict(key_auditor.test(filter_tag='tag')))

{'lower': 943386, 'lower_colon': 777494, 'other': 5556}

In [None]:
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Return the ``uid`` attribute of ``element``, or ``None`` when it has none."""
    return element.attrib.get('uid')


def process_map(filename):
    """
    Collect the unique contributor ids found in an OSM XML file.

    NOTE: a later cell defines another ``process_map`` (the JSON export);
    once that cell has run, this name refers to that function instead.

    Args:
        filename: path of the XML file to scan.

    Returns:
        set of ``uid`` attribute values (strings).
    """
    users = set()
    for _, element in ET.iterparse(filename):
        uid = get_user(element)
        if uid is not None:
            users.add(uid)
        # Only the element's own attributes are needed, so release its
        # content right away instead of keeping the full tree in memory.
        element.clear()

    return users


def test3():
    """Display how many distinct contributor ids ``process_map`` finds in DATA_FILE."""
    unique_users = process_map(DATA_FILE)
    user_count = len(unique_users)
    display(user_count)

Identifying street types is harder here than it is for English-language street names:
* Street names don't include the word *Calle* explicitly, but avenues, boulevards, etc. do. This means that any way whose name doesn't specify a type can be considered a normal street.
* Some localities use numbers and/or names for streets.

In [None]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""

# First words that are short enough to look like a street-type abbreviation
# but are excluded from that interpretation by the regexes below, plus a
# pattern for a leading number. The number pattern is a raw string because
# '\d' is not a valid string escape.
prefix_streets = [r'[\d]+', 'Sin', 'Ada', 'Alta', 'Ana', 'Blas', 'Bell', 'Cjal', 'Cmte', 'De', 'Del',
                  'Cabo', 'Cap', 'Cnl', 'Tte', 'Carl', 'GRAL', 'GRL', 'Conc', 'Cruz', 'Cura',
                  'Dip', 'Don', 'Diaz', 'Dirk', 'Eva', 'El', 'Ema', 'Emir', 'Emma', 'Enzo',
                  'Ex', 'Fitz', 'Flor', 'John', 'Juez', 'La', 'Las', 'Leon', 'Los', 'Luis',
                  'Lima', 'Lino', 'Lola', 'Lope', 'Mar', 'Olga', 'Olof', 'Paso', 'Paul', 'Pbro',
                  'Palo', 'Paz', 'Pi', 'Pio', 'Plus', 'Raul', 'Rca', 'Rio', 'Rep', 'Ruiz',
                  'Gdor', 'Gral', 'Grl', 'Igr', 'Ing', 'Jean', 'Juan', 'Hugo', 'Dr',
                  'Ivan', 'Jose', 'Fray', 'Mons', 'San', 'Pte', 'Pres', 'Tcnl', 'Ruy', 'Sgt',
                  'Sadi', 'Sir', 'Sor', 'Sta', 'Tgrl', 'Tuyu', 'Tres', 'Tula', 'Veva', 'Vito', 'Von']

# A leading 2-4 letter word (optionally followed by '.') and a space, unless
# the name starts with one of the excluded prefixes.
regex_str = r'^\b(?!(%s))[a-zA-Z]{2,4}\.?\s' % '|'.join(prefix_streets)
print('street_type_re:', regex_str)
street_type_re = re.compile(regex_str, re.IGNORECASE)
# A name that starts with one of the prefixes and consists only of word
# characters and whitespace (so not the "number - name" form).
regex_str = r'^(%s)(?!\s\-\s)[\w\s]+$' % '|'.join(prefix_streets)
print('normal_street_names_re:', regex_str)
normal_street_names_re = re.compile(regex_str, re.IGNORECASE)
# "<number> - <name>"
street_number_then_name_re = re.compile(r'^[\d]+(\s\-\s)[\w\s]+$', re.IGNORECASE)

# Street types that are already written out in full.
expected = ["Calle", "Avenida", "Boulevard", "Pasaje", "Camino", "Diagonal", "Ruta Nacional", "Ruta Provincial"]

# Abbreviation -> full street type. Keys keep the trailing space that
# separates the abbreviation from the rest of the street name.
mapping = { "Av. ": "Avenida",
            "Ave. ": "Avenida",
            "Av ": "Avenida",
            "AV ": "Avenida",
            "av ": "Avenida",
            "Au ": "Autopista",
            "Avda ": "Avenida",
            "Avda. ": "Avenida",
            "BV ": "Boulevard",
            "PJE ": "Pasaje",
            "Pje. ": "Pasaje",
            "Cno ": "Camino",
            "Cno. ": "Camino",
            "Cmno ": "Camino",
            "Diag ": "Diagonal",
            "Diag. ": "Diagonal",
            "RN ": "Ruta Nacional",
            "RP ": "Ruta Provincial"
          }


def audit_street_type(street_types, street_name):
    """
    File ``street_name`` under its street type in ``street_types``.

    Three shapes of name are recognised, tried in this order:
    - a name starting with a short street-type word (``street_type_re``),
      filed under that matched text unless it is listed in ``expected``
    - a plain name (``normal_street_names_re``), filed under 'Calle'
    - "number - name", where the part after the number is classified the
      same way and falls back to 'Calle'
    Names matching none of these are not recorded.
    """
    def record_type(match):
        # Store the name under the matched street-type text.
        found = match.group()
        if found not in expected:
            street_types[found].add(street_name)

    type_match = street_type_re.search(street_name)
    if type_match:  # street type identified
        record_type(type_match)
        return
    if normal_street_names_re.search(street_name):
        street_types['Calle'].add(street_name)
        return
    if not street_number_then_name_re.search(street_name):
        return

    # Strip the leading "<number> - " and classify what is left.
    remainder = re.sub(r'^([\d]+)\s\-\s', '', street_name)
    type_match = street_type_re.search(remainder)
    if type_match:
        record_type(type_match)
    else:
        street_types['Calle'].add(street_name)
    


def is_street_name(elem):
    """Tell whether ``elem`` carries a street name, i.e. its ``k`` is "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """
    Group the ``addr:street`` values of every node and way by street type.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        defaultdict(set) as filled in by ``audit_street_type``.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser handle the document's own encoding, and
    # the context manager closes the file even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be present at the "end" event; at
        # "start" the <tag> elements may not have been parsed yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # Release the processed element's content to bound memory use.
                elem.clear()
    return street_types


def update_name(name, mapping):
    """
    Replace abbreviated street types in ``name`` with their full form.

    Args:
        name: street name to fix.
        mapping: dict of abbreviation -> full street type. Abbreviations are
            matched literally and case-insensitively, and only where they
            start a word.

    Returns:
        The corrected name. An abbreviation is left alone when the name
        already contains its full street type.
    """
    for abbreviation, full_type in mapping.items():
        if not abbreviation or full_type in name:
            continue
        # re.escape: keys such as "Av. " are literal text, not patterns.
        # (?<!\w): never match inside a longer word ("Gustav ").
        pattern = re.compile(r'(?<!\w)' + re.escape(abbreviation), re.IGNORECASE)
        # A callable replacement keeps backslashes in full_type literal.
        name = pattern.sub(lambda _match: full_type + ' ', name)
    return name


def test4():
    """Audit DATA_FILE, print the street types found and every name that ``update_name`` changes."""
    st_types = audit(DATA_FILE)
    pprint.pprint(dict(st_types).keys())

    for ways in st_types.values():
        for name in ways:
            better_name = update_name(name, mapping)
            if better_name == name:
                continue
            print(name, "=>", better_name)

In [None]:
# Run the street-name audit and print the proposed corrections.
test4()
# NOTE(review): the lines below look like street names noted from earlier runs
# that the current rules do not normalise — confirm before relying on them.
# Hosting | Webdesign | Translations => Hosting | Webdesign | Translations
# 476 e/ 10 y 11 => 476 e/ 10 y 11
# 7 bis e/ 474 y 475 => 7 bis e/ 474 y 475
# Las Heras;Maipú => Las Heras;Maipú
# Diagonal 74 № 996 Diagonal 74 y 5
# Calle 411A
# Teniente 1° Oscar Camilli
# Avenida Calchaquí Esq. Av. 12 de Octubre
# Colectora RP36
# Paraguay 557
# 101 - Avenida Sadi Carnot

In [None]:
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that are grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def _shape_attributes(element, node):
    """Copy the element's own attributes into ``node`` (created / pos / plain)."""
    for name, value in element.attrib.items():
        if name in CREATED:
            node.setdefault('created', {})[name] = value
        elif name in ('lat', 'lon'):
            pos = node.setdefault('pos', [0, 0])
            pos[0 if name == 'lat' else 1] = float(value)
        else:
            node[name] = value


def _shape_tags(element, node):
    """Copy the element's <tag> children into ``node``."""
    # Fields that describe the element itself; tags must never overwrite them.
    reserved = set(node) | {'created', 'pos', 'address', 'node_refs'}
    plain = []
    namespaced = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if problemchars.search(key):
            continue
        value = tag.attrib['v']
        colons = key.count(':')
        if key.startswith('addr:'):
            address = node.setdefault('address', {})
            # "addr:street:name" and similar deeper keys are ignored.
            if colons == 1:
                address[key.split(':')[1]] = value
        elif colons == 1:
            namespaced.append((key, value))
        elif colons == 0:
            plain.append((key, value))

    # Plain tags first, so the result does not depend on the order in which
    # "name" and "name:es" appear in the file.
    for key, value in plain:
        if key not in reserved:
            node[key] = value
    for key, value in namespaced:
        prefix, suffix = key.split(':')
        if prefix in reserved or not isinstance(node.get(prefix, {}), dict):
            # The prefix slot is already taken by a non-dict value or by a
            # structural field: store the tag flat instead of failing.
            node.setdefault(prefix + '_' + suffix, value)
        else:
            node.setdefault(prefix, {})[suffix] = value


def shape_element(element):
    """
    Turn a ``node`` or ``way`` element into a JSON-ready dictionary.

    - attributes listed in CREATED go under "created"; lat/lon become the
      float pair "pos"; the remaining attributes are copied as they are
    - tags whose key contains problematic characters are skipped
    - "addr:<x>" tags go under "address"; deeper "addr:" keys are ignored
    - tags without a colon become top-level entries, unless that would
      overwrite one of the element's own fields (e.g. "type", "id")
    - other "<a>:<b>" tags become ``node[a][b]``, or ``node["a_b"]`` when
      ``a`` is already used for something that is not a tag dictionary;
      keys with more than one colon are ignored
    - for ways, the ``ref`` of every <nd> child is collected in "node_refs"

    Returns:
        The dictionary, or ``None`` for any other kind of element.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    node = {'type': element.tag}
    _shape_attributes(element, node)
    _shape_tags(element, node)
    if element.tag == 'way':
        for nd in element.iter('nd'):
            refs = node.setdefault('node_refs', [])
            if 'ref' in nd.attrib:
                refs.append(nd.attrib['ref'])
    return node


def process_map(file_in, pretty = False):
    """
    Shape every node and way of ``file_in`` and write them to "<file_in>.json".

    NOTE: this reuses the name of the unique-users helper defined in an
    earlier cell; once this cell has run, ``process_map`` refers to this
    function.

    Args:
        file_in: path of the OSM XML file.
        pretty: indent each JSON document by two spaces (much larger output).

    Returns:
        list of the shaped dictionaries, in document order.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    indent = 2 if pretty else None
    # Explicit encoding so the output does not depend on the platform default.
    with codecs.open(file_out, "w", encoding="utf-8") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The element is fully shaped; release its content. Children
                # are only cleared here, together with their parent, because
                # they are still needed until the parent has been shaped.
                element.clear()
    return data

def test5():
    """Export DATA_FILE to JSON with pretty printing and show the last shaped element."""
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    shaped = process_map(DATA_FILE, True)
    last_element = shaped[-1]
    pprint.pprint(last_element)
    
    """correct_first_elem = {
        "id": "261114295", 
        "visible": "true", 
        "type": "node", 
        "pos": [41.9730791, -87.6866303], 
        "created": {
            "changeset": "11129782", 
            "user": "bbmiller", 
            "version": "7", 
            "uid": "451048", 
            "timestamp": "2012-03-28T18:31:23Z"
        }
    }
    assert data[0] == correct_first_elem
    assert data[-1]["address"] == {
                                    "street": "West Lexington St.", 
                                    "housenumber": "1412"
                                      }
    assert data[-1]["node_refs"] == [ "2199822281", "2199822390",  "2199822392", "2199822369", 
                                    "2199822370", "2199822284", "2199822281"]"""

In [None]:
# Run the JSON export end to end.
test5()