# Abstract

The purpose of this *Notebook* is to read the *OpenStreetMap* XML file, investigate its data, classify street types, map street names to an accepted standard format, and generate a JSON file with the data, to be imported into MongoDB later.

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
from pprint import pprint

import codecs
import json

from xmlreader import XmlReader
from audit import KeyAuditor, StreetAuditor
from mapdocument import MapDocument

# Locations of the OpenStreetMap extracts, relative to the working directory.
DATA_FILE = 'data/buenos-aires_argentina.osm'

# SAMPLE_SKIP is embedded in the sample file's name.
# NOTE(review): presumably the stride used when the sample was generated -- confirm.
SAMPLE_SKIP = 50
DATA_SAMPLE_FILE = 'data/buenos-aires_argentina_sample_%d.osm' % (SAMPLE_SKIP,)

In [3]:
# main file: wrap the full OpenStreetMap extract (400+ MB) in an XmlReader
data = XmlReader(DATA_FILE)

In [None]:
# main file: count tags; convert the result to a plain dict before display,
# matching how the sample-file tag counts are displayed
display(dict(data.count_tags()))

In [4]:
# sample file: wrap the reduced extract (path built from SAMPLE_SKIP) in an XmlReader
datasample = XmlReader(DATA_SAMPLE_FILE)

In [None]:
# sample file: count tags; the result is converted to a plain dict for display
display(dict(datasample.count_tags()))

In [None]:
# sample file: audit keys contained in tag attributes
key_auditor = KeyAuditor(datasample)
# ('tag') is just the string 'tag'; the trailing comma is what makes this a
# one-element tuple, so filter_tags is a real collection of tag names.
# NOTE(review): assumes KeyAuditor.test accepts an iterable of names -- confirm.
display(dict(key_auditor.test(filter_tags=('tag',))))

In [None]:
# sample file: number of unique users found in file
len(datasample.unique_users())

In [8]:
# main file: create the street-type auditor over the full extract
# NOTE(review): the two notes below describe audit_types() options; no such
# option is passed in this cell, so they likely belong next to that call.
# unknown=True includes also streets requiring better classification rules
# Calle is also included for research purposes
street_auditor = StreetAuditor(data)

In [5]:
# sample file: create the street-type auditor over the sample extract
# NOTE(review): the two notes below describe audit_types() options; no such
# option is passed in this cell, so they likely belong next to that call.
# unknown=True includes also streets requiring better classification rules
# Calle is also included for research purposes
street_auditor_sample = StreetAuditor(datasample)

In [11]:
# main file: list non-standard street types (at most 5 examples per type),
# together with the proposed name updates
street_types = street_auditor.audit_types(
    unknown=False,
    limit_per_type=5,
    include_updates=True,
)
display(street_types)

{'elements': 1891788,
 'streets': 254974,
 'types': {'Av': {'128 - Av. Hipólito Yrigoyen',
   '316 - Av. 12 de Octubre',
   'Av. Doctor Honorio Pueyrredón',
   'Av. Dr: Ramos Mejía',
   'av 101 n 1661 san martin'},
  'Ave': {'Ave. Fondo de la Legua 425',
   'Ave. Scalabrini Ortiz',
   'Ave. Scalabrini Ortíz'},
  'Bv': {'BV DE LOS ITALIANOS', 'BV GDOR MARTIN RODRIGUEZ'},
  'Calle': {'525', 'Edison', 'Escalada', 'Lafinur', 'Marcelo T. de Alvear'},
  'Cno': {'Cno. Belgrano e/ 473 bis y 474',
   'Cno. Centenario',
   'Cno. Centenario y 461e'},
  'Línea': {'Línea D', 'Línea H'},
  'Pje': {'PJE A MAGALDI',
   'PJE DE LA VIA',
   'PJE ECHAG?E',
   'PJE HILARIO LAGOS',
   'PJE TARIJA'}},
 'updates': [('Avenda Avellaneda', 'Avenida Avellaneda'),
  ('Cno. Belgrano e/ 473 bis y 474', 'Camino Belgrano e/ 473 bis y 474'),
  ('Cno. Centenario y 461e', 'Camino Centenario y 461e'),
  ('Ave. Scalabrini Ortiz', 'Avenida Scalabrini Ortiz'),
  ('Ave. Scalabrini Ortíz', 'Avenida Scalabrini Ortíz'),
  ('Ave

In [10]:
# sample file: list non-standard street types (at most 5 examples per type),
# together with the proposed name updates
street_types = street_auditor_sample.audit_types(
    unknown=False,
    limit_per_type=5,
    include_updates=True,
)
display(street_types)

{'elements': 37836,
 'streets': 5126,
 'types': {'Av': {'316 - Av. 12 de Octubre',
   'AV GRL SAN MARTIN',
   'Av. Dr: Ramos Mejía'},
  'Bv': {'BV DE LOS ITALIANOS', 'BV GDOR MARTIN RODRIGUEZ'},
  'Calle': {'Bulnes', 'Catamarca', 'Luis María Campos', 'Nogoyá', 'Olleros'},
  'Pje': {'PJE EVA PERON', 'PJE JOSE MARINI', 'PJE MIGUELETE', 'PJE TARIJA'}},
 'updates': [('Cno. Belgrano e/ 473 bis y 474',
   'Camino Belgrano e/ 473 bis y 474'),
  ('BV GDOR MARTIN RODRIGUEZ', 'Boulevard GDOR MARTIN RODRIGUEZ'),
  ('BV DE LOS ITALIANOS', 'Boulevard DE LOS ITALIANOS'),
  ('BV DE LOS ITALIANOS', 'Boulevard DE LOS ITALIANOS'),
  ('BV DE LOS ITALIANOS', 'Boulevard DE LOS ITALIANOS'),
  ('BV GDOR MARTIN RODRIGUEZ', 'Boulevard GDOR MARTIN RODRIGUEZ'),
  ('BV DE LOS ITALIANOS', 'Boulevard DE LOS ITALIANOS'),
  ('BV GDOR MARTIN RODRIGUEZ', 'Boulevard GDOR MARTIN RODRIGUEZ'),
  ('BV GDOR MARTIN RODRIGUEZ', 'Boulevard GDOR MARTIN RODRIGUEZ'),
  ('BV GDOR MARTIN RODRIGUEZ', 'Boulevard GDOR MARTIN RODRIGUEZ'

In [None]:
# main file: wrap every element of the full extract in a MapDocument
# (document structure taken from the MongoDB course's Problem Set)
elements = list(data.iterate())
docs = [
    MapDocument(element=element, street_auditor=street_auditor)
    for element in elements
]

In [None]:
# main file: write the documents to a file, one JSON object per line
with open('data/buenos-aires_argentina.json', 'w') as f:
    for doc in docs:
        # Build the dictionary once per document instead of calling
        # todict() for the check and again for the dump.
        record = doc.todict()
        # Documents whose todict() yields None are left out of the export.
        if record is not None:
            f.write(json.dumps(record) + "\n")

In [None]:
# sample file: wrap every element of the sample extract in a MapDocument
# (document structure taken from the MongoDB course's Problem Set)
elements = list(datasample.iterate())
# Use the auditor built from the sample extract, so the sample pipeline does
# not depend on the auditor created from the 400+ MB main file.
sampledocs = [
    MapDocument(element=element, street_auditor=street_auditor_sample)
    for element in elements
]

In [None]:
# sample file: write the documents to a file, one JSON object per line
with open('data/buenos-aires_argentina_sample.json', 'w') as f:
    for doc in sampledocs:
        # Build the dictionary once per document instead of calling
        # todict() for the check and again for the dump.
        record = doc.todict()
        # Documents whose todict() yields None are left out of the export.
        if record is not None:
            f.write(json.dumps(record) + "\n")