# Abstract

The purpose of this *Notebook* is to read the *OpenStreetMap* XML file, explore its data, classify street types, map street names to an accepted standard format, and generate a JSON file to be imported into MongoDB later.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. xmlreader, audit, mapdocument) are reloaded before each cell runs and
# edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
from pprint import pprint

import codecs
import json

from xmlreader import XmlReader
from audit import KeyAuditor, StreetAuditor
from mapdocument import MapDocument

# Input files used throughout the notebook.
# DATA_FILE is the full OpenStreetMap export; DATA_SAMPLE_FILE is a reduced
# copy whose file name embeds SAMPLE_SKIP (presumably the sampling step used
# to generate it -- TODO confirm against the sampling script).
DATA_FILE = 'data/buenos-aires_argentina.osm'
SAMPLE_SKIP = 50
DATA_SAMPLE_FILE = 'data/buenos-aires_argentina_sample_%d.osm' % (SAMPLE_SKIP,)

In [3]:
# Open the main OSM XML file (400+ MB) through the project's XmlReader.
# `data` is reused by the tag count, the street audit and the document export.
data = XmlReader(DATA_FILE)

In [7]:
# main file: count how many times each XML element name occurs
display(data.count_tags())

defaultdict(int,
            {'bounds': 1,
             'member': 202263,
             'nd': 2006656,
             'node': 1553782,
             'osm': 1,
             'relation': 9559,
             'tag': 1736827,
             'way': 338006})

In [5]:
# Open the smaller sample OSM XML file through the project's XmlReader.
# `datasample` is used for the quicker exploratory checks (tag counts,
# key audit, unique users).
datasample = XmlReader(DATA_SAMPLE_FILE)

In [6]:
# sample file: count how many times each XML element name occurs
# (wrapped in dict() so the result is displayed as a plain dict)
display(dict(datasample.count_tags()))

{'member': 4104,
 'nd': 39116,
 'node': 31076,
 'osm': 1,
 'relation': 191,
 'tag': 34482,
 'way': 6760}

In [8]:
# sample file: audit keys contained in tag attributes and display how many
# fall into each category reported by KeyAuditor.test().
key_auditor = KeyAuditor(datasample)
# filter_tags is given as a real one-element tuple: ('tag') without the
# trailing comma is just the string 'tag', not a tuple.
display(dict(key_auditor.test(filter_tags=('tag',))))

{'alphanum_colon': 15572, 'double_colon': 41, 'lower': 18867, 'other': 2}

In [9]:
# sample file: number of unique users found in the file
len(datasample.unique_users())

601

In [10]:
# main file: audit street types
# unknown=True would also include streets requiring better classification
# rules; this run passes unknown=False.
# 'Calle' is also included for research purposes.
# `street_auditor` is reused later when building the MapDocument objects.
street_auditor = StreetAuditor(data)
street_types = street_auditor.audit_types(unknown=False)

In [11]:
# main file: show the non-standard street types found by the audit
# (the keys of street_types['types'])
display(street_types['types'].keys())

dict_keys(['Calle', 'Cno', 'Ave', 'Av', 'Línea', 'Pje', 'Bv'])

In [None]:
# Build one MapDocument per element of the main file. The document structure
# follows the one used in the MongoDB course's Problem Set, and each document
# receives the shared street_auditor.
elements = list(data.iterate())
docs = [
    MapDocument(element=element, street_auditor=street_auditor)
    for element in elements
]

In [None]:
# Write the documents to disk as newline-delimited JSON: one JSON object per
# line. Without a separator the objects would be written back-to-back on a
# single line, which is neither a valid JSON document nor line-delimited JSON.
# The file is opened in text mode with an explicit UTF-8 encoding so no manual
# .encode() call is needed.
with open('data/buenos-aires_argentina.json', 'w', encoding='utf-8') as f:
    for doc in docs:
        f.write(json.dumps(doc.todict()) + '\n')