In [4]:
import os
import os.path
import sys
import time
import requests
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict

In [6]:
osm_filename = "./DisneyWorldOpenMap.xml"

def count_tags(filename):
    tags = {}
    for event, elem in ET.iterparse(filename, events=('start', )):
        if elem.tag not in tags:
            tags[elem.tag] = 1
        else:
            tags[elem.tag] += 1
    return tags

start_time = time.time()
tags = count_tags(osm_filename)
sorted_by_occurrence = [(k, v) for (v, k) in sorted([(value, key) for (key, value) in tags.items()], reverse=True)]

print('Element types and occurrence of DisneyWorldOpenMap.xml:\n')
pprint.pprint(sorted_by_occurrence)

print('\n--- %s seconds ---' % (time.time() - start_time))

Element types and occurrence of DisneyWorldOpenMap.xml:

[('nd', 328364),
 ('node', 251927),
 ('tag', 70499),
 ('way', 21928),
 ('member', 10790),
 ('relation', 679),
 ('osm', 1),
 ('note', 1),
 ('meta', 1),
 ('bounds', 1)]

--- 2.9701240062713623 seconds ---


In [8]:
def count_attrs(filename):
    attrs = {}
    for event, elem in ET.iterparse(filename, events=('start', 'end')):
        if event == 'end':
            for attr in elem.attrib:
                if attr not in attrs:
                    attrs[attr] = 1
                else:
                    attrs[attr] += 1
    return attrs

start_time = time.time()
attrs = count_attrs(osm_filename)
sorted_by_occurrence = [(k, v) for (v, k) in sorted([(value, key) for (key, value) in attrs.items()], reverse=True)]

print('Attributes and occurrence of DisneyWorldOpenMap.xml:\n')
pprint.pprint(sorted_by_occurrence)

print('\n--- %s seconds ---' % (time.time() - start_time))

Attributes and occurrence of DisneyWorldOpenMap.xml:

[('ref', 339154),
 ('version', 274535),
 ('user', 274534),
 ('uid', 274534),
 ('timestamp', 274534),
 ('id', 274534),
 ('changeset', 274534),
 ('lon', 251927),
 ('lat', 251927),
 ('v', 70499),
 ('k', 70499),
 ('type', 10790),
 ('role', 10790),
 ('osm_base', 1),
 ('minlon', 1),
 ('minlat', 1),
 ('maxlon', 1),
 ('maxlat', 1),
 ('generator', 1)]

--- 3.3659424781799316 seconds ---


In [9]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_dot = re.compile(r'^([a-z]|_)*.([a-z]|_)*$')


def key_type(element, keys):
    if element.tag == "tag":
        
        if lower.search(element.attrib['k']):
            keys['lower'] += 1
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] += 1
        elif lower_dot.search(element.attrib['k']):
            keys['lower_dot'] += 1
        else:
            keys['other'] += 1
        
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "lower_dot": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [11]:

start_time = time.time()
keys = process_map(osm_filename)

print('Key types and occurrence in DisneyWorldOpenMap.xml:')
pprint.pprint(keys)

print('\n--- %s seconds ---' % (time.time() - start_time))

Key types and occurrence in DisneyWorldOpenMap.xml:
{'lower': 64452, 'lower_colon': 4658, 'lower_dot': 5, 'other': 1384}

--- 3.032944917678833 seconds ---
