## Create an excerpt of the data

See create_sample.py.

## Exploration

Count element types:

In [None]:
import pprint
from collections import defaultdict
import xml.etree.cElementTree as ET


def count_tags(filename):
    """Count how often each element type occurs in an XML file.

    The file is streamed with ``iterparse`` so that it does not have to be
    loaded into memory as a whole.

    Args:
        filename: Path of the XML (OSM) file to read.

    Returns:
        A ``defaultdict(int)`` mapping each element tag name to the number
        of times an element with that tag was closed in the document.
    """
    counter = defaultdict(int)
    # Binary mode: let the XML parser decode the bytes according to the
    # encoding declared in the document instead of the locale's default.
    with open(filename, "rb") as f:
        for event, element in ET.iterparse(f):
            counter[element.tag] += 1
            # Only the tag name is needed; drop attributes and children so
            # memory does not grow with the size of the file.
            element.clear()
    return counter
    

def test():
    """Pretty-print the element type counts of the k=100 sample extract."""
    sample_path = 'dresden_germany.sample_k=100.osm'
    pprint.pprint(count_tags(sample_path))


# Run the tag count only when this code is executed directly, not on import.
if __name__ == "__main__":
    test()

Parse nodes and ways with their tags:

In [None]:
from collections import Counter, defaultdict
import json
import xml.etree.cElementTree as ET
import re
import codecs


# Input OSM extract; the name of the JSON output file is derived from it.
filename = 'dresden_germany.sample_k=10.osm'

# Element types which are converted into JSON documents.
TOP_LEVEL_TAGS = ["way", "node"]
# Element attributes which are grouped under the "created" key of a document.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]

# Characters which are treated as problematic in tag keys; parse_address
# uses this pattern to reject such keys.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


# Count tag keys per parent element type (e.g. "node", "way", ...).
tag_keys = defaultdict(Counter)
# Count the parsed elements per element type.
element_counter = Counter()


def count_tag_key(tag, parent):
    """Record one occurrence of a tag's key for the given parent type.

    Args:
        tag: A "tag" XML element; its "k" attribute is the key to count.
        parent: Name of the parent element type, e.g. "node" or "way".
    """
    key = tag.attrib["k"]
    per_parent = tag_keys[parent]
    per_parent[key] += 1

    
def parse_address(element):
    """Collect the address fields of an element from its "addr:*" tags.

    Keys containing a problematic character are printed and skipped. Keys
    with more than one colon (e.g. "addr:street:name") are skipped silently.

    Args:
        element: XML element whose "tag" sub-elements are inspected.

    Returns:
        A dict mapping the part of the key after "addr:" to the tag value.
        It is empty if the element has no usable address tags.
    """
    address = {}
    for subtag in element.iter("tag"):
        k = subtag.attrib["k"]
        v = subtag.attrib["v"]
        # search() instead of match(): match() is anchored at the start of
        # the key and would only ever test its first character.
        if problemchars.search(k):
            print(k)
            continue
        if k.count(":") > 1:
            continue
        if k.startswith("addr:"):
            # Exactly one colon is left at this point, so index 1 is the
            # complete field name.
            address[k.split(":")[1]] = v
    return address
        
# Maps each type of special information to a list of fields which we care for.
SPECIAL_TYPES = {"shop": ["name", "wheelchair", "opening_hours"]}


def parse_specials(element):
    """Extract the fields of interest for a "special" element such as a shop.

    Args:
        element: XML element whose "tag" sub-elements are inspected.

    Returns:
        A tuple ``(type, fields)``. ``type`` is the last tag key found which
        is listed in SPECIAL_TYPES, or None if there is none. ``fields``
        holds the tag values whose keys are listed for that type; it is
        empty when no special type was found.
    """
    tag_values = {}
    special_type = None
    for subtag in element.iter("tag"):
        key, value = subtag.attrib["k"], subtag.attrib["v"]
        if key in SPECIAL_TYPES:
            special_type = key
            continue
        tag_values[key] = value

    # Without a special type there is nothing to report.
    if special_type is None:
        return None, {}

    wanted = SPECIAL_TYPES[special_type]
    fields = {key: value for key, value in tag_values.items() if key in wanted}
    return special_type, fields
    
    
def parse_node(element):
    """Build the document for a "node" element and update the statistics.

    Args:
        element: The "node" XML element.

    Returns:
        A dict which holds an "address" entry if address tags were found
        and an entry named after the special type (e.g. "shop") if one was
        found. It is empty otherwise.
    """
    # Statistics first: tag keys per parent type and the element count.
    for subtag in element.iter("tag"):
        count_tag_key(subtag, "node")
    element_counter["node"] += 1

    result = {}
    addr = parse_address(element)
    if addr:
        result["address"] = addr
    kind, fields = parse_specials(element)
    if kind:
        result[kind] = fields
    return result


def parse_way_nodes(element):
    """Return the "ref" attributes of all "nd" sub-elements, in document order."""
    return [nd.attrib["ref"] for nd in element.iter("nd")]

    
def parse_way(element):
    """Build the document for a "way" element and update the statistics.

    Args:
        element: The "way" XML element.

    Returns:
        A dict with a "nodes" list of the referenced node ids, or an empty
        dict if the way references no nodes.
    """
    # Statistics first: tag keys per parent type and the element count.
    for subtag in element.iter("tag"):
        count_tag_key(subtag, "way")
    element_counter["way"] += 1

    refs = parse_way_nodes(element)
    return {"nodes": refs} if refs else {}


def parse_tags(element):
    """Collect all non-address tags of an element.

    Args:
        element: XML element whose "tag" sub-elements are inspected.

    Returns:
        A dict mapping tag keys to tag values. Keys starting with "addr:"
        are left out. If a key occurs more than once, the last value wins.
    """
    tags = {}
    for tag in element.iter("tag"):
        k, v = tag.attrib["k"], tag.attrib["v"]
        if k.startswith("addr:"):
            continue  # skip address tags, will be parsed otherwise
        tags[k] = v
    return tags

    
def parse_element(element):
    """Convert a top-level XML element into a JSON-serializable dict.

    Args:
        element: The XML element to convert. "way" and "node" elements get
            their type-specific content; any other element type starts
            from an empty document.

    Returns:
        A dict with the type-specific content plus:
          * "type": the element's tag name,
          * "created": the attributes listed in CREATED,
          * "pos": [lat, lon] as floats, if both attributes are present,
          * "tags": all non-address tags, if there are any,
          * every other attribute under its own name.

    Raises:
        ValueError: If a "lat" or "lon" attribute is not a valid float.
    """
    if element.tag == "way":
        node = parse_way(element)
    elif element.tag == "node":
        node = parse_node(element)
    else:
        # Unknown element types still get the generic fields instead of
        # failing on a None document.
        node = {}
    node["created"] = {}
    node["type"] = element.tag
    lat = lon = None
    for k, v in element.attrib.items():
        if k in CREATED:
            node["created"][k] = v
        elif k == "lat":
            lat = float(v)
        elif k == "lon":
            lon = float(v)
        else:
            node[k] = v
    # Compare against None explicitly: 0.0 is a valid coordinate but falsy.
    if lat is not None and lon is not None:
        node["pos"] = [lat, lon]

    # TODO(Jonas): Disable generic parsing of all tags later on.
    tags = parse_tags(element)
    if tags:
        node["tags"] = tags

    return node

    
# Convert the OSM extract into a file with one JSON document per line and
# print the collected tag key statistics afterwards.
filename_out = "{0}.json".format(filename)
# Binary mode: let the XML parser decode the input according to the encoding
# declared in the document instead of the locale's default encoding.
with open(filename, "rb") as f:
    with codecs.open(filename_out, "w", encoding="utf-8") as fout:
        # Listen to "end" events only, so that every element is complete
        # and is seen exactly once.
        for event, element in ET.iterparse(f, events=["end"]):
            if element.tag in TOP_LEVEL_TAGS:
                el = parse_element(element)
                # One compact document per line (no indentation).
                fout.write(json.dumps(el) + "\n")
                # The element has been written; release its content so
                # memory does not grow with the size of the input.
                element.clear()


pprint.pprint(tag_keys)
#pprint.pprint(element_counter)

## Outline

  * Parse the data using cElementTree.
  * Create some statistics while parsing, document them. Note: Maybe this should be done in the end, using queries against MongoDB.
  * _Restrict the data to streets (ways with a tag with k=highway) [optional, I guess]_
  * Do a little data cleaning:
    * Parse street name tags and unify abbreviations (later include the types into the statistics)
  * Create JSON output from the data
  * Import it into MongoDB
  * Run statistics queries against it, audit, find and document problems, iterate from the beginning
    * Number of ways, nodes
    * Longest way (most way_nodes)
    * Top 10 tags
    * Top 3 contributing users
    * Most frequent shop name
    

## Encountered problems (and solution)

  * Many elements are duplicated -- by my mistake or due to erroneous data?
    * It was my mistake, caused by reading "end" and "start" events from iterparse.
  * mongoimport reads one element per line, not prettified JSON
  * fix UTF-8 encoding / decoding
  * What is the most frequent shop? Some nodes have a shop field, but no name. Find out what is going on and if these can be safely ignored.
 
## Audited data

  * shops: parse name, wheelchair accessibility and opening hours
  * places: detect suburbs etc. (todo)