# OpenStreetMap Data Wrangling Project


The city whose OSM data set I've chosen to wrangle is Austin, Texas. 

* http://www.openstreetmap.org/relation/113314
* https://mapzen.com/data/metro-extracts/metro/austin_texas/

I chose this city because it's the place I presently call home. Tackling the Austin data set gives me a chance to get more acquainted with the place I live.

## Problems Encountered in Data Set

As a preliminary step to working with irregularities in the data set, I'll take a look at the distribution of tags to see which are abundant enough to serve as good data wrangling practice.

In [40]:
# Import the libraries necessary for the project
from collections import defaultdict, Counter
import csv
import pprint
import re
import pprint
import sqlite3
import xml.etree.cElementTree as ET

# Open and name data file as global variable
OSM_FILE = open('austin_texas.osm', 'r', encoding="utf8")

# Element generator for parsing individual nodes in OSM file
def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = ET.iterparse(osm_file, events=('start', 'end'))
    __, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

# Dictionary for counting tag labels
k_tags = {}

# Dictionary for saving items by labels
tag_ids = {}

# Population tag count
population = 0

# Loop for going through tags
for element in get_element(OSM_FILE):
    for child in element:
        # Try-Except for dealing with tags without labels
        try:
            if child.get('k') == 'population':
                population += 1
            if child.attrib['k'] in k_tags:
                k_tags[child.attrib['k']] += 1
                tag_ids[child.atrib['k']].append(element.get('id')) 
            else:
                k_tags[child.attrib['k']] = 1
                tag_ids[child.attrib['k']] = [element.get('id')]
        except:
            continue    

# Reduce dictionaries to abundant tags
k_tags_ids = {label:[k_tags[label], tag_ids[label]] for label in k_tags.keys() if k_tags[label] >= 1000}

    
print("\nAbundant tags in {}".format(OSM_FILE))
pprint.pprint(k_tags_ids)
print(population)


Abundant tags in <_io.TextIOWrapper name='austin_texas.osm' mode='r' encoding='utf8'>
{'access': [4438, ['151576347']],
 'addr:city': [3710, ['280231689']],
 'addr:housenumber': [333664, ['280231689']],
 'addr:postcode': [86642, ['281362888']],
 'addr:state': [3353, ['280231689']],
 'addr:street': [333622, ['280231689']],
 'amenity': [8153, ['152713302']],
 'barrier': [1225, ['26546151']],
 'bicycle': [1429, ['2539446524']],
 'bridge': [1905, ['2089441906']],
 'building': [584296, ['365275738']],
 'coa:place_id': [13715, ['3823901373']],
 'created_by': [7297, ['151756603']],
 'ele': [1490, ['151321672']],
 'foot': [1327, ['2539446524']],
 'gnis:county_id': [1060, ['356698556']],
 'gnis:created': [1099, ['356698556']],
 'gnis:feature_id': [1360, ['356698556']],
 'gnis:state_id': [1059, ['356698556']],
 'height': [443886, ['3842174484']],
 'highway': [83748, ['26546008']],
 'landuse': [2523, ['356724097']],
 'lanes': [5459, ['4358672']],
 'layer': [1927, ['153157946']],
 'leisure': [216

## Preliminary Audit Results

Among the most common node labels are address labels, which I'll focus on for wrangling. Along with the 'addr:' label set, the Austin OSM file includes address data from the public domain TIGER data source put out by the US Census Bureau. According to OSM documentationThe TIGER data was first imported into OSM in 2005 and subsequently in 2007 and 2008 when it was used to populate a largely empty US OpenStreetMap with road and place data. The same source states that OSM is unlikely to be updated using direct TIGER data overwrite again. 

Other data labels that figure prominently in the data file include the 'building' and 'name' labels. I'll further audit the values for these labels to judge whether cleaning is necessary.
 

In [8]:
### Write data to be cleaned to separate XML file###

#Libraries
import xml.etree.cElementTree as ET

# Open and name data file as global variable
OSM_FILE = open('austin_texas.osm', 'r', encoding="utf8")

# Open new XML files for writing prominent data to separate files
addr_file = open('austin_addr.osm', 'w', encoding='utf8')
tiger_file = open('austin_tiger.osm', 'w', encoding='utf8')
building_file = open('austin_building.osm', 'w', encoding='utf8')
name_file = open('austin_name.osm', 'w', encoding='utf8')

# Initiazlize XML files
for f in [addr_file, tiger_file, building_file, name_file]:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<osm>\n  ')

# Element generator for parsing individual nodes in OSM file
def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = ET.iterparse(osm_file, events=('start', 'end'))
    __, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

for element in get_element(OSM_FILE):
    for child in element:
        # Try-Except for dealing with tags without labels
        try:
            if child.attrib['k'].startswith('addr:'):
                addr_file.write(ET.tostring(element, encoding='utf8'))
                break
            elif child.attrib['k'].startswith('tiger'):
                tiger_file.write(ET.tostring(element, encoding='utf8'))
                break
            elif child.attrib['k'].startswith('name'):
                name_file.write(ET.tostring(element, encoding='utf8'))
                break
            elif child.attrib['k'] == 'building':
                building_file.write(ET.tostring(element, encoding='utf8'))
                break
        except:
            continue  

for f in [addr_file, tiger_file, name_file, building_file]:
    f.write('</osm>')
    f.close()


In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "austin_texas.osm"  # Replace this with your osm file
SAMPLE_FILE = "austin_sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'w') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            print(ET.tostring(element, encoding='iso8859-1'))
            output.write(ET.tostring(element, encoding='iso8859-1'))

    output.write('</osm>')

b'<?xml version=\'1.0\' encoding=\'iso8859-1\'?>\n<node changeset="8497118" id="26546004" lat="30.4695355" lon="-97.7972587" timestamp="2011-06-20T18:36:15Z" uid="388279" user="Tylan" version="15" />\n\t'


TypeError: write() argument must be str, not bytes

In [18]:
import re

s = "999r99"



True


In [26]:
### Write data to be cleaned to separate XML file###

#Libraries
import xml.etree.cElementTree as ET

# Open and name data file as global variable
OSM_FILE = open('austin_texas.osm', 'r', encoding="utf8")


# Element generator for parsing individual nodes in OSM file
def get_element(osm_file, tags=('node', 'way')):
    context = ET.iterparse(osm_file, events=('start', 'end'))
    __, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

names = []
            
for element in get_element(OSM_FILE):
    elem_id = element.attrib['id']
    name = None
    place = None
    popul = None
    for tag in element.iter('tag'):
        if tag.attrib['k'] == 'name':
            name = tag.get('v')
        if tag.attrib['k'] == 'place':
            place = tag.get('v')
        if tag.attrib['k'] == 'population':
            popul = tag.get('v') 
    if (name and place in ['hamlet', 'village', 'town', 'city'] and popul):
        names.append((elem_id, name, place, popul))


print(names)

[('151324760', 'Bastrop', 'village', '7591'), ('151327976', 'Manor', 'village', '2657'), ('151345234', 'Pflugerville', 'town', '46936'), ('151349678', 'Hays', 'village', '241'), ('151353230', 'Jonestown', 'village', '2121'), ('151355719', 'Mountain City', 'village', '698'), ('151359532', 'Taylor', 'town', '15322'), ('151406356', 'Sunset Valley', 'village', '790'), ('151445864', 'Bear Creek', 'village', '376'), ('151449613', 'Wimberley', 'village', '2703'), ('151515514', 'Lago Vista', 'village', '5794'), ('151582208', 'Thrall', 'village', '852'), ('151591378', 'Briarcliff', 'village', '852'), ('151617534', 'Georgetown', 'town', '42467'), ('151623111', 'Lakeway', 'town', '11391'), ('151639682', 'Kyle', 'town', '28016'), ('151726187', 'Woodcreek', 'village', '1472'), ('151727206', 'San Leanna', 'village', '486'), ('151758834', 'Volente', 'village', '385'), ('151768617', 'Creedmoor', 'village', '190'), ('151769657', 'Buda', 'village', '4551'), ('151844641', 'Webberville', 'village', '308')

In [28]:
import pprint
pprint.pprint(names)

[('151324760', 'Bastrop', 'village', '7591'),
 ('151327976', 'Manor', 'village', '2657'),
 ('151345234', 'Pflugerville', 'town', '46936'),
 ('151349678', 'Hays', 'village', '241'),
 ('151353230', 'Jonestown', 'village', '2121'),
 ('151355719', 'Mountain City', 'village', '698'),
 ('151359532', 'Taylor', 'town', '15322'),
 ('151406356', 'Sunset Valley', 'village', '790'),
 ('151445864', 'Bear Creek', 'village', '376'),
 ('151449613', 'Wimberley', 'village', '2703'),
 ('151515514', 'Lago Vista', 'village', '5794'),
 ('151582208', 'Thrall', 'village', '852'),
 ('151591378', 'Briarcliff', 'village', '852'),
 ('151617534', 'Georgetown', 'town', '42467'),
 ('151623111', 'Lakeway', 'town', '11391'),
 ('151639682', 'Kyle', 'town', '28016'),
 ('151726187', 'Woodcreek', 'village', '1472'),
 ('151727206', 'San Leanna', 'village', '486'),
 ('151758834', 'Volente', 'village', '385'),
 ('151768617', 'Creedmoor', 'village', '190'),
 ('151769657', 'Buda', 'village', '4551'),
 ('151844641', 'Webbervill

In [38]:
import csv, sqlite3

p3_db = sqlite3.connect('p3_osm')
curs = p3_db.cursor()
with open('2015_txpopest_place.csv', 'r') as f:
    dr = csv.DictReader(f)
    to_db = [(i['Place'], i['jan1_2016_pop_est']) for i in dr]
    
curs.executemany("INSERT INTO populations (place, pop_2016) VALUES (?, ?);", to_db)
p3_db.commit()
p3_db.close()