In [1]:
%autosave 0

Autosave disabled


# OpenStreetMap Data Wrangling Project


The city I chose to work with initially was Austin, Texas. 

* http://www.openstreetmap.org/relation/113314
* https://mapzen.com/data/metro-extracts/metro/austin_texas/

I chose this city because it's the place I presently call home. Tackling the Austin data set gives me a chance to get more acquainted with the place I live. 

The auditing, wrangling, and cleaning I'll set up programmatically can be applied to any area of Texas. Later in the report I'll demonstrate how the wrangling is applicable beyond the test data set.

## Problems Encountered in Data Set

As a preliminary step to working with irregularities in the data set, I'll take a look at the distribution of tags to see which are abundant enough to serve as good data wrangling practice.

In [2]:
# Import the libraries necessary for the project
from collections import defaultdict, Counter
import csv
import numpy as np
import re
import pprint
import sqlite3
import xml.etree.cElementTree as ET

# Open the Austin OSM extract as UTF-8 text and keep the handle in a global;
# the tag-counting loop later in this cell streams from it via get_element().
OSM_FILE = open('austin_texas.osm', 'r', encoding="utf8")

# Streaming generator over the OSM elements of interest
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield every completely parsed element of ``osm_file`` whose tag is in ``tags``.

    Args:
        osm_file: A file name or file object accepted by ``ET.iterparse``.
        tags: Element tag names to yield; all other elements are passed over.

    Yields:
        Each matching element, at the moment its 'end' event is reported.
        Once the consumer asks for the next element, ``root.clear()`` is
        called, which empties the document root of what has been parsed so far.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event reported is the 'start' of the document root.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()

# Tag statistics gathered in a single streaming pass over the file:
#   k_tags     -> number of occurrences of each tag key
#   tag_ids    -> ids of every element that carries each tag key
#   population -> number of 'population' tags seen
k_tags = Counter()
tag_ids = defaultdict(list)
population = 0

# A tag key must occur at least this many times to be reported as abundant
ABUNDANT_THRESHOLD = 1000

# Number of example element ids printed per abundant key; the complete id
# lists stay available in tag_ids / k_tags_ids.
SAMPLE_IDS_SHOWN = 1

try:
    for element in get_element(OSM_FILE):
        element_id = element.get('id')
        for child in element:
            # Skip child elements that have no 'k' attribute: there is no
            # label to count. This replaces a bare except that also hid a
            # misspelled attribute name, leaving every id list one entry long.
            key = child.get('k')
            if key is None:
                continue
            if key == 'population':
                population += 1
            k_tags[key] += 1
            tag_ids[key].append(element_id)
finally:
    # Release the file handle whether or not the pass completed.
    OSM_FILE.close()

# Reduce dictionaries to abundant tags: label -> [count, ids of tagged elements]
k_tags_ids = {
    label: [count, tag_ids[label]]
    for label, count in k_tags.items()
    if count >= ABUNDANT_THRESHOLD
}

# Print only a sample of ids per tag; the full lists are far too long to display.
preview = {
    label: [count, ids[:SAMPLE_IDS_SHOWN]]
    for label, (count, ids) in k_tags_ids.items()
}

print("\nAbundant tags in {}".format(OSM_FILE.name))
pprint.pprint(preview)
print(population)


Abundant tags in <_io.TextIOWrapper name='austin_texas.osm' mode='r' encoding='utf8'>
{'access': [4438, ['151576347']],
 'addr:city': [3710, ['280231689']],
 'addr:housenumber': [333664, ['280231689']],
 'addr:postcode': [86642, ['281362888']],
 'addr:state': [3353, ['280231689']],
 'addr:street': [333622, ['280231689']],
 'amenity': [8153, ['152713302']],
 'barrier': [1225, ['26546151']],
 'bicycle': [1429, ['2539446524']],
 'bridge': [1905, ['2089441906']],
 'building': [584296, ['365275738']],
 'coa:place_id': [13715, ['3823901373']],
 'created_by': [7297, ['151756603']],
 'ele': [1490, ['151321672']],
 'foot': [1327, ['2539446524']],
 'gnis:county_id': [1060, ['356698556']],
 'gnis:created': [1099, ['356698556']],
 'gnis:feature_id': [1360, ['356698556']],
 'gnis:state_id': [1059, ['356698556']],
 'height': [443886, ['3842174484']],
 'highway': [83748, ['26546008']],
 'landuse': [2523, ['356724097']],
 'lanes': [5459, ['4358672']],
 'layer': [1927, ['153157946']],
 'leisure': [216

## Preliminary Audit Results

Among the most abundant labels are the address labels, which were the focus of the wrangling demonstrated in the P3 lessons. 

Instead of focusing on correcting address data, I'll look at data that is correctly formatted but no longer current: specifically, the population figures for the primary and satellite urban centers in the focus area. Nodes that mark urban centers carry, along with the city or town name, `population` and `place` keys; the `place` value assigned to a populated settlement is determined by its population level, as summarized in the table below.



Tag|Population|Description
---|----------|-----------
place=city|100,000+|
place=town|10,000 - 100,000|an urban settlement with local importance
place=village|<10,000|incorporated municipality, regardless of its population	
place=hamlet|<100|unincorporated settlement with less than 100 inhabitants
place=isolated_dwelling|<= 2 households|the smallest kind of human settlement



In [None]:
### Get population data from OSM file ###

#Libraries
import pprint
import xml.etree.cElementTree as ET

# Name of the OSM extract to read, and a UTF-8 text handle on it that the
# extraction loop later in this cell consumes through get_element().
FILE_NAME = 'austin_texas.osm'
OSM_FILE = open(FILE_NAME, 'r', encoding="utf8")


# Streaming generator over selected OSM elements (nodes and ways by default)
def get_element(osm_file, tags=('node', 'way')):
    """Stream ``osm_file`` and yield the elements whose tag appears in ``tags``.

    Args:
        osm_file: A file name or file object accepted by ``ET.iterparse``.
        tags: Tag names of the elements to yield.

    Yields:
        Matching elements, each one when its 'end' event arrives. After the
        caller resumes the generator, ``root.clear()`` is called to drop the
        content accumulated under the document root.
    """
    parser = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _first_event, root = next(parser)  # 'start' event of the document root
    completed = (
        elem for event, elem in parser
        if event == 'end' and elem.tag in tags
    )
    for elem in completed:
        yield elem
        root.clear()

# One tuple per element that has both a name and a population tag:
# (element id, name, place, population, population source, timestamp year)
names = []

for element in get_element(OSM_FILE):
    attrs = element.attrib
    elem_id = attrs['id']
    time_stamp_year = attrs['timestamp'][:4]  # first four characters of the timestamp

    # Values of the tags of interest; None until the tag is encountered
    found = {'name': None, 'place': None, 'population': None}
    source = None
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if key in found:
            found[key] = tag.get('v')
        elif key.startswith('source:population'):
            source = tag.get('v')

    name, place, popul = found['name'], found['place'], found['population']
    # Keep only elements that have both a name and a population value
    if name and popul:
        names.append((elem_id, name, place, popul, source, time_stamp_year))


pprint.pprint(names)

In [29]:
import csv
import sqlite3

# Load the 2016 population estimates from the CSV file and the population data
# extracted from the OSM file (`names`) into the project database.
p3_db = sqlite3.connect(r'C:\Users\User\sqlite_windows\p3_osm')
try:
    curs = p3_db.cursor()

    # newline='' is how the csv module expects files to be opened for reading
    with open('2015_txpopest_place.csv', 'r', newline='') as f:
        dr = csv.DictReader(f)
        to_db = [(i['Place'], i['jan1_2016_pop_est']) for i in dr]

    # NOTE(review): TX_popul_est is not created anywhere in the visible notebook;
    # presumably it already exists in the database. Running this cell again
    # inserts the CSV rows a second time unless the table rejects them. Confirm.
    curs.executemany("INSERT INTO TX_popul_est (place, popul_2016) VALUES (?, ?);", to_db)
    curs.execute("CREATE TABLE IF NOT EXISTS OSM_Pop_Data_Nodes(node_id integer PRIMARY KEY, place text, designation text, popul integer, FOREIGN KEY(place) REFERENCES TX_popul_est(place));")

    # Each entry of `names` is (id, name, place, population, source, year), but
    # the statement binds four values. Passing the six-field tuples unchanged
    # makes sqlite3 raise ProgrammingError, so bind only the leading four fields.
    osm_rows = [row[:4] for row in names]
    curs.executemany("INSERT INTO OSM_Pop_Data_Nodes (node_id, place, designation, popul) VALUES(?, ?, ?, ?);", osm_rows)
    p3_db.commit()
finally:
    # Close the connection even when a statement above fails; uncommitted
    # changes are discarded in that case.
    p3_db.close()

In [44]:
# Pair each place's OSM population value with its 2016 estimate by joining the
# two tables on the place name.
# NOTE(review): the recorded output lists the same places more than once, which
# suggests duplicate rows in one of the joined tables. Confirm before using the
# result for further analysis.
p3_db = sqlite3.connect(r'C:\Users\User\sqlite_windows\p3_osm')
try:
    curs = p3_db.cursor()
    curs.execute("SELECT OSM_Pop_Data_Nodes.place, popul, popul_2016 FROM TX_popul_est JOIN OSM_Pop_Data_Nodes ON OSM_Pop_Data_Nodes.place = TX_popul_est.place;")
    aggr = curs.fetchall()
finally:
    # Nothing to commit for a SELECT, but the connection must still be released.
    p3_db.close()
pprint.pprint(aggr)


[('Austin', 790390, 921781),
 ('Bastrop', 7591, 8278),
 ('Bear Creek', 376, 417),
 ('Bee Cave', 3925, 6117),
 ('Briarcliff', 852, 1536),
 ('Buda', 4551, 14644),
 ('Creedmoor', 190, 218),
 ('Dripping Springs', 1677, 2636),
 ('Georgetown', 42467, 64476),
 ('Hays', 241, 243),
 ('Hutto', 9572, 22471),
 ('Jonestown', 2121, 2018),
 ('Kyle', 28016, 38325),
 ('Lakeway', 11391, 13945),
 ('Liberty Hill', 1510, 1638),
 ('Manor', 2657, 7896),
 ('Mountain City', 698, 719),
 ('Pflugerville', 46936, 56313),
 ('Rollingwood', 1368, 1539),
 ('Round Rock', 99887, 120068),
 ('San Leanna', 486, 542),
 ('Sunset Valley', 790, 730),
 ('Taylor', 15322, 17217),
 ('Thrall', 852, 931),
 ('Volente', 385, 598),
 ('Webberville', 308, 428),
 ('Wimberley', 2703, 2745),
 ('Woodcreek', 1472, 1688),
 ('Austin', 790390, 921781),
 ('Bastrop', 7591, 8278),
 ('Bear Creek', 376, 417),
 ('Bee Cave', 3925, 6117),
 ('Briarcliff', 852, 1536),
 ('Buda', 4551, 14644),
 ('Creedmoor', 190, 218),
 ('Dripping Springs', 1677, 2636),
 ('