In [2]:
#Project goal: clean up OpenStreetMap data stored in an XML (.osm) file
#Convert the cleaned XML to CSV files
#Import the CSV files into a SQLite database

In [68]:
#Sub-goals for goal 1 (cleaning)
#Make all street types uniform
#Make sure all zip codes are valid
#Make all state route names uniform

In [74]:
import xml.etree.cElementTree as ET
import re
from collections import defaultdict
import pprint
import unicodecsv
import csv
import schema
import cerberus
import codecs
import sqlite3
import pandas as pd

In [75]:
#OSM File
# NOTE(review): absolute, machine-specific path. The same file is configured a second
# time as OSM_PATH in the CSV-export cell; the two values must be kept in sync.
wall_osm = r'C:\Users\chels\Downloads\Project 2\Wall_Twp.osm'

In [76]:
#Change all street types to be uniform

# Final words of a street name that are accepted as-is by the audit.
# Route numbers ("34", "35", ...) are listed so names like "Route 35" are not flagged.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Way", "34", "13", "35", "33", "9", "88", "66", "Plaza", "70", "Circle", "Boardwalk", "Highway", "Front", "West", "71", "South", "Broadway"]

# Final word of a street name -> replacement text.
# An empty replacement drops the final word entirely.
street_mapping = {  "St": "Street",
                    "St.": "Street",
                    "st": "Street",
                    "avenue": "Avenue",
                    "Ave.": "Avenue",
                    "Ave": "Avenue",
                    'ave': "Avenue",
                    "Unit": "",
                    "Rd": "Road",
                    "Blvd": "Boulevard",
                    "Northbound": "",
                    'Goldcrest': 'Goldcrest Drive',
                    # Never looked up while "South" is also listed in `expected`.
                    'South': '',
                    "RT-35": "Route 35",
                    "NJ-35": "Route 35",
                    "NJ-88": "Route 88"
                  }

# Whole street value -> normalized route name.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, which hides typos.
route_mapping = {   "NJ 13": "Route 13",
                    "State Route 33": "Route 33",
                    "NJ-34 C3": "Route 34",
                    "NJ 35": "Route 35",
                    "Highway 35": "Route 35",
                    "State Highway 35": "Route 35",
                    "Hwy 35": "Route 35",
                    "State Route 66": "Route 66",
                    "Rt. 88": "Route 88",
                    "US. Rt. 9": "Route 9",
                    "Us Highway 9": "Route 9",
                    "Rt 9": "Route 9",
                    "U.S. 9": "Route 9",
                    'US 9': "Route 9",
                    'US Highway 9': "Route 9",
                    'US RT 9': "Route 9",
                    'US Rt. 9': "Route 9",
                    "Route 9 Northbound": "Route 9"
}

# Dictionary to update the solitary incorrect zip code
zip_mapping = { "8730": "08730"
              }

# Matches the last whitespace-delimited word of a street name (its "street type").
street_type_re = re.compile(r'\b\S+\.?$')
# Unexpected street type -> set of full street names that end with it.
street_types = defaultdict(set)

# A valid zip code is exactly five digits, optionally followed by a ZIP+4 suffix.
# Anchored at both ends so values such as "087301" are reported by the audit.
zip_code_re = re.compile(r'^\d{5}(?:-\d{4})?$')
# Invalid zip code -> set containing that zip code.
zips = defaultdict(set)

In [77]:
#functions to determine if element is street name or zip code
def is_street_name(elem):
    """Return True when the element's 'k' attribute is exactly "addr:street".

    Raises KeyError if the element has no 'k' attribute.
    """
    tag_key = elem.attrib['k']
    return tag_key == "addr:street"
def is_zip_code(elem):
    """Return True when the element's 'k' attribute is exactly "addr:postcode".

    Raises KeyError if the element has no 'k' attribute.
    """
    tag_key = elem.attrib['k']
    return tag_key == "addr:postcode"

In [78]:
def audit_street_type(street_types, street_name):
    """Record `street_name` under its final word when that word is not in `expected`.

    street_types: mapping of street type -> set of street names; updated in place.
    street_name:  the street value to inspect.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

In [79]:
def audit_zip_code(zips, zip_code):
    """Record `zip_code` in `zips` when `zip_code_re` does not match it.

    zips:     mapping of zip code -> set of zip codes; updated in place.
    zip_code: the postcode value to inspect.
    """
    if zip_code_re.search(zip_code) is None:
        zips[zip_code].add(zip_code)

In [80]:
#full audit of zips & street names
def audit(osm_file):
    """Audit every node and way in `osm_file` and print the problems found.

    Street names whose final word is not in `expected` are collected in the
    module-level `street_types`; postcodes rejected by `zip_code_re` are
    collected in the module-level `zips`. Both are pretty-printed at the end.
    """
    # Use "end" events: on a "start" event the element's children are not
    # guaranteed to have been parsed yet, so <tag> children could be missed.
    for _, elem in ET.iterparse(osm_file, events=("end",)):
        if elem.tag not in ("node", "way"):
            continue
        for tag in elem.iter("tag"):
            if is_street_name(tag):
                audit_street_type(street_types, tag.attrib['v'])
            elif is_zip_code(tag):
                audit_zip_code(zips, tag.attrib['v'])
        # The element is fully processed; drop its children to limit memory use.
        elem.clear()
    pprint.pprint(dict(street_types))
    pprint.pprint(dict(zips))

audit(wall_osm)

{'Ave': {'Sunset Ave', '210 5th Ave', 'Lewin Ave'},
 'Ave.': {'River Ave.'},
 'Blvd': {'Hance Blvd'},
 'C3': {'NJ-34 C3'},
 'Goldcrest': {'Goldcrest'},
 'NJ-35': {'NJ-35'},
 'NJ-88': {'NJ-88'},
 'Northbound': {'Route 9 Northbound'},
 'RT-35': {'RT-35'},
 'Rd': {'Beaver Dam Rd',
        'Chambersbridge Rd',
        'County Line Rd',
        'Herman Rd',
        'Redwood Rd',
        'Wemrock Rd'},
 'St': {'Bond St', 'Court St', 'Main St'},
 'Unit': {'North County Line Road Unit'},
 'avenue': {'elberon avenue'},
 'st': {'8th st'}}
{'8730': {'8730'}}


In [81]:
def update_street_name(name, mapping=street_mapping):
    """Return `name` with its final word normalized according to `mapping`.

    name:    street name to clean.
    mapping: final word -> replacement text (an empty replacement drops the word).

    The name is returned unchanged when its final word is already in `expected`
    or has no entry in `mapping`. Otherwise the final word is replaced, runs of
    whitespace are collapsed, and the first character of every word is
    upper-cased.
    """
    m = street_type_re.search(name)
    if not m:
        return name
    street_type = m.group()
    # Unknown suffixes are left alone instead of raising KeyError.
    if street_type in expected or street_type not in mapping:
        return name
    replaced = name[:m.start()] + mapping[street_type]
    # str.title() would turn "8th" into "8Th" and "John's" into "John'S", so
    # only the first character of each word is upper-cased. split()/join also
    # removes the trailing space left behind by an empty replacement.
    return " ".join(word[:1].upper() + word[1:] for word in replaced.split())

In [82]:
def update_zip_code(number, mapping=zip_mapping):
    """Return the corrected zip code for `number`, or `number` itself when
    `mapping` has no entry for it."""
    return mapping.get(number) if number in mapping else number

In [83]:
def update_route_name(name, mapping=route_mapping):
    """Return the normalized route name for `name`.

    name:    full street value, e.g. "NJ 35".
    mapping: full street value -> normalized route name.

    Names without an entry in `mapping` are returned unchanged (previously the
    function fell off the end and returned None for them).
    """
    return mapping.get(name, name)

In [84]:
#updates streets/routes names
def update_streets(item):
    """Return the cleaned form of the street value `item`.

    Values listed in `route_mapping` are normalized as routes; every other
    value goes through the street-type clean-up.
    """
    if item in route_mapping:
        return update_route_name(item)
    return update_street_name(item)

---

In [85]:
def _required(type_name, coerce=None):
    """Build one field rule: always required, typed, optionally coerced."""
    rule = {'required': True, 'type': type_name}
    if coerce is not None:
        rule['coerce'] = coerce
    return rule


def _element_rule(with_position):
    """Rule for the attribute dict of one element; lat/lon only when `with_position`."""
    fields = {'id': _required('integer', int)}
    if with_position:
        fields['lat'] = _required('float', float)
        fields['lon'] = _required('float', float)
    fields['user'] = _required('string')
    fields['uid'] = _required('integer', int)
    fields['version'] = _required('string')
    fields['changeset'] = _required('integer', int)
    fields['timestamp'] = _required('string')
    return {'type': 'dict', 'schema': fields}


def _rows_rule(fields):
    """Rule for a list of row dicts that each follow `fields`."""
    return {'type': 'list', 'schema': {'type': 'dict', 'schema': fields}}


def _tag_fields():
    """Field rules shared by node tag rows and way tag rows (fresh dict per call)."""
    return {
        'id': _required('integer', int),
        'key': _required('string'),
        'value': _required('string'),
        'type': _required('string'),
    }


# Expected structure of the shaped node/way records.
# NOTE(review): this assignment rebinds the name `schema`, hiding the module
# imported with `import schema` at the top of the notebook.
# NOTE(review): the key here is 'way_tags' while shape_element() returns
# 'ways_tags'; nothing visible passes this dict to a validator, so the mismatch
# is presumably latent -- confirm before wiring up validation.
schema = {
    'node': _element_rule(with_position=True),
    'node_tags': _rows_rule(_tag_fields()),
    'way': _element_rule(with_position=False),
    'way_nodes': _rows_rule({
        'id': _required('integer', int),
        'node_id': _required('integer', int),
        'position': _required('integer', int),
    }),
    'way_tags': _rows_rule(_tag_fields()),
}


In [106]:
# Input OSM file for the CSV export.
# NOTE(review): same absolute path as `wall_osm` defined earlier; keep the two in sync.
OSM_PATH = r"C:\Users\chels\Downloads\Project 2\Wall_Twp.osm"

# Output CSV files, written relative to the current working directory.
NODE_PATH = "node.csv"
NODE_TAG_PATH = "node_tags.csv"
WAY_PATH = "ways.csv"
WAY_NODES_PATH = "way_nodes.csv"
WAY_TAG_PATH = "ways_tags.csv"

# Tag key that starts with lowercase letters/underscores, a colon, then more
# lowercase letters/underscores (anchored at the start only), e.g. "addr:street".
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character considered problematic inside a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _shape_tag(parent_id, tag, problem_chars, default_tag_type):
    """Turn one <tag> child into a row dict, or return None when it must be skipped.

    parent_id:        'id' attribute of the enclosing node/way.
    tag:              the <tag> element (needs 'k' and 'v' attributes).
    problem_chars:    compiled regex; a key containing a match is skipped.
    default_tag_type: 'type' used for keys that do not match LOWER_COLON.
    """
    key = tag.attrib['k']
    # search(), not match(): a problematic character anywhere in the key
    # disqualifies the tag, not only one in the first position.
    if problem_chars.search(key):
        return None

    if LOWER_COLON.match(key):
        # "addr:street" -> type "addr", key "street"; later colons stay in the key.
        tag_type, tag_key = key.split(':', 1)
    else:
        tag_type, tag_key = default_tag_type, key

    value = tag.attrib['v']
    if is_street_name(tag):
        value = update_streets(value)
    elif is_zip_code(tag):
        value = update_zip_code(value)

    return {'id': parent_id, 'key': tag_key, 'value': value, 'type': tag_type}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Returns
        for a node: {'node': attribs, 'node_tags': [tag rows]}
        for a way:  {'way': attribs, 'way_nodes': [nd rows], 'ways_tags': [tag rows]}
        otherwise:  None

    Only attributes listed in `node_attr_fields` / `way_attr_fields` are kept.
    Street names and postcodes are cleaned identically for nodes and ways; the
    way branch previously overwrote a cleaned "addr:street" value with the raw
    one because its zip-code check was a separate `if` with its own `else`.
    """
    if element.tag not in ('node', 'way'):
        return None

    # Secondary tags are handled the same way for both element kinds.
    tags = []
    for child in element.findall('tag'):
        row = _shape_tag(element.attrib['id'], child, problem_chars, default_tag_type)
        if row is not None:
            tags.append(row)

    if element.tag == 'node':
        node_attribs = {name: value for name, value in element.attrib.items()
                        if name in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}

    way_attribs = {name: value for name, value in element.attrib.items()
                   if name in way_attr_fields}
    # 'position' is the zero-based index of each <nd> reference within the way.
    way_nodes = [
        {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
        for position, nd in enumerate(element.findall('nd'))
    ]
    return {'way': way_attribs, 'way_nodes': way_nodes, 'ways_tags': tags}

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element of `osm_file` whose tag is in `tags`.

    After the consumer is done with a yielded element the document root is
    cleared, so yielded elements must be used before requesting the next one.
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the start of the document root.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()
    

class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter that accepts both text and UTF-8 encoded bytes values.

    On Python 3 the csv module writes text, so `str` values are passed through
    untouched. Encoding them to bytes (as the Python 2 recipe did) makes csv
    write their repr, e.g. "b'Caf\\xc3\\xa9'", into the file. `bytes` values
    are decoded as UTF-8 so they are written as readable text as well.
    """
    def writerow(self, row):
        """Write one row dict; returns whatever csv.DictWriter.writerow returns."""
        return super(UnicodeDictWriter, self).writerow({
            k: (v.decode('utf-8') if isinstance(v, bytes) else v) for k, v in row.items()
        })

    def writerows(self, rows):
        """Write every row dict in `rows` through writerow()."""
        for row in rows:
            self.writerow(row)

# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in):
    """Stream the nodes and ways of `file_in` into the five CSV files.

    Each element is shaped with shape_element(); node records go to NODE_PATH
    and NODE_TAG_PATH, way records to WAY_PATH, WAY_NODES_PATH and
    WAY_TAG_PATH. Every file starts with a header row.
    """
    with codecs.open(NODE_PATH, 'w', 'utf-8') as node_out, \
         codecs.open(NODE_TAG_PATH, 'w', 'utf-8') as node_tag_out, \
         codecs.open(WAY_PATH, 'w', 'utf-8') as way_out, \
         codecs.open(WAY_NODES_PATH, 'w', 'utf-8') as way_node_out, \
         codecs.open(WAY_TAG_PATH, 'w', 'utf-8') as way_tag_out:

        node_csv = csv.DictWriter(node_out, NODE_FIELDS)
        node_tag_csv = csv.DictWriter(node_tag_out, NODE_TAGS_FIELDS)
        way_csv = csv.DictWriter(way_out, WAY_FIELDS)
        way_node_csv = csv.DictWriter(way_node_out, WAY_NODES_FIELDS)
        way_tag_csv = csv.DictWriter(way_tag_out, WAY_TAGS_FIELDS)

        for writer in (node_csv, node_tag_csv, way_csv, way_node_csv, way_tag_csv):
            writer.writeheader()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue
            kind = element.tag
            if kind == 'node':
                node_csv.writerow(shaped['node'])
                node_tag_csv.writerows(shaped['node_tags'])
            elif kind == 'way':
                way_csv.writerow(shaped['way'])
                way_node_csv.writerows(shaped['way_nodes'])
                way_tag_csv.writerows(shaped['ways_tags'])


# Run the XML -> CSV export for the configured OSM file when this code is
# executed as the top-level module.
if __name__ == '__main__':

    process_map(OSM_PATH)


In [87]:
#Create db file and connection to sqlite3
# NOTE(review): absolute, machine-specific path. The file persists between
# kernel restarts, so tables and rows from earlier runs are still present.
wall_db = r'C:\Users\chels\Downloads\wall_db.db'
conn = sqlite3.connect(wall_db)
# Cursor shared by the table-creation and query cells below.
c = conn.cursor()

In [161]:
#create our tables
# Column order matches the *_FIELDS lists used when writing the CSV files.
# The way_nodes foreign key now points at `node` (the table that is actually
# created; there is no `nodes` table), and ways.uid is INTEGER like node.uid.
# Because of IF NOT EXISTS, a database file that already contains these tables
# keeps its old definitions until the tables are dropped.

c.execute('''CREATE TABLE IF NOT EXISTS node
             (id INTEGER NOT NULL, lat REAL, lon REAL, user TEXT, uid INTEGER, version INTEGER, changeset INTEGER, timestamp TEXT)''')
c.execute('''CREATE TABLE IF NOT EXISTS node_tags
             (id INTEGER NOT NULL, key TEXT, value TEXT, type TEXT, FOREIGN KEY (id) REFERENCES node(id))''')
c.execute('''CREATE TABLE IF NOT EXISTS ways
             (id INTEGER NOT NULL, user TEXT, uid INTEGER, version INTEGER, changeset INTEGER, timestamp TEXT)''')
c.execute('''CREATE TABLE IF NOT EXISTS way_nodes
             (id INTEGER NOT NULL, node_id INTEGER NOT NULL, position INTEGER, FOREIGN KEY (id) REFERENCES ways(id), FOREIGN KEY (node_id) REFERENCES node(id))''')
c.execute('''CREATE TABLE IF NOT EXISTS ways_tags
             (id INTEGER NOT NULL, key TEXT, value TEXT, type TEXT, FOREIGN KEY (id) REFERENCES ways(id))''')

<sqlite3.Cursor at 0x2ec359ff110>

In [162]:
# Read the CSV files written by process_map().
node = pd.read_csv('node.csv', encoding='utf-8')
node_tags = pd.read_csv('node_tags.csv', encoding='utf-8')
ways = pd.read_csv('ways.csv', encoding='utf-8')
way_nodes = pd.read_csv('way_nodes.csv', encoding='utf-8')
ways_tags = pd.read_csv('ways_tags.csv', encoding='utf-8')

# Parent tables first, then the tables whose foreign keys reference them.
tables_to_load = [
    ('node', node),
    ('node_tags', node_tags),
    ('ways', ways),
    ('way_nodes', way_nodes),
    ('ways_tags', ways_tags),
]

# if_exists='append' adds every row again each time this cell runs, which
# inflates all COUNT(*) results. Empty the existing tables first (children
# before parents) so re-running the cell leaves exactly one copy of the data.
existing_tables = {row[0] for row in
                   conn.execute("SELECT name FROM sqlite_master WHERE type = 'table'")}
for table_name, _ in reversed(tables_to_load):
    if table_name in existing_tables:
        conn.execute('DELETE FROM ' + table_name)

for table_name, frame in tables_to_load:
    frame.to_sql(table_name, conn, if_exists='append', index=False)
conn.commit()

In [163]:
# Number of distinct users in the node table; fetchall() returns it as a list holding one 1-tuple.
node_cont = c.execute('''SELECT COUNT(DISTINCT node.user) FROM node''').fetchall()

In [164]:
# Number of distinct users in the ways table; fetchall() returns it as a list holding one 1-tuple.
ways_cont = c.execute('''SELECT COUNT(DISTINCT ways.user) FROM ways''').fetchall()

In [None]:
# Number of distinct users that appear in both the node and the ways table.
# The original join left `user` ambiguous (both tables have that column), so
# the query failed. Filtering with IN keeps a single table in scope and avoids
# building one joined row per (node, way) pair of the same user.
c.execute('''
SELECT COUNT(DISTINCT user)
FROM node
WHERE user IN (SELECT user FROM ways)''').fetchall()

In [None]:
# Five most common values of the "sport" tag on ways.
# Tag keys and values are stored in ways_tags (the ways table has no key/value
# columns), and 'sport' must be quoted as a string literal: unquoted, it is
# parsed as a column name.
c.execute('''SELECT value, COUNT(*) AS num
FROM ways_tags
WHERE key = 'sport'
GROUP BY value
ORDER BY num DESC
LIMIT 5;''').fetchall()