## Data Cleaning OSM data of San_Jose:
Data acquired from:
https://mapzen.com/data/metro-extracts/metro/san-jose_california/

In [21]:
## Codes to get an OSM data subset (but, >= 50 MB) of the exisiting data
#!/usr/bin/env python
# -*- coding: utf-8 -*-



import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "san_jose.osm"  # Replace this with your osm file
SAMPLE_FILE = "san_jose_compressed.osm" 

k = 7 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Stream the wanted top level elements of an OSM XML file.

    ``osm_file`` is parsed incrementally.  Every element whose tag is listed
    in ``tags`` is yielded once it has been read completely; when the consumer
    asks for the next element the root is cleared so that elements which were
    already handed out do not pile up in memory.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the "start" of the document's root element.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build the sample: a minimal <osm> document holding every k-th top level
# element of OSM_FILE.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n'
                 '<osm>\n  ')

    for i, element in enumerate(get_element(OSM_FILE)):
        # Only positions 0, k, 2k, ... make it into the sample.
        if i % k != 0:
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

## Data auditing:

In [1]:
"""
Lets find out the tag names and number of their occurences in the datafile.

"""
import xml.etree.cElementTree as ET
import pprint

filename = 'san_jose_compressed.osm'


tags = {}
def count_tags(filename):
    """Tally how often each element tag occurs in an XML file.

    The counts are accumulated in the module-level ``tags`` dictionary, which
    is also the return value, so a second call keeps adding to the totals of
    the first one.
    """
    for _, node in ET.iterparse(filename):
        name = node.tag
        tags[name] = tags.get(name, 0) + 1
    return tags


print count_tags(filename)

{'node': 257543, 'nd': 302148, 'member': 3370, 'tag': 107229, 'relation': 372, 'way': 34265, 'osm': 1}


In [2]:
# Lets check zip code:


def get_user(element):
    """Return the raw postcode of a node, way or relation.

    Despite its name, this helper returns the value of the first
    ``addr:postcode`` tag found under ``element``.  ``None`` is returned when
    ``element`` is not a node/way/relation or has no such tag.
    """
    if element.tag not in ("node", "way", "relation"):
        return None
    for tag in element.iter("tag"):
        if tag.attrib['k'] == 'addr:postcode':
            return tag.attrib['v']
    return None

def process_map(filename):
    """Collect the distinct raw postcodes found in an OSM XML file.

    Every parsed element is passed to ``get_user``; the returned values are
    gathered in a set.  The set also contains ``None`` because ``get_user``
    returns ``None`` for elements without a postcode.
    """
    postcodes = set()  # not named "zip", which would shadow the builtin
    for _, element in ET.iterparse(filename):
        try:
            postcodes.add(get_user(element))
        except KeyError:
            # A <tag> child without a "k" or "v" attribute; skip the element
            # but let every other kind of error surface.
            continue
    return postcodes

process_map('san_jose_compressed.osm')


{None,
 '94024',
 '94085',
 '94086',
 '94086-6406',
 '94087',
 u'94087\u200e',
 '94089',
 '95002',
 '95008',
 '95014',
 '95014-030',
 '95014-0438',
 '95014-0440',
 '95014-0444',
 '95014-0446',
 '95014-0447',
 '95014-0448',
 '95014-0449',
 '95014-0451',
 '95014-0453',
 '95014-0455',
 '95014-0456',
 '95014-0457',
 '95014-0496',
 '95014-0501',
 '95014-0503',
 '95014-0504',
 '95014-0505',
 '95014-0506',
 '95014-0507',
 '95014-0509',
 '95014-0510',
 '95014-0511',
 '95014-0512',
 '95014-0513',
 '95014-0514',
 '95014-0515',
 '95014-0517',
 '95014-0519',
 '95014-0520',
 '95014-0522',
 '95014-0523',
 '95014-0525',
 '95014-0526',
 '95014-0528',
 '95014-0530',
 '95014-0531',
 '95014-0536',
 '95014-0537',
 '95014-0539',
 '95014-0540',
 '95014-0541',
 '95014-0544',
 '95014-0545',
 '95014-0546',
 '95014-0547',
 '95014-0548',
 '95014-0549',
 '95014-0551',
 '95014-0552',
 '95014-0553',
 '95014-0554',
 '95014-0556',
 '95014-0557',
 '95014-0562',
 '95014-0563',
 '95014-0565',
 '95014-0567',
 '95014-0605

In [3]:
##################### Framework building #######################################
import re

zip1 = re.compile(r'^\d{5}')
zip2 = re.compile(r'\d{5}-\d{4}')

def correct_zip(t):
    """Show how a raw postcode string would be cleaned.

    Returns the 3-tuple ``(raw, arrow, cleaned)``.  ``cleaned`` is the
    five-digit-dash-four-digit code if ``t`` contains one, otherwise the
    leading five digits, and the string "None" when ``t`` does not start with
    five digits.
    """
    five_digits = zip1.search(t)
    if five_digits is None:
        return t, " ==>>", "None"
    plus_four = zip2.search(t)
    if plus_four is not None:
        cleaned = plus_four.group()
    else:
        cleaned = five_digits.group()
    return t, " ==>> ", cleaned
                    
            
print correct_zip('94086-6406')
print correct_zip('94086-640')
print correct_zip('94086')
print correct_zip('None')
print correct_zip('CUPERTINO')


('94086-6406', ' ==>> ', '94086-6406')
('94086-640', ' ==>> ', '94086')
('94086', ' ==>> ', '94086')
('None', ' ==>>', 'None')
('CUPERTINO', ' ==>>', 'None')


In [4]:
########### Clean postal code #############################################

# Lets check zip code:

import re

zip1 = re.compile(r'^\d{5}')
zip2 = re.compile(r'\d{5}-\d{4}')

def correct_zip(element):
    """Return the cleaned postcode of a node, way or relation.

    Only the first ``addr:postcode`` tag under ``element`` is looked at.  The
    result is the five-digit-dash-four-digit code if the value contains one,
    otherwise its leading five digits, or the string "None" when the value
    does not start with five digits.  ``None`` (the object) is returned for
    other element types and for elements without a postcode tag.
    """
    if element.tag not in ("node", "way", "relation"):
        return None
    for child in element.iter("tag"):
        if child.attrib['k'] != 'addr:postcode':
            continue
        raw = child.attrib['v']
        five_digits = zip1.search(raw)
        if five_digits is None:
            return "None"
        plus_four = zip2.search(raw)
        return plus_four.group() if plus_four else five_digits.group()
    return None
                    
               


def process_map(filename):
    """Collect the distinct cleaned postcodes found in an OSM XML file.

    Every parsed element is passed to ``correct_zip``; the returned values are
    gathered in a set.  The set also contains ``None`` because ``correct_zip``
    returns ``None`` for elements without a postcode.
    """
    postcodes = set()  # not named "zip", which would shadow the builtin
    for _, element in ET.iterparse(filename):
        try:
            postcodes.add(correct_zip(element))
        except KeyError:
            # A <tag> child without a "k" or "v" attribute; skip the element
            # but let every other kind of error surface.
            continue
    return postcodes

process_map('san_jose_compressed.osm')


{None,
 '94024',
 '94085',
 '94086',
 '94086-6406',
 '94087',
 '94089',
 '95002',
 '95008',
 '95014',
 '95014-0438',
 '95014-0440',
 '95014-0444',
 '95014-0446',
 '95014-0447',
 '95014-0448',
 '95014-0449',
 '95014-0451',
 '95014-0453',
 '95014-0455',
 '95014-0456',
 '95014-0457',
 '95014-0496',
 '95014-0501',
 '95014-0503',
 '95014-0504',
 '95014-0505',
 '95014-0506',
 '95014-0507',
 '95014-0509',
 '95014-0510',
 '95014-0511',
 '95014-0512',
 '95014-0513',
 '95014-0514',
 '95014-0515',
 '95014-0517',
 '95014-0519',
 '95014-0520',
 '95014-0522',
 '95014-0523',
 '95014-0525',
 '95014-0526',
 '95014-0528',
 '95014-0530',
 '95014-0531',
 '95014-0536',
 '95014-0537',
 '95014-0539',
 '95014-0540',
 '95014-0541',
 '95014-0544',
 '95014-0545',
 '95014-0546',
 '95014-0547',
 '95014-0548',
 '95014-0549',
 '95014-0551',
 '95014-0552',
 '95014-0553',
 '95014-0554',
 '95014-0556',
 '95014-0557',
 '95014-0562',
 '95014-0563',
 '95014-0565',
 '95014-0567',
 '95014-0605',
 '95014-0607',
 '95014-0608'

In [5]:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Lets explore the "k" value for each "<tag>" and see if there are any potential problems.
We have to count each of four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.

"""

import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    The patterns are tried in the order lower, lower_colon, problemchars; the
    first one that matches decides the category and everything else counts as
    "other".  Elements that are not <tag> leave ``keys`` untouched.  ``keys``
    is updated in place and returned.
    """
    if element.tag != "tag":
        return keys
    key = element.attrib['k']
    categories = (
        ("lower", lower),
        ("lower_colon", lower_colon),
        ("problemchars", problemchars),
    )
    for category, pattern in categories:
        if pattern.search(key):
            keys[category] += 1
            break
    else:
        keys["other"] += 1
    return keys
        

def process_map(filename):
    """Count the tag keys of an OSM XML file per category.

    Returns a dictionary with the counters "lower", "lower_colon",
    "problemchars" and "other", filled by ``key_type`` for every parsed
    element.
    """
    keys = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

process_map(filename)

{'lower': 70867, 'lower_colon': 33173, 'other': 3189, 'problemchars': 0}

In [6]:
## Lets explore the 'other' category little bit
  

# Collect every tag key that falls into the 'other' category, i.e. matches
# none of the three patterns.  The result is kept in ``other_keys`` rather than
# ``list`` so the builtin is not shadowed for the rest of the session, and the
# loop variable is not called ``k`` so the sampling parameter is left alone.
other_keys = []
for _, element in ET.iterparse(filename):
    if element.tag != "tag":
        continue
    key_name = element.attrib['k']
    if not (lower.search(key_name)
            or lower_colon.search(key_name)
            or problemchars.search(key_name)):
        other_keys.append(key_name)
other_keys

['COUNTYFP',
 'STATEFP',
 'Tiger:MTFCC',
 'ref:ACE',
 'name:ACE',
 'ref:Amtrak',
 'name:Amtrak',
 'gnis:Class',
 'gnis:County',
 'gnis:ST_num',
 'gnis:ST_alpha',
 'gnis:County_num',
 'gnis:Class',
 'gnis:County',
 'gnis:ST_num',
 'gnis:ST_alpha',
 'gnis:County_num',
 'gnis:Class',
 'gnis:County',
 'gnis:County_num',
 'gnis:ST_alpha',
 'gnis:ST_num',
 'name:en:pronunciation',
 'gnis:Class',
 'gnis:County',
 'gnis:ST_num',
 'gnis:ST_alpha',
 'gnis:County_num',
 'gnis:Class',
 'gnis:County',
 'gnis:County_num',
 'gnis:ST_alpha',
 'gnis:ST_num',
 'gnis:Class',
 'gnis:County',
 'gnis:ST_num',
 'gnis:ST_alpha',
 'gnis:County_num',
 'disused:website:official',
 'STATEFP',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'ANSICODE',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'ANSICODE',
 'COUNTYFP',
 'Tiger:MTFCC',
 'STATEFP',
 'COUNTYFP',


In [7]:
'''
From the street name, we are going to extract street type
and then take a look at the all different street type
eg.. "North Lincoln Avenue"; 'Avenue' is a street type
'''
from collections import defaultdict

# NOTE(review): this handle is opened at module level and is not closed
# anywhere in this cell; audit() below parses directly from it.
osm_file = open("san_jose_compressed.osm", "r") 

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)

street_types = defaultdict(int)


def audit_street_type(street_types, street_name):
    """Count the street type found at the end of ``street_name``.

    The street type is the run of non-whitespace characters that ends the
    name.  When one is found, its counter in ``street_types`` is incremented in
    place; nothing is returned.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    street_types[found.group()] += 1


################## Sort a dict and then print it #############################

def print_sorted_dict(d):
    """Print the entries of ``d`` as "key: count" lines, sorted by key ignoring case."""
    for name in sorted(d.keys(), key=lambda s: s.lower()):
        print("%s: %d" % (name, d[name]))

########### Check whether a string fed is a proper street name ###############
def is_street_name(elem):
    """Tell whether ``elem`` is a <tag> element whose key is "addr:street"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:street"

def audit():
    """Count the street types of the module-level ``osm_file`` and print them.

    Every street name found while parsing is passed to ``audit_street_type``,
    which updates the module-level ``street_types`` counters; the sorted
    counters are printed at the end.
    """
    for _, elem in ET.iterparse(osm_file):
        if not is_street_name(elem):
            continue
        audit_street_type(street_types, elem.attrib['v'])

    print_sorted_dict(street_types)


if __name__ == '__main__':
    audit()

#6: 1
#81: 1
Alameda: 7
ave: 1
Ave: 6
Avenue: 845
Barcelona: 3
Blvd: 3
Boulevard: 97
Broadway: 1
Circle: 39
Court: 349
Dr: 1
Drive: 813
East: 6
Expressway: 12
Highway: 2
Lane: 161
Ln: 2
Loop: 6
Luna: 1
Marino: 1
Napoli: 6
Palamos: 2
Parkway: 13
Paviso: 3
Place: 90
Plaza: 18
Rd: 5
Real: 27
Road: 124
Sorrento: 4
Square: 55
St: 1
Street: 147
Terrace: 63
Volante: 3
Way: 242
West: 1
yes: 1


In [8]:
'''
Looking at the dictionary thrown above, lets make a list of expected/ideal street 
type and identify the street types that need to be changed
'''

OSMFILE = "san_jose_compressed.osm"
# Last run of non-whitespace characters of a string, starting at a word boundary
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types (last words) that are accepted as they are
expected = ["Avenue","Alameda", "Barcelona","Boulevard","Broadway","Circle",
            "Drive","Court","East","Expressway", "Highway","Lane","Loop",
            "Luna","Marino","Napoli","Palamos","Parkway","Paviso","Place",
            "Plaza","Lane","Road", "Real","Sorrento","Square","Street",
            "Trail","Terrace","Volante","Way","West"]


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type if that type is unexpected.

    The street type is the last word of the name.  Names whose type is listed
    in ``expected`` are ignored; all others are added to the set stored under
    that type in ``street_types``.  Nothing is returned.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    last_word = found.group()
    if last_word in expected:
        return
    street_types[last_word].add(street_name)


def is_street_name(elem):
    """Return True when the "k" attribute of ``elem`` equals "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Group the unexpected street names of an OSM file by street type.

    Every ``addr:street`` tag of every node and way in ``osmfile`` is passed
    to ``audit_street_type``.  Returns a ``defaultdict(set)`` that maps each
    street type to the street names recorded for it.
    """
    street_types = defaultdict(set)

    # "with" closes the file even when parsing raises.
    with open(osmfile, "r") as osm_file:
        # Listen for "end" events: when a "start" event is emitted the children
        # of the element are not guaranteed to be parsed yet, so <tag> children
        # could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

pprint.pprint(dict(audit(OSMFILE)))

{'6': set(['Pruneridge Ave #6']),
 '81': set(['Concourse Dr #81']),
 'Ave': set(['1425 E Dunne Ave',
             'Greenbriar Ave',
             'N Blaney Ave',
             'W Washington Ave']),
 'Blvd': set(['Los Gatos Blvd', 'Palm Valley Blvd']),
 'Dr': set(['Samaritan Dr']),
 'Ln': set(['Barber Ln', 'Branham Ln']),
 'Rd': set(['Berryessa Rd',
            'Mt Hamilton Rd',
            'San Antonio Valley Rd',
            'Wolfe Rd']),
 'St': set(['Casa Verde St']),
 'ave': set(['wilcox ave']),
 'yes': set(['yes'])}


In [9]:
## Mapping:


mapping = { "St": "Street","Ave": "Avenue", "ave":"Avenue","Blvd":"Boulevard",
           "Rd": "Road", "Ln":"Lane","Dr":"Drive"}

def update_name(name, mapping):
    """Return ``name`` with an abbreviated street type spelled out.

    The street type is the last word of ``name`` as matched by
    ``street_type_re``.  If it is a key of ``mapping``, only that trailing
    word is replaced by the mapped value; otherwise ``name`` is returned
    unchanged.
    """
    m = street_type_re.search(name)
    if m is None:
        return name

    street_type = m.group()
    if street_type not in mapping:
        return name

    # Splice the replacement in at the matched position.  A pattern based
    # substitution would also rewrite the same letters elsewhere in the name
    # (e.g. the "St" in "St Stephens St").
    return name[:m.start()] + mapping[street_type] + name[m.end():]

def test():
    st_types = audit(OSMFILE)

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name

if __name__ == '__main__':
    test()

Barber Ln => Barber Lane
Branham Ln => Branham Lane
Pruneridge Ave #6 => Pruneridge Ave #6
Casa Verde St => Casa Verde Street
Wolfe Rd => Wolfe Road
Berryessa Rd => Berryessa Road
Mt Hamilton Rd => Mt Hamilton Road
San Antonio Valley Rd => San Antonio Valley Road
W Washington Ave => W Washington Avenue
1425 E Dunne Ave => 1425 E Dunne Avenue
N Blaney Ave => N Blaney Avenue
Greenbriar Ave => Greenbriar Avenue
Palm Valley Blvd => Palm Valley Boulevard
Los Gatos Blvd => Los Gatos Boulevard
wilcox ave => wilcox Avenue
yes => yes
Concourse Dr #81 => Concourse Dr #81
Samaritan Dr => Samaritan Drive


In [8]:
#     # relevant section for 'addr:street' attributes:
#                 elif LOWER_COLON.match(child.attrib["k"]):
#                     node_tag["type"] = child.attrib["k"].split(":", 1)[0]
#                     node_tag["key"] = child.attrib["k"].split(":", 1)[1]
#                     node_tag["id"] = element.attrib["id"]

#                     # ===========>  use cleaning function:
#                     if child.attrib["k"] == 'addr:street':
#                         node_tag["value"] = update_name(child.attrib["v"], mapping)
#                     # otherwise:
#                     else:
#                         node_tag["value"] = child.attrib["v"]

## Write the data extracted from the OSM file into CSVs:

In [11]:
'''
If the top element is node, it will have following pattern and in case, if 
it doesnt have a child element, the list node_tag will be empty

{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}

If the key has no colon, the type will be 'regular'.
If the key has a colon, the type will be the characters before the colon
and the key will be the characters after the colon.
If there is more than one colon, ignore the colon(s) after the first.

## If the top element is way, it will have following pattern and in case, if 
it doesnt have a child element, the list way_tags and/or way_nodes will be empty.

Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:

- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element

The final return value for a "way" element should look something like:

{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
              ],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': '<street name>'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
'''

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus


import schema

OSM_PATH = "san_jose_compressed.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

zip1 = re.compile(r'^\d{5}')
zip2 = re.compile(r'\d{5}-\d{4}')


SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


## Mapping:


mapping = { "St": "Street","Ave": "Avenue", "ave":"Avenue","Blvd":"Boulevard",
           "Rd": "Road", "Ln":"Lane","Dr":"Drive"}

# Last word of a street name.  Defined here as well so that this cell does not
# depend on an earlier cell having been run.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


def update_name(name, mapping):
    '''
    Return ``name`` with an abbreviated street type spelled out.

    The street type is the last word of ``name``.  If it is a key of
    ``mapping``, only that trailing word is replaced by the mapped value;
    otherwise ``name`` is returned unchanged.
    '''
    m = street_type_re.search(name)
    if m is None:
        return name

    street_type = m.group()
    if street_type not in mapping:
        return name

    # Splice the replacement in at the matched position.  A pattern based
    # substitution would also rewrite the same letters elsewhere in the name
    # (e.g. the "St" in "St Stephens St").
    return name[:m.start()] + mapping[street_type] + name[m.end():]



def _clean_postcode(raw):
    """Return the 5+4 digit code in ``raw``, else its leading 5 digits, else "None"."""
    five_digits = re.search(zip1, raw)
    if not five_digits:
        return "None"
    plus_four = re.search(zip2, raw)
    return plus_four.group() if plus_four else five_digits.group()


def _shape_tags(element, problem_chars, default_tag_type):
    """Return one dict (id, key, value, type) per usable <tag> child of ``element``."""
    element_id = element.attrib['id']
    shaped = []
    for child in element.iter('tag'):
        k = child.attrib['k']
        if re.search(problem_chars, k):
            # Such a key cannot be turned into a type/key pair, so the whole
            # tag is left out instead of emitting a row without those fields.
            continue

        if ":" in k:
            # Split on the first colon only; later colons stay in the key.
            tag_type, key = k.split(":", 1)
        else:
            tag_type, key = default_tag_type, k

        # Start from the raw value and clean it exactly once, so a cleaned
        # street name is not overwritten again by the postcode handling.
        value = child.attrib['v']
        if k == 'addr:street':
            value = update_name(value, mapping)
        elif k == 'addr:postcode':
            value = _clean_postcode(value)

        shaped.append({'id': element_id,
                       'key': key,
                       'value': value,
                       'type': tag_type})
    return shaped


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    For a node the result is ``{'node': attributes, 'node_tags': tags}``; for
    a way it is ``{'way': attributes, 'way_nodes': nodes, 'way_tags': tags}``.
    ``attributes`` holds the element attributes named in ``node_attr_fields``
    or ``way_attr_fields``.  ``tags`` holds one dict per <tag> child whose key
    has no match for ``problem_chars``; keys without a colon get
    ``default_tag_type`` as their type.  Street names and postcodes are
    cleaned.  ``nodes`` holds one dict per <nd> child with its 0-based
    position.  Any other element yields ``None``.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        return {'node': node_attribs,
                'node_tags': _shape_tags(element, problem_chars, default_tag_type)}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        way_nodes = [{'id': element.attrib['id'],
                      'node_id': nd.attrib['ref'],
                      'position': position}
                     for position, nd in enumerate(element.iter('nd'))]
        return {'way': way_attribs,
                'way_nodes': way_nodes,
                'way_tags': _shape_tags(element, problem_chars, default_tag_type)}

    return None



# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each completely parsed element of ``osm_file`` whose tag is in ``tags``.

    The file is parsed incrementally; once the consumer asks for the next
    element the root is cleared so that elements which were already handed
    out are released.
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the "start" of the root element.
    root = next(context)[1]
    for event, elem in context:
        if event != 'end':
            continue
        if elem.tag not in tags:
            continue
        yield elem
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception naming the first reported error if element does not match schema"""
    if validator.validate(element, schema) is True:
        return

    field, errors = next(validator.errors.iteritems())
    raise Exception(
        "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter that UTF-8 encodes unicode values before writing them"""

    def writerow(self, row):
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Go through writerow() so that every row gets encoded.
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Shape every node and way of ``file_in`` and write them to the five CSV files.

    When ``validate`` is True each shaped element is checked with
    ``validate_element`` before it is written.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        node_out = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tag_out = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        way_out = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_node_out = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tag_out = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        for writer in (node_out, node_tag_out, way_out, way_node_out, way_tag_out):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                node_out.writerow(shaped['node'])
                node_tag_out.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                way_out.writerow(shaped['way'])
                way_node_out.writerows(shaped['way_nodes'])
                way_tag_out.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)


## Create SQLite database using newly created CSV files:

In [12]:
import sqlite3
import csv
from pprint import pprint


sqlite_file = 'mymap.db'    


conn = sqlite3.connect(sqlite_file)


cur = conn.cursor()


### Create tables:

In [13]:
# DDL for the five tables.  The column order of each table matches the field
# order of the corresponding CSV file.
qry1='''
CREATE TABLE nodes (
    id INTEGER,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT
);
'''

qry2='''

CREATE TABLE nodes_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id)
);
'''

qry3='''

CREATE TABLE ways (
    id INTEGER,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT
);
'''

qry4='''

CREATE TABLE ways_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id)
);
'''

qry5='''

CREATE TABLE ways_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id)
);
'''

# Create the tables one after the other; a referenced table always comes
# before the tables that point to it.
for ddl in (qry1, qry2, qry3, qry4, qry5):
    cur.execute(ddl)


<sqlite3.Cursor at 0x52509ea0>

In [14]:
### Check whether the tables are created as intended:
qry="select name from sqlite_master where type='table';"
cur.execute(qry)
print cur.fetchall()
conn.commit()

[(u'nodes',), (u'nodes_tags',), (u'ways',), (u'ways_tags',), (u'ways_nodes',)]


### Populate the table:

In [15]:
## lEt's populate the tables using CSV files

## Before that read the csv files as dictionaries
## and format the data as tuples

## nodes:
# Each section below follows the same pattern: read one CSV file as
# dictionaries, turn every row into a tuple in the column order of the INSERT
# statement, and insert all tuples with a single executemany() call.
# The free-text columns (user, value) are decoded from UTF-8 before insertion.
with open('nodes.csv','rb') as x1:
    d1 = csv.DictReader(x1) # comma is default delimiter
    t1 = [(i['id'], i['lat'],i['lon'],i['user'].decode("utf-8"), i['uid'],
          i['version'],i['changeset'],i['timestamp']) for i in d1]

q1='''
INSERT INTO nodes(id,lat,lon,user,uid,version,changeset,timestamp) 
VALUES (?,?,?,?,?,?,?,?);
'''
cur.executemany(q1, t1)


## nodes_tags:
with open('nodes_tags.csv','rb') as x2:
    d2 = csv.DictReader(x2) # comma is default delimiter
    t2 = [(i['id'], i['key'],i['value'].decode("utf-8"), i['type']) for i in d2]

q2='''
INSERT INTO nodes_tags(id,key,value,type) 
VALUES (?,?,?,?);
'''
cur.executemany(q2, t2)


## ways:
with open('ways.csv','rb') as x3:
    d3 = csv.DictReader(x3) # comma is default delimiter
    t3 = [(i['id'], i['user'].decode("utf-8"), i['uid'],
          i['version'],i['changeset'],i['timestamp']) for i in d3]

q3='''
INSERT INTO ways(id,user,uid,version,changeset,timestamp) 
VALUES (?,?,?,?,?,?);
'''
cur.executemany(q3, t3)


## ways_tags:
with open('ways_tags.csv','rb') as x4:
    d4 = csv.DictReader(x4) # comma is default delimiter
    t4 = [(i['id'], i['key'],i['value'].decode("utf-8"), i['type']) for i in d4]

q4='''
INSERT INTO ways_tags(id,key,value,type) 
VALUES (?,?,?,?);
'''
cur.executemany(q4, t4)



## ways_nodes:
with open('ways_nodes.csv','rb') as x5:
    d5 = csv.DictReader(x5) # comma is default delimiter
    t5 = [(i['id'], i['node_id'],i['position']) for i in d5]

q5='''
INSERT INTO ways_nodes(id,node_id,position) 
VALUES (?,?,?);
'''
cur.executemany(q5, t5)


# Persist all inserts made above in one transaction.
conn.commit()


## Using SQLite with Pandas:

In [16]:
import pandas as pd
from IPython.display import display

## Lets have a look at each of the tables:

df=pd.read_sql_query("select * from nodes limit 2;", conn)
df

Unnamed: 0,id,lat,lon,user,uid,version,changeset,timestamp
0,25457954,37.158225,-121.657474,KindredCoda,14293,10,11686320,2012-05-24T03:24:59Z
1,25457967,37.28459,-121.809196,Minh Nguyen,33757,22,47831287,2017-04-16T07:14:57Z


In [15]:
df=pd.read_sql_query("select * from nodes_tags limit 2;", conn)
df

Unnamed: 0,id,key,value,type
0,26027688,highway,traffic_signals,regular
1,26374408,highway,crossing,regular


In [16]:
df=pd.read_sql_query("select * from ways limit 2;", conn)
df

Unnamed: 0,id,user,uid,version,changeset,timestamp
0,4336229,TorieRob,3821658,17,40191384,2016-06-21T21:28:18Z
1,4368967,n76,318696,10,25711458,2014-09-27T18:38:35Z


In [19]:
df=pd.read_sql_query("select * from ways_tags limit 2;", conn)
df

Unnamed: 0,id,key,value,type
0,4336229,name,Ranch Drive,regular
1,4336229,oneway,yes,regular


In [20]:
df=pd.read_sql_query("select * from ways_nodes limit 2;", conn)
df

Unnamed: 0,id,node_id,position
0,4336229,4169891057,0
1,4336229,1080447401,1


In [21]:
## Let's count the no of unique user ids in the nodes table
q='''
select count(distinct uid) from nodes
;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,count(distinct uid)
0,909


In [22]:
## Let's count the no of unique user ids in the ways table
q='''
select count(distinct uid) from ways
;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,count(distinct uid)
0,598


In [23]:
## Let's count the no of nodes
q='''
select count(*) from nodes
;
'''
df=pd.read_sql_query(q,conn)
display(df)

## Let's count the no of ways
q='''
select count(*) from ways;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,count(*)
0,257543


Unnamed: 0,count(*)
0,34265


We can see that there is a larger number of nodes than ways on the map.

In [24]:
## Let's count the no of tagged nodes
q='''
select count(*) from nodes_tags
;
'''
df=pd.read_sql_query(q,conn)
display(df)

## Let's count the no of tagged ways
q='''
select count(*) from ways_tags
;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,count(*)
0,11882


Unnamed: 0,count(*)
0,94278


However, the ways carry far more tags than the nodes: 94,278 versus 11,882, which is nearly 8 times as many.

In [25]:
# Let's look at the different types of enteries and their counts
q='''
select key, count(*) from nodes_tags
group by key
order by count(*) desc limit 20;
'''
df=pd.read_sql_query(q,conn)
display(df)

q='''
select key, count(*) from ways_tags
group by key
order by count(*) desc limit 20;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,key,count(*)
0,highway,2285
1,housenumber,999
2,street,973
3,name,863
4,amenity,624
5,crossing,604
6,city,506
7,postcode,439
8,natural,397
9,source,307


Unnamed: 0,key,count(*)
0,building,19639
1,highway,11131
2,name,6275
3,county,4675
4,name_base,4112
5,name_type,3953
6,cfcc,3686
7,oneway,2418
8,service,2368
9,lanes,2260


In [26]:
# Most common amenities in nodes and ways:
q='''
select value, count(*) from nodes_tags
where key = 'amenity' 
group by value
order by count(*) desc
limit 5;
'''
df=pd.read_sql_query(q,conn)
display(df)

q='''
select value, count(*) from ways_tags
where key = 'amenity' 
group by value
order by count(*) desc
limit 5;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,value,count(*)
0,restaurant,121
1,fast_food,59
2,bench,52
3,cafe,38
4,bicycle_parking,28


Unnamed: 0,value,count(*)
0,parking,309
1,school,73
2,restaurant,29
3,place_of_worship,26
4,fast_food,20


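It looks like the nodes are mostly eateries and hangout places (restaurants, fast food, benches, cafes), while the ways are mostly larger facilities such as parking lots, schools and places of worship.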

In [27]:
## Lets see what kind of highways are there in the nodes
q='''
select value, count(*) from nodes_tags
where key = 'highway'
group by value
order by count(*) desc limit 5;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,value,count(*)
0,turning_circle,848
1,crossing,643
2,traffic_signals,426
3,stop,201
4,bus_stop,69


In [28]:
# Lets try to find out the most popular cuisines in San Jose
q='''
select value, count(*) from nodes_tags
where key = 'cuisine'
group by value
order by count(*) desc limit 5;
'''
df=pd.read_sql_query(q,conn)
display(df)

q='''
select value, count(*) from ways_tags
where key = 'cuisine'
group by value
order by count(*) desc limit 5;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,value,count(*)
0,vietnamese,16
1,sandwich,15
2,chinese,13
3,coffee_shop,13
4,mexican,11


Unnamed: 0,value,count(*)
0,burger,13
1,mexican,9
2,american,2
3,chicken,2
4,korean,2


Asian food is quite popular in San Jose.

In [29]:
q='''
select nodes.user 
from nodes join nodes_tags on nodes.id=nodes_tags.id
where key = 'cuisine' and value = 'vietnamese'
;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,user
0,n76
1,Minh Nguyen
2,Minh Nguyen
3,Minh Nguyen
4,Minh Nguyen
5,Minh Nguyen
6,Minh Nguyen
7,Minh Nguyen
8,Minh Nguyen
9,Minh Nguyen


So, almost all of the Vietnamese restaurants listed above were mapped by a single user, Minh Nguyen. This tells us who contributed the data rather than how popular the cuisine is.

In [30]:
q='''
select nodes.user 
from nodes join nodes_tags on nodes.id=nodes_tags.id
where key = 'cuisine' and value = 'chinese'
;
'''
df=pd.read_sql_query(q,conn)
df

Unnamed: 0,user
0,andyyue
1,xybot
2,xybot
3,Walk and walk around
4,YC Chao
5,lyiu
6,Minh Nguyen
7,AndrewSnow
8,stone43
9,Jon Bale


In contrast, the Chinese restaurants were mapped by many different users, so there are probably a lot of people who like Chinese food.

Discrepancies encountered:

A.There are some discrepancies in the postcode:
1.	u'94087\u200e': Postcode followed by an invisible Unicode character (U+200E, left-to-right mark)
2.	‘95014’: Postcode without extension, however, extension doesn’t serve much purpose 
3.	'95014-030': Postcode with wrong extension
4.	'95014-0438': Postcode with correct extension

B. Street names:

- 'Blvd': set(['Los Gatos Blvd', 'Palm Valley Blvd']),
- 'Dr': set(['Samaritan Dr']),
- 'Ln': set(['Barber Ln', 'Branham Ln']),
- 'Ave': set(['1425 E Dunne Ave', 'Greenbriar Ave','N Blaney Ave','W Washington Ave']),
- '6': set(['Pruneridge Ave #6']),
- '81': set(['Concourse Dr #81']),

C. The ‘other’ category of k values in the tag element has a wide variety of issues that would require quite a few custom functions to clean.
The benefit would be that the comprehensibility of the data would increase dramatically.
Some of the examples:

     ‘gnis:Class',
     'gnis:County',
     'gnis:ST_num',
     'gnis:ST_alpha',
     'gnis:County_num'

     'tiger:name_base_1',
     'tiger:name_base_2',
     'tiger:name_type_1',
     'turn:lanes:both_ways',
     'tiger:MTFCC',
     'tiger:RTTYP',
     'tiger:LINEARID',


Besides that there are some garbage data:

    - 'service:bicycle:chain_tool',
    - 'FIXME',
    - 'socket:type1',
    - 'socket:type1_chademo',
    - 'socket:type1_combo',
    - 'service:bicycle:pump'