# OPEN STREET MAP DATA CASE STUDY

MAP AREA:
San Francisco, CA, United States

https://mapzen.com/data/metro-extracts/metro/san-francisco_california/



I chose San Francisco because I am quite familiar with the Bay Area. I like the layout of the city, and it's the tech hub of the world.


In [70]:
import csv
import codecs
import re
import xml.etree.cElementTree as ET
import cerberus
import schema


# Matches keys that start with a run of lowercase letters/underscores, then a
# colon, then another such run (e.g. "addr:street"). Only the start is anchored.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Matches any one of the listed punctuation or whitespace characters.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Detects an underscore that sits between two word characters. It is used by
# remove_underscores to decide whether a string needs underscore cleaning.
UNDER_SCORE = re.compile(r'([a-zA-Z0-9_])_([a-zA-Z0-9_])', re.IGNORECASE)

# Detects a semicolon with a word character or whitespace on each side. It is
# used by remove_semicolons to decide whether a string needs to be truncated.
SEMI_COLON = re.compile(r'([a-zA-Z0-9_ \t\n\r\f\v]);([a-zA-Z0-9_ \t\n\r\f\v])')

# Validation schema taken from the imported `schema` module; it is the default
# schema argument of validate_element.
SCHEMA = schema.schema

In [72]:
#This is a function utilized in the shape element function to remove underscores from the k and v attributes of tag 
#elements
def remove_underscores(input):
    """Return ``input`` with every underscore turned into a space.

    The replacement only happens when UNDER_SCORE finds an underscore that
    sits between two word characters; otherwise ``input`` comes back as is.
    """
    if UNDER_SCORE.search(input) is None:
        return input
    return input.replace('_', ' ')

In [73]:
#This is a function utilized in the shape element function to remove semi colons from the k and v attributes of tag 
#elements. 

def remove_semicolons(input):
    """Return the part of ``input`` that precedes its first semicolon.

    The string is only cut when SEMI_COLON matches somewhere in it; otherwise
    ``input`` is returned unchanged.
    """
    if SEMI_COLON.search(input) is None:
        return input
    # Everything from the first ";" onwards is discarded.
    leading_part = input.partition(";")[0]
    return leading_part

In [74]:
#This cell is used to clean the street names for both the node and way elements

# Captures the last whitespace-delimited token of a street name (for example
# "St." in "Main St."); audit_street_type and update_name treat it as the
# street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that audit_street_type accepts without recording them.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# Street type as found in the data -> replacement applied by update_name.
# Lookups are exact, so each spelling/casing variant needs its own entry.
mapping = { "St" : "Street",
            "St." : "Street",
            "Rd.": "Road",
            "Ave" : "Avenue",
            "st" : "Street",
            "st." : "Street",
            "Blvd." : "Boulevard",
            "Blvd" : "Boulevard",
            "street" : "Street",
            "avenue" : "Avenue",
            "parkway" : "Parkway"}


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    ``street_types`` is updated in place: the trailing token found by
    ``street_type_re`` becomes the key and ``street_name`` is added to the set
    stored under it. Names whose type is listed in ``expected``, or that have
    no match at all, are ignored.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether ``elem`` carries the key ``addr:street`` in its ``k`` attribute."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect the street names in ``osmfile`` whose street type is unexpected.

    Every ``addr:street`` tag found under a node or way is passed to
    ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street type to the set
        of street names that end with it.
    """
    # Imported locally so the function does not depend on an import that
    # appears only later in the file.
    from collections import defaultdict

    street_types = defaultdict(set)
    # The context manager closes the file even if parsing raises.
    with open(osmfile, "r") as osm_file:
        # Listen for "end" events: on "start" the parser has only seen the
        # opening tag, so the <tag> children may not be attached yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    """Return ``name`` with its street type replaced according to ``mapping``.

    The street type is the trailing token matched by ``street_type_re``. When
    that token is not a key of ``mapping`` (or nothing matches), ``name`` is
    returned untouched.
    """
    found = street_type_re.search(name)
    if not found:
        return name

    street_type = found.group()
    if street_type not in mapping.keys():
        return name

    return re.sub(street_type_re, mapping[street_type], name)


#Below function used to execute the cleaning of the street names
def test():
    """Audit OSM_FILE and run update_name over every flagged street name.

    The corrected names are computed and then discarded; nothing is printed
    or returned.
    """
    flagged = audit(OSM_FILE)

    for _, street_names in flagged.iteritems():
        for street in street_names:
            update_name(street, mapping)


        


In [75]:
# Paths of the five CSV files that process_map writes.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"



# Make sure the fields order in the csvs matches the column order in the sql table schema
# Each list is passed to a UnicodeDictWriter as the header/column order of the
# matching CSV file.
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _attribs_with_placeholder(element, fields):
    """Copy ``fields`` from the element's attributes, using "9999999" when one is missing."""
    return {field: element.attrib.get(field, "9999999") for field in fields}


def _shape_clean_tag(tag, parent_id, problem_chars, default_tag_type):
    """Build the cleaned row dict for one <tag> child, or None when it must be skipped."""
    raw_key = tag.attrib['k']
    raw_value = tag.attrib['v']

    # Tags with problem characters in either the key or the value are dropped.
    if problem_chars.search(raw_key) or problem_chars.search(raw_value):
        return None

    key = remove_semicolons(remove_underscores(raw_key))
    value = remove_semicolons(remove_underscores(raw_value))

    if LOWER_COLON.search(raw_key):
        # Text before the first colon is the type; everything after it, further
        # colons included, stays in the key ("a:b:c:d" -> type "a", key "b:c:d").
        tag_type, _, key = key.partition(":")
    else:
        tag_type = default_tag_type

    return {"id": parent_id, "key": key, "value": value, "type": tag_type}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: Parsed XML element; only ``node`` and ``way`` tags are shaped.
        node_attr_fields: Attribute names copied from a node element.
        way_attr_fields: Attribute names copied from a way element.
        problem_chars: Compiled regex; a tag whose key or value matches it is
            left out of the result.
        default_tag_type: Type given to tags whose key has no "type:" prefix.

    Returns:
        For a node: ``{'node': attribs, 'node_tags': [tag rows]}``.
        For a way: ``{'way': attribs, 'way_nodes': [nd rows], 'way_tags': [tag rows]}``.
        For any other element: ``None``.

        Attributes missing from the element are filled with "9999999". Tag
        keys and values have underscores replaced by spaces and anything after
        a semicolon removed (see remove_underscores / remove_semicolons).
    """
    parent_id = element.attrib.get('id', "9999999")

    if element.tag == 'node':
        node_attribs = _attribs_with_placeholder(element, node_attr_fields)
        tags = []
        for child in element:
            # Only <tag> children carry the k/v attributes read below.
            if child.tag != "tag":
                continue
            row = _shape_clean_tag(child, parent_id, problem_chars, default_tag_type)
            if row is not None:
                tags.append(row)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = _attribs_with_placeholder(element, way_attr_fields)
        way_nodes = []
        tags = []
        position = 0
        for child in element:
            if child.tag == "nd":
                way_node = {'id': parent_id}
                # The position only advances for <nd> children that have a ref.
                if child.get("ref"):
                    way_node["node_id"] = child.attrib['ref']
                    way_node['position'] = position
                    position += 1
                way_nodes.append(way_node)
            elif child.tag == "tag":
                row = _shape_clean_tag(child, parent_id, problem_chars, default_tag_type)
                if row is not None:
                    tags.append(row)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every completed element whose tag is listed in ``tags``."""
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the start of the document root; keep a handle on it.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Once the consumer is done with the element, empty the root so
        # already-processed children are not kept around.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema

    Only the first failing field reported by ``validator.errors`` is included
    in the error message.
    """
    if validator.validate(element, schema) is True:
        return

    field, errors = next(validator.errors.iteritems())

    lines = []
    for k, v in errors.iteritems():
        # A plain string is shown as is; anything else is joined with commas.
        detail = v if isinstance(v, str) else ", ".join(v)
        lines.append("{0}: {1}".format(k, detail))

    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise cerberus.ValidationError(template.format(field, "\n".join(lines)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        """Write one row, encoding every unicode value to UTF-8 bytes first."""
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write each row in ``rows`` through writerow so it gets encoded too."""
        for entry in rows:
            self.writerow(entry)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream ``file_in`` and write the shaped nodes and ways to the five CSV files.

    When ``validate`` is True, every shaped element is checked with
    validate_element before it is written.
    """
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        # One writer per output file, each with its own column order.
        targets = (
            (nodes_file, NODE_FIELDS),
            (nodes_tags_file, NODE_TAGS_FIELDS),
            (ways_file, WAY_FIELDS),
            (way_nodes_file, WAY_NODES_FIELDS),
            (way_tags_file, WAY_TAGS_FIELDS),
        )
        writers = [UnicodeDictWriter(handle, fields) for handle, fields in targets]
        (nodes_writer, node_tags_writer, ways_writer,
         way_nodes_writer, way_tags_writer) = writers

        for writer in writers:
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    # NOTE(review): OSM_FILE is not assigned in the code shown here - confirm
    # it is defined before this guard runs.
    process_map(OSM_FILE, validate=True)


# PROBLEMS ENCOUNTERED IN YOUR MAP

PROBLEMS ENCOUNTERED DURING WRANGLING

Prologue: In the samples of the data I observed, the data was rather tidy in its raw form. One reason might be that the San Francisco area has a higher level of computer literacy than most other places in the world, so contributors may be more careful when adding data to the OSM project. However, the id columns in the CSV files were not unique, so I do not have a primary key in either the nodes database or the ways database.

1) Underscores

Several entries in various columns contained underscores where the value would normally be written as two words. Sometimes these values were entered as two words separated by a space, and other times as two words separated only by an underscore. I wrote a function that replaces underscores with spaces to make the data more uniform and more intuitive to query.

2) Semi colons

In a handful of instances found while auditing the data, a value appeared to contain two entries: the value itself, followed by a semi-colon and either a second value or additional information supplementing what was to the left of the semi-colon. Using a regular expression to detect these entries, I kept only the text to the left of the first semi-colon and discarded the rest, to make the data more uniform and cleaner to enter into the database.


3) Street Names

During auditing, I extended the existing street-name functions to clean the values of street keys, making the data cleaner and more uniform.

In [1]:
from collections import defaultdict

#Functions used to audit the data

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate each fully parsed element of ``osm_file`` whose tag is in ``tags``."""
    parser = ET.iterparse(osm_file, events=('start', 'end'))
    first_event = next(parser)
    # The element of the first event is the document root.
    root = first_event[1]
    for action, current in parser:
        wanted = action == 'end' and current.tag in tags
        if wanted:
            yield current
            # Empty the root after each yielded element so processed
            # children are not kept around.
            root.clear()

def jb_unique_k_and_v(osmfile, tagtype):
    """Collect every distinct tag value seen for each tag key.

    Only elements whose tag equals ``tagtype`` are inspected. The number of
    distinct keys and a separator line are printed as a sanity check.

    Args:
        osmfile: Path of the OSM XML file to read.
        tagtype: Element tag to inspect (compared against ``element.tag``).

    Returns:
        A ``defaultdict(set)`` mapping each ``k`` attribute to the set of
        ``v`` attributes found with it.
    """
    unique_k_and_v = defaultdict(set)
    # The file must stay open while the get_element generator is consumed;
    # the context manager closes it afterwards, even on a parse error.
    with open(osmfile, "r") as osm_file:
        for element in get_element(osm_file):
            if element.tag == tagtype:
                for tag in element.iter("tag"):
                    unique_k_and_v[tag.attrib['k']].add(tag.attrib['v'])
    print(len(unique_k_and_v))  # this is a check
    print("--------------------")
    return unique_k_and_v

# OVERVIEW OF THE DATA

1) Is the OSM XML large enough? 
    - The file I used was roughly 945,000 KB (about 945 MB).

2) Are overview statistics of the dataset computed?

-size of database file

PRAGMA PAGE_SIZE
1,024
PRAGMA PAGE_COUNT
528,429 

= 541 MB


-number of unique users

SELECT count(distinct(users_ways.uid)) FROM (SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) users_ways;
2422 users


-number of nodes

SELECT count(*) FROM nodes;
4,514,170 nodes


-number of ways

SELECT count(*) FROM ways;
520,961 ways



-TOP 10 most common values from nodes_tags table

SELECT value, count(*) as sum FROM nodes_tags GROUP BY value ORDER BY sum desc LIMIT 10;
JOSM|12733
crossing|7749
CA|6408
yes|6113
turning circle|5969
traffic signals|4913
US|4035
tree|3746
Berkeley|3535
no|3422


-TOP 10 most common values from way_tags table

SELECT value, count(*) as sum FROM ways_tags GROUP BY value ORDER BY sum desc LIMIT 10;
yes|421565
no|58228
A41|44081
residential|39431
service|17971
St|11787
footway|11116
Bing|10857
Ave|10280
secondary|8662


How many node entries have a latitude that is above the mean latitude?

SELECT count(*) as sum FROM nodes, (SELECT avg(nodes.lat) as av FROM nodes) as subq WHERE lat > av;
2,620,438 node entries


Most common 15 cities in the data?

SELECT key, value, count(*) as sum FROM (SELECT key, value FROM nodes_tags UNION ALL SELECT key, value FROM ways_tags) WHERE key = "city" GROUP BY value ORDER BY sum desc LIMIT 15;
city|Berkeley|5618
city|Piedmont|3812
city|Richmond|1350
city|Oakland|1335
city|Burlingame|199
city|Albany|186
city|Alameda|124
city|Hayward|78
city|Pacifica|75
city|Sausalito|56
city|Emeryville|39
city|Belmont|34
city|Colma|30
city|Fremont|18
city|Alamo|17

15 Least popular cuisine choices?

Select key, value, count(*) as sum FROM (SELECT key, value FROM nodes_tags UNION ALL SELECT key, value FROM ways_tags) WHERE key = "cuisine" GROUP BY value ORDER BY sum asc LIMIT 15;
cuisine|Asian|1
cuisine|BBQ|1
cuisine|Bakery|1
cuisine|Boba|1
cuisine|Cafe|1
cuisine|California|1
cuisine|Chicken and waffles|1
cuisine|Comfort Food|1
cuisine|Cucina Americana|1
cuisine|Deli|1
cuisine|El Salvadorean|1
cuisine|French and American|1
cuisine|French-style cafe|1
cuisine|Fruit Smoothies|1
cuisine|Hawaiian|1












# OTHER IDEAS ABOUT THE DATASETS


In [None]:
1) Are ideas for additional improvements included?
    - Submission document includes one or more additional suggestions for improving and analyzing the data.

    1) County in way_tags
    
    In the way_tags data, county values are written in the form "county, CA". This could cause confusion in later queries.
    I saw this form in most, if not all, of the county values I observed, but only in the sample of the
    data file I used for auditing. County values could take a different form in other
    parts of the file.
    
    2) phone numbers
    
    The phone numbers appear in many different formats. They could be cleaned to glean additional location-based information
    from the dataset. However, is the effort of auditing and cleaning them worth whatever information the phone numbers
    might provide, given that the zip code values are probably much more exact and useful? Depending on the research
    question, an analyst may want to thoroughly audit and clean the phone numbers. It is simply a matter of whether the
    effort is worth it for the project.
    

2) Are benefits and problems with additional improvements discussed?

    The benefit of exploring and cleaning these possible problems is that it makes the data more uniform and could
    increase the amount of information we can glean from the dataset. However, are the additional effort and time
    required worth it, given what we are looking to find in the data?

# Python code for "Case Study: OpenStreetMap Data" quizzes

In [3]:
# Iterative Parsing

import xml.etree.cElementTree as ET
import pprint

def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: Path (or file object) handed to ``ET.iterparse``.

    Returns:
        A dict mapping each tag name to its number of occurrences.
    """
    tag_counts = {}
    for _, elem in ET.iterparse(filename):
        tag_counts[elem.tag] = tag_counts.get(elem.tag, 0) + 1
        # Only the tag name is needed, so drop the element's contents.
        elem.clear()
    return tag_counts



In [None]:
# Tag Types

def key_type(element, keys):
    """Increment the counter in ``keys`` that matches the element's ``k`` attribute.

    Only ``tag`` elements are counted. The key is tested against ``lower``,
    ``lower_colon`` and ``problemchars`` in that order; the first pattern that
    matches decides the counter, and "other" is used when none matches.
    ``keys`` is updated in place and also returned.
    """
    if element.tag != "tag":
        return keys

    k = element.attrib["k"]
    if re.search(lower, k):
        bucket = "lower"
    elif re.search(lower_colon, k):
        bucket = "lower_colon"
    elif re.search(problemchars, k):
        bucket = "problemchars"
    else:
        bucket = "other"

    keys[bucket] += 1
    return keys

In [None]:
# Exploring Users

def get_user(element):
    """Return the ``uid`` attribute of a node, way or relation; None for other tags."""
    if element.tag in ("node", "way", "relation"):
        return element.attrib["uid"]
    return None

def process_map(filename):
    """Return the set of user ids that get_user reports for the elements of ``filename``."""
    users = set()
    for _, element in ET.iterparse(filename):
        uid = get_user(element)
        # Elements for which get_user returns None are skipped.
        if uid is not None:
            users.add(uid)
    return users

In [None]:
# Improving Street Names

def update_name(name, mapping):
    """Swap the trailing street type of ``name`` for its ``mapping`` entry, if any.

    The street type is whatever ``street_type_re`` matches; names without a
    match, or whose street type is not a key of ``mapping``, come back unchanged.
    """
    hit = street_type_re.search(name)
    suffix = hit.group() if hit else None
    if hit and suffix in mapping.keys():
        name = re.sub(street_type_re, mapping[suffix], name)
    return name

In [None]:
#Preparing For Database

import csv
import codecs
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

# Input file handed to process_map by the __main__ guard below.
OSM_PATH = "example.osm"

# Paths of the five CSV files that process_map writes.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Matches keys that start with a run of lowercase letters/underscores, then a
# colon, then another such run (e.g. "addr:street"). Only the start is anchored.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Matches any one of the listed punctuation or whitespace characters.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema taken from the imported `schema` module; it is the default
# schema argument of validate_element.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
# Each list is passed to a UnicodeDictWriter as the header/column order of the
# matching CSV file.
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def _shape_raw_tag(tag, parent_id, problem_chars, default_tag_type):
    """Build the row dict for one <tag> child, or None when its key has problem characters."""
    key = tag.attrib['k']
    if problem_chars.search(key):
        return None

    if LOWER_COLON.search(key):
        # Text before the first colon is the type; everything after it, further
        # colons included, stays in the key ("a:b:c" -> type "a", key "b:c").
        tag_type, _, key = key.partition(":")
    else:
        tag_type = default_tag_type

    return {"id": parent_id, "key": key, "value": tag.attrib["v"], "type": tag_type}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: Parsed XML element; only ``node`` and ``way`` tags are shaped.
        node_attr_fields: Attribute names copied from a node element.
        way_attr_fields: Attribute names copied from a way element.
        problem_chars: Compiled regex; a tag whose key matches it is left out
            of the result.
        default_tag_type: Type given to tags whose key has no "type:" prefix.

    Returns:
        For a node: ``{'node': attribs, 'node_tags': [tag rows]}``.
        For a way: ``{'way': attribs, 'way_nodes': [nd rows], 'way_tags': [tag rows]}``.
        For any other element: ``None``.

        Only attributes actually present on the element are copied. Every tag
        row has the keys ``id``, ``key``, ``value`` and ``type``.
    """
    if element.tag == 'node':
        node_attribs = {
            field: element.attrib[field]
            for field in node_attr_fields
            if field in element.attrib
        }
        tags = []
        for child in element:
            # Only <tag> children carry the k/v attributes read by the helper.
            if child.tag != "tag":
                continue
            row = _shape_raw_tag(child, element.attrib["id"], problem_chars,
                                 default_tag_type)
            if row is not None:
                tags.append(row)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {
            field: element.attrib[field]
            for field in way_attr_fields
            if field in element.attrib
        }
        way_nodes = []
        tags = []
        position = 0
        for child in element:
            if child.tag == "nd":
                way_node = {'id': element.attrib['id']}
                # The position only advances for <nd> children that have a ref.
                if child.get("ref"):
                    way_node["node_id"] = child.attrib['ref']
                    way_node['position'] = position
                    position += 1
                way_nodes.append(way_node)
            elif child.tag == "tag":
                row = _shape_raw_tag(child, element.attrib["id"], problem_chars,
                                     default_tag_type)
                if row is not None:
                    tags.append(row)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield, one at a time, the completed elements of ``osm_file`` whose tag is in ``tags``."""
    context = ET.iterparse(osm_file, events=('start', 'end'))
    root_event = next(context)
    # The first event belongs to the document root.
    document_root = root_event[1]
    # Lazily narrow the event stream down to elements that have been closed.
    completed = (elem for event, elem in context if event == 'end')
    for elem in completed:
        if elem.tag in tags:
            yield elem
            # Empty the root so elements already handed out are not kept.
            document_root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema

    The message names the first field reported in ``validator.errors`` and
    lists each of its errors on a separate line.
    """
    is_valid = validator.validate(element, schema)
    if is_valid is not True:
        field, errors = next(validator.errors.iteritems())
        described = [
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.iteritems()
        ]
        report = "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, "\n".join(described)
        )
        raise cerberus.ValidationError(report)


class UnicodeDictWriter(csv.DictWriter, object):
    """DictWriter that converts unicode values to UTF-8 byte strings on write"""

    def writerow(self, row):
        """Encode the unicode values of ``row`` and pass it to csv.DictWriter."""
        def as_bytes(cell):
            return cell.encode('utf-8') if isinstance(cell, unicode) else cell

        payload = dict((name, as_bytes(cell)) for name, cell in row.iteritems())
        super(UnicodeDictWriter, self).writerow(payload)

    def writerows(self, rows):
        """Send every row through writerow so each one is encoded."""
        for single_row in rows:
            self.writerow(single_row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Shape every node and way of ``file_in`` and write the rows to the csv files.

    With ``validate`` set to True each shaped element goes through
    validate_element before being written.
    """
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # Every file starts with its header row.
        for header_writer in (nodes_writer, node_tags_writer, ways_writer,
                              way_nodes_writer, way_tags_writer):
            header_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            kind = element.tag
            if kind == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif kind == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    # Runs the conversion on OSM_PATH with schema validation turned off.
    process_map(OSM_PATH, validate=False)


# RESOURCES

1)

https://discussions.udacity.com/t/creating-db-file-from-csv-files-with-non-ascii-unicode-characters/174958/45

2) 

https://discussions.udacity.com/t/project-problem-cant-get-through-validate-element-el-validator/179544/37

3)

https://docs.python.org/2/library/re.html

4)

https://discussions.udacity.com/t/preparing-for-database-have-i-located-the-ballpark/188800/20


