# Open Street Map（data exploring and cleaning)

### sampling dataset

In [1]:
OSM_FILE = "d:/Udacity/DataAnalysis/senior/project/openstreetmaps/zhusanjiao.osm"
SAMPLE_FILE = "d:/Udacity/DataAnalysis/senior/project/openstreetmaps/SAMPLE_FILE.osm"
PATH = "d:/Udacity/DataAnalysis/senior/project/openstreetmaps/"

In [2]:
import csv
import codecs
import pprint
import re
import xml.etree.ElementTree as ET
from collections import defaultdict

import cerberus

from schema import *

In [3]:
#对数据进行采样，得到SAMPLE_FILE
k = 35
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version = "1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')
    
    for i,element in enumerate(get_element(OSM_FILE)):
        if i%k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))
    output.write('</osm>')

### 检查标签

In [8]:
def count_tags(file_name):
    tags = {}
    for _, elem in ET.iterparse(file_name):
        if elem.tag in tags:
            tags[elem.tag] += 1
        else:
            tags[elem.tag] = 1
    return tags


tags = count_tags(OSM_FILE)
pprint.pprint(tags)

{'bounds': 1,
 'member': 159535,
 'meta': 1,
 'nd': 1889994,
 'node': 1624147,
 'note': 1,
 'osm': 1,
 'relation': 3142,
 'remark': 1,
 'tag': 514129,
 'way': 178282}


### 审查街道

In [32]:
expected = [ u'路',u'街',u'道',u'巷']
st_name_zh = re.compile(ur".*?([\u4E00-\u9FA5]+)") #匹配中文字符

In [33]:
#判断字段为街道
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [34]:
#检查街道名
def audit_street_type(street_types, street_name):
    m = st_name_zh.match(street_name)
    if m:
        street_type = m.group()
        if street_type[-1:] not in expected:        #使用[-1:]，符合中文街道名称习惯，街道类型一般为最后一个字确定，如“XXX街”
            street_types[street_type[-1]].add(street_name)

In [35]:
#遍历文件审查街道名
def audit_street(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osmfile, events = ("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

In [40]:
#因为中文街道名称规范和英文不同，很难通过对某一个字段来判断街道名，所以选择遍历所有文件
types = audit_street(OSM_FILE)

In [39]:
for _, j in types.items():
    for i in j:
        print i

香港新界上水河上鄉95約快景路
馬涌西涌邊
广州大道北 Guangzhou Ave N
江门市新会区冈州大道东
冈州大道东
江南大道中
芳村大道中
广州大道中
三元里大道中
布吉大芬
布吉
星河CoCoPark
f福禄沙桂轩洲
大埔工业区
布吉街道大芬社區
中康路卓越城北區
福中路市民中心A区
海岸城購物中心2層203-205號
海岸城中洲控股金融中心B座2樓
China, Guangdong Sheng, Guangzhou Shi, Tianhe Qu, TianHe GongYuan, Tianhe Rd, 太古汇L307号, 邮政编码: 510610
一坊
下白石三坊
里仁坊
下沙村八坊
长寿里
长兴里
沙坪镇园外苑20号
高新科技園高新南7道
广州市南沙区环市大道南2号南沙资讯科技园
白云大道南
合新路九號
大芬油畫村
下塘新村
鹤山市雅瑶镇直水村
佛平2路
西樵鎮漁耕粵韵
山村果围
广州大学理学实验楼
黄埔大道西
中山大道西
大沙地西


In [83]:
#通过对上面的观察，如果想得到统一格式的街道名，去除掉本身不是街道名称的干扰数据，还有就是符合街道名字的也带有多余的信息，需要更新完善
#非街道名的忽略，街道名通过mapping字典进行更新
mapping = {u'白云大道南':u'白云大道', 
           u'三元里大道中':u'三元里大道',
           u'福中路市民中心':u'福中路',
           u'中山大道西':u'中山大道',
           u'广州市南沙区环市大道南':u'环市大道',
           u'布吉街道大芬社區':u'布吉街道',
           u'合新路九號':u'合新路', 
           u'冈州大道东':u'冈州大道',
           u'江门市新会区冈州大道':u'冈州大道',
           u'芳村大道中':u'芳村大道',
           u'中康路卓越城北區':u'中康路',
           u'江南大道中':u'江南大道',
           u'广州大道中':u'广州大道',
           u'中山大道西':u'中山大道',
           u'广州大道北':u'广州大道',
           u'白云大道南':u'白云大道',
           u'黄埔大道西':u'黄埔大道',
           u'芳村大道中':u'芳村大道',
           u'高新科技園高新南':u'高新路',
           u'佛平':u'佛平二路'
           }
def update_name(name,mapping):
    zh_name = st_name_zh.match(name)
    problemname = mapping.keys()
    # 先对名字进行判断，如果街道名字包含中英文，先去除英文
    if zh_name:
        name = zh_name.group(0)  
        for key in problemname:
            if key in name:
                name = name.replace(key, mapping[key])
    else:
        name = None
    return name
    
for  _, ways in types.items():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, ">" , better_name

香港新界上水河上鄉95約快景路 > 香港新界上水河上鄉
馬涌西涌邊 > 馬涌西涌邊
广州大道北 Guangzhou Ave N > 广州大道
江门市新会区冈州大道东 > 冈州大道
冈州大道东 > 冈州大道
江南大道中 > 江南大道
芳村大道中 > 芳村大道
广州大道中 > 广州大道
三元里大道中 > 三元里大道
布吉大芬 > 布吉大芬
布吉 > 布吉
星河CoCoPark > 星河
f福禄沙桂轩洲 > f福禄沙桂轩洲
大埔工业区 > 大埔工业区
布吉街道大芬社區 > 布吉街道
中康路卓越城北區 > 中康路
福中路市民中心A区 > 福中路
海岸城購物中心2層203-205號 > 海岸城購物中心
海岸城中洲控股金融中心B座2樓 > 海岸城中洲控股金融中心
China, Guangdong Sheng, Guangzhou Shi, Tianhe Qu, TianHe GongYuan, Tianhe Rd, 太古汇L307号, 邮政编码: 510610 > China, Guangdong Sheng, Guangzhou Shi, Tianhe Qu, TianHe GongYuan, Tianhe Rd, 太古汇
一坊 > 一坊
下白石三坊 > 下白石三坊
里仁坊 > 里仁坊
下沙村八坊 > 下沙村八坊
长寿里 > 长寿里
长兴里 > 长兴里
沙坪镇园外苑20号 > 沙坪镇园外苑
高新科技園高新南7道 > 高新路
广州市南沙区环市大道南2号南沙资讯科技园 > 环市大道
白云大道南 > 白云大道
合新路九號 > 合新路
大芬油畫村 > 大芬油畫村
下塘新村 > 下塘新村
鹤山市雅瑶镇直水村 > 鹤山市雅瑶镇直水村
佛平2路 > 佛平二路
西樵鎮漁耕粵韵 > 西樵鎮漁耕粵韵
山村果围 > 山村果围
广州大学理学实验楼 > 广州大学理学实验楼
黄埔大道西 > 黄埔大道
中山大道西 > 中山大道
大沙地西 > 大沙地西


### 审查城市名称

In [51]:
#所选区域包含城市仅有以下几个，非下列城市的都属于超出范围
expected_city = [u'深圳市',u'广州市',  u'东莞市',  u'佛山市',  u'中山市',  u'江门市'] 
#匹配中文字符
city_name_zh = re.compile(ur".*?([\u4E00-\u9FA5]+)")

In [53]:
#检查标签为城市名称
def is_city_name(elem):
    return (elem.attrib['k'] == 'addr:city')

In [57]:
def audit_city_name(osmfile):
    osm_file = open(osmfile,"r")
    citynames = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events = ("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city_name(tag):
                    match_city = city_name_zh.match(tag.attrib['v'])
                    if match_city != None:
                        city = match_city.group(1)
                        if city not in expected_city:
                            citynames[city].add(tag.attrib['v'])
                            
    osm_file.close()
    return citynames

In [58]:
audit_city = audit_city_name(OSM_FILE)

In [59]:
for i in audit_city:
    print i

北京街道
常平镇
大运新城
广州市天河区
深圳
广州荔湾区解放中路街道
广东省深圳市
大塘街道
東莞市長安鎮
沙頭角
深圳市罗湖区
六榕街道
廣州
广州市天河区珠江新城
广东省广州市
东东莞
东莞
石岐区街道
白云区
佛山
龙江镇
广州市白云区
深圳市宝安区
深圳市福田区
佛山市高明区
广州
沙坪市
廣州市
沙溪镇
佛山市南海區
新会区
东莞生态园
龙岗中心城
体育新城


In [84]:
city_mapping = {u'深圳':u'深圳市',   
           u'龙岗中心城':u'深圳市',
           u'龙岗中':u'深圳市',
           u'沙頭角':u'深圳市',
           u'大运新城':u'深圳',
           u'体育新城':u'深圳市',
           u'六榕街道':u'广州',
           u'大塘街道':u'广州',
           u'白云区':u'广州',
           u'廣州':u'广州',
           u'廣州市':u'广州市',
           u'北京街道':u'广州',
           u'广州荔湾区解放中路街道':u'广州',
           u'广州':u'广州市',
           u'佛山':u'佛山市',
           u'佛山市南海區':u'佛山市',
           u'龙江镇':u'佛山',
           u'東莞市長安鎮':u'东莞',
           u'常平镇':u'东莞市',
           u'东莞生态园':u'东莞',
           u'东莞':u'东莞市',
           u'东东莞':u'东莞',
           u'新会区':u'江门市',
           u'沙坪市':u'江门市',
           u'石岐区街道':u'中山市',
           u'沙溪镇':u'中山市'
            }


def update_city_name(name,mapping):
    problemname = mapping.keys()
    zh_name = city_name_zh.match(name)
    if zh_name:
        if name[:3] in expected_city:    #这里的[:3]和下面的[-3:]使用的原因是中国地名的书写习惯，通过提取前三个字或者后三个字，能够减少工作量
            name = name[:3]
        elif name[-3:] in expected_city:
            name = name[-3:]
        else:
            for key in problemname:
                if key in name:
                    name = name.replace(key, mapping[key])[:3]
    else:
        name = None
    return name
    
for  _, citys in audit_city.items():
    for name in citys:
        better_name = update_city_name(name, city_mapping)
        print name, ">" , better_name

北京街道 > 广州市
常平镇 (Changping) > 东莞市
大运新城 > 深圳市
广州市天河区 > 广州市
深圳 > 深圳市
深圳 Shenzhen > 深圳市
广州荔湾区解放中路街道 > 广州市
广东省深圳市 > 深圳市
大塘街道 > 广州市
東莞市長安鎮 > 东莞市
沙頭角 Sha Tau Kok > 深圳市
深圳市罗湖区 > 深圳市
六榕街道 > 广州市
廣州 > 广州市
广州市天河区珠江新城 > 广州市
广东省广州市 > 广州市
东东莞 > 东莞市
东莞 > 东莞市
石岐区街道 (Shiqi) > 中山市
白云区 > 广州市
佛山 > 佛山市
龙江镇 > 佛山市
广州市白云区 > 广州市
深圳市宝安区 > 深圳市
深圳市福田区 > 深圳市
佛山市高明区 > 佛山市
广州 > 广州市
沙坪市 > 江门市
廣州市 > 广州市
沙溪镇 (Nantou) > 中山市
佛山市南海區 > 佛山市
新会区 > 江门市
东莞生态园 > 东莞市
龙岗中心城 > 深圳市
体育新城 > 深圳市


### 更新数据，写入csv

In [85]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files.  These csv files can then easily be
imported to a SQL database as tables.

The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

We've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. Your task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (see
the schema.py file in the last code tab) for the .csv files and the eventual tables. Using the 
cerberus library we can validate the output against this schema to ensure it is correct.

## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}

The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored

The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
        is not present.

Additionally,

- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
  the tag key. For example:

  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}

- If a node has no secondary tags then the "node_tags" field should just contain an empty list.

The final return value for a "node" element should look something like:

{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}

### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}

The "way" field should hold a dictionary of the following top level way attributes:
- id
-  user
- uid
- version
- timestamp
- changeset

All other attributes can be ignored

The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".

Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element

The final return value for a "way" element should look something like:

{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
               {'id': 209809850, 'node_id': 2199822369, 'position': 3},
               {'id': 209809850, 'node_id': 2199822370, 'position': 4},
               {'id': 209809850, 'node_id': 2199822284, 'position': 5},
               {'id': 209809850, 'node_id': 2199822281, 'position': 6}],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': 'West Lexington St.'},
              {'id': 209809850,
               'key': 'street:name',
               'type': 'addr',
               'value': 'Lexington'},
              {'id': '209809850',
               'key': 'street:prefix',
               'type': 'addr',
               'value': 'West'},
              {'id': 209809850,
               'key': 'street:type',
               'type': 'addr',
               'value': 'Street'},
              {'id': 209809850,
               'key': 'building',
               'type': 'regular',
               'value': 'yes'},
              {'id': 209809850,
               'key': 'levels',
               'type': 'building',
               'value': '1'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
"""

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_PATH = OSM_FILE

NODES_PATH = PATH + "nodes.csv"
NODE_TAGS_PATH = PATH +"nodes_tags.csv"
WAYS_PATH = PATH +"ways.csv"
WAY_NODES_PATH = PATH +"ways_nodes.csv"
WAY_TAGS_PATH = PATH +"ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon',  'version', 'changeset', 'timestamp', 'uid', 'user']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""
    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    # YOUR CODE HERE
    for tag in element.iter('tag'):
        if is_street_name(tag):
            better_street_name = update_name(tag.attrib['v'], mapping)
            tag.attrib['v'] = better_street_name
        elif tag.attrib['k'] == 'addr:city':
            better_city_name = update_city_name(tag.attrib['v'], city_mapping)
            tag.attrib['v'] = better_city_name
                            
    if element.tag == 'node':
        for field in NODE_FIELDS:
            node_attribs[field] = element.attrib[field]
        
        for child in element:
            if PROBLEMCHARS.match(child.attrib['k']):
                continue
            
            second_tag = {}
            
            second_tag['id'] = element.attrib['id']
            second_tag['value'] = child.attrib['v']
            
            if LOWER_COLON.match(child.attrib['k']):
                second_tag['type'] = child.attrib["k"].split(':', 1)[0]
                second_tag['key'] = child.attrib["k"].split(':', 1)[1]
            else:
                second_tag['type'] = "regular"
                second_tag['key'] = child.attrib["k"]
                
            tags.append(second_tag)
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        for field in WAY_FIELDS:
            way_attribs[field] = element.attrib[field]
        
        position = 0
        
        for child in element:
            if child.tag == 'tag':
                if PROBLEMCHARS.match(child.attrib['k']):
                    continue
                second_way_tag = {}
                second_way_tag['id'] = element.attrib['id']
                second_way_tag['value'] = child.attrib['v']
                
                if LOWER_COLON.match(child.attrib['k']):
                    second_way_tag['type'] = child.attrib['k'].split(':', 1)[0]
                    second_way_tag['key'] = child.attrib['k'].split(':', 1)[1]
                else:
                    second_way_tag['type'] = "regular"
                    second_way_tag['key'] = child.attrib['k']
                
                tags.append(second_way_tag)
                
            elif child.tag == 'nd':
                way_nd = {}
                way_nd['id'] = element.attrib['id']
                way_nd['node_id'] = child.attrib['ref']
                way_nd['position'] = position
                position += 1
                
                way_nodes.append(way_nd)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_FILE, validate=False)
