# Import and Declarative Section

In [115]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus
import schema
from collections import defaultdict

In [116]:
# Full OSM extract; OSM_PATH and OSM_FILE name the same file.
OSM_PATH = OSM_FILE = "./dbprep/redmond_bell_kirk.xml"
# Sample file (every k-th top-level element of the full extract).
SAMPLE_FILE = "./dbprep/rbk_samp.xml"
# Input for the audit: SAMPLE_FILE for a DRAFT run, OSM_FILE for the Production Cut.
FILE = SAMPLE_FILE


In [117]:
# Occurrence counters: element tag name -> count, and <tag> "k" attribute -> count.
tag_freq = defaultdict(int)
tag_key = defaultdict(int)
# Scratch containers; not used by any of the cells visible here.
elem_tag = dict()
elem_tags = list()
# Unexpected street type -> set of the full street names that end with it.
street_types = defaultdict(set)

# Step 1: Generate a sample file and perform initial analysis

In [118]:

k = 10  # Parameter: take every k-th top level element


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each completed element whose tag is listed in ``tags``.

    The file is parsed incrementally.  The first parse event supplies the
    root element, and the root is cleared after every yielded element so the
    children parsed so far are released.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(events)
    for kind, node in events:
        # Only fully parsed elements of a requested type are of interest.
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build the sample file: XML prolog, an <osm> root, then every k-th top-level
# element of the full extract.  The file is opened in binary mode, so each
# write is given a bytes literal; b'' is a plain str on Python 2 and bytes on
# Python 3, where writing a text literal to a 'wb' file raises TypeError.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

# Problem with the dataset

Based on my exploratory analysis, the dataset has the following problems:

1) There are names spelled alternatively in foreign languages (unicode) in the data. Although they provide useful information in some context, it is good to trap and filter them out here as they do not necessarily add insight about the dataset for my purpose.<br>
<br>
2) I looked at the top 3 tag keys to scope the data cleansing for this project; they are addr:housenumber, addr:street and addr:postcode. I would like to examine these popular tags and scrub them of any anomalies.<br>
    &emsp;a) There is one entry with alpha numeric housenumber. But then again on searching for the property in google maps, it turned out to be a multi-family home and the housenumber is legitimate. So, this data is not filtered out.<br>
    &emsp;b) The street needs to be standardized as there are different versions of the street types<br>
    &emsp;c) postal codes are both in 5 digit and 10 digit format. For this project I am only considering the first 5 digit of the zipcode.<br>
3) Besides this, I would like to standardize phone numbers, as there are many variations in how they are written.<br>



In [119]:
# Exploratory pass over the full extract: count how often each element tag
# occurs and, for <tag> elements, how often each key ("k" attribute) occurs.
for event, elem in ET.iterparse(OSM_FILE):
    # determine the element tags and their frequency
    tag_freq[elem.tag] += 1

    if elem.tag == 'tag':
        # analyze the Key attribute of the tags
        tag_key[elem.attrib['k']] += 1

# Single-argument print(...) and an index-based sort key are valid on both
# Python 2 and Python 3; the printed text is unchanged.
print("**************** Frequency of the Element Tag ****************************")
pprint.pprint(tag_freq)

print("**************** Top 3 Tag Key ****************************")

# Highest count first; only the three most frequent keys are shown.
for x, y in sorted(tag_key.items(), key=lambda item: item[1], reverse=True)[:3]:
    print("%s %s" % (x, y))

**************** Frequency of the Element Tag ****************************
defaultdict(<type 'int'>, {'node': 328395, 'member': 5146, 'remark': 1, 'nd': 360231, 'tag': 198648, 'bounds': 1, 'note': 1, 'meta': 1, 'relation': 240, 'way': 35416, 'osm': 1})
**************** Top 3 Tag Key ****************************
addr:street 26784
addr:housenumber 26782
addr:postcode 26552


In [120]:
# regex functions

# Key made up only of lowercase letters and underscores.
lower_re = re.compile(r'^([a-z]|_)*$')
# Two such parts separated by exactly one colon, e.g. "addr:street".
lower_colon_re = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character treated as problematic (punctuation and whitespace).
problemchars_re = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# At least one ASCII letter, digit or underscore.  The class is spelled out
# instead of the former '[^\W]' so the result no longer depends on the
# interpreter: on Python 3, \w is Unicode-aware and would accept the
# non-Latin names this pattern is meant to reject.  IGNORECASE is dropped
# because both letter cases are already listed.
eng_set_re = re.compile(r'[A-Za-z0-9_]')
# Last whitespace-free token of a street name, e.g. "St." in "NE 90th St.".
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Area code (first digit 2-9), exchange, line number and optional trailing
# digits, separated by any run of non-digits.  VERBOSE makes the spaces inside
# the pattern insignificant.
phone_re = re.compile(r''' ([2-9][0-9][0-9])\D*(\d{3})\D* (\d{4})\D*(\d*)$''', re.VERBOSE)
# First run of five digits anywhere in the value.
zcode_re = re.compile(r'''(\d{5})''')

# The strategy is to start from the basic expected and mapping data structure and customize it by the end of the analysis

In [121]:
# Street-name words accepted as they are (starting point; extended after the audit).
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
]

# Unconventional street type -> standard street type.
mapping = dict([
    ("St", "Street"),
    ("St.", "Street"),
    ("Ave", "Avenue"),
    ("Ave.", "Avenue"),
    ("Rd", "Road"),
    ("Rd.", "Road"),
])

In [122]:
def audit_tag(FILE):
    """Print an audit report for every <tag> element of an OSM XML file.

    Every <tag> gets two general checks: its value must contain at least one
    character accepted by ``eng_set_re``, and its key must be free of problem
    characters.  One key-specific check then runs, chosen by the first
    substring found in the key: "housenumber", "postcode", "street" or
    "phone".  Street types missing from ``expected`` are collected in the
    module-level ``street_types`` mapping, whose keys are printed at the end.

    Args:
        FILE: source handed to ``ET.iterparse`` (a file path in this notebook).

    Returns:
        None.  All findings are printed.
    """
    # Rebuild from scratch: an earlier call would otherwise leave its entries behind.
    street_types.clear()

    for event, elem in ET.iterparse(FILE):
        if elem.tag != 'tag':
            continue

        key = elem.attrib['k']
        value = elem.attrib['v']

        # General checks, applied to every tag.
        if not eng_set_re.search(value):
            print("non-english char detected in -> %s" % (elem.attrib,))
        if problemchars_re.search(key):
            print("problem chars detected in -> %s" % (elem.attrib,))

        if "housenumber" in key:
            # check housenumber
            if problemchars_re.search(value):
                print("problem char detected in house numbers -> %s" % (elem.attrib,))

        elif "postcode" in key:
            # check postcode
            too_long = len(value) > 5
            if too_long:
                print("Error  %s" % (value,))

            found = zcode_re.search(value)
            if found is None:
                print("Error %s  ->  %s" % (key, value))
            elif too_long:
                # The pattern has a single group, so the old test on the
                # number of groups could never fire; report the extracted
                # five-digit prefix whenever the raw value is longer than it.
                print("zip code > 5 :  %s  ->  %s" % (value, found.group(1)))

        elif "street" in key:
            # check street types
            found = street_type_re.search(value)
            if found:
                street_type = found.group()
                if street_type not in expected:
                    # build list to identify unexpected street type
                    street_types[street_type].add(value)

        elif "phone" in key:
            # check phone number
            found = phone_re.search(value)
            if found:
                area, exchange, line, extension = found.groups()
                pno = area + '-' + exchange + '-' + line
                if extension != '':
                    pno = pno + '-' + extension
            else:
                pno = 'Error'
            print("%s  ->  %s" % (value, pno))

    print("Stree Type not in expected list ->  %s" % (list(street_types.keys()),))


# Audit the data to get a feel for the anomalies

In [123]:
# Run the audit on whichever input FILE currently points to.
audit_tag(FILE)

non-english char detected in -> {'k': 'name:ar', 'v': u'\u0643\u064a\u0631\u0643\u0644\u0627\u0646\u062f'}
non-english char detected in -> {'k': 'name:bg', 'v': u'\u041a\u044a\u0440\u043a\u043b\u0430\u043d\u0434'}
non-english char detected in -> {'k': 'name:fa', 'v': u'\u06a9\u0631\u06a9\u0644\u0646\u062f'}
non-english char detected in -> {'k': 'name:ja', 'v': u'\u30ab\u30fc\u30af\u30e9\u30f3\u30c9'}
non-english char detected in -> {'k': 'name:ko', 'v': u'\ucee4\ud074\ub79c\ub4dc'}
non-english char detected in -> {'k': 'name:zh', 'v': u'\u67ef\u514b\u862d'}
+1-425-497-8809  ->  425-497-8809
+1-425-8679195  ->  425-867-9195
+14258815678  ->  425-881-5678
+1-425-558-4531  ->  425-558-4531
(425) 869-2616  ->  425-869-2616
(425) 882-0630  ->  425-882-0630
+1-425-8812232  ->  425-881-2232
425-885-4848  ->  425-885-4848
+1-866-4373682  ->  866-437-3682
+1-425-488-3730  ->  425-488-3730
+1-425-821-9810  ->  425-821-9810
+1-425-8699772  ->  425-869-9772
8776370450  ->  877-637-0450
+1-425-497-

# Expand the expected and mapping data structure related to the street types based on the above findings

In [124]:
# Additional street-name words to accept as they are.
expected.extend(["Northeast", "Center", "Way", "Circle", "South"])

# Additional abbreviations and the full word each stands for.  "St" is
# assigned again with the value it already has.
mapping.update([
    ("NE", "Northeast"),
    ("Ln", "Lane"),
    ("WY", "Way"),
    ("St", "Street"),
])


# Step 2: Proceed to analyze the final OSM File

In [103]:
# Run the same audit on the full OSM extract.
audit_tag(OSM_FILE)

non-english char detected in -> {'k': 'name:ar', 'v': u'\u0631\u064a\u062f\u0645\u0648\u0646\u062f'}
non-english char detected in -> {'k': 'name:bg', 'v': u'\u0420\u0435\u0434\u043c\u044a\u043d\u0434'}
non-english char detected in -> {'k': 'name:fa', 'v': u'\u0631\u062f\u0645\u0648\u0646\u062f'}
non-english char detected in -> {'k': 'name:he', 'v': u'\u05e8\u05d3\u05de\u05d5\u05e0\u05d3'}
non-english char detected in -> {'k': 'name:ja', 'v': u'\u30ec\u30c9\u30e2\u30f3\u30c9'}
non-english char detected in -> {'k': 'name:ko', 'v': u'\ub808\ub4dc\uba3c\ub4dc'}
non-english char detected in -> {'k': 'name:ru', 'v': u'\u0420\u0435\u0434\u043c\u043e\u043d\u0434'}
non-english char detected in -> {'k': 'name:ta', 'v': u'\u0bb0\u0bc6\u0b9f\u0bcd\u0bae\u0bbe\u0ba3\u0bcd\u0b9f\u0bcd'}
non-english char detected in -> {'k': 'name:uk', 'v': u'\u0420\u0435\u0434\u043c\u043e\u043d\u0434'}
non-english char detected in -> {'k': 'name:zh', 'v': u'\u96f7\u5fb7\u8499\u5fb7'}
non-english char detected in -> 

non-english char detected in -> {'k': 'climbing:sport', 'v': '*'}
non-english char detected in -> {'k': 'climbing:sport', 'v': '*'}
425.861.4614  ->  425-861-4614
+14258270785  ->  425-827-0785
+1-425-8977411  ->  425-897-7411
425.558.5625  ->  425-558-5625
+1-425-8812500  ->  425-881-2500
425-558-110  ->  Error
problem char detected in house numbers -> {'k': 'addr:housenumber', 'v': '747;777'}
+1 (425) 882-1111  ->  425-882-1111
+1-425-8856363  ->  425-885-6363
+1-425-739-6727  ->  425-739-6727
+1-425-4986000  ->  425-498-6000
+1-425-885-6358  ->  425-885-6358
+1-425-556-9533  ->  425-556-9533
+1-425-8832050  ->  425-883-2050
+1-425-8699713  ->  425-869-9713
+1-425-8812600  ->  425-881-2600
4252845800  ->  425-284-5800
problem char detected in house numbers -> {'k': 'addr:housenumber', 'v': '11295 1/2'}
+1-425-8283600  ->  425-828-3600
problem char detected in house numbers -> {'k': 'addr:housenumber', 'v': '5833 A'}
problem char detected in house numbers -> {'k': 'addr:housenumber', 

# Based on the above results, the OSM file has the issues that were seen in the sample file and some more <br>
1) From examining the street names and types, I see that the regex needs to be improved to handle street names effectively.<br>
For instance, in cases like NE 90th st, where both NE and st need to be standardized, the regex fell short. So my solution is to standardize every non-numerical part of the street address, i.e. both NE and st here, not just the last part of the street name, which in the aforementioned case is st.<br>
2) Further, the Expected and Mapping data structures need to be expanded to include the legitimate street types <br> 
3) There were tags with the key source:phone, which point to a URL, and these should be allowed. My current validation throws an error on them because I check for "phone" anywhere in the key; this needs to be addressed by looking at the "phone" tag specifically <br>


In [125]:
def audit_street_name(name, mapping, expected):
    """Print every word of a street name that still needs a cleaning rule.

    The name is split on whitespace.  Purely alphabetic words are checked;
    words containing digits or punctuation (for example "156th" or "St.")
    are skipped.  A checked word is reported when it is neither listed in
    ``expected`` nor a key of ``mapping``.

    Args:
        name: full street name, e.g. "156th Pl NE".
        mapping: abbreviation -> standard word.
        expected: collection of words accepted as they are.

    Returns:
        None.  Findings are printed, one line per unhandled word.
    """
    for word in name.split():
        if not word.isalpha():
            continue
        if word in expected or word in mapping:
            continue
        # Single-argument print(...) gives the same text on Python 2 and 3.
        print("%s  Please handle ->  %s" % (name, word))


In [126]:
def audit_zip(pcode, pzcode_re =zcode_re):
    """Return the first group matched by ``pzcode_re`` in ``pcode``.

    With the default pattern this is the first run of five digits.  The
    string 'Error' is returned when the pattern does not match at all.
    """
    found = pzcode_re.search(pcode)
    return found.group(1) if found else 'Error'

In [127]:
def audit_phone_num(phnum, pphone_re):
    """Flag phone numbers that the given pattern cannot parse.

    Returns an empty string when ``pphone_re`` matches ``phnum`` and the
    unchanged ``phnum`` when it does not, so callers only need to look at
    non-empty results.
    """
    if pphone_re.search(phnum) is None:
        return phnum
    return ''

In [104]:

# Audit pass over the full extract.  Street names print the words that
# still need handling, phone values print when the pattern rejects them, and
# postcodes print when the extracted ZIP differs from the raw value.
# `!=` and single-argument print(...) are valid on both Python 2 and 3; the
# printed text is unchanged.
for event, elem in ET.iterparse(OSM_FILE):
    if elem.tag == 'tag':
        if "street" in elem.attrib['k']:
            audit_street_name(elem.attrib['v'], mapping, expected)
        if "phone" in elem.attrib['k']:
            # audit_phone_num returns '' for numbers that parse cleanly.
            phone = audit_phone_num(elem.attrib['v'], phone_re)
            if phone != '':
                print("%s %s %s  ->  %s" % (elem.tag, elem.attrib['k'], elem.attrib['v'], phone))
        if "postcode" in elem.attrib['k']:
            zipcode = audit_zip(elem.attrib['v'])
            if zipcode != elem.attrib['v']:
                print("%s  :  %s  ->  %s" % (elem.attrib['k'], elem.attrib['v'], zipcode))
    

Leary Way Northeast  Please handle ->  Leary
Redmond Way  Please handle ->  Redmond
Avondale Road Northeast  Please handle ->  Avondale
Avondale Road Northeast  Please handle ->  Avondale
addr:postcode  :  W Lake Sammamish Pkwy NE  ->  Error
West Lake Sammamish Parkway Northeast  Please handle ->  West
West Lake Sammamish Parkway Northeast  Please handle ->  Lake
West Lake Sammamish Parkway Northeast  Please handle ->  Sammamish
Redmond Way  Please handle ->  Redmond
Redmond Way  Please handle ->  Redmond
Redmond Way  Please handle ->  Redmond
Redmond Way  Please handle ->  Redmond
Park Lane  Please handle ->  Park
Central  Please handle ->  Central
Central Way  Please handle ->  Central
Central Way  Please handle ->  Central
Central Way  Please handle ->  Central
Central Way  Please handle ->  Central
Central Way  Please handle ->  Central
Central Way  Please handle ->  Central
156th Pl NE  Please handle ->  Pl
156th Pl NE  Please handle ->  Pl
156th Pl NE  Please handle ->  Pl
156th 

Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake

Railroad Avenue  Please handle ->  Railroad
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Way  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland A

Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  Please handle ->  Creek
Forbes Creek Drive  Please handle ->  Forbes
Forbes Creek Drive  P

Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Alexander Avenue  Please handle ->  Alexander
Slater Street South  Please handle ->  Slater
Slater Street South  Please handle ->  Slater
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please ha

Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle ->  Kirkland
Kirkland Avenue  Please handle -

Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater


Redmond Way  Please handle ->  Redmond
Redmond Way  Please handle ->  Redmond
Old Redmond Road  Please handle ->  Old
Old Redmond Road  Please handle ->  Redmond
West Lake Sammamish Parkway Northeast  Please handle ->  West
West Lake Sammamish Parkway Northeast  Please handle ->  Lake
West Lake Sammamish Parkway Northeast  Please handle ->  Sammamish
Redmond Way  Please handle ->  Redmond
West Lake Sammamish Parkway  Please handle ->  West
West Lake Sammamish Parkway  Please handle ->  Lake
West Lake Sammamish Parkway  Please handle ->  Sammamish
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Northeast Totem Lake Way  Please handle ->  Totem
Northeast Totem Lake Way  Please handle ->  Lake
Northeast Totem Lake Way  Please handle ->  Totem
Northeast Totem Lake Way  Please handle ->  Lake
Northeast Totem Lake Way  Please handle ->  Totem
Northeast Totem Lake Way  Please handle ->  Lake
Northeast Totem Lake Way  Please handle ->  Totem
Northeast

State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handle ->  State
State Street South  Please handl

Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater
Slater Avenue Northeast  Please handle ->  Slater


159th Pl NE  Please handle ->  Pl
Old Redmond Road  Please handle ->  Old
Old Redmond Road  Please handle ->  Redmond
Old Redmond Road  Please handle ->  Old
Old Redmond Road  Please handle ->  Redmond
Old Redmond Road  Please handle ->  Old
Old Redmond Road  Please handle ->  Redmond
Old Redmond Road  Please handle ->  Old
Old Redmond Road  Please handle ->  Redmond
Totem Lake Boulevard  Please handle ->  Totem
Totem Lake Boulevard  Please handle ->  Lake
Redmond Woodinville Road Northeast  Please handle ->  Redmond
Redmond Woodinville Road Northeast  Please handle ->  Woodinville
Redmond Woodinville Road Northeast  Please handle ->  Redmond
Redmond Woodinville Road Northeast  Please handle ->  Woodinville
Redmond Woodinville Road Northeast  Please handle ->  Redmond
Redmond Woodinville Road Northeast  Please handle ->  Woodinville
Redmond Woodinville Road Northeast  Please handle ->  Redmond
Redmond Woodinville Road Northeast  Please handle ->  Woodinville
Redmond Woodinville Road No

# Expand the Expected and Mapping data structure based on the final OSM file

In [105]:
# Words accepted as they are in a street name, in the order they were collected.
expected = (
    # place names
    ["Redmond", "Kirkland", "Woodinville", "Willows", "Sammamish", "Juanita"]
    # street types
    + ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road"]
    + ["Trail", "Parkway", "Park", "Commons", "Way", "Circle"]
    # directions and similar qualifiers
    + ["North", "East", "West", "Northeast", "Center", "South", "Central", "Northwest"]
    # remaining proper names found in the full file
    + ["Parkplace", "Totem", "Lake", "Bear", "Creek", "Leary", "Forbes", "Railroad",
       "Hill", "Union", "River"]
    + ["State", "Cedar", "Ohde", "Slater", "Lakeview", "Avondale", "Observation",
       "Bridlewood", "Old", "Cleveland", "Alexander"]
)

# Abbreviation or misspelling -> standard word.
mapping = dict([
    ("St", "Street"),
    ("St.", "Street"),
    ("st", "Street"),
    ("ST", "Street"),
    ("Ave", "Avenue"),
    ("Ave.", "Avenue"),
    ("AVE", "Avenue"),
    ("ave", "Avenue"),
    ("Rd", "Road"),
    ("Rd.", "Road"),
    ("NE", "Northeast"),
    ("ne", "Northeast"),
    ("Ln", "Lane"),
    ("LN", "Lane"),
    ("WY", "Way"),
    ("Pl", "Place"),
    ("PL", "Place"),
    ("Ct", "Court"),
    ("Dr", "Drive"),
    ("Remond", "Redmond"),
    ("Northest", "Northeast"),
])
           
           
           

# Rerun the audit to double check the Mapping dictionary update

In [106]:
# Second audit pass over the full extract, run after the expected list and
# the mapping were extended.  Street names print the words that still need
# handling, phone values print when the pattern rejects them, and postcodes
# print when the extracted ZIP differs from the raw value.
# `!=` and single-argument print(...) are valid on both Python 2 and 3; the
# printed text is unchanged.
for event, elem in ET.iterparse(OSM_FILE):
    if elem.tag == 'tag':
        if "street" in elem.attrib['k']:
            audit_street_name(elem.attrib['v'], mapping, expected)
        if "phone" in elem.attrib['k']:
            # audit_phone_num returns '' for numbers that parse cleanly.
            phone = audit_phone_num(elem.attrib['v'], phone_re)
            if phone != '':
                print("%s %s %s  ->  %s" % (elem.tag, elem.attrib['k'], elem.attrib['v'], phone))
        if "postcode" in elem.attrib['k']:
            zipcode = audit_zip(elem.attrib['v'])
            if zipcode != elem.attrib['v']:
                print("%s  :  %s  ->  %s" % (elem.attrib['k'], elem.attrib['v'], zipcode))

addr:postcode  :  W Lake Sammamish Pkwy NE  ->  Error
addr:postcode  :  98034-7114  ->  98034
tag source:phone http://www.tacomascrew.com/locations  ->  http://www.tacomascrew.com/locations
tag source:phone http://www.organizedspaces.com/micro/m-about-us/location/  ->  http://www.organizedspaces.com/micro/m-about-us/location/
tag phone +1-425-569-090  ->  +1-425-569-090
addr:postcode  :  980452  ->  98045
addr:postcode  :  98052-3111  ->  98052
addr:postcode  :  98052-2866  ->  98052
tag phone 425-558-110  ->  425-558-110
addr:postcode  :  98052-6088  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->  98052
addr:postcode  :  98052-4176  ->

# Observations <br>
This leaves the results with <br>
a) genuine error cases, i.e. instances where the postcode contains letters, such as addr:postcode  :  W Lake Sammamish Pkwy NE  ->  Error. These need to be filtered out<br>
b) genuine error cases where the phone number has only 9 digits. These need to be filtered out<br>
c) cases where house numbers are alphanumeric: I am marking these as valid cases for my purpose, as they include unit numbers of multi-tenant homes. No filtering is needed here <br>
d) the source:phone tags need to be allowed, unlike the phone tag <br>
e) zip codes to be standardized to 5-digit zip codes<br>
f) phone numbers to be standardized to the npa-nxx-xxxx format; an extension will optionally be stored when available<br>
g) street names will be standardized to an expected convention as reflected in the Expected list


