### Assess data - validity, accuracy and completeness, consistency, uniformity

### Clean data

### Store data

In [16]:
# import libraries

import xml.etree.cElementTree as ET
import re
from collections import defaultdict
import pprint

In [17]:
# Name of the bz2-compressed OSM extract.
filepath = "pittsburgh_pennsylvania.osm.bz2"
# Same name with its last four characters (the ".bz2" suffix) removed; this is
# the path handed to the audit functions below.
filename = filepath[:len(filepath) - 4]

In [18]:
# Assess data

def count_tags(filename):
    """Count how often each element tag occurs in an XML document.

    Args:
        filename: Path (or file object) passed straight to ``ET.iterparse``.

    Returns:
        A dict mapping each tag name to the number of times an element with
        that tag was opened.
    """
    tag_counts = {}
    for action, node in ET.iterparse(filename, events=('start', 'end')):
        if action == 'end':
            # The element is complete; drop its contents to free memory.
            node.clear()
            continue
        # Only 'start' events reach this point, so each element counts once.
        tag_counts[node.tag] = tag_counts.get(node.tag, 0) + 1
    return tag_counts

# Matches the last run of non-whitespace characters in a string (starting at a
# word boundary); used to pull the street-type token off a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street-type tokens that are accepted as-is and therefore not reported by
# the street audit.
expected = [
    "Avenue",
    "Boulevard",
    "Court",
    "Drive",
    "Lane",
    "Place",
    "Parkway",
    "Road",
    "Route",
    "Street",
    "Square",
    "Trail",
    "Way",
]
    
def audit_street_type(street_types, street_name):
    """Record a street name under its street type when that type is unexpected.

    Args:
        street_types: Mapping from street-type token to a set of street names;
            it is updated in place (indexing must create missing entries, as a
            ``defaultdict(set)`` does).
        street_name: The full street name to inspect.
    """
    found = street_type_re.search(street_name)
    if not found:
        # Nothing that looks like a trailing street-type token.
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

def is_street_name(elem):
    """Return True when the element's ``k`` attribute is ``addr:street``."""
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Collect street names whose street type is not in the expected list.

    Args:
        osmfile: Path to the OSM XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street-type token to
        the set of street names that end with it.
    """
    street_types = defaultdict(set)
    # The context manager closes the file even if parsing raises.
    with open(osmfile, "r", encoding='utf8') as osm_file:
        # Listen for "end" events: on "start" the parser has only seen the
        # opening tag, so the child <tag> elements may not be attached yet
        # and street names could be silently skipped.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # The element has been fully processed; release its children
                # so memory does not grow with the size of the file.
                elem.clear()
    return street_types
            

In [19]:
# update mapping after checking fields

# Abbreviated street-type token -> full street-type word.
mapping = {
    "Av": "Avenue",
    "Av.": "Avenue",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Blvd": "Boulevard",
    "Bl": "Boulevard",
    "Blvd.": "Boulevard",
    "Ct": "Court",
    "Dr": "Drive",
    "Dr.": "Drive",
    "Hwy": "Highway",
    "Pl": "Place",
    "Pk": "Parkway",
    "Pkwy": "Parkway",
    "Prky": "Parkway",
    "Rd": "Road",
    "St": "Street",
    "St.": "Street",
}

In [8]:
# run then update mapping dictionary
st_types = audit(filename)
# Show the audit result as a plain dict, packed compactly onto each line.
pprint.pprint(dict(st_types), compact=True)

{'18': {'Route 18', 'PA 18'},
 '19': {'Route 19', 'US 19'},
 '202': {'Wilkins Avenue #202'},
 '217': {'217'},
 '228': {'Pennylvania 228', 'State Route 228', 'Pennsylvania 228'},
 '30': {'U.S. 30', 'State Route 30', 'Route 30'},
 '400': {'West Kensinger Drive #400'},
 '51': {'State Route 51', 'Route 51'},
 '519': {'PA 519', 'Route 519'},
 '8': {'Route 8'},
 '837': {'Route 837'},
 '885': {'Route 885'},
 '910': {'PA 910', 'Route 910'},
 'Allegheny': {'South Allegheny'},
 'Alley': {'2nd Alley', '4th Alley', 'Beech Alley', 'Bluff Alley',
           'Center Alley', 'Chestnut Alley', 'Oak Alley', 'Park Alley',
           'Peach Alley', 'Pine Alley', 'Plum Alley', 'Poplar Alley',
           'School Alley', 'Summit Alley', 'Taylor Alley', 'Thompson Alley',
           'Union Alley'},
 'Allies': {'Boulevard of the Allies'},
 'Automotive': {'California Automotive'},
 'Av': {'Center Av'},
 'Av.': {'Fifth Av.'},
 'Ave': {'5th Ave', '6th Ave', 'Arlington Ave', 'Atlantic Ave', 'Center Ave',
         '

In [30]:
# check updated names
def update_name(name, mapping):
    """Expand an abbreviated street type at the end of a street name.

    Args:
        name: The street name to normalise.
        mapping: Dict from abbreviated street-type token to its full form.

    Returns:
        ``name`` with its final token replaced by ``mapping[token]`` when the
        token is a key of ``mapping``; otherwise ``name`` unchanged.
    """
    match = re.search(r'\b\S+\.?$', name)
    if match is None:
        # No trailing token (e.g. empty string): nothing to rewrite.
        return name

    street_type = match.group()
    if street_type not in mapping:
        return name

    # Rewrite only the matched trailing token. A plain str.replace would also
    # rewrite earlier occurrences, e.g. "Stanton St" -> "Streetanton Street".
    return name[:match.start()] + mapping[street_type]

# Print every audited street name that the mapping would change, in the form
# "original => updated".
for ways in st_types.values():
    for name in ways:
        better_name = update_name(name, mapping)
        if better_name == name:
            continue
        print("{0:30} =>\t{1}".format(name, better_name))

Lysle Blvd                     =>	Lysle Boulevard
Pennsbury Blvd                 =>	Pennsbury Boulevard
Beechwood Blvd                 =>	Beechwood Boulevard
Washington Blvd                =>	Washington Boulevard
Sunset Blvd                    =>	Sunset Boulevard
Kirkwall Dr                    =>	Kirkwall Drive
Camden Dr                      =>	Camden Drive
Black Hawk Dr                  =>	Black Hawk Drive
Fox Ridge Farms Dr             =>	Fox Ridge Farms Drive
Douglas Dr                     =>	Douglas Drive
Glengary Dr                    =>	Glengary Drive
Eastminster Dr                 =>	Eastminster Drive
Corporate Dr                   =>	Corporate Drive
Industry Dr                    =>	Industry Drive
Selvin Dr                      =>	Selvin Drive
Greyfriar Dr                   =>	Greyfriar Drive
Berwick Dr                     =>	Berwick Drive
Main St                        =>	Main Street
N Neville St                   =>	N Neville Street
First St                       =>	First Str

In [32]:
# Postal Codes

def is_postcode(elem):
    """Return True when the element's ``k`` attribute is ``addr:postcode``."""
    key = elem.attrib['k']
    return key == "addr:postcode"

def pin_audit(osmfile):
    """List postcode values that are not exactly five characters long.

    Args:
        osmfile: Path to the OSM XML file to scan.

    Returns:
        A list (duplicates kept, in document order) of the ``addr:postcode``
        values found on nodes and ways whose length differs from five.
    """
    postalcode_list = []
    # The context manager closes the file even if parsing raises.
    with open(osmfile, "r", encoding='utf8') as osm_file:
        # Listen for "end" events: on "start" the child <tag> elements may
        # not have been parsed yet, so postcodes could be silently skipped.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_postcode(tag):
                        pin = str(tag.attrib['v'])
                        if len(pin) != 5:
                            postalcode_list.append(pin)
                # Processed; release the children to bound memory use.
                elem.clear()
    return postalcode_list

In [33]:
# List the postcode values in the file whose length is not exactly five
# characters.
pin_audit(filename)

['15235-3494',
 '15210-1845',
 '15232-2210',
 '15206-3807',
 '15232-2734',
 '15232-2705',
 '15213-1678',
 '15213-1763',
 '15213-2712',
 'PA 15033',
 'PA 15601',
 '15215x',
 '15213-4026',
 '425 1/2',
 '15010-4503',
 '15147-1423',
 '15213-2608',
 '15301-6133',
 '15232-2131',
 '15232-2735',
 '15232-2716',
 '15213-1704',
 '15206-4336',
 '15213-1713',
 '15213-1530',
 '15213-1502',
 '15213-1502',
 '15213-1502',
 '15213-1500',
 '15206-4472',
 '15206-4449',
 '15206-4471',
 '15213-1405',
 '15213-1400',
 '15213-1405',
 '15206-4320',
 '15206-4456',
 '15206-4403',
 '15206-4403',
 '15206-4818',
 '15232-1803',
 '15232-1803',
 '15232-1803',
 '15232-1803',
 '15232-1803',
 '15232-1803',
 '15232-1826',
 '15232-1832',
 '15232-1833',
 '15232-1845',
 '15232-2106',
 '15213-2911',
 '15232-1419',
 '15232-1419',
 '15232-1419',
 '15232-1419',
 '15232-1419',
 '15232-1419',
 '15232-1421',
 '15232-1421',
 '15232-1447',
 '15213-1503',
 '15213-1738',
 '15213-1705',
 '15232-1823',
 '15232-1879',
 '15232-1823',
 '1523

In [34]:
def update_pin(pin):
    """Reduce a postcode string to its first run of five digits.

    Args:
        pin: The raw postcode string.

    Returns:
        The first five consecutive digits found in ``pin``; if there is no
        such run, ``pin`` is returned unchanged. Each conversion is printed
        as ``<pin> => <result>``.
    """
    hit = re.search(r'\d{5}', pin)
    if hit is None:
        return pin
    cleaned = hit.group()
    print(pin, '=>', cleaned)
    return cleaned

In [None]:
# Phone number
def phone_audit(osmfile):
    """Print every ``phone`` tag value found on nodes and ways.

    Args:
        osmfile: Path to the OSM XML file to scan.
    """
    # Opened as UTF-8 like the other audit functions; the context manager
    # closes the file even if parsing raises.
    with open(osmfile, "r", encoding='utf8') as osm_file:
        # Listen for "end" events: on "start" the child <tag> elements may
        # not have been parsed yet, so phone numbers could be skipped.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_phone(tag):
                        ph = str(tag.attrib['v'])
                        # print() call: the statement form is a SyntaxError
                        # on Python 3, which the rest of this file targets.
                        print(ph)
                # Processed; release the children to bound memory use.
                elem.clear()

def is_phone(elem):
    """Return True when the element's ``k`` attribute is ``phone``."""
    key = elem.attrib['k']
    return key == "phone"