Reference for valid city names in Houston Area:
https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Greater_Houston

Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected


In [1]:
from collections import defaultdict
import pprint
import xml.etree.cElementTree as ET

In [2]:
SAMPLE_FILE = "C:\Chaitanya\Jupyter\Data_Wrangling\Project_DataWrangling\houston_texas_sample.osm"
OSMFILE = "C:\Chaitanya\Jupyter\Data_Wrangling\Project_DataWrangling\houston_texas.osm"

In [3]:
def tag_city(file):
    tag = defaultdict(int)
    for event, elem in ET.iterparse(file):
        if (elem.tag == 'way'):
            for e in elem.iter("tag"):
                    if (e.attrib['k'] == 'addr:city'):
                        tag[e.attrib['v']] +=1
    print tag            
    return tag
tag_city(SAMPLE_FILE)

defaultdict(<type 'int'>, {'Houston, Texas': 1, 'Pearland': 5, 'Magnolia': 2, 'Baytown': 2, 'Kingwood': 5, 'Texas City': 2, 'The Woodlands': 4, 'Dickinson': 1, 'Pasadena': 3, 'Humble, TX': 1, 'Bay City': 1, 'Klein': 5, 'Cypress': 5, 'Rosharon': 1, 'Missouri City': 4, 'League City': 2, 'Tomball': 27, 'Stafford': 1, 'Tomball, Tx': 5, 'Lake Jackson': 2, 'Katy': 39, 'Humble': 8, 'Seabrook': 3, 'Galveston Island': 1, 'Bellaire': 3, 'Webster': 2, 'Sugar Land': 4, 'Galveston': 68, 'Houston': 250, 'Friendswood': 1, 'Spring': 3, 'Richmond': 2, 'Liberty': 3})


defaultdict(int,
            {'Bay City': 1,
             'Baytown': 2,
             'Bellaire': 3,
             'Cypress': 5,
             'Dickinson': 1,
             'Friendswood': 1,
             'Galveston': 68,
             'Galveston Island': 1,
             'Houston': 250,
             'Houston, Texas': 1,
             'Humble': 8,
             'Humble, TX': 1,
             'Katy': 39,
             'Kingwood': 5,
             'Klein': 5,
             'Lake Jackson': 2,
             'League City': 2,
             'Liberty': 3,
             'Magnolia': 2,
             'Missouri City': 4,
             'Pasadena': 3,
             'Pearland': 5,
             'Richmond': 2,
             'Rosharon': 1,
             'Seabrook': 3,
             'Spring': 3,
             'Stafford': 1,
             'Sugar Land': 4,
             'Texas City': 2,
             'The Woodlands': 4,
             'Tomball': 27,
             'Tomball, Tx': 5,
             'Webster': 2})

In [4]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

#street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Alvin", "Angleton", "Atascocita", "Bay City", "Baytown", "Bellaire", "Brazoria", "Channelview", "Clear Lake Shores", 
            "Clute", "Conroe", "Crosby", "Cypress", "Deer Park", "Dickinson", "Freeport", "Fresno'", "Friendswood", 
            "Galveston", "Hedwig Village", "Hockley", "Houston", "Humble", "Katy", "Kemah", "Kingwood", "Klein", "La Porte", "LaMarque",
            "Lake Jackson", "League City", "Liberty", "Magnolia", "Meadows Place", "Missouri City", "Nassau Bay", "Needville", 
            "Pasadena", "Pearland", "Porter", "Richmond", "Rosenberg", "Santa Fe", "Seabrook", "Shenandoah", "Spring", "Stafford", 
            "Sugar Land", "Texas City", "The Woodlands", "Tomball", "Webster", "West Columbia", "West University Place", "Winnie"]


mapping = {
    "Laks Jackson": "Lake Jackson", "Houson": "Houston", "Galveston Island": "Galveston", "The Woodlands": "Woodlands",\
    "Sugarland": "Sugar Land", "West University": "West University Place", "Dickenson": "Dickinson"
}

def audit_city(invalid_city_names, city_name):
    if city_name not in expected:
        invalid_city_names[city_name] +=1
    return invalid_city_names


def is_city_name(elem):
    return (elem.attrib['k'] == "addr:city")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    invalid_city_names = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city_name(tag):
                    audit_city(invalid_city_names, tag.attrib['v'])
    osm_file.close()
    return invalid_city_names


def correct_case(name):
    # This function is used to correct the case of city names
    if (name.isupper()):
        bettername = name.title()
        # convert UPPER case names to CAMEL case
    elif (name.islower()):
        bettername = name.title()
        # convert LOWER case names to CAMEL case
    else:
        bettername = name
    
    return bettername

def truncate_extension(name):
    # This function will return the name, truncating the part of the name after ',' or '#'
    # The objective of this function is to remove extensions of city/street names (like 'Tx' or 'Texas') and 
    # any house numbers or suite numbers following city/street names
    if (',' in name):
        bettername = name[:name.index(',')]
        # return the part of the name before ','
    elif ('#' in name):
        bettername = name[:name.index('#')]
        # return the part of the name before '#'
    else:
        bettername = name
    return bettername

def update_name(name, mapping):
    # This function is used to correct a few spellings in city names and to maintain consistency in city names
    if (name in mapping.keys()):
        bettername = mapping[name]
    else:
        bettername = name
    return bettername
    

if __name__ == '__main__':
    invalid_city = audit(OSMFILE)
    pprint.pprint(dict(invalid_city))
    for city, times in invalid_city.iteritems():
        better_name1 = correct_case(city)
        better_name2 = truncate_extension(better_name1)
        better_name3 = update_name(better_name2, mapping)
        print city, "=>", better_name3

{'77386': 1,
 'Alvin, TX': 1,
 'Angleton, TX': 1,
 'Angleton,TX': 1,
 'Bay City, TX': 2,
 'Beasley': 1,
 'Bellaire, TX': 1,
 'Crystal Beach': 2,
 'Cypress, TX': 4,
 'DEER PARK': 2,
 'Dickenson': 1,
 'Dickinson, Tx': 1,
 'El Lago': 1,
 'Fort Bend': 2,
 'Fresno': 3,
 'Friendswood, TX': 2,
 'Fulshear': 1,
 'Galveston Island': 1,
 'HOUSTON': 5,
 'Hempstead': 1,
 'Houson': 1,
 'Houston, TX': 46,
 'Houston, Texas': 7,
 'Huffman': 1,
 'Humble, TX': 2,
 'Jersey Village': 1,
 'KATY': 1,
 'Katy, TX': 6,
 'Kendleton': 3,
 'Kingwood, TX': 1,
 'LAKE JACKSON': 3,
 'La Marque': 1,
 'Lake Jackson, TX': 1,
 'Laks Jackson': 2,
 'League City, TX': 3,
 'Little York': 1,
 'MAGNOLIA': 2,
 'Missouri City, TX': 4,
 'Mont Belvieu': 1,
 'Navasota': 1,
 'New Caney': 4,
 'Pasadena, TX': 2,
 'Pearland, TX': 2,
 'Plantersville': 1,
 'Rosharon': 6,
 'San Leon': 1,
 'Santa Fe, TX': 1,
 'Sealy': 1,
 'South Houston': 1,
 'Spring, TX': 8,
 'Sugar Land, TX': 5,
 'Sugarland': 3,
 'TEXAS CITY': 2,
 'The Woodlands, TX': 2,
