In [5]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Locations of the OSM input files.
# Raw strings keep every backslash literal; the plain-string form only worked
# because "\C", "\J", "\D", "\P" and "\h" happen not to be recognised escapes.
OSM_FILE = r"C:\Chaitanya\Jupyter\Data_Wrangling\Project_DataWrangling\houston_texas.osm"
SAMPLE_FILE = r"C:\Chaitanya\Jupyter\Data_Wrangling\Project_DataWrangling\houston_texas_sample.osm"


# Street types that audit_street_type() accepts without reporting.
expected = [
    "Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square",
    "Lane", "Road", "Trail", "Parkway", "Commons", "East", "West", "North",
    "South", "Freeway", "Highway", "Circle", "Park",
]

# Abbreviation -> replacement text used by update_name().
# Keys WITH a trailing space are looked up for the leading one-letter prefix
# (the prefix pattern includes the whitespace that follows the letter);
# the other keys are looked up for the street type at the end of the name.
mapping = {
    "St": "Street", "St.": "Street", "street": "Street", "Ave": "Avenue",
    "Rd.": "Road", "Rd": "Road",
    "Blvd": "Boulevard", "Blvd.": "Boulevard", "Dr": "Drive",
    "Fwy": "Freeway", "Frwy": "Freeway", "Hwy": "Highway",
    "N": "North", "N.": "North", "W": "West", "W.": "West",
    "E": "East", "E.": "East", "S": "South", "S.": "South",
    "N ": "North ", "W ": "West ", "E ": "East ", "S ": "South ",
    "Ln": "Lane", "Farm-to-Market Road 1774": "FM 1774",
    "Pkwy": "Parkway", "W. ": "West ", "Stree": "Street",
}

# Regex to pull out the street type from the name: the final run of
# non-whitespace characters (starting at a word boundary), an optional
# period, and at most one trailing whitespace character.
street_type_re = re.compile(r'\b\S+\.?\s?$', re.IGNORECASE)


def audit_street_type(street_types, street_name):
    """Record street_name under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` matches in the name.
    Names the pattern does not match, and names whose type is listed in
    ``expected``, are not recorded.

    street_types -- mapping of street type -> set of names; a missing key
                    must yield a set (audit() passes a defaultdict(set)).
    street_name  -- the street name to inspect.
    """
    found = street_type_re.search(street_name)
    if not found:
        return

    kind = found.group()
    if kind in expected:
        return

    street_types[kind].add(street_name)



def is_street_name(elem):
    """Tell whether elem's 'k' attribute is exactly "addr:street".

    elem must expose an ``attrib`` mapping containing the key 'k';
    a missing 'k' raises KeyError.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect the unexpected street types found in an OSM XML file.

    osmfile -- path of the OSM XML file to read.

    Returns a defaultdict(set) mapping each street type that
    audit_street_type() reports to the set of street names carrying it.
    Only <tag> children of <node> and <way> elements are inspected.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser handle the encoding declared in the
    # file; the with-block closes the handle even if parsing raises.
    with open(osmfile, "rb") as osm_file:
        # "end" events are required here: at a "start" event the element's
        # children may not have been parsed yet, so its <tag> children could
        # be silently missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # The element has been fully processed; drop its children
                # and attributes so memory does not grow with file size.
                elem.clear()
    return street_types

def truncate_extension(name):
    """Return name cut off at the first ',', '#' or '('.

    The objective is to remove extensions of city/street names (like 'Tx'
    or 'Texas') and any house numbers or suite numbers following the
    city/street name.

    The cut happens at whichever delimiter appears first, so a name such as
    "Main St, Suite #3" yields "Main St". Trailing whitespace is stripped
    from the result so that the last word of the returned name is not
    followed by a stray space. A name without any delimiter is returned
    unchanged apart from that stripping.
    """
    cut = re.search(r'[,#(]', name)
    if cut:
        name = name[:cut.start()]
    return name.rstrip()


# Regex to pull out a leading one-letter abbreviation (like E/W/N/S), with an
# optional period, together with the whitespace that follows it.
_LEADING_ABBREV_RE = re.compile(r'^([a-z]){1}\.?(\s)+', re.IGNORECASE)


def update_name(name, mapping):
    """Return a cleaned-up version of a street name.

    The steps are applied in order, each one to the result of the previous:

    1. An all-upper-case name that does not contain "FM" is title-cased.
    2. "Farm-to-Market Road" is replaced with "FM" to maintain consistency.
    3. If the street type matched by ``street_type_re`` is a key of
       ``mapping``, that occurrence is replaced by the mapped value.
    4. If the name starts with a one-letter abbreviation plus whitespace
       that is a key of ``mapping`` (e.g. "E "), that prefix is replaced
       by the mapped value.

    name    -- the street name to clean.
    mapping -- dict of abbreviation -> replacement text.
    """
    better_name = name

    # Convert UPPER case names to title case, ignoring the street names
    # which have "FM" (Farm to Market Road) in them.
    if "FM" not in better_name and better_name.isupper():
        better_name = better_name.title()

    if "Farm-to-Market Road" in better_name:
        better_name = better_name.replace("Farm-to-Market Road", "FM")

    # Replace only the matched span. Feeding the matched text to re.sub()
    # would treat it as a regex ("St." also matches "Sta") and would rewrite
    # every occurrence in the name, not just the street type at the end.
    suffix = street_type_re.search(better_name)
    if suffix and suffix.group() in mapping:
        better_name = (better_name[:suffix.start()]
                       + mapping[suffix.group()]
                       + better_name[suffix.end():])

    # Same span-only replacement for the leading abbreviation; the pattern
    # is anchored at the start, so everything before the match is empty.
    prefix = _LEADING_ABBREV_RE.search(better_name)
    if prefix and prefix.group() in mapping:
        better_name = mapping[prefix.group()] + better_name[prefix.end():]

    return better_name


def test():
    """Audit OSM_FILE and print every reported street name with its fix.

    Each name returned by audit() is passed through truncate_extension()
    and then update_name(); one "original => cleaned" line is printed per
    name.
    """
    st_types = audit(OSM_FILE)
    #pprint.pprint(dict(st_types))
    # Only the name sets are needed; the street-type keys are not used.
    for ways in st_types.values():
        for name in ways:
            better_name1 = truncate_extension(name)
            better_name2 = update_name(better_name1, mapping)
            # A single pre-formatted argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s => %s" % (name, better_name2))
            


# Run the audit-and-clean report only when this code is executed as the main
# program (its __name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    test()

Lake Woodlands Drive #1142 => Lake Woodlands Drive 
City Walk => City Walk
Creek Ridge => Creek Ridge
Woodcreek Bend Lake => Woodcreek Bend Lake
Memorial Drive, Suite F-3 => Memorial Drive
Cossey Rd => Cossey Road
FM 1960 Rd => FM 1960 Road
Cypress North Houston Rd => Cypress North Houston Road
Ferry Rd => Ferry Road
Powerline Rd => Powerline Road
Kuykendahl Rd => Kuykendahl Road
Gordon Side Rd => Gordon Side Road
Huffman Cleveland Rd => Huffman Cleveland Road
Macedonia Rd => Macedonia Road
Katy Fort Bend Rd => Katy Fort Bend Road
Hope Village Rd => Hope Village Road
E Atascocita Rd => East Atascocita Road
Rogerdale Rd => Rogerdale Road
Westheimer Rd => Westheimer Road
Hamish Rd => Hamish Road
Old Katy Rd => Old Katy Road
Fairbanks North Houston Rd => Fairbanks North Houston Road
Stuebner Airline Rd => Stuebner Airline Road
E FM 528 Rd => East FM 528 Road
Barker Cypress Rd Suite 130 => Barker Cypress Rd Suite 130
Bailey => Bailey
West Spreading Oaks => West Spreading Oaks
TX-332 => Tx-