# Project: Wrangle OpenStreetMap data 
## By: Chaitanya Narayanavaram
In this project, I have used data wrangling techniques to audit, clean and analyze the data of an area from OpenStreetMap.


## Area: Houston, Texas
Data (.osm) available at: https://mapzen.com/data/metro-extracts/metro/houston_texas/

## Importing the necessary libraries:

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re
import codecs
import json

### Source files:

In [2]:
SAMPLE_FILE = "houston_texas_sample.osm" # shorter version of the original file (26.7 MB)
# This sample file is declared here, but the file is written below
OSM_FILE = "houston_texas.osm" # This is the original file (656 MB)

### Producing a smaller sample (houston_texas_sample.osm) of the original osm file (houston_texas.osm)

In [17]:
k = 25 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

### Analyzing the tags in the OSM file:
Below we can see what tags are present in the OSM file and how many times each tag occurs 

In [3]:
def analyze_tags(file):
    tag_types = defaultdict(int)
    for event, elem in ET.iterparse(file):
        tag_types[elem.tag] += 1
    return tag_types
analyze_tags(SAMPLE_FILE)

defaultdict(int,
            {'member': 671,
             'nd': 149936,
             'node': 121136,
             'osm': 1,
             'relation': 99,
             'tag': 83760,
             'way': 14681})

### Tag patterns:

In [4]:
lower = re.compile(r'^([a-z]|_)*$')
#  tags that contain only lowercase letters and are valid
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# otherwise valid tags with a colon in their names
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# tags with problematic characters


def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        
        if lower.search(k_value):
            keys["lower"] += 1
        elif lower_colon.search(k_value):
            keys["lower_colon"] += 1
        elif problemchars.search(k_value):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
            # other tags that do not fall into the other three categories
        
    return keys


def process_map_tag_type(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

keys = process_map_tag_type(OSM_FILE)
print "Tags Patterns are: "
pprint.pprint(keys)

Tags Patterns are: 
{'lower': 892280, 'lower_colon': 1141166, 'other': 54265, 'problemchars': 3}


### Analyzing the keys in tags:
Below we can see what are the keys that occur in tags and how many times each of them occur.

In [4]:
def tag_keys(filename):
    tag = defaultdict(int)
    for event, elem in ET.iterparse(filename):
        if (elem.tag == 'way'):
            for e in elem.iter("tag"):
                    tag[e.attrib['k']] +=1
                
    return tag
tag_keys(OSM_FILE)
            

defaultdict(int,
            {'FIXME': 1064,
             'FIXME:hazmat': 2,
             'FIXME:oneway': 7,
             'NHS': 1425,
             'PASS_route': 1152,
             'RPM': 1,
             'TNRIS:OBJECTID': 1,
             'Texas_Trunk_System': 381,
             'UFID': 3,
             '_LINEARID_': 4,
             'abandoned': 9,
             'abandoned:highway': 41,
             'abandoned:junction': 1,
             'abandoned:leisure': 3,
             'abandoned:name': 3,
             'abandoned:ref': 1,
             'access': 12147,
             'access:bus': 74,
             'access:conditional': 6,
             'access:lanes:both_ways': 4,
             'addr:city': 4567,
             'addr:country': 289,
             'addr:full': 3,
             'addr:housename': 29,
             'addr:housenumber': 2935,
             'addr:inclusion': 20,
             'addr:interpolation': 20,
             'addr:postcode': 2315,
             'addr:state': 4220,
             'addr:

### Below is the function that will audit a particular field in the tags

In [5]:
def tag_audit(file, field):
    # This function will accept the file and the field that needs to be analyzed as the input
    # This function will return a dictionary with unique entries in the field (as keys) 
    # and the number of times this unique field has occurred (as values)
    tag = defaultdict(int)
    for event, elem in ET.iterparse(file):
        if (elem.tag == 'way'):
            for e in elem.iter("tag"):
                    if (e.attrib['k'] == field):
                        tag[e.attrib['v']] +=1           
    return tag

### Analyzing the values of city in tags:
Below we can see the values for city and how many times each of them occur in the OSM file

In [7]:
tag_audit(SAMPLE_FILE, "addr:city")

defaultdict(int,
            {'Alvin': 1,
             'Bay City': 1,
             'Baytown': 1,
             'Bellaire': 3,
             'Cypress': 2,
             'Deer Park': 1,
             'Galveston': 55,
             'Houston': 172,
             'Houston, Texas': 2,
             'Humble': 6,
             'Katy': 30,
             'Kingwood': 5,
             'Klein': 4,
             'League City': 1,
             'Liberty': 1,
             'Missouri City': 1,
             'Pasadena': 3,
             'Pearland': 2,
             'Seabrook': 2,
             'Stafford': 2,
             'Sugar Land': 4,
             'Texas City': 1,
             'The Woodlands': 2,
             'Tomball': 13,
             'Tomball, Tx': 5,
             'Webster': 2})

In the results above we can observe the following:
* There are a few entries that have 'Tx'/'Texas' following the city name. This has to be corrected (we will only retain the city name)
* There is one entry 'Galveston Island' which has to be changes to 'Galveston' to maintain consistency.
* The first entry '77386' seems to be a pincode value which is erroneously present here. This needs to be removed.
* 'TEXAS CITY' should be changed to 'Texas City' to maintain consistency.
* 'West University' and 'West University Place' refer to the same area. So, 'West University' should be changed to 'West University Place' to maintain consistency.
* 'Sugarland' and 'Sugar Land, TX' should be changed to 'Sugar Land' to maintain consistency as they refer to the same place.
* 'clear lake shores' should be changed to 'Clear Lake Shores' to maintain consistency as they refer to the same place.

### Analyzing the values of street names in tags:
Below we can see the values for street names and how many times each of them occur in the OSM file

In [8]:
tag_audit(SAMPLE_FILE, 'addr:street')

defaultdict(int,
            {'74th Street': 1,
             '80th Street': 2,
             'Airline Drive': 2,
             'Allenbrook Drive': 1,
             'Avenue F': 1,
             'Avenue N 1/2': 1,
             'Avenue P 1/2': 1,
             'Avenue R': 1,
             'Barraud Court': 2,
             'Bay Street North': 1,
             'Bayport Boulevard': 1,
             'Beaudelaire Circle': 1,
             'Beechnut Street': 1,
             'Beltway 8 & Wilson Road': 1,
             'Beluche Drive': 9,
             'Binz Street': 2,
             'Bissonet Street': 1,
             'Blossom': 1,
             'Briar Forest Drive': 1,
             'Bucktrout Lane': 5,
             'Candlewood Drive': 1,
             'Capitol Street': 1,
             'Carters Grove Lane': 1,
             'Carvel Lane': 1,
             'Castlewood Street': 1,
             'Cavalcade Street': 1,
             'Cinco Ranch Boulevard': 1,
             'Clara Barton Lane': 3,
             'Clay Roa

In the results above, we can observe the following:
* Abbreviations in street names like Dr, Blvd, Pkwy, Fwy need to be corrected (Eg: Dr -> Drive, Blvd -> Boulevard)
* Abbreviations like E,W,N,S need to be cahnged to East, West, North, South respectively
* All upper and lower case names need to be changed to camel case to maintain consistency
* Names with Farm-to-Market Road needs to be changed to "FM" to maintain consistency

### Analyzing the country values in the tags:
Below we can see the values for country and how many times each of them occur in the OSM file

In [9]:
tag_audit(OSM_FILE, 'addr:country')

defaultdict(int, {'US': 289})

As we can see above, all the country names in the tags are correct and consistent. Thus, there is no need to audit this field further.

### Analyzing the house numbers in the tags:
Below we can see the values for house numbers and how many times each of them occur in the OSM file

In [10]:
tag_audit(SAMPLE_FILE, 'addr:housenumber')

defaultdict(int,
            {'100': 1,
             '1000': 1,
             '10103': 1,
             '1011': 1,
             '1020': 1,
             '10700': 1,
             '10843': 1,
             '110': 1,
             '11211': 1,
             '11511 1/2': 1,
             '11815': 1,
             '11949': 1,
             '1200': 1,
             '12211': 1,
             '12424': 1,
             '12500': 1,
             '12555': 1,
             '12754': 1,
             '12901': 1,
             '1300': 1,
             '13102': 1,
             '1313': 2,
             '13210': 1,
             '1333': 1,
             '13403': 1,
             '1360': 1,
             '13600': 1,
             '13618': 1,
             '13700': 1,
             '13768': 1,
             '14': 1,
             '1400': 1,
             '1410': 1,
             '1500': 1,
             '1506': 1,
             '1510': 1,
             '1517': 1,
             '1520': 1,
             '15210': 1,
             '1522': 1,
  

From the above results, we can observe that some of the house numbers have street names in them. These need to be corrected.
For eg: "600 jefferson st" -> "600"

### Analyzing the pincodes in the tags
Below we can see the values for pincodes and how many times each of them occur in the OSM file

In [11]:
tag_audit(OSM_FILE, 'addr:postcode')

defaultdict(int,
            {'73032': 1,
             '74404': 1,
             '77002': 45,
             '77003': 5,
             '77004': 25,
             '77005': 12,
             '77005-1890': 1,
             '77006': 34,
             '77007': 19,
             '77007-2112': 1,
             '77007-2113': 1,
             '77007-2121': 1,
             '77008': 11,
             '77009': 8,
             '77010': 11,
             '77011': 4,
             '77012': 1,
             '77014': 8,
             '77015': 2,
             '77016': 1,
             '77018': 5,
             '77019': 5,
             '77020': 7,
             '77021': 4,
             '77022': 17,
             '77024': 40,
             '77024-8022': 1,
             '77025': 9,
             '77025-9998': 1,
             '77027': 16,
             '77028': 1,
             '77030': 45,
             '77031': 2,
             '77032': 13,
             '77034': 2,
             '77035': 5,
             '77036': 10,
             '7

### Analyzing the state names in the tags:
Below we can see the values for states and how many times each of them occur in the OSM file

In [12]:
tag_audit(OSM_FILE, 'addr:state')

defaultdict(int,
            {'TEXAS': 1,
             'TX': 4051,
             'TX - Texas': 1,
             'Texas': 73,
             'Tx': 79,
             'Tx.': 7,
             'texas': 3,
             'tx': 5})

As we can see from the results above, all of the values have to be changed to 'TX' (which is the most common) to maintain consistency 

In [6]:
def is_street_name(elem):
    return ((elem.tag == "tag") and (elem.attrib['k'] == "addr:street"))

def is_state(elem):
    return ((elem.tag == "tag") and (elem.attrib['k'] == "addr:state" or elem.attrib['k'] == "is_in:state_code"))

def is_postcode(elem):
    return ((elem.tag == "tag") and (elem.attrib['k'] == "addr:postcode"))

def is_housenumber(elem):
    return ((elem.tag == "tag") and (elem.attrib['k'] == "addr:housenumber"))

def is_city(elem):
    return ((elem.tag == "tag") and (elem.attrib['k'] == 'addr:city'))


In [7]:
def audit_street_type(street_types, street_name):
    street_type_re = re.compile(r'\b\S+\.?\s?$', re.IGNORECASE)
    expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",\
                "Trail", "Parkway", "Commons", "East", "West", "North", "South", "Freeway", "Highway", "Circle", "Park"]
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [8]:
def audit_state_type(state_types, state):
    if (state != 'TX'):
        state_types.append(state)

In [9]:
def audit_postcode(incorrect_postcodes, postcode):
    postcode_re = re.compile(r'^(77...)') # Postcode outside Houston, TX
    m = postcode_re.search(postcode)
    if not m:
        incorrect_postcodes.append(postcode)
    return incorrect_postcodes

In [10]:
def audit_housenumber(house_numbers, number):
    house_number_re = re.compile(r'^\d+(-?\d)*$')
    m = house_number_re.search(number)
    if not m:
        house_numbers.append(number)
    return house_numbers

In [11]:
def audit_city(invalid_city_names, city_name):
    expected = ["Alvin", "Angleton", "Atascocita", "Bay City", "Baytown", "Bellaire", "Brazoria", "Channelview", "Clear Lake Shores", 
            "Clute", "Conroe", "Crosby", "Cypress", "Deer Park", "Dickinson", "Freeport", "Fresno'", "Friendswood", 
            "Galveston", "Hedwig Village", "Hockley", "Houston", "Humble", "Katy", "Kemah", "Kingwood", "Klein", "La Porte", "LaMarque",
            "Lake Jackson", "League City", "Liberty", "Magnolia", "Meadows Place", "Missouri City", "Nassau Bay", "Needville", 
            "Pasadena", "Pearland", "Porter", "Richmond", "Rosenberg", "Santa Fe", "Seabrook", "Shenandoah", "Spring", "Stafford", 
            "Sugar Land", "Texas City", "The Woodlands", "Tomball", "Webster", "West Columbia", "West University Place", "Winnie"]

    if city_name not in expected:
        invalid_city_names[city_name] +=1
    return invalid_city_names

In [12]:
incorrect_street_types = defaultdict(set) 
# This is a dictionary of street types which contain the street values 
# that are not in the expected list
    
incorrect_state_types = []
# This is a list that contains values of states other than 'TX'
    
incorrect_postcodes = []
# This list contains values of postcodes that are not in Houston or are invalid
    
incorrect_House_Numbers = []
# This list contains values of house numbers that are invalid
    
incorrect_city_names = defaultdict(int)
# This dictionary contains all the invalid city names


def audit_fields(osmfile):
    
    osm_file = open(osmfile, 'r')
    
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(incorrect_street_types, elem.attrib['v'])
        elif is_state(elem):
            audit_state_type(incorrect_state_types, elem.attrib['v'])
        elif is_postcode(elem):
            audit_postcode(incorrect_postcodes, elem.attrib['v'])
        elif is_housenumber(elem):
            audit_housenumber(incorrect_House_Numbers, elem.attrib['v'])
        elif is_city(elem):
            audit_city(incorrect_city_names, elem.attrib['v'])
            
    print "Incorrect Street Types: "        
    pprint.pprint(incorrect_street_types)
    print "Incorrect State: "
    pprint.pprint(incorrect_state_types)
    print "Incorrect PostCode: "
    pprint.pprint(incorrect_postcodes)
    print "Incorrect House Numbers: "
    pprint.pprint(incorrect_House_Numbers)
    print "Incorrect City Names: "
    pprint.pprint(incorrect_city_names)

audit_fields(OSM_FILE)

Incorrect Street Types: 
defaultdict(<type 'set'>, {'1142': set(['Lake Woodlands Drive #1142']), 'Walk': set(['City Walk']), 'Ridge': set(['Creek Ridge']), 'Lake': set(['Woodcreek Bend Lake']), 'F-3': set(['Memorial Drive, Suite F-3']), 'Rd': set(['Cossey Rd', 'FM 1960 Rd', 'Cypress North Houston Rd', 'Ferry Rd', 'Powerline Rd', 'Kuykendahl Rd', 'Gordon Side Rd', 'Huffman Cleveland Rd', 'Macedonia Rd', 'Katy Fort Bend Rd', 'Hope Village Rd', 'E Atascocita Rd', 'Rogerdale Rd', 'Westheimer Rd', 'Hamish Rd', 'Old Katy Rd', 'Fairbanks North Houston Rd', 'Stuebner Airline Rd', 'E FM 528 Rd']), '130': set(['Barker Cypress Rd Suite 130']), 'Bailey': set(['Bailey']), 'Oaks': set(['West Spreading Oaks']), 'TX-332': set(['TX-332']), 'Texas': set(['BIRNHAM WOODS DRIVE SPRING, Texas']), '1774': set(['FM 1774', 'Farm-to-Market Road 1774']), 'Mews': set(['Chesham Mews']), 'Business': set(['TX-288 Business']), '90A': set(['Highway 90A']), 'Cypress': set(['Baker Cypress']), 'Dallas': set(['W. Dallas']

## Improving Data Quality of the Fields

### Improving the Data Quality of Street Names:

In [13]:
mapping = { "St": "Street", "St.": "Street", "street": "Street", "Ave": "Avenue", "Rd.": 'Road', "Rd": "Road",\
           "Blvd": "Boulevard", "Blvd.": "Boulevard", "Dr": "Drive", "Fwy": "Freeway", "Frwy": "Freeway", "Hwy": "Highway",\
           "N": "North", "N.": "North", "W": "West", "W.": "West", "E": "East", "E.": "East", "S": "South", "S.": "South",\
           "N ": "North ", "W ": "West ", "E ": "East ", "S ": "South ", "Ln": "Lane", "Farm-to-Market Road 1774": "FM 1774",\
          "Pkwy": "Parkway", "W. ": "West ", "Stree": "Street"}
street_type_re = re.compile(r'\b\S+\.?\s?$', re.IGNORECASE)
    # regex to pull out the street type from the name

def truncate_extension(name):
    # This function will return the name, truncating the part of the name after ',' or '#' or '('
    # The objective of this function is to remove extensions of city/street names (like 'Tx' or 'Texas') and 
    # any house numbers or suite numbers following city/street names
    bettername = name
    match1 = re.compile(r',')
    for m in match1.finditer(name):
        bettername = name[:m.start()]
        # return the part of the name before ','
    match2 = re.compile(r'#')
    for m in match2.finditer(name):
        bettername = name[:m.start()]
        # return the part of the name before '#'

    return bettername


def update_name(name):
    mapping = { "St": "Street", "St.": "Street", "street": "Street", "Ave": "Avenue", "Rd.": 'Road', "Rd": "Road",\
           "Blvd": "Boulevard", "Blvd.": "Boulevard", "Dr": "Drive", "Fwy": "Freeway", "Frwy": "Freeway", "Hwy": "Highway",\
           "N": "North", "N.": "North", "W": "West", "W.": "West", "E": "East", "E.": "East", "S": "South", "S.": "South",\
           "N ": "North ", "W ": "West ", "E ": "East ", "S ": "South ", "Ln": "Lane", "Farm-to-Market Road 1774": "FM 1774",\
          "Pkwy": "Parkway", "W. ": "West ", "Stree": "Street"}
    
    better_name = name
    
    if ("FM" not in name and name.isupper()):
        better_name = name.title()
        # convert UPPER case names to CAMEL case
        # ignore the street names which have "FM" (Farm to Market Road) in them
    
    if ("Farm-to-Market Road" in name):
        better_name = name.replace("Farm-to-Market Road", "FM")
        # replace "Farm-to-Market Road" with "FM" to maintain consistency
    
    m = street_type_re.search(name)
    if m:
        st_type = m.group()
        if (st_type in mapping.keys()):
            better_name = re.sub(st_type, mapping[st_type], name)
    
    street_abbrev_re = re.compile(r'^([a-z]){1}\.?(\s)+', re.IGNORECASE)
    # regex to pull out abbreviations like E/W/N/S in the name
    m2 = street_abbrev_re.search(better_name)
    better_name2 = better_name
    if m2:
        abbr_name = m2.group()
        if (abbr_name in mapping.keys()):
            better_name2 = re.sub(abbr_name, mapping[m2.group()], better_name)
    
    return better_name2


def improve_street_names():
    for st_type, ways in incorrect_street_types.iteritems():
        for name in ways:
            better_name1 = truncate_extension(name)
            better_name2 = update_name(better_name1)
            print name, "=>", better_name2

improve_street_names()

Lake Woodlands Drive #1142 => Lake Woodlands Drive 
City Walk => City Walk
Creek Ridge => Creek Ridge
Woodcreek Bend Lake => Woodcreek Bend Lake
Memorial Drive, Suite F-3 => Memorial Drive
Cossey Rd => Cossey Road
FM 1960 Rd => FM 1960 Road
Cypress North Houston Rd => Cypress North Houston Road
Ferry Rd => Ferry Road
Powerline Rd => Powerline Road
Kuykendahl Rd => Kuykendahl Road
Gordon Side Rd => Gordon Side Road
Huffman Cleveland Rd => Huffman Cleveland Road
Macedonia Rd => Macedonia Road
Katy Fort Bend Rd => Katy Fort Bend Road
Hope Village Rd => Hope Village Road
E Atascocita Rd => East Atascocita Road
Rogerdale Rd => Rogerdale Road
Westheimer Rd => Westheimer Road
Hamish Rd => Hamish Road
Old Katy Rd => Old Katy Road
Fairbanks North Houston Rd => Fairbanks North Houston Road
Stuebner Airline Rd => Stuebner Airline Road
E FM 528 Rd => East FM 528 Road
Barker Cypress Rd Suite 130 => Barker Cypress Rd Suite 130
Bailey => Bailey
West Spreading Oaks => West Spreading Oaks
TX-332 => Tx-

### Improving the Data Quality of City Names:

In [14]:
def correct_case(name):
    # This function is used to correct the case of city names
    if (name.isupper()):
        bettername = name.title()
        # convert UPPER case names to CAMEL case
    elif (name.islower()):
        bettername = name.title()
        # convert LOWER case names to CAMEL case
    else:
        bettername = name
    
    return bettername

def truncate_extension(name):
    # This function will return the name, truncating the part of the name after ',' or '#'
    # The objective of this function is to remove extensions of city/street names (like 'Tx' or 'Texas') and 
    # any house numbers or suite numbers following city/street names
    if (',' in name):
        bettername = name[:name.index(',')]
        # return the part of the name before ','
    elif ('#' in name):
        bettername = name[:name.index('#')]
        # return the part of the name before '#'
    else:
        bettername = name
    return bettername

def update_city_name(name):
    # This function is used to correct a few spellings in city names and to maintain consistency in city names
    mapping = {
    "Laks Jackson": "Lake Jackson", "Houson": "Houston", "Galveston Island": "Galveston", "The Woodlands": "Woodlands",\
    "Sugarland": "Sugar Land", "West University": "West University Place", "Dickenson": "Dickinson"}
    if (name in mapping.keys()):
        bettername = mapping[name]
    else:
        bettername = name
    return bettername
    

def improve_city_names():
    
    for city, times in incorrect_city_names.iteritems():
        better_name1 = correct_case(city)
        better_name2 = truncate_extension(better_name1)
        better_name3 = update_city_name(better_name2)
        print city, "=>", better_name3

improve_city_names()

DEER PARK => Deer Park
KATY => Katy
Houston, Texas => Houston
katy => Katy
El Lago => El Lago
Sugarland => Sugar Land
Fulshear => Fulshear
Friendswood, TX => Friendswood
TEXAS CITY => Texas City
Jersey Village => Jersey Village
Humble, TX => Humble
Beasley => Beasley
Bellaire, TX => Bellaire
Santa Fe, TX => Santa Fe
HOUSTON => Houston
LAKE JACKSON => Lake Jackson
The Woodlands, TX => Woodlands
Hempstead => Hempstead
Pasadena, TX => Pasadena
MAGNOLIA => Magnolia
Angleton, TX => Angleton
Rosharon => Rosharon
Angleton,TX => Angleton
Wallis => Wallis
Dickenson => Dickinson
La Marque => La Marque
Fort Bend => Fort Bend
Kingwood, TX => Kingwood
Bay City, TX => Bay City
West Columbia, TX => West Columbia
Houston, TX => Houston
Crystal Beach => Crystal Beach
Dickinson, Tx => Dickinson
Spring, TX => Spring
Mont Belvieu => Mont Belvieu
Todd Mission => Todd Mission
South Houston => South Houston
San Leon => San Leon
Lake Jackson, TX => Lake Jackson
Navasota => Navasota
Galveston Island => Galvest

In [15]:
def improve_city_name(name):
    better_name1 = correct_case(name)
    better_name2 = truncate_extension(better_name1)
    better_name3 = update_city_name(better_name2)
    return better_name3
    

### Improving the Data Quality of House Numbers:

In [16]:
house_number_re = re.compile(r'^\d+(-?\d)*$')

def update_house_number(house_number):
    street_re = re.compile(r'[a-z]{3}')
    # This regex checks if street name is present in house number field
    
    m1 = street_re.search(house_number)
    better_number = house_number
    if m1:
        #print m1.group(), "    ", house_number
        split = house_number.split(" ")
        if (split[1] != "Suite"):
            return split[0]
        else:
            better_number = house_number
    if ('Ste' in better_number):
        better_number = better_number.replace('Ste', 'Suite')

    return better_number
    


def improve_house_numbers():
    for house_number in incorrect_House_Numbers:
        better_house_number = update_house_number(house_number)
        print house_number, "=>", better_house_number

improve_house_numbers()

2926 Westerland Drive => 2926
201  => 201 
3747 #A => 3747 #A
3200  => 3200 
1200 East Blvd. => 1200
3995-B => 3995-B
4326-B => 4326-B
4326-A => 4326-A
6450 A => 6450 A
28760 BIRNHAM WOODS DRIVE SPRING, Texas => 28760
10th => 10th
2502B => 2502B
7707 #C013 => 7707 #C013
3622D => 3622D
20260-D => 20260-D
4582 Suite A => 4582 Suite A
4582 Suite F => 4582 Suite F
Ste 462 => Suite 462
7416A => 7416A
5315-A FM => 5315-A FM
724-B => 724-B
2855 Suite 111 => 2855 Suite 111
2815 Suite 110 => 2815 Suite 110
4801 Suite 400 => 4801 Suite 400
4801 Suite 100 => 4801 Suite 100
11411 Suite 124 => 11411 Suite 124
4003 Suite B => 4003 Suite B
4003 Suite I => 4003 Suite I
1900 #400 => 1900 #400
6543B => 6543B
2745D => 2745D
io353 => io353
217B => 217B
11209 Suite C30 => 11209 Suite C30
One => One
600 jefferson st => 600
13236 B => 13236 B
11511 1/2 => 11511 1/2
2002 Milby St => 2002
17700 Clay Road => 17700
18027B => 18027B
2811A => 2811A
9002F => 9002F
22151-G => 22151-G
930 1/2 => 930 1/2


### Improving the Data Quality of Pincodes:

In [17]:
def update_postcode(postcode):
    incorrect_list = ['Weslayan Street', '88581', '75057', '7-', '73032', '74404']
    updated_postcode = postcode
    if ('TX ' in postcode):
        updated_postcode = postcode.replace('TX ', '')
    if ('tx ' in postcode):
        updated_postcode = postcode.replace('tx ', '')
    if (postcode in incorrect_list):
        return None
    return updated_postcode

def improve_postcode():
    for pin in incorrect_postcodes:
        better_postcode = update_postcode(pin)
        print pin, "=>", better_postcode


improve_postcode()

Weslayan Street => None
88581 => None
TX 77494 => 77494
TX 77009 => 77009
TX 77086 => 77086
75057 => None
7- => None
TX 77005 => 77005
TX 77086 => 77086
TX 77042 => 77042
TX 77086 => 77086
tx 77042 => 77042
TX 77043 => 77043
73032 => None
74404 => None


## Shaping Elements:

In [None]:
import xml.etree.cElementTree as ET
import pprint
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}  # creating empty node dictionary
    address={}  # creating empty address dictonary
    if element.tag == "node" or element.tag == "way":   
        node["type"] = element.tag
        node["created"] = {}
        node["pos"] =[ None , None]
        
        for i,val in element.attrib.items():
            if i in CREATED:
                node["created"].update({i:val})
            elif i == 'lat':
                node["pos"][0] = float(element.get(i))
            elif i == 'lon':
                node["pos"][1] = float(element.get(i))
            else:
                node[i] = element.get(i)
                
        node_refs = []
        for child in element:
            if child.tag == 'nd':
                node_refs.append(child.attrib['ref'])
            node["node_refs"] = node_refs
            
            if child.tag == 'tag':
                node["address"] = {}
                empty_dict = {}
               
                for loc,val in child.attrib.items():
                    if val.startswith("addr:") and lower_colon.search(val):
                        key, value = val.split(":",1)
                        #print key, "   ", value
                        if (value == 'street'):
                            address[value] = update_name(child.attrib['v'])
                        elif (value == 'city'):
                            address[value] = improve_city_name(child.attrib['v'])
                        elif (value == 'housenumber'):
                            address[value] = update_house_number(child.attrib['v'])
                        elif (value == 'postcode'):
                            address[value] = update_postcode(child.attrib['v'])
                        else:
                            address[value] = child.attrib['v']
                        
    
                    elif "addr:" not in val and lower.search(val):
                        node[val] = child.attrib['v']
                    elif problemchars.search(val):
                        pass
                if address:
                    node["address"] = address                            
        
    else:
        return None
    return node
    
def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def test():
    data = process_map(SAMPLE_FILE, False)
    print "data: "
    pprint.pprint(data)


test()

data: 
[{'created': {'changeset': '6369617',
              'timestamp': '2010-11-14T18:44:21Z',
              'uid': '9176',
              'user': 'Maarten Deen',
              'version': '2'},
  'id': '4335942',
  'pos': [29.353278, -95.487423],
  'type': 'node'},
 {'created': {'changeset': '472935',
              'timestamp': '2007-09-19T11:56:30Z',
              'uid': '13871',
              'user': 'Daniel P',
              'version': '1'},
  'id': '54597605',
  'pos': [29.2561836, -94.8504077],
  'type': 'node'},
 {'created': {'changeset': '25081720',
              'timestamp': '2014-08-28T19:38:49Z',
              'uid': '167417',
              'user': 'osm-sputnik',
              'version': '2'},
  'id': '54627830',
  'pos': [29.1128559, -95.0870894],
  'type': 'node'},
 {'created': {'changeset': '22052096',
              'timestamp': '2014-04-30T21:03:36Z',
              'uid': '1110270',
              'user': 'afdreher',
              'version': '2'},
  'id': '54627859',
  'po