## Importing the necessary libraries:

In [6]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re

In [2]:
SAMPLE_FILE = "houston_texas_sample.osm" # Sample extract: shorter version of the original file (26.7 MB)
OSM_FILE = "houston_texas.osm" # Full dataset: this is the original file (656 MB)

### Analyzing the tags in the OSM file:
Below we can see which element types (XML tags) are present in the OSM file and how many times each of them occurs.

In [3]:
def analyze_tags(file):
    """Count how many times each element type (XML tag name) occurs in a file.

    Args:
        file: Source handed straight to ``ET.iterparse`` (the notebook passes
            the path of an .osm file).

    Returns:
        defaultdict(int) mapping element tag name -> number of occurrences.
    """
    tag_types = defaultdict(int)
    # With no ``events`` argument, iterparse yields each element once it is complete.
    for event, elem in ET.iterparse(file):
        tag_types[elem.tag] += 1
        # Only the tag name is needed. Drop the element's children, attributes
        # and text so the in-memory tree does not keep growing on large files.
        elem.clear()
    return tag_types
# Count element types in the sample extract; the returned mapping is the cell's output.
analyze_tags(SAMPLE_FILE)

defaultdict(int,
            {'member': 1907,
             'nd': 352205,
             'node': 302840,
             'osm': 1,
             'relation': 246,
             'tag': 208724,
             'way': 36703})

### Analyzing the keys in tags:
Below we can see which keys occur in the `tag` elements of ways and how many times each of them occurs.

In [4]:
def tag_keys(file):
    """Tally the ``k`` attribute of every ``tag`` element found inside a ``way``.

    Args:
        file: Source handed straight to ``ET.iterparse`` (the notebook passes
            the path of an .osm file).

    Returns:
        defaultdict(int) mapping key name -> number of occurrences.
    """
    key_counts = defaultdict(int)
    for _, node in ET.iterparse(file):
        # Elements other than ways are ignored entirely.
        if node.tag != 'way':
            continue
        for child in node.iter("tag"):
            key_counts[child.attrib['k']] += 1
    return key_counts
# Tally the tag keys of ways in the sample extract; the returned mapping is the cell's output.
tag_keys(SAMPLE_FILE)
            

defaultdict(int,
            {'FIXME': 111,
             'FIXME:oneway': 2,
             'NHS': 152,
             'PASS_route': 116,
             'Texas_Trunk_System': 43,
             'UFID': 1,
             '_LINEARID_': 1,
             'abandoned:highway': 2,
             'access': 1248,
             'access:bus': 9,
             'access:conditional': 1,
             'addr:city': 466,
             'addr:country': 27,
             'addr:full': 1,
             'addr:housename': 2,
             'addr:housenumber': 301,
             'addr:inclusion': 2,
             'addr:interpolation': 2,
             'addr:postcode': 237,
             'addr:state': 427,
             'addr:street': 289,
             'addr:unit': 2,
             'admin_level': 62,
             'aeroway': 123,
             'alt_name': 19,
             'amenity': 653,
             'area': 60,
             'atm': 3,
             'attraction': 4,
             'attribution': 33,
             'barrier': 106,
             'ba

Below we categorize the tag keys into 4 categories: lower, lower_colon, problemchars and other.

In [8]:
# Keys consisting only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore parts separated by a single colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character considered problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    Args:
        element: An ElementTree element; anything whose tag is not "tag" is ignored.
        keys: Dict with the counters 'lower', 'lower_colon', 'problemchars'
            and 'other'. It is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag == "tag":
        key = element.attrib['k']
        if lower.match(key):
            keys['lower'] += 1
        elif lower_colon.match(key):
            keys['lower_colon'] += 1
        # search(), not match(): match() is anchored at the start of the
        # string, so a problematic character later in the key went undetected.
        elif problemchars.search(key):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

def process_map(filename):
    """Count the tag keys of a file by category.

    Every element produced by ``ET.iterparse`` is passed to ``key_type``,
    which updates the counters.

    Args:
        filename: Source handed straight to ``ET.iterparse`` (the notebook
            passes the path of an .osm file).

    Returns:
        Dict with the counters 'lower', 'lower_colon', 'problemchars' and 'other'.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        # key_type only reads the element's own tag and 'k' attribute, so the
        # element can be emptied now to stop the parsed tree growing in memory.
        element.clear()

    return keys

# Runs over the full OSM_FILE (not the sample) and pretty-prints the category counts.
pprint.pprint(process_map(OSM_FILE))

{'lower': 892280, 'lower_colon': 1141166, 'other': 54268, 'problemchars': 0}


### Analyzing the values of city in tags:
Below we can see the values for city and how many times each of them occurs in the OSM file

In [5]:
def tag_city(file):
    """Tally the ``addr:city`` values found on ``way`` elements.

    The resulting mapping is printed as well as returned.

    Args:
        file: Source handed straight to ``ET.iterparse`` (the notebook passes
            the path of an .osm file).

    Returns:
        defaultdict(int) mapping city value -> number of occurrences.
    """
    tag = defaultdict(int)
    for event, elem in ET.iterparse(file):
        if elem.tag == 'way':
            for e in elem.iter("tag"):
                if e.attrib['k'] == 'addr:city':
                    tag[e.attrib['v']] += 1
    # Parenthesised single-argument form prints the same under Python 2 and
    # also works as the print() function on Python 3.
    print(tag)
    return tag
# Tally city values in the sample extract; the mapping is printed by the function and also shown as the cell's output.
tag_city(SAMPLE_FILE)

defaultdict(<type 'int'>, {'Houston, Texas': 1, 'Pearland': 1, 'Texas City': 1, 'The Woodlands': 2, 'Pasadena': 1, 'Baytown': 1, 'Klein': 1, 'Cypress': 1, 'Rosharon': 1, 'League City': 1, 'Tomball': 11, 'Stafford': 1, 'Galveston': 32, 'Kingwood': 1, 'Katy': 19, 'Humble': 4, 'Galveston Island': 1, 'Sugar Land': 3, 'Tomball, Tx': 6, 'Houston': 102, 'Deer Park': 1, 'Houston, TX': 1})


defaultdict(int,
            {'Baytown': 1,
             'Cypress': 1,
             'Deer Park': 1,
             'Galveston': 32,
             'Galveston Island': 1,
             'Houston': 102,
             'Houston, TX': 1,
             'Houston, Texas': 1,
             'Humble': 4,
             'Katy': 19,
             'Kingwood': 1,
             'Klein': 1,
             'League City': 1,
             'Pasadena': 1,
             'Pearland': 1,
             'Rosharon': 1,
             'Stafford': 1,
             'Sugar Land': 3,
             'Texas City': 1,
             'The Woodlands': 2,
             'Tomball': 11,
             'Tomball, Tx': 6})

In the results above we can observe the following:
* There are a few entries that have 'Tx'/'Texas' following the city name. This has to be corrected (we will only retain the city name)
* There is one entry 'Galveston Island' which has to be changed to 'Galveston' to maintain consistency.
* The first entry '77386' appears to be a postal (ZIP) code that was erroneously entered as a city name. This needs to be removed.
* 'TEXAS CITY' should be changed to 'Texas City' to maintain consistency.
* 'West University' and 'West University Place' refer to the same area. So, 'West University' should be changed to 'West University Place' to maintain consistency.
* 'Sugarland' and 'Sugar Land, TX' should be changed to 'Sugar Land' to maintain consistency as they refer to the same place.
* 'clear lake shores' should be changed to 'Clear Lake Shores' to maintain consistency as they refer to the same place.