## Importing the necessary libraries:

In [2]:
import xml.etree.cElementTree as ET
from collections import defaultdict

In [3]:
# Input extracts, given as paths relative to the notebook's working directory.
SAMPLE_FILE = "houston_texas_sample.osm"  # shorter version of the original file (26.7 MB)
OSM_FILE = "houston_texas.osm"  # the original, full-size file (656 MB)

### Analyzing the tags in the OSM file:
Below we can see which XML element tags are present in the sample OSM file and how many times each one occurs.

In [4]:
def analyze_tags(file):
    """Count how many times each XML element type occurs in an OSM file.

    Args:
        file: Path to the XML file, or an open binary file object; it is
            handed directly to ``ET.iterparse``.

    Returns:
        A ``defaultdict(int)`` mapping element tag name to its number of
        occurrences (element types that never occur read as 0).
    """
    tag_types = defaultdict(int)
    # Only "end" events are emitted by default, so each element is seen once,
    # after all of its children have already been counted.
    for _, elem in ET.iterparse(file):
        tag_types[elem.tag] += 1
        # iterparse keeps building the full tree behind the scenes; dropping
        # the attributes and children of finished elements keeps memory use
        # low on large extracts. Only elem.tag is needed, and it was read above.
        elem.clear()
    return tag_types
# Tally the element types in the smaller sample extract.
analyze_tags(SAMPLE_FILE)

defaultdict(int,
            {'member': 671,
             'nd': 149936,
             'node': 121136,
             'osm': 1,
             'relation': 99,
             'tag': 83760,
             'way': 14681})

### Analyzing the keys in tags:
Below we can see which keys occur in the tags of `way` elements in the sample file and how many times each of them occurs.

In [15]:
def tag_keys(file):
    """Count how often each tag key (the ``k`` attribute) appears on ways.

    Only ``<tag>`` elements found inside ``<way>`` elements are counted;
    tags attached to other element types are ignored.

    Args:
        file: Path to the XML file, or an open binary file object; it is
            handed directly to ``ET.iterparse``.

    Returns:
        A ``defaultdict(int)`` mapping each key to its number of occurrences.

    Raises:
        KeyError: If a ``<tag>`` inside a way has no ``k`` attribute.
    """
    key_counts = defaultdict(int)
    for _, elem in ET.iterparse(file):
        if elem.tag == 'way':
            for child in elem.iter('tag'):
                key_counts[child.attrib['k']] += 1
        # Free finished top-level OSM elements so the tree that iterparse
        # builds does not hold the whole file in memory. The <tag> children
        # are deliberately not cleared on their own end event: they must keep
        # their attributes until the enclosing <way> has been processed.
        # Assumes ways are not nested inside node/way/relation elements.
        if elem.tag in ('node', 'way', 'relation'):
            elem.clear()
    return key_counts
# Key frequencies for the ways in the sample extract.
tag_keys(SAMPLE_FILE)
            

defaultdict(int,
            {'FIXME': 49,
             'NHS': 66,
             'PASS_route': 48,
             'Texas_Trunk_System': 19,
             'abandoned:highway': 1,
             'access': 480,
             'access:bus': 3,
             'addr:city': 193,
             'addr:country': 11,
             'addr:housename': 2,
             'addr:housenumber': 122,
             'addr:postcode': 101,
             'addr:state': 178,
             'addr:street': 117,
             'admin_level': 27,
             'aeroway': 51,
             'alt_name': 9,
             'amenity': 262,
             'area': 16,
             'attraction': 2,
             'attribution': 11,
             'barrier': 54,
             'baseball': 1,
             'basin': 6,
             'bench': 1,
             'bicycle': 131,
             'bicycle_road': 1,
             'boat': 6,
             'border_type': 22,
             'boundary': 28,
             'brand': 2,
             'bridge': 258,
             'building'

### Analyzing the values of city in tags:
Below we can see the `addr:city` values found on `way` elements in the OSM file and how many times each of them occurs.

In [18]:
def tag_city(file):
    """Count how often each ``addr:city`` value appears on ways.

    Only ``<tag k="addr:city" ...>`` elements found inside ``<way>`` elements
    are counted; the raw ``v`` attribute is used as-is, so differently
    spelled variants of one city are counted separately.

    Args:
        file: Path to the XML file, or an open binary file object; it is
            handed directly to ``ET.iterparse``.

    Returns:
        A ``defaultdict(int)`` mapping each city value to its number of
        occurrences.

    Raises:
        KeyError: If a ``<tag>`` inside a way lacks the ``k`` attribute, or
            an ``addr:city`` tag lacks the ``v`` attribute.
    """
    city_counts = defaultdict(int)
    for _, elem in ET.iterparse(file):
        if elem.tag == 'way':
            for child in elem.iter('tag'):
                if child.attrib['k'] == 'addr:city':
                    city_counts[child.attrib['v']] += 1
        # Free finished top-level OSM elements so the tree that iterparse
        # builds does not hold the whole file in memory. The <tag> children
        # are deliberately not cleared on their own end event: they must keep
        # their attributes until the enclosing <way> has been processed.
        # Assumes ways are not nested inside node/way/relation elements.
        if elem.tag in ('node', 'way', 'relation'):
            elem.clear()
    return city_counts
# City-value counts over the full extract (OSM_FILE) rather than the sample.
tag_city(OSM_FILE)

defaultdict(int,
            {'77386': 1,
             'Alvin': 4,
             'Alvin, TX': 1,
             'Angleton': 1,
             'Angleton, TX': 1,
             'Angleton,TX': 1,
             'Atascocita': 1,
             'Bay City': 1,
             'Bay City, TX': 1,
             'Baytown': 11,
             'Bellaire': 13,
             'Brazoria': 1,
             'Channelview': 1,
             'Clear Lake Shores': 1,
             'Clute': 3,
             'Conroe': 3,
             'Crosby': 1,
             'Cypress': 39,
             'Cypress, TX': 2,
             'Deer Park': 2,
             'Dickinson': 7,
             'Fort Bend': 2,
             'Freeport': 1,
             'Fresno': 3,
             'Friendswood': 8,
             'Friendswood, TX': 1,
             'Galveston': 684,
             'Galveston Island': 1,
             'Hedwig Village': 1,
             'Hockley': 2,
             'Houston': 2428,
             'Houston, TX': 18,
             'Houston, Texas': 5,
   

In the results above we can observe the following:
* There are a few entries (3 distinct values in the sample file) that have 'TX'/'Tx'/'Texas' following the city name. This has to be corrected (we will only retain the city name).
* There is one entry, 'Galveston Island', which has to be changed to 'Galveston' to maintain consistency.

In [19]:
# NOTE(review): this repeats the tag_city definition from the earlier city
# cell; consider defining it once and only calling it here.
def tag_city(file):
    """Count how often each ``addr:city`` value appears on ways.

    Only ``<tag k="addr:city" ...>`` elements found inside ``<way>`` elements
    are counted, and the ``v`` attribute is used verbatim as the dictionary key.

    Args:
        file: Path to the XML file, or an open binary file object; it is
            handed directly to ``ET.iterparse``.

    Returns:
        A ``defaultdict(int)`` mapping each city value to its number of
        occurrences.

    Raises:
        KeyError: If a ``<tag>`` inside a way lacks the ``k`` attribute, or
            an ``addr:city`` tag lacks the ``v`` attribute.
    """
    city_counts = defaultdict(int)
    for _, elem in ET.iterparse(file):
        if elem.tag == 'way':
            for child in elem.iter('tag'):
                if child.attrib['k'] == 'addr:city':
                    city_counts[child.attrib['v']] += 1
        # Release finished top-level OSM elements; otherwise iterparse keeps
        # the complete tree in memory. <tag> elements are left alone until
        # their parent ends because the way branch above still reads them.
        # Assumes ways are not nested inside node/way/relation elements.
        if elem.tag in ('node', 'way', 'relation'):
            elem.clear()
    return city_counts
# City-value counts for the sample extract.
tag_city(SAMPLE_FILE)

defaultdict(int,
            {'Baytown': 1,
             'Cypress': 1,
             'Deer Park': 1,
             'Galveston': 32,
             'Galveston Island': 1,
             'Houston': 102,
             'Houston, TX': 1,
             'Houston, Texas': 1,
             'Humble': 4,
             'Katy': 19,
             'Kingwood': 1,
             'Klein': 1,
             'League City': 1,
             'Pasadena': 1,
             'Pearland': 1,
             'Rosharon': 1,
             'Stafford': 1,
             'Sugar Land': 3,
             'Texas City': 1,
             'The Woodlands': 2,
             'Tomball': 11,
             'Tomball, Tx': 6})