Analyzing TriCities Area OpenStreetMap Data
Part 1: Counting Tags

In [27]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""

#Import statements
import xml.etree.cElementTree as ET
import pprint

#Create a dictionary
TagTypes = {}

#Count the number of each tag and store it in the dictionary
def count_tags(filename):
    for event, elem in ET.iterparse(filename):
        if elem.tag not in TagTypes.keys():
            TagTypes[elem.tag] = 1
        else:
            TagTypes[elem.tag] += 1

#This is the main function. We are going to count the tags in wallawalla.osm and print out the results.
def test():
    tags = count_tags('TriCities.xml')
    pprint.pprint(TagTypes)
    
test()

{'bounds': 1,
 'member': 19762,
 'meta': 1,
 'nd': 471096,
 'node': 393396,
 'note': 1,
 'osm': 1,
 'relation': 1463,
 'tag': 169924,
 'way': 47757}


Part Two: Auditing Street Types

In [28]:
"""
In this bit of code, we are going to be getting a list of the different street types present in the file. We will get
a list of the different street types as well as how many of each there are.
"""

#Import statements
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

#Here, we set the variable osm_file to open wallawalla.osm
osm_file = open("wallawalla.osm","r", encoding="utf8")

#Here, we set up our registry expression as well as our street_types dictionary, and our street_types_names dictionary.
#The first one will count the number of each street type, and the second one will save the street names in sets in that
#dictionary.
street_type_re = re.compile(r'\b\S+\.?$',re.IGNORECASE)
street_types = {}
street_type_names = defaultdict(set)

#This function will take a street name and cut off the last chunk in order to get the street type, and then
#set it as the key in a dictionary that points to the number of times that street type has occured.
def audit_street_type(street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in street_types.keys():
            street_types[street_type] = 1
        else:
            street_types[street_type] += 1

#This will sort the dictionary, and then print it
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print("%s: %d" %(k,v))

#This function checks to see if an element is a street name
def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

#This function will store the different street names in the correct location in the dictionary based on street types.
def store_street_names(street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in street_type_names.keys():
            street_type_names[street_type].add(street_name)
        else:
            street_type_names[street_type].add(street_name)

#This is one of our main functions we are going to test. It parses through the osm file and then,
#if it runs into a street name, it will audit the street type.
def audit(filename):
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(elem.attrib['v'])

#This is our second main function, if we choose to use it. It will store the street names instead of counting them.
def audit2(filename):
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            store_street_names(elem.attrib['v'])

audit("TriCities.xml")

print(street_types)

{'Avenue': 147, 'Street': 314, 'Drive': 102, 'Ave': 3, 'Boulevard': 54, 'Way': 85, 'St.': 1, 'Ct': 2, 'Court': 21, '397': 1, '44': 1, 'Road': 35, 'St': 3, 'Landing': 1, 'Loop': 25, '92': 1, 'Place': 6, '3920': 1, 'Parkway': 1, '68': 4, 'Dr': 3, 'Lane': 22, '36': 3, 'Blvd': 3, 'Trail': 3, 'ST': 1, '72': 1, 'Dri': 1, '240': 1}


Part Three: Fixing Street Names

In [29]:
"""
This program is going to suggest better street names for us to use where there are problems.
"""

#Import statements
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

#Define the OSM file and street_type_re re
OSMFILE = "TriCities.xml"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_name_re = re.compile(r'^(.*(?!(\S*$)))', re.IGNORECASE)

#Use the expected array in order to determine street types to exclude from the analysis
expected = []

#Use the mapping dictionary in order to suggest better names for streets that have incorrect names.
mapping = { "Ave": "Avenue",
            "Pl": "Place",
            "St": "Street",
            "St.": "Street",
            "Steet": "Street",
            "ave": "Avenue",
            "Ct": "Court",
            "Dr": "Drive",
            "Blvd": "Boulevard",
            "ST": "Street",
            "Dri": "Drive"
            }

#This function will audit the osm file and call the audit_street_type function if the element is a street type.
def audit(osmfile):
    osm_file = open(osmfile, "rb")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osmfile, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

#This function will add street names to a dictionary in the correct set according to street type.
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
#This function checks to see if an element is a street.     
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

#This function will update the street name if necessary to the correct one.
def update_name(name, mapping):
    m = street_type_re.search(name)
    o = street_name_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping.keys():
            name = mapping[street_type]
            if o:
                firstname = o.group()
                
    return firstname + " " + name

#This is our first main function, which is going to create and print out a dictionary of street names
#according to street type. It will return the dictionary.
def part1():
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))
    return st_types

#This is our second main function, which is going to discover the street names that are incorrect and then
#suggest new names for them.

def part2(street_types):
    for street_type, names in street_types.items():
            m = street_type_re.search(street_type)
            if m:
                street_type = m.group()
            
                if street_type in mapping.keys():
                    for name1 in names:
                        print (street_type)
                        better_name = update_name(name1, mapping)
                        print (name1, "=>", better_name)

streat_types = part1()
part2(streat_types)

{'36': {'Road 36'},
 '3920': {'3920'},
 '397': {'E. SR 397'},
 '44': {'North Road 44'},
 '68': {'North Road 68'},
 '72': {'Road 72'},
 '92': {'North Road 92'},
 'Ave': {'Willamette Ave', 'West Kennewick Ave'},
 'Avenue': {'20th Avenue',
            '24th Avenue',
            '4th Avenue',
            'Bellaview Avenue',
            'Camillia Avenue',
            'Cullum Avenue',
            'Dale Avenue',
            'Daphne Avenue',
            'Dupont Avenue',
            'Hunt Avenue',
            'Jadwin Avenue',
            'Mahan Avenue',
            'Malbec Avenue',
            'Meritage Avenue',
            'North 10th Avenue',
            'North 1st Avenue',
            'North 20th Avenue',
            'North 22nd Avenue',
            'North 3rd Avenue',
            'North 4th Avenue',
            'Parkview Avenue',
            'South 18th Avenue',
            'South 6th Avenue',
            'Syrah Avenue',
            'Tawny Avenue',
            'West 10th Avenue',
          

Part Four: Exporting to .json

In [30]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

In [31]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
startswithaddr = re.compile(r'\Aaddr:')
afteraddr = re.compile(r':.+$')
afteraddr2 = re.compile(r'[a-zA-Z+$]')


CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
pos = []

mapping = { "Ave": "Avenue",
            "Pl": "Place",
            "St": "Street",
            "St.": "Street",
            "Steet": "Street",
            "ave": "Avenue",
            "Ct": "Court",
            "Dr": "Drive",
            "Blvd": "Boulevard",
            "ST": "Street",
            "Dri": "Drive"
            }

#Checks to see if the element is a street name
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

#This function will update the street name if necessary to the correct one.
def update_name(name, mapping):
    m = street_type_re.search(name)
    o = street_name_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping.keys():
            name = mapping[street_type]
            if o:
                firstname = o.group()
                
    return firstname + " " + name

#This processes the file
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [32]:
def shape_element(element):
    node = {}
    
    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag
        if 'lon' in element.attrib:
            # treat geo attribs values
            node.update({'pos' : [element.attrib['lon'], element.attrib['lat']]})
        
        for attr in element.attrib:
            
            if attr in ['lat', 'lon']:
                pass # already treated
            elif attr in CREATED:
                node.setdefault('created', {})[attr] = element.attrib[attr]
            else:   
                node[attr] = element.attrib[attr]
        
        for tag in element.iter("tag"):
            # treat child tags
            m = startswithaddr.search(tag.attrib['k'])
            if m:
                m = m.group()
                o = afteraddr.search(m)
                if o:
                    o = o.group()
                    if is_street_name(tag):
                        input1 = update.name(tag.attrib['v'], mapping)
                    else: 
                        input1 = tag.attrib['v']
                        node.update({"Address" : {o : input1}})
                else: node.update({tag.attrib['k'] : tag.attrib['v']})
        
        for tag in element.iter("nd"):
            #treat nd childs
            node.update({'node_ref' : [tag.attrib['ref']]})
            
        # here you can print your element to check if it is ok
        return node
    else:
        return None

In [11]:
process_map('TriCities.xml')

[{'type': 'node',
  'pos': ['-118.5210595', '45.6203835'],
  'id': '42625053',
  'created': {'version': '3',
   'timestamp': '2020-05-21T19:09:14Z',
   'changeset': '85570126',
   'uid': '478161',
   'user': 'MappingJunkie'}},
 {'type': 'node',
  'pos': ['-119.2956804', '46.1991042'],
  'id': '46801353',
  'created': {'version': '3',
   'timestamp': '2010-04-14T23:14:42Z',
   'changeset': '4427377',
   'uid': '261155',
   'user': 'HendricksR'}},
 {'type': 'node',
  'pos': ['-119.2907559', '46.1990691'],
  'id': '46801356',
  'created': {'version': '4',
   'timestamp': '2013-08-28T04:51:12Z',
   'changeset': '17547819',
   'uid': '91499',
   'user': 'paulmach'}},
 {'type': 'node',
  'pos': ['-119.2900155', '46.1991487'],
  'id': '46801358',
  'created': {'version': '3',
   'timestamp': '2010-04-14T23:14:43Z',
   'changeset': '4427377',
   'uid': '261155',
   'user': 'HendricksR'}},
 {'type': 'node',
  'pos': ['-119.2897292', '46.1992729'],
  'id': '46801362',
  'created': {'version': '4

In [65]:
from pymongo import MongoClient
import pprint
import pymongo


In [63]:
client = MongoClient('localhost:27017')
db = client["WGUNanodegree"]
db = db["StreetData"]

In [64]:
db.dataSize()

TypeError: 'Collection' object is not callable. If you meant to call the 'dataSize' method on a 'Collection' object it is failing because no such method exists.