## OpenStreetMap Data Case Study

#### Map Area

Beaver County, PA, US and minor outlying and surrounding areas

http://beavercountypa.gov/Pages/Default.aspx


This region is where I live, where I currently work from home, and volunteer as a firefigher & EMT. I'm curious how much data has been entered for this somewhat rural community.

![alt text](image.png "Beaver County PA")

### Problems Encountered in Map Data

Below I created a sample subset and examined some of the data.

Problems encountered included:

1. Inconsistently abbreviated street types, including Blvd, Tpke, St etc.
2. Problem characters within the listings
3. 


In [15]:
#Importing useful python elements to examine data.
import xml.etree.ElementTree as ET
import pprint
from collections import defaultdict
import re
import csv
import codecs
import cerberus
import sqlite3
import os

OSM_FILE = "BeaverCounty.osm" # My OSM file of Beaver County, PA 
SAMPLE_FILE = "BeaverCountySample.osm" # My sample file to be created below 

k = 10 # Take every 10th element from the top level tags.

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n'.encode())
    output.write('<osm>\n'.encode())

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>'.encode())


In [10]:
# Get the file size
os.stat('BeaverCounty.osm')

os.stat_result(st_mode=33206, st_ino=844424930873890, st_dev=2523738793, st_nlink=1, st_uid=0, st_gid=0, st_size=105381578, st_atime=1601126653, st_mtime=1601126653, st_ctime=1601126629)

### File Statistics:
File is 105381578 bytes, or 105.381578 Megabytes (st_size=105381578)

#### We will count elements, get unique users and list them, to compare with our SQL later:

In [2]:
# Count the number of each type of tag in main OSM file

def count_tags(filename):
    tree=ET.iterparse(filename)
    tags={}
    for event,elem in tree:
        if elem.tag not in tags.keys():
            tags[elem.tag]=1
        else:
            tags[elem.tag] = tags[elem.tag]+1
    return tags    
    
with open(OSM_FILE,'rb') as f:
    tags=count_tags(OSM_FILE)
    pprint.pprint(tags)
f.close()

{'bounds': 1,
 'member': 25791,
 'meta': 1,
 'nd': 551791,
 'node': 478084,
 'note': 1,
 'osm': 1,
 'relation': 940,
 'tag': 181551,
 'way': 53020}


In [8]:
# Find the number of users who have updated Beaver County, and list them.

def get_users(element):
    return element.get('user')


def process_users(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.get('user'):
            users.add(get_users(element))
        element.clear()    
    return users


with open(OSM_FILE,'rb') as f:
    users = process_users(OSM_FILE)

print(len(users))
pprint.pprint(users)
f.close()

814
{'-Schwarz-',
 '1 Piece',
 '16ajmain',
 '25or6to4',
 '42429',
 'AJRAJR',
 'Aaron Musick',
 'AbbyO12',
 'Adam Killian',
 'Adam_P',
 'Adamant1',
 'Alan Trick',
 'Alma Ross',
 'AndreThib',
 'AndrewBuck',
 'AndrewSnow',
 'AnonymousAlligator',
 'Anthony Jantzi',
 'AnthonyJackman',
 'ArminGh',
 'Ashley3870',
 'Asphaze',
 'Aurimas Fišeras',
 'BMBurgh',
 'Bacon_BMW',
 'BeholdersEye',
 'Bhojaraj',
 'BiIbo',
 'Bike Rider 9591',
 'Bored',
 'Brandon Saccomanno',
 'Brian Herskovitz',
 'Caitlin Downes',
 'California Bear',
 'Calwbb5',
 'CamelCaseNick',
 'Captblack13',
 'Cato_d_Ae',
 'Caulin',
 'Cheeto',
 'Christoph Lotz',
 'Claytonlukes77',
 'CloCkWeRX',
 'Cody23yo',
 'ColbertDolbert',
 'Cory1',
 'DJ FQA',
 'DLichti',
 'Dami_Tn',
 'Dan C1234',
 'Dan Wood',
 'Daneel76',
 'DannyAiquipa',
 'DaveHansenTiger',
 'Delaney C',
 'Der Landvermesser',
 'DevinBl1230',
 'Dion Dock',
 'DonHohman',
 'DonovanG',
 'DougV',
 'Dowluri',
 'Dr Kludge',
 'DylanHC',
 'ELadner',
 'EdSS',
 'Edbo',
 'Edward',
 'Egazda',


In [17]:
# Check formatting of the K attribute in the tags in the OSM file, code referenced from Udacity & Github
#

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        if lower.search(element.attrib['k']):
            keys['lower'] += 1
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] += 1
        elif problemchars.search(element.attrib['k']):
            keys['problemchars'] = keys['problemchars'] + 1
        else:    
            keys['other'] += 1  
            print(element.attrib['k'])
            print(element.attrib['v'])
    return keys


def process_keys_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

with open(OSM_FILE,'rb') as f:
    keys = process_keys_map(OSM_FILE)
    pprint.pprint(keys)
f.close()

gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Allegheny
gnis:County_num
003
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Populated Place
gnis:County
Beaver
gnis:County_num
007
gnis:ST_alpha
PA
gnis:ST_num
42
gnis:Class
Popula

voltage-high
345000
voltage-high
345000
socket:type1
2
socket:type1
yes
socket:type1
2
railway:signal:direction
forward
railway:signal:position
right
railway:signal:direction
forward
railway:signal:position
right
name_1
Raccoon Township Police Department
name_1
Crestridge Drive
tiger:name_base_1
Crestridge
tiger:name_type_1
Dr
tiger:zip_left_1
15046
tiger:zip_left_1
15046
tiger:zip_left_1
15143
tiger:zip_right_1
15143
name_1
Clinton Flaugherty Road
tiger:name_base_1
Clinton Flaugherty
tiger:name_type_1
Rd
tiger:zip_left_1
15143
tiger:zip_left_2
15143
tiger:zip_left_3
15143
tiger:zip_right_3
15143
name_1
Shousetown Bridge
tiger:name_base_1
Shousetown
tiger:name_type_1
Brg
tiger:name_base_1
Lincoln
tiger:name_type_1
Hwy
tiger:zip_left_1
15143
tiger:zip_left_1
15126
name_1
Winnwood Road
tiger:name_base_1
Winnwood
tiger:name_type_1
Rd
name_1
Winter Trail
tiger:name_base_1
Winter
tiger:name_type_1
Trl
tiger:zip_left_1
15143
name_1
Cherry Street
tiger:name_base_1
Cherry
tiger:name_type_1
St


tiger:name_base_1
State Route 551
tiger:name_base_1
State Route 65
parking:lane:both
parallel
name_1
T 410
tiger:name_base_1
T 410
tiger:zip_left_1
15010
name_1
Pine Run Road
tiger:name_base_1
Pine Run
tiger:name_type_1
Rd
tiger:zip_left_1
15061
tiger:zip_right_1
15061
tiger:zip_left_1
15061
tiger:zip_right_1
15061
name_1
Fry Drive
tiger:name_base_1
Fry
tiger:name_type_1
Dr
tiger:name_base_1
Pennsylvania
tiger:name_type_1
Tpke
name_1
T544
tiger:name_base_1
T544
tiger:zip_left_1
15001
tiger:zip_left_1
15066
name_1
T486
tiger:name_base_1
T486
name_1
T329
tiger:name_base_1
T329
name_1
State Route 3022
tiger:name_base_1
State Route 3022
name_1
State Route 3022
tiger:name_base_1
State Route 3022
NHS
yes
tiger:name_base_1
State Route 68
tiger:zip_left_1
15001
tiger:zip_right_1
15001
name_1
Shreeves Road
name_2
T303
tiger:name_base_1
Shreeves
tiger:name_base_2
T303
tiger:name_type_1
Rd
NHS
yes
tiger:name_base_1
State Route 51
name_1
T544
tiger:name_base_1
T544
tiger:zip_left_1
15066
tiger:zip

tiger:zip_left_1
15009
tiger:zip_right_1
15009
name_1
Jenny Street
tiger:name_base_1
Jenny
tiger:name_type_1
St
name_1
T345
tiger:name_base_1
T345
tiger:name_base_1
State Route 168
tiger:zip_left_1
15010
tiger:zip_right_1
15010
tiger:zip_left_1
15043
tiger:zip_left_1
15052
name_1
Brown Road
tiger:name_base_1
Brown
tiger:name_type_1
Rd
tiger:zip_left_1
15066
tiger:zip_left_1
15001
tiger:zip_left_2
15001
tiger:zip_left_3
15001
tiger:zip_left_4
15001
tiger:zip_right_2
15001
tiger:zip_right_3
15001
tiger:zip_right_4
15001
tiger:zip_left_1
15001
tiger:zip_right_1
15001
tiger:name_base_1
State Route 588
tiger:zip_left_1
15010
tiger:zip_left_1
15066
tiger:zip_left_2
15066
name_1
T457
tiger:name_base_1
T457
tiger:name_base_1
State Route 65
tiger:name_base_1
State Route 351
tiger:zip_left_1
16141
tiger:zip_left_1
15001
tiger:zip_left_2
15061
tiger:zip_right_2
15061
name_1
T368
tiger:name_base_1
T368
tiger:zip_left_1
15009
parking:lane:both
parallel
name_1
Cullen Drive
name_2
T466
tiger:name_bas

NHS
yes
tiger:name_base_1
State Route 68
NHS
yes
tiger:name_base_1
State Route 68
NHS
yes
tiger:name_base_1
State Route 68
NHS
yes
tiger:name_base_1
State Route 68
tiger:name_base_1
United States Highway 30
tiger:zip_left_1
15001
tiger:zip_right_1
15001
tiger:name_base_1
State Route 168
tiger:name_base_1
State Route 168
destination:ref:to
PA 351
tiger:name_base_1
State Route 288
tiger:name_base_1
State Route 288
tiger:name_base_1
State Route 288
tiger:name_base_1
Pennsylvania
tiger:name_type_1
Tpke
tiger:name_base_1
Pennsylvania
tiger:name_type_1
Tpke
tiger:zip_left_1
15026
tiger:zip_right_1
15026
destination:ref:lanes
PA 18 South|PA 551 West
destination:ref:to:lanes
none|I 376 Toll
destination:ref:lanes
PA 18 South|PA 551 West
destination:ref:to:lanes
none|I 376 Toll
tiger:name_base_1
State Route 588
tiger:name_base_1
State Route 588
turn:lanes:forward
through|right
NHS
yes
turn:lanes:forward
through|through;right
turn:lanes:backward
left|through
turn:lanes:forward
left
tiger:name_bas

tiger:zip_left_1
15108
tiger:zip_left_1
15108
tiger:zip_left_1
15108
tiger:zip_left_1
15108
tiger:zip_left_2
15108
tiger:zip_right_2
15108
parking:lane:both
parallel
parking:lane:left
parallel
parking:lane:right
diagonal
maxspeed:advisory:backward
25 mph
maxspeed:advisory:forward
30 mph
maxspeed:advisory:backward
35 mph
maxspeed:advisory:backward
35 mph
maxspeed:advisory:forward
30 mph
turn:lanes:forward
left|right
source:ref:penndot
PennDOT Beaver County Type 10 Map (http://www.dot7.state.pa.us/BPR_pdf_files/Maps/GHS/Roadnames/Beaver_GHSN.PDF)
source:ref:penndot
PennDOT Beaver County Type 10 Map (http://www.dot7.state.pa.us/BPR_pdf_files/Maps/GHS/Roadnames/Beaver_GHSN.PDF)
source:ref:penndot
PennDOT Beaver County Type 10 Map (http://www.dot7.state.pa.us/BPR_pdf_files/Maps/GHS/Roadnames/Beaver_GHSN.PDF)
turn:lanes:backward
through|right
destination:street:lanes
Stone Quarry Road|Wagner Road||
destination:street:lanes
Stone Quarry Road|Wagner Road||
destination:street:lanes
Stone Quarry

NHS
yes
tiger:name_base_1
State Route 51
generator:output:electricity
yes
tiger:zip_left_1
16136
tiger:zip_right_1
16136
tiger:zip_left_1
16136
tiger:zip_right_1
16136
tiger:name_base_1
State Route 65
tiger:name_base_1
Moon and Clinton
tiger:name_type_1
Rd
turn:lanes:both_ways
left
tiger:name_base_1
State Route 65
tiger:name_base_1
State Route 65
tiger:name_base_1
State Route 65
tiger:name_base_1
State Route 65
tiger:name_base_1
State Route 168
tiger:name_base_1
State Route 168
tiger:name_base_2
State Route 551
name_1
Township Highway 679
tiger:name_base_1
Township Highway 679
tiger:name_base_1
State Route 168
tiger:name_base_1
State Route 168
tiger:name_base_2
State Route 551
name_1
I-341
name_2
Rambo Road
tiger:name_base_1
I-341
tiger:name_base_2
Rambo
tiger:name_type_2
Rd
plant:output:electricity
2741 MW
is_in:iso_3166_2
US:PA
tiger:CLASSFP
C5
tiger:CPI
N
tiger:FUNCSTAT
A
tiger:LSAD
21
tiger:MTFCC
G4110
tiger:NAME
South Heights
tiger:NAMELSAD
South Heights borough
tiger:PCICBSA
N
ti

destination:ref:to
I 376
destination:ref:to
I 376
destination:ref:to
I 376
is_in:iso_3166_2
US:PA
destination:ref:to
I 376 Toll;PA 51
destination:ref:to
I 376 Toll;PA 51
destination:ref:to
PA 51;I 376 Toll
destination:ref:to
PATP
destination:ref:to
I 376
destination:ref:to
PATP
destination:ref:to
I 376 Toll
destination:ref:to
I 376 Toll
{'lower': 116841, 'lower_colon': 61630, 'other': 3080, 'problemchars': 0}


In [16]:
# Finding unique k (tag attrib['k']) and count

def unique_keys(filename):
    distinct_keys=[]
    count=1
    #loop and count unique elements
    EL=get_element(filename, tags=('node', 'way', 'relation'))
    for element in EL:
        if element.tag=='node' or element.tag=='way':
            for tag in element.iter('tag'):
                if tag.attrib['k'] not in distinct_keys:
                    distinct_keys.append(tag.attrib['k'])
                    count+=1
    distinct_keys.sort()
    print("Total of unique keys is {}".format(count))
    
    return distinct_keys
      
    pprint.pprint(distinct_keys)
    
                
unique_keys(OSM_FILE)  # Using real file as input to audit the addr:street key

Total of unique keys is 430


['FIXME',
 'NHS',
 'abandoned:amenity',
 'abandoned:railway',
 'access',
 'addr:city',
 'addr:country',
 'addr:housename',
 'addr:housenumber',
 'addr:postcode',
 'addr:state',
 'addr:street',
 'addr:unit',
 'admin_level',
 'aerodrome:type',
 'aeroway',
 'air_conditioning',
 'airmark',
 'alt_name',
 'amenity',
 'amenity_1',
 'area',
 'atm',
 'backrest',
 'bar',
 'barrier',
 'basin',
 'beacon:code',
 'beacon:frequency',
 'beacon:type',
 'bicycle',
 'bicycle_parking',
 'board_type',
 'boat',
 'boundary',
 'brand',
 'brand:wikidata',
 'brand:wikipedia',
 'brewery',
 'bridge',
 'bridge:name',
 'building',
 'building:levels',
 'building:levels:underground',
 'building:material',
 'building:part',
 'building_1',
 'bus',
 'cables',
 'capacity',
 'capacity:disabled',
 'capacity:parent',
 'capacity:women',
 'category',
 'census:population',
 'centre_turn_lane',
 'communication:mobile_phone',
 'construction',
 'contact:phone',
 'contact:website',
 'content',
 'covered',
 'craft',
 'created_by',


In [27]:
# Auditing Street Names
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons","Freeway","Circle","Strand","Sterling","Way","Highway",
            "Terrace","South","East","West","North"]

# Here are the corrections for different road/street types, Referenced from Udacity Course & GitHub; 
# I also added a few I noticed in my listings.
mapping = {
            " St ": " Street ",
            " St.": " Street ",
            " Rd.": " Road ",
            " Rd ": " Road ",
            " Rd": " Road ",
            " Ave ": " Avenue ", 
            " Ave.": " Avenue ",
            " Av ": " Avenue ", 
            " Dr ": " Drive ",
            " Dr.": " Drive",
            " Blvd ": " Boulevard ",
            " Blvd": " Boulevard",
            " Blvd.": " Boulevard",
            " Ct ": " Centre ",
            " Ctr": " Centre",
            " Pl ": " Place ",
            " Ln ": " Lane ",
            " Cir ": " Circle ",
            " Wy": " Way ",
            " Tpke ": " Turnpike ",
            " S ": " South ",
            " E ": " East ",
            " W ": " West ",
            " N ": "North"
}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(filename):
    f = open(filename, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
            elem.clear()        
    f.close()
    return street_types

def update_name(name, mapping):
    for key,value in mapping.items():
        if key in name:
            return name.replace(key,value)
    return name        

st_types = audit(OSM_FILE)

pprint.pprint(dict(st_types))
for st_type, ways in st_types.items():
    for name in ways:
        better_name = update_name(name, mapping)
        print (name, "=>", better_name)

{'151': {'State Route 151'},
 '18': {'PA 18', 'State Route 18'},
 '30': {'State Route 30'},
 'Blvd': {'Elmwood Blvd'},
 'Center': {'Quaker Village Shopping Center'},
 'Centre': {'Chippewa Towne Centre', 'Chippewa Town Centre'},
 'Extended': {'Crescent Boulevard Extended',
              'Maple Street Extended',
              'West End Avenue Extended'},
 'Extension': {'Hookstown Road Extension'},
 'Oaks': {'Colonial Oaks'},
 'Rd': {'Kane Rd'},
 'Rossway': {'Rossway'},
 'rd': {'Broadhead rd'}}
Quaker Village Shopping Center => Quaker Village Shopping Center
State Route 30 => State Route 30
Colonial Oaks => Colonial Oaks
PA 18 => PA 18
State Route 18 => State Route 18
Chippewa Towne Centre => Chippewa Towne Centre
Chippewa Town Centre => Chippewa Town Centre
Kane Rd => Kane Road 
Rossway => Rossway
Maple Street Extended => Maple Street Extended
Crescent Boulevard Extended => Crescent Boulevard Extended
West End Avenue Extended => West End Avenue Extended
Broadhead rd => Broadhead rd
Hooks

NameError: name 'match_two_colons' is not defined

## References
1. Github
2. stackoverflow
3. Udacity course materials