# OSM PROJECT CODE

In [2]:
# import required modules & functions

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint



#### Create sample file on k=10 (1 in 10) element sampling

In [1]:
# To create a (reduced size) sample file. CODE from Udacity

import xml.etree.cElementTree as ET  

OSM_FILE = "swlondon.osm"  
SAMPLE_FILE = "swlondon_sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [3]:

# An initial audit of street types, to see what types of names are present.

#osm_file = open("swlondon_sample.osm", "r")
osm_file = open("swlondon.osm", "r")

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    #print street_name
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type] += 1

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print ("%s: %d" % (k, v) )

def is_street_name(elem):
    
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit():
    #n = 0
    for event, elem in ET.iterparse(osm_file):
        #if n>20: break
        if is_street_name(elem):
            #n+=1
            audit_street_type(street_types, elem.attrib['v'])    
    print_sorted_dict(street_types)    

if __name__ == '__main__':
    audit()

Approach: 3
Ave: 2
Avenue: 152
Billet: 1
Boltons: 2
Boulevard: 1
Bridge: 2
Broadway: 17
Castelnau: 8
Causeway: 1
Centre: 10
Chase: 2
Close: 79
Common: 1
Copse: 3
Court: 6
Crescent: 16
Crossway: 1
Cutting: 1
Danesfield: 3
Deep: 2
Drive: 37
East: 1
Elms: 1
Embankment: 3
Estate: 2
Fields: 3
Gardens: 76
Glade: 2
Green: 11
Grove: 21
Heath: 1
Heathside: 1
Heights: 1
Hill: 67
Hillcrest: 9
Horseshoe: 1
House: 2
Lane: 106
Lawns: 1
Lodge: 1
Mall: 8
Mansions: 1
Meadows: 1
Meadway: 1
Mews: 7
Moorings: 1
North: 1
Parade: 9
Park: 5
Parkside: 1
Pier: 1
Place: 45
Quadrant: 3
Rd: 2
Ridgeway: 1
Rise: 4
Road: 1092
Side: 5
South: 5
Square: 24
Street: 158
Summerstown: 1
Sweep: 5
Terrace: 11
Verralls: 1
View: 1
Villas: 1
Walk: 8
Walton: 2
Way: 36
West: 38
Wharf: 1
Wood: 2
Woodside: 5


In [5]:
# Does a count of element types in osm data file

#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'

# NOTE use n count and break to test script on short sequences before running on full file


tag_types = {}
context = ET.iterparse(OSM_FILE, events=('end',))
#_, root = next(context)
n = 0
for event, elem in context:
    n+=1
    #if n>2000: break
    #print (event, elem)
    if event == 'end': 
        tag_types[elem.tag] = tag_types.get(elem.tag,0)+1

            
print 'Final n= ',n,'\n\n',tag_types


Final n=  3475775 

{'node': 1100871, 'member': 99686, 'nd': 1437122, 'tag': 658995, 'note': 1, 'meta': 1, 'relation': 4117, 'way': 174981, 'osm': 1}


#### Tag Types

Check the"k" value for each "<tag>" and see if there are any potential problems.

Udacity provided  3 regular expressions to check for certain patterns
in the tags. 
(As in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}})

So, we have to see if we have such tags, and if we have any tags with
problematic characters.



In [6]:
OSM_FILE = 'swlondon_sample.osm' 
#OSM_FILE = 'swlondon.osm'

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)+:([a-z]|_)+')        #(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'\"\?%#$@\,\. \t\r\n]')

naptan = re.compile(r'^(naptan:|Naptan:)[a-zA-Z]*$')


# NOTE use n count and break to test script on short sequences before running on full file


def key_type(element, keys):
    n=0
    #print (element, keys)
    #print ('before tag test',element.attrib)
    if element.tag == "tag":
        if naptan.search(element.attrib['k']):
            keys['naptan'] +=1
            #print 'found naptan:', element.attrib['k']
            
        elif lower.search(element.attrib['k']):
            #print 'found lower:', element.attrib['k']
            keys['lower'] +=1
            
        elif lower_colon.search(element.attrib['k']):
            #print 'found lower_colon:', element.attrib['k']
            keys['lower_colon'] +=1
            
        elif problemchars.search(element.attrib['k']):
            #print 'found problemchars:', element.attrib['k']
            keys['problemchars'] +=1
        else:
            #print 'found other:', element.attrib['k'].lower()
            keys['other']+=1
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0, 'naptan':0}
    n=0
    for _, element in ET.iterparse(filename):
        n+=1
        #if n>20000: break
        #print (n, _, element)
        keys = key_type(element, keys)

    return keys



if __name__ == "__main__":
    keys = process_map(OSM_FILE)
    pprint.pprint(keys)

{'lower': 51558,
 'lower_colon': 10251,
 'naptan': 3971,
 'other': 104,
 'problemchars': 0}


In [28]:
#Exploring Users

#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'


def get_user(element):
    if 'uid' in element.attrib.keys():
        #print n, 'has uid', element.attrib['uid']
        return element.attrib['uid']
    else:
        #print n, 'no uid attrib', element.tag, element.attrib
        return None

# NOTE use n count and break to test script on short sequences before running on full file

def process_map(filename):
    users = set()
    n=0
    for _, element in ET.iterparse(filename):
        n+=1
        #if n>20000: break
        element_uid = get_user(element)
        # only add to users set if there is a UID value
        if element_uid:
            users.add(element_uid)


    print len(users), '\n'
    #pprint.pprint(users)

    return (users)






if __name__ == "__main__":
    #test()
    process_map(OSM_FILE)

2128 



(LHR): 1
11: 1
1EG: 1
218: 1
24: 1
9: 1
Airport: 2
Apartments: 1
Approach: 26
Ave: 10
Avenue: 1533
Avenuen: 1
Bank: 9
Basin: 1
Billet: 2
Boltons: 29
Boulevard: 8
Bridge: 30
Broadway: 149
Buildings: 1
Byeways: 1
Bypass: 1
Castelnau: 66
Causeway: 4
Centre: 80
Chase: 9
Chelsea: 1
Close: 855
Common: 20
Copse: 13
Corner: 1
Cottages: 4
Court: 71
Crescent: 158
Croft: 1
Cross: 1
Crossway: 12
Cutting: 4
Danesfield: 36
Deep: 4
Downs: 2
Downside: 1
Draycott: 1
Drive: 456
East: 24
Elmers: 2
Elms: 16
Embankment: 22
End: 1
Estate: 20
Farm: 1
Fields: 15
Foleys: 1
Frogmore: 1
Gardens: 759
Gate: 8
Glade: 17
Green: 114
Grove: 267
Hall: 1
Heath: 11
Heathside: 5
Heights: 5
Hill: 614
Hillcrest: 101
Horseshoe: 2
House: 16
Hurstbourne: 4
Hythe: 2
Island: 3
Lane: 1124
lane: 1
Lanes: 1
Lawns: 3
Lodge: 4
Mall: 60
Mansions: 3
Market: 5
Mead: 1
Meadows: 6
Meadway: 3
Mews: 90
Mills: 1
Moorings: 6
North: 9
Parade: 72
Park: 46
park: 1
Parkshot: 2
Parkside: 3
Path: 9
Pier: 1
Place: 397
Pleasant: 1
Quadrant: 40
Quay: 2
Rd: 11
Rd): 1
Reach: 2
Retreat: 7
Ride: 3
Ridge: 2
Ridgeway: 6
Ridgway: 7
Rise: 73
Riverbank: 1
Riverside: 7
ROAD: 1
road: 1
Road: 10965
Row: 18
Services: 1
Sheen: 2
Shortlands: 4
Shotfield: 1
Side: 42
South: 34
Square: 285
St: 1
Stables: 2
Station: 1
Stepgates: 3
street: 1
Street: 1510
Streets: 2
Strreet: 1
Summerstown: 2
Sweep: 40
Terrace: 152
Thameside: 1
Town: 1
Towpath: 1
Triangle: 1
Vale: 4
Verralls: 5
View: 6
Villas: 9
Vineyard: 2
Walk: 61
Walton: 27
Way: 352
West: 326
Wharf: 2
Whitton: 1
Willowbank: 1
Wimbledon: 1
Wood: 10
Woodside: 64
Yeldham: 1

In [7]:
# Auditing Street Names


#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ['Approach','Avenue','Bank','Boulevard','Bridge','Broadway','Buildings','Causeway','Centre',
            'Chase','Close','Common','Copse','Corner','Cottages','Court','Crescent','Croft',
           'Crossway','Cutting','Deep','Drive','East', 'Embankment','Gardens','Green','Grove','Heath','Hill',
            'Heights','Lane','Mall','Meadows','Mews','North','Path','Parade','Park','Place','Quadrant','Quay',
            'Rise','Road','Row', 'South','Square','Street','Terrace','Vale','Villas','Walk','Way','West']



# UPDATE THIS VARIABLE
mapping = { 'St': 'Street',
            'St.': 'Street',
            'Strreet': 'Street',
            'street': 'Street',
            'Rd.':'Road',
            'Rd' : 'Road',
            'ROAD':'Road',
            'road':'Road',
            'Ave':'Avenue',
            'Avenuen':'Avenue',
            'lane': 'Lane',
            'park':'Park'           
            } # etc. to be updated


problem_street_names = ['11', '218','24', 'Rectory Grove Hampton TW12 1EG','Fulham Road, Chelsea',
                        'Sheffield Rd, Heathrow Airport (LHR)','Beacon Rd (Entrance Sanctuary Rd)',
                        'Wimbledon']
    
change_list_mapping = { 'Rectory Grove Hampton TW12 1EG': 'Rectory Grove',
                        'Sheffield Rd, Heathrow Airport (LHR)': 'Sheffield Road',
                        'Beacon Rd (Entrance Sanctuary Rd)': 'Beacon Road',
                        'Wimbledon' : 'Wimbledon Hill Road'}


drop_list = ['11', '218','24']
    
    

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def is_problem_name(street_name, problems):
    return (street_name in problems)

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    n=0
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        n+=1
        #if n>20000: break
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter('tag'):
                if is_street_name(tag):
                    # Used code below to examine 'problem' street names in detail
                    #if is_problem_name(tag.attrib['v'], problem_street_names):

                        #print '\n',elem.tag
                        #for tag in elem.iter():
                            #print tag.attrib
                            #continue
                        #continue
                    
                    audit_street_type(street_types, tag.attrib['v'])
                    
                    
    osm_file.close()
    return street_types


def update_name(name, mapping):
    #print (name.split()[-1])
    if name in drop_list:
        return 'DROP TAG'
    if name in change_list_mapping:
        name = change_list_mapping[name]
        return name
    name_list = name.split()
    try:
        name_list[-1] = mapping[name_list[-1]]
        name_list[0] = name_list[0].title()
        return ' '.join(name_list)
    except:
        return name


def propose_name(st_types):
    #st_types = audit(OSM_FILE)
    #assert len(st_types) == 3
    #pprint.pprint(dict(st_types))
    for st_type, ways in st_types.items():
        for name in ways:
            better_name = update_name(name, mapping)
            if better_name != name:
                print (name, "=>", better_name)
            #if name == "West Lexington St.":
            #    assert better_name == "West Lexington Street"
            #if name == "Baldwin Rd.":
            #    assert better_name == "Baldwin Road"


if __name__ == '__main__':
    #test()
    st_types = audit(OSM_FILE)
    propose_name(st_types)
    print '\n\n'
    #pprint.pprint(dict(st_types))

('218', '=>', 'DROP TAG')
("St John's Rd", '=>', "St John's Road")
('Spur Rd', '=>', 'Spur Road')
('Cobham Rd', '=>', 'Cobham Road')
('Warwick Rd', '=>', 'Warwick Road')
('Ewell Rd', '=>', 'Ewell Road')
('Kew Bridge Rd', '=>', 'Kew Bridge Road')
('24', '=>', 'DROP TAG')
('WALTON ROAD', '=>', 'Walton Road')
('Rectory Grove Hampton TW12 1EG', '=>', 'Rectory Grove')
('Kingston Avenuen', '=>', 'Kingston Avenue')
('sydney street', '=>', 'Sydney Street')
('Beacon Rd (Entrance Sanctuary Rd)', '=>', 'Beacon Road')
('Sheffield Rd, Heathrow Airport (LHR)', '=>', 'Sheffield Road')
('Kings road', '=>', 'Kings Road')
('Langdon park', '=>', 'Langdon Park')
('Eden St', '=>', 'Eden Street')
('Ebury Strreet', '=>', 'Ebury Street')
('11', '=>', 'DROP TAG')
('Quintin Ave', '=>', 'Quintin Avenue')
('Baywillow Ave', '=>', 'Baywillow Avenue')
('Wimbledon', '=>', 'Wimbledon Hill Road')
('Town lane', '=>', 'Town Lane')





**NOTE all problem names now either have fix, or will drop**


Street names still to fix (working list)
```
'11': set(['11']), - ignore / drop
 '1EG': set(['Rectory Grove Hampton TW12 1EG']), -- CHANGE to Rectory Grove
 '218': set(['218']) - ignore / drop
 '24': set(['24']) - ignore / drop
 
'Chelsea': set(['Fulham Road, Chelsea']) -- CHANGE to Fulham Road

'LHR)': set(['Sheffield Rd, Heathrow Airport (LHR)']), -- CHANGE to Sheffield Road

'Rd)': set(['Beacon Rd (Entrance Sanctuary Rd)']), -- CHANGE to Beacon Road

REMOVED FROM PROBLEM LIST 'Ridgway': set(['Ridgway']), -- to 'The Ridgway' _ Correct sp?

REMOVED FROM PROBLEM LIST'Station': set(['Richmond Station']), - leave -platform address

REMOVED FROM PROBLEM LIST 'Walton': set(['The Heart of Walton']), -- leave - shopping centre

REMOVED FROM PROBLEM LIST'Whitton': set(['High Street Whitton']), Correct!

'Wimbledon': set(['Wimbledon']), - CHANGE to Wimbledon Hill Road
```


#### Counting tag k: attribs

a look at all the 'k' attribs in tags to look for problems.

In [9]:
#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'


def process_map(filename):
    tag_ks = defaultdict(int)
    n=0
    for _, element in ET.iterparse(filename):
        n+=1
        #if n>20000: break
        #print (n, _, element)
        if element.tag == "tag":
            tag_ks[element.attrib['k']] += 1


    return tag_ks


if __name__ == "__main__":
    tag_keys = process_map(OSM_FILE)
    pprint.pprint(sorted(tag_keys.items() , key=lambda x: x[1], reverse=True))

[('highway', 78659),
 ('building', 77127),
 ('name', 59270),
 ('source', 39911),
 ('created_by', 30980),
 ('addr:housenumber', 25247),
 ('addr:street', 21598),
 ('lit', 15215),
 ('amenity', 13599),
 ('maxspeed', 13555),
 ('source:name', 11109),
 ('surface', 10187),
 ('ref', 9130),
 ('oneway', 9026),
 ('barrier', 8903),
 ('natural', 8163),
 ('naptan:verified', 7254),
 ('landuse', 6987),
 ('foot', 6895),
 ('addr:postcode', 6864),
 ('addr:city', 6008),
 ('access', 5775),
 ('operator', 5603),
 ('service', 5484),
 ('naptan:AtcoCode', 5467),
 ('bicycle', 5356),
 ('naptan:CommonName', 5326),
 ('naptan:Bearing', 5323),
 ('shop', 5048),
 ('naptan:Indicator', 4412),
 ('building:levels', 4294),
 ('leisure', 4150),
 ('type', 4127),
 ('lanes', 4121),
 ('naptan:Street', 3935),
 ('layer', 3266),
 ('sidewalk', 3162),
 ('railway', 2931),
 ('crossing', 2781),
 ('local_ref', 2554),
 ('footway', 2447),
 ('bridge', 2442),
 ('naptan:StopAreaType', 2127),
 ('naptan:StopAreaCode', 2127),
 ('website', 2117),
 

 ('circuits', 9),
 ('council_style', 9),
 ('recycling:magazines', 8),
 ('crop', 8),
 ('access:conditional', 8),
 ('contact:twitter', 8),
 ('building:roof:shape', 8),
 ('beauty', 8),
 ('telecom', 8),
 ('industrial', 8),
 ('oneway:bus', 8),
 ('map_type', 8),
 ('second_hand', 8),
 ('support', 8),
 ('source:ref:lau:1', 8),
 ('paved', 8),
 ('disused:railway', 8),
 ('healthcare:speciality', 8),
 ('trade', 8),
 ('name:es', 8),
 ('fuel:diesel', 8),
 ('surveillance:zone', 8),
 ('source:old_name', 8),
 ('fuel:octane_95', 8),
 ('cycle_network', 8),
 ('passenger_information_display', 8),
 ('authority', 8),
 ('source:oneway', 8),
 ('ref:lau:1', 8),
 ('source:alt_name', 7),
 ('military', 7),
 ('handrail', 7),
 ('opening_date', 7),
 ('colour:text', 7),
 ('placement:backward', 7),
 ('inscription', 7),
 ('colour:arrow', 7),
 ('camera:mount', 7),
 ('lastcheck', 7),
 ('fhrs:confidence_management', 7),
 ('length', 7),
 ('aerialway', 7),
 ('name:he', 7),
 ('fhrs:hygiene', 7),
 ('mkgmap:flare-check', 7),
 (

A look at k = 'types' in tags - to see if they are overwriting any 'higher up' type labels.

I can't see any that are.

In [69]:
#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'


def is_tag_k_type(elem):
    return (elem.attrib['k'] == "type")


def audit(osmfile):
    osm_file = open(osmfile, "r")

    n=0
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        n+=1
        #if n>2000000: break
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter('tag'):
                if is_tag_k_type(tag):
                    print elem.attrib
                    for tag in elem.iter():
                        print tag.attrib
                        
    osm_file.close()
    return 







if __name__ == '__main__':
    #test()
    audit(OSM_FILE)


{'changeset': '40429617', 'uid': '4198936', 'timestamp': '2016-07-01T22:43:24Z', 'lon': '-0.2521358', 'version': '2', 'user': 'ccityplanner12', 'lat': '51.4728551', 'id': '298847546'}
{'changeset': '40429617', 'uid': '4198936', 'timestamp': '2016-07-01T22:43:24Z', 'lon': '-0.2521358', 'version': '2', 'user': 'ccityplanner12', 'lat': '51.4728551', 'id': '298847546'}
{'k': 'description', 'v': 'This is a private house which was once inhabited by the composer Gustav Holst.'}
{'k': 'historic', 'v': 'memorial'}
{'k': 'name', 'v': "Holst's House"}
{'k': 'type', 'v': 'Blue Plaque'}
{'changeset': '1705558', 'uid': '7630', 'timestamp': '2009-07-01T23:15:05Z', 'lon': '-0.3885019', 'version': '1', 'user': 'LivingWithDragons', 'lat': '51.4047157', 'id': '432011061'}
{'changeset': '1705558', 'uid': '7630', 'timestamp': '2009-07-01T23:15:05Z', 'lon': '-0.3885019', 'version': '1', 'user': 'LivingWithDragons', 'lat': '51.4047157', 'id': '432011061'}
{'k': 'description', 'v': 'Moved 2m North in June 199

{'changeset': '4491419', 'uid': '6035', 'timestamp': '2010-04-21T22:10:43Z', 'version': '1', 'user': 'Sorbus_x_kewensis', 'id': '56046726'}
{'changeset': '4491419', 'uid': '6035', 'timestamp': '2010-04-21T22:10:43Z', 'version': '1', 'user': 'Sorbus_x_kewensis', 'id': '56046726'}
{'ref': '703504888'}
{'ref': '703504889'}
{'ref': '703504890'}
{'ref': '703504891'}
{'ref': '703504888'}
{'k': 'building', 'v': 'yes'}
{'k': 'name', 'v': 'Kew Cricket Club'}
{'k': 'type', 'v': 'pavilion'}
{'changeset': '32983952', 'uid': '181135', 'timestamp': '2015-07-30T15:35:55Z', 'version': '3', 'user': 'Manu1400', 'id': '57836783'}
{'changeset': '32983952', 'uid': '181135', 'timestamp': '2015-07-30T15:35:55Z', 'version': '3', 'user': 'Manu1400', 'id': '57836783'}
{'ref': '717553843'}
{'ref': '717553844'}
{'k': 'man_made', 'v': 'pipeline'}
{'k': 'type', 'v': 'gas'}
{'changeset': '47245678', 'uid': '322039', 'timestamp': '2017-03-29T01:28:36Z', 'version': '2', 'user': 'MacLondon', 'id': '188617701'}
{'change

#### Looking at Postcodes

In [10]:
#OSM_FILE = 'swlondon_sample.osm' 
OSM_FILE = 'swlondon.osm'

code_ingroup_re = re.compile(r'^[a-zA-Z]', re.IGNORECASE)


expected = []


#splits  postcode into outgroup & ingroup, adds ingroups to a 'set' of ingroups for each outgroup
#adds count to occurances of outgroup

def audit_postcode(postcode_outgroups, postcodes, code):
    # test on ';' added to deal with postcode data point containing ';' separated list
    if ';' in code:
        code = code.split(';')[0]
    code_sep = code.split()
    code_outgroup = code_sep[0]
    postcode_outgroups[code_outgroup] +=1
    if len(code_sep) == 2:
            postcodes[code_sep[0]].add(code_sep[1])
            return
    # code below purely to deal with one example found of multiple(2) postcodes separated by ;
    if len(code_sep)>2:
        print 'still have code_sep >2', code_sep
        code_1 = code.split(';')[0]
        code_sep = code_1.split()
        code_outgroup = code_sep[0]
        postcode_outgroups[code_outgroup] +=1
        postcodes[code_sep[0]].add(code_sep[1])
        code_2 = code.split(';')[1]
        code_sep = code_2.split()
        code_outgroup = code_sep[0]
        postcode_outgroups[code_outgroup] +=1
        postcodes[code_sep[0]].add(code_sep[1])
        return
        

def is_postcode(elem):
    return (elem.attrib['k'] == "addr:postcode")


# audits postcodes
# counts outgroup occurances, and builds set of ingroups for each outgroup
def audit(osmfile):
    osm_file = open(osmfile, "r")
    postcodes = defaultdict(set)
    postcode_outgroups = defaultdict(int)
    n=0
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        n+=1
        #if n>200000: break
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter('tag'):
                if is_postcode(tag):
                    audit_postcode(postcode_outgroups,postcodes, tag.attrib['v'])
    osm_file.close()
    return postcode_outgroups, postcodes



if __name__ == '__main__':

    postcode_outgroups, postcodes = audit(OSM_FILE)

    
    print len(postcode_outgroups),'postcode outgroups in data (with counts of occurance) \n'
    pprint.pprint(sorted(postcode_outgroups.items()))
    
    # to print out the full 'sets' of ingroups for each outgroup
    #pprint.pprint(sorted(postcodes.items()))

86 postcode outgroups in data (with counts of occurance) 

[('CR4', 44),
 ('CR5', 4),
 ('GU21', 7),
 ('GU22', 12),
 ('GU23', 142),
 ('GU4', 6),
 ('KT1', 90),
 ('KT10', 35),
 ('KT11', 43),
 ('KT12', 31),
 ('KT13', 149),
 ('KT14', 9),
 ('KT15', 20),
 ('KT16', 83),
 ('KT17', 22),
 ('KT18', 13),
 ('KT19', 284),
 ('KT2', 76),
 ('KT20', 18),
 ('KT21', 29),
 ('KT22', 75),
 ('KT23', 7),
 ('KT24', 11),
 ('KT3', 64),
 ('KT4', 21),
 ('KT5', 6),
 ('KT6', 97),
 ('KT7', 6),
 ('KT8', 29),
 ('KT9', 25),
 ('RH1', 2),
 ('RH2', 1),
 ('RH4', 1),
 ('RH5', 1),
 ('SL3', 2),
 ('SM1', 54),
 ('SM2', 21),
 ('SM3', 12),
 ('SM4', 29),
 ('SM5', 15),
 ('SM6', 28),
 ('SM7', 11),
 ('SW10', 28),
 ('SW11', 693),
 ('SW12', 50),
 ('SW13', 141),
 ('SW14', 220),
 ('SW15', 221),
 ('SW17', 103),
 ('SW18', 186),
 ('SW19', 501),
 ('SW1W', 19),
 ('SW1X', 2),
 ('SW20', 805),
 ('SW3', 167),
 ('SW4', 12),
 ('SW5', 73),
 ('SW6', 158),
 ('SW7', 25),
 ('SW8', 16),
 ('TW1', 186),
 ('TW10', 467),
 ('TW11', 77),
 ('TW12', 137),
 ('TW13',

In [53]:
# to print out the full 'sets' of ingroups for each outgroup
pprint.pprint(sorted(postcodes.items()))

[('CR4',
  set(['1SD',
       '1SF',
       '1YG',
       '2AJ',
       '2DJ',
       '2DZ',
       '2HZ',
       '2JA',
       '2JD',
       '2JS',
       '2LF',
       '2NT',
       '2PF',
       '3AA',
       '3AF',
       '3BE',
       '3EB',
       '3ED',
       '3FH',
       '3GD',
       '3HD',
       '3HG',
       '3HS',
       '3LA',
       '3LD',
       '3ND',
       '3NH',
       '3NN',
       '3PQ',
       '4BG',
       '4BJ',
       '4HB',
       '4HR',
       '4LA',
       '4XU'])),
 ('CR5', set(['3AL', '3LN', '3NP', '3QW'])),
 ('GU21', set(['4YH', '5NL', '5QE', '5RF', '5SH', '5UB'])),
 ('GU22', set(['8AR', '8AX', '8HA', '8SP', '9AL', '9DL', '9JX'])),
 ('GU23',
  set(['6AA',
       '6AL',
       '6BN',
       '6EZ',
       '6JQ',
       '6JS',
       '6JX',
       '6JY',
       '6JZ',
       '6LE',
       '6LJ',
       '6LP',
       '6LS',
       '6NE',
       '6QB',
       '6QS',
       '7BP',
       '7BS',
       '7EN',
       '7ER',
       '7ET',
       '7EZ',
       '

       '8SY',
       '8TE',
       '9AA',
       '9AG',
       '9AU',
       '9AZ',
       '9BE',
       '9BN',
       '9BP',
       '9BS',
       '9BW',
       '9ET',
       '9EY',
       '9EZ',
       '9RG',
       '9SG',
       '9SS',
       '9SY'])),
 ('SW13',
  set(['0AP',
       '0BE',
       '0DB',
       '0DG',
       '0JP',
       '0NE',
       '0NR',
       '0NY',
       '0PQ',
       '0PW',
       '0PY',
       '0PZ',
       '8EL',
       '9AE',
       '9DH',
       '9DW',
       '9ED',
       '9ER',
       '9ET',
       '9EW',
       '9HE',
       '9HJ',
       '9HQ',
       '9HR',
       '9JS',
       '9JT',
       '9LB',
       '9LD',
       '9LW',
       '9NR',
       '9PY',
       '9QE',
       '9QF',
       '9QG',
       '9QL',
       '9QN',
       '9RU'])),
 ('SW14',
  set(['7AF',
       '7DE',
       '7ED',
       '7EH',
       '7ET',
       '7EW',
       '7EX',
       '7EZ',
       '7HJ',
       '7JE',
       '7JG',
       '7JN',
       '7JR',
       '7JT',
       '

In [115]:

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_FILE = 'swlondon_sample.osm' 
#OSM_FILE = 'swlondon.osm'

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

#SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']



# CLEANUP FUNCTIONS BELOW:








# A function to shape the tags and add to the relevant dict objects.
# Used for both nodes and way tags - as treatment is the same.
# Takes list of desired tag fields (from NODE_TAGS_FIELDS)
# For id (an attrb from parent node or way and NOT in tag), parent node / way id is returned
# for 'key' checks if: 
#                      a PROBLEMCHARS match, and ignores / drops
#                      a LOWER_COLON match and splits, extracts / sets type (see rules above) 
#                                        and returns rest as key
#                       (NOTE - type is 'regular' if no other found)


def shape_tags(element, id_number, tag_attr_fields = NODE_TAGS_FIELDS,
                   problem_chars=PROBLEMCHARS, lower_colon=LOWER_COLON,
                   default_tag_type='regular'):
    tag_dict={}
    for field in tag_attr_fields:
        if field == 'id':
            tag_dict[field]= id_number
        elif field == 'key':
            tag_dict['type'] = default_tag_type
            if problem_chars.search(element.attrib['k']):
                print 'IGNORING A TAG - PROBLEM CHARS', element.attrib['k']
            elif lower_colon.search(element.attrib['k']):
                split_k =  element.attrib['k'].split(':')
                tag_dict['type'] = split_k[0]
                tag_dict[field] = ':'.join(split_k[1:])

            else:
                tag_dict[field]= element.attrib['k']
        elif field == 'value':
            tag_dict[field]= element.attrib['v']
      
    return tag_dict




def shape_way_nodes(element, id_number, position, way_node_fields=WAY_NODES_FIELDS):
    way_node_dict = {}
    for field in way_node_fields:
        if field == 'id':
            way_node_dict[field]= id_number
        elif field == 'position':
            way_node_dict[field]= position
        elif field == 'node_id':
            way_node_dict[field]= element.attrib['ref']
    return way_node_dict
            


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    if element.tag == 'node':
        for field in node_attr_fields:
            node_attribs[field]= element.attrib[field]
            id_number = element.attrib['id']
        for tag in element.iter('tag'):
            tags.append(shape_tags(tag, id_number))            
        return {'node': node_attribs, 'node_tags': tags}
        
    elif element.tag == 'way':
        for field in way_attr_fields:
            way_attribs[field]= element.attrib[field]
            id_number = element.attrib['id']        
        for tag in element.iter('tag'):
            tags.append(shape_tags(tag, id_number))
        position = 0
        for way_node in element.iter('nd'):
            way_nodes.append(shape_way_nodes(way_node, id_number, position))
            position += 1     
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()





def process_map_simple(file_in, validate):
    #validator = cerberus.Validator()
    n=0
    for element in get_element(file_in, tags=('node', 'way')):
            n+=1
            if n>200: break
            el = shape_element(element)
            pprint.pprint (el)
            print '\n'
            #if el:
            #    if validate is True:
            #        validate_element(el, validator)

                    
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map_simple(OSM_FILE, validate=False)

{'node': {'changeset': '43286497',
          'id': '109642',
          'lat': '51.4835856',
          'lon': '-0.1673611',
          'timestamp': '2016-10-30T16:09:18Z',
          'uid': '322039',
          'user': 'MacLondon',
          'version': '10'},
 'node_tags': []}


{'node': {'changeset': '36747072',
          'id': '109656',
          'lat': '51.4884895',
          'lon': '-0.1893827',
          'timestamp': '2016-01-22T20:02:16Z',
          'uid': '352985',
          'user': 'ecatmur',
          'version': '6'},
 'node_tags': []}


{'node': {'changeset': '16703076',
          'id': '109896',
          'lat': '51.4907726',
          'lon': '-0.2132303',
          'timestamp': '2013-06-25T18:13:57Z',
          'uid': '1016290',
          'user': 'Amaroussi',
          'version': '8'},
 'node_tags': []}


{'node': {'changeset': '24802132',
          'id': '109917',
          'lat': '51.4872140',
          'lon': '-0.2546853',
          'timestamp': '2014-08-17T08:46:16Z',
     

 'node_tags': []}


{'node': {'changeset': '17839492',
          'id': '292524',
          'lat': '51.4624600',
          'lon': '-0.1846809',
          'timestamp': '2013-09-14T20:30:53Z',
          'uid': '1016290',
          'user': 'Amaroussi',
          'version': '4'},
 'node_tags': []}


{'node': {'changeset': '535585',
          'id': '292575',
          'lat': '51.4756994',
          'lon': '-0.1730585',
          'timestamp': '2007-09-29T14:03:10Z',
          'uid': '4049',
          'user': 'randomjunk',
          'version': '1'},
 'node_tags': []}


{'node': {'changeset': '10175642',
          'id': '292608',
          'lat': '51.4297454',
          'lon': '-0.2675723',
          'timestamp': '2011-12-21T21:43:25Z',
          'uid': '16703',
          'user': 'joelholdsworth',
          'version': '2'},
 'node_tags': []}


{'node': {'changeset': '3979479',
          'id': '292630',
          'lat': '51.4211655',
          'lon': '-0.2863507',
          'timestamp': '2010-02