In [1]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
import pymongo

In [2]:
import os
datadir = "C:\Users\sampa\Desktop\Other\Udacity\Lesson 4"
datafile = "bangalore.osm"
cal_data = os.path.join(datadir, datafile)

In [3]:
def count_tags(filename):
        tags = {}
        for event, elem in ET.iterparse(filename):
            if elem.tag in tags: 
                tags[elem.tag] += 1
            else:
                tags[elem.tag] = 1
        return tags
cal_tags = count_tags(cal_data)
pprint.pprint(cal_tags)

{'bounds': 1,
 'member': 6000,
 'nd': 3683187,
 'node': 2986688,
 'osm': 1,
 'relation': 946,
 'tag': 816117,
 'way': 662747}


In [4]:
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        for tag in element.iter('tag'):
            k = tag.get('k')
            if lower.search(k):
                keys['lower'] += 1
            elif lower_colon.search(k):
                keys['lower_colon'] += 1
            elif problemchars.search(k):
                keys['problemchars'] += 1
            else:
                keys['other'] += 1
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

cal_keys = process_map(cal_data)
pprint.pprint(cal_keys)

{'lower': 777263, 'lower_colon': 37700, 'other': 1149, 'problemchars': 5}


In [4]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        for e in element:
            if 'uid' in e.attrib:
                users.add(e.attrib['uid'])
    return users
users = process_map(cal_data)
len(users)

1914

In [5]:
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Avenue", "Boulevard", "Commons", "Court", "Drive", "Lane", "Parkway", 
                         "Place", "Road", "Square", "Street", "Trail"]

mapping = {'Ave'  : 'Avenue',
           'Blvd' : 'Boulevard',
           'Dr'   : 'Drive',
           'Ln'   : 'Lane',
           'Pkwy' : 'Parkway',
           'Rd'   : 'Road',
           'Rd.'   : 'Road',
           'St'   : 'Street',
           'street' :"Street",
           'Ct'   : "Court",
           'Cir'  : "Circle",
           'Cr'   : "Court",
           'ave'  : 'Avenue',
           'Hwg'  : 'Highway',
           'Hwy'  : 'Highway',
           'Sq'   : "Square"}

In [6]:

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types

In [7]:
cal_street_types = audit(cal_data)


In [9]:
pprint.pprint(dict(cal_street_types))


{'02': set(['756/77/1 marathahalli -sarajpur outer ring road bellandur,banglore 02']),
 '03': set(['56,srinivas reddy building bellandur banglore 03',
            'mantri commercio tower B 5th floor outer ring road marathahalli banglore 03']),
 '048': set(['Whitefield Main Rd, Whitefield, Inner Valley, Whitefield, Bengaluru, Karnataka 560 048']),
 '1': set(['2nd Main Road, Electronic City Phase 1',
           '3rd Main,KEB Layout, BTM Layout  1',
           "4th floor, 27th Main, Above Gold's Gym, Near CPWD Complex, Sector 1",
           'Borewell Road,Phase 1',
           'Chikka Madivala BTM Layout 1',
           'EPIP Area, Road No. 1',
           'Electronic City Phase 1',
           'Shikaripalya Road, Electronic City - 1']),
 '1,': set(['18th Main Rd, Cashier Layout, 1st Stage, BTM Layout 1,',
            '1st Cross Rd, Aicobo Nagar, 1st Stage, BTM Layout 1,',
            'Chocolate Factory Rd, Cashier Layout, 1st Stage, BTM Layout 1,']),
 '100': set(['100']),
 '18th': set(['18th

In [8]:
from collections import defaultdict

def audit_zipcode(invalid_zipcodes, zipcode):
    threeDigits = zipcode[0:3]
    
    if not threeDigits.isdigit():
        invalid_zipcodes[threeDigits].add(zipcode)
    
    elif threeDigits != 560:
        invalid_zipcodes[threeDigits].add(zipcode)
        
def is_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_zip(osmfile):
    osm_file = open(osmfile, "r")
    invalid_zipcodes = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_zipcode(tag):
                    audit_zipcode(invalid_zipcodes,tag.attrib['v'])

    return invalid_zipcodes

cal_zipcode = audit_zip(cal_data)

In [9]:
pprint.pprint(dict(cal_zipcode))


{'- 5': set(['- 560001',
             '- 560011',
             '- 560027',
             '- 560034',
             '- 560051',
             '- 560068',
             '- 560076',
             '- 560094,',
             '- 560095']),
 '380': set(['380068']),
 '500': set(['500006', '500036', '500076']),
 '530': set(['530078', '530103']),
 '540': set(['540045']),
 '556': set(['5560034']),
 '560': set(['560 001',
             '560 002',
             '560 020',
             '560 025',
             '560 036',
             '560 037',
             '560 048',
             '560 052',
             '560 055',
             '560 064',
             '560 068',
             '560 077',
             '560 078',
             '560 080',
             '560 090',
             '560 091',
             '560 100',
             '5600',
             '560001',
             '5600011',
             '560001ph',
             '560002',
             '560003',
             '560003 ',
             '5600037',
             '560004'

In [11]:
def update_name(zipcode):
    testNum = re.findall('[a-zA-Z]*', zipcode)
    if testNum:
        testNum = testNum[0]
    testNum.strip()
    if testNum == "- ":
        convertedZipcode = (re.findall(r'\d+', zipcode))
        if convertedZipcode:
            if convertedZipcode.__len__() ==2:
                return (re.findall(r'\d+', zipcode))[0] + "-" +(re.findall(r'\d+', zipcode))[1]
            else:
                return (re.findall(r'\d+', zipcode))[0]

for street_type, ways in cal_zipcode.iteritems():
    for name in ways:
        better_name = update_name(name)
        print name, "=>", better_name

iam in bang => None
೫೬೦೦೬೦ => None
- 560034 => None
- 560068 => None
- 560051 => None
- 560011 => None
- 560076 => None
- 560094, => None
- 560001 => None
- 560095 => None
- 560027 => None
Bengaluru => None
635126 => None
635109 => None
560067, => None
560 001 => None
560 002 => None
560087 => None
560086 => None
560085 => None
560084 => None
560083 => None
560082 => None
560080 => None
560 080 => None
560069 => None
5601003 => None
560086, => None
560100" => None
560009 => None
560008 => None
560007 => None
560027" => None
560005 => None
560004 => None
560003 => None
560002 => None
560001 => None
560 078 => None
56066 => None
560 037 => None
560 036 => None
560070" => None
560080" => None
56085 => None
560078 => None
560079 => None
560072 => None
560073 => None
560070 => None
560071 => None
560076 => None
560077 => None
560074 => None
560075 => None
560 100 => None
56007 => None
56005 => None
5600 => None
5600011 => None
560076, => None
56077 => None
5600091 => None
560106 => None
560

In [13]:
import re
import codecs
import json

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
address_regex = re.compile(r'^addr\:')
street_regex = re.compile(r'^street')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type'] = element.tag
        # initialize empty address
        address = {}
        # parsing through attributes
        for a in element.attrib:
            if a in CREATED:
                if 'created' not in node:
                    node['created'] = {}
                node['created'][a] = element.get(a)
            elif a in ['lat', 'lon']:
                continue
            else:
                node[a] = element.get(a)
        # populate position
        if 'lat' in element.attrib and 'lon' in element.attrib:
            node['pos'] = [float(element.get('lat')), float(element.get('lon'))]

        # parse second-level tags for nodes
        for e in element:
            # parse second-level tags for ways and populate `node_refs`
            if e.tag == 'nd':
                if 'node_refs' not in node:
                    node['node_refs'] = []
                if 'ref' in e.attrib:
                    node['node_refs'].append(e.get('ref'))

            # throw out not-tag elements and elements without `k` or `v`
            if e.tag != 'tag' or 'k' not in e.attrib or 'v' not in e.attrib:
                continue
            key = e.get('k')
            val = e.get('v')

            # skip problematic characters
            if problemchars.search(key):
                continue

            # parse address k-v pairs
            elif address_regex.search(key):
                key = key.replace('addr:', '')
                address[key] = val

            # catch-all
            else:
                node[key] = val
        # compile address
        if len(address) > 0:
            node['address'] = {}
            street_full = None
            street_dict = {}
            street_format = ['prefix', 'name', 'type']
            # parse through address objects
            for key in address:
                val = address[key]
                if street_regex.search(key):
                    if key == 'street':
                        street_full = val
                    elif 'street:' in key:
                        street_dict[key.replace('street:', '')] = val
                else:
                    node['address'][key] = val
            # assign street_full or fallback to compile street dict
            if street_full:
                node['address']['street'] = street_full
            elif len(street_dict) > 0:
                node['address']['street'] = ' '.join([street_dict[key] for key in street_format])
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data
process_map(cal_data)

[{'created': {'changeset': '16957521',
   'timestamp': '2013-07-15T08:10:50Z',
   'uid': '634020',
   'user': 'user_634020',
   'version': '4'},
  'id': '17327077',
  'pos': [12.9026964, 77.5949117],
  'type': 'node'},
 {'created': {'changeset': '18611831',
   'timestamp': '2013-10-30T05:16:40Z',
   'uid': '634020',
   'user': 'user_634020',
   'version': '32'},
  'id': '17327092',
  'pos': [12.9063367, 77.5950592],
  'type': 'node'},
 {'created': {'changeset': '18598983',
   'timestamp': '2013-10-29T11:01:32Z',
   'uid': '634020',
   'user': 'user_634020',
   'version': '32'},
  'id': '17327095',
  'pos': [12.910516, 77.5987265],
  'type': 'node'},
 {'created': {'changeset': '44678289',
   'timestamp': '2016-12-26T09:39:52Z',
   'uid': '136860',
   'user': 'indigomc',
   'version': '75'},
  'highway': 'traffic_signals',
  'id': '17327106',
  'name': 'Aurobindo Circle',
  'name:kn': u'\u0c85\u0cb0\u0cb5\u0cbf\u0c82\u0ca6 \u0cb5\u0cc3\u0ca4\u0ccd\u0ca4',
  'pos': [12.9171587, 77.5858225

In [15]:
import signal
import subprocess
##pro = subprocess.Popen('mongod', preexec_fn = os.setsid)

In [16]:
from pymongo import MongoClient

db_name = 'openstreetmap'

# Connect to Mongo DB
client = MongoClient('localhost:27017')
db = client[db_name]

In [24]:

# Build mongoimport command instead of using homebrew
collection = cal_data[:cal_data.find('.')]
json_file = cal_data + '.json'

mongoimport_cmd = 'mongoimport -h 127.0.0.1:27017 ' + \
                  '--db ' + db_name + \
                  ' --collection ' + collection + \
                  ' --file ' + json_file

# Before importing, drop collection if it is already running 
if collection in db.collection_names():
    print 'Dropping collection: ' + collection
    db[collection].drop()
    
# Execute the command
print 'Executing: ' + mongoimport_cmd
subprocess.call(mongoimport_cmd.split(),shell=True)

Executing: mongoimport -h 127.0.0.1:27017 --db openstreetmap --collection C:\Users\sampa\Desktop\Other\Udacity\Lesson 4\bangalore --file C:\Users\sampa\Desktop\Other\Udacity\Lesson 4\bangalore.osm.json


1

In [25]:
bangalore = db[collection]


In [26]:

import os
print 'The original OSM file is {} MB'.format(os.path.getsize(cal_data)/1.0e6) # convert from bytes to megabytes
print 'The JSON file is {} MB'.format(os.path.getsize(cal_data + ".json")/1.0e6) # convert from bytes to megabytes

The original OSM file is 668.159791 MB
The JSON file is 788.602808 MB


In [27]:
bangalore.find().count()

0

In [23]:
len(bangalore.distinct('created.user'))


0