In [1]:
import codecs
import json
import pprint
import re
import xml.etree.cElementTree as ET
from collections import defaultdict

import numpy as np
import pandas as pd
from pymongo import MongoClient

In [2]:
# Path to the Manhattan OSM XML extract that the audit, shaping and key-check
# cells below all parse.
# NOTE(review): absolute, user-specific path — the notebook cannot run on another
# machine without editing this line.
man='/Users/garymu/Dropbox/Udacity/DAND/project3/manhattan_new-york.osm'

In [189]:
#Audit Data
# Patterns used to classify tag keys:
#   lower        - only lowercase letters and underscores
#   lower_colon  - the same, with exactly one colon separating two parts
#   problemchars - contains at least one character from the listed set
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    Args:
        element: a parsed XML element; anything whose tag is not ``"tag"``
            is ignored.
        keys: dict with the counters ``'lower'``, ``'lower_colon'``,
            ``'problemchars'`` and ``'other'``; it is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.

    Raises:
        KeyError: if a ``tag`` element has no ``k`` attribute.
    """
    if element.tag != "tag":
        return keys

    tag_key = element.attrib['k']

    # The first pattern that matches decides the category, so order matters.
    categories = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    bucket = 'other'
    for label, pattern in categories:
        if pattern.search(tag_key):
            bucket = label
            break

    keys[bucket] += 1
    return keys


def get_elem_type(filename):
    """Tally tag-key categories over every element parsed from ``filename``.

    Each element produced by ``ET.iterparse`` is handed to ``key_type``.

    Returns:
        dict with the counters ``"lower"``, ``"lower_colon"``,
        ``"problemchars"`` and ``"other"``.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

#Audit street type
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    Args:
        street_types: mapping from street type to a set of street names; it
            is updated in place (indexing a missing type must create a set).
        street_name: the street name to inspect.

    Returns:
        None.
    """
    # NOTE(review): ``street_type_re`` and ``expected`` are not defined in any
    # visible cell of this notebook — confirm where they come from, otherwise
    # this raises NameError on a fresh kernel.
    match = street_type_re.search(street_name)
    if not match:
        return

    suffix = match.group()
    if suffix not in expected:
        street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when the element's ``k`` attribute is exactly ``addr:street``.

    Raises:
        KeyError: if the element has no ``k`` attribute.
    """
    tag_key = elem.attrib['k']
    return tag_key == "addr:street"


def audit(osmfile):
    """Collect street names whose street type is not in the expected list.

    Every ``node`` and ``way`` element of the OSM file is scanned for
    ``addr:street`` tags, and each street name found is passed to
    ``audit_street_type``.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        defaultdict(set) filled in by ``audit_street_type``: street type ->
        set of street names that produced it.
    """
    street_types = defaultdict(set)
    # ``with`` guarantees the file is closed; binary mode lets the XML parser
    # apply the encoding declared in the document itself.
    with open(osmfile, "rb") as osm_file:
        # "end" events are required here: on a "start" event the element's
        # children are not guaranteed to have been parsed yet, so some
        # ``tag`` children could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # Free the element only after all of its tags were examined,
                # never while its children are still being iterated.
                elem.clear()

    return street_types

In [190]:
# Run the street-type audit on the Manhattan extract; the cell output maps each
# unexpected street type to the street names it was found in.
audit(man)

defaultdict(set,
            {'10024': {'West 80th Street NYC 10024'},
             '1801': {'505th 8th Avenue Suite 1801'},
             '27th': {'W 27th'},
             '29th': {'29th'},
             '2N': {'400th West 20th St., Suite 2N'},
             '3': {'Hanover Square #3'},
             '300': {'Ste 300'},
             '306': {'West 30th Street Suite 306'},
             '41st': {'41st'},
             '42nd': {'West 42nd'},
             '4B': {'Union Avenue 4B'},
             '500': {'Main St., Suite 500'},
             '633': {'633'},
             '861': {'861'},
             'A': {'Avenue A'},
             'Alley': {'Broadway Alley',
              'Cortlandt Alley',
              'Exchange Alley',
              'Freeman Alley',
              'Harrison Alley',
              'Mac Dougal Alley',
              'Theater Alley'},
             'Americas': {'Avenue Of The Americas',
              'Avenue of Americas',
              'Avenue of the Americas'},
             'Atrium': {'

In [13]:
#Shape Manhattan Data and Insert Into JSON File
# Element attributes that are grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    """Convert a ``node`` or ``way`` element into a JSON-ready dictionary.

    The resulting document contains:
      * ``nodetype``  - ``"node"`` or ``"way"``
      * ``created``   - the element attributes listed in ``CREATED``
      * ``id`` / ``visible`` - copied from the element when present
      * ``pos``       - ``[lat, lon]`` as floats, when both attributes exist
      * ``node_refs`` - ``ref`` values of the ``nd`` children, when any
      * ``address``   - values of ``addr:<field>`` tags keyed by ``<field>``,
                        when any
      * every other tag with at most one colon in its key, stored as a
        top-level ``key: value`` pair

    Tags whose key matches ``problemchars`` or contains more than one colon
    are skipped.

    Args:
        element: a parsed XML element.

    Returns:
        The dictionary described above, or None for any other element type.

    Raises:
        KeyError: if a tag has a ``k`` attribute but no ``v`` attribute.
        ValueError: if ``lat`` or ``lon`` is not a valid float.
    """
    if element.tag not in ("node", "way"):
        return None

    attrs = element.attrib
    node = {'nodetype': element.tag, 'created': {}}

    for name in CREATED:
        if name in attrs:
            node['created'][name] = attrs[name]
    for name in ('id', 'visible'):
        if name in attrs:
            node[name] = attrs[name]

    # Build the position once, and only when both coordinates are present,
    # instead of once per coordinate attribute.
    if 'lat' in attrs and 'lon' in attrs:
        node['pos'] = [float(attrs['lat']), float(attrs['lon'])]

    node_refs = [nd.attrib['ref'] for nd in element.iter('nd')
                 if 'ref' in nd.attrib]
    if node_refs:
        node['node_refs'] = node_refs

    # NOTE(review): plain tags are stored at the top level, so a tag whose key
    # equals a structural field name (e.g. "id") overwrites that field.
    address = {}
    for tag in element.iter('tag'):
        if 'k' not in tag.attrib:
            continue
        k = tag.attrib['k']
        v = tag.attrib['v']
        if problemchars.search(k):
            continue
        if k.count(":") > 1:
            continue
        # Prefix test: only keys that really start with "addr:" are address
        # fields (a substring test would also accept e.g. "old_addr:street").
        if k.startswith("addr:"):
            address[k.split(":", 1)[1]] = v
        else:
            node[k] = v

    # The address fields are collected separately so that a plain "address"
    # tag cannot replace the dictionary while it is being filled.
    if address:
        node['address'] = address

    return node


def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and write the results as JSON lines.

    The output file is named ``"<file_in>.json"``. Each document returned by
    ``shape_element`` is written followed by a newline.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when true, each document is serialized with ``indent=2``.

    Returns:
        list of all shaped documents, in parse order.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(file_out, "w") as fo:
        for _event, element in ET.iterparse(file_in):
            doc = shape_element(element)
            if not doc:
                # Elements that shape_element does not convert are skipped.
                continue
            shaped.append(doc)
            fo.write(json.dumps(doc, indent=indent) + "\n")
    return shaped

In [14]:
#Create MongoDB Connections
# Client for the MongoDB server at localhost:27017; a later cell selects the
# `examples` database from it.
client = MongoClient("mongodb://localhost:27017")

In [166]:
# Select the database first: `db` has to exist before the collection can be
# dropped, otherwise this cell fails with NameError on a fresh kernel.
db = client.examples
# Start from an empty collection so that re-running the cell does not insert
# the same documents twice.
db.drop_collection('maps')
data = process_map(man, False)
db.maps.insert_many(data)

<pymongo.results.InsertManyResult at 0x21569a5f0>

In [191]:
#Number of documents
# Counts every document in the `maps` collection (no filter is applied).
# NOTE(review): `Cursor.count()` may not be available in newer PyMongo
# releases — confirm against the installed version.
db.maps.find().count()

1013930

In [192]:
#Number of unique users
# Number of distinct values of the `created.user` field across the collection.
len(db.maps.distinct("created.user"))

1258

In [215]:
#Cities in Manhattan Database
# Group the documents by `address.city` and count each distinct value
# (documents without that field end up in the `None` group).
result = list(db.maps.aggregate([
    {"$group": {"_id": "$address.city", "count": {"$sum": 1}}},
]))

pprint.pprint(result)

[{u'_id': u'New York NY', u'count': 1},
 {u'_id': u'BRONX, NY', u'count': 2},
 {u'_id': u'brooklyn', u'count': 1},
 {u'_id': u'Ridgefield Park', u'count': 2},
 {u'_id': u'fort lee', u'count': 1},
 {u'_id': u'Roosevelt Island', u'count': 1},
 {u'_id': u'Cliffside Park', u'count': 1},
 {u'_id': u'New york', u'count': 2},
 {u'_id': u'Palisades Park', u'count': 1},
 {u'_id': u'new york', u'count': 2},
 {u'_id': u'Ridgewood', u'count': 2},
 {u'_id': u'Ridgefield', u'count': 2},
 {u'_id': u'Union City', u'count': 1},
 {u'_id': u'Blissville', u'count': 1},
 {u'_id': u'New York city', u'count': 1},
 {u'_id': u'Brooklyn, NY', u'count': 4},
 {u'_id': u'new York', u'count': 2},
 {u'_id': u'NEW YORK CITY', u'count': 3},
 {u'_id': u'West New York', u'count': 2},
 {u'_id': u'Woodside', u'count': 5},
 {u'_id': u'North Bergen', u'count': 1},
 {u'_id': u'Manhattan NYC', u'count': 1},
 {u'_id': u'Sunnyside', u'count': 11},
 {u'_id': u'Queens, NY', u'count': 1},
 {u'_id': None, u'count': 1008771},
 {u'_i

In [193]:
#Number of nodes and ways
# Group the documents by `nodetype` and count each group.
result = list(db.maps.aggregate([
    {"$group": {"_id": "$nodetype", "count": {"$sum": 1}}},
]))

pprint.pprint(result)

[{u'_id': u'way', u'count': 139095}, {u'_id': u'node', u'count': 874835}]


In [194]:
#Top 10 amenity types
# Only documents that have an `amenity` field are counted.
result = list(db.maps.aggregate([
    {'$match': {'amenity': {'$exists': True}}},
    {"$group": {"_id": "$amenity", 'count': {"$sum": 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 10},
]))

pprint.pprint(result)

[{u'_id': u'bicycle_parking', u'count': 3254},
 {u'_id': u'restaurant', u'count': 1171},
 {u'_id': u'school', u'count': 580},
 {u'_id': u'parking', u'count': 580},
 {u'_id': u'place_of_worship', u'count': 562},
 {u'_id': u'cafe', u'count': 483},
 {u'_id': u'bicycle_rental', u'count': 354},
 {u'_id': u'fast_food', u'count': 338},
 {u'_id': u'bank', u'count': 275},
 {u'_id': u'bench', u'count': 249}]


In [185]:
def make_pipeline():
    """Build the aggregation pipeline for the ten most frequent cafe names.

    Stages: keep documents whose ``amenity`` is ``'cafe'``, group them by
    ``name`` with a count, sort by count descending, keep the first 10.

    Returns:
        A new list of pipeline stage dictionaries.
    """
    return [
        # A single equality condition: an equality match already requires the
        # field to exist, and repeating the 'amenity' key in one dict literal
        # would silently keep only the last value anyway.
        {'$match': {'amenity': 'cafe'}},
        {"$group": {"_id": "$name", 'count': {"$sum": 1}}},
        {'$sort': {'count': -1}},
        {'$limit': 10},
    ]
def aggregate(db, pipeline):
    """Run ``pipeline`` against ``db.maps`` and return all results as a list."""
    cursor = db.maps.aggregate(pipeline)
    return list(cursor)

In [206]:
#Top 5 contributing users
# Group by `created.user`, sort by document count descending, keep the first 5.
result = list(db.maps.aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {'$limit': 5},
]))

pprint.pprint(result)

[{u'_id': u'Rub21_nycbuildings', u'count': 683689},
 {u'_id': u'lxbarth_nycbuildings', u'count': 76585},
 {u'_id': u'robgeb', u'count': 58256},
 {u'_id': u'Korzun', u'count': 23450},
 {u'_id': u'woodpeck_fixbot', u'count': 10579}]


In [212]:
#Most popular cuisines
# Restaurants grouped by `cuisine`; restaurants without a cuisine tag are
# counted under `None`.
# The match uses a single equality condition: repeating the 'amenity' key in
# one dict literal keeps only the last value, so a separate `$exists` entry
# under the same key would have no effect.
result = [doc for doc in db.maps.aggregate([
            {"$match":{"amenity":"restaurant"}},
            {"$group":{"_id":"$cuisine","count":{"$sum":1}}},
            {"$sort":{"count":-1}},
            {"$limit":10}]
   )]
pprint.pprint(result)

[{u'_id': None, u'count': 410},
 {u'_id': u'italian', u'count': 88},
 {u'_id': u'american', u'count': 63},
 {u'_id': u'pizza', u'count': 63},
 {u'_id': u'mexican', u'count': 60},
 {u'_id': u'chinese', u'count': 43},
 {u'_id': u'french', u'count': 32},
 {u'_id': u'japanese', u'count': 30},
 {u'_id': u'burger', u'count': 29},
 {u'_id': u'thai', u'count': 26}]


In [214]:
#Top fast food restaurants in Manhattan
# Fast-food documents grouped by name (wrapped as {'place': name}), ten most
# frequent first.
result = list(db.maps.aggregate([
    {'$match': {'amenity': "fast_food"}},
    {"$group": {"_id": {'place': "$name"}, 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 10},
]))
pprint.pprint(result)

[{u'_id': {u'place': u"McDonald's"}, u'count': 47},
 {u'_id': {u'place': u'Subway'}, u'count': 32},
 {u'_id': {u'place': u"Dunkin' Donuts"}, u'count': 18},
 {u'_id': {u'place': None}, u'count': 14},
 {u'_id': {u'place': u'Chipotle'}, u'count': 12},
 {u'_id': {u'place': u'Chipotle Mexican Grill'}, u'count': 9},
 {u'_id': {u'place': u'Burger King'}, u'count': 7},
 {u'_id': {u'place': u"Wendy's"}, u'count': 4},
 {u'_id': {u'place': u'Shake Shack'}, u'count': 4},
 {u'_id': {u'place': u'Baskin-Robbins'}, u'count': 3}]


In [203]:
# Ten most frequent `amenity` values among documents that have the field.
result = list(db.maps.aggregate([
    {"$match": {"amenity": {"$exists": 1}}},
    {"$group": {"_id": "$amenity", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 10},
]))
pprint.pprint(result)

[{u'_id': u'bicycle_parking', u'count': 3254},
 {u'_id': u'restaurant', u'count': 1171},
 {u'_id': u'school', u'count': 580},
 {u'_id': u'parking', u'count': 580},
 {u'_id': u'place_of_worship', u'count': 562},
 {u'_id': u'cafe', u'count': 483},
 {u'_id': u'bicycle_rental', u'count': 354},
 {u'_id': u'fast_food', u'count': 338},
 {u'_id': u'bank', u'count': 275},
 {u'_id': u'bench', u'count': 249}]


In [199]:
#number of cafes in Manhattan
# Counts the documents whose `amenity` equals 'cafe'.
# NOTE(review): `Cursor.count()` may not be available in newer PyMongo
# releases — confirm against the installed version.
db.maps.find({'amenity':'cafe'}).count()

483

In [210]:
#Top cafes in Manhattan
# Cafes grouped by name, ten most frequent first; cafes without a name are
# counted under `None`.
# The match uses a single equality condition: repeating the 'amenity' key in
# one dict literal keeps only the last value, so a separate `$exists` entry
# under the same key would have no effect.
result = [doc for doc in db.maps.aggregate( [
    {'$match': {'amenity': 'cafe'}},
    {"$group":{"_id":"$name", 'count':{"$sum":1}}},
    {'$sort':{'count':-1}},
    {'$limit':10}
])]
pprint.pprint(result)

[{u'_id': u'Starbucks', u'count': 96},
 {u'_id': u'Starbucks Coffee', u'count': 23},
 {u'_id': u"Dunkin' Donuts", u'count': 22},
 {u'_id': u'Le Pain Quotidien', u'count': 13},
 {u'_id': None, u'count': 7},
 {u'_id': u'Pinkberry', u'count': 5},
 {u'_id': u'Think Coffee', u'count': 4},
 {u'_id': u'The Coffee Bean & Tea Leaf', u'count': 4},
 {u'_id': u'16 Handles', u'count': 3},
 {u'_id': u'Piccolo Cafe', u'count': 3}]


In [139]:
#Check the tags with problematic characters
# Prints the key of every `tag` element whose key matches `problemchars`
# (one line per occurrence, so repeated keys are printed repeatedly).
for _, element in ET.iterparse(man):
    if element.tag == "tag":
        key = element.attrib['k']
        if problemchars.search(key):
            # Parenthesized form prints the same text on Python 2 and is also
            # valid syntax on Python 3.
            print(key)

cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.installed
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.installed
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.installed
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.large
cityracks.rackid
cityracks.small
cityracks.street
cityracks.housenum
cityracks.installed
cityracks.large
cityracks.rackid
cityracks.small
ci