In [None]:
#To generate a sample of the orignal osm data

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pprint import pprint
import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "san-francisco-bay_california.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample_san-francisco-bay_california.osm"

k = 500

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write(bytes('<?xml version="1.0" encoding="UTF-8"?>\n', 'UTF-8'))
    output.write(bytes('<osm>\n  ', 'UTF-8'))

    # Write every 10th top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(bytes('</osm>', 'UTF-8'))

In [3]:
#Unique Tags

import xml.etree.cElementTree as ET
from collections import defaultdict

def count_tags(filename):
    
    countTags = defaultdict(int)
    
    for event, elem in ET.iterparse(filename):
        countTags[elem.tag] += 1
        
    return countTags

tags = count_tags(SAMPLE_FILE)

tags

defaultdict(int,
            {'member': 222,
             'nd': 32900,
             'node': 30456,
             'osm': 1,
             'relation': 27,
             'tag': 11542,
             'way': 3509})

In [188]:
#Categorization of Data 
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
two_colon = re.compile(r'^([a-z]|_|[A-Z]|[0-9])*:([a-z]|_|[A-Z]|[0-9])*:([a-z]|_|[A-Z]|[0-9])*$')


def key_type(element, keys):
    
    if element.tag == "tag":
        for ele in element.iter():
            eleTemp = ele.attrib['k']
            
            
            if lower.match(eleTemp) != None:
                keys['lower'] += 1
            elif lower_colon.match(eleTemp) != None:
                keys['lower_colon'] += 1
            elif two_colon.match(eleTemp) != None:
                keys['two_colon'] += 1  
            elif problemchars.findall(eleTemp) != []:
                keys['problemchars'] += 1
            else:
                keys['other'] += 1
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "two_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



keys = process_map(SAMPLE_FILE)
keys

{'lower': 6873,
 'lower_colon': 4371,
 'other': 262,
 'problemchars': 0,
 'two_colon': 36}

In [5]:
def get_user(element, u):
    if 'uid' in element.attrib:
        u.add(element.attrib['uid'])
    
    return u


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        users = get_user(element, users)

    return users

users = process_map(SAMPLE_FILE)
len(users)

934

In [6]:
#Auditing The Street Names

import re
from collections import defaultdict
import xml.etree.cElementTree as ET
from pprint import pprint
import copy


street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
OSMFILE = SAMPLE_FILE

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "Terrace", "Way", "Gardens"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
           "Rd": "Road",
           "Dr": "Drive",
           "Ct": "Court",
           "Blvd": "Boulevard"
            }


def audit_street_type(street_types, street_name):

    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
#         
#             street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(OSMFILE, "rb")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping):
    name = name.split(' ')
    for i in range(len(name)):
        if name[i] in mapping:
            name[i] = mapping[name[i]]
    
    name = ' '.join(name)
    
    return name



st_types = audit(OSMFILE)

for st_type, ways in st_types.items():
       for name in ways:
        better_name = update_name(name, mapping)
        print (str(name) + " => " + str(better_name)) 
    

Somme Ave => Somme Avenue
Auburn Blvd => Auburn Boulevard
Broadway => Broadway
Paseo Presada => Paseo Presada
Teresita Ct => Teresita Court
South St => South Street
East El Camino Real => East El Camino Real
El Camino Real => El Camino Real
County Road 98 => County Road 98
San Tropez Dr => San Tropez Drive
Summer Dr => Summer Drive
Hillock Dr => Hillock Drive
Ranchito Dr => Ranchito Drive
Alpine Dr => Alpine Drive
Majestic Dr => Majestic Drive
Helen Dr => Helen Drive
San Felipe Rd => San Felipe Road
Hillcrest Rd => Hillcrest Road
Fairview Rd => Fairview Road


In [26]:
#Schema for the database

schema = {
    'node': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'lat': {'required': True, 'type': 'float', 'coerce': float},
            'lon': {'required': True, 'type': 'float', 'coerce': float},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    'node_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    },
    'way': {
        'type': 'dict',
        'schema': {
            'id': {'required': True, 'type': 'integer', 'coerce': int},
            'user': {'required': True, 'type': 'string'},
            'uid': {'required': True, 'type': 'integer', 'coerce': int},
            'version': {'required': True, 'type': 'string'},
            'changeset': {'required': True, 'type': 'integer', 'coerce': int},
            'timestamp': {'required': True, 'type': 'string'}
        }
    },
    'way_nodes': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'node_id': {'required': True, 'type': 'integer', 'coerce': int},
                'position': {'required': True, 'type': 'integer', 'coerce': int}
            }
        }
    },
    'way_tags': {
        'type': 'list',
        'schema': {
            'type': 'dict',
            'schema': {
                'id': {'required': True, 'type': 'integer', 'coerce': int},
                'key': {'required': True, 'type': 'string'},
                'value': {'required': True, 'type': 'string'},
                'type': {'required': True, 'type': 'string'}
            }
        }
    }
}


In [138]:
#Fixing the Street names and Postal Codes while shaping all elements to generate csv files

import unicodecsv as csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus


OSM_PATH = SAMPLE_FILE

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
lower = re.compile(r'^([a-z]|_)*$')
one_colon = re.compile(r'^([a-z]|_|[A-Z]|[0-9])*:([a-z]|_|[A-Z]|[0-9])*$')
two_colon = re.compile(r'^([a-z]|_|[A-Z]|[0-9])*:([a-z]|_|[A-Z]|[0-9])*:([a-z]|_|[A-Z]|[0-9])*$')
one_hyphen = re.compile(r'^([a-z]|_|[A-Z]|[0-9])*-([a-z]|_|[A-Z]|[0-9])*$')


SCHEMA = schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']



expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "Terrace", "Way", "Gardens"]

mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
            "Rd": "Road",
            "Dr": "Drive",
            "Blvd": "Boulevard",
            "Ct": "Court"
            }

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)



def audit_street_type_update(street_name):

    lastWord = street_type_re.search(street_name)

        
    if lastWord:
        streetType = lastWord.group()
            
        if streetType not in expected:
            if streetType in mapping:
                street_name = re.sub(street_type_re, mapping[streetType], street_name)
        
    return street_name



def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements
    tag_dict = {}

    # YOUR CODE HERE
    if element.tag == 'node':
        for key, value in element.items():
            if key in node_attr_fields:
                node_attribs[key] = value
            
        for tag in element.iter("tag"):
            
            if PROBLEMCHARS.match(tag.attrib['k']) == None:
                tag_dict = {}
                
                tag_dict['id'] = node_attribs['id']
                
                if two_colon.match(tag.attrib['k']):
                    n = []
                    ty, pe, ky = tag.attrib['k'].split(":")
                    n.append(ty)
                    n.append(pe)
                    typ = ":".join(n)
                    
                    tag_dict['type'] = typ
                    tag_dict['key'] = ky
                    
                    tag_dict['value'] = tag.attrib['v']
                    
                elif one_colon.match(tag.attrib['k']) == None:
                    tag_dict['type'] = default_tag_type
                    tag_dict['key'] = tag.attrib['k']
                    
                    tag_dict['value'] = tag.attrib['v']   
                    
                    
                else:
                    typ, ky = tag.attrib['k'].split(":")
                    tag_dict['type'] = typ
                    tag_dict['key'] = ky
                    
                    if ky == "postcode":
                        if one_hyphen.match(tag.attrib['v']):
                            post, meh = tag.attrib['v'].split("-")
                            tag_dict['value'] =  post
                            
                        else:
                            tag_dict['value'] = tag.attrib['v']
                    
                    elif tag.attrib['k'] == "addr:street":
                        newKy = audit_street_type_update(tag.attrib['v'])
                        
                        tag_dict['value'] = newKy 
                        
                    else:
                        tag_dict['value'] = tag.attrib['v']
                    

                        
                            
                tags.append(tag_dict)
                
            
                

        
        return {'node': node_attribs, 'node_tags': tags}
        
        
    elif element.tag == 'way':
        pos = -1
        
        for key, value in element.items():
            if key in way_attr_fields:
                way_attribs[key] = value
                
        
        for nd in element.iter("nd"):
            
            tag_dict = {}
            
                
            tag_dict['id'] = way_attribs['id']
            tag_dict['node_id'] = nd.attrib['ref']
            pos += 1
            tag_dict['position'] = pos
            
            way_nodes.append(tag_dict)
            

            
        for tag in element.iter("tag"):
            
            if PROBLEMCHARS.match(tag.attrib['k']) == None:
                tag_dict = {}
                
                tag_dict['id'] = way_attribs['id']
                
                
                if two_colon.match(tag.attrib['k']):
                    n = []
                    ty, pe, ky = tag.attrib['k'].split(":")
                    n.append(ty)
                    n.append(pe)
                    typ = ":".join(n)
                    
                    tag_dict['type'] = typ
                    tag_dict['key'] = ky
                    
                    tag_dict['value'] = tag.attrib['v']
                
                
                elif one_colon.match(tag.attrib['k']) == None:
                    tag_dict['type'] = default_tag_type
                    tag_dict['key'] = tag.attrib['k']
                    
                    tag_dict['value'] = tag.attrib['v']   
                    
                  
                    
                else:
                    typ, ky = tag.attrib['k'].split(":", 1)
                    tag_dict['type'] = typ
                    tag_dict['key'] = ky
                    
                    if ky == "postcode":
                        if one_hyphen.match(tag.attrib['v']):
                            post, meh = tag.attrib['v'].split("-")
                            tag_dict['value'] =  post
                            
                        else:
                            tag_dict['value'] = tag.attrib['v']
                    
                    elif tag.attrib['k'] == "addr:street":
                        newKy = audit_street_type_update(tag.attrib['v'])
                        
                        tag_dict['value'] = newKy 
                        
                    else:
                        tag_dict['value'] = tag.attrib['v']
                    
                        
                            
                tags.append(tag_dict)
                

        
        
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'wb') as nodes_file, codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, codecs.open(WAYS_PATH, 'wb') as ways_file, codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:
        
        nodes_writer = csv.DictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = csv.DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = csv.DictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = csv.DictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = csv.DictWriter(way_tags_file, WAY_TAGS_FIELDS)

        
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])



process_map(OSM_PATH, validate=True)

In [169]:
import pandas as pd
import sqlite3

conn = sqlite3.connect("SanFrancisco.db")
c = conn.cursor()

df = pd.read_csv("nodes_tags.csv")
df.to_sql("nodes_tags", conn, if_exists='append', index=False)

df


Unnamed: 0,id,key,value,type
0,281266,highway,motorway_junction,regular
1,281266,ref,414B,regular
2,53020809,highway,turning_circle,regular
3,53025313,highway,turning_circle,regular
4,53040126,highway,traffic_signals,regular
5,53046608,highway,stop,regular
6,53047813,highway,turning_circle,regular
7,53050565,highway,traffic_signals,regular
8,53055937,highway,turning_circle,regular
9,53081879,highway,turning_circle,regular


In [170]:
#Loading the data in Database

import sqlite3
import csv

conn = sqlite3.connect("SanFrancisco.db")
c = conn.cursor()

#nodes
c.execute("CREATE TABLE IF NOT EXISTS nodes (id, lat, lon, user, uid, version, changeset, timestamp)")

with open('nodes.csv','r') as fin: 
    dr = csv.DictReader(fin)
    to_db = [(i['id'], i['lat'], i['lon'], i['user'], i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr]
    
c.executemany("INSERT INTO nodes (id, lat, lon, user, uid, version, changeset, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?);", to_db)
conn.commit()

#nodes_tags
# c.execute("CREATE TABLE IF NOT EXISTS nodes_tags (id, key, value, type)")

# with open('nodes_tags.csv','r') as fin: 
#     dr = csv.DictReader(fin)
#     to_db = [(i['id'], i['key'], i['value'], i['type']) for i in dr]
     
# c.executemany("INSERT INTO nodes_tags (id, key, value, type) VALUES (?, ?, ?, ?);", to_db)
# conn.commit()

#ways
c.execute("CREATE TABLE IF NOT EXISTS ways (id, user, uid, version, changeset, timestamp)")

with open('ways.csv','r') as fin: 
    dr = csv.DictReader(fin)
    to_db = [(i['id'], i['user'], i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr]
     
c.executemany("INSERT INTO ways (id, user, uid, version, changeset, timestamp) VALUES (?, ?, ?, ?, ?, ?);", to_db)
conn.commit()

#ways_nodes
c.execute("CREATE TABLE IF NOT EXISTS ways_nodes (id, node_id, position)")

with open('ways_nodes.csv','r') as fin: 
    dr = csv.DictReader(fin)
    to_db = [(i['id'], i['node_id'], i['position']) for i in dr]
     
c.executemany("INSERT INTO ways_nodes (id, node_id, position) VALUES (?, ?, ?);", to_db)
conn.commit()

#ways_tags
c.execute("CREATE TABLE IF NOT EXISTS ways_tags (id, key, value, type)")

with open('ways_tags.csv','r') as fin: 
    dr = csv.DictReader(fin)
    to_db = [(i['id'], i['key'], i['value'], i['type']) for i in dr]
     
c.executemany("INSERT INTO ways_tags (id, key, value, type) VALUES (?, ?, ?, ?);", to_db)
conn.commit()

In [189]:
import os


def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0

file_size_nodes = convert_bytes(os.path.getsize('nodes.csv'))
file_size_nodestags = convert_bytes(os.path.getsize('nodes_tags.csv'))
file_size_ways = convert_bytes(os.path.getsize('ways.csv'))
file_size_waysnodes = convert_bytes(os.path.getsize('ways_nodes.csv'))
file_size_waystags = convert_bytes(os.path.getsize('ways_tags.csv'))
file_size_osmfile = convert_bytes(os.path.getsize('san-francisco-bay_california.osm'))
file_size_samplefile = convert_bytes(os.path.getsize('sample_san-francisco-bay_california.osm'))
file_size_db = convert_bytes(os.path.getsize('SanFrancisco.db'))

print("OSM File: " + str(file_size_osmfile))
print("Sample File: " + str(file_size_samplefile))
print("Database Size: " + str(file_size_db))
print("Nodes.csv: " + str(file_size_nodes))
print("Nodes_tagss.csv: " + str(file_size_nodestags))
print("Ways.csv: " + str(file_size_ways))
print("Ways_tags.csv: " + str(file_size_waystags))
print("Ways_nodes.csv: " + str(file_size_waysnodes))

OSM File: 3.0 GB
Sample File: 6.2 MB
Database Size: 14.5 MB
Nodes.csv: 2.4 MB
Nodes_tagss.csv: 56.6 KB
Ways.csv: 206.9 KB
Ways_tags.csv: 330.8 KB
Ways_nodes.csv: 763.9 KB


In [171]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

#nodes
c.execute("SELECT tags.value, COUNT(*) as count FROM (SELECT * FROM nodes_tags 	  UNION ALL SELECT * FROM ways_tags) \
tags WHERE tags.key='postcode'GROUP BY tags.value ORDER BY count DESC;")

conn.commit()

rows = c.fetchall()

rows

[('95691', 30),
 ('95014', 29),
 ('95023', 13),
 ('94122', 10),
 ('95605', 9),
 ('94043', 7),
 ('94116', 6),
 ('94611', 5),
 ('94127', 4),
 ('94610', 4),
 ('94117', 3),
 ('94133', 3),
 ('95616', 3),
 ('94063', 2),
 ('94114', 2),
 ('94123', 2),
 ('94568', 2),
 ('94061', 1),
 ('94080', 1),
 ('94103', 1),
 ('94105', 1),
 ('94110', 1),
 ('94115', 1),
 ('94118', 1),
 ('94121', 1),
 ('94131', 1),
 ('94134', 1),
 ('94305', 1),
 ('94501', 1),
 ('94510', 1),
 ('94530', 1),
 ('94556', 1),
 ('94598', 1),
 ('94606', 1),
 ('94607', 1),
 ('94608', 1),
 ('94703', 1),
 ('94704', 1),
 ('94706', 1),
 ('94903', 1),
 ('95032', 1),
 ('95064', 1),
 ('95070', 1),
 ('95076', 1),
 ('95138', 1),
 ('95330', 1),
 ('95405', 1),
 ('95448', 1),
 ('95498', 1),
 ('95632', 1)]

In [172]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(*) FROM nodes")

conn.commit()

rows = c.fetchall()

rows

[(30456,)]

In [173]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(*) FROM nodes_tags")

conn.commit()

rows = c.fetchall()

rows

[(1558,)]

In [174]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(*) FROM ways")

conn.commit()

rows = c.fetchall()

rows

[(3509,)]

In [175]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(*) FROM ways_tags")

conn.commit()

rows = c.fetchall()

rows

[(9887,)]

In [176]:
conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(*) FROM ways_nodes")

conn.commit()

rows = c.fetchall()

rows

[(32900,)]

In [177]:
# Sort of Cities by Descending Order

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT tags.value, COUNT(*) as count FROM (SELECT * FROM nodes_tags UNION ALL SELECT * FROM ways_tags) tags \
WHERE tags.key LIKE '%city'GROUP BY tags.value ORDER BY count DESC LIMIT 15;")

conn.commit()

rows = c.fetchall()

rows

[('Stockton', 136),
 ('Redwood City', 48),
 ('San Francisco', 41),
 ('West Sacramento', 41),
 ('Palo Alto', 34),
 ('Berkeley', 15),
 ('Hollister', 13),
 ('Mountain View', 7),
 ('Piedmont', 6),
 ('Sunnyvale', 6),
 ('Oakland', 4),
 ('Richmond', 4),
 ('Davis', 3),
 ('Dublin', 2),
 ('Walnut Creek', 2)]

In [135]:
# Total Number of Unique Users

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT COUNT(DISTINCT(e.uid)) FROM (SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) e;")

conn.commit()

rows = c.fetchall()

rows

[(931,)]

In [186]:
#Top 10 Contributors

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT e.user, COUNT(*) as num FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e GROUP BY e.user \
ORDER BY num DESC;")

conn.commit()

rows = c.fetchall()

rows



[('andygol', 3727),
 ('nmixter', 3391),
 ('ediyes', 2098),
 ('Luis36995', 1607),
 ('Eureka gold', 1565),
 ('dannykath', 1414),
 ('RichRico', 1303),
 ('woodpeck_fixbot', 1185),
 ('Rub21', 820),
 ('mk408', 817),
 ('calfarome', 700),
 ('stevea', 674),
 ('samely', 659),
 ('karitotp', 653),
 ('oldtopos', 554),
 ('DanHomerick', 539),
 ('jraller', 441),
 ('KindredCoda', 391),
 ('oba510', 268),
 ('TheDutchMan13', 248),
 ('abel801', 219),
 ('bdiscoe', 215),
 ('dchiles', 213),
 ('nikhilprabhakar', 202),
 ('Bike Mapper', 190),
 ('Apo42', 186),
 ('KevinGillette', 181),
 ('ricomundy', 172),
 ('n76', 168),
 ('thisishap', 164),
 ('StellanL', 162),
 ('marthaleena', 162),
 ('bhavana naga', 150),
 ('Chris Lawrence', 149),
 ('doug_sfba', 146),
 ('Jothirnadh', 138),
 ('pratikyadav', 138),
 ('AndrewSnow', 135),
 ('Minh Nguyen', 134),
 ('MustangBuyer', 130),
 ('wallclimber21', 130),
 ('saikabhi', 127),
 ('beddy', 123),
 ('Aric', 115),
 ('Andreyhmk', 114),
 ('matthieun', 113),
 ('ramyaragupathy', 113),
 ('di

In [147]:
# TOP 10 appearing amenities

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT value, COUNT(*) as num FROM nodes_tags WHERE key='amenity' GROUP BY value ORDER BY num DESC LIMIT 10;")

all_rows = c.fetchall()
conn.commit()

all_rows



[('restaurant', 15),
 ('drinking_water', 7),
 ('fast_food', 6),
 ('place_of_worship', 6),
 ('cafe', 4),
 ('post_box', 4),
 ('atm', 3),
 ('fire_station', 2),
 ('fuel', 2),
 ('school', 2)]

In [148]:
#Most Appearing Cuisines

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT nodes_tags.value, COUNT(*) as num FROM nodes_tags JOIN (SELECT DISTINCT(id) FROM nodes_tags \
WHERE value='restaurant') i ON nodes_tags.id=i.id WHERE nodes_tags.key='cuisine' GROUP BY nodes_tags.value ORDER BY num DESC;")

all_rows = c.fetchall()
conn.commit()

all_rows

[('chinese', 2),
 ('mexican', 2),
 ('vietnamese', 2),
 ('brazilian', 1),
 ('japanese', 1),
 ('thai', 1)]

In [155]:
#Most Popular Religion in the Area

conn = sqlite3.connect("SanFrancisco.db")

c = conn.cursor()

c.execute("SELECT nodes_tags.value, COUNT(*) as num FROM nodes_tags JOIN (SELECT DISTINCT(id) \
          FROM nodes_tags WHERE value='place_of_worship') i ON nodes_tags.id=i.id \
          WHERE nodes_tags.key='religion' GROUP BY nodes_tags.value ORDER BY num DESC LIMIT 1;")

all_rows = c.fetchall()
conn.commit()

all_rows

[('christian', 5)]