In [1]:
import re
import json
import pprint
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
#HELPER FUNCTIONS
def get_tag_data(json_dataset):
    """
    Flatten the tags of parsed JSON elements into a long-format DataFrame.

    Each element of ``json_dataset`` must provide ``'id'`` and ``'type'`` keys
    and may provide a ``'tags'`` mapping of tag name -> tag value.

    Rows produced per element:
      * has a 'tags' key   -> one row per tag: [id, type, tag, value]
      * has no 'tags' key  -> a single row:    [id, type, None, None]
    An element whose 'tags' mapping is present but empty produces no rows.

    Returns a pandas DataFrame with columns
    ['node_id', 'node_type', 'tag', 'val'].
    """
    columns = ['node_id', 'node_type', 'tag', 'val']
    rows = []
    for element in json_dataset:
        # Untagged element: keep it as a placeholder row so it stays countable.
        if 'tags' not in element:
            rows.append([element['id'], element['type'], None, None])
            continue

        # Tagged element: emit one row for every tag it carries.
        tags = element['tags']
        rows.extend(
            [element['id'], element['type'], key, tags[key]]
            for key in tags
        )
    return pd.DataFrame(rows, columns=columns)


In [21]:
#CONST DEFINITIONS
# Forward slashes are accepted by open() on Windows as well as POSIX; a
# backslash here would form the invalid string escape '\s' and tie the
# notebook to Windows.
DATA_SOURCE = '../sample-100-redwood-city-ca.osm.json'

#LOADING DATA
# The file is read as JSON Lines: one JSON document per line.
# 'with' guarantees the file handle is closed once loading finishes.
with open(DATA_SOURCE, 'r') as data_file:
    # Whitespace-only lines (e.g. a trailing newline) are skipped because
    # json.loads raises on them.
    parsed_json_data = [json.loads(line) for line in data_file if line.strip()]

data_df = get_tag_data(parsed_json_data)
# Keep only rows that carry a tag; get_tag_data emits tag=None for elements
# that have no 'tags' key.
tag_data_df = data_df[data_df['tag'].notnull()]
tag_data_df.head()

Unnamed: 0,node_id,node_type,tag,val
6,31866760,node,waterway,slipway
16,65384740,node,highway,turning_circle
21,65392413,node,highway,turning_circle
27,65400452,node,highway,turning_circle
50,65429242,node,highway,turning_circle


In [37]:
# Investigate tags: count how many rows use each tag name.
# The displayed Series is ordered from the most to the least frequent tag.
tag_data_df['tag'].value_counts()

building                   958
source                     458
highway                    203
redwood_city_ca:bld_gid    185
redwood_city_ca:addr_id    178
name                       112
paloalto_ca:id              73
tiger:county                70
tiger:cfcc                  59
tiger:name_base             59
tiger:name_type             53
service                     53
oneway                      41
tiger:reviewed              25
maxspeed                    21
amenity                     17
lanes                       17
redwood_city_ca:bldg_id     16
shop                        15
phone                       14
turn:lanes                  13
surface                     12
tiger:name_base_1           11
source:maxspeed              9
hgv                          8
ref                          8
emergency                    7
leisure                      6
gnis:feature_id              6
website                      6
                          ... 
male                         1
source_i

In [38]:
#Investigate tags of interest
# For each selected tag, print a header line followed by the frequency of
# every value recorded for that tag.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
for tag in ['building', 'source', 'service', 'amenity', 'shop', 'emergency']:
    print('[%s]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' % tag)
    # Select the 'val' column of the rows for this tag in one .loc lookup.
    print(tag_data_df.loc[tag_data_df['tag'] == tag, 'val'].value_counts())

[building]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
yes            915
house           24
residential     17
retail           1
garage           1
Name: val, dtype: int64
[source]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
City of Redwood City, CA 1013                                                     236
bing                                                                               84
City of Palo Alto CA 0713                                                          72
Bing                                                                               25
photograph                                                                         17
Redwood City 0813                                                                   7
Yahoo                                                                               3
yahoo                                                                               3
bing;photograph                                                                     2
mapquest op