## Building graph using Nobel_prize JSON data

In [None]:
import json
import pandas as pd

# Relative paths of the three Nobel JSON data files used in this notebook.
file1, file2, file3 = (
    '../data/Nobel_prize.json',
    '../data/Nobel_laureate.json',
    '../data/Nobel_country.json',  # this file has a simple JSON structure
)

In [None]:
# Load the prize data. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open(file1, encoding='utf-8') as json_file:
    json_data1 = json.load(json_file)
# json_data1

In [None]:
# from pandas.io.json import json_normalize

# # this is not practical for this json data
# df = json_normalize(json_data1, 'prizes')
# df.head()

In [None]:
# Peek at the 'laureates' entry of the first element of the 'prizes' list.
json_data1['prizes'][0]['laureates']

In [None]:
# Load the laureate data. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open(file2, encoding='utf-8') as json_file:
    json_data2 = json.load(json_file)

In [None]:
# Inspect a single laureate record (index 100) to see how the data is structured.
json_data2['laureates'][100]

In [None]:
# with open(file3) as json_file:
#     json_data3 = json.load(json_file)

# json_data3['countries']

In [None]:
import networkx as nx
from pprint import pprint
from graphgen import create_graph

In [None]:
# Mapper schemas consumed by the create_graph_*_from_json functions.
#
# Every node/endpoint description has:
#   'type': label used as the prefix of the generated node id ('<type>_<key value>')
#   'path': absolute path of the JSON objects to extract; paths start at '/'
#           and list levels do not add a path segment
#   'key' : fields identifying an object (only the first entry is used for now)
# Attribute entries map a raw JSON field ('raw') to the attribute name stored
# on the graph ('name').
nodes_mapper = {
    'nodes': [
        {
            'type': 'Affiliations',
            'path': '/prizes/affiliations',
            'key' : [
                {'name': 'name', 'raw': 'name'}
            ],
            'attributes': [
                {'name': 'name',    'raw': 'name'},
                {'name': 'city',    'raw': 'city'},
                {'name': 'country', 'raw': 'country'},
            ]
        },
        {
            'type': 'Prize',
            'path': '/prizes',
            'key' : [
                {'name': 'category', 'raw': 'category'}
            ],
            'attributes': [
                {'name': 'category', 'raw': 'category'},
            ]
        }
    ]
}
edges_mapper = {
    'edges': [
        {
            'type': 'Awarded',
            'from': {
                'type': 'Affiliations',
                'path': '/prizes/affiliations',
                'key' : [
                    {'name': 'name', 'raw': 'name'}
                ]
            },
            'to'  : {
                'type': 'Prize',
                'path': '/prizes',
                'key' : [
                    {'name': 'category', 'raw': 'category'}
                ]
            },
            'attributes': [
                # Edge attributes carry their own absolute path. It must start
                # with '/' like every other path here, otherwise it can never
                # equal a path built by the JSON traversal.
                {'name': 'year',    'raw': 'year', 'path': '/prizes/year'},
            ]
        }
    ]
}

In [None]:
def extract_node_attrs_from_json(jdata, type_path, attr_dict):
    '''
    Walk parsed JSON and build one record per dict located at type_path.

    Paths are built while descending: the root is '/', every dict key appends
    'key/', and lists do not add a segment (their items share the list's path).

    params:
        jdata: parsed JSON value (dict, list or scalar) to walk.
        type_path: path of the dicts to extract; a trailing '/' is added if missing.
        attr_dict: maps the full path of a scalar field to the name under which
            its value is stored in the record.

    return:
        list of dicts, one per matching dict, in traversal order. Only scalar
        fields that are direct children of a matching dict are collected.
    '''
    target = type_path if type_path[-1] == '/' else type_path + '/'
    found = []

    def walk(node, path, sink):
        kind = type(node)

        if kind is list:
            # list items live at the list's own path and never inherit a sink
            for item in node:
                walk(item, path, None)
            return

        if kind is not dict:
            # scalar leaf: keep it only when directly inside a matched dict
            if sink is not None and path in attr_dict:
                sink[attr_dict[path]] = node
            return

        if path != target:
            for field in node:
                walk(node[field], path + field + '/', None)
            return

        record = {}
        for field in node:
            walk(node[field], path + field + '/', record)
        found.append(record)

    walk(jdata, '/', None)
    return found
                
    

In [None]:
def extract_edge_attrs_from_json(jdata, src_type_path, dst_type_path, attr_dict):
    '''
    Walk parsed JSON and build one record per dict located at either endpoint path.

    Paths are built while descending: the root is '/', every dict key appends
    'key/', and lists do not add a segment (their items share the list's path).

    params:
        jdata: parsed JSON value (dict, list or scalar) to walk.
        src_type_path: path of the source objects; a trailing '/' is added if missing.
        dst_type_path: path of the destination objects; same normalisation.
        attr_dict: maps the full path of a scalar field to the name under which
            its value is stored in the record.

    return:
        flat list of dicts, one per dict found at src_type_path or dst_type_path.
        Only scalar fields that are direct children of a matching dict are
        collected. A record is appended after its children were visited, so an
        endpoint nested inside the other one appears before its parent. Records
        are not tagged with the endpoint they came from.
    '''
    # endswith() also copes with an empty path (treated as the root '/').
    if not src_type_path.endswith('/'):
        src_type_path += '/'
    if not dst_type_path.endswith('/'):
        dst_type_path += '/'
    endpoint_paths = (src_type_path, dst_type_path)

    out = []

    def extract_data(node, cur_path='/', cur_obj=None):
        if isinstance(node, dict):
            if cur_path in endpoint_paths:
                obj = {}
                for field, value in node.items():
                    extract_data(value, cur_path + field + '/', obj)
                out.append(obj)
            else:
                for field, value in node.items():
                    extract_data(value, cur_path + field + '/')
        elif isinstance(node, list):
            for item in node:
                extract_data(item, cur_path)
        elif cur_obj is not None and cur_path in attr_dict:
            # scalar leaf directly inside a matched object
            cur_obj[attr_dict[cur_path]] = node

    extract_data(jdata)
    return out
                

In [None]:
def create_graph_nodes_from_json(graph, graph_mapper, data_provider, update = True):
    '''
    Add one node to the graph for every JSON object found at each node type's path.

    params:
        graph: fully constructed graph object to add new nodes to. It must
            provide add_node(node_id, **attrs) and has_node(node_id).
        graph_mapper: dictionary describing the types of object to extract. Its
            optional 'nodes' list holds entries with 'type', 'path', 'key' and
            'attributes' (each key/attribute has a 'name' and a 'raw' field name).
        data_provider: iterable of parsed JSON records; each record is searched
            separately, starting from the root path '/'.
        update: when False, objects whose node id is already in the graph are
            skipped; when True (default) add_node is called for every object.

    return:
        the same graph object, with the nodes of every node type added. Node ids
        have the form '<type>_<key value>'; when the key field is missing the
        id is '<type>_UNKNOWN_<n>', n being the number of records that already
        produced nodes of that type.
    '''

    assert graph is not None, "Graph object wasn't constructed correctly"
    # TBD... assert (isinstance(data_provider, pd.DataFrame)),"The data provider should be a pandas DataFrame"

    for node_type in graph_mapper.get('nodes', []):
        # TBD... assert check_attributes(node_type, raw_data, node_type['attributes'])

        # TBD: Need to support multiple keys. For now we'll only have a single key for each record
        key_raw_name = node_type['key'][0]['raw']

        # attribute name stored on the node -> raw JSON field name
        attr_dict = {a['name']: a['raw'] for a in node_type['attributes']}

        node_type_name = node_type['type']
        node_type_path = node_type['path']
        if not node_type_path.endswith('/'):
            node_type_path += '/'

        # Map the full path of each needed field to its raw name, so extracted
        # records are keyed by raw name (which is how they are read below).
        # The key field is always included, even if it is not an attribute.
        lookup_attr_dict = {node_type_path + key_raw_name + '/': key_raw_name}
        for raw_name in attr_dict.values():
            lookup_attr_dict[node_type_path + raw_name + '/'] = raw_name

        count = 0
        for j in data_provider:
            jelem = extract_node_attrs_from_json(j, node_type_path, lookup_attr_dict)
            if not jelem:
                continue

            for e in jelem:
                key_value = e[key_raw_name] if key_raw_name in e else 'UNKNOWN_' + str(count)
                node_id = '{}_{}'.format(node_type_name, key_value)
                if not update and graph.has_node(node_id):
                    continue

                # fresh dict per node so nodes never share attribute state
                attr = {'_type_': node_type_name}
                for k, v in attr_dict.items():
                    attr[k] = e[v] if v in e else ''
                graph.add_node(node_id, **attr)
            count += 1

        # number of records that contained at least one object of this type
        print(count)

    # return only after every node type has been processed
    return graph

In [None]:
def _edge_json_path(path):
    '''Return path normalised to start and end with exactly one '/'.'''
    trimmed = path.strip('/')
    return '/' + trimmed + '/' if trimmed else '/'


def _iter_edge_endpoints(jdata, src_path, dst_path, cur_path='/', src_obj=None, dst_obj=None):
    '''
    Yield (source dict, destination dict) pairs found while walking jdata.

    A pair is produced whenever a dict at src_path or dst_path is reached and
    the other endpoint is that same dict or one of its enclosing dicts.
    '''
    if isinstance(jdata, dict):
        matched = False
        if cur_path == src_path:
            src_obj, matched = jdata, True
        if cur_path == dst_path:
            dst_obj, matched = jdata, True
        if matched and src_obj is not None and dst_obj is not None:
            yield src_obj, dst_obj
        for field, value in jdata.items():
            for pair in _iter_edge_endpoints(value, src_path, dst_path,
                                             cur_path + field + '/', src_obj, dst_obj):
                yield pair
    elif isinstance(jdata, list):
        # lists do not add a path segment; their items share the list's path
        for item in jdata:
            for pair in _iter_edge_endpoints(item, src_path, dst_path,
                                             cur_path, src_obj, dst_obj):
                yield pair


def create_graph_edges_from_json(graph, graph_mapper, data_provider, update = True):
    '''
    Add one edge to the graph for every source/destination object pair found
    in the JSON records, for each edge type of the mapper.

    Two objects are paired when one endpoint's object is nested inside the
    other's (or both endpoints are the same object). Endpoints located in
    unrelated branches of the JSON tree are not paired.

    params:
        graph: fully constructed graph object to add new edges to. It must
            provide add_edge(u, v, **attrs), and has_edge(u, v) when update is False.
        graph_mapper: dictionary describing the types of object to extract. Its
            optional 'edges' list holds entries with 'type', 'from' and 'to'
            (each with 'type', 'path', 'key') and optional 'attributes' (each
            with 'name' and an absolute 'path' to a field of one endpoint).
        data_provider: iterable of parsed JSON records; each record is searched
            separately, starting from the root path '/'.
        update: when False, no edge is added between two nodes that are already
            connected; when True (default) add_edge is called for every pair.

    return:
        the same graph object, with the edges of every edge type added. Node ids
        have the form '<type>_<key value>'; when a key field is missing the id
        is '<type>_UNKNOWN_<n>', n being the number of records that already
        produced edges of that type.
    '''

    assert graph is not None, "Graph object wasn't constructed correctly"
    # TBD... assert (isinstance(data_provider, pd.DataFrame)),"The data provider should be a pandas DataFrame"

    for edge_type in graph_mapper.get('edges', []):
        # TBD... assert check_attributes(edge_type, raw_data, edge_type['attributes'])

        edge_type_name = edge_type['type']

        # TBD: Need to support multiple keys. For now only the first key of each endpoint is used
        src_type_name = edge_type['from']['type']
        src_type_path = _edge_json_path(edge_type['from']['path'])
        src_key_raw_name = edge_type['from']['key'][0]['raw']

        dst_type_name = edge_type['to']['type']
        dst_type_path = _edge_json_path(edge_type['to']['path'])
        dst_key_raw_name = edge_type['to']['key'][0]['raw']

        # Split every attribute path into (owning object path, field name) so
        # the value can be read straight from the matching endpoint object.
        attr_specs = []
        for a in edge_type.get('attributes', []):
            parent, _, leaf = _edge_json_path(a['path']).rstrip('/').rpartition('/')
            attr_specs.append((a['name'], parent + '/', leaf))

        count = 0
        for j in data_provider:
            pairs = list(_iter_edge_endpoints(j, src_type_path, dst_type_path))
            if not pairs:
                continue

            for src_obj, dst_obj in pairs:
                src_key_value = src_obj.get(src_key_raw_name, 'UNKNOWN_' + str(count))
                dst_key_value = dst_obj.get(dst_key_raw_name, 'UNKNOWN_' + str(count))
                from_id = '{}_{}'.format(src_type_name, src_key_value)
                to_id = '{}_{}'.format(dst_type_name, dst_key_value)
                if not update and graph.has_edge(from_id, to_id):
                    continue

                # fresh dict per edge so edges never share attribute state
                attr = {'_type_': edge_type_name}
                for name, parent, leaf in attr_specs:
                    if parent == dst_type_path:
                        owner = dst_obj
                    elif parent == src_type_path:
                        owner = src_obj
                    else:
                        owner = {}  # path belongs to neither endpoint
                    attr[name] = owner.get(leaf, '')
                graph.add_edge(from_id, to_id, **attr)
            count += 1

        # number of records that contained at least one edge of this type
        print(count)

    # return only after every edge type has been processed
    return graph

In [None]:
# Start from an empty directed multigraph, then add the node types described
# by nodes_mapper using the laureate records as input.
g = nx.MultiDiGraph()
g = create_graph_nodes_from_json(
    g,
    data_provider=json_data2['laureates'],
    graph_mapper=nodes_mapper,
)

# Disabled call kept for reference; trips_df is not defined in the cells shown here.
# g = create_graph(g, graph_mapper = edges_mapper,
#                  data_provider = trips_df)

In [None]:
# Show the class of the graph object returned by the node builder.
type(g)

In [None]:
# Number of nodes currently in the graph.
nx.number_of_nodes(g)

In [None]:
# Pretty-print a single laureate record (index 216) for manual inspection.
pprint(json_data2['laureates'][216])

In [None]:
# Number of edges currently in the graph.
nx.number_of_edges(g)

In [None]:
# Show a small sample of nodes with their attributes. Node ids in this graph
# have the form '<type>_<key>' with the types declared in nodes_mapper, so a
# hard-coded 'Station_2' id can never be present. g.nodes(data=True) is used
# because the g.node attribute no longer exists in recent networkx releases.
print(list(g.nodes(data=True))[:5])

In [None]:
# Show a small sample of edges with their attributes. Node ids in this graph
# have the form '<type>_<key>' with the types declared in the mappers, so a
# lookup between 'Station_2' and 'Station_16' could only ever print None.
pprint(list(g.edges(data=True))[:5])