In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import os
import json
import itertools
from collections import Counter

from slugify import slugify

import pymongo

import networkx as nx
from networkx.readwrite import write_gpickle

In [3]:
def get_slug(name, type):
    """Build a clean string ID of the form ``<slugified name>-<type>``.

    Parameters
    ----------
    name : byte string or unicode string
        Human-readable name; it is passed through ``slugify``.
    type : byte string or unicode string
        Category appended verbatim after the slug (e.g. "location").

    Returns
    -------
    The identifier ``"%s-%s" % (slugify(name), type)``.

    Notes
    -----
    Byte strings are decoded as UTF-8; text that is already decoded is used
    as is. Calling ``.decode('utf-8')`` on an already-decoded string makes
    Python 2 implicitly encode it as ASCII first, which raises
    ``UnicodeEncodeError`` for any non-ASCII name.
    """
    # Only raw bytes need decoding (``bytes`` is an alias of ``str`` on Python 2).
    if isinstance(name, bytes):
        name = name.decode('utf-8')
    if isinstance(type, bytes):
        type = type.decode('utf-8')

    return "%s-%s" % (slugify(name), type)


def create_node(name, type, start, end, orga=None, info=None) : 
    """Create (or update) a node in the main graph ``G`` and return its ID.

    The node ID is the slug built by ``get_slug(name, type)``. When a node
    with that ID already exists, its time span is widened so that it covers
    both the stored span and the new ``start``/``end``: the earliest start
    and the latest end are kept.

    Parameters
    ----------
    name : node label, also used to build the ID
    type : node category, also used to build the ID
    start, end : bounds of the period attached to the node
    orga : optional organisation attribute, stored as given (``None`` if omitted)
    info : optional extra data; stored under "info" only when truthy

    Returns
    -------
    The node ID (slug) under which the node is stored in ``G``.
    """
    slug = get_slug(name, type)

    try:
        existing = G.node[slug]
        # keep the earliest known start ...
        if start > existing["start"]:
            start = existing["start"]
        # ... and the latest known end. The previous code compared ``end`` but
        # assigned ``start`` here, corrupting the start and never merging the end.
        if end < existing["end"]:
            end = existing["end"]
    except KeyError:
        # The node is not in the graph yet (or has no stored span):
        # keep the values that were passed in.
        pass

    node = {
        "id": slug,
        "type": type,
        "orga": orga,  # cluster or ARC ?
        "name": name,
        "start": start,
        "end": end,
    }

    if info:
        node["info"] = info

    # networkx 1.x signature: the second positional argument is the attribute dict
    G.add_node(node["id"], node)
    return node["id"]

def merge_edge_data(Graph, e, data):
    """
    Attach ``data`` to the edge ``e`` without erasing what is already there.

    The edge ``(e[0], e[1])`` is created when it does not exist yet, and
    ``data`` is appended to the list stored under the edge's "edge_types"
    attribute (the list is created on first use).
    """
    source, target = e[0], e[1]

    try:
        attributes = Graph.edge[source][target]
    except KeyError:
        # unknown edge: create it, then fetch its (empty) attribute dict
        Graph.add_edge(source, target)
        attributes = Graph.edge[source][target]

    # start the list on first use, then append to it
    attributes.setdefault("edge_types", []).append(data)

## Fetch data from the copy of database

The data was previously extracted from the China Vitae website (on Feb 21, 2017).

In [4]:
# open a connection to the local MongoDB server and select the chinaVitae database
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['chinaVitae']

## Build the network of locations

For each person :

* create a node for each location that appears in the person's career
* ignore institutions (for now)
* create an edge between every pair of locations that share this person

In [23]:
from itertools import combinations 

G = nx.Graph() # main graph

for bio in db.biographies.find():#.limit(1000):
    nodes = []

    # walk every (career row, link) pair, keeping only links typed as locations
    location_links = (
        (row, link)
        for row in bio["career"]
        for link in row["links"]
        if link["type"] == "location"
    )

    for row, link in location_links:
        try:
            # create (or update) the location node and remember its ID
            node = create_node(
                link["name"],
                link["type"],
                row["start"],
                row["end"],
                info={ "url" : link["url"] }
            )
        except UnicodeEncodeError:
            print("UnicodeEncodeError")
        else:
            nodes.append(node)

    # one edge per pair of distinct locations visited by this person
    person = { "name" : bio["name"], "mongo_id" : bio["_id"] }
    for pair in combinations(set(nodes), 2):
        # a fresh dict per edge, so edges never share the same data object
        merge_edge_data(G, pair, dict(person))

print("%s nodes, %s edges "%(len(G.nodes()), len(G.edges())))

1419 nodes, 9105 edges 


In [60]:
# (slug, url) of every node whose slug mentions a region or a province
region_pages = []
for node_slug, attrs in G.nodes(data=True):
    if "region" in node_slug or "province" in node_slug:
        region_pages.append((node_slug, attrs["info"]["url"]))

# for stats
unknown_slugs = []
all_slugs = []

# parse subregions index: map each known sub-location slug to its region slug
sub_to_regions = {}
for region_slug, region_url in region_pages:
    path = "data/regions/"+region_slug+".json"
    if not os.path.exists(path):
        # no local file for this region: nothing to index
        continue

    with open(path, "r") as f:
        sub_regions = json.load(f)

    for sub in sub_regions["children"]:
        slug = get_slug(sub["short_name"], "location")
        all_slugs.append(slug)
        if slug in G:
            sub_to_regions[slug] = region_slug
        else:
            # listed in the region file but absent from the graph
            unknown_slugs.append(slug)

## Mapping 


In [67]:
# create graph by region
G_by_regions = nx.Graph()

# one node per region page
for region_slug, region_url in region_pages:
    G_by_regions.add_node(region_slug)

# project every location-level edge onto the regions of its two endpoints
for u, v in G.edges():
    if u not in sub_to_regions or v not in sub_to_regions:
        # at least one endpoint has no known region: skip the edge
        continue

    source = sub_to_regions[u]
    target = sub_to_regions[v]

    if not G_by_regions.has_edge(source, target):
        G_by_regions.add_edge(source, target, weight=1)
    else:
        G_by_regions[source][target]['weight'] += 1

print("%s nodes, %s edges "%(len(G_by_regions.nodes()), len(G_by_regions.edges())))

33 nodes, 193 edges 


In [83]:
# flatten the region graph into the node / edge lists sent to topogram

nodes = []
for region_slug in G_by_regions.nodes():
    if G_by_regions.degree(region_slug) <= 0:
        # ignore singletons
        continue
    # reuse the attributes stored on the main graph (the dict is updated in place)
    node = G.node[region_slug]
    node["id"] = region_slug
    nodes.append(node)

print("%s nodes"%len(nodes))

edges = []
for u, v, edge in G_by_regions.edges(data=True):
    # the edge attribute dict is extended in place with its two endpoints
    edge["source"] = u
    edge["target"] = v
    edges.append(edge)

print("%s edges"%len(edges))

27 nodes
193 edges


In [82]:
# send regions graph to topogram

from topogram_client import TopogramAPIClient
import logging 
import datetime
import getpass

now=datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# connection settings
TOPOGRAM_URL = "https://app.topogram.io" # http://localhost:3000
USER = os.environ.get("TOPOGRAM_USER", "clement.renaud@epfl.ch")
# Never store the password in the notebook: read it from the TOPOGRAM_PASSWORD
# environment variable, or prompt for it (without echo) when it is not set.
PASSWORD = os.environ.get("TOPOGRAM_PASSWORD") or getpass.getpass("Topogram password: ")

# connect to the topogram instance 
topogram = TopogramAPIClient(TOPOGRAM_URL)

# topogram.create_user(USER, PASSWORD)
topogram.user_login(USER, PASSWORD)

# create the topogram and read its ID from the first record of the response
r = topogram.create_topogram("China Political Networks")
print(r["message"])
topogram_ID = r["data"][0]["_id"]

# fetch the nodes and edges currently stored in this topogram
# NOTE(review): nothing is written to disk here, so this is not a real backup
existing_nodes = topogram.get_nodes(topogram_ID)["data"]
existing_edges = topogram.get_edges(topogram_ID)["data"]

# clear existing graph
topogram.delete_nodes([n["_id"] for n in existing_nodes])
print("nodes deleted")
topogram.delete_edges([n["_id"] for n in existing_edges])
print("edges deleted")

# upload the region-level graph built in the previous cells
r = topogram.create_nodes(topogram_ID, nodes)
print("%s nodes created."%len(r["data"]))
r = topogram.create_edges(topogram_ID, edges)
print("%s edges created."%len(r["data"]))

print("done. Topogram is online at %s/topograms/%s/view"%(TOPOGRAM_URL, topogram_ID))

A topogram with the same name already exists
nodes deleted
edges deleted
27 nodes created.
193 edges created.
done. Topogram is online at https://app.topogram.io/topograms/PYCHNaBubGHLErZXo/view
