# Mapping political networks in China

> This is an ongoing work.

We attempt to draw a map of the unfolding of political networks in greater China. 

The present document relates the methodology we use to explore the data and answer our first questions.


* What are the exchanges between provinces over time ?
* Which provinces are exchanging the most people ?
* What is the particular pattern emerging from specific cities in this process ?

We want to see how politicians are moving from one city to another, to interrogate how the experience developed in one specific place can be transferred to another over time. Therefore, we want to study how Chinese politicians travel and resettle along their careers.

We rely on biographic data of 4705 politicians extracted from the website [China Vitae](http://chinavitae.com).

We analyse their career paths to create network maps of the movement of politicians in China.
We use Python (and the present notebook) for data analysis and the software [Topogram](http://topogram.io) to create network visualizations.


In [99]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import os
import json
import itertools
from collections import Counter
import logging 
import datetime

from slugify import slugify

import pymongo

import networkx as nx
from networkx.readwrite import write_gpickle

from topogram_client import TopogramAPIClient

# Timestamp of this run, formatted like "2017-02-21_14:03:59".
now = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# Topogram connection settings.
# NOTE(review): credentials should not be committed in a shared notebook.
# Each value can be overridden through the environment (TOPOGRAM_URL,
# TOPOGRAM_USER, TOPOGRAM_PASSWORD); the literals are kept only as fallbacks
# so that existing runs keep working -- rotate the password and remove them.
TOPOGRAM_URL = os.environ.get("TOPOGRAM_URL", "https://app.topogram.io") # http://localhost:3000
USER = os.environ.get("TOPOGRAM_USER", "clement.renaud@epfl.ch")
PASSWORD = os.environ.get("TOPOGRAM_PASSWORD", "makerspaces")

In [129]:
# Build the API client for the Topogram instance at TOPOGRAM_URL.
topogram = TopogramAPIClient(TOPOGRAM_URL)

# Account creation is left disabled; uncomment it to register USER first.
# topogram.create_user(USER, PASSWORD)

# Log in; the functions below reuse this module-level client.
topogram.user_login(USER, PASSWORD)


def get_slug(name, type):
    """ get a clean string ID from name and type

    The ID has the form ``<slugified name>-<type>``. Both arguments may be
    UTF-8 byte strings or already-decoded text.
    """
    # Decode byte strings only. Calling .decode() on text makes Python 2
    # encode it to ASCII first, which raises UnicodeEncodeError as soon as
    # the name contains a non-ASCII character.
    if isinstance(name, bytes):
        name = name.decode('utf-8')
    if isinstance(type, bytes):
        type = type.decode('utf-8')
    return "%s-%s" % (slugify(name), type)


def create_node(name, type, start, end, orga=None, info=None) :
    """create the node at the right format in the main graph

    The node is stored in the module-level graph ``G`` under the ID returned
    by ``get_slug(name, type)``. When a node with that ID already exists, the
    stored period is widened: the earliest ``start`` and the latest ``end``
    seen so far are kept.

    name  -- display name of the node
    type  -- node type, also used as the suffix of the ID
    start -- start of the period during which the node is mentioned
    end   -- end of that period
    orga  -- optional value stored under the "orga" key
    info  -- optional value stored under the "info" key (only when truthy)

    Returns the node ID.
    """
    slug = get_slug(name, type)

    if slug in G:
        known = G.node[slug]
        try:
            # keep the earliest known start ...
            if start > known["start"]:
                start = known["start"]
            # ... and the latest known end
            if end < known["end"]:
                end = known["end"]
        except (KeyError, TypeError):
            # the stored node has no dates, or they cannot be compared with
            # the new ones: fall back on the dates of the current row
            pass

    node = {}
    node["id"] = slug
    node["type"] = type
    node["orga"] = orga # cluster or ARC ?
    node["name"] = name
    node["start"] = start
    node["end"] = end

    if info :
        node["info"] = info

    G.add_node(node["id"], node)
    return node["id"]

def merge_edge_data(Graph, e, data):
    """
    Attach ``data`` to the edge ``e`` without erasing what the edge already holds.

    ``e`` is a pair (source, target). The edge is created when missing, then
    ``data`` is appended to the list stored under its "edge_types" key
    (the list is created on first use).
    """
    source, target = e[0], e[1]

    if not Graph.has_edge(source, target):
        Graph.add_edge(source, target)

    Graph[source][target].setdefault("edge_types", []).append(data)


# functions for Topogram
def parse_nodes(Graph):
    """Collect the attribute dicts of every connected node of ``Graph``.

    Nodes without any edge in ``Graph`` are skipped. The attributes are read
    from the module-level graph ``G`` (not from ``Graph``), so each node ID
    must also exist in ``G``; the dict found there gets its "id" key set in
    place before being returned.

    Prints the number of nodes kept and returns them as a list of dicts.
    """
    nodes = []
    for node_id, _attrs in Graph.nodes(data=True):
        if not Graph.degree(node_id) > 0:
            continue # ignore singletons
        record = G.node[node_id]
        record["id"] = node_id
        nodes.append(record)

    print("%s nodes" % len(nodes))
    return nodes

def parse_edges(Graph):
    """Collect the edges of ``Graph`` as a list of dicts.

    Each dict yielded by ``Graph.edges(data=True)`` is modified in place:
    the "source" and "target" keys receive the IDs of both ends of the edge.

    Prints the number of edges and returns the list.
    """
    edges = []
    for source, target, payload in Graph.edges(data=True):
        payload["source"] = source
        payload["target"] = target
        edges.append(payload)

    print("%s edges" % len(edges))
    return edges
    
def create_topogram(title, nodes, edges):
    """Publish ``nodes`` and ``edges`` as the topogram called ``title``.

    Uses the module-level ``topogram`` client. The topogram is requested with
    ``create_topogram(title)``, its current nodes and edges are fetched and
    deleted, then the given nodes and edges are uploaded. Progress is printed
    along the way; nothing is returned.
    """
    print("Creating topogram '%s'" % title)

    response = topogram.create_topogram(title)
    print(response["message"])
    topogram_ID = response["data"][0]["_id"]

    # fetch what the topogram currently contains
    existing_nodes = topogram.get_nodes(topogram_ID)["data"]
    existing_edges = topogram.get_edges(topogram_ID)["data"]

    # wipe the current content before uploading the new graph
    stale_node_ids = [item["_id"] for item in existing_nodes]
    topogram.delete_nodes(stale_node_ids)
    print("nodes deleted")
    stale_edge_ids = [item["_id"] for item in existing_edges]
    topogram.delete_edges(stale_edge_ids)
    print("edges deleted")

    response = topogram.create_nodes(topogram_ID, nodes)
    print("%s nodes created." % len(response["data"]))
    response = topogram.create_edges(topogram_ID, edges)
    print("%s edges created." % len(response["data"]))

    print("done. Topogram is online at %s/topograms/%s/view" % (TOPOGRAM_URL, topogram_ID))

## Fetch data from the copy of database

Previously extracted from the China Vitae website on Feb 21, 2017.

In [130]:
# Connect to the MongoDB server on localhost and select the "chinaVitae" database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client["chinaVitae"]

## Build the network of locations


During his/her career, each politician occupied a number of successive positions. These positions are located in specific regions and institutions. For each line in the CV, the original data contains two main components (location and institution).

Here, we decided to extract all locations (nodes in the graph). 
Each time two locations are mentioned in a single person's career, we add a link between both locations (edges in the graph).

So, for each person we :

* create a node corresponding to locations
* ignore institutions
* create an edge between nodes for each place mentioned
* TODO: properly store the "start" and "end" of the link.
* TODO: add names of the politicians in the edge


In [189]:
from itertools import combinations 

G = nx.Graph() # main graph

# One pass per biography: collect the location nodes of the career, then link
# every pair of distinct locations found in it.
for bio in db.biographies.find():
    nodes = []
    for row in bio["career"]:
        for link in row["links"]:

            # only locations become nodes; other link types are skipped
            if link["type"] != "location":
                continue

            try :
                node = create_node(
                    link["name"],
                    link["type"],
                    row["start"],
                    row["end"],
                    info={ "url" : link["url"] }
                )
            except UnicodeEncodeError:
                # the location is left out of the graph
                print("UnicodeEncodeError")
            else:
                nodes.append(node)

    # create edges
    # TODO : for now
    person = { "name" : bio["name"], "mongo_id" : bio["_id"]}
    for e in combinations(set(nodes), 2):
        merge_edge_data(G, e, person)

print("Final size of the graph : %s nodes (places), %s edges (common people)" % (len(G.nodes()), len(G.edges())))

Final size of the graph : 1419 nodes (places), 9105 edges (common people)


## Province/county geographic hierarchy

The data format relies on a specific classification of county/province/cities.
To make the graph more explicit, we decide to aggregate by province.

TODO: aggregate by county, or at the city level


In [190]:
# (node ID, page URL) for every node whose ID mentions a region or a province
region_pages = [
    (node_id, attrs["info"]["url"])
    for node_id, attrs in G.nodes(data=True)
    if "region" in node_id or "province" in node_id
]

# for stats
unknown_slugs = []
all_slugs = []

# parse subregions index: map each sub-region present in G to its region
sub_to_regions = {}
for region_slug, _page_url in region_pages:
    path = "data/regions/"+region_slug+".json"
    if not os.path.exists(path):
        continue # no index file for this region

    with open(path, "r") as f:
        sub_regions = json.load(f)

    for sub in sub_regions["children"]:
        slug = get_slug(sub["short_name"], "location")
        all_slugs.append(slug)
        if slug in G:
            sub_to_regions[slug] = region_slug
        else:
            unknown_slugs.append(slug)

print("%s sub-regions parsed" % len(sub_to_regions))

911 sub-regions parsed


## Mapping relationships between regions

We first map relationships between regions. 

For each place mentioned in the graph, we aggregate the cities and counties into the province they belong to.

Then, we use the software [Topogram](http://topogram.io) to create a map of the network.

In [191]:
# create graph by region
G_by_regions = nx.Graph()

# create nodes: one per region page found in the main graph
for r in region_pages:
    slug = r[0]
    G_by_regions.add_node(slug)

# Project every edge of the main graph onto the regions of its two ends.
# Edges with an end that has no known region are skipped.
for e in G.edges():
    # Test membership on the dict itself: under Python 2, `.keys()` builds a
    # list, which turns each test into a linear scan repeated for every edge.
    if e[0] in sub_to_regions and e[1] in sub_to_regions:
        source = sub_to_regions[e[0]]
        target = sub_to_regions[e[1]]

        # the weight counts how many edges of G fall between the two regions
        if G_by_regions.has_edge(source, target):
            G_by_regions[source][target]['weight'] += 1
        else:
            G_by_regions.add_edge(source, target, weight=1)


print("Regional graph: %s nodes (provinces), %s edges " % (len(G_by_regions.nodes()), len(G_by_regions.edges())))


# Create the interactive map
nodes = parse_nodes(G_by_regions)
edges = parse_edges(G_by_regions)
create_topogram("China Political Networks", nodes, edges)

Regional graph: 33 nodes (provinces), 193 edges 
27 nodes
193 edges
Creating topogram 'China Political Networks'
A topogram with the same name already exists
nodes deleted
edges deleted
27 nodes created.
193 edges created.
done. Topogram is online at https://app.topogram.io/topograms/PYCHNaBubGHLErZXo/view


## Mapping relationships from Shenzhen

In [215]:
import html2text # used to parse text

G_shenzhen = nx.Graph()


# select only shenzhen
# NOTE: this is a substring match on the node ID; when several IDs contain
# "shenzhen", the last one iterated is kept.
shenzhen_id = None
for n in G.nodes():
    if "shenzhen" in n:
        shenzhen_id = n

if shenzhen_id is None:
    raise LookupError("no node matching 'shenzhen' in the main graph")

# neighbours of Shenzhen in the main graph, with the data of each edge
shenzhen_targets = G[shenzhen_id]

# Register the Shenzhen node itself. Use shenzhen_id here: after the loop
# above, `n` only holds the last node iterated, whatever it is.
G_shenzhen.add_node(shenzhen_id, G.node[shenzhen_id])

print(len(shenzhen_targets))
# print shenzhen_links

# store names of the guys

for target in shenzhen_targets:
    people = shenzhen_targets[target]["edge_types"]

    # Markdown summary of every person linking Shenzhen to this place
    additionalInfo = "\n"
    for info in people:
        # get some precisions
        bio = db.biographies.find_one(info["mongo_id"])
        additionalInfo += """#### [%s](%s) \n
\n
**%s**\n
%s\n
"""%(bio["name"], bio["url"], bio["title"], html2text.html2text(bio["biography"]))

    # the weight is the number of people shared by both places
    G_shenzhen.add_edge(shenzhen_id, target, additionalInfo=additionalInfo, weight=len(people))

    # store data
    G_shenzhen.add_node(target, G.node[target])

# Create the interactive map
nodes = parse_nodes(G_shenzhen)
edges = parse_edges(G_shenzhen)
create_topogram("Shenzhen Links", nodes, edges)
    

53
54 nodes
53 edges
Creating topogram 'Shenzhen Links'
A topogram with the same name already exists
nodes deleted
edges deleted
54 nodes created.
53 edges created.
done. Topogram is online at https://app.topogram.io/topograms/WhCjLGZFL5JMfEGZr/view


In [218]:
import html2text # used to parse text

G_chengdu = nx.Graph()


# select only chengdu
# NOTE: this is a substring match on the node ID; every match is printed and,
# when several IDs contain "chengdu", the last one iterated is kept.
chengdu_id = None
for n in G.nodes():
    if "chengdu" in n:
        print(n)
        chengdu_id = n

if chengdu_id is None:
    raise LookupError("no node matching 'chengdu' in the main graph")

# neighbours of Chengdu in the main graph, with the data of each edge
chengdu_targets = G[chengdu_id]

# Register the Chengdu node itself. Use chengdu_id here: after the loop
# above, `n` only holds the last node iterated, whatever it is.
G_chengdu.add_node(chengdu_id, G.node[chengdu_id])

print(len(chengdu_targets))

for target in chengdu_targets:
    people = chengdu_targets[target]["edge_types"]

    # Markdown summary of every person linking Chengdu to this place
    additionalInfo = "\n"
    for info in people:
        # get some precisions
        bio = db.biographies.find_one(info["mongo_id"])
        additionalInfo += """#### [%s](%s) \n
\n
**%s**\n
%s\n
"""%(bio["name"], bio["url"], bio["title"], html2text.html2text(bio["biography"]))

    # the weight is the number of people shared by both places
    G_chengdu.add_edge(chengdu_id, target, additionalInfo=additionalInfo, weight=len(people))

    # store data
    G_chengdu.add_node(target, G.node[target])

# Create the interactive map
nodes = parse_nodes(G_chengdu)
edges = parse_edges(G_chengdu)
create_topogram("Mobility of officials around Chengdu", nodes, edges)
    

chengdu-city-location
67


NameError: name 'chengdu_links' is not defined