# ARC5 - Parse data as network

In [358]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import os
import json
from slugify import slugify
import networkx as nx
import itertools
from collections import Counter

data_dir = os.getcwd()

fichier_projets = "../final/ARC5-Final - Projets (tous).csv"
fichier_partenaires = "../final/ARC5-Final - Partenariats (tous).csv"
fichier_nodes = "../final/ARC5-Final - Noms (tous).csv"

# parsing helpers
project_types = {
    "ADR" : "Thèse",
    "projet" : "Projet de recherche",
    "postdoc" : "Recherche post-doctorale"
}

In [359]:
def get_slug(name):
    return slugify(name.decode('utf-8'))

def get_project(name):
    slug = get_slug(name)
    try :
        return G.node[slug]
    except KeyError:
        n=stored_projects[slug]
        print n
        node = create_node(n["name"], n["type"], n["start"], n["end"], orga=n["orga"])
        print node
        return G.node[slug]
    
def create_node(name, type, start, end, orga=None) : 
    
    slug = get_slug(name)
    
    try :
        if start > G.node[slug]["start"] : start =  G.node[slug]["start"]
        if end > G.node[slug]["end"] : start =  G.node[slug]["end"]
    except:
        start = start
        endd = end
            
    node = {}
    node["id"] = slug
    node["type"] = type
    node["orga"] = orga # cluster or ARC ?
    node["name"] = name
    node["start"] = start
    node["end"] = end
    
    G.add_node(node["id"], node)
    return node["id"]

# keep data when merging edges
def merge_edge_data(Graph, e, data):
    
    try : 
        Graph.edge[e[0]][e[1]]
    except KeyError:
        Graph.add_edge(e[0], e[1])
        
    try:
        Graph.edge[e[0]][e[1]]["edge_types"].append(data)
    except KeyError:
        Graph.edge[e[0]][e[1]]["edge_types"] = [data]
    

### Parse all nodes

In [360]:
print fichier_nodes

G = nx.Graph()

with open( os.path.join(data_dir, fichier_nodes), "r") as f :
    reader = csv.DictReader(f)
    for line in reader :
#         print line
        start = int(line["Début"])
        end =  int(line["Fin"])
        node = create_node(line["Nom"], line["Type"], start, end)

print "%s nodes"%len(G.nodes())
print "%s edges"%len(G.edges())
print Counter([n[1]["type"] for n in G.nodes(data=True)]) 

../final/ARC5-Final - Noms (tous).csv
281 nodes
0 edges
Counter({'laboratoire': 64, 'm\xc3\xa9diation': 51, 'patrimoine': 46, 'localit\xc3\xa9': 36, 'creation': 29, 'etablissement': 14, 'enseignement': 11, 'ecole-doctorale': 10, 'cr\xc3\xa9ation': 8, '\xc3\xa9conomique': 6, 'cst': 6})


### Parse thèses et projets

In [361]:
stored_projects={}

print fichier_projets

# print G.nodes()

with open( os.path.join(data_dir, fichier_projets), "r") as f :
    reader = csv.DictReader(f)
    for line in reader :
        
        if line["Nom Projet"] and line["Orga"] != "13" and line["Orga"] != "14":
            start = int(line["Année"])
            end =  start+3
            
            # create project
            projet = create_node(line["Nom Projet"], line["Type"], start, end, orga=line["Orga"])            

            # porteur de projet
            porteur = create_node(line["Porteurs (nom)"], "personne", start, end)
            
            # get existing
            etablissement = G.node[get_slug(line["Etablissement"])]["id"]
            laboratoire = G.node[get_slug(line["Labo"])]["id"]
            
            # TODOs : ville !
#             ville = G.node[get_slug(line["Ville"], "localite")]

            edges = []            
            edges.append((projet, etablissement))
            edges.append((projet, laboratoire))
            edges.append((projet, porteur))
            edges.append((laboratoire, porteur))
            
#             edges.append((etablissement, ville))
#             edges.append((laboratoire, ville))

            for e in edges :
                merge_edge_data(G, e, { "type" : line["Type"], "name" : line["Nom Projet"] })
        
        elif line["Orga"] == "13" or line["Orga"] == "14":
            start = int(line["Année"])
            end =  start+3
            stored_projects[get_slug(line["Nom Projet"])] = { "name" : line["Nom Projet"], "type": line["Type"], "start" : start, "end" : end, "orga" : line["Orga"] }

        
print "%s nodes"%len(G.nodes())
print "%s edges"%len(G.edges())
print Counter([n[1]["type"] for n in G.nodes(data=True)]) 
print Counter([n[1]["orga"] for n in G.nodes(data=True)]) 

../final/ARC5-Final - Projets (tous).csv
485 nodes
441 edges
Counter({'personne': 91, 'projet': 66, 'laboratoire': 64, 'm\xc3\xa9diation': 51, 'ADR': 46, 'patrimoine': 46, 'localit\xc3\xa9': 36, 'creation': 29, 'etablissement': 14, 'enseignement': 11, 'ecole-doctorale': 10, 'cr\xc3\xa9ation': 8, 'cst': 6, '\xc3\xa9conomique': 6, 'postdoc': 1})
Counter({None: 372, 'ARC5': 113})


### Parse partenaires

In [366]:
print fichier_partenaires

with open( os.path.join(data_dir,  fichier_partenaires), "r") as f :
    reader = csv.DictReader(f) 
    for i, line in enumerate(reader):
        if line["Projet"] and line["Structure"] : 

            start = int(line["début"])
            end =  int(line["fin"])

            partenaire = G.node[get_slug(line["Structure"])]

    #         TODO : ville
    #         ville = create_node(line["Ville"], "ville", start, end)

            # get project (only those with partners)
            projet = get_project(line["Projet"])        

            e = (partenaire["id"], projet["id"])
            merge_edge_data(G, e, { "type" : projet["type"], "name" : projet["name"] })

print "%s nodes"%len(G.nodes())
print "%s edges"%len(G.edges())
print Counter( [n[1]["type"] for n in G.nodes(data=True)] )

../final/ARC5-Final - Partenariats (tous).csv
494 nodes
716 edges
Counter({'personne': 91, 'projet': 67, 'laboratoire': 64, 'ADR': 54, 'm\xc3\xa9diation': 51, 'patrimoine': 46, 'localit\xc3\xa9': 36, 'creation': 29, 'etablissement': 14, 'enseignement': 11, 'ecole-doctorale': 10, 'cr\xc3\xa9ation': 8, 'cst': 6, '\xc3\xa9conomique': 6, 'postdoc': 1})


#### Convert persons to edges

In [367]:
print "before : %s nodes / %s edges"%(len(G.nodes()),len(G.edges()))

G_without_people = G.copy()

# get all persons in the graph
persons = [node[0] for node in G_without_people.nodes(data=True) if node[1]["type"] == "personne"]
# persons_edges = clean_G.edges(persons, data=True)

years=[]

for person in persons:

    # edges for a single person
    person_edges = G_without_people.edges(person)
  
    # get all nodes linked by a single person
    list_of_person_nodes = []; map(list_of_person_nodes.extend, map(list,person_edges))
    assert len(list_of_person_nodes) == len(person_edges)*2 # make sure we have all nodes
    
    clean_nodes = [n for n in list_of_person_nodes if n != person]
    
    #  years += [n[1]["start"] for n in clean_nodes]
    #  assert len(clean_nodes) == len(person_edges) # make sure we have all new nodes, except the person

    if len(person_edges) > 2 : # if have less than degree of 1 then remove node

        # get data from the node to add to the edge
        data = G_without_people.node[person]
        
        years.append(data["start"])
        
        # create new edges between all those
        new_edges = list(itertools.combinations(clean_nodes, 2))

        # create new edges with merge data info
        for e in new_edges:
            merge_edge_data(G_without_people, e, { "type" : "personne", "name" : None })

    # remove person from the graph
    G_without_people.remove_node(person)

print Counter(years)
print "after : %s nodes / %s edges"%(len(G_without_people.nodes()),len(G_without_people.edges()))

before : 494 nodes / 716 edges
Counter({2015: 8, 2016: 4, 2017: 2, 2013: 2, 2012: 1})
after : 403 nodes / 564 edges


#### Convert projects to edge

In [375]:
print "before : %s nodes / %s edges"%(len(G_without_people.nodes()),len(G_without_people.edges()))

G_without_people_and_projects = G_without_people.copy()

# get all projects in the graph
projects = [node[0] for node in G_without_people_and_projects.nodes(data=True) if node[1]["type"] == "projet" or node[1]["type"] == "ADR" or node[1]["type"] == "postdoc" ]

years = []

for project in projects:

    # edges for a single person
    project_edges = G_without_people_and_projects.edges(project)
  
    # get all nodes linked by a single person
    list_of_project_nodes = []; map(list_of_project_nodes.extend, map(list, project_edges))
    assert len(list_of_project_nodes) == len(project_edges)*2 # make sure we have all nodes
    
    clean_nodes = [n for n in list_of_project_nodes if n != project]
#     assert len(clean_nodes) == len(person_edges) # make sure we have all new nodes, except the person

    if len(project_edges) > 2 : # if have less than degree of 1 then remove node

        # get data from the node to add to the edge
        data = G_without_people_and_projects.node[project]
        
        years.append(data["start"])

        # create new edges between all those
        new_edges = list(itertools.combinations(clean_nodes, 2))
                
        # parse text properly
        
        # merge data into edge info
        for e in new_edges:
            
            proj=G.node[project]
            txt = { "type" : proj["type"], "name" : proj["name"]}
            
            merge_edge_data(G_without_people_and_projects, e, txt) 
#             G_without_people_and_projects.add_edge( e[0], e[1], {"type" : "projet", "additionalInfo" : data} )

    # remove person from the graph
    G_without_people_and_projects.remove_node(project)


print Counter(years)

print "after : %s nodes / %s edges"%(len(G_without_people_and_projects.nodes()),len(G_without_people_and_projects.edges()))
print Counter([ c[1]["type"] for c in G_without_people_and_projects.nodes(data=True)])

before : 403 nodes / 564 edges
Counter({2013: 36, 2012: 30, 2014: 12, 2015: 9, 2016: 8, 2011: 7, 2010: 1})
after : 281 nodes / 1024 edges
Counter({'laboratoire': 64, 'm\xc3\xa9diation': 51, 'patrimoine': 46, 'localit\xc3\xa9': 36, 'creation': 29, 'etablissement': 14, 'enseignement': 11, 'ecole-doctorale': 10, 'cr\xc3\xa9ation': 8, 'cst': 6, '\xc3\xa9conomique': 6})


## Parse final graph

In [376]:
# create the graph

nodes = []
for n in G_without_people_and_projects.nodes(data=True): 
    
    # ignore singletons
    if G_without_people_and_projects.degree(n[0]) > 0:
        node = n[1]
        node["id"] = n[0]
        node["group"] = n[1]["type"]
        nodes.append(node)

print "%s nodes"%len(nodes)
        
edges = []
for i, e in enumerate(G_without_people_and_projects.edges(data=True)): 
    
    edge = e[2]
    
    # calculate edge weight
    edge["weight"] = len(edge["edge_types"])
        
    notes = ""
    team = 0
    
    for t in edge["edge_types"]:
        if t["type"] == "ADR" or t["type"] == "projet" or t["type"] == "postdoc" : 
            notes = notes  + "* %s : %s \n"%(project_types[t["type"]], t["name"])
        elif t["type"] == "personne":
            team = team + 1
    
    if team != 0 : 
        notes = "* Membres d'équipe en commun \n" + notes
    
    edge["additionalInfo"] = notes    
    
    edge["source"] = e[0]
    edge["target"] = e[1]
    
    edges.append(edge)

print "%s edges"%len(edges)


212 nodes
1024 edges


#### Create the final graph on Topogram

In [382]:
from topogram_client import TopogramAPIClient
import logging 
import datetime

now=datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# passwords
TOPOGRAM_URL = "https://app.topogram.io" # "http://localhost:3000" 
USER = "arc5@arc5.com"
PASSWORD = "culture&recherche"

# connect to the topogram instance 
topogram = TopogramAPIClient(TOPOGRAM_URL)

# topogram.create_user(USER, PASSWORD)
topogram.user_login(USER, PASSWORD)
print topogram

r = topogram.create_topogram("ARC 5 - Collaborations Culture / Recherche en Rhône-Alpes")
print r
topogram_ID = r["data"][0]["_id"]

# get and backup existing nodes and edges
existing_nodes = topogram.get_nodes(topogram_ID)["data"]
url = slugify(TOPOGRAM_URL.decode('utf-8'))
with open('data/ARC5-%s-nodes-%s.json'%(url,now), 'w') as outfile:
    json.dump(existing_nodes, outfile)

existing_edges = topogram.get_edges(topogram_ID)["data"]
with open('data/ARC5-%s-edges-%s.json'%(url,now), 'w') as outfile:
    json.dump(existing_edges, outfile)

print "%s existing edges, %s existing nodes"%(len(existing_edges), len(existing_nodes))

# clear existing graph
topogram.delete_nodes([n["_id"] for n in existing_nodes])
print "nodes deleted"
topogram.delete_edges([n["_id"] for n in existing_edges])
print "edges deleted"

r = topogram.create_nodes(topogram_ID, nodes)
print "%s nodes created."%len(r["data"])
r = topogram.create_edges(topogram_ID, edges)
print "%s edges created."%len(r["data"])

print "done. Topogram is online at %s/topograms/%s/view"%(TOPOGRAM_URL, topogram_ID)

<topogram_client.TopogramAPIClient object at 0x7f3ddf79ae90>
{u'status': u'error', 'status_code': 200, u'message': u'A topogram with the same name already exists', u'data': [{u'sharedPublic': True, u'name': u'ARC 5 - Collaborations Culture / Recherche en Rh\xf4ne-Alpes', u'owner': u'FWGtG3EXa2F7nZRKp', u'_id': u'3Fep7oZAFjqBnHLQR', u'slug': u'arc-5-collaborations-culture-recherche-en-rhne-alpes', u'createdAt': u'2016-09-12T04:25:24.618Z'}]}
876 existing edges, 197 existing nodes
nodes deleted
edges deleted
212 nodes created.
1024 edges created.
done. Topogram is online at https://app.topogram.io/topograms/3Fep7oZAFjqBnHLQR/view


In [383]:
## Save to File

with open('../final/ARC5-nodes.json', 'w') as outfile:
    json.dump(nodes, outfile)

existing_edges = topogram.get_edges(topogram_ID)["data"]
with open('../final/ARC5-edges.json', 'w') as outfile:
    json.dump(edges, outfile)