# ARC5 - Parse data as network

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import os
import json
from slugify import slugify
import networkx as nx
import itertools
from collections import Counter

data_dir = os.getcwd()
fichier_theses = "../final/ARC5-Final-ADRs.csv"
fichier_projets = "../final/ARC5-Final-projets.csv"
fichier_partenaires= "../final/partenaires.csv"
partner_categories_file="../final/partner_categories.txt"


In [3]:
def create_node(name, type, start, end, tmp=False) : 
    
    slug = slugify(name.decode('utf-8')) + "-" + type
    if tmp : slug = slug + "-tmp"
    
    try :
        if start > G.node[slug]["start"] : start =  G.node[slug]["start"]
        if end > G.node[slug]["end"] : start =  G.node[slug]["end"]
    except:
        start = start
        endd = end
            
    node = {}
    node["id"] = slug
    node["type"] = type
    node["name"] = name
    node["start"] = start
    node["end"] = end
    node["tmp"] = tmp
    
    G.add_node(node["id"], node)
    return node["id"]

# keep data when merging edges
def merge_edge_data(Graph, edge, data):
    
    try : 
        Graph.edge[e[0]][e[1]]    
    except KeyError:
        Graph.add_edge(e[0], e[1])
        
    try:
        prevData = Graph.edge[e[0]][e[1]]["additionalInfo"]
        Graph.edge[e[0]][e[1]]["additionalInfo"] = prevData + data
    except KeyError:
        Graph.edge[e[0]][e[1]]["additionalInfo"] = data
    
    x = Graph.edge[e[0]][e[1]]["additionalInfo"]
    

### Parse thèses et projets

In [4]:
print fichier_theses
print fichier_projets

G = nx.Graph()

with open( os.path.join(data_dir, fichier_projets), "r") as f :
    reader = csv.DictReader(f)
    for line in reader :

        start = int(line["Start"])
        end =  start+3

        projet = create_node(line["Titre"], "projet", start, end)
        etablissement = create_node(line["Etablissement"], "etablissement", start, end)
        porteur = create_node(line["Porteur (Nom)"] + " " + line["Porteur(Prenom)"], "personne", start, end)
        laboratoire = create_node(line["Laboratoire"], "laboratoire", start, end)
        ville = create_node(line["Ville"], "ville", start, end)
        
        edges = []
        edges.append((projet, porteur))
        edges.append((projet, etablissement))
        edges.append((projet, laboratoire))
        edges.append((etablissement, ville))
        edges.append((laboratoire, ville))
        edges.append((laboratoire, porteur))
        
        for e in edges : merge_edge_data(G, e, "* **projet** : %s \n "%line["Titre"])

with open( os.path.join(data_dir, fichier_theses), "r") as f :
    reader = csv.DictReader(f)
    for line in reader :
        
        start = int(line["Start"])
        end =  start+3

        these = create_node(line["Titre"], "these", start, end)
        etablissement = create_node(line["Etablissement"], "etablissement", start, end)
        directeur = create_node(line["Directeur de thèse"], "personne", start, end)
        ecole_doctorale = create_node(line["Ecole doctorale"], "ecole-doctorale", start, end)
        laboratoire = create_node(line["Laboratoire"], "laboratoire", start, end)
        
        edges = []
        edges.append((these, etablissement))
        edges.append((these, laboratoire))
        edges.append((these, directeur))
        edges.append((these, ecole_doctorale))
        edges.append((ecole_doctorale, ville))
        edges.append((laboratoire, ville))
        edges.append((laboratoire, directeur))
        edges.append((etablissement, ville))
        
        for e in edges : merge_edge_data(G, e, "* **thèse** : %s \n "%line["Titre"])

        
print "%s nodes"%len(G.nodes())
print "%s edges"%len(G.edges())
print Counter([n[1]["type"] for n in G.nodes(data=True)]) 

../final/ARC5-Final-ADRs.csv
../final/ARC5-Final-projets.csv
338 nodes
680 edges
Counter({'personne': 111, 'projet': 75, 'laboratoire': 61, 'these': 49, 'etablissement': 22, 'ecole-doctorale': 16, 'ville': 4})


### Parse partenaires 

In [5]:
print fichier_partenaires

with open( os.path.join(data_dir,  fichier_partenaires), "r") as f :
    reader = csv.DictReader(f) 
    for line in reader:
        
        partenaire = create_node(line["Structure"], "partenaire", start, end)
        referent = create_node(line["Personne référente"], "personne", start, end)
        ville = create_node(line["Ville"], "ville", start, end)
        
        # check if the project exists
        projet_name = line["Projet"]
        projet_id = slugify(projet_name.decode('utf-8'))
        try: 
            projet = G.node[projet_id + "-these"]["id"]
        except KeyError:
            try : 
                projet = G.node[projet_id + "-projet"]["id"]
            except KeyError:
                # add projet with a tmp flag to make sure we are not erasing anything
                projet = create_node(projet_name, "projet", start, end, True)
            
#         edges = itertools.combinations([partenaire, ville, referent, projet], 2)
        edges = []
        edges.append((partenaire, ville))
        edges.append((partenaire, referent))
        edges.append((partenaire, projet))
        
        for e in edges : 
            merge_edge_data(G, e, "* **projet** : %s \n "%projet_name)

# remove empty node
# G.nodes().remove("")
print "%s nodes"%len(G.nodes())
print "%s edges"%len(G.edges())

../final/partenaires.csv
602 nodes
1070 edges


### Check similar /identical words

In [6]:
import pickle
from difflib import get_close_matches

# store similar nodes in a file
similar_words_file = "../final/raw_similar_words.txt"
similars = []
if os.path.isfile(similar_words_file):
    with open(similar_words_file, "r") as infile:
        similars = pickle.load(infile)
        
# get all similar nodes
if len(similars) == 0:
    for node in G.nodes() :
        similar = get_close_matches(node, G.nodes())
        if len(similar) > 0 :
            for s in similar:
                if G.node[node]["type"] == G.node[s]["type"] and len( set([node,s]) ) != 1: # check if has the same type
                    if set([node,s]) not in similars : # check if it is already added
                        if s != "" :
                            similars.append(set([node,s]))

if not os.path.isfile(similar_words_file):
    with open(similar_words_file, "wb") as outfile:
        pickle.dump(similars, outfile)

print "%s similar nodes"%len(similars)

# print similars

702 similar nodes


#### Manual matching 

In [7]:
matches = []

if os.path.isfile("../final/matches.txt"):
    with open("matches.txt", "r") as infile:
        matches = pickle.load(infile)
    print "%s duplicate nodes loaded"%len(matches)

# print matches
if len(matches) == 0:
    for i, sim in enumerate(similars):
        print "---------- %s/%s"%(i, len(similars))
        if sim not in matches:
            source = tuple(sim)[0]
            target = tuple(sim)[1]

            # manually check if similar

            manual_check = raw_input("Are those words similar (y/n)?  \n '%s' \n '%s' ?: "%(source, target))
            if manual_check == "y":
                print "they are similar"
                matches.append( (source , target) )
                with open("matches.txt", "wb") as outfile:
                    pickle.dump(matches, outfile)
            else :
                print "they are not similar"
        else :
            print "already processed"

    print "%s names that match "%len(matches)

print "-"*10

4 duplicate nodes loaded
----------


#### Check tmp projects

In [8]:
tmp_projects = [ n[0] for n in G.nodes(data=True) if n[1]["type"] == "projet" and n[1]["tmp"] == True ]
print "%s tmp projects"%len(tmp_projects)

print tmp_projects
for p in [ n[0] for n in G.nodes(data=True) if n[1]["type"] == "projet"]:
    if "monnaie" in p : print p
    if "epistekne" in p : print p

matches.append((
    "monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-projet", 
    "monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-mara-postdoc-projet-tmp"
)) 

matches.append((
    "monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-projet", 
    "mara-monnaie-antique-en-rhone-alpes-du-document-monetaire-a-son-exploitation-projet"
)) 

matches.append((
    "epistekne-projet-danimation-en-vue-de-la-creation-dun-gdr-cnrs-autour-des-processus-de-creation-production-dans-les-rapports-arts-sciences-techniques-projet",
    "pistekne-projet-danimation-en-vue-de-la-creation-dun-gdr-cnrs-autour-des-processus-de-creation-production-dans-les-rapports-arts-sciences-techniques-projet-tmp"
))

print "%s occurences matches"%len(matches)

3 tmp projects
[u'pistekne-projet-danimation-en-vue-de-la-creation-dun-gdr-cnrs-autour-des-processus-de-creation-production-dans-les-rapports-arts-sciences-techniques-projet-tmp', u'-projet-tmp', u'monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-mara-postdoc-projet-tmp']
epistekne-projet-danimation-en-vue-de-la-creation-dun-gdr-cnrs-autour-des-processus-de-creation-production-dans-les-rapports-arts-sciences-techniques-projet
mara-monnaie-antique-en-rhone-alpes-du-document-monetaire-a-son-exploitation-projet
monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-projet
monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-mara-postdoc-projet-tmp
7 occurences matches


#### Merge matching nodes 

In [9]:
# prevent deleting indexes
match_reverse_index = { m[1]:m[0] for m in matches }

for k in match_reverse_index.values():
    if k in match_reverse_index.keys() :
        matches.append((match_reverse_index[k], k)) 

print "%s duplicates nodes"%len(matches)

7 duplicates nodes


#### Create a clean graph

In [10]:
from collections import Counter

excluded_nodes = [""]
duplicates = { m[0] : m[1] for m in matches}

edges_count_before = len(G.edges())
nodes_count_before = len(G.nodes())

G_clean = G.copy()
# G_clean.remove_node("") # delete the node with an empty name  
print edges_count_before, len(G_clean.edges())

# get only clean nodes and edges
for n in G_clean.nodes(data=True):
    if n[0] not in excluded_nodes:
        
        if n[0] in duplicates.keys():
            
            # get the duplicate
            duplicate_id = duplicates[n[0]]
            
            # store information from the previous node
            duplicate = G_clean.node[duplicate_id]
            
            if duplicate["start"] != "" and duplicate["start"] < n[1]["start"]:
                G_clean.node[n[0]]["start"] = duplicate["start"]
            
            if duplicate["end"] != "" and duplicate["end"] > n[1]["end"]:
                G_clean.node[n[0]]["end"] = duplicate["end"]
        
            # update edges
            duplicate_edges = G_clean.edges(duplicate_id)
            for e in duplicate_edges: 
                G_clean.add_edge( n[0], e[1])


# delete the duplicates nodes and all their edges
# for d in duplicates : 
#     G_clean.remove_node(d)

print "%s nodes merged"%(edges_count_before-len(G_clean.edges()))
print "%s edges merged"%(nodes_count_before-len(G_clean.nodes()))
print "-"*10

print "REMOVE DUPLICATES"
print "before : %s nodes / %s edges "%(len(G.nodes()), len(G.edges()))
print "after : %s nodes / %s edges "%(len(G_clean.nodes()), len(G_clean.edges()))
print "-"*10

print "types :%s " % Counter([ n[1]["type"] for n in G_clean.nodes(data=True) ])
print "dates :%s " % Counter([ n[1]["start"] for n in G_clean.nodes(data=True) ])
print "-"*10

tmp_projects = [ n[0] for n in G_clean.nodes(data=True) if n[1]["type"] == "projet" and n[1]["tmp"] == True ]
print "%s tmp projects"%len(tmp_projects)
print tmp_projects

1070 1070
-5 nodes merged
0 edges merged
----------
REMOVE DUPLICATES
before : 602 nodes / 1070 edges 
after : 602 nodes / 1075 edges 
----------
types :Counter({'personne': 225, 'partenaire': 114, 'projet': 78, 'laboratoire': 61, 'these': 49, 'ville': 37, 'etablissement': 22, 'ecole-doctorale': 16}) 
dates :Counter({2015: 304, 2013: 129, 2012: 102, 2014: 29, 2011: 27, 2016: 6, 2017: 5}) 
----------
3 tmp projects
[u'-projet-tmp', u'pistekne-projet-danimation-en-vue-de-la-creation-dun-gdr-cnrs-autour-des-processus-de-creation-production-dans-les-rapports-arts-sciences-techniques-projet-tmp', u'monnaie-imperiale-et-corpus-numismatique-en-rhone-alpes-mara-postdoc-projet-tmp']


#### Convert persons to edges

In [11]:
print "before : %s nodes / %s edges"%(len(G_clean.nodes()),len(G_clean.edges()))

G_without_people = G_clean.copy()

# get all persons in the graph
persons = [node[0] for node in G_without_people.nodes(data=True) if node[1]["type"] == "personne"]
# persons_edges = clean_G.edges(persons, data=True)


for person in persons:

    # edges for a single person
    person_edges = G_without_people.edges(person)
  
    # get all nodes linked by a single person
    list_of_person_nodes = []; map(list_of_person_nodes.extend, map(list,person_edges))
    assert len(list_of_person_nodes) == len(person_edges)*2 # make sure we have all nodes
    
    clean_nodes = [n for n in list_of_person_nodes if n != person]
    #  assert len(clean_nodes) == len(person_edges) # make sure we have all new nodes, except the person

    if len(person_edges) > 2 : # if have less than degree of 1 then remove node

        # get data from the node to add to the edge
        data = G_without_people.node[person]

        # create new edges between all those
        new_edges = list(itertools.combinations(clean_nodes, 2))

        # create new edges with merge data info
        for e in new_edges:
            merge_edge_data(G_without_people, e, "* **personnel commun**")

    # remove person from the graph
    G_without_people.remove_node(person)

print "after : %s nodes / %s edges"%(len(G_without_people.nodes()),len(G_without_people.edges()))

before : 602 nodes / 1075 edges
after : 377 nodes / 896 edges


#### Convert projects to edge

In [12]:
print "before : %s nodes / %s edges"%(len(G_without_people.nodes()),len(G_without_people.edges()))

G_without_people_and_projects = G_without_people.copy()

# get all projects in the graph
projects = [node[0] for node in G_without_people_and_projects.nodes(data=True) if node[1]["type"] == "projet" or node[1]["type"] == "these" or node[1]["type"] == "postdoc" ]


    
for project in projects:

    # edges for a single person
    project_edges = G_without_people_and_projects.edges(project)
  
    # get all nodes linked by a single person
    list_of_project_nodes = []; map(list_of_project_nodes.extend, map(list, project_edges))
    assert len(list_of_project_nodes) == len(project_edges)*2 # make sure we have all nodes
    
    clean_nodes = [n for n in list_of_project_nodes if n != project]
#     assert len(clean_nodes) == len(person_edges) # make sure we have all new nodes, except the person

    if len(project_edges) > 2 : # if have less than degree of 1 then remove node

        # get data from the node to add to the edge
        data = G_without_people_and_projects.node[project]

        # create new edges between all those
        new_edges = list(itertools.combinations(clean_nodes, 2))
                
        # TODO: merge data into edge info
        for e in new_edges:
            merge_edge_data(G_without_people_and_projects, e, "* **projet** ： %s \n"%str(G_clean.node[project]["name"]))
#             G_without_people_and_projects.add_edge( e[0], e[1], {"type" : "projet", "additionalInfo" : data} )

    # remove person from the graph
    G_without_people_and_projects.remove_node(project)


    

print "after : %s nodes / %s edges"%(len(G_without_people_and_projects.nodes()),len(G_without_people_and_projects.edges()))
print Counter([ c[1]["type"] for c in G_without_people_and_projects.nodes(data=True)])

before : 377 nodes / 896 edges
after : 250 nodes / 1134 edges
Counter({'partenaire': 114, 'laboratoire': 61, 'ville': 37, 'etablissement': 22, 'ecole-doctorale': 16})


#### Create the list of partners with the right type

In [16]:
partners = [ n for n in G_without_people_and_projects.nodes(data=True) if n[1]["type"] == "partenaire"]
print "%s partenaires"%len(partners)
print Counter([n[1]["type"] for n in G_without_people_and_projects.nodes(data=True)]) 

categories=[
{
    "id" : "patrimoine",
    "name" : "Institutions Patrimoniales"
},
{
    "id" : "creation",
    "name" : "Création"
    },
{
    "id" : "médiation",
    "name" :  "Structures Médiatrices"
},
{
    "id" : "cst",
    "name" :  "Culture Scientifique et Technique (CST)"
},
{
    "id" : "enseignement",
    "name" :  "Enseignement & Recherche"
}
]

# import existing results
partner_categories_file="../final/partner_categories.txt"
print partner_categories_file
if os.path.isfile(partner_categories_file):
    with open(partner_categories_file, "r") as infile:
        partner_categories = pickle.load(infile)
#         print partner_categories_raw
#         partner_categories =  { partner_categories_raw[p] for p in partner_categories_raw }
    partner_categories_names = [str(p) for p in partner_categories.keys()]
print "%s partner categories loaded"%len(partner_categories_names )


partner_without_categories = [ p for p in partners if p[0] not in partner_categories_names]
print "%s partner without categories"%len( partner_without_categories )

for partner in partner_without_categories:
    print "-"*10

    manual_check = raw_input("""A quelle catégorie appartient? \n
    [0] : Institutions Patrimoniales (musées, bibliothèques, archives...)
    [1] : Création (théatre, art...)
    [2] : Structures Médiatrices (CCSTI, Arald, OPC, Nacre...)
    [3] : Culture Scientifique et Technique (CST)
    [4] : Enseignement (Ecoles, conservatoire...)\n
    '%s'
    """%partner[1]["name"])

    # assert int(manual_check)
    assert int(manual_check)  < 5

    category  = categories[int(manual_check)]
    print "%s"%(category["name"])

    partner_categories[partner[0]] = category["id"]

    with open(partner_categories_file, "wb") as outfile:
        pickle.dump(partner_categories, outfile)

G_ok = G_without_people_and_projects.copy()

for n in G_ok.nodes(data=True):
    if n[1]["type"] == "partenaire" : 
        n[1]["type"] = partner_categories[n[0]]

print Counter([n[1]["type"] for n in G_ok.nodes(data=True)]) 

114 partenaires
Counter({'partenaire': 114, 'laboratoire': 61, 'ville': 37, 'etablissement': 22, 'ecole-doctorale': 16})
../final/partner_categories.txt
289 partner categories loaded
0 partner without categories
Counter({'laboratoire': 61, 'ville': 37, 'creation': 36, 'm\xc3\xa9diation': 36, 'patrimoine': 33, 'etablissement': 22, 'ecole-doctorale': 16, 'cst': 5, 'enseignement': 4})


#### Create the final graph on Topogram

In [25]:
from topogram_client import TopogramAPIClient
import logging 

# passwords
TOPOGRAM_URL = "https://app.topogram.io" # "http://localhost:3000"
USER = "arc5@arc5.com"
PASSWORD = "culture&recherche"

# connect to the topogram instance 
topogram = TopogramAPIClient(TOPOGRAM_URL)

# topogram.create_user(USER, PASSWORD)
topogram.user_login(USER, PASSWORD)
print topogram

r = topogram.create_topogram("ARC 5 - Collaborations Culture / Recherche en Rhône-Alpes")
print r
topogram_ID = r["data"][0]["_id"]


# delete existing nodes
existing_nodes = topogram.get_nodes(topogram_ID)["data"]
existing_edges = topogram.get_edges(topogram_ID)["data"]
print "%s existing edges, %s existing nodes"%(len(existing_edges), len(existing_nodes))

topogram.delete_nodes([n["_id"] for n in existing_nodes])
print "nodes deleted"
topogram.delete_edges([n["_id"] for n in existing_edges])
print "edges deleted"

existing_nodes = topogram.get_nodes(topogram_ID)["data"]
existing_edges = topogram.get_edges(topogram_ID)["data"]
print "%s existing edges, %s existing nodes"%(len(existing_edges), len(existing_nodes))


# create the graph
nodes = []
for n in G_ok.nodes(data=True): 
    node = n[1]
    node["id"] = n[0]
    node["group"] = n[1]["type"]
    nodes.append(node)

print "creating %s nodes ..."%len(nodes)
r = topogram.create_nodes(topogram_ID, nodes)

edges = []
for e in G_ok.edges(data=True): 
    edge = e[2]
    edge["source"] = e[0]
    edge["target"] = e[1]
    edges.append(edge)

print "creating %s edges ..."%len(edges)
r = topogram.create_edges(topogram_ID, edges)

print "done. Topogram is online at http://app.topogram.io/topograms/%s"%topogram_ID

<topogram_client.TopogramAPIClient object at 0x7f5b73ea1b50>
{u'status': u'error', 'status_code': 200, u'message': u'A topogram with the same name already exists', u'data': [{u'sharedPublic': False, u'name': u'ARC 5 - Collaborations Culture / Recherche en Rh\xf4ne-Alpes', u'owner': u'FWGtG3EXa2F7nZRKp', u'_id': u'3Fep7oZAFjqBnHLQR', u'slug': u'arc-5-collaborations-culture-recherche-en-rhne-alpes', u'createdAt': u'2016-09-12T04:25:24.618Z'}]}
0 existing edges, 0 existing nodes
nodes deleted
edges deleted
0 existing edges, 0 existing nodes
creating 250 nodes ...
creating 1134 edges ...
done. Topogram is online at http://app.topogram.io/topograms/3Fep7oZAFjqBnHLQR
