In [1]:
%matplotlib inline

In [2]:
import networkx as nx
import projx as px

#### Dirty XML from Gephi

In [3]:
# Load the Gephi-exported GraphML file and convert it to an undirected graph.
g = nx.read_graphml('data/full_prelims.graphml').to_undirected()
# Peek at the first node and its attributes. Indexing the result of nodes()
# only works when it returns a list (as the echoed output below shows it does here).
g.nodes(data=True)[0]

('2318',
 {'Fecha': u'1605-04-01',
  'Titulo': u'Erratas_Arcadia Madrid Cuesta 1605',
  '[Schema] Type': u'Fe de erratas',
  '[Schema] Type Id': u'153',
  'b': 153,
  'g': 153,
  'label': u'Erratas_Arcadia Madrid Cuesta 1605',
  'r': 153,
  'size': 10.0,
  'x': 379.74106,
  'y': -81.48442})

In [4]:
# Process the data with a couple one-off functions.
def process_graph(g):
    """Build a cleaned, typed copy of the raw Gephi graph.

    For every node, layout/schema attributes (the names in ``ignore``) are
    dropped, the remaining attribute names are lower-cased, and their values
    have double quotes stripped and are UTF-8 encoded. The node's
    ``[Schema] Type`` becomes its ``type``; the document-like types listed in
    ``to_merge`` are collapsed into ``Documento`` and keep their original type
    in ``doc_type``. The ``label`` is copied with double quotes removed.

    Every edge receives a ``type`` of the form ``<source type>_<target type>``
    (lower-cased). The first orientation seen for a pair of node types is
    reused for the reverse orientation, so ``a_b`` and ``b_a`` never coexist.

    Parameters
    ----------
    g : graph
        Source graph exposing ``nodes(data=True)`` and ``edges(data=True)``.
        Every node must carry ``[Schema] Type`` and ``label``; all attributes
        not listed in ``ignore`` must be strings.

    Returns
    -------
    networkx.Graph
        The new, cleaned graph. ``g`` itself is not modified.

    Raises
    ------
    KeyError
        If a node lacks ``[Schema] Type`` or ``label``.
    """
    graph = nx.Graph()
    to_merge = [u'Fe de erratas', u'Tasa', u'Aprobacion', u'Carta',
                u'Otro documento', u'Privilegio/Licencia']
    ignore = ['x', 'y', 'r', 'size', 'b', 'g', '[Schema] Type', '[Schema] Type Id', 'label', 'Nombre']
    rel_types = set()
    node_types = {}
    for n, attrs in g.nodes(data=True):
        new_attrs = {k.lower(): v.replace('"', "").encode('utf-8')
                     for (k, v) in attrs.items() if k not in ignore}
        node_type = attrs["[Schema] Type"]
        if node_type in to_merge:
            # Keep the specific document kind before collapsing the type.
            new_attrs["doc_type"] = node_type
            node_type = u'Documento'
        new_attrs["type"] = node_type
        new_attrs["label"] = attrs["label"].replace('"', '')
        graph.add_node(n, new_attrs)
        node_types[n] = node_type
    for s, t, attrs in g.edges(data=True):
        try:
            source_type = node_types[s]
            target_type = node_types[t]
        except KeyError:
            # Use .get() so that reporting the failure cannot itself raise
            # when the relationship id attribute is absent from the edge.
            print('Failed to add edge: {0}, {1} - No Allowed Relationship for RelTypeId {2}'.format(
                s, t, attrs.get('[Schema] Allowed Relationship Id')
            ))
            continue
        rel_type = '{0}_{1}'.format(source_type, target_type).lower()
        rev_rel_type = '{0}_{1}'.format(target_type, source_type).lower()
        if rev_rel_type in rel_types:
            # The opposite orientation was registered first: reuse its name.
            rel_type = rev_rel_type
        else:
            rel_types.add(rel_type)
        graph.add_edge(s, t, {"type": rel_type})
    return graph


def _top_two(counts):
    """Return the two keys of ``counts`` with the highest values ("" when missing)."""
    ranked = sorted(counts, key=counts.get, reverse=True)
    first = ranked[0] if ranked else ""
    second = ranked[1] if len(ranked) > 1 else ""
    return first, second


def process_projection(proj, default_earliest_year=2000):
    """Annotate a copy of the projection with summary attributes.

    Node attributes added:

    * ``avg_date`` -- integer mean of the years found in the node's ``fecha``
      tally (each year weighted by its count), or ``""`` when no usable date
      exists. Dates whose year part is empty are ignored.
    * ``top_place`` / ``second_place`` -- the two most frequent keys of the
      ``lugar`` tally, ``""`` when absent.
    * ``top_genre`` / ``second_genre`` -- the same for the ``genero`` tally.

    Edge attribute added:

    * ``earliest_year`` -- the smallest year in the edge's ``fecha`` tally.
      The search starts from ``default_earliest_year``, so that value is
      returned when the edge has no usable date and it also acts as an upper
      bound on the result.

    Parameters
    ----------
    proj : graph
        Graph exposing ``copy()``, ``nodes(data=True)``, ``edges(data=True)``,
        ``node`` and ``adj``. It is not modified.
    default_earliest_year : int, optional
        Starting value (and fallback) for ``earliest_year``. Defaults to 2000.

    Returns
    -------
    graph
        The annotated copy of ``proj``.
    """
    g = proj.copy()
    for n, attrs in proj.nodes(data=True):
        top_place, second_place = _top_two(attrs.get("lugar", {}))
        top_genre, second_genre = _top_two(attrs.get("genero", {}))
        years = []
        for date, count in attrs.get("fecha", {}).items():
            year = date.split("-")[0]
            if year:
                years += [int(year)] * count
        if years:
            avg_date = int(sum(years) / float(len(years)))
        else:
            avg_date = ""
        g.node[n]["avg_date"] = avg_date
        g.node[n]["top_place"] = top_place
        g.node[n]["top_genre"] = top_genre
        g.node[n]["second_place"] = second_place
        g.node[n]["second_genre"] = second_genre
    for s, t, attrs in proj.edges(data=True):
        earliest_year = default_earliest_year
        for date, _count in attrs.get("fecha", {}).items():
            year = date.split("-")[0]
            # Unknown dates are stored under an empty key; skip them instead
            # of letting int("") raise, mirroring the node loop above.
            if year and int(year) < earliest_year:
                earliest_year = int(year)
        g.adj[s][t]["earliest_year"] = earliest_year
        g.adj[t][s]["earliest_year"] = earliest_year
    return g

In [5]:
graph = process_graph(g)

#### Transfer the Lugar type as an attribute to documents, dedications, institutions, and impressions:<br><br>  (once WHERE is implemented in the MATCH statement, this could be done in one statement)

In [6]:
# Per the heading above: transfer each Lugar's label to its connected
# Documento nodes as a `lugar` attribute (see the SET clause below).
# NOTE(review): the four Lugar transfer cells differ only in the target node
# type (Documento, Dedicatoria, Institucion, Impresion); a small helper taking
# the type as a parameter would remove the repetition.
p1 = px.Projection(graph)
sub1 = p1.execute("""
    MATCH GRAPH (l:Lugar)-(d:Documento)
    TRANSFER (l)-(d)
    METHOD ATTRS
    SET lugar=l.label
""")

In [7]:
p2 = px.Projection(sub1)
sub2 = p2.execute("""
    MATCH GRAPH (l:Lugar)-(d:Dedicatoria)
    TRANSFER (l)-(d)
    METHOD ATTRS
    SET lugar=l.label
""")

In [8]:
p3 = px.Projection(sub2)
sub3 = p3.execute("""
    MATCH GRAPH (l:Lugar)-(i:Institucion)
    TRANSFER (l)-(i)
    METHOD ATTRS
    SET lugar=l.label
""")

In [9]:
p4 = px.Projection(sub3)
sub4 = p4.execute("""
    MATCH GRAPH (l:Lugar)-(i:Impresion)
    TRANSFER (l)-(i)
    METHOD ATTRS
    SET lugar=l.label
""")

In [10]:
# NOTE(review): this exports sub3, but the Lugar -> Impresion transfer in the
# preceding cell produced sub4 -- confirm whether sub4 was intended here.
nx.write_gexf(sub3, 'projections/multipartite.gexf')

#### Refine the graph with some more fine-tuned, planned transformations:

In [11]:
### TRANSFERS ###

#Documento --> Persona: attrs - doc_type, fecha, lugar; edges - Edicion
#Institucion --> wild: attrs - name
#Impresion --> Persona: attrs - type, fecha, lugar; edges - Edicion
#Impresion --> Edicion: attrs - lugar
#Obra --> Edicion: attrs - genero
#Edicion --> Obra: attrs - type, lugar, fecha
#Obra --> Persona: attrs - type, lugar, fecha; edges - Edicion

### PROJECTION ###

#Persona -- Edicion -- Persona --> Persona -- Persona: attrs: fecha

In [12]:
p5 = px.Projection(sub4)
sub5 = p5.execute("""
    MATCH GRAPH (d:Dedicatoria)-(p:Persona)
    TRANSFER (d)-(p)
    METHOD EDGES Edicion
    SET doc_type="dedicatory", fecha=d.fecha, lugar=d.lugar, role="patron"
""")

In [13]:
p6 = px.Projection(sub5)
sub6 = p6.execute("""
    MATCH GRAPH (d:Documento)-(p:Persona)
    TRANSFER (d)-(p)
    METHOD EDGES Edicion
    SET doc_type=d.doc_type, fecha=d.fecha, lugar=d.lugar, role="signatory"
""")

In [14]:
p7 = px.Projection(sub6)
sub7 = p7.execute("""
    MATCH GRAPH (i:Institucion)-(wild)
    TRANSFER (i)-(wild)
    METHOD ATTRS
    SET inst=i.label
""")

In [15]:
p8 = px.Projection(sub7)
sub8 = p8.execute("""
    MATCH GRAPH (i:Impresion)-(p:Persona)
    TRANSFER (i)-(p)
    METHOD EDGES Edicion
    SET doc_type=i.type, fecha=i.fecha, lugar=i.lugar, role="printer/editor"
""")

In [16]:
p9 = px.Projection(sub8)
sub9 = p9.execute("""
    MATCH GRAPH (i:Impresion)-(e:Edicion)
    TRANSFER (i)-(e)
    METHOD ATTRS
    SET lugar=i.lugar
""")

In [17]:
p10 = px.Projection(sub9)
sub10 = p10.execute("""
    MATCH GRAPH (o:Obra)-(e:Edicion)
    TRANSFER (o)-(e)
    METHOD ATTRS
    SET genero=o.genero
""")
### This essentially counts 12 theatre obras as 1 genre edition
# After the Obra -> Edicion transfer, `genero` is either a plain string or a
# tally dict. Normalise it so that a node contributes a count of 1 for its
# (first) genre rather than one count per Obra.
# NOTE(review): only the first key of a multi-genre tally is reset to 1; the
# remaining keys keep their counts -- confirm that this is intended.
for n, attrs in sub10.nodes(data=True):
    genero = attrs.get("genero", "")
    if not genero:
        continue
    if isinstance(genero, str):
        sub10.node[n]["genero"] = {genero: 1}
    else:
        # next(iter(...)) yields the same key as keys()[0] did, but also works
        # where dict.keys() returns a non-indexable view.
        first_genre = next(iter(genero))
        sub10.node[n]["genero"][first_genre] = 1

In [18]:
p11 = px.Projection(sub10)
sub11 = p11.execute("""
    MATCH GRAPH (e:Edicion)-(o:Obra)
    TRANSFER (e)-(o)
    METHOD ATTRS
    SET doc_type=e.type, ed_fecha=e.fecha, lugar=e.lugar
""")

In [19]:
p12 = px.Projection(sub11)
sub12 = p12.execute("""
    MATCH GRAPH (o:Obra)-(p:Persona)
    TRANSFER (o)-(p)
    METHOD EDGES Edicion
    SET doc_type=o.type, fecha=o.ed_fecha, lugar=o.lugar, author="true", role="author"
""")

In [20]:
p13 = px.Projection(sub12)
sub13 = p13.execute("""
    MATCH GRAPH (e:Edicion)-(p:Persona)
    TRANSFER (e)-(p)
    METHOD ATTRS
    SET genero=e.genero
""")

In [21]:
p14 = px.Projection(sub13)
bipartite = p14.execute("""
    MATCH (p:Persona)-(e:Edicion)
""")

In [22]:
nx.write_gexf(bipartite, "projections/bipartite.gexf")

In [23]:
# One-mode projection from the plan above:
# Persona -- Edicion -- Persona becomes Persona -- Persona.
# NOTE(review): this projection is named p16 although the previous one is p14
# (there is no p15) -- presumably a leftover from a removed cell.
p16 = px.Projection(bipartite)
projection = p16.execute("""
    MATCH (p1:Persona)-(e:Edicion)-(p2:Persona)
    PROJECT (p1)-(p2)
    METHOD NEWMAN Edicion
    DELETE e
""")

In [24]:
proj = process_projection(projection)
nx.write_gexf(proj, 'projections/onemode.gexf')

In [25]:
proj.nodes(data=True)[1]

(4,
 {'avg_date': 1603,
  'doc_type': {u'Privilegio/Licencia': 1},
  'fecha': {'1603-12-11': 1},
  'genero': {'Teatro': 1},
  'inst': {u'Archidiocesis de Zaragoza': 1},
  'label': u'Pedro de Moya, vicario general',
  'lugar': {u'Zaragoza': 1},
  'real': {'True': 1},
  'role': 'signatory',
  'second_genre': '',
  'second_place': '',
  'top_genre': 'Teatro',
  'top_place': u'Zaragoza',
  'type': u'Persona'})

In [34]:
# Need to implement Cypher.
def places_per_role(g):
    """Collect, per role, the number of distinct known places of each node.

    Only nodes with both a non-empty ``lugar`` tally and a non-empty ``role``
    are considered. Empty place names (unknown places) are not counted, so a
    node whose only place is unknown contributes ``0``.

    Parameters
    ----------
    g : graph
        Graph exposing ``nodes(data=True)``; ``lugar`` must be a dict keyed by
        place name.

    Returns
    -------
    dict
        Maps each role to a list with one place count per matching node.
    """
    places = {}
    for _node, attrs in g.nodes(data=True):
        lugar = attrs.get("lugar", "")
        role = attrs.get("role", "")
        if lugar and role:
            # Count with a generator: len(filter(...)) only works where
            # filter() returns a list.
            known = sum(1 for place in lugar.keys() if place)
            places.setdefault(role, []).append(known)
    return places
            
def aggs(tps):
    """Return the mean of each list of counts in ``tps``.

    Parameters
    ----------
    tps : dict
        Maps a key (here: a role) to a list of numbers.

    Returns
    -------
    dict
        Maps each key to the float mean of its list. Keys whose list is empty
        are omitted, because a mean of no observations is undefined.
    """
    averages = {}
    for key, counts in tps.items():
        if not counts:
            # Avoid ZeroDivisionError for keys without observations.
            continue
        averages[key] = sum(counts) / float(len(counts))
    return averages

In [35]:
plc = places_per_role(proj)
a = aggs(plc)

In [36]:
a

{'author': 2.888888888888889,
 'patron': 0.19047619047619047,
 'printer/editor': 1.095890410958904,
 'signatory': 0.950920245398773}

In [48]:
# Need to implement Cypher.
def known_places_per_role(g):
    """Collect, per role, the number of distinct known places of each node.

    Only nodes with a non-empty ``lugar`` tally, a non-empty ``role`` and at
    least one non-empty place name contribute. Nodes whose places are all
    unknown (empty names) are left out entirely.

    Parameters
    ----------
    g : graph
        Graph exposing ``nodes(data=True)``; ``lugar`` must be a dict keyed by
        place name.

    Returns
    -------
    dict
        Maps each role to a non-empty list of place counts. A role is only
        present when at least one of its nodes has a known place, so every
        list can safely be averaged.
    """
    places = {}
    for _node, attrs in g.nodes(data=True):
        lugar = attrs.get("lugar", "")
        role = attrs.get("role", "")
        if lugar and role:
            # Count with a generator: len(filter(...)) only works where
            # filter() returns a list.
            known = sum(1 for place in lugar.keys() if place)
            if known:
                # Create the role entry only when there is something to
                # record, so that no role ends up with an empty list.
                places.setdefault(role, []).append(known)
    return places
            
def aggs(tps):
    """Return the mean of each list of counts in ``tps``.

    Parameters
    ----------
    tps : dict
        Maps a key (here: a role) to a list of numbers.

    Returns
    -------
    dict
        Maps each key to the float mean of its list. Keys whose list is empty
        are omitted, because a mean of no observations is undefined.
    """
    averages = {}
    for key, counts in tps.items():
        if not counts:
            # Avoid ZeroDivisionError for keys without observations.
            continue
        averages[key] = sum(counts) / float(len(counts))
    return averages

In [49]:
plc = known_places_per_role(proj)
a = aggs(plc)

In [50]:
a

{'author': 2.888888888888889,
 'patron': 1.0,
 'printer/editor': 1.095890410958904,
 'signatory': 1.1231884057971016}

In [47]:
[(n, a) for n, a in proj.nodes(data=True) if a.get("role") == 'printer/editor']

[(9,
  {'avg_date': 1617,
   'doc_type': {u'Impresion': 1},
   'fecha': {'1617-01-27': 1},
   'genero': {'Ficcion': 1},
   'label': u'Sebastian Matevat',
   'lugar': {u'Barcelona': 1},
   'real': {'True': 1},
   'role': 'printer/editor',
   'second_genre': '',
   'second_place': '',
   'top_genre': 'Ficcion',
   'top_place': u'Barcelona',
   'type': u'Persona'}),
 (11,
  {'avg_date': 1616,
   'doc_type': {u'Impresion': 3, u'Privilegio/Licencia': 2},
   'fecha': {'1613-12-23': 1,
    '1614-05-17': 1,
    '1617-11-07': 1,
    '1618-01-01': 1,
    '1618-06-25': 1},
   'genero': {'Ficcion': 2, 'Teatro': 1},
   'label': u'Miguel Martinez',
   'lugar': {u'Madrid': 5},
   'real': {'True': 1},
   'role': 'printer/editor',
   'second_genre': 'Teatro',
   'second_place': '',
   'top_genre': 'Ficcion',
   'top_place': u'Madrid',
   'type': u'Persona'}),
 (18,
  {'avg_date': 1608,
   'doc_type': {u'Impresion': 8, u'Privilegio/Licencia': 5},
   'fecha': {'1600-05-15': 1,
    '1604-01-15': 2,
    '1

In [46]:
[(n, a)  for n, a  in proj.nodes(data=True)]

[(3,
  {'avg_date': '',
   'genero': {'Teatro': 7},
   'label': u'Bernardo Grassa',
   'real': {'True': 1},
   'second_genre': '',
   'second_place': '',
   'top_genre': 'Teatro',
   'top_place': '',
   'type': u'Persona'}),
 (4,
  {'avg_date': 1603,
   'doc_type': {u'Privilegio/Licencia': 1},
   'fecha': {'1603-12-11': 1},
   'genero': {'Teatro': 1},
   'inst': {u'Archidiocesis de Zaragoza': 1},
   'label': u'Pedro de Moya, vicario general',
   'lugar': {u'Zaragoza': 1},
   'real': {'True': 1},
   'role': 'signatory',
   'second_genre': '',
   'second_place': '',
   'top_genre': 'Teatro',
   'top_place': u'Zaragoza',
   'type': u'Persona'}),
 (5,
  {'avg_date': '',
   'doc_type': {'dedicatory': 1},
   'fecha': {'': 1},
   'genero': {'Teatro': 1},
   'inst': {u'Condado de Sastago': 1},
   'label': u'Gabriel Blasco de Alagon, conde de Sastago',
   'lugar': {'': 1},
   'notas': {'Senor de las baronias de Espes y Escuer, camarlengo del Rey': 1},
   'real': {'True': 1},
   'role': 'patron'