In [None]:
# https://rdflib.readthedocs.io/en/stable/

#
# REQUIREMENTS
#

# python >= 3.6
# pip install rdflib

#
# LOADING RDF DATA INTO rdflib
#

import rdflib as rdflib

def print_triples(triples, g, n=10):
    """Print up to ``n`` triples, one per line, followed by a blank line.

    Args:
        triples: iterable of ``(subject, predicate, object)`` tuples.
        g: graph whose ``namespace_manager`` is used to abbreviate the terms.
        n: maximum number of triples to print; pass 0 (or a negative value)
            to print all of them.
    """
    for count, (s, p, o) in enumerate(triples, start=1):
        # use the graph that was passed in (not a module-level graph)
        print_triple(s, p, o, g)
        # stop after exactly n triples
        if n > 0 and count >= n:
            break
    print()
        
def print_triple(s, p, o, g):
    """Print one triple on a single line, each term in N3 notation.

    The terms are rendered with the namespace manager of graph ``g``.
    """
    manager = g.namespace_manager
    rendered = [term.n3(manager) for term in (s, p, o)]
    print(*rendered)
    
# g.namespace_manager.bind('tag', rdflib.URIRef('http://www.semanlink.net/tag/'))

#
# parse in some RDF data hosted on the Internet
#

# create a new Graph (the parse calls below are commented out,
# so nothing is loaded into it in this cell)
g = rdflib.Graph()

# some semanlink rdf data (as rdf/xml)
# result = g.parse('http://www.semanlink.net/tag/nlp.rdf')

# some semanlink rdf data (as turtle)
# semanlink uses text/rdf+n3 content-type 
# but rdflib does not accept it natively, hence this trick
# cf. https://github.com/RDFLib/rdflib/issues/340

# from rdflib.plugin import register, Serializer, Parser
# # register('text/rdf+n3', Parser, 'rdflib.plugins.parsers.notation3', 'N3Parser')
# result = g.parse('http://www.semanlink.net/tag/kd_mkb.n3')

#
# LOAD THE 2 SEMANLINK FILES
#

# one contains statements about tags, the other about docs
# Download them from the files folder on github

# we can load both of them into the same graph, or not
# Base URL of the folder holding the two Semanlink export files.
# Edit this single line to point at your own copy of the files.
SEMANLINK_EXPORT_URL = "file:///Users/fps/Semanlink/semanlink-fps/export"

# statements about tags
g_tags = rdflib.Graph()
result = g_tags.parse(SEMANLINK_EXPORT_URL + "/sltags.ttl", format='turtle')
# statements about docs
g_docs = rdflib.Graph()
result = g_docs.parse(SEMANLINK_EXPORT_URL + "/sldocs.ttl", format='turtle')

# number of triples
print("g_tags has {} statements.".format(len(g_tags)))
print("g_docs has {} statements.".format(len(g_docs)))

# show the first few triples of each graph
print_triples(g_tags, g_tags)
print_triples(g_docs, g_docs)

# print a Graph in the RDF Turtle format
# print(g_tags.serialize(format="turtle").decode("utf-8"))


In [None]:
#
# Playing a bit with rdflib
# 

# Namespaces

tag_ns = rdflib.Namespace("http://www.semanlink.net/tag/")
doc_ns = rdflib.Namespace("http://www.semanlink.net/doc/")
sl_ns = rdflib.Namespace('http://www.semanlink.net/2001/00/semanlink-schema#')

# Basic Triple Matching
# (each result is printed with the graph it was queried from, so that the
# namespace manager used for abbreviation matches the data)

# tags:
t = g_tags.triples((None, rdflib.RDF.type, sl_ns.Tag))
print_triples(t,g_tags)

# one tag
t = g_tags.triples((rdflib.URIRef(tag_ns.nlp), None, None))
print_triples(t,g_tags)

# docs (docs have tags)
t = g_docs.triples((None, sl_ns.tag, None))
print_triples(t,g_docs)

# docs tagged with a given tag
t = g_docs.triples((None, sl_ns.tag, tag_ns.nlp))
print_triples(t,g_docs)

# arxiv docs
t = g_docs.triples((None, sl_ns.tag, tag_ns.arxiv_doc))
print_triples(t,g_docs)

# tags of a given doc
# t = g_docs.triples(('https://arxiv.org/abs/1607.07956', sl_ns.tag, None)) # NOT OK
# (the subject has to be a URIRef, not a plain string)
doc = rdflib.URIRef('https://arxiv.org/abs/1607.07956')
t = g_docs.triples((doc, sl_ns.tag, None))
print_triples(t,g_docs)

# note about docs in semanlink:
# old docs are identified by their web URL
# newer have a URI under www.semanlink.net/doc/, 
# and point to the bookmark they relate to through the sl:bookmarkOf property

doc = rdflib.URIRef(doc_ns + '2020/07/2004_07202_entities_as_expert')
t = g_docs.triples((doc, None, None))
print_triples(t,g_docs)

'''
# creating a grpah from a list of triples
# (hum, it must be possible to add them in one call)
gr = rdflib.Graph()
for s,p,o in t:
    gr.add((s,p,o))
print(gr.serialize(format="turtle").decode("utf-8"))
'''
print()

In [None]:
# Hum, I'll need this later to check the export results,
# because support of namespaces in rdflib seems a bit weak
# Question asked here:
# https://stackoverflow.com/questions/63088670/rdflib-python-how-to-get-a-uriref-from-a-string-such-as-nsxxx

from rdflib.namespace import NamespaceManager
import rdflib as rdflib

class MyNamespacesInfo:
    """Resolve N3-style names ('ns:xxx' or '<http://.../xxx>') to URIRefs.

    Attributes:
        pref2ns: dict mapping each prefix yielded by the namespace manager
            to its namespace, captured when the instance is created.
    """

    def __init__(self, namespace_manager: NamespaceManager):
        # as I don't know how to get the namespace from a prefix from the API
        # I construct a dict
        self.pref2ns = dict(namespace_manager.namespaces())

    def uriref(self, n3uri: str, laxist=True) -> rdflib.URIRef:
        """Convert an N3-style name to a URIRef.

        Args:
            n3uri: either 'ns:xxx', '<http://..../xxx>' or 'http://..../xxx'.
            laxist: forwarded to ``prefixed_2_uriref`` for the forms that are
                not enclosed in angle brackets.

        Raises:
            ValueError: if ``n3uri`` is empty, starts with '<' without ending
                with '>', or (when ``laxist`` is False) cannot be resolved
                as a prefixed name.
        """
        if not n3uri:
            raise ValueError('Illegal uri: ' + repr(n3uri))
        if n3uri.startswith('<'):
            if n3uri.endswith('>'):
                return rdflib.URIRef(n3uri[1:-1])
            raise ValueError('Illegal uri: ' + n3uri)
        # honour the caller's choice instead of always being lax
        return self.prefixed_2_uriref(n3uri, laxist=laxist)

    def prefixed_2_uriref(self, short_uri: str, laxist=True) -> rdflib.URIRef:
        """Expand a 'prefix:localname' string into a URIRef.

        Args:
            short_uri: eg. 'ns:xxx', where ns is a prefix held in ``pref2ns``.
            laxist: when True, a string that has no ':' or whose prefix is
                unknown is returned unchanged as a URIRef (so a long uri can
                be passed too); when False such a string raises.

        Raises:
            ValueError: only when ``laxist`` is False, for a string without
                ':' or with an unknown prefix.
        """
        # split on the first ':' only: the local name may itself contain ':'
        prefix, separator, local_name = short_uri.partition(':')
        if not separator:
            if laxist:
                return rdflib.URIRef(short_uri)
            raise ValueError('Not a prefix:localname string: ' + short_uri)
        ns = self.pref2ns.get(prefix)
        if ns is None:
            if laxist:
                return rdflib.URIRef(short_uri)
            raise ValueError('Unknown prefix: ' + prefix)
        return rdflib.URIRef(str(ns) + local_name)
        
# Quick demo of MyNamespacesInfo on a small RDF document fetched over HTTP.
# NOTE: this rebinds the module-level name g.
g = rdflib.Graph()
g.parse('http://www.semanlink.net/tag/rdf_tools.rdf')
ns_info = MyNamespacesInfo(g.namespace_manager)

# prefixed_2_uriref: 'prefix:localname' form, then a long uri
# (the long uri is accepted because laxist defaults to True;
# presumably 'http' is not a declared prefix -- confirm)
x = ns_info.prefixed_2_uriref('tag:rdf_tools')
print(type(x) , x)
x = ns_info.prefixed_2_uriref('http://www.semanlink.net/tag/rdf_tools')
print(type(x) , x)

# uriref: the three accepted forms ('ns:xxx', long uri, '<long uri>')
x = ns_info.uriref('tag:rdflib')
print(type(x) , x)
x = ns_info.uriref('http://www.semanlink.net/tag/rdflib')
print(type(x) , x)
x = ns_info.uriref('<http://www.semanlink.net/tag/rdflib>')
print(type(x) , x)

In [None]:
def export2kdmkb(g):
    '''
    return entities, relations, triples expected by kd-mkb

    Args:
        g: graph to export; iterated as (subject, predicate, object) triples.

    Returns:
        entities: dict mapping the N3 form of each subject/object to an int id.
        relations: dict mapping the N3 form of each predicate to an int id.
        triples: list of (subject id, relation id, object id) tuples.

    Ids are assigned in order of first appearance while iterating over g,
    starting at 0, so an id is also the key's position in its dict.
    '''
    entities = {}
    relations = {}
    triples = []
    ns = g.namespace_manager
    for s, p, o in g:
        # we don't do anything with literals
        # (more precisely: any triple whose object is not a URIRef is skipped)
        if not isinstance(o, rdflib.URIRef):
            continue

        # setdefault keeps an existing id, or assigns the next free one
        # (len(...) is evaluated before the key is inserted)
        s_val = entities.setdefault(s.n3(ns), len(entities))
        p_val = relations.setdefault(p.n3(ns), len(relations))
        o_val = entities.setdefault(o.n3(ns), len(entities))

        triples.append((s_val, p_val, o_val))

    return entities, relations, triples

def verify_export(entities, relations, triples, g):
    """Check that an export made by export2kdmkb maps back onto graph ``g``.

    Each int triple is converted back to its entity / relation keys, the keys
    are turned into URIRefs, and the resulting statement must be in ``g``.

    Args:
        entities, relations, triples: the output of export2kdmkb.
        g: the graph that was exported.

    Raises:
        ValueError: for the first triple that is not a statement of ``g``;
            the message names the offending triple.
    """
    # NOTE(review): export2kdmkb only filters objects by type; presumably a
    # non-URIRef subject would not round-trip through uriref -- confirm.
    nsinfos = MyNamespacesInfo(g.namespace_manager)
    # we are using python >= 3.6, dict preserves the insertion order,
    # so position i in these lists is the key whose id is i
    ent_list = list(entities)
    rel_list = list(relations)

    # check triples are in g
    for s_id, p_id, o_id in triples:
        s = ent_list[s_id]
        p = rel_list[p_id]
        o = ent_list[o_id]
        statement = (nsinfos.uriref(s), nsinfos.uriref(p), nsinfos.uriref(o))
        if statement not in g:
            raise ValueError(
                "exported triple not found in graph: {} {} {}".format(s, p, o))

def print_dict(dict, n=10):
    """Print the first ``n`` items of a dict as 'key : value' lines.

    A blank line is printed at the end. Pass 0 into n to get all items.
    """
    for shown, (name, val) in enumerate(dict.items(), start=1):
        print(name, ':', val)
        if n > 0 and shown >= n:
            break
    print()

def print_triples(entities, relations, triples, g, n=10):
    """Print int triples as 'subject predicate object' lines of their keys.

    At most ``n`` triples are printed (pass 0 into n to get all of them),
    followed by a blank line. ``g`` is accepted but not used here.

    Note: this reuses the name of the print_triples(triples, g, n) helper
    defined in an earlier cell, with a different signature.
    """
    # python >= 3.6: dict keeps insertion order, so position i in each list
    # is the key that was inserted i-th
    entity_keys = list(entities)
    relation_keys = list(relations)

    # triples as ints to rdf triples
    for shown, t in enumerate(triples, start=1):
        subject = entity_keys[t[0]]
        predicate = relation_keys[t[1]]
        obj = entity_keys[t[2]]
        print(subject, predicate, obj)
        if n > 0 and shown >= n:
            break
    print()

#
# CHOOSE THE GRAPH YOU'RE INTERESTED IN
#

# g_tags = rdflib.Graph()
# result = g_tags.parse("file:///Users/fps/Semanlink/semanlink-fps/export/sltags.ttl", format='turtle')
# g_docs = rdflib.Graph()
# result = g_docs.parse("file:///Users/fps/Semanlink/semanlink-fps/export/sldocs.ttl", format='turtle')

# g = g_tags 
# g = g_docs
# g = g_tags + g_docs
# work on g_tags + g_docs (this rebinds the module-level name g)
g = g_tags + g_docs

#
# EXPORT semanlink to kd-mkb
#

# (well, prepare the input data for now)

# do the export
entities, relations, triples = export2kdmkb(g)

# optionally, test things are correct:
# verify_export(entities, relations, triples, g)

# print some stuff
# (print_dict and print_triples show at most 10 items by default; n=0 shows all)
print('#entities:', len(entities))
print_dict(entities)
print('#relations:', len(relations))
print_dict(relations, n=0)
print('#triples:', len(triples))
print_triples(entities, relations, triples, g)

In [None]:
# Hum: garder les uri à la mode n3 (<http://..../xxx>) 
# ou bien ne pas mettre les '<>' autour?

# FAUDRAIT se préoccuper de stabilité: 
# à chaque fois qu'on relance, les int des entités changent.

# On pourrait garder pour test les triplets récents
# (on n'a pas de date associée aux triplets, mais il y a 
# une date de création pour les docs, et les tags)
# Faudrait pour cela ne pas se contenter de boucler sur les triplets
# du graphe, comme fait plus haut, mais sur les entités (tags ou docs), voir
# leur date de création, et les ajouter à train ou test en fct d'elle
# (Hum, je pourrais aussi faire ça à la source, lors de la génération
# des fichiers d'export dans semanlink)

# idea: use each of the 2 graphs (tags and docs) in kd-mkb setting for
# mutual learning

In [None]:
# Ben voilà, sinon, ya plus qu'à calculer les embeddings, maintenant