In [None]:
# https://rdflib.readthedocs.io/en/stable/

#
# REQUIREMENTS
#

# python >= 3.6
# pip install rdflib

#
# LOADING RDF DATA INTO rdflib
#

import rdflib as rdflib

def _load_slfile_(f, g:rdflib.Graph=None) -> rdflib.Graph:
    """Parse one of the semanlink Turtle files into an rdflib graph.

    Args:
        f: source handed as-is to ``Graph.parse`` (the callers in this
            file pass ``file://`` URLs); parsed with ``format='turtle'``.
        g: graph that receives the statements. When None, a new
            ``rdflib.Graph`` is created.

    Returns:
        The graph the statements were added to (``g`` itself when given).
    """
    # identity test: the idiomatic way to detect "no graph supplied"
    if g is None:
        g = rdflib.Graph()
    g.parse(f, format='turtle')
    # bind some namespaces that are not declared. # TODO change source
    # (sl and tag are defined within the file)
    g.namespace_manager.bind('foaf', rdflib.URIRef('http://xmlns.com/foaf/0.1/'))
    return g


def load_tag_graph(g:rdflib.Graph=None,
                   path:str='file:///Users/fps/Semanlink/semanlink-fps/export/sltags.ttl') -> rdflib.Graph:
    """Load the semanlink statements about tags.

    Args:
        g: graph to add the statements to; a new graph is created when None.
        path: location of the ``sltags.ttl`` export. The default is the
            original author's local file; pass your own location to run
            the notebook on another machine.

    Returns:
        The graph holding the tag statements.
    """
    return _load_slfile_(path, g)


def load_doc_graph(g:rdflib.Graph=None,
                   path:str='file:///Users/fps/Semanlink/semanlink-fps/export/sldocs.ttl') -> rdflib.Graph:
    """Load the semanlink statements about docs.

    Args:
        g: graph to add the statements to; a new graph is created when None.
        path: location of the ``sldocs.ttl`` export. The default is the
            original author's local file; pass your own location to run
            the notebook on another machine.

    Returns:
        The graph holding the doc statements.
    """
    return _load_slfile_(path, g)

#
    
def print_triples(triples, g, n=10):
    """Print up to n triples, then an empty line.

    Args:
        triples: iterable of (s, p, o) terms.
        g: graph whose ``namespace_manager`` is used to abbreviate the terms.
        n: maximum number of triples to print; 0 (or a negative value)
            prints all of them.
    """
    for count, (s, p, o) in enumerate(triples, start=1):
        print_triple(s, p, o, g)
        # stop once exactly n triples are out (the previous `k > n` test
        # let an (n+1)-th triple through)
        if 0 < n <= count:
            break
    print()


def print_triple(s, p, o, g):
    """Print one triple on a line, each term in N3 form using g's namespace manager."""
    ns = g.namespace_manager
    print(s.n3(ns), p.n3(ns), o.n3(ns))
    
# g.namespace_manager.bind('tag', rdflib.URIRef('http://www.semanlink.net/tag/'))

#
# parse in some RDF data hosted on the Internet
#

# create a Graph
# (scratch graph: in this cell it is only referenced by the commented-out examples below)
g = rdflib.Graph()

# some semanlink rdf data (as rdf/xml)
# result = g.parse('http://www.semanlink.net/tag/nlp.rdf')

# some semanlink rdf data (as turtle)
# semanlink uses text/rdf+n3 content-type 
# but rdflib does not accept it natively, hence this trick
# cf. https://github.com/RDFLib/rdflib/issues/340

# from rdflib.plugin import register, Serializer, Parser
# # register('text/rdf+n3', Parser, 'rdflib.plugins.parsers.notation3', 'N3Parser')
# result = g.parse('http://www.semanlink.net/tag/kd_mkb.n3')

#
# LOAD THE 2 SEMANLINK FILES
#

# one contains statements about tags, the other about docs
# Download them from the files folder on github
# (load_tag_graph / load_doc_graph read them from the locations coded in those functions)

# we can load both of them into the same graph, or not
g_tags = load_tag_graph()
g_docs = load_doc_graph()
# to load both of them into the same graph:
# either g = g_tags+g_docs
# or:
'''
g = rdflib.Graph()
load_tag_graph(g)
load_doc_graph(g)
'''

# number of triples
print("g_tags has {} statements.".format(len(g_tags)))
print("g_docs has {} statements.".format(len(g_docs)))
print()

# loop through the triples of each graph
# (print_triples only shows the first ones, see its n parameter)

print_triples(g_tags, g_tags)
print_triples(g_docs, g_docs)

# print a Graph in the RDF Turtle format
# print(g_tags.serialize(format="turtle").decode("utf-8"))


In [None]:
#
# Playing a bit with rdflib
# 

# Namespaces

tag_ns = rdflib.Namespace("http://www.semanlink.net/tag/")
doc_ns = rdflib.Namespace("http://www.semanlink.net/doc/")
sl_ns = rdflib.Namespace('http://www.semanlink.net/2001/00/semanlink-schema#')

# Basic Triple Matching
# Each result is printed with the namespace manager of the graph it was
# read from, so the abbreviations always match the queried graph.

# tags:
t = g_tags.triples((None, rdflib.RDF.type, sl_ns.Tag))
print("Tags:")
print_triples(t,g_tags)

# one tag
# (attribute access on a Namespace already gives a URIRef, no wrapping needed)
print("One tag:")
t = g_tags.triples((tag_ns.nlp, None, None))
print_triples(t,g_tags)

# docs (docs have tags)
print("Triples about docs:")
t = g_docs.triples((None, sl_ns.tag, None))
print_triples(t,g_docs)

# docs tagged with a given tag
print("Docs tagged with a given tag:")
t = g_docs.triples((None, sl_ns.tag, tag_ns.nlp))
print_triples(t,g_docs)

# arxiv docs
print("Arxiv docs:")
t = g_docs.triples((None, sl_ns.tag, tag_ns.arxiv_doc))
print_triples(t,g_docs)

# tags of a given doc
print("Tags of a given doc:")
# t = g_docs.triples(('https://arxiv.org/abs/1607.07956', sl_ns.tag, None)) # NOT OK
# (a plain str is not matched: the subject must be a URIRef)
doc = rdflib.URIRef('https://arxiv.org/abs/1607.07956')
t = g_docs.triples((doc, sl_ns.tag, None))
print_triples(t,g_docs)

# note about docs in semanlink:
# old docs are identified by their web URL
# newer have a URI under www.semanlink.net/doc/, 
# and point to the bookmark they relate to through the sl:bookmarkOf property

print("Triples about a given doc:")
doc = rdflib.URIRef(doc_ns + '2020/07/2004_07202_entities_as_expert')
t = g_docs.triples((doc, None, None))
print_triples(t,g_docs)

'''
# creating a graph from a list of triples
# (hum, it must be possible to add them in one call)
gr = rdflib.Graph()
for s,p,o in t:
    gr.add((s,p,o))
print(gr.serialize(format="turtle").decode("utf-8"))
'''
print()

In [None]:
# Hum, I'll need this later to check the export results,
# because support of namespaces in rdflib seems a bit weak.
# Question asked here:
# https://stackoverflow.com/questions/63088670/rdflib-python-how-to-get-a-uriref-from-a-string-such-as-nsxxx

from rdflib.namespace import NamespaceManager
import rdflib as rdflib

class MyNamespacesInfo:
    """Turn N3-style URI strings back into URIRefs, using the prefixes of a NamespaceManager.

    The prefix -> namespace bindings are copied when the object is built;
    prefixes bound to the manager afterwards are not seen.
    """

    def __init__(self, namespace_manager: NamespaceManager):
        # as I don't know how to get the namespace from a prefix from the API
        # I construct a dict
        self.pref2ns = dict(namespace_manager.namespaces())

    def uriref(self, n3uri: str, laxist=True) -> rdflib.URIRef:
        """Convert 'ns:xxx', '<http://..../xxx>' or 'http://..../xxx' into a URIRef.

        Args:
            n3uri: the string to convert.
            laxist: forwarded to ``prefixed_2_uriref`` for strings that are
                not wrapped in '<>'; when True, a string whose prefix is
                unknown is taken as a full URI instead of raising.

        Raises:
            ValueError: if n3uri starts with '<' but does not end with '>',
                or (laxist=False only) if it is not a known prefix:localname.
        """
        # startswith/endswith also cope with the empty string
        if n3uri.startswith('<'):
            if n3uri.endswith('>'):
                return rdflib.URIRef(n3uri[1:-1])
            raise ValueError("Illegal uri: ", n3uri)
        # pass the caller's choice on (it used to be forced to True)
        return self.prefixed_2_uriref(n3uri, laxist=laxist)

    def prefixed_2_uriref(self, short_uri: str, laxist=True) -> rdflib.URIRef:
        """Expand 'ns:xxx' into a URIRef, ns being a prefix known to this object.

        Args:
            short_uri: eg. 'ns:xxx'. Only the first ':' separates the prefix;
                later ones belong to the local name.
            laxist: when True, a string without ':' or with an unknown
                prefix is returned as a URIRef of the whole string (so a
                long uri can be passed too); when False, ValueError is raised.

        Raises:
            ValueError: laxist is False and short_uri has no ':' or an
                unknown prefix.
        """
        prefix, colon, localname = short_uri.partition(':')
        if not colon:
            if laxist:
                return rdflib.URIRef(short_uri)
            raise ValueError('Not a prefix:localname string: ' + short_uri)
        ns = self.pref2ns.get(prefix)
        if ns is None:
            if laxist:
                return rdflib.URIRef(short_uri)
            raise ValueError('Unknown prefix: ' + prefix)
        return rdflib.URIRef(ns + localname)

# Disabled usage example of MyNamespacesInfo: the triple-quoted block below is
# an inert string literal, nothing in it is executed.
'''
g = rdflib.Graph()
g.parse('http://www.semanlink.net/tag/rdf_tools.rdf')
ns_info = MyNamespacesInfo(g.namespace_manager)
    
x = ns_info.prefixed_2_uriref('tag:rdf_tools')
print(type(x) , x)
x = ns_info.prefixed_2_uriref('http://www.semanlink.net/tag/rdf_tools')
print(type(x) , x)

x = ns_info.uriref('tag:rdflib')
print(type(x) , x)
x = ns_info.uriref('http://www.semanlink.net/tag/rdflib')
print(type(x) , x)
x = ns_info.uriref('<http://www.semanlink.net/tag/rdflib>')
print(type(x) , x)
'''
print()

In [None]:
# some helper functions we will need below

def uriref_2_key(uriref:rdflib.URIRef, g) -> str:
    """Return the id passed to kd-mkb for an rdf resource.

    The id is the N3 form of the resource, computed with g's namespace
    manager (the plain URI would do, but shorter forms are nicer).
    """
    namespaces = g.namespace_manager
    return uriref.n3(namespaces)

def verify_export(entities:dict, relations:dict, triples:list, g:rdflib.Graph):
    """Check that the exported int triples correspond to statements of g.

    Each int triple is converted back to its entity / relation keys, the
    keys are turned into URIRefs, and the resulting triple must be in g.

    Args:
        entities: key -> int, as returned by the export.
        relations: key -> int, as returned by the export.
        triples: list of (subject int, relation int, object int).
        g: the original graph.

    Raises:
        ValueError: on the first triple that is not a statement of g; the
            message names the offending triple.
    """
    nsinfos = MyNamespacesInfo(g.namespace_manager)
    # we are using python >= 3.6, dict preserves the insertion order, and
    # the int of a key is its insertion rank: position i holds the key of int i
    ent_list = list(entities)
    rel_list = list(relations)

    # check triples are in g
    for t in triples:
        s = ent_list[t[0]]
        p = rel_list[t[1]]
        o = ent_list[t[2]]
        if (nsinfos.uriref(s), nsinfos.uriref(p), nsinfos.uriref(o)) not in g:
            raise ValueError(
                'Exported triple {} ({} {} {}) is not in the graph'.format(t, s, p, o))

def print_dict(dict, n=10):
    """Print the first n 'key : value' items of a dict, then an empty line.

    Pass 0 into n to get all items.
    (The parameter is named `dict`, so the builtin is hidden inside this function.)
    """
    for shown, (key, value) in enumerate(dict.items(), start=1):
        print(key,':',value)
        if n > 0 and shown >= n:
            break
    print()

def print_itriples(entities:dict, relations:dict, triples:list, g:rdflib.Graph, n=10):
    """Print the first n int triples as 'subject relation object' keys, then an empty line.

    Pass 0 into n to get all of them. g is not used by this function.
    """
    # python >= 3.6: dicts keep insertion order, so position i of these
    # lists is the i-th inserted key
    entity_keys = list(entities)
    relation_keys = list(relations)

    for shown, (s_int, p_int, o_int) in enumerate(
            ((t[0], t[1], t[2]) for t in triples), start=1):
        print(entity_keys[s_int], relation_keys[p_int], entity_keys[o_int])
        if n > 0 and shown >= n:
            break
    print()

## First version of export

**@deprecated**: a more powerful one is given below

In [None]:
# First basic way to export a graph
# (g_tags, g_docs, or g_tags+g_docs)

def export2kdmkb_v1(g:rdflib.Graph):
    '''
    return entities, relations, triples expected by kd-mkb

    entities and relations are dicts key -> int, the key being given by
    uriref_2_key and the int being the rank of first appearance while
    iterating over g. triples is a list of (subject int, relation int,
    object int). Statements whose object is not a URIRef (literals, ...)
    are skipped.
    '''
    entities = {}
    relations = {}
    triples = []

    def _id_of(key, ids):
        # known key: its int; new key: the next free int (= current size)
        return ids.setdefault(key, len(ids))

    for s, p, o in g:
        # we don't do anything with literals
        if not isinstance(o, rdflib.URIRef):
            continue

        s_val = _id_of(uriref_2_key(s, g), entities)
        p_val = _id_of(uriref_2_key(p, g), relations)
        o_val = _id_of(uriref_2_key(o, g), entities)

        triples.append((s_val, p_val, o_val))

    # the function used to end here without returning anything, so the
    # caller's 3-way unpacking failed
    return entities, relations, triples


#
# CHOOSE THE GRAPH YOU'RE INTERESTED IN
#

# g_tags = load_tag_graph()
# g_docs = load_doc_graph()
# (g is rebound here: from now on it names the graph being exported)
g = g_tags
# g = g_docs
# g = g_tags + g_docs

#
# EXPORT semanlink to kd-mkb
#

# (well, prepare the input data for now)

# do the export
entities, relations, triples = export2kdmkb_v1(g)

# optionally, test that things are correct:
# verify_export(entities, relations, triples, g)

# print some stuff
# (print_dict / print_itriples show 10 items by default, n=0 shows all)
print('#entities:', len(entities))
print_dict(entities)
print('#relations:', len(relations))
print_dict(relations, n=0)
print('#triples:', len(triples))
print_itriples(entities, relations, triples, g)

In [None]:
# Disabled exploration code, kept as two inert string literals (nothing in them runs).
# 1st block: counts the distinct subjects of g.
# 2nd block: counts the subjects that have a sl:creationDate. Its notes (in French)
# record that every exported doc has a creation date, that some tags (e.g. tag:rdf)
# have none, and conclude that the train/test split can be made on the creation date
# of the subjects, taking along the objects linked to them.
'''
# listing des subjects:
k = 0
subjects = {}
for s in g.subjects(): # a loop over the subjects of triples (as many iter as triples)
    s_key = s.n3(ns)
    s_val = subjects.get(s_key)
    if s_val == None:
        s_val = len(subjects)
        subjects[s_key] = s_val       
    k+=1
print(len(subjects), '/', k) # docs: 6729 / 39404, tags: 4255 / 22420, all: 10984 / 61824
# print_dict(subjects,n=0)
'''

'''
k = 0
subjects = {}
# creation date:
creation_date_prop = rdflib.URIRef(sl_ns + 'creationDate')
for s in g.subjects(predicate=creation_date_prop): # looping over the things which have a creation date
    s_key = s.n3(ns)
    s_val = subjects.get(s_key)
    if s_val == None:
        s_val = len(subjects)
        subjects[s_key] = s_val       
    k+=1
print(len(subjects), '/', k) # docs: 6729 / 6729, tags: 2992 / 2992, all: 9721 / 9721
# ouais, tous les docs exportés ont une date de creation (cf plus haut)
# (par construction de l'export, ce qui explique que le plus vieux doit dater de 2004, date d'introduction de la date de creation
# pour les docs)
# par contre pour les tags, elle dû être ajoutée plus tardivement : par ex tag:rdf
# n'a pas de creationDate.

# Moralité : 
# pour la subdivision train/test sur la date de creation 
# - pour les tags, mettre dans test ceux qui ont une creationDate, et qui est postérieure à une date limite fixée
# - pour les docs (qui sont tous sujets, et qui ont tous une date de création), idem
# - reste la question des entités qui ne sont qu'objet
# bah, comme on ne doit avoir que de telles entités à une distance de 1 des tags
# ou des docs, il suffit de partir des subjects, et prendre les objects qui leur sont liés
'''
print()

## Thinking

In [None]:
# Hmm: keep the URIs in N3 form (<http://..../xxx>),
# or leave out the surrounding '<>'?
# -> this is decided in uriref_2_key

# TODO: take care of stability:
# every time this is run again, the ints of the entities change.

# idea: use each of the 2 graphs (tags and docs) in kd-mkb setting for
# mutual learning

## Exporting semanlink data to kd-mkb input

Creating a test set with the newest data, the older data being used for training
(a limit date is given to separate the two).

In [None]:
class SemanlinkKdmkbExport:
    """Prepare the kd-mkb input from the semanlink graphs, split into old and new data.

    Entities and relations get their ints from the union of the tag graph
    and the doc graph. The triples of each resource (tag or doc) then go
    to the "old" lists when the resource has no sl:creationDate or one
    before limit_date, and to the "new" lists otherwise. Old triples are
    served by get_train, new ones by get_test.

    Attributes set by __init__:
        limit_date: the date separating old from new, as a 'YYYY-MM-DD' string.
        graph: the union graph (g_tags + g_docs) the ints were computed from.
        entities, relations: dicts key -> int.
        old_tag_triples, new_tag_triples, old_doc_triples, new_doc_triples:
            lists of (subject int, relation int, object int).
        old_tag_nb, new_tag_nb, old_doc_nb, new_doc_nb: number of resources
            behind each of these lists.
    """
    sl_ns= rdflib.Namespace('http://www.semanlink.net/2001/00/semanlink-schema#')
    creation_date_prop = rdflib.URIRef(sl_ns + 'creationDate')
    tag_type = rdflib.URIRef(sl_ns + 'Tag')

    def __init__(self,
                 g_tags: rdflib.Graph,
                 g_docs: rdflib.Graph,
                 limit_date): # AAAA-MM-JJ
        self.limit_date = limit_date

        # compute the ids for entities and relations
        g_all = g_tags + g_docs
        # kept on the instance: it is the one graph whose namespace manager
        # produced the keys of self.entities / self.relations, so every later
        # key is computed with it too; print_info also uses it
        self.graph = g_all
        self.entities, self.relations = self._entities_and_relations(g_all)

        # compute the triples -- that would be v1
        # self.triples = self._rdf_to_int_triples_(g_all.triples(), [], g_all)

        # the tags
        # (the new-tag count now has its own attribute; it used to overwrite old_tag_nb)
        (self.old_tag_triples, self.new_tag_triples,
         self.old_tag_nb, self.new_tag_nb) = self._export_tags(g_tags)

        # the docs
        (self.old_doc_triples, self.new_doc_triples,
         self.old_doc_nb, self.new_doc_nb) = self._export_docs(g_docs)

    #
    # getters over the computed data necessary for KD-MKB
    #

    def get_entities(self) -> dict:
        """Return the dict entity key -> int."""
        return self.entities

    def get_relations(self) -> dict:
        """Return the dict relation key -> int."""
        return self.relations

    def get_train(self, includeTags=True, includeDocs=True) -> list:
        """Return the int triples of the old tags and/or the old docs.

        Raises:
            ValueError: if both includeTags and includeDocs are False.
        """
        if includeTags:
            if includeDocs:
                return self.old_tag_triples + self.old_doc_triples
            else:
                return self.old_tag_triples
        elif includeDocs:
            return self.old_doc_triples
        else:
            raise ValueError("What do you expect if you don't ask for anything?")

    def get_test(self, includeTags=True, includeDocs=True) -> list:
        """Return the int triples of the new tags and/or the new docs.

        Returns None if both includeTags and includeDocs are False
        (unlike get_train, which raises in that case).
        """
        if includeTags:
            if includeDocs:
                return self.new_tag_triples + self.new_doc_triples
            else:
                return self.new_tag_triples
        elif includeDocs:
            # this branch used to evaluate the list without returning it
            return self.new_doc_triples
        else:
            return None

    #
    # Computing the stuff
    #

    def _entities_and_relations(self, g):
        '''
        return entities and relations, as expected by kd-mkb:
        2 dicts key -> int, the int being the rank of first appearance
        of the key while iterating over g. Statements whose object is
        not a URIRef are ignored.
        '''
        entities = {}
        relations = {}
        for s, p, o in g:
            # we don't do anything with literals
            if not isinstance(o, rdflib.URIRef):
                continue

            # setdefault: keep the int of a known key, else give the next free int
            entities.setdefault(uriref_2_key(s, g), len(entities))
            relations.setdefault(uriref_2_key(p, g), len(relations))
            entities.setdefault(uriref_2_key(o, g), len(entities))
        return entities, relations

    def _export_tags(self, g_tags):
        """Split the triples of the tags of g_tags into old and new (see _export_rdf_res)."""
        g = g_tags
        # list tags: they all are of rdf:type sl:Tag
        tags = g.subjects(predicate=rdflib.RDF.type, object=SemanlinkKdmkbExport.tag_type)
        return self._export_rdf_res(tags, g)

    def _export_docs(self, g_docs):
        """Split the triples of the docs of g_docs into old and new (see _export_rdf_res)."""
        g = g_docs
        # docs do not have a type - but they all have a sl:creationDate
        docs = g.subjects(predicate=SemanlinkKdmkbExport.creation_date_prop)
        return self._export_rdf_res(docs, g)

    def _export_rdf_res(self, subjects, g):
        '''
        return 2 lists: old_triples and new_triples, and,
        for information purpose the nb of res involved in old and new triples

        subjects: the resources of g to export.
        A resource is old when it has no creation date, or one that sorts
        before self.limit_date (the dates are compared as strings).
        '''
        old_res_nb = 0
        new_res_nb = 0
        old_triples = []
        new_triples = []

        for res in subjects:
            # its creation date: one object of (res, sl:creationDate, ?), or None
            creation_date = g.value(res, self.creation_date_prop)
            rdf_triples = g.triples((res, None, None)) # the triples starting from res
            # depending on creation date, update old_triples or new_triples
            if (creation_date is None) or (str(creation_date) < self.limit_date):
                # old res
                int_triples = old_triples
                old_res_nb += 1
            else:
                # new res
                int_triples = new_triples
                new_res_nb += 1
            # keys are computed with the union graph: its namespace manager is
            # the one the ints were assigned with, so the lookups cannot miss
            # because g abbreviates a URI differently
            self._rdf_to_int_triples_(rdf_triples, int_triples, self.graph)
        return old_triples, new_triples, old_res_nb, new_res_nb


    def _rdf_to_int_triples_(self, rdf_triples, int_triples, g):
        '''
        from rdf triples out of semanlink to triples of int as used by kd-mkb.
        Adds to int_triples the triples in rdf_triples
        BEWARE modifies (and returns) int_triples

        g: graph whose namespace manager is used to compute the keys.
        Triples whose object is not a URIRef are skipped.
        Raises ValueError if a subject, relation or object has no int.
        '''
        for s, p, o in rdf_triples:
            # we don't do anything with literals
            if not isinstance(o, rdflib.URIRef):
                continue

            s_key = uriref_2_key(s, g)
            s_val = self.entities.get(s_key)
            if s_val is None:
                raise ValueError('Unknown entitie', s)

            p_key = uriref_2_key(p, g)
            p_val = self.relations.get(p_key)
            if p_val is None:
                raise ValueError('Unknown relation', p)

            o_key = uriref_2_key(o, g)
            o_val = self.entities.get(o_key)
            if o_val is None:
                raise ValueError('Unknown entitie', o)

            int_triples.append((s_val, p_val, o_val))
        return int_triples

    #
    # just to print info
    #

    def print_info(self):
        """Print the number of tags, docs and triples (old / new), then the first triples of each list."""
        # print numbers
        old_t, new_t, old_n, new_n = self._print_tag_info_util()
        print("#Tags:", old_n+new_n, "#Triples: "
              , len(old_t)+len(new_t))
        print('#New Tags:', new_n, '#Triples: ', len(new_t))
        print('#Old Tags:', old_n, '#Triples: ', len(old_t))
        print()

        old_t, new_t, old_n, new_n = self._print_doc_info_util()
        print("#Docs:", old_n+new_n, "#Triples: "
              , len(old_t)+len(new_t))
        print('#New Docs:', new_n, '#Triples: ', len(new_t))
        print('#Old Docs:', old_n, '#Triples: ', len(old_t))
        print()

        # print triples
        # (the graph handed to print_itriples is the instance's own one,
        # no notebook-level variable is needed)
        old_t, new_t, old_n, new_n = self._print_tag_info_util()

        print('#New Tags:', new_n, '#Triples: ', len(new_t))
        print_itriples(self.entities, self.relations, new_t, self.graph)

        print('#Old Tags:', old_n, '#Triples: ', len(old_t))
        print_itriples(self.entities, self.relations, old_t, self.graph)

        old_t, new_t, old_n, new_n = self._print_doc_info_util()

        print('#New Docs:', new_n, '#Triples: ', len(new_t))
        print_itriples(self.entities, self.relations, new_t, self.graph)

        print('#Old Docs:', old_n, '#Triples: ', len(old_t))
        print_itriples(self.entities, self.relations, old_t, self.graph)

    def _print_tag_info_util(self):
        """Return (old tag triples, new tag triples, nb of old tags, nb of new tags)."""
        return self.old_tag_triples, self.new_tag_triples, self.old_tag_nb, self.new_tag_nb

    def _print_doc_info_util(self):
        """Return (old doc triples, new doc triples, nb of old docs, nb of new docs)."""
        return self.old_doc_triples, self.new_doc_triples, self.old_doc_nb, self.new_doc_nb
        




# build the export: resources whose creation date is "2020-04-01" or later are
# the new ones (served by get_test); the others, including those without a
# creation date, are the old ones (served by get_train)
export = SemanlinkKdmkbExport(g_tags,g_docs,"2020-04-01")
export.print_info()


In [None]:
# sanity check: the training triples must be statements of the combined graph
# (only get_train() is checked here, not get_test())
verify_export(export.get_entities(), export.get_relations(), export.get_train(), g_tags+g_docs)

## Compute embeddings with kd-mkb
### Training

In [None]:
# assuming we have done the things above

from kdmkb import datasets
from kdmkb import losses 
from kdmkb import models
from kdmkb import sampling
from kdmkb import utils

from creme import stats

import torch

# fixed seed, so that the run can be reproduced
_ = torch.manual_seed(42)

device = 'cpu' # 'cuda' if you own a gpu.

# Note: you may use only tags related triples, or only docs related triples
# (see parameters to export.get_train and get_test)

dataset = datasets.Fetch(
    train      = export.get_train(), 
    test       = export.get_test(), 
    entities   = export.get_entities(), 
    relations  = export.get_relations(),
    batch_size = 3, 
    shuffle    = True,
    seed       = 42
)

negative_sampling = sampling.NegativeSampling(
    size          = 1024,
    train_triples = dataset.train,
    entities      = dataset.entities,
    relations     = dataset.relations,
    seed          = 42,
)

model = models.RotatE(
    n_entity   = dataset.n_entity, 
    n_relation = dataset.n_relation, 
    gamma      = 3, 
    hidden_dim = 500
)

model = model.to(device)

# only the parameters that require a gradient are handed to the optimizer
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr = 0.00005,
)

loss = losses.Adversarial()

bar = utils.Bar(step = 80000, update_every = 30)

for _ in bar():
    positive_sample, weight, mode = next(dataset)
    positive_score = model(positive_sample)
    negative_sample = negative_sampling.generate(
        positive_sample = positive_sample,
        mode            = mode
    )
    negative_score = model(
        (positive_sample, negative_sample), 
        mode=mode
    )
    error = loss(positive_score, negative_score, weight, alpha=0.5)
    # backward() adds to the stored gradients: clear those of the previous
    # step first, otherwise every update uses the sum over all past batches
    optimizer.zero_grad()
    error.backward()
    _ = optimizer.step()
    # 4 decimals (the former ':4f' was a minimum width, not a precision)
    bar.set_description(f'loss: {error.item():.4f}')