In [16]:
from rdflib import ConjunctiveGraph, Namespace
import difflib
from rdflib.compare import to_isomorphic, graph_diff

import rdflib
print(rdflib.__version__)

7.0.0


In [None]:
# Baseline Turtle document used by the comparison cells below: two EDAM
# classes (data_0852 and data_0853), both flagged with owl:deprecated "true".
v1 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.5" ;
    oboInOwl:consider :data_0842 ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .

:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
"""

In [None]:
# Variant of v1: the properties of data_0852 are listed in a different order
# and extra blank lines / trailing spaces are present. It is meant to carry
# the same statements as v1, only written differently.
v2 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    
:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    :created_in "beta12orEarlier" ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    :obsolete_since "1.5" ;
    oboInOwl:inSubset edam:obsolete ;
    oboInOwl:consider :data_0842 ;    
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    


:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
"""

# Second variant: the two class definitions are swapped (data_0853 comes
# first) and data_0852 lists its properties in a different order than v1.
v3 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    :created_in "beta12orEarlier" ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    :obsolete_since "1.5" ;
    oboInOwl:inSubset edam:obsolete ;
    oboInOwl:consider :data_0842 ;    
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
"""

In [None]:
# v1 and v2 hold the same statements but are written in a different order,
# so the raw strings differ (v1 == v2 is False) and a plain line-based diff
# reports changes even though the content is meant to be equivalent.
from pprint import pprint
import sys

v1_lines = v1.splitlines(keepends=True)
v2_lines = v2.splitlines(keepends=True)
sys.stdout.writelines(difflib.unified_diff(v1_lines, v2_lines, n=3))

In [None]:
import jellyfish

In [None]:
from rdflib.compare import to_isomorphic, graph_diff

# Parse the two textual variants into separate graphs. v1 and v2 list the
# same statements in a different order, so the graphs should be isomorphic.
kg1 = ConjunctiveGraph()
kg1.parse(data=v1, format="turtle")

kg2 = ConjunctiveGraph()
# Previously this parsed v1 a second time, so every check below compared v1
# with itself and could never fail.
kg2.parse(data=v2, format="turtle")

iso1 = to_isomorphic(kg1)
iso2 = to_isomorphic(kg2)

assert iso1 == iso2, "The two RDFlib graph should be the same"

# graph_diff splits the statements into: shared, only in the first graph,
# only in the second graph. The last two must be empty for equivalent inputs.
in_both, in_first, in_second = graph_diff(iso1, iso2)

#print(in_both.serialize(format="turtle"))
assert len(in_first) == 0
assert len(in_second) == 0

In [None]:
# Line-based diff of the Turtle text produced from the two isomorphic graphs.
iso1_ttl = iso1.serialize(format="turtle")
iso2_ttl = iso2.serialize(format="turtle")

iso_diff = difflib.unified_diff(
    iso1_ttl.splitlines(keepends=True),
    iso2_ttl.splitlines(keepends=True),
    n=3,
)
sys.stdout.writelines(iso_diff)

In [None]:
# Round-trip v1 through rdflib: parse the Turtle text, serialize it back to
# Turtle and show the result.
kg1 = ConjunctiveGraph()
kg1.parse(format="turtle", data=v1)
v12 = kg1.serialize(format="turtle")
print(v12)

In [None]:
# Round-trip v2 through rdflib: parse the Turtle text, serialize it back to
# Turtle and show the result.
kg2 = ConjunctiveGraph()
kg2.parse(format="turtle", data=v2)
v22 = kg2.serialize(format="turtle")
print(v22)

In [None]:
# The two re-serialized texts must be identical.
assert v12 == v22

In [None]:
#!pip install jellyfish

In [None]:
import jellyfish

# Jaro similarity of the hand-written texts first, then of their rdflib
# re-serializations.
for left, right in ((v1, v2), (v12, v22)):
    print(jellyfish.jaro_similarity(left, right))

In [None]:
# Edit distance between: the two hand-written variants, the baseline and the
# class-swapped variant, and finally the two rdflib re-serializations.
for left, right in ((v1, v2), (v1, v3), (v12, v22)):
    print(jellyfish.levenshtein_distance(left, right))

In [None]:
# Edit distance between the serializations of the isomorphic graphs,
# first as RDF/XML ...
v23_xml = to_isomorphic(kg2).serialize(format="xml")
v13_xml = to_isomorphic(kg1).serialize(format="xml")
xml_distance = jellyfish.levenshtein_distance(v23_xml, v13_xml)
print(xml_distance)

# ... then as Turtle. A fresh isomorphic graph is built for every
# serialization, exactly one per call.
v23_ttl = to_isomorphic(kg2).serialize(format="turtle")
v13_ttl = to_isomorphic(kg1).serialize(format="turtle")
ttl_distance = jellyfish.levenshtein_distance(v23_ttl, v13_ttl)
print(ttl_distance)


#print(v13)
#print(v23)

# EDAMdiff prototype

In [None]:
from rich.jupyter import print
from rich.progress import Progress

def diffEdam(rdf_1, rdf_2):
    """Print a colored unified diff between two RDF documents.

    Both inputs are parsed with rdflib, re-serialized to Turtle through
    ``to_isomorphic`` and compared line by line with
    ``difflib.unified_diff`` (two lines of context). Lines starting with
    ``+`` are printed in green, lines starting with ``-`` in red, all other
    lines unstyled.

    Args:
        rdf_1: Source of the first document, handed as-is to
            ``ConjunctiveGraph().parse`` (URLs in this notebook).
        rdf_2: Source of the second document, handled the same way.

    Returns:
        None. The diff is written with the notebook's ``print``.
    """
    # Local import so the function does not depend on another cell for it.
    from rich.markup import escape

    with Progress() as progress:
        # Five steps: two parses, two serializations, one diff.
        task1 = progress.add_task("[red]Computing diff ...", total=5)

        # `advance` is an increment, not an absolute position, so each step
        # advances by exactly one. refresh=True redraws the bar immediately.
        kg_1 = ConjunctiveGraph().parse(rdf_1)
        progress.update(task1, advance=1, refresh=True)

        kg_2 = ConjunctiveGraph().parse(rdf_2)
        progress.update(task1, advance=1, refresh=True)

        kg_1_ttl = to_isomorphic(kg_1).serialize(format="turtle")
        progress.update(task1, advance=1, refresh=True)

        kg_2_ttl = to_isomorphic(kg_2).serialize(format="turtle")
        progress.update(task1, advance=1, refresh=True)

        # unified_diff is lazy; materialize it so the work happens while the
        # progress bar is still displayed.
        diff_output = list(
            difflib.unified_diff(
                kg_1_ttl.splitlines(keepends=True),
                kg_2_ttl.splitlines(keepends=True),
                n=2,
            )
        )
        progress.update(task1, advance=1, refresh=True)
    # Leaving the `with` block stops the progress display; no explicit
    # finish call is needed (Progress has no `finnish` method).

    for line in diff_output:
        # Escape the payload so square brackets in the ontology text
        # (e.g. "[a-z]") are shown literally instead of being parsed as
        # rich markup tags.
        text = escape(line.strip())
        if line.startswith("+"):
            print("[green]" + text)
        elif line.startswith("-"):
            print("[red]" + text)
        else:
            print(text)
    


In [None]:
# Compare the unstable EDAM release against the stable one.
stable_edam = "https://edamontology.org/EDAM.owl"
unstable_edam = "https://edamontology.org/EDAM_unstable.owl"

diffEdam(rdf_1=unstable_edam, rdf_2=stable_edam)

In [None]:
# Locations of the published ontologies and of the local working copy.
unstable_edam = "https://edamontology.org/EDAM_unstable.owl"
stable_edam = "https://edamontology.org/EDAM.owl"
local_edam = "edam_split.ttl"

# Load the stable release and report its size.
remote_edam_kg = ConjunctiveGraph().parse(source=stable_edam)
remote_size = len(remote_edam_kg)
print(remote_size)

# Load the local file and report its size.
local_edam_kg = ConjunctiveGraph().parse(source=local_edam)
local_size = len(local_edam_kg)
print(local_size)

In [None]:
# Serialize both graphs through to_isomorphic, then diff the two texts.
ref_ttl = to_isomorphic(remote_edam_kg).serialize(format="turtle")
dev_ttl = to_isomorphic(local_edam_kg).serialize(format="turtle")

ref_lines = ref_ttl.splitlines(keepends=True)
dev_lines = dev_ttl.splitlines(keepends=True)
sys.stdout.writelines(difflib.unified_diff(ref_lines, dev_lines, n=3))

In [None]:
# Imported here so this cell does not rely on another cell for `escape`.
from rich.markup import escape

local_edam_kg = ConjunctiveGraph().parse("edam_mod.ttl")
print(len(local_edam_kg))

dev_ttl = to_isomorphic(local_edam_kg).serialize(format="turtle")
for line in difflib.unified_diff(ref_ttl.splitlines(keepends=True), dev_ttl.splitlines(keepends=True), n=2):
    # Escape the diff payload: square brackets in the ontology text
    # (e.g. "[a-z]") would otherwise be parsed as rich markup tags and
    # disappear from the output.
    text = escape(line.strip())
    if line.startswith("+"):
        print("[green]" + text)
    elif line.startswith("-"):
        print("[red]" + text)
    else:
        print(text)

# Test set
 - diff should output {} when class definitions are not in the same order 
 - diff should output {} when IN A class definitions PROPERTIES ARE NOT IN THE SAME ORDER
 - diff should output {} when whitespace, indentation or blank lines differ 
 - diff should output {} when different namespaces are used 
 
Test scenario 
 +  edam_v1 -> canonical serialization -> edam_tmp_v1 
 +  edam_v2 -> canonical serialization -> edam_tmp_v2 
 +  DIFF( edam_tmp_v1, edam_tmp_v2) 

In [3]:
# Load the reference file, report its size, then show its Turtle
# serialization as produced through to_isomorphic.
edam_v0 = "edam_v0.ttl"
#edam_v1 = "edam_v1_class_order.ttl"

kg = ConjunctiveGraph().parse(source=edam_v0)
triple_count = len(kg)
print(triple_count)

iso_kg = to_isomorphic(kg)
reformatted_v0 = iso_kg.serialize(format="turtle")
print(reformatted_v0)

21
@prefix ns1: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix ns2: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ns2:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.5" ;
    ns1:consider ns2:data_0842 ;
    ns1:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low compl

In [6]:
# Expected Turtle text that the test cells below diff their re-serialized
# output against. Prefixes here are ns1 / ns2 / owl / rdfs.
# Note: the literal begins with a newline (text starts on the line after
# the opening triple quote).
edam_ref = """
@prefix ns1: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix ns2: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ns2:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.5" ;
    ns1:consider ns2:data_0842 ;
    ns1:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .

ns2:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.20" ;
    ns2:oldParent ns2:data_2534 ;
    ns1:consider ns2:data_2534 ;
    ns1:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
"""

### 0. Test: compare with the re-serialization of the reference file

In [10]:
edam_v0 = "edam_ref.ttl"
kg = ConjunctiveGraph().parse(edam_v0)
print(len(kg))

reformatted_v0 = to_isomorphic(kg).serialize(format="turtle")
print(reformatted_v0)

# edam_ref begins with a newline (its text starts on the line after the
# opening triple quote) while the serializer output begins directly with the
# first @prefix line, and the two texts may end with a different number of
# newlines. Strip both so that only real content differences are reported.
expected_lines = edam_ref.strip().splitlines(keepends=True)
actual_lines = reformatted_v0.strip().splitlines(keepends=True)
for line in difflib.unified_diff(expected_lines, actual_lines, n=2):
    print(line.strip())

21
@prefix ns1: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix ns2: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ns2:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.5" ;
    ns1:consider ns2:data_0842 ;
    ns1:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low compl

### 1. Test: compare with the re-serialization of the class-order modification

In [11]:
edam_v1 = "edam_v1_class_order.ttl"
kg = ConjunctiveGraph().parse(edam_v1)
print(len(kg))

reformatted_v1 = to_isomorphic(kg).serialize(format="turtle")
print(reformatted_v1)

# edam_ref begins with a newline (its text starts on the line after the
# opening triple quote) while the serializer output begins directly with the
# first @prefix line, and the two texts may end with a different number of
# newlines. Strip both so that only real content differences are reported.
expected_lines = edam_ref.strip().splitlines(keepends=True)
actual_lines = reformatted_v1.strip().splitlines(keepends=True)
for line in difflib.unified_diff(expected_lines, actual_lines, n=2):
    print(line.strip())

21
@prefix ns1: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix ns2: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ns2:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.5" ;
    ns1:consider ns2:data_0842 ;
    ns1:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low compl

### 2. Test: compare with the re-serialization of the property-order modification plus spaces

In [22]:
edam_v2 = "edam_space_prop_order_mod.ttl"
kg = ConjunctiveGraph().parse(edam_v2)
print(len(kg))

# This test serializes the parsed graph directly; the to_isomorphic variant
# is kept for reference.
#reformatted_v2 = to_isomorphic(kg).serialize(format="turtle")
reformatted_v2 = kg.serialize(format="turtle")
print(reformatted_v2)

# edam_ref begins with a newline (its text starts on the line after the
# opening triple quote) while the serializer output begins directly with the
# first @prefix line, and the two texts may end with a different number of
# newlines. Strip both so that only real content differences are reported.
expected_lines = edam_ref.strip().splitlines(keepends=True)
actual_lines = reformatted_v2.strip().splitlines(keepends=True)
for line in difflib.unified_diff(expected_lines, actual_lines, n=2):
    print(line.strip())

21
@prefix ns1: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix ns2: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ns2:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    ns2:created_in "beta12orEarlier" ;
    ns2:obsolete_since "1.5" ;
    ns1:consider ns2:data_0842 ;
    ns1:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    ns1:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low compl

### 3. Test: compare with the re-serialization of the namespace modification (name + order)

In [23]:
edam_v3 = "edam_namespace_mod.ttl"
kg = ConjunctiveGraph().parse(edam_v3)
print(len(kg))

# Bind explicit prefixes before serializing: the empty prefix for the EDAM
# namespace and "obo" for oboInOwl.
edam_ns = Namespace("http://edamontology.org/")
obo_ns = Namespace("http://www.geneontology.org/formats/oboInOwl#")
#kg.bind("edam", edam_ns)
kg.bind("", edam_ns)
kg.bind("obo", obo_ns)

# This test serializes the parsed graph directly; the to_isomorphic variant
# is kept for reference.
#reformatted_v3 = to_isomorphic(kg).serialize(format="turtle")
reformatted_v3 = kg.serialize(format="turtle")
print(reformatted_v3)

# edam_ref begins with a newline (its text starts on the line after the
# opening triple quote) while the serializer output begins directly with the
# first @prefix line, and the two texts may end with a different number of
# newlines. Strip both so that only real content differences are reported.
expected_lines = edam_ref.strip().splitlines(keepends=True)
actual_lines = reformatted_v3.strip().splitlines(keepends=True)
for line in difflib.unified_diff(expected_lines, actual_lines, n=2):
    print(line.strip())

21
@prefix : <http://edamontology.org/> .
@prefix obo: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.5" ;
    obo:consider :data_0842 ;
    obo:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    obo:inSubset <http://purl.obolibrary.org/obo/edam#obsolete> ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. 