In [2]:
from rdflib import ConjunctiveGraph
import difflib

In [3]:
import rdflib
# Show the installed rdflib version next to the results, for reproducibility.
print(rdflib.__version__)

7.0.0


In [4]:
# Baseline Turtle document describing two obsolete classes
# (:data_0852 and :data_0853). The v2 and v3 variants are compared against
# this text.
v1 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.5" ;
    oboInOwl:consider :data_0842 ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .

:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
"""

In [5]:
# Same statements as v1, but the predicates of :data_0852 are listed in a
# different order and extra blank / whitespace-only lines are present, so the
# raw text differs from v1.
v2 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    
:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    :created_in "beta12orEarlier" ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    :obsolete_since "1.5" ;
    oboInOwl:inSubset edam:obsolete ;
    oboInOwl:consider :data_0842 ;    
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    


:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
"""

# Same statements again, with the two subjects in swapped order
# (:data_0853 first) and :data_0852 using the predicate order of v2.
v3 = """
@prefix : <http://edamontology.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix oboOther: <http://purl.obolibrary.org/obo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:data_0853 a owl:Class ;
    rdfs:label "DNA sense specification" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.20" ;
    :oldParent :data_2534 ;
    oboInOwl:consider :data_2534 ;
    oboInOwl:hasDefinition "The strand of a DNA sequence (forward or reverse)." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "The forward or 'top' strand might specify a sequence is to be used as given, the reverse or 'bottom' strand specifying the reverse complement of the sequence is to be used." ;
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    :created_in "beta12orEarlier" ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simple repeats and low complexity regions. Masked sequences are used in database search to eliminate statistically significant but biologically uninteresting hits." ;
    :obsolete_since "1.5" ;
    oboInOwl:inSubset edam:obsolete ;
    oboInOwl:consider :data_0842 ;    
    rdfs:subClassOf owl:DeprecatedClass ;
    owl:deprecated "true" .
    
"""

In [6]:
# The raw Turtle texts differ (statement order and whitespace), so this
# comparison is expected to fail. The failure is the point of the cell, so it
# is caught and displayed rather than left to abort a Restart & Run All.
try:
    assert v1 == v2, "The two serializations of the RDF graph should be the same"
except AssertionError as err:
    print(f"AssertionError: {err}")

AssertionError: The two serializations of the RDF graph should be the same

In [7]:
import jellyfish

In [8]:
from rdflib.compare import to_isomorphic, graph_diff

# Parse each Turtle variant into its own graph so the comparison really is
# v1 against v2. (kg2 used to be parsed from v1 as well, which made every
# check below trivially true.)
kg1 = ConjunctiveGraph()
kg1.parse(data=v1, format="turtle")

kg2 = ConjunctiveGraph()
kg2.parse(data=v2, format="turtle")

# Compare the graphs through rdflib's isomorphism support rather than
# through the textual layout of the source documents.
iso1 = to_isomorphic(kg1)
iso2 = to_isomorphic(kg2)

assert iso1 == iso2, "The two RDFlib graph should be the same"

# graph_diff returns three graphs: triples in both, only in the first,
# only in the second.
in_both, in_first, in_second = graph_diff(iso1, iso2)

# Neither graph may contain a triple the other lacks.
assert len(in_first) == 0
assert len(in_second) == 0

In [10]:
# Round-trip v1 through rdflib: parse the Turtle text, then serialize the
# graph back to Turtle. v12 is compared with v22 in a later cell.
kg1 = ConjunctiveGraph()
kg1.parse(data=v1, format="turtle")
v12 = kg1.serialize(format="turtle")
print(v12)

@prefix : <http://edamontology.org/> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.5" ;
    oboInOwl:consider :data_0842 ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simp

In [11]:
# Round-trip v2 the same way; kg2 is rebound here to a graph parsed from v2.
# v22 is compared with v12 in a later cell.
kg2 = ConjunctiveGraph()
kg2.parse(data=v2, format="turtle")
v22 = kg2.serialize(format="turtle")
print(v22)

@prefix : <http://edamontology.org/> .
@prefix edam: <http://purl.obolibrary.org/obo/edam#> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

:data_0852 a owl:Class ;
    rdfs:label "Sequence mask type" ;
    :created_in "beta12orEarlier" ;
    :obsolete_since "1.5" ;
    oboInOwl:consider :data_0842 ;
    oboInOwl:hasDefinition "A label (text token) describing the type of sequence masking to perform." ;
    oboInOwl:inSubset edam:obsolete ;
    rdfs:comment "Sequence masking is where specific characters or positions in a molecular sequence are masked (replaced) with an another (mask character). The mask type indicates what is masked, for example regions that are not of interest or which are information-poor including acidic protein regions, basic protein regions, proline-rich regions, low compositional complexity regions, short-periodicity internal repeats, simp

In [12]:
# After the rdflib round-trip, both variants must serialize to identical text.
assert v12 == v22

In [13]:
#!pip install jellyfish

In [14]:
import jellyfish

# Jaro similarity of the raw texts, then of the rdflib round-tripped texts,
# one value per line.
print(
    jellyfish.jaro_similarity(v1, v2),
    jellyfish.jaro_similarity(v12, v22),
    sep="\n",
)

0.8825825928843302
1.0


In [15]:
# Edit distances, one per line: raw v1 vs v2, raw v1 vs v3, and the
# round-tripped v12 vs v22.
print(
    jellyfish.levenshtein_distance(v1, v2),
    jellyfish.levenshtein_distance(v1, v3),
    jellyfish.levenshtein_distance(v12, v22),
    sep="\n",
)

243
1022
0


In [16]:
# Convert each graph with to_isomorphic once and reuse the result for both
# serialization formats, instead of recomputing it per format.
iso_kg1 = to_isomorphic(kg1)
iso_kg2 = to_isomorphic(kg2)

# Edit distance between the RDF/XML serializations of the two graphs.
v23_xml = iso_kg2.serialize(format="xml")
v13_xml = iso_kg1.serialize(format="xml")
print(jellyfish.levenshtein_distance(v23_xml, v13_xml))

# Edit distance between the Turtle serializations of the two graphs.
v23_ttl = iso_kg2.serialize(format="turtle")
v13_ttl = iso_kg1.serialize(format="turtle")
print(jellyfish.levenshtein_distance(v23_ttl, v13_ttl))

0
0
