## Load EDAM from GitHub

In [14]:
from rdflib import ConjunctiveGraph, Namespace
import pandas as pd
import numpy as np

# Load the development version of EDAM from GitHub into an in-memory graph `g`.
print("Loading graph ...", end="")
g = ConjunctiveGraph()
g.parse('https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl', format='xml')
# Offline alternative: parse a local checkout instead of downloading.
#g.parse('../edamontology/EDAM_dev.owl', format='xml')
# Bind the prefix to the slash namespace: this is the one used by every SPARQL
# query in this notebook (PREFIX edam:<http://edamontology.org/>) and by the
# concept IRIs shown in the results. The previous '#' form matched none of them.
g.bind('edam', Namespace('http://edamontology.org/'))
print("done!")
print(f"{len(g)} triples in the EDAM triple store")


Loading graph ...done!
38256 triples in the EDAM triple store


## Query to get the formats without the *is_format_of* property

In [38]:
# SPARQL SELECT: every transitive subclass of edam:format_1915 (with its label)
# that has no direct rdfs:subClassOf owl:Restriction on edam:is_format_of with an
# owl:someValuesFrom filler. ?property is pinned to edam:is_format_of through
# VALUES so that it is returned as a result column. Rows are ordered by ?entity.
q = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX edam:<http://edamontology.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?entity ?label ?property WHERE
{
  
    ?entity rdfs:subClassOf+ edam:format_1915 .
    ?entity rdfs:label ?label .

     VALUES ?property { edam:is_format_of               
                        }
    FILTER NOT EXISTS {    
        ?entity rdfs:subClassOf ?restriction . 
        ?restriction rdf:type owl:Restriction ; 
                owl:onProperty ?property  ; 
                owl:someValuesFrom ?data.}

}ORDER BY ?entity
    
    
"""

In [39]:
# Display options are global pandas settings, so set them once per cell
# instead of on every loop iteration.
pd.set_option("display.max_rows",None,"display.max_colwidth",5000,"display.width",5000,)

# Run the SELECT query held in `q` against the EDAM graph.
results = g.query(q)
print(f"nb formats missing is_format_of property: {len(results)} ")

# Build the table in one pass. Growing a DataFrame with pd.concat inside a loop
# is quadratic, and concatenating onto an empty frame triggers a pandas
# FutureWarning.
df = pd.DataFrame(
    [(r["entity"], r["label"], r["property"]) for r in results],
    columns=["entity", "label", "property"],
)
# Number the rows from 1 for display.
df.index = np.arange(1, len(df) + 1)
display(df)

nb formats missing is_format_of property: 85 


Unnamed: 0,entity,label,property
1,http://edamontology.org/format_1295,quicktandem,http://edamontology.org/is_format_of
2,http://edamontology.org/format_1296,Sanger inverted repeats,http://edamontology.org/is_format_of
3,http://edamontology.org/format_1297,EMBOSS repeat,http://edamontology.org/is_format_of
4,http://edamontology.org/format_1318,restrict format,http://edamontology.org/is_format_of
5,http://edamontology.org/format_1319,restover format,http://edamontology.org/is_format_of
6,http://edamontology.org/format_1320,REBASE restriction sites,http://edamontology.org/is_format_of
7,http://edamontology.org/format_1454,dssp,http://edamontology.org/is_format_of
8,http://edamontology.org/format_1455,hssp,http://edamontology.org/is_format_of
9,http://edamontology.org/format_1627,Primer3 primer,http://edamontology.org/is_format_of
10,http://edamontology.org/format_1665,Taverna workflow format,http://edamontology.org/is_format_of


## Query to list the formats with the *is_format_of* property

In [20]:

# SPARQL SELECT: distinct (entity, label, data) rows where the entity has a direct
# rdfs:subClassOf owl:Restriction on edam:is_format_of whose owl:someValuesFrom
# value is ?data. An entity with several such restrictions yields one row per
# ?data value, so the row count can exceed the number of distinct entities.
q2= """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX edam:<http://edamontology.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?entity ?label ?data
WHERE {
    ?entity rdfs:subClassOf ?restriction . 
    ?restriction rdf:type owl:Restriction ; 
            owl:onProperty edam:is_format_of  ; 
            owl:someValuesFrom ?data.
    ?entity rdfs:label ?label .
    }
"""

In [22]:
# Display options are global pandas settings, so set them once per cell
# instead of on every loop iteration.
pd.set_option("display.max_rows",None,"display.max_colwidth",5000,"display.width",5000,)

# Run the SELECT query held in `q2` against the EDAM graph.
results = g.query(q2)
print(f"nb formats with is_format_of property: {len(results)} ")

# Build the table in one pass. Growing a DataFrame with pd.concat inside a loop
# is quadratic, and concatenating onto an empty frame triggers a pandas
# FutureWarning.
df = pd.DataFrame(
    [(r["entity"], r["label"], r["data"]) for r in results],
    columns=["entity", "label", "data"],
)
# Number the rows from 1 for display.
df.index = np.arange(1, len(df) + 1)
display(df)

nb formats with is_format_of property: 111 


Unnamed: 0,entity,label,data
1,http://edamontology.org/format_1475,PDB database entry format,http://edamontology.org/data_0883
2,http://edamontology.org/format_1475,PDB database entry format,http://edamontology.org/data_3870
3,http://edamontology.org/format_1637,dat,http://edamontology.org/data_1714
4,http://edamontology.org/format_1638,cel,http://edamontology.org/data_3110
5,http://edamontology.org/format_1644,CHP,http://edamontology.org/data_3111
6,http://edamontology.org/format_1919,Sequence record format,http://edamontology.org/data_0849
7,http://edamontology.org/format_1920,Sequence feature annotation format,http://edamontology.org/data_1255
8,http://edamontology.org/format_1921,Alignment format,http://edamontology.org/data_0863
9,http://edamontology.org/format_2006,Phylogenetic tree format,http://edamontology.org/data_0872
10,http://edamontology.org/format_2013,Biological pathway or network format,http://edamontology.org/data_2600


## Construct new triples so that every format inherits the *is_format_of* relations of its ancestors

In [35]:
# SPARQL CONSTRUCT: for every class that has a direct is_format_of restriction
# (?parent_format), emit triples that attach that same ?restriction node to each
# of its transitive subclasses (rdfs:subClassOf+), together with the triples
# describing the restriction itself.
q3 = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX edam:<http://edamontology.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

CONSTRUCT { 
    ?children_format rdfs:subClassOf ?restriction . 
    ?restriction rdf:type owl:Restriction ; 
            owl:onProperty edam:is_format_of  ; 
            owl:someValuesFrom ?data. 
}
WHERE {
    ?parent_format rdfs:subClassOf ?restriction . 
    ?restriction rdf:type owl:Restriction ; 
            owl:onProperty edam:is_format_of  ; 
            owl:someValuesFrom ?data.
    ?children_format rdfs:subClassOf+ ?parent_format .
    }
    
    
"""

In [36]:
# Display options are global pandas settings, so set them once per cell
# instead of on every loop iteration.
pd.set_option("display.max_rows",None,"display.max_colwidth",5000,"display.width",5000,)

# Run the CONSTRUCT query held in `q3`; each result is a (subject, predicate, object) triple.
results = g.query(q3)
print(f"nb constructed is_format_of property: {len(results)} ")

# Materialise the triples once so the table and the graph update use the same rows.
triples = [(r[0], r[1], r[2]) for r in results]

# NOTE: this mutates `g`. Every later cell that queries `g` sees the added triples.
for triple in triples:
    g.add(triple)

# Build the table in one pass. Growing a DataFrame with pd.concat inside a loop
# is quadratic, and concatenating onto an empty frame triggers a pandas
# FutureWarning.
df = pd.DataFrame(triples, columns=["entity", "property", "object"])
# Number the rows from 1 for display.
df.index = np.arange(1, len(df) + 1)
display(df)

nb constructed is_format_of property: 844 


Unnamed: 0,entity,property,object
1,http://edamontology.org/format_1739,http://www.w3.org/2000/01/rdf-schema#subClassOf,Nccaafc7218ac4c5888c5c2a2062975ea
2,http://edamontology.org/format_1430,http://www.w3.org/2000/01/rdf-schema#subClassOf,N72002b38e4bd432aa779b309338f63f8
3,http://edamontology.org/format_3592,http://www.w3.org/2000/01/rdf-schema#subClassOf,N6a94bccb14ab4e4a90f2257e822abf77
4,http://edamontology.org/format_1434,http://www.w3.org/2000/01/rdf-schema#subClassOf,N6758ed77fffc42aaa23e8811007f9a30
5,Nccaafc7218ac4c5888c5c2a2062975ea,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Restriction
6,http://edamontology.org/format_1961,http://www.w3.org/2000/01/rdf-schema#subClassOf,Nae710a1ac1b1447ab0325c139674e1c1
7,N4b9191880a7a475db2dce2b9120a1a7b,http://www.w3.org/2002/07/owl#onProperty,http://edamontology.org/is_format_of
8,http://edamontology.org/format_3826,http://www.w3.org/2000/01/rdf-schema#subClassOf,Nd1f6d47a6c284630b1f2490ddf0b6970
9,N75a240cf6b9943ea86a081ad9edc3901,http://www.w3.org/2002/07/owl#onProperty,http://edamontology.org/is_format_of
10,http://edamontology.org/format_3820,http://www.w3.org/2000/01/rdf-schema#subClassOf,Nae710a1ac1b1447ab0325c139674e1c1


In [14]:
# Report the current size of the EDAM graph `g`.
print(f"{len(g)} triples in the EDAM triple store")

38602 triples in the EDAM triple store


## Run query 2 again after adding the inferred triples

In [37]:
# Display options are global pandas settings, so set them once per cell
# instead of on every loop iteration.
pd.set_option("display.max_rows",None,"display.max_colwidth",5000,"display.width",5000,)

# Re-run the SELECT query held in `q2` against the current state of the graph `g`.
results = g.query(q2)
print(f"nb formats with is_format_of property: {len(results)} ")

# Build the table in one pass. Growing a DataFrame with pd.concat inside a loop
# is quadratic, and concatenating onto an empty frame triggers a pandas
# FutureWarning.
df = pd.DataFrame(
    [(r["entity"], r["label"], r["data"]) for r in results],
    columns=["entity", "label", "data"],
)
# Number the rows from 1 for display.
df.index = np.arange(1, len(df) + 1)
display(df)

nb formats with is_format_of property: 732 


Unnamed: 0,entity,label,data
1,http://edamontology.org/format_1475,PDB database entry format,http://edamontology.org/data_0883
2,http://edamontology.org/format_1951,pdbatomnuc,http://edamontology.org/data_0883
3,http://edamontology.org/format_4035,PQR,http://edamontology.org/data_0883
4,http://edamontology.org/format_1950,pdbatom,http://edamontology.org/data_0883
5,http://edamontology.org/format_1953,pdbseqres,http://edamontology.org/data_0883
6,http://edamontology.org/format_1477,mmCIF,http://edamontology.org/data_0883
7,http://edamontology.org/format_4036,PDBQT,http://edamontology.org/data_0883
8,http://edamontology.org/format_1478,PDBML,http://edamontology.org/data_0883
9,http://edamontology.org/format_1476,PDB,http://edamontology.org/data_0883
10,http://edamontology.org/format_1952,pdbseqresnuc,http://edamontology.org/data_0883


## Test with a small example

In [57]:
# Minimal RDF/XML fixture: classes c and d are subclasses of b, b is a subclass
# of a, and a carries an owl:Restriction on is_format_of whose someValuesFrom
# is "data". The relative IRIs (a, b, c, d, data) resolve against xml:base,
# i.e. http://edamontology.org/.
sample_test= """<?xml version="1.0"?>
<rdf:RDF xmlns="http://edamontology.org/"
     xml:base="http://edamontology.org/"
     xmlns:dc="http://purl.org/dc/elements/1.1/"
     xmlns:dcterms="http://purl.org/dc/terms/"
     xmlns:owl="http://www.w3.org/2002/07/owl#"
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:xml="http://www.w3.org/XML/1998/namespace"
     xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
     xmlns:doap="http://usefulinc.com/ns/doap#"
     xmlns:edam="http://purl.obolibrary.org/obo/edam#"
     xmlns:foaf="http://xmlns.com/foaf/0.1/"
     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
     xmlns:oboInOwl="http://www.geneontology.org/formats/oboInOwl#"
     xmlns:oboOther="http://purl.obolibrary.org/obo/">

<owl:Class rdf:about="c">
    <rdfs:subClassOf rdf:resource="b"/>
</owl:Class>
<owl:Class rdf:about="d">
    <rdfs:subClassOf rdf:resource="b"/>
</owl:Class>   
<owl:Class rdf:about="b">
    <rdfs:subClassOf rdf:resource="a"/>
</owl:Class>

<owl:Class rdf:about="a">
    <rdfs:subClassOf>
    <owl:Restriction>
        <owl:onProperty rdf:resource="http://edamontology.org/is_format_of"/>
        <owl:someValuesFrom rdf:resource="data"/>
    </owl:Restriction>
    </rdfs:subClassOf>
</owl:Class>
</rdf:RDF>
"""

In [67]:
# SPARQL CONSTRUCT with the same pattern as q3, except that the path is
# rdfs:subClassOf* (zero or more steps) instead of rdfs:subClassOf+, so the class
# that owns the restriction also matches as its own ?children_format.
# NOTE(review): this rebinds the name `q`, which earlier in the notebook holds the
# SELECT query for formats missing is_format_of. Re-running that earlier results
# cell after this one would execute this CONSTRUCT instead; consider a distinct name.
# NOTE(review): the `*` vs `+` difference from q3 may be unintentional; confirm.
q = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX edam:<http://edamontology.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

CONSTRUCT { 
    ?children_format rdfs:subClassOf ?restriction . 
    ?restriction rdf:type owl:Restriction ; 
            owl:onProperty edam:is_format_of  ; 
            owl:someValuesFrom ?data. 
}
WHERE {
    ?parent_format rdfs:subClassOf ?restriction . 
    ?restriction rdf:type owl:Restriction ; 
            owl:onProperty edam:is_format_of  ; 
            owl:someValuesFrom ?data.
    ?children_format rdfs:subClassOf* ?parent_format .
    }
    
    
"""

In [68]:
# Parse the inline RDF/XML fixture into its own graph, separate from the EDAM graph `g`.
g2 = ConjunctiveGraph()
g2.parse(format="xml", data=sample_test)
# Show how many triples were read from the fixture.
print(len(g2))



11


In [69]:
# Run the CONSTRUCT query currently held in `q` against the fixture graph,
# then print the number of constructed triples followed by one triple per line.
results = g2.query(q)
print(f"{len(results)} ")
for r in results:
    # Each result is a (subject, predicate, object) triple.
    print(*r[:3])

7 
Ne9b0bdaaafb34c71990f41cc75d45281 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Restriction
http://edamontology.org/c http://www.w3.org/2000/01/rdf-schema#subClassOf Ne9b0bdaaafb34c71990f41cc75d45281
Ne9b0bdaaafb34c71990f41cc75d45281 http://www.w3.org/2002/07/owl#onProperty http://edamontology.org/is_format_of
http://edamontology.org/a http://www.w3.org/2000/01/rdf-schema#subClassOf Ne9b0bdaaafb34c71990f41cc75d45281
Ne9b0bdaaafb34c71990f41cc75d45281 http://www.w3.org/2002/07/owl#someValuesFrom http://edamontology.org/data
http://edamontology.org/d http://www.w3.org/2000/01/rdf-schema#subClassOf Ne9b0bdaaafb34c71990f41cc75d45281
http://edamontology.org/b http://www.w3.org/2000/01/rdf-schema#subClassOf Ne9b0bdaaafb34c71990f41cc75d45281
