In [26]:
import os

import rdflib
import oxrdflib

import pandas as pd
from datetime import datetime

from myst_nb import glue

# Register a store plugin under the name 'Oxigraph', pointing rdflib at the
# 'oxrdflib' module and the class name 'Oxigraph'.
# NOTE(review): load_rdf_files below instantiates oxrdflib.OxigraphStore()
# directly, so this name-based registration is not exercised by the visible
# cells -- confirm that 'Oxigraph' is the intended class name in oxrdflib.
rdflib.plugin.register('Oxigraph', rdflib.store.Store, 'oxrdflib', 'Oxigraph')



# Loading RDF Datasets and Ontologies

In [27]:
def get_file_ext(filename: str) -> str:
    """Return the extension of ``filename`` without the leading dot.

    The extension is the text after the last ``'.'`` in the name, returned
    with its original casing.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The extension, e.g. ``'ttl'`` for ``'hp.ttl'``. An empty string is
        returned when the name contains no dot at all, so that a file that is
        literally called ``'ttl'`` is not mistaken for a Turtle file.
    """
    # rpartition yields an empty separator when no '.' is present, which lets
    # us tell "no extension" apart from "the whole name is the extension".
    _stem, dot, extension = filename.rpartition('.')
    return extension if dot else ''

def get_rdf_filenames(dirname: str):
    """List the RDF files located directly inside ``dirname``.

    Only regular files whose extension (as reported by ``get_file_ext``) is
    ``'ttl'`` or ``'xrdf'`` are kept; sub-directories are not descended into.

    Args:
        dirname: Directory to scan.

    Returns:
        A list of bare file names (not joined with ``dirname``), sorted
        alphabetically so that the result does not depend on the arbitrary
        order in which ``os.listdir`` reports directory entries.
    """
    rdf_extensions = ('ttl', 'xrdf')
    return sorted(
        filename
        for filename in os.listdir(dirname)
        if os.path.isfile(os.path.join(dirname, filename))
        and get_file_ext(filename) in rdf_extensions
    )

def load_rdf_files(dirnames: list):
    """Parse every RDF file found in the given directories into one graph.

    A single ``rdflib.Graph`` backed by an ``oxrdflib.OxigraphStore`` is
    created, and each file returned by ``get_rdf_filenames`` is parsed into
    it. Files ending in ``xrdf`` are parsed with the ``'xml'`` format and
    files ending in ``ttl`` with the ``'ttl'`` format.

    Args:
        dirnames: Directories to scan for RDF files.

    Returns:
        The graph containing the data parsed from all directories.
    """
    # Maps a file extension to the rdflib parser format used for it.
    parser_formats = {'xrdf': 'xml', 'ttl': 'ttl'}

    g = rdflib.Graph(store=oxrdflib.OxigraphStore())

    for dirname in dirnames:
        print(f'Load RDF data from directory {dirname} into RDF graph.')

        for rdf_filename in get_rdf_filenames(dirname):
            parser_format = parser_formats.get(get_file_ext(rdf_filename))
            if parser_format is not None:
                # Build the path the same way get_rdf_filenames does instead
                # of hard-coding '/' as the separator.
                g.parse(os.path.join(dirname, rdf_filename), format=parser_format)

    return g

In [37]:
# Directories (relative to the notebook's working directory) whose .ttl and
# .xrdf files are parsed into the shared graph `g` used by all queries below.
rdf_dirnames = ['MONARCH_PHENOPACKET_STORE_DATA', 'HPO']
g = load_rdf_files(rdf_dirnames)

Load RDF data from directory MONARCH_PHENOPACKET_STORE_DATA into RDF graph.
Load RDF data from directory HPO into RDF graph.


# SPARQL Queries

In [38]:
# Scenario number; selects the SPARQL/scenario_<nr>/ directory the query
# templates are read from.
SCENARIO_NR = 5
# Gene identifier substituted for the VAR_GENE_ID placeholder of question 1.
gene_id = 'HGNC:7989'

## Question 1

Are there any other individuals with a mutation in the same gene?

In [39]:
# Read the SPARQL template for question 1 of the selected scenario.
with open(f'SPARQL/scenario_{SCENARIO_NR}/question1.rq', 'r') as file:
    query = file.read()

# Fill in the VAR_GENE_ID placeholder with the gene of interest.
query = query.replace('VAR_GENE_ID', gene_id)
# Bind the final query text to a key with myst_nb's glue.
# NOTE(review): the key is hard-coded to 'scenario5' and does not follow
# SCENARIO_NR -- confirm this is intended if the scenario number changes.
glue('scenario5_query_q1', query)

"PREFIX ex: <https://example.org/>\nPREFIX obo: <http://purl.obolibrary.org/obo/>\nPREFIX sio: <http://semanticscience.org/resource/>\nPREFIX dcterms: <http://purl.org/dc/terms/> \nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n\nSELECT ?phenopacketidval ?creatorname ?createddate\nWHERE {\n    # Find the phenopackets that contain at least one mutation in given gene\n    ?genedescr a obo:NCIT_C16612 ;\n               dcterms:identifier 'HGNC:7989' .\n    ?vardescr sio:SIO_001403 ?genedescr .\n    ?varinterpr sio:SIO_001403 ?vardescr .\n    ?genomicinterp sio:SIO_001403 ?varinterpr .\n    ?diagnosis sio:SIO_001403 ?genomicinterp .\n    ?interpr sio:SIO_001403 ?diagnosis .\n    ?phenopacket sio:SIO_001403 ?interpr ;\n                 a obo:NCIT_C79269 ;\n                 sio:SIO_000228 ?role .\n    \n    # Find phenopacket ID value\n    ?phenopacketid sio:SIO_000020 ?role ;\n\t               sio:SIO_000300 ?phenopacketidval .\n    \n    # Find metadata\n    ?phenopacket sio:SIO_001

In [40]:
# Run the question 1 query and turn every solution row into a table record.
result_list = []

results = g.query(query)
result_nr = 0  # stays 0 when the query yields no rows
for result_nr, result_row in enumerate(results, start=1):
    # Columns follow the SELECT order: id value, creator, creation date.
    id_val = result_row[0]
    creator_val = result_row[1]
    datetime_str = result_row[2]

    # Normalise the ISO timestamp to 'YYYY-MM-DD HH:MM:SS'.
    datetime_obj = datetime.fromisoformat(datetime_str)
    date_val = datetime_obj.strftime('%Y-%m-%d %H:%M:%S')

    record = {'phenopacket id': id_val,
              'creator': creator_val,
              'creation date': date_val}
    result_list.append(record)

In [41]:
# One row per query result, with the columns 'phenopacket id', 'creator'
# and 'creation date'; the frame is then bound to a glue key.
phenopackets_df = pd.DataFrame(result_list)
glue('scenario5_phenopackets', phenopackets_df)

Unnamed: 0,phenopacket id,creator,creation date
0,PMID_26467218_individual_6_Cirstea_et_al__14,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
1,PMID_26467218_individual_11_Kraoua_et_al__23,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
2,PMID_26467218_individual_3_Denayer_et_al__22,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
3,PMID_26467218_individual_9_Cirstea_et_al__14,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
4,PMID_26467218_individual_4_Denayer_et_al__22,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
5,PMID_26467218_individual_12_Present_study,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
6,PMID_26467218_individual_1_De_Filippi_et_al__20,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
7,PMID_26467218_individual_2_Runtuwene_et_al__21,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
8,PMID_26467218_individual_9_M_Cirstea_et_al__14,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12
9,PMID_26467218_individual_5_Denayer_et_al__22,ORCID:0000-0002-0736-9199,2024-05-20 20:56:12


## Question 2

Which phenotypes and diseases have been observed in these individuals, and how often does each occur within this group?

### Diseases

In [42]:
# Build the space-separated list of double-quoted phenopacket ids that
# replaces the VAR_ID_LIST placeholder of the question 2a template.
# NOTE(review): ids are wrapped in double quotes without any escaping, so an
# id containing '"' or '\' would presumably yield an invalid query -- verify
# that the id values can never contain such characters.
phenopacket_ids = phenopackets_df['phenopacket id'].tolist()
quoted_phenopacket_ids = [f'"{id_value}"' for id_value in phenopacket_ids]
query_ids_list_str = ' '.join(quoted_phenopacket_ids)

# Read the SPARQL template for question 2a of the selected scenario.
with open(f'SPARQL/scenario_{SCENARIO_NR}/question2_a.rq', 'r') as file:
    query2 = file.read()

query2 = query2.replace('VAR_ID_LIST', query_ids_list_str)
# NOTE(review): the glue key is hard-coded to 'scenario5' and does not follow
# SCENARIO_NR.
glue('scenario5_query_q2a', query2)

'PREFIX ex: <https://example.org/>\nPREFIX obo: <http://purl.obolibrary.org/obo/>\nPREFIX sio: <http://semanticscience.org/resource/>\nPREFIX dcterms: <http://purl.org/dc/terms/> \nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n\nSELECT ?idvalue ?diseaseid ?diseaselabel\nWHERE {\n    # Find relevant phenopackets\n    ?id a obo:IAO_0020000 ;\n        sio:SIO_000300 ?idvalue .\n    VALUES ?idvalue { "PMID_26467218_individual_6_Cirstea_et_al__14" "PMID_26467218_individual_11_Kraoua_et_al__23" "PMID_26467218_individual_3_Denayer_et_al__22" "PMID_26467218_individual_9_Cirstea_et_al__14" "PMID_26467218_individual_4_Denayer_et_al__22" "PMID_26467218_individual_12_Present_study" "PMID_26467218_individual_1_De_Filippi_et_al__20" "PMID_26467218_individual_2_Runtuwene_et_al__21" "PMID_26467218_individual_9_M_Cirstea_et_al__14" "PMID_26467218_individual_5_Denayer_et_al__22" "PMID_26467218_individual_12_F_Present_study" "PMID_26467218_individual_8_Cirstea_et_al__14" "PMID_26467218_individual

In [43]:
# Run the question 2a query and collect one record per solution row.
results = g.query(query2)

# Column names, in the same order as the variables selected by the query.
disease_columns = ('phenopacket id', 'disease ID', 'disease label')

diseases_list = []
for result_row in results:
    disease_record = dict(zip(disease_columns, result_row))
    diseases_list.append(disease_record)

In [44]:
# One row per (phenopacket id, disease) result of question 2a; the frame is
# then bound to a glue key.
diseases_df = pd.DataFrame(diseases_list)
glue('scenario5_diseases', diseases_df)

Unnamed: 0,phenopacket id,disease ID,disease label
0,PMID_26467218_individual_6_Cirstea_et_al__14,OMIM:613224,Noonan syndrome 6
1,PMID_26467218_individual_11_Kraoua_et_al__23,OMIM:613224,Noonan syndrome 6
2,PMID_26467218_individual_3_Denayer_et_al__22,OMIM:613224,Noonan syndrome 6
3,PMID_26467218_individual_9_Cirstea_et_al__14,OMIM:613224,Noonan syndrome 6
4,PMID_26467218_individual_4_Denayer_et_al__22,OMIM:613224,Noonan syndrome 6
5,PMID_26467218_individual_12_Present_study,OMIM:613224,Noonan syndrome 6
6,PMID_26467218_individual_1_De_Filippi_et_al__20,OMIM:613224,Noonan syndrome 6
7,PMID_26467218_individual_2_Runtuwene_et_al__21,OMIM:613224,Noonan syndrome 6
8,PMID_26467218_individual_9_M_Cirstea_et_al__14,OMIM:613224,Noonan syndrome 6
9,PMID_26467218_individual_5_Denayer_et_al__22,OMIM:613224,Noonan syndrome 6


### Phenotypes

# Notes

In [14]:
# Scratch query: list the classes asserted as rdfs:subClassOf obo:HP_0025668
# together with their labels.
# The string is a plain literal (no f-string / str.format), so the braces of
# the WHERE clause must not be doubled.
query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX obo: <http://purl.obolibrary.org/obo/>

    SELECT ?child ?label
    WHERE {
        ?child rdfs:subClassOf obo:HP_0025668 .
        ?child rdfs:label ?label .
    }
"""

# Execute the query
results = g.query(query)

# Print results
for row in results:
    print(row)

(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000465'), rdflib.term.Literal('Webbed neck', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000468'), rdflib.term.Literal('Increased adipose tissue around the neck', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000470'), rdflib.term.Literal('Short neck', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000472'), rdflib.term.Literal('Long neck', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000475'), rdflib.term.Literal('Broad neck', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0000476'), rdflib.term.Literal('Cystic hygroma', d