## Example: querying a SPARQL endpoint

In [1]:
import pandas as pd
import requests
import json
import re
from rdflib import Graph, Namespace, URIRef, RDF
from urllib.parse import quote_plus, urlparse


# Show every column and never truncate cell contents when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

## Query an existing Fuseki instance
- It is assumed to be populated with EMO-BON RO-Crates.
- Fuseki runs locally at `localhost:3030`.

In [2]:
def sparql_json_to_df(sparql_json):
    """
    Build a pandas DataFrame from the JSON body of a SPARQL SELECT result.

    Parameters
    ----------
    sparql_json : dict
        Decoded ``application/sparql-results+json`` document. The variable
        names are read from ``head.vars`` and the solutions from
        ``results.bindings``; either part may be absent.

    Returns
    -------
    pd.DataFrame
        One row per solution and one column per variable, in ``head.vars``
        order. Each cell holds the ``"value"`` entry of the binding, or
        ``None`` when the solution does not bind that variable.
    """
    columns = sparql_json.get("head", {}).get("vars", [])
    solutions = sparql_json.get("results", {}).get("bindings", [])

    # A solution is allowed to leave some variables unbound (e.g. OPTIONAL patterns).
    records = [
        {name: (solution[name]["value"] if name in solution else None) for name in columns}
        for solution in solutions
    ]

    return pd.DataFrame(records, columns=columns)

#### How many triples do we have?

In [18]:
# Count every triple matched by the catch-all pattern ?s ?p ?o.
q = """
SELECT (COUNT(*) AS ?c)
WHERE { 
    ?s ?p ?o
}
"""
r = requests.get(
    "http://localhost:3030/emobon",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # seconds; avoids hanging forever if the endpoint is unreachable
)
# Surface HTTP errors here instead of as an opaque JSON decode error on r.json().
r.raise_for_status()
print(r.json())

{'head': {'vars': ['c']}, 'results': {'bindings': [{'c': {'type': 'literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '2637'}}]}}


In [19]:
# Tabulate the JSON result of the count query issued in the previous cell.
df = sparql_json_to_df(r.json())
print(df)

      c
0  2637


#### Filter for all the `text/html` files

In [6]:
# Select every subject whose sdo:encodingFormat starts with "text/html" (case-insensitive).
q = """
PREFIX sdo: <http://schema.org/>

SELECT ?x ?dtype
WHERE {
  ?x sdo:encodingFormat ?dtype .
  FILTER regex(str(?dtype), "^text/html", "i")
}
"""
r = requests.get(
    "http://localhost:3030/emobon",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # seconds; avoids hanging forever if the endpoint is unreachable
)
# Surface HTTP errors here instead of as an opaque JSON decode error on r.json().
r.raise_for_status()
df = sparql_json_to_df(r.json())
df

Unnamed: 0,x,dtype
0,https://www.ebi.ac.uk/ena/browser/view/ERS20568987,text/html
1,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/krona.html,text/html
2,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/krona.html,text/html
3,https://www.ebi.ac.uk/ena/browser/view/ERS20569004,text/html
4,https://www.ebi.ac.uk/ena/browser/view/ERS20569003,text/html
5,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/fastp.html,text/html
6,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/fastp.html,text/html
7,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/krona.html,text/html
8,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/fastp.html,text/html
9,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/fastp.html,text/html


#### Return also the `sdo:downloadUrl` of those files.

In [7]:
# Same filter as before, but additionally require and return the sdo:downloadUrl of each match.
q = """
PREFIX sdo: <http://schema.org/>

SELECT ?x ?dtype ?durl
WHERE {
  ?x sdo:encodingFormat ?dtype ;
     sdo:downloadUrl ?durl .
  FILTER regex(str(?dtype), "^text/html", "i")
}
"""
r = requests.get(
    "http://localhost:3030/emobon",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # seconds; avoids hanging forever if the endpoint is unreachable
)
# Surface HTTP errors here instead of as an opaque JSON decode error on r.json().
r.raise_for_status()
df = sparql_json_to_df(r.json())
df

Unnamed: 0,x,dtype,durl
0,https://www.ebi.ac.uk/ena/browser/view/ERS20568987,text/html,https://www.ebi.ac.uk/ena/browser/view/ERS20568987
1,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/krona.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/1f/d7f1e97dc438433527d667ad7694da
2,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/krona.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/65/40f5e154fe4b60253aa2bb546aa163
3,https://www.ebi.ac.uk/ena/browser/view/ERS20569004,text/html,https://www.ebi.ac.uk/ena/browser/view/ERS20569004
4,https://www.ebi.ac.uk/ena/browser/view/ERS20569003,text/html,https://www.ebi.ac.uk/ena/browser/view/ERS20569003
5,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/fastp.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/a3/01657724b6a1dd94b2d5c773ab3389
6,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/fastp.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/12/e78d197511d96354e0996e9d674d39
7,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/krona.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/94/cf7ab813e63199fdfa4a134ee87884
8,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/fastp.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/0f/28859f70d366611ab6c31110f865bb
9,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/fastp.html,text/html,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/14/151c6e5052ad5fe322b710fcd83759


#### Return SSU taxonomy download links

In [9]:
# Find triples whose object mentions "SSU-taxonomy-summary" and, when present,
# the sdo:downloadUrl attached to that object (OPTIONAL, so ?durl may be unbound).
q = """
PREFIX sdo: <http://schema.org/>

SELECT ?subject ?predicate ?object ?durl
WHERE {
  ?subject ?predicate ?object .
  FILTER regex(str(?object), "SSU-taxonomy-summary", "i")
  OPTIONAL { ?object sdo:downloadUrl ?durl }
}
LIMIT 50
"""

r = requests.get(
    "http://localhost:3030/emobon",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # seconds; avoids hanging forever if the endpoint is unreachable
)
# Surface HTTP errors here instead of as an opaque JSON decode error on r.json().
r.raise_for_status()

df = sparql_json_to_df(r.json())
df

Unnamed: 0,subject,predicate,object,durl
0,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/,http://schema.org/hasPart,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/f1/6ea0374e1b699258f80cb72d183c9d
1,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/,http://schema.org/hasPart,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/89/eb7c8afaadc7a3b7e61181ad289ff3
2,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/,http://schema.org/hasPart,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/06/c40e558f4e38f74d6010db8a68d9ca
3,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/,http://schema.org/hasPart,https://github.com/emo-bon/analysis-results-cluster-01-crate/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl,https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/10/daf150ddf1ffdcd0f8e06bca453987


## SSU taxonomy display
- From the object values we see that the taxonomy is in Turtle (`.ttl`) format, which means it was converted to RDF triples during the so-called `semantic uplift`.


In [10]:
# Take the first distinct, non-null download URL from the previous query result.
url = df["durl"].dropna().unique()[0]
r = requests.get(url, timeout=60)  # seconds; avoids hanging forever on a stalled download
# Stop on an HTTP error so that an error page is never saved as if it were Turtle data.
r.raise_for_status()
# save to a file
with open("ssu_example.ttl", "wb") as f:
    f.write(r.content)

In [11]:
# Load the downloaded Turtle file into an in-memory RDF graph.
g = Graph()
g.parse("ssu_example.ttl", format="turtle")

# Stringify each term of each triple so the graph can be tabulated.
triples = [tuple(str(term) for term in triple) for triple in g]

# One row per triple; preview the first ten.
df = pd.DataFrame(triples, columns=["subject", "predicate", "object"])
df.head(10)

Unnamed: 0,subject,predicate,object
0,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#89373,http://purl.org/dc/terms/identifier,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=89373
1,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#1284657,http://purl.org/dc/terms/isPartOf,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU
2,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#1706375,https://data.emobon.embrc.eu/ns/product#ofSample,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_66
3,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#1182780,http://purl.org/dc/terms/identifier,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1182780
4,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=469,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,https://schema.org/Taxon
5,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#60004,https://data.emobon.embrc.eu/ns/product#ssuRNA,1.0
6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=444,http://purl.org/dc/terms/taxonRank,family
7,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=186801,http://purl.org/dc/terms/scientificName,Clostridia
8,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#1752723,http://purl.org/dc/terms/identifier,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1752723
9,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=85033,http://purl.org/dc/terms/title,Sporichthyaceae


## Translate SSU triples into a taxonomy DF

In [12]:
# Taxonomic ranks that get their own column, ordered from broadest to narrowest.
RANKS = [
    'superkingdom', 'kingdom', 'phylum', 'class',
    'order', 'family', 'genus', 'species',
]
# Full per-row schema: identifier and abundance first, then one column per rank.
COLUMNS = ['ncbi_tax_id', 'abundance', *RANKS]

# ----- helper functions -----------------------------------------------------
def as_str(node):
    """Return ``str(node)``, passing ``None`` through unchanged."""
    if node is None:
        return None
    return str(node)

def extract_ncbi_id(uri_str):
    """Pull the numeric taxid out of an NCBI-style URI.

    Three shapes are recognised, tried in this order:
    an ``id=<digits>`` query parameter, a trailing ``#<digits>`` fragment,
    and an all-digit path segment (``/<digits>`` followed by the end of the
    string or by ``/``, ``?`` or ``#``).

    Returns the digits as a string, or ``None`` when ``uri_str`` is empty/None
    or none of the shapes match.
    """
    if not uri_str:
        return None

    # Order matters: the first pattern that matches decides the result.
    patterns = (
        r'[?&]id=(\d+)',
        r'#(\d+)$',
        r'/(\d+)(?:$|[/?#])',
    )
    for pattern in patterns:
        found = re.search(pattern, uri_str)
        if found:
            return found.group(1)
    return None

def last_path_segment(uri_str):
    """Return the most specific trailing component of a URI.

    A non-empty fragment takes precedence; otherwise the last ``/``-separated
    piece of the path is used, ignoring trailing slashes. Returns ``None`` for
    an empty/None input or when the path has no non-empty last segment.
    """
    if not uri_str:
        return None

    parsed = urlparse(uri_str)
    if parsed.fragment:
        return parsed.fragment

    tail = parsed.path.rstrip('/').rsplit('/', 1)[-1]
    return tail or None


In [13]:
# ----- load graph -----------------------------------------------------------
g = Graph()
g.parse("ssu_example.ttl", format="turtle")   # <-- change path if needed

# ----- namespaces used in your sample --------------------------------------
PROD = Namespace("https://data.emobon.embrc.eu/ns/product#")
DCT  = Namespace("http://purl.org/dc/terms/")
SCHEMA = Namespace("http://schema.org/")  # not required here but safe

# ----- accumulate rows keyed by (source_material_ID, ncbi_tax_id) -----------
acc = {}  # (source_id, taxid) -> dict of columns

def ensure_row(source_id, taxid):
    """Fetch the accumulator row for ``(source_id, taxid)``, creating it on first use.

    Falsy key parts are replaced by the placeholders ``"unknown_sample"`` and
    ``"unknown_taxid"`` for the lookup key only. A newly created row has every
    name in ``COLUMNS`` set to ``None`` except ``'ncbi_tax_id'``, which stores
    ``taxid`` exactly as passed in. The returned dict is the one held in
    ``acc``, so mutating it updates the accumulator.
    """
    key = (source_id or "unknown_sample", taxid or "unknown_taxid")
    row = acc.get(key)
    if row is None:
        row = dict.fromkeys(COLUMNS)
        row['ncbi_tax_id'] = taxid
        acc[key] = row
    return row

# ----- iterate TaxonomicAnnotation nodes -----------------------------------
# Every subject typed prod:TaxonomicAnnotation creates or updates one row in `acc`.
for ta_node in g.subjects(RDF.type, PROD.TaxonomicAnnotation):
    # read annotation-level properties
    # (the sample link is looked up under prod:ofSample first, then dct:ofSample)
    sample_uri = g.value(ta_node, PROD.ofSample) or g.value(ta_node, DCT.ofSample)
    source_id = last_path_segment(as_str(sample_uri))
    identifier = g.value(ta_node, DCT.identifier)  # expected to be NCBI URI
    identifier_s = as_str(identifier)
    ncbi_id = extract_ncbi_id(identifier_s)

    # abundance (prod:ssuRNA)
    abundance_term = g.value(ta_node, PROD.ssuRNA)
    # Try an integer conversion of the term's string form; anything that is not a
    # plain integer string (e.g. "1.0", or "None" when the term is missing) is kept
    # as a string here and coerced later by pd.to_numeric(..., errors='coerce').
    try:
        abundance = int(str(abundance_term))
    except Exception:
        abundance = str(abundance_term)

    # create/ensure row
    # (a repeated (source_id, ncbi_id) pair reuses the row and overwrites its abundance)
    row = ensure_row(source_id, ncbi_id)
    row['abundance'] = abundance

    # now look up the taxon node (identifier) and extract rank / scientificName
    if identifier is not None:
        tax_subject = URIRef(identifier_s)
        # dct:scientificName is preferred; dct:title is the fallback name
        sci_name_term = g.value(tax_subject, DCT.scientificName) or g.value(tax_subject, DCT.title)
        rank_term = g.value(tax_subject, DCT.taxonRank)
        sci_name = as_str(sci_name_term)
        rank = as_str(rank_term).lower() if rank_term is not None else None

        if rank and sci_name:
            # if rank is one of our expected ranks, store at that column
            # (ranks outside RANKS are silently ignored)
            if rank in RANKS:
                row[rank] = sci_name
        else:
            # A taxon without both a rank and a name aborts the whole cell.
            raise ValueError(f"Missing rank or scientific name for taxon {identifier_s}")

# ----- build final DataFrame ------------------------------------------------
# Flatten `acc` into one record per (source_material_ID, ncbi_tax_id) key.
# Note: the dicts stored in `acc` are updated in place.
out_rows = []
for (source_id, taxid), vals in acc.items():
    # enforce ncbi_tax_id numeric when possible; the placeholder key becomes None
    try:
        vals['ncbi_tax_id'] = int(taxid) if (taxid and taxid != "unknown_taxid") else None
    except (TypeError, ValueError):
        # non-numeric identifier: keep it as found
        vals['ncbi_tax_id'] = taxid
    vals['source_material_ID'] = source_id
    out_rows.append(vals)

# Passing the column list explicitly keeps the frame well-formed even when no
# annotation was found (an empty `acc` would otherwise give a column-less frame
# and the lookups below would raise KeyError).
df = pd.DataFrame(out_rows, columns=COLUMNS + ['source_material_ID'])

# set index like your example and reorder columns
df['source_material_ID'] = df['source_material_ID'].fillna('unknown_sample')
df['ncbi_tax_id'] = df['ncbi_tax_id'].fillna('unknown_taxid')

df = df.set_index(['source_material_ID', 'ncbi_tax_id'])
final_cols = ['abundance'] + RANKS
# ensure columns exist in the DataFrame before slicing
final_cols = [c for c in final_cols if c in df.columns]
df = df[final_cols].sort_index()

# convert abundance to numeric (if any); values that cannot be parsed become NaN
if 'abundance' in df.columns:
    df['abundance'] = pd.to_numeric(df['abundance'], errors='coerce')

print(df.shape)
df.head(20)

(388, 9)


Unnamed: 0_level_0,Unnamed: 1_level_0,abundance,superkingdom,kingdom,phylum,class,order,family,genus,species
source_material_ID,ncbi_tax_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
EMOBON_AAOT_Wa_66,2,783.0,,,,,,,,
EMOBON_AAOT_Wa_66,29,5.0,,,,,Myxococcales,,,
EMOBON_AAOT_Wa_66,81,2.0,,,,,,,Hyphomicrobium,
EMOBON_AAOT_Wa_66,112,11.0,,,,,Planctomycetales,,,
EMOBON_AAOT_Wa_66,136,1.0,,,,,Spirochaetales,,,
EMOBON_AAOT_Wa_66,137,1.0,,,,,,Spirochaetaceae,,
EMOBON_AAOT_Wa_66,226,15.0,,,,,,,Alteromonas,
EMOBON_AAOT_Wa_66,237,12.0,,,,,,,Flavobacterium,
EMOBON_AAOT_Wa_66,265,10.0,,,,,,,Paracoccus,
EMOBON_AAOT_Wa_66,279,1.0,,,,,,,Xanthobacter,
