## Example of RO-Crate ingestion and Fuseki SPARQL
- set up a local SPARQL database
- get several published RO-Crates
- create a graph and add it to the SPARQL endpoint

This notebook should mirror the [tutorial](https://py-ualg.github.io/biohap/biodata_pt/training_2511.html), working directly on `fuseki`.

### Fuseki setup
Fuseki is a Java-based SPARQL endpoint server.

Install Java:
```
sudo apt update
sudo apt install -y openjdk-17-jre-headless
java -version
```

- download the zip file from https://jena.apache.org/download/index.cgi
- run the fuseki server script
```
cd apache-jena-fuseki-5.6.0/
./fuseki-server
```
The server is then available at
http://localhost:3030/#/

### Preparation of a dataset
- I want to keep this in pure Python
- use `rdflib rdflib-jsonld requests`

In [1]:
import pandas as pd
import requests
import os
import json
import re
from rdflib import Graph
from urllib.parse import quote_plus, urljoin

### Methods to get RO-Crates from GH
- it is easier to clone the `https://github.com/emo-bon/analysis-results-cluster-01-crate` repo

In [2]:
def jsonld_to_rdflib(jsonld_text, base=None):
    """
    Build an rdflib ``Graph`` from a JSON-LD document supplied as text.

    Parameters
    ----------
    jsonld_text : str
        JSON-LD document to parse.
    base : str, optional
        Forwarded to ``Graph.parse`` as ``publicID``; may be left as None.

    Returns
    -------
    rdflib.Graph
        A new graph holding the parsed triples.
    """
    graph = Graph()
    graph.parse(
        data=jsonld_text,
        publicID=base,
        format="json-ld",
    )
    return graph


In [3]:
# Local checkout of the analysis-results-cluster-01-crate repository
rocrate_folder = "/home/david-palecek/coding/ro-crates/analysis-results-cluster-01-crate/"

# Read one crate's Turtle serialization as text. The encoding is given
# explicitly because the content is re-encoded as UTF-8 for the upload;
# relying on the locale default could corrupt non-ASCII characters.
with open(
    os.path.join(rocrate_folder, "EMOBON_AAOT_Wa_66-ro-crate.ttl"),
    "r",
    encoding="utf-8",
) as f:
    ttl_content = f.read()

In [4]:
# Graph Store Protocol (GSP) endpoint of the Fuseki dataset
fuseki_url = "http://localhost:3030/rocrate_python/data"
headers = {"Content-Type": "text/turtle"}

# Send the Turtle text, encoded as UTF-8, with a PUT request
resp = requests.put(
    fuseki_url,
    data=ttl_content.encode("utf-8"),
    headers=headers,
    timeout=60,
)

# Turn an HTTP error status into an exception that carries the server reply
try:
    resp.raise_for_status()
except requests.HTTPError as err:
    raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from err

### Loop over `ttl`s in a folder

In [5]:
# Fuseki connection settings
fuseki_host = "http://localhost:3030"
dataset = "rocrate_python"
gsp_endpoint = f"{fuseki_host}/{dataset}/data"
headers = {"Content-Type": "text/turtle"}  # payload is Turtle

for filename in os.listdir(rocrate_folder):
    # Only files named "<crate>-ro-crate.ttl" are uploaded
    if filename.endswith("-ro-crate.ttl"):
        path = os.path.join(rocrate_folder, filename)
        print("Uploading (append):", filename)
        with open(path, "rb") as f:
            ttl_bytes = f.read()

        # POST adds the triples to what is already stored (PUT would replace).
        # No graph parameter is sent with the request.
        resp = requests.post(gsp_endpoint, data=ttl_bytes, headers=headers, timeout=60)

        try:
            resp.raise_for_status()
        except requests.HTTPError:
            # Show what the server said, then abort the whole loop
            print("Upload failed for", filename, resp.status_code)
            print("Server response:", resp.text[:1000])
            raise

print("All files uploaded (appended).")

Uploading (append): EMOBON_AAOT_Wa_67-ro-crate.ttl
Uploading (append): EMOBON_HCMR-1_Wa_6-ro-crate.ttl
Uploading (append): EMOBON_AAOT_Wa_66-ro-crate.ttl
Uploading (append): EMOBON_OSD74_Wa_21-ro-crate.ttl
All files uploaded (appended).


### Sidetrack to DVC

In [None]:
# Set AWS_NO_SIGN_REQUEST and drop AWS_PROFILE from the environment
# (presumably so the bucket can be read without credentials -- confirm).
os.environ['AWS_NO_SIGN_REQUEST'] = '1'
os.environ.pop('AWS_PROFILE', None)   # avoid using a missing profile

# NOTE(review): these imports come after the environment changes above;
# this looks deliberate, so confirm before hoisting them to the top cell.
import dvc.api
from dvc.api import DVCFileSystem
import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Open the local crate checkout as a DVC filesystem and list every path
# found under its root.
dvc_fs = DVCFileSystem(rocrate_folder)
files = list(dvc_fs.find("/", detail=False))
print(files)

['/.github/workflows/rocrate_to_pages.yml', '/EMOBON_AAOT_Wa_66-ro-crate.ttl', '/EMOBON_AAOT_Wa_66-ro-crate/.gitignore', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.cmsearch.all.tblout.deoverlapped.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.motus.tsv.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.unfiltered_fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged_CDS.faa.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged_CDS.ffn.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_1_HVWGWDSX5.UDI100_clean.fastq.trimmed.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_1_HVWGWDSX5.UDI100_clean.fastq.trimmed.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_2_HVWGWDSX5.UDI100_clean.fastq.trimmed.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_2_HVWGWDSX5.UDI100_clean.fastq.trimmed.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/config.yml', '/EMOBON_AAOT_Wa_66-ro-crate/fastp.html', '/

In [47]:
# Keep only the SSU taxonomy summary tables from the DVC listing
filt_files = [
    tracked_path
    for tracked_path in files
    if tracked_path.endswith("SSU-taxonomy-summary.ttl")
]
print(filt_files)

['/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl', '/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl', '/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl', '/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl']



In [48]:
def get_single_file_s3(repo_folder, path: str):
    """
    Download one DVC-tracked file from the S3 store and save a local copy.

    Parameters
    ----------
    repo_folder : str
        Repository handed to ``dvc.api.get_url`` as ``repo``.
    path : str
        Path of the tracked file inside the repository, e.g.
        ``/<crate>/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl``.

    Returns
    -------
    tuple
        ``(crate_name, data)`` where ``crate_name`` is the second
        "/"-separated component of ``path`` and ``data`` is the raw
        file content as bytes.
    """
    # Custom S3 endpoint (non-AWS)
    endpoint_url = "https://s3.mesocentre.uca.fr"
    bucket = "mgf-data-products"

    # Ask DVC where the object lives, then keep what follows "<bucket>/"
    remote_url = dvc.api.get_url(path=path, repo=repo_folder)
    key = remote_url.split(f"{bucket}/")[-1]

    # Unsigned client: no credentials are required
    client = boto3.client(
        "s3",
        endpoint_url=endpoint_url,
        config=Config(signature_version=UNSIGNED),
    )

    # Fetch the object and read it fully into memory
    response = client.get_object(Bucket=bucket, Key=key)
    data = response["Body"].read()

    # Local copy is named "<crate>_<original file name>"
    parts = path.split("/")
    crate_name = parts[1]
    filename = crate_name + "_" + parts[-1]
    with open(filename, "wb") as out:
        out.write(data)

    print("Downloaded", len(data), "bytes from", endpoint_url)
    return crate_name, data
def create_upload_ds(name, contents, fuseki_host="http://localhost:3030"):
    """
    Create an in-memory Fuseki dataset and upload Turtle data into it.

    Parameters
    ----------
    name : str
        Name of the dataset to create; also used in the upload URL.
    contents : bytes or str
        Turtle document to send to the dataset's ``/data`` endpoint.
    fuseki_host : str, optional
        Base URL of the Fuseki server.

    Returns
    -------
    requests.Response
        Response of the data upload request.

    Raises
    ------
    RuntimeError
        If the dataset cannot be created or the upload is rejected.
    """
    # Create the dataset through the admin endpoint. A dict passed as
    # ``data`` is form-encoded by requests; the header makes that explicit.
    resp = requests.post(
        f"{fuseki_host}/$/datasets",
        data={
            "dbName": name,   # dataset name
            "dbType": "mem",  # in-memory
        },
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        timeout=60,
    )
    print(resp.status_code, resp.text)

    # 409 Conflict is tolerated: the dataset is already registered (e.g. the
    # cell was re-run), so the upload below can still proceed.
    if resp.status_code != 409:
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise RuntimeError(
                f"Dataset creation failed: {resp.status_code} {resp.text}"
            ) from e

    # POST the Turtle payload to the dataset's Graph Store Protocol endpoint
    resp = requests.post(
        f"{fuseki_host}/{name}/data",
        data=contents,
        headers={"Content-Type": "text/turtle"},  # sending Turtle
        timeout=60,
    )

    # Previously this response was dropped, so failed uploads went unnoticed
    try:
        resp.raise_for_status()
    except requests.HTTPError as e:
        raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from e

    return resp


### Get all the tables and upload them to separate graphs

In [49]:
# For every SSU taxonomy table: download it, then create a Fuseki dataset
# named after its crate and upload the table into that dataset.
for file in filt_files:
    # name is the crate folder name, contents the raw bytes of the table
    name, contents = get_single_file_s3(rocrate_folder, file)
    print("Uploading (append):", file)

    create_upload_ds(name, contents)

Downloaded 233217 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
 233217 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
200 
Downloaded 240253 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
200 
Downloaded 215932 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
200 
Downloaded 628259 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
200 


### Serialization to .ttl

In [4]:
def serialize_rocrate_to_ttl(repo_url, output_file_path):
    """
    Download one RO-Crate metadata document and write it out as Turtle.

    Parameters
    ----------
    repo_url : str
        Location of the RO-Crate metadata; passed unchanged to
        ``fetch_rocrate_json_from_github``.
    output_file_path : str
        Destination of the Turtle file.

    Returns
    -------
    str
        ``output_file_path``, unchanged.
    """
    # JSON-LD text -> rdflib graph
    graph = jsonld_to_rdflib(fetch_rocrate_json_from_github(repo_url))

    # rdflib graph -> Turtle text -> file
    turtle_text = graph.serialize(format='turtle')
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write(turtle_text)

    print(f"RO-Crate serialized to TTL file: {output_file_path}")
    print(f"Graph contains {len(graph)} triples")

    return output_file_path


def serialize_multiple_rocrates_to_ttl(repo_urls, output_file_path):
    """
    Merge several RO-Crate metadata documents into a single Turtle file.

    Each URL is fetched and parsed on its own; a failure is printed and
    the remaining URLs are still processed.

    Parameters
    ----------
    repo_urls : list
        Locations of the RO-Crate metadata documents; each is passed
        unchanged to ``fetch_rocrate_json_from_github``.
    output_file_path : str
        Destination of the combined Turtle file.

    Returns
    -------
    str
        ``output_file_path``, unchanged.
    """
    merged = Graph()

    for position, repo_url in enumerate(repo_urls, start=1):
        try:
            print(f"Processing {position}/{len(repo_urls)}: {repo_url}")
            crate_graph = jsonld_to_rdflib(fetch_rocrate_json_from_github(repo_url))

            # In-place graph addition copies every triple into the merged graph
            merged += crate_graph

            print(f"  Added {len(crate_graph)} triples")

        except Exception as err:
            # Best effort: report the problem and move on to the next crate
            print(f"  FAILED to process {repo_url}: {err}")

    # Write the merged graph to disk as Turtle
    turtle_text = merged.serialize(format='turtle')
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write(turtle_text)

    print(f"\nCombined RO-Crates serialized to TTL file: {output_file_path}")
    print(f"Combined graph contains {len(merged)} triples")

    return output_file_path

In [5]:
# Demonstration of the two TTL serialization helpers

# --- one crate -> one TTL file ---
single_repo = 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json'

try:
    ttl_file = serialize_rocrate_to_ttl(single_repo, "emobon_bpns_so_34.ttl")
    print(f"Single RO-Crate saved to: {ttl_file}")
except Exception as err:
    print(f"Error serializing single RO-Crate: {err}")

# Visual separator between the two examples
print(f"\n{'=' * 50}\n")

# --- several crates -> one combined TTL file ---
# Uncomment further entries to include more crates.
repos = [
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json",
    # "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json",
    # 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json',
    # "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json"
]

try:
    combined_ttl = serialize_multiple_rocrates_to_ttl(
        repos,
        "combined_emobon_rocrates.ttl",
    )
    print(f"Combined RO-Crates saved to: {combined_ttl}")
except Exception as err:
    print(f"Error serializing multiple RO-Crates: {err}")

RO-Crate serialized to TTL file: emobon_bpns_so_34.ttl
Graph contains 481 triples
Single RO-Crate saved to: emobon_bpns_so_34.ttl


Processing 1/2: https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json
  Added 481 triples
Processing 2/2: https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json
  Added 486 triples

Combined RO-Crates serialized to TTL file: combined_emobon_rocrates.ttl
Combined graph contains 655 triples
Combined RO-Crates saved to: combined_emobon_rocrates.ttl


### Upload of the combined graph

In [10]:
# Load the combined Turtle file written by the serialization cell
with open(combined_ttl, 'r', encoding='utf-8') as f:
    ttl_content = f.read()

uri = "http://example.org/graphs/emobon_combined"
# Use the Graph Store Protocol endpoint for Fuseki. The graph URI travels as
# a query parameter, so it is percent-encoded: characters such as "#", "&"
# or "?" in a graph name would otherwise break or truncate the request URL.
gsp_url = "http://localhost:3030/rocrate/data?graph=" + quote_plus(uri)

headers = {"Content-Type": "text/turtle"}

# PUT replaces the content of the target graph with the uploaded triples
resp = requests.put(gsp_url, data=ttl_content.encode("utf-8"), headers=headers, timeout=60)

# raise for HTTP error, including the server reply in the message
try:
    resp.raise_for_status()
except requests.HTTPError as e:
    raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from e

In [11]:
# Display the graph URI assigned in the upload cell
uri

'http://example.org/graphs/emobon_combined'

### Original single graph upload

In [None]:
# Upload a single crate into its own graph via process_github_rocrate
repo = 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json'
fuseki_base = "http://localhost:3030/rocrate"
try:
    r = process_github_rocrate(
        repo,
        fuseki_base,
        fmt="turtle",
        graph_uri="http://example.org/rocrate/EMOBON_BPNS_So_34",
    )
    print("Upload successful:", r.status_code)
except Exception as err:
    # Any failure is reported instead of stopping the notebook
    print("Error:", err)

Upload successful: 200


In [14]:
# Count the triples matched by a bare triple pattern
q = "SELECT (COUNT(*) AS ?c) WHERE { ?s ?p ?o }"
r = requests.get(
    "http://localhost:3030/rocrate/query",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # never hang the notebook on an unresponsive server
)
# Fail with the HTTP error rather than a confusing JSON decoding error
r.raise_for_status()
print(r.json())

{'head': {'vars': ['c']}, 'results': {'bindings': [{'c': {'type': 'literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '481'}}]}}


In [16]:
def sparql_json_to_df(sparql_json):
    """
    Turn the JSON result of a SPARQL SELECT query into a pandas DataFrame.

    Parameters
    ----------
    sparql_json : dict
        Parsed ``application/sparql-results+json`` document.

    Returns
    -------
    pd.DataFrame
        One column per entry in ``head.vars`` and one row per binding.
        Each cell holds the ``"value"`` string of the binding, or None
        when the variable is unbound in that row.
    """
    columns = sparql_json.get("head", {}).get("vars", [])
    bindings = sparql_json.get("results", {}).get("bindings", [])

    # Unbound variables are simply absent from a binding, hence the check
    records = [
        {
            name: (binding[name]["value"] if name in binding else None)
            for name in columns
        }
        for binding in bindings
    ]

    return pd.DataFrame(records, columns=columns)

In [17]:
# Convert the JSON body of the response held in r into a DataFrame and show it
df = sparql_json_to_df(r.json())
print(df)

     c
0  481


### Add more than one `repository`
- i.e. several EMO-BON RO-Crates

In [22]:
# Crates to upload, each into its own graph
repos = [
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json",
    'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json',
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json"
]
for repo_url in repos:
    try:
        # The graph name ends with the crate folder, i.e. the second-to-last
        # "/"-separated component of the metadata URL
        crate_folder = repo_url.split('/')[-2]
        resp = process_github_rocrate(
            repo_url,
            "http://localhost:3030/rocrate",
            graph_uri=f"http://example.org/rocrate/{crate_folder}",
        )
        print(repo_url, "->", resp.status_code)
    except Exception as err:
        # Report the failure and continue with the next crate
        print("FAILED", repo_url, err)

https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json -> 201


In [25]:
# Count the triples matched by a bare triple pattern.
# NOTE(review): a pattern outside GRAPH { ... } is evaluated against the
# dataset's default graph; if the crates were stored in named graphs they are
# presumably not counted here unless the dataset exposes a union default
# graph -- confirm, or count with GRAPH ?g { ?s ?p ?o } instead.
q = "SELECT (COUNT(*) AS ?c) WHERE { ?s ?p ?o }"
r = requests.get(
    "http://localhost:3030/rocrate/query",
    params={"query": q},
    headers={"Accept": "application/sparql-results+json"},
    timeout=60,  # never hang the notebook on an unresponsive server
)
# Fail with the HTTP error rather than a confusing JSON decoding error
r.raise_for_status()
df = sparql_json_to_df(r.json())
print(df)

     c
0  481
