## Example of future for the RO-Crates and SparQL
- set up local SparQL DB
- based on few RO-Crates so far published
- test methods to query the SparQL
  - directly
  - or using UDAL


In [1]:
import sys
import os
import logging
from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    try:
        os.system('git clone https://github.com/palec87/momics-demos.git')
        logger.info(f"Repository cloned")
    except OSError as e:
        logger.info(f"An error occurred while cloning the repository: {e}")

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time beacause of many dependencies
    os.system('pip install marine-omics')

from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging
reconfig_logger()

# Determine the notebook environment
env = get_notebook_environment()

init_setup()
logger.info(f"Environment: {env}")

INFO | root | Logging.basicConfig completed successfully
INFO | Diversity analysis app | Environment: vscode
INFO | Diversity analysis app | Environment: vscode


### Fuseki setup
Fuseki is a java SparQL endpoint server.

Get java
```
sudo apt update
sudo apt install -y openjdk-17-jre-headless
java -version
```

- download the zip file from https://jena.apache.org/download/index.cgi
- run the fuseki server script
```
cd apache-jena-fuseki-5.5.0/
./fuseki-server
```
the server is at
http://localhost:3030/#/

### Preparation of a dataset
- I want to keep this in pure python
- use `rdflib rdflib-jsonld requests`

In [2]:
import pandas as pd
import requests
import json
import re
from rdflib import Graph
from urllib.parse import quote_plus, urljoin

### Methods to get RO-Crates from GH

In [3]:
def fetch_rocrate_json_from_github(url):
    """
    Try to obtain the ro-crate-metadata.json (or .jsonld) from a GitHub repo URL
    or a raw.githubusercontent.com URL.
    Returns text of JSON-LD.
    """
    # quick case: raw URL or direct file link
    if "raw.githubusercontent.com" in url or url.endswith(".json") or url.endswith(".jsonld"):
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        return r.text

    # try to parse owner/repo from a normal GitHub URL
    m = re.search(r"github.com/([^/]+)/([^/]+)", url)
    if not m:
        raise ValueError("Provide a GitHub repo url or direct raw URL to ro-crate-metadata.json")

    owner, repo = m.group(1), m.group(2).replace(".git", "")
    # try main and master branches, common filenames
    branches = ["main", "master"]
    filenames = ["ro-crate-metadata.json", "ro-crate-metadata.jsonld", "ro-crate-metadata.jsonld.json"]
    for br in branches:
        for fn in filenames:
            raw = f"https://raw.githubusercontent.com/{owner}/{repo}/{br}/{fn}"
            r = requests.get(raw, timeout=15)
            if r.status_code == 200:
                return r.text
    raise FileNotFoundError(f"Could not find ro-crate-metadata.json in {owner}/{repo} (tried main/master). "
                            "If your file is elsewhere, pass its raw URL directly.")


def jsonld_to_rdflib(jsonld_text, base=None):
    """
    Parse JSON-LD text into an rdflib.Graph and return the Graph.
    """
    g = Graph()
    # rdflib accepts a JSON-LD string as input; base is optional
    g.parse(data=jsonld_text, format="json-ld", publicID=base)
    return g


def upload_graph_to_fuseki(graph, fuseki_dataset_base, graph_uri=None, method="POST", fmt="turtle"):
    """
    Upload an rdflib.Graph to Fuseki using the Graph Store HTTP endpoint.
    - fuseki_dataset_base: e.g. "http://localhost:3030/rocrate" (no trailing slash)
    - graph_uri: if provided, data will be loaded into that named graph (as a graph IRI)
    - method: "POST" (append) or "PUT" (replace)
    - fmt: serialization format for upload ('turtle' recommended)
    Returns requests.Response
    """
    if not fuseki_dataset_base.endswith("/"):
        fuseki_dataset_base = fuseki_dataset_base
    gsp_url = fuseki_dataset_base.rstrip("/") + "/data"
    params = {}
    headers = {}

    # serialize
    payload = graph.serialize(format=fmt)
    # content type mapping
    ct = {
        "turtle": "text/turtle",
        "nt": "application/n-triples",
        "json-ld": "application/ld+json",
        "trig": "application/trig",
        "nquads": "application/n-quads"
    }.get(fmt, "text/turtle")

    headers["Content-Type"] = ct

    # add graph query parameter if named graph
    if graph_uri:
        gsp_url = gsp_url + "?graph=" + quote_plus(graph_uri)

    # choose requests method
    if method.upper() == "POST":
        resp = requests.post(gsp_url, data=payload.encode("utf-8"), headers=headers, timeout=60)
    elif method.upper() == "PUT":
        resp = requests.put(gsp_url, data=payload.encode("utf-8"), headers=headers, timeout=60)
    else:
        raise ValueError("method must be 'POST' or 'PUT'")

    # raise for HTTP error
    try:
        resp.raise_for_status()
    except requests.HTTPError as e:
        raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from e

    return resp


def process_github_rocrate(repo_url, fuseki_dataset_base, graph_uri=None, branch=None, fmt="turtle"):
    """
    High level: fetch ro-crate metadata from GitHub, parse, upload.
    - repo_url: github repo url or raw file url
    - fuseki_dataset_base: e.g. 'http://localhost:3030/rocrate'
    - graph_uri: optional named graph IRI; if None uses default graph
    - fmt: how to serialize before upload (turtle is robust)
    """
    jsonld_text = fetch_rocrate_json_from_github(repo_url)
    g = jsonld_to_rdflib(jsonld_text)
    # optional: add provenance triple identifying source
    if graph_uri:
        # keep graph_uri separate (we upload into that graph)
        pass
    resp = upload_graph_to_fuseki(g, fuseki_dataset_base, graph_uri=graph_uri, method="POST", fmt=fmt)
    return resp



### Serialization to .ttl

In [4]:
def serialize_rocrate_to_ttl(repo_url, output_file_path):
    """
    Fetch RO-Crate metadata from GitHub, parse it, and serialize to a TTL file.
    
    Parameters
    ----------
    repo_url : str
        GitHub repo URL or raw file URL to ro-crate-metadata.json
    output_file_path : str
        Path where the TTL file should be saved
    
    Returns
    -------
    str
        Path to the created TTL file
    """
    # Fetch and parse the RO-Crate metadata
    jsonld_text = fetch_rocrate_json_from_github(repo_url)
    g = jsonld_to_rdflib(jsonld_text)
    
    # Serialize to TTL format and save to file
    ttl_content = g.serialize(format='turtle')
    
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(ttl_content)
    
    print(f"RO-Crate serialized to TTL file: {output_file_path}")
    print(f"Graph contains {len(g)} triples")
    
    return output_file_path


def serialize_multiple_rocrates_to_ttl(repo_urls, output_file_path):
    """
    Fetch multiple RO-Crate metadata files from GitHub, parse them, 
    and serialize all to a single TTL file.
    
    Parameters
    ----------
    repo_urls : list
        List of GitHub repo URLs or raw file URLs to ro-crate-metadata.json
    output_file_path : str
        Path where the combined TTL file should be saved
    
    Returns
    -------
    str
        Path to the created TTL file
    """
    # Create a combined graph
    combined_graph = Graph()
    
    for i, repo_url in enumerate(repo_urls):
        try:
            print(f"Processing {i+1}/{len(repo_urls)}: {repo_url}")
            jsonld_text = fetch_rocrate_json_from_github(repo_url)
            g = jsonld_to_rdflib(jsonld_text)
            
            # Add all triples from this graph to the combined graph
            for triple in g:
                combined_graph.add(triple)
                
            print(f"  Added {len(g)} triples")
            
        except Exception as e:
            print(f"  FAILED to process {repo_url}: {e}")
    
    # Serialize combined graph to TTL format and save to file
    ttl_content = combined_graph.serialize(format='turtle')
    
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(ttl_content)
    
    print(f"\nCombined RO-Crates serialized to TTL file: {output_file_path}")
    print(f"Combined graph contains {len(combined_graph)} triples")
    
    return output_file_path

In [5]:
# Example usage of TTL serialization functions

# Serialize a single RO-Crate to TTL
single_repo = 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json'

try:
    ttl_file = serialize_rocrate_to_ttl(single_repo, "emobon_bpns_so_34.ttl")
    print(f"Single RO-Crate saved to: {ttl_file}")
except Exception as e:
    print(f"Error serializing single RO-Crate: {e}")

print("\n" + "="*50 + "\n")

# Serialize multiple RO-Crates to a combined TTL file
repos = [
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json",
    # "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json",
    # 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json',
    # "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json"
]

try:
    combined_ttl = serialize_multiple_rocrates_to_ttl(repos, "combined_emobon_rocrates.ttl")
    print(f"Combined RO-Crates saved to: {combined_ttl}")
except Exception as e:
    print(f"Error serializing multiple RO-Crates: {e}")

RO-Crate serialized to TTL file: emobon_bpns_so_34.ttl
Graph contains 481 triples
Single RO-Crate saved to: emobon_bpns_so_34.ttl


Processing 1/2: https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json
  Added 481 triples
Processing 2/2: https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json
  Added 486 triples

Combined RO-Crates serialized to TTL file: combined_emobon_rocrates.ttl
Combined graph contains 655 triples
Combined RO-Crates saved to: combined_emobon_rocrates.ttl


### Upload of the combined graph

In [10]:
with open(combined_ttl, 'r', encoding='utf-8') as f:
    ttl_content = f.read()

uri = "http://example.org/graphs/emobon_combined"
# Use the Graph Store Protocol endpoint for Fuseki
gsp_url = "http://localhost:3030/rocrate/data?graph=" + uri

headers = {"Content-Type": "text/turtle"}

resp = requests.put(gsp_url, data=ttl_content.encode("utf-8"), headers=headers, timeout=60)

# raise for HTTP error
try:
    resp.raise_for_status()
except requests.HTTPError as e:
    raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from e

In [11]:
uri

'http://example.org/graphs/emobon_combined'

### Original single graph upload

In [None]:
repo = 'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json'
fuseki_base = "http://localhost:3030/rocrate"
try:
    r = process_github_rocrate(
        repo,
        fuseki_base,
        graph_uri="http://example.org/rocrate/EMOBON_BPNS_So_34",
        fmt="turtle")
    print("Upload successful:", r.status_code)
except Exception as e:
    print("Error:", e)

Upload successful: 200


In [14]:
q = "SELECT (COUNT(*) AS ?c) WHERE { ?s ?p ?o }"
r = requests.get("http://localhost:3030/rocrate/query", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
print(r.json())

{'head': {'vars': ['c']}, 'results': {'bindings': [{'c': {'type': 'literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '481'}}]}}


In [16]:
def sparql_json_to_df(sparql_json):
    """
    Convert a SPARQL SELECT query JSON result to a pandas DataFrame.
    
    Parameters
    ----------
    sparql_json : dict
        JSON returned by Fuseki / SPARQL endpoint with Accept: application/sparql-results+json
    
    Returns
    -------
    pd.DataFrame
    """
    vars_ = sparql_json.get("head", {}).get("vars", [])
    rows = []

    for binding in sparql_json.get("results", {}).get("bindings", []):
        row = {}
        for var in vars_:
            # Some results might not bind all variables
            if var in binding:
                row[var] = binding[var]["value"]
            else:
                row[var] = None
        rows.append(row)

    df = pd.DataFrame(rows, columns=vars_)
    return df

In [17]:
df = sparql_json_to_df(r.json())
print(df)

     c
0  481


### Add more than one `repository`
- ie. several emo-bon ro-crates

In [22]:
repos = [
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json",
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json",
    'https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json',
    "https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json"
]
for r in repos:
    try:
        resp = process_github_rocrate(
            r, "http://localhost:3030/rocrate",
            graph_uri=f"http://example.org/rocrate/{r.split('/')[-2]}")
        print(r, "->", resp.status_code)
    except Exception as e:
        print("FAILED", r, e)

https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_BPNS_So_34-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_1-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_NRMCB_So_11-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_OOB_So_25-ro-crate/ro-crate-metadata.json -> 201
https://raw.githubusercontent.com/emo-bon/analysis-results-cluster-01-crate/refs/heads/main/EMOBON_RFormosa_So_11-ro-crate/ro-crate-metadata.json -> 201


In [25]:
q = "SELECT (COUNT(*) AS ?c) WHERE { ?s ?p ?o }"
r = requests.get("http://localhost:3030/rocrate/query", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
print(df)

     c
0  481
