# Example of RO-Crates ingestion and fuseki SparQL
- set up local SparQL DB
- get several RO-Crates published
- create a graph and add it to the SparQL

This NB should mirror the [tutorial](https://py-ualg.github.io/biohap/biodata_pt/training_2511.html) working directly on `fuseki`

**Fuseki setup**

Fuseki is a java SparQL endpoint server.

Get java
```
sudo apt update
sudo apt install -y openjdk-17-jre-headless
java -version
```

- download the zip file from https://jena.apache.org/download/index.cgi
- run the fuseki server script
```
cd apache-jena-fuseki-5.6.0/
./fuseki-server
```
the server is at
http://localhost:3030/#/

**Preparation of a dataset**

- I want to keep this in pure python
- use `rdflib rdflib-jsonld requests`

In [1]:
import pandas as pd
import requests
import os
import json
import re
from rdflib import Graph
from urllib.parse import quote_plus, urljoin

pd.options.display.max_columns = None
pd.set_option("display.max_colwidth", None)

## 1. Get RO-Crates from GH
- easiest is to clone the `https://github.com/emo-bon/analysis-results-cluster-01-crate` repo
- other half hardcoded option can be found in `fuseki_create_sparql_endpoint_GH.ipynb`

In [2]:
def jsonld_to_rdflib(jsonld_text, base=None):
    """
    Parse JSON-LD text into an rdflib.Graph and return the Graph.
    """
    g = Graph()
    # rdflib accepts a JSON-LD string as input; base is optional
    g.parse(data=jsonld_text, format="json-ld", publicID=base)
    return g


def fuseki_create_dataset(name: str) -> None:
    """Create a new in-memory dataset in a local Apache Jena Fuseki server.
    Args:
        name (str): Name of the dataset to create.
    
    Returns:
        None
    """
    fuseki_admin_url = "http://localhost:3030/$/datasets"
    # Form data
    form_data = {
        "dbName": name,   # dataset name
        "dbType": "mem"       # in-memory
    }

    # Headers to enforce form encoding
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    # POST request
    resp = requests.post(fuseki_admin_url, data=form_data, headers=headers)
    if resp.status_code != 200:
        print("Dataset creation failed for", name, resp.status_code)
        print("Server response:", resp.text[:1000])
    else:
        print("Dataset created:", name)


def sparql_json_to_df(sparql_json):
    """
    Convert a SPARQL SELECT query JSON result to a pandas DataFrame.
    
    Parameters
    ----------
    sparql_json : dict
        JSON returned by Fuseki / SPARQL endpoint with Accept: application/sparql-results+json
    
    Returns
    -------
    pd.DataFrame
    """
    vars_ = sparql_json.get("head", {}).get("vars", [])
    rows = []

    for binding in sparql_json.get("results", {}).get("bindings", []):
        row = {}
        for var in vars_:
            # Some results might not bind all variables
            if var in binding:
                row[var] = binding[var]["value"]
            else:
                row[var] = None
        rows.append(row)

    df = pd.DataFrame(rows, columns=vars_)
    return df

In [3]:
rocrate_folder = "/home/david-palecek/coding/ro-crates/analysis-results-cluster-01-crate/"
dataset = "emobon_python"

with open(os.path.join(rocrate_folder, "EMOBON_AAOT_Wa_66-ro-crate.ttl"), "r") as f:
    ttl_content = f.read()

In [4]:
# check if dataset exists, if not create it
fuseki_datasets_url = "http://localhost:3030/$/datasets"
resp = requests.get(fuseki_datasets_url)
resp.raise_for_status()
datasets_info = resp.json()
# print(json.dumps(datasets_info, indent=2))

existing_datasets = [ds["ds.name"] for ds in datasets_info["datasets"]]

print("Existing datasets:", existing_datasets)
if dataset not in existing_datasets:
    fuseki_create_dataset(dataset)


# upload TTL content to the dataset
fuseki_url = f"http://localhost:3030/{dataset}/data"
headers = {"Content-Type": "text/turtle"}

resp = requests.put(fuseki_url, data=ttl_content.encode("utf-8"), headers=headers, timeout=60)

# raise for HTTP error
try:
    resp.raise_for_status()
except requests.HTTPError as e:
    raise RuntimeError(f"Upload failed: {resp.status_code} {resp.text}") from e

Existing datasets: ['/emobon']
Dataset created: emobon_python


## 2. Ingestion to fuseki

### Loop over `ttl`s in a folder

In [5]:
fuseki_host = "http://localhost:3030"

gsp_endpoint = f"{fuseki_host}/{dataset}/data"
headers = {"Content-Type": "text/turtle"}     # sending Turtle

for filename in os.listdir(rocrate_folder):
    if not filename.endswith("-ro-crate.ttl"):
        continue

    path = os.path.join(rocrate_folder, filename)
    print("Uploading (append):", filename)
    with open(path, "rb") as f:
        ttl_bytes = f.read()

    # POST appends triples to the named graph (PUT would replace)
    resp = requests.post(
        gsp_endpoint,
        data=ttl_bytes,
        headers=headers,
        timeout=60,
    )

    try:
        resp.raise_for_status()
    except requests.HTTPError as e:
        print("Upload failed for", filename, resp.status_code)
        print("Server response:", resp.text[:1000])
        raise

print("All files uploaded (appended).")

Uploading (append): EMOBON_AAOT_Wa_67-ro-crate.ttl
Uploading (append): EMOBON_HCMR-1_Wa_6-ro-crate.ttl
Uploading (append): EMOBON_AAOT_Wa_66-ro-crate.ttl
Uploading (append): EMOBON_OSD74_Wa_21-ro-crate.ttl
All files uploaded (appended).


### sidetrack to `DVC`
- I want to get the ttl file from the NB, not putting the `sdo:downloadUrl` into the browser

In [6]:
os.environ['AWS_NO_SIGN_REQUEST'] = '1'
os.environ.pop('AWS_PROFILE', None)   # avoid using a missing profile

import dvc.api
from dvc.api import DVCFileSystem
import boto3
from botocore import UNSIGNED
from botocore.client import Config


In [7]:
def get_single_file_s3(repo_folder, path: str):
    url = dvc.api.get_url(
        path=path,
        repo=repo_folder,
    )
    # Custom S3 endpoint (non-AWS)
    endpoint_url = "https://s3.mesocentre.uca.fr"
    bucket = "mgf-data-products"
    key = url.split(f"{bucket}/")[-1]  # extract key from URL

    # Create S3 client that does NOT require credentials
    s3 = boto3.client(
        "s3",
        endpoint_url=endpoint_url,
        config=Config(signature_version=UNSIGNED),
    )

    # Fetch the object
    obj = s3.get_object(Bucket=bucket, Key=key)

    # Read contents into memory
    data = obj["Body"].read()

    filename = path.split("/")[1] + "_" + path.split("/")[-1]
    # Save to a local file if needed
    with open(filename, "wb") as f:
        f.write(data)

    print("Downloaded", len(data), "bytes from", endpoint_url)
    return path.split("/")[1], data


def create_upload_ds(name: str, contents: bytes) -> None:
    """
    Create a new dataset in Fuseki and upload contents to it.

    Args:
        name (str): Name of the dataset.
        contents (bytes): RDF data in Turtle format.
    
    Returns:
        None
    """
    fuseki_create_dataset(name)

    # Now upload the data to the named graph
    gsp_endpoint = f"http://localhost:3030/{name}/data"
    headers = {"Content-Type": "text/turtle"}     # sending Turtle
    resp = requests.post(
        gsp_endpoint,
        data=contents,
        headers=headers,
        timeout=60,
    )
    if resp.status_code != 200:
        print("Upload failed for dataset", name, resp.status_code)
        print("Server response:", resp.text[:1000])
    else:
        print("Upload succeeded for dataset", name)

In [8]:
dvc_fs = DVCFileSystem(rocrate_folder)
files = list(dvc_fs.find("/", detail=False))
print(files)

['/.github/workflows/rocrate_to_pages.yml', '/EMOBON_AAOT_Wa_66-ro-crate.ttl', '/EMOBON_AAOT_Wa_66-ro-crate/.gitignore', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.cmsearch.all.tblout.deoverlapped.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.motus.tsv.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged.unfiltered_fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged_CDS.faa.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB.merged_CDS.ffn.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_1_HVWGWDSX5.UDI100_clean.fastq.trimmed.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_1_HVWGWDSX5.UDI100_clean.fastq.trimmed.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_2_HVWGWDSX5.UDI100_clean.fastq.trimmed.fasta.bz2', '/EMOBON_AAOT_Wa_66-ro-crate/DBB_AAEPOSDA_1_2_HVWGWDSX5.UDI100_clean.fastq.trimmed.qc_summary', '/EMOBON_AAOT_Wa_66-ro-crate/config.yml', '/EMOBON_AAOT_Wa_66-ro-crate/fastp.html', '/

Filter SSU taxonomy tables

In [9]:
filt_files = [f for f in files if f.endswith("SSU-taxonomy-summary.ttl")]
[k for k in filt_files]

['/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl',
 '/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl',
 '/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl',
 '/EMOBON_OSD74_Wa_21-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl']

### Get SSU tables and upload them to a single graph

In [10]:
dataset = "ssu"
for file in filt_files:
    _, contents = get_single_file_s3(rocrate_folder, file)
    print("Uploading (append):", file)

    create_upload_ds(dataset, contents)

Downloaded 233217 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
 233217 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
Dataset created: ssu
Upload succeeded for dataset ssu
Downloaded 240253 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
Dataset creation failed for ssu 409
Server response: Name already registered '/ssu'

Upload succeeded for dataset ssu
Downloaded 215932 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary/SSU/SSU-taxonomy-summary.ttl
Dataset creation failed for ssu 409
Server response: Name already registered '/ssu'

Upload succeeded for dataset ssu
Downloaded 628259 bytes from https://s3.mesocentre.uca.fr
Uploading (append): /EMOBON_OSD74_Wa_21-ro-crate/taxo

## 3. SPARQL filtering
- Now we can demonstrate queries across several graphs
- This is the future added value to organize data in graphs
- Once somebody hosts MGnify data in SPARQL endpoint, you can query all MGnify/metaGOflow data at once

In [None]:
q = """
PREFIX prod:  <https://data.emobon.embrc.eu/ns/product#>
PREFIX dct:   <http://purl.org/dc/terms/>
PREFIX schema:<https://schema.org/>
PREFIX xsd:   <http://www.w3.org/2001/XMLSchema#>

SELECT ?annotation ?otuID ?abundance ?sample ?taxonIRI ?taxonName ?taxonRank
WHERE {
  # annotation node carrying the abundance
  ?annotation a prod:TaxonomicAnnotation ;
              prod:ssuRNA ?abundance ;
              prod:otuID ?otuID ;
              prod:ofSample ?sample ;
              dct:identifier ?taxonIRI .

  # optional taxon metadata reachable from the taxon IRI
  OPTIONAL { ?taxonIRI dct:scientificName ?taxonName }
  OPTIONAL { ?taxonIRI dct:taxonRank ?taxonRank }

  FILTER ( xsd:double(?abundance) > 20 )    # numeric filter
  FILTER ( regex(str(?taxonRank), "^family", "i"))
}
ORDER BY DESC(xsd:double(?abundance))
"""


r = requests.get("http://localhost:3030/ssu", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
df

Unnamed: 0,annotation,otuID,abundance,sample,taxonIRI,taxonName,taxonRank
0,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,family
1,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary-SSU#49546,23045,1411.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_67,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,family
2,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#49546,23045,1151.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_66,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,family
3,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#31989,223669,975.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=31989,Rhodobacteraceae,family
4,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary-SSU#338190,104349,827.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_67,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=338190,Nitrosopumilaceae,family
...,...,...,...,...,...,...,...
76,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_66-ro-crate/taxonomy-summary-SSU#1706374,133665,21.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_66,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1706374,Porticoccaceae,family
77,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary-SSU#203557,104471,21.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_67,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=203557,Verrucomicrobiaceae,family
78,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary-SSU#415002,119872,21.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_67,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=415002,Puniceicoccaceae,family
79,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_AAOT_Wa_67-ro-crate/taxonomy-summary-SSU#568386,50017,21.0,http://data.emobon.embrc.eu/observatory-aaot-crate/water/sample/EMOBON_AAOT_Wa_67,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=568386,Sinobacteraceae,family


### Wikipedia query

In [None]:
q = """
PREFIX wdt:  <http://www.wikidata.org/prop/direct/>
PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <https://schema.org/>

SELECT ?item ?label ?wikipediaPage ?ncbi ?mesh 
WHERE {
  SERVICE <https://query.wikidata.org/sparql> {
    ?item wdt:P225 "Flavobacteriaceae" .
    ?item rdfs:label ?label .
    FILTER(LANG(?label) = "en")
    OPTIONAL { ?item wdt:P685 ?ncbi }   # NCBI Taxon ID
    # Find the Wikidata item whose MeSH descriptor ID is the same string
    OPTIONAL { ?item wdt:P672 ?mesh }
  }
}
LIMIT 10
"""

r = requests.get("http://localhost:3030/ssu", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
df

Unnamed: 0,item,label,wikipediaPage,ncbi,mesh
0,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190
1,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310


### UniProt
- example queries, https://sparql.uniprot.org/.well-known/sparql-examples/

In [16]:
q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?organism ?sequence
WHERE {
    ?protein a up:Protein ;
             up:organism ?organism ;
             up:sequence ?seqNode .
    ?seqNode rdf:value ?sequence .
    
    # Only proteins under taxon 49546
    ?organism rdfs:subClassOf taxon:49546 .
}
LIMIT 100
""" 

r = requests.get("https://sparql.uniprot.org/sparql", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
df

Unnamed: 0,protein,organism,sequence
0,http://purl.uniprot.org/uniprot/A0A023BMI4,http://purl.uniprot.org/taxonomy/1317122,MYSRTHFSELLSLIPRYKFNQFVLKYSADKHNKGFNSWTHLVTMVFSQLSKANSLREIETSFNSVVNAHFHMGARSIKRSTLSEANQKRDFRVFADLANELMKNFRPSKQKELKEFLFLLDSSPIILQGRHFDWTNKTRNYNNGLKLHMLYDTHTTTPTYIDITASNINDINIGRELPIQPNATYVFDKGYTDYNWWFSIHKKQSFFVTRFKKNAATHIIEELPINKSDTQLVLADQKVIFKNKTPRGGKINQYTVPLRKITIRRDNKNTPLVIATNDFNKSAGEIASLYKKRWDIELFFKWIKQNLKIKRFIGTSLNAVKTQIYTAIITYLLSLKLQKLKENTLPFYLFLEKLSALLFVPVTLIKNDGHSQKKKDQLLIKQQLNFSW
1,http://purl.uniprot.org/uniprot/A0A023BMI7,http://purl.uniprot.org/taxonomy/1317122,MDFTIRKGSHTVSRLTCHIVWSTKYRYKVLRGDIQIRCRELLIQICDAEGIEILKGVVSADHVHMHIEYAPKLSVSYVVKQLKGRTSRKLQQEFHSLQDRYWGKHFWANGYGVWSTGNITDKMVNEYLEHHRRDNNDNSNFILE
2,http://purl.uniprot.org/uniprot/A0A023BMJ1,http://purl.uniprot.org/taxonomy/1317122,MKLPKTYIKIPITELGELRFMQDTLLDNLSLLSQTEDNFSTNRSIKDNMYWISKILVAIAEIDQRREFDDLELEKK
3,http://purl.uniprot.org/uniprot/A0A023BMJ3,http://purl.uniprot.org/taxonomy/1317122,MKKIIFVVFFLMTYIGFSQDYGFLFKTEVQATAHANLFNASVVTDSPEVYPGIGYNSTPQNVGSIYYDFIELNNNFNTATVNLYSHWISLSDPVSCNSDDTYTYTRNEFINNLVGYNTKDCNFFTIVYPIHIIEPSANEFCPDQEIVLKYGYHWQFSFDGINWNSFPTSLNTKRVTSFTLKELFSLSGIPDSQWQSESNIKFQTGYRTEFTNIRNITIINCSPKLDGPIIDIQPLCSNSINHNDNDNGSFTVTFDRELDDTKQEKMNLQVYRQVGSSFDGYASKVVTKSDFTGTSYTWEPKNLPGGVYKLFWQTKSNNEGFDDINTVPDAYDESNPFTLTTPPALSVSGAPSPVQCFGGNDGSITVTPNGGTPGTPPTSPRYQYSIDNGTTWQQETLFDSLTKGDYTILIKDNNGCEATSAPITVNERFLTIPDVVGLSALITSPTLINGNNGRIAISVSSGSGNYTNYAWTKDGNPFTPPSGSTNTNIINLYEGVYTIVVTDSNGCSSNLETFTLTDPEPIDISINMTPNTVNCSDTKVNLIASATGGFLNSGGDYTYLWDDGTTEASLTNVGIGNYQVTVSDQGGNSQSKSFQVQGPEPITAIPTVSNVGCKNGSDGTIQLTINGGTGQYTVNWTKLFDNT
4,http://purl.uniprot.org/uniprot/A0A023BMJ6,http://purl.uniprot.org/taxonomy/1317122,MIDQGLNFNIFNIIILIGIFQGPIFALIVFFNKNYRFLANYFLVSTALALSFNNFQYWLLDTGMVNELYFQIPFEFLIMSMFYPFVDEYLQIKSPKKIILAIIVPFFTSFIFRLIMKFGLITLSNDLIHILLTLEEYLSLVFSVSMITIILIKIHKYEKAKTDFNLSEIKAKTKWLKQALVFGIIICVFWVFVIQDNIARFEDDLSKYYPLWIIISILVYWIVYKGIIETQIFNQRIEIRNDTIEFTYNGQKTAYINDDFFLEIKSFIINEKLYLNPNLNLDLVAEKFNVSIGHLSKTVNKNANQSFTDFVNQLRVNESKKMLLNPNYKNYTIEAIGYESGFYSKSNFYAAFKKETNQTPSAFRLRK
...,...,...,...
95,http://purl.uniprot.org/uniprot/A0A023BMZ8,http://purl.uniprot.org/taxonomy/1317122,MKFENLIICVLTGFVVVSGYTQEKTIDTTLVNELQEVVLTATRTERQLSSLPLPVTIVSQETIKQSGTIRLNEILNEQTGIITVADESGFEGVQIQGIASDYILILIDGVPLVGRKAGNFDVNRLTVGNIKQVEVVKGPSSSLYGSEALGGVINIITEKPKSDVLSGNASYRIGSYTQQDINVDIKQRIKKLGYGVFANRFSSEGYDLTPDTAGQTVNPFENYTFNGRLYYDFSDQFSLFLSGRLYTQYQDAGFTTNTTSFEGDSEEKEWNSHLRLDHKWSDHLTTQYEFYYTNYNAKEQLADSSSGDIVSDSDFDQRLLRPEIRTTYAFKDSSKLTFGVGFQYDELDRTFFDKQVDFNSQYVYAQYDTHLIERLNVITGARFDNHSEYSNQFSPKLALRYKITEALAAKASVGYGFKAPDFRQLYFDFTNSTVGYTVLGYNVALEKLNELQAQGQILDVVVPESSLQDPLEAENSIGYNAGLTYKENRWNAELNFFRNDFKNLIDTRVIARKTNGQNVFSYFNFDKIYTTGLEFNTNYRITDNVRLSAGYQLLYAFDKEKERLVKNGEVFARDPETNQTVAVSRSEYFGLVNRSRHNANFKVFYDIVSAKANINLRLLYRSKYALFDTNGNDLIDDYDTSFVDGFAIANIAASKTFYENFTLQIGANNLFDYTKDNIPTLPGIQLYAKLNYQF
96,http://purl.uniprot.org/uniprot/A0A023BMZ9,http://purl.uniprot.org/taxonomy/1317122,MKKKHFLLISVSLLMSQGLLAQDHSTHSSPGSLGAEQIFGLLEMPFLAIALIFSFLTATKLKGGKFGSGMTLLAWGFVVMALGHLHMQIAHIFDYNIFKNIFGDTFGNYIWFIALILTWGLSALGFYKIYKASKI
97,http://purl.uniprot.org/uniprot/A0A023BN00,http://purl.uniprot.org/taxonomy/1317122,MKVNVHQNIKILVLVLSILFSLQKSYSQNDNFWSNVSFGGNLGIGFGNDTFSGVIEPSALYNFNEQFAAGMGVSFGYIESNNFTATNYGGSLLAFYSPIREIRLSLEFQEMGVSRTLEIENAQDLKENYWYPSLFVGGGYRMGNVSVGIRYDLLYDSDKSIYGSAYTPFVSVFF
98,http://purl.uniprot.org/uniprot/A0A023BN01,http://purl.uniprot.org/taxonomy/1317122,MTPVVSGFSKLSKADKIKWLAKHHFNDDQNAVDTLVTYWNSDDGLQQLHDEFTENTISNYYLPFSVAPNFLINNKRYTLPMAIEESSVVAAASKAAKFWQTRGGFKAEVLSTIKVGQVHFTYNGKPEKLQQFFSIIKPKLLASVSHMTKNMEKRGGGVIDIELRDKTSEIDDYYQLHCTFETVDAMGANFINSCLEQFAKTMTTEAKEHHDFSATEKDIEIVMSILSNYVPQCLVKASVSCNIKDLPSSPSLSPLQYANKFVRAVRIAEVEPYRAVTHNKGIMNGIDAVVLATGNDFRAIEAGAHAYASRDGKYTSLTHAEIQNEMLTFSIKLPLALGTVGGLTSLHPLVKFALQLLEKPNAKKLMEITAVAGLAQNFAAINSLITTGIQQGHMKMHLMNILNQFKATENEKKQLIKYFETNAVTHSEVVTQIEKLRA


In [17]:
q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT (COUNT(?protein) AS ?proteinCount)
WHERE {
    ?protein a up:Protein ;
             up:organism ?organism ;
             up:sequence ?seqNode .
    ?seqNode rdf:value ?sequence .

    ?organism rdfs:subClassOf taxon:49546 .
}

"""
r = requests.get("https://sparql.uniprot.org/sparql", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
df

Unnamed: 0,proteinCount
0,2829119


## Total query

In [18]:
q = """
PREFIX prod:  <https://data.emobon.embrc.eu/ns/product#>
PREFIX dct:   <http://purl.org/dc/terms/>
PREFIX schema:<https://schema.org/>
PREFIX xsd:   <http://www.w3.org/2001/XMLSchema#>
PREFIX wdt:  <http://www.wikidata.org/prop/direct/>
PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?annotation ?otuID ?abundance ?sample ?taxonIRI ?taxonTitle ?taxonName ?taxonRank ?item ?label ?wikipediaPage ?ncbi ?mesh
WHERE {
  {
    SELECT ?annotation ?otuID ?abundance ?sample ?taxonIRI ?taxonTitle ?taxonName ?taxonRank
    WHERE {
      ?annotation a prod:TaxonomicAnnotation ;
                  prod:ssuRNA ?abundance ;
                  prod:otuID ?otuID ;
                  prod:ofSample ?sample ;
                  dct:identifier ?taxonIRI .
      OPTIONAL { ?taxonIRI dct:title ?taxonTitle }
      OPTIONAL { ?taxonIRI dct:scientificName ?taxonName }
      OPTIONAL { ?taxonIRI dct:taxonRank ?taxonRank }

      FILTER(xsd:double(?abundance) > 20)
      FILTER(regex(str(?taxonRank), "^family", "i"))
    }
    ORDER BY DESC(xsd:double(?abundance))
    LIMIT 1
  }

  SERVICE <https://query.wikidata.org/sparql> {
    ?item wdt:P225 ?taxonName .
    ?item rdfs:label ?label .
    FILTER(LANG(?label) = "en")
    OPTIONAL { ?item wdt:P685 ?ncbi }   # NCBI Taxon ID
    # Find the Wikidata item whose MeSH descriptor ID is the same string
    OPTIONAL { ?item wdt:P672 ?mesh }
  }
}
"""


r = requests.get("http://localhost:3030/ssu", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df = sparql_json_to_df(r.json())
df

Unnamed: 0,annotation,otuID,abundance,sample,taxonIRI,taxonTitle,taxonName,taxonRank,item,label,wikipediaPage,ncbi,mesh
0,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190
1,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310


In [19]:
tax_id = df.loc[0, "ncbi"]
q = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?taxon ?protein ?organism ?sequence
WHERE {{
    ?protein a up:Protein ;
             up:organism ?organism ;
             up:sequence ?seqNode .
    ?seqNode rdf:value ?sequence .
    
    # Only proteins under taxon
    ?organism rdfs:subClassOf taxon:{tax_id} .
    BIND({tax_id} AS ?taxon)
}}
LIMIT 100
""" 

r = requests.get("https://sparql.uniprot.org/sparql", params={"query": q}, headers={"Accept": "application/sparql-results+json"})
df_prot = sparql_json_to_df(r.json())
df_prot

Unnamed: 0,taxon,protein,organism,sequence
0,49546,http://purl.uniprot.org/uniprot/A0A023BMI4,http://purl.uniprot.org/taxonomy/1317122,MYSRTHFSELLSLIPRYKFNQFVLKYSADKHNKGFNSWTHLVTMVFSQLSKANSLREIETSFNSVVNAHFHMGARSIKRSTLSEANQKRDFRVFADLANELMKNFRPSKQKELKEFLFLLDSSPIILQGRHFDWTNKTRNYNNGLKLHMLYDTHTTTPTYIDITASNINDINIGRELPIQPNATYVFDKGYTDYNWWFSIHKKQSFFVTRFKKNAATHIIEELPINKSDTQLVLADQKVIFKNKTPRGGKINQYTVPLRKITIRRDNKNTPLVIATNDFNKSAGEIASLYKKRWDIELFFKWIKQNLKIKRFIGTSLNAVKTQIYTAIITYLLSLKLQKLKENTLPFYLFLEKLSALLFVPVTLIKNDGHSQKKKDQLLIKQQLNFSW
1,49546,http://purl.uniprot.org/uniprot/A0A023BMI7,http://purl.uniprot.org/taxonomy/1317122,MDFTIRKGSHTVSRLTCHIVWSTKYRYKVLRGDIQIRCRELLIQICDAEGIEILKGVVSADHVHMHIEYAPKLSVSYVVKQLKGRTSRKLQQEFHSLQDRYWGKHFWANGYGVWSTGNITDKMVNEYLEHHRRDNNDNSNFILE
2,49546,http://purl.uniprot.org/uniprot/A0A023BMJ1,http://purl.uniprot.org/taxonomy/1317122,MKLPKTYIKIPITELGELRFMQDTLLDNLSLLSQTEDNFSTNRSIKDNMYWISKILVAIAEIDQRREFDDLELEKK
3,49546,http://purl.uniprot.org/uniprot/A0A023BMJ3,http://purl.uniprot.org/taxonomy/1317122,MKKIIFVVFFLMTYIGFSQDYGFLFKTEVQATAHANLFNASVVTDSPEVYPGIGYNSTPQNVGSIYYDFIELNNNFNTATVNLYSHWISLSDPVSCNSDDTYTYTRNEFINNLVGYNTKDCNFFTIVYPIHIIEPSANEFCPDQEIVLKYGYHWQFSFDGINWNSFPTSLNTKRVTSFTLKELFSLSGIPDSQWQSESNIKFQTGYRTEFTNIRNITIINCSPKLDGPIIDIQPLCSNSINHNDNDNGSFTVTFDRELDDTKQEKMNLQVYRQVGSSFDGYASKVVTKSDFTGTSYTWEPKNLPGGVYKLFWQTKSNNEGFDDINTVPDAYDESNPFTLTTPPALSVSGAPSPVQCFGGNDGSITVTPNGGTPGTPPTSPRYQYSIDNGTTWQQETLFDSLTKGDYTILIKDNNGCEATSAPITVNERFLTIPDVVGLSALITSPTLINGNNGRIAISVSSGSGNYTNYAWTKDGNPFTPPSGSTNTNIINLYEGVYTIVVTDSNGCSSNLETFTLTDPEPIDISINMTPNTVNCSDTKVNLIASATGGFLNSGGDYTYLWDDGTTEASLTNVGIGNYQVTVSDQGGNSQSKSFQVQGPEPITAIPTVSNVGCKNGSDGTIQLTINGGTGQYTVNWTKLFDNT
4,49546,http://purl.uniprot.org/uniprot/A0A023BMJ6,http://purl.uniprot.org/taxonomy/1317122,MIDQGLNFNIFNIIILIGIFQGPIFALIVFFNKNYRFLANYFLVSTALALSFNNFQYWLLDTGMVNELYFQIPFEFLIMSMFYPFVDEYLQIKSPKKIILAIIVPFFTSFIFRLIMKFGLITLSNDLIHILLTLEEYLSLVFSVSMITIILIKIHKYEKAKTDFNLSEIKAKTKWLKQALVFGIIICVFWVFVIQDNIARFEDDLSKYYPLWIIISILVYWIVYKGIIETQIFNQRIEIRNDTIEFTYNGQKTAYINDDFFLEIKSFIINEKLYLNPNLNLDLVAEKFNVSIGHLSKTVNKNANQSFTDFVNQLRVNESKKMLLNPNYKNYTIEAIGYESGFYSKSNFYAAFKKETNQTPSAFRLRK
...,...,...,...,...
95,49546,http://purl.uniprot.org/uniprot/A0A023BMZ8,http://purl.uniprot.org/taxonomy/1317122,MKFENLIICVLTGFVVVSGYTQEKTIDTTLVNELQEVVLTATRTERQLSSLPLPVTIVSQETIKQSGTIRLNEILNEQTGIITVADESGFEGVQIQGIASDYILILIDGVPLVGRKAGNFDVNRLTVGNIKQVEVVKGPSSSLYGSEALGGVINIITEKPKSDVLSGNASYRIGSYTQQDINVDIKQRIKKLGYGVFANRFSSEGYDLTPDTAGQTVNPFENYTFNGRLYYDFSDQFSLFLSGRLYTQYQDAGFTTNTTSFEGDSEEKEWNSHLRLDHKWSDHLTTQYEFYYTNYNAKEQLADSSSGDIVSDSDFDQRLLRPEIRTTYAFKDSSKLTFGVGFQYDELDRTFFDKQVDFNSQYVYAQYDTHLIERLNVITGARFDNHSEYSNQFSPKLALRYKITEALAAKASVGYGFKAPDFRQLYFDFTNSTVGYTVLGYNVALEKLNELQAQGQILDVVVPESSLQDPLEAENSIGYNAGLTYKENRWNAELNFFRNDFKNLIDTRVIARKTNGQNVFSYFNFDKIYTTGLEFNTNYRITDNVRLSAGYQLLYAFDKEKERLVKNGEVFARDPETNQTVAVSRSEYFGLVNRSRHNANFKVFYDIVSAKANINLRLLYRSKYALFDTNGNDLIDDYDTSFVDGFAIANIAASKTFYENFTLQIGANNLFDYTKDNIPTLPGIQLYAKLNYQF
96,49546,http://purl.uniprot.org/uniprot/A0A023BMZ9,http://purl.uniprot.org/taxonomy/1317122,MKKKHFLLISVSLLMSQGLLAQDHSTHSSPGSLGAEQIFGLLEMPFLAIALIFSFLTATKLKGGKFGSGMTLLAWGFVVMALGHLHMQIAHIFDYNIFKNIFGDTFGNYIWFIALILTWGLSALGFYKIYKASKI
97,49546,http://purl.uniprot.org/uniprot/A0A023BN00,http://purl.uniprot.org/taxonomy/1317122,MKVNVHQNIKILVLVLSILFSLQKSYSQNDNFWSNVSFGGNLGIGFGNDTFSGVIEPSALYNFNEQFAAGMGVSFGYIESNNFTATNYGGSLLAFYSPIREIRLSLEFQEMGVSRTLEIENAQDLKENYWYPSLFVGGGYRMGNVSVGIRYDLLYDSDKSIYGSAYTPFVSVFF
98,49546,http://purl.uniprot.org/uniprot/A0A023BN01,http://purl.uniprot.org/taxonomy/1317122,MTPVVSGFSKLSKADKIKWLAKHHFNDDQNAVDTLVTYWNSDDGLQQLHDEFTENTISNYYLPFSVAPNFLINNKRYTLPMAIEESSVVAAASKAAKFWQTRGGFKAEVLSTIKVGQVHFTYNGKPEKLQQFFSIIKPKLLASVSHMTKNMEKRGGGVIDIELRDKTSEIDDYYQLHCTFETVDAMGANFINSCLEQFAKTMTTEAKEHHDFSATEKDIEIVMSILSNYVPQCLVKASVSCNIKDLPSSPSLSPLQYANKFVRAVRIAEVEPYRAVTHNKGIMNGIDAVVLATGNDFRAIEAGAHAYASRDGKYTSLTHAEIQNEMLTFSIKLPLALGTVGGLTSLHPLVKFALQLLEKPNAKKLMEITAVAGLAQNFAAINSLITTGIQQGHMKMHLMNILNQFKATENEKKQLIKYFETNAVTHSEVVTQIEKLRA


## Final DF merge, data from local EMO-BON + wikipedia + UniProt!!!

In [20]:
# merge dataframes
df_merged = df.merge(df_prot, how="left", left_on="ncbi", right_on="taxon")
df_merged

Unnamed: 0,annotation,otuID,abundance,sample,taxonIRI,taxonTitle,taxonName,taxonRank,item,label,wikipediaPage,ncbi,mesh,taxon,protein,organism,sequence
0,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190,49546,http://purl.uniprot.org/uniprot/A0A023BMI4,http://purl.uniprot.org/taxonomy/1317122,MYSRTHFSELLSLIPRYKFNQFVLKYSADKHNKGFNSWTHLVTMVFSQLSKANSLREIETSFNSVVNAHFHMGARSIKRSTLSEANQKRDFRVFADLANELMKNFRPSKQKELKEFLFLLDSSPIILQGRHFDWTNKTRNYNNGLKLHMLYDTHTTTPTYIDITASNINDINIGRELPIQPNATYVFDKGYTDYNWWFSIHKKQSFFVTRFKKNAATHIIEELPINKSDTQLVLADQKVIFKNKTPRGGKINQYTVPLRKITIRRDNKNTPLVIATNDFNKSAGEIASLYKKRWDIELFFKWIKQNLKIKRFIGTSLNAVKTQIYTAIITYLLSLKLQKLKENTLPFYLFLEKLSALLFVPVTLIKNDGHSQKKKDQLLIKQQLNFSW
1,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190,49546,http://purl.uniprot.org/uniprot/A0A023BMI7,http://purl.uniprot.org/taxonomy/1317122,MDFTIRKGSHTVSRLTCHIVWSTKYRYKVLRGDIQIRCRELLIQICDAEGIEILKGVVSADHVHMHIEYAPKLSVSYVVKQLKGRTSRKLQQEFHSLQDRYWGKHFWANGYGVWSTGNITDKMVNEYLEHHRRDNNDNSNFILE
2,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190,49546,http://purl.uniprot.org/uniprot/A0A023BMJ1,http://purl.uniprot.org/taxonomy/1317122,MKLPKTYIKIPITELGELRFMQDTLLDNLSLLSQTEDNFSTNRSIKDNMYWISKILVAIAEIDQRREFDDLELEKK
3,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190,49546,http://purl.uniprot.org/uniprot/A0A023BMJ3,http://purl.uniprot.org/taxonomy/1317122,MKKIIFVVFFLMTYIGFSQDYGFLFKTEVQATAHANLFNASVVTDSPEVYPGIGYNSTPQNVGSIYYDFIELNNNFNTATVNLYSHWISLSDPVSCNSDDTYTYTRNEFINNLVGYNTKDCNFFTIVYPIHIIEPSANEFCPDQEIVLKYGYHWQFSFDGINWNSFPTSLNTKRVTSFTLKELFSLSGIPDSQWQSESNIKFQTGYRTEFTNIRNITIINCSPKLDGPIIDIQPLCSNSINHNDNDNGSFTVTFDRELDDTKQEKMNLQVYRQVGSSFDGYASKVVTKSDFTGTSYTWEPKNLPGGVYKLFWQTKSNNEGFDDINTVPDAYDESNPFTLTTPPALSVSGAPSPVQCFGGNDGSITVTPNGGTPGTPPTSPRYQYSIDNGTTWQQETLFDSLTKGDYTILIKDNNGCEATSAPITVNERFLTIPDVVGLSALITSPTLINGNNGRIAISVSSGSGNYTNYAWTKDGNPFTPPSGSTNTNIINLYEGVYTIVVTDSNGCSSNLETFTLTDPEPIDISINMTPNTVNCSDTKVNLIASATGGFLNSGGDYTYLWDDGTTEASLTNVGIGNYQVTVSDQGGNSQSKSFQVQGPEPITAIPTVSNVGCKNGSDGTIQLTINGGTGQYTVNWTKLFDNT
4,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.080.190,49546,http://purl.uniprot.org/uniprot/A0A023BMJ6,http://purl.uniprot.org/taxonomy/1317122,MIDQGLNFNIFNIIILIGIFQGPIFALIVFFNKNYRFLANYFLVSTALALSFNNFQYWLLDTGMVNELYFQIPFEFLIMSMFYPFVDEYLQIKSPKKIILAIIVPFFTSFIFRLIMKFGLITLSNDLIHILLTLEEYLSLVFSVSMITIILIKIHKYEKAKTDFNLSEIKAKTKWLKQALVFGIIICVFWVFVIQDNIARFEDDLSKYYPLWIIISILVYWIVYKGIIETQIFNQRIEIRNDTIEFTYNGQKTAYINDDFFLEIKSFIINEKLYLNPNLNLDLVAEKFNVSIGHLSKTVNKNANQSFTDFVNQLRVNESKKMLLNPNYKNYTIEAIGYESGFYSKSNFYAAFKKETNQTPSAFRLRK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310,49546,http://purl.uniprot.org/uniprot/A0A023BMZ8,http://purl.uniprot.org/taxonomy/1317122,MKFENLIICVLTGFVVVSGYTQEKTIDTTLVNELQEVVLTATRTERQLSSLPLPVTIVSQETIKQSGTIRLNEILNEQTGIITVADESGFEGVQIQGIASDYILILIDGVPLVGRKAGNFDVNRLTVGNIKQVEVVKGPSSSLYGSEALGGVINIITEKPKSDVLSGNASYRIGSYTQQDINVDIKQRIKKLGYGVFANRFSSEGYDLTPDTAGQTVNPFENYTFNGRLYYDFSDQFSLFLSGRLYTQYQDAGFTTNTTSFEGDSEEKEWNSHLRLDHKWSDHLTTQYEFYYTNYNAKEQLADSSSGDIVSDSDFDQRLLRPEIRTTYAFKDSSKLTFGVGFQYDELDRTFFDKQVDFNSQYVYAQYDTHLIERLNVITGARFDNHSEYSNQFSPKLALRYKITEALAAKASVGYGFKAPDFRQLYFDFTNSTVGYTVLGYNVALEKLNELQAQGQILDVVVPESSLQDPLEAENSIGYNAGLTYKENRWNAELNFFRNDFKNLIDTRVIARKTNGQNVFSYFNFDKIYTTGLEFNTNYRITDNVRLSAGYQLLYAFDKEKERLVKNGEVFARDPETNQTVAVSRSEYFGLVNRSRHNANFKVFYDIVSAKANINLRLLYRSKYALFDTNGNDLIDDYDTSFVDGFAIANIAASKTFYENFTLQIGANNLFDYTKDNIPTLPGIQLYAKLNYQF
196,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310,49546,http://purl.uniprot.org/uniprot/A0A023BMZ9,http://purl.uniprot.org/taxonomy/1317122,MKKKHFLLISVSLLMSQGLLAQDHSTHSSPGSLGAEQIFGLLEMPFLAIALIFSFLTATKLKGGKFGSGMTLLAWGFVVMALGHLHMQIAHIFDYNIFKNIFGDTFGNYIWFIALILTWGLSALGFYKIYKASKI
197,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310,49546,http://purl.uniprot.org/uniprot/A0A023BN00,http://purl.uniprot.org/taxonomy/1317122,MKVNVHQNIKILVLVLSILFSLQKSYSQNDNFWSNVSFGGNLGIGFGNDTFSGVIEPSALYNFNEQFAAGMGVSFGYIESNNFTATNYGGSLLAFYSPIREIRLSLEFQEMGVSRTLEIENAQDLKENYWYPSLFVGGGYRMGNVSVGIRYDLLYDSDKSIYGSAYTPFVSVFF
198,https://data.emobon.embrc.eu/analysis-results-cluster-01-crate/EMOBON_HCMR-1_Wa_6-ro-crate/taxonomy-summary-SSU#49546,23045,2121.0,http://data.emobon.embrc.eu/observatory-hcmr-1-crate/water/sample/EMOBON_HCMR-1_Wa_6,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=49546,Flavobacteriaceae,Flavobacteriaceae,family,http://www.wikidata.org/entity/Q5458145,Flavobacteriaceae,,49546,B03.440.400.425.310,49546,http://purl.uniprot.org/uniprot/A0A023BN01,http://purl.uniprot.org/taxonomy/1317122,MTPVVSGFSKLSKADKIKWLAKHHFNDDQNAVDTLVTYWNSDDGLQQLHDEFTENTISNYYLPFSVAPNFLINNKRYTLPMAIEESSVVAAASKAAKFWQTRGGFKAEVLSTIKVGQVHFTYNGKPEKLQQFFSIIKPKLLASVSHMTKNMEKRGGGVIDIELRDKTSEIDDYYQLHCTFETVDAMGANFINSCLEQFAKTMTTEAKEHHDFSATEKDIEIVMSILSNYVPQCLVKASVSCNIKDLPSSPSLSPLQYANKFVRAVRIAEVEPYRAVTHNKGIMNGIDAVVLATGNDFRAIEAGAHAYASRDGKYTSLTHAEIQNEMLTFSIKLPLALGTVGGLTSLHPLVKFALQLLEKPNAKKLMEITAVAGLAQNFAAINSLITTGIQQGHMKMHLMNILNQFKATENEKKQLIKYFETNAVTHSEVVTQIEKLRA
