# Query data based on RO-Crates

1. This specifically looks at the intermediate steps of the metaGOflow pipeline.
2. `fastp` outputs
3. ...

**Steps:** (for each metaGOflow step)
1. Access the ro-crate metadata file and extract the needed data sources
2. Get the data
3. Visualize

In [None]:
import sys
import os
import io
import gc
import logging
import psutil

from IPython import get_ipython
logger = logging.getLogger(name="Diversity analysis app")
NUMBER_PERMUTATIONS = 999

# Detect where the notebook runs (Google Colab, Binder or local) and prepare
# the environment accordingly.
if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # Clone the momics-demos repository to use it to load data.
    # os.system() does not raise when the command fails; it returns the exit
    # status, so the status has to be checked explicitly.
    clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    if clone_status == 0:
        logger.info("Repository cloned")
    else:
        logger.warning(
            f"An error occurred while cloning the repository: exit status {clone_status}"
        )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    install_status = os.system('pip install momics@git+https://github.com/emo-bon/marine-omics-methods.git@main')
    if install_status != 0:
        logger.warning(
            f"An error occurred while installing momics: exit status {install_status}"
        )

elif not psutil.users():
    # No logged-in users reported by psutil: treated as a Binder session
    logger.info("Binder")
    NUMBER_PERMUTATIONS = 29  # permanova extremely slow on binder, therefore a change here
else:
    logger.info("Local")


from momics.utils import (
    memory_load, reconfig_logger,
    init_setup, get_notebook_environment,
)

# Set up logging
reconfig_logger()

# Determine the notebook environment
env = get_notebook_environment()

init_setup()
logger.info(f"Environment: {env}")

## Imports

In [5]:
# This needs to be repeated here for the Panel dashboard to work, WEIRD
# TODO: report as possible bug
import sys
import os
import io
import requests

# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import panel as pn
from dotenv import load_dotenv
load_dotenv()

# from json2txttree import json2txttree

# All low level functions are imported from the momics package
from momics.loader import get_ro_crate_metadata_gh

### Trying new public access

In [None]:
def get_rocrate_metadata(sample_id, timeout=30):
    """
    Download and parse the ``ro-crate-metadata.json`` of one sample.

    The file is requested through the GitHub contents API from the
    ``emo-bon/analysis-results-cluster-01-crate`` repository, using the raw
    media type in the ``accept`` header.

    Args:
        sample_id: Sample identifier; the crate folder that is queried is
            ``{sample_id}-ro-crate``.
        timeout: Number of seconds to wait for the server before giving up.

    Returns:
        The parsed JSON content of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example 404 for an unknown sample).
        requests.RequestException: On connection problems or a timeout.
    """
    # Get the metadata from the ro-crate
    url = f"https://api.github.com/repos/emo-bon/analysis-results-cluster-01-crate/contents/{sample_id}-ro-crate/ro-crate-metadata.json"
    req = requests.get(
        url,
        headers={
            "accept": "application/vnd.github.v3.raw",
        },
        # without a timeout the call can block forever on a stalled connection
        timeout=timeout,
    )
    print("ro-crate-metadata.json request status", req.status_code)
    # Fail here instead of returning the JSON body of an error response,
    # which would otherwise be mistaken for the crate metadata.
    req.raise_for_status()
    return req.json()


In [15]:
# TODO: is this nomenclature the final one?
sample_id = "EMOBON_PiEGetxo_Wa_4"
met_json = get_rocrate_metadata(sample_id)

ro-crate-metadata.json request status 200


In [19]:
## pretty nice tool
# print(json2txttree(met_json))

## Methods

- I would rather not write my own parser for this HTML report
  - TODO: check whether an existing parser for `fastp` reports is available

In [20]:

def extract_data_by_name(metadata, name='FASTP analysis of raw sequence data'):
    """
    Return the first entity of the ro-crate graph with the given name.

    Args:
        metadata: Parsed ro-crate metadata; its ``'@graph'`` entry is
            iterated and each item is expected to be a dict.
        name: Value of the ``'name'`` key to look for.

    Returns:
        The first dict in ``metadata['@graph']`` whose ``'name'`` equals
        ``name``.

    Raises:
        ValueError: If no entity with that name exists.
    """
    for entity in metadata['@graph']:
        # entities without a 'name' key yield None here and never match
        if entity.get('name') == name:
            return entity
    # Falling through used to end in an UnboundLocalError; report the actual problem.
    raise ValueError(f"No entity named {name!r} found in the ro-crate metadata")


def extract_all_datafiles(metadata):
    """
    Collect name, size and download URL of every named ``File`` entity.

    Args:
        metadata: Parsed ro-crate metadata; its ``'@graph'`` entry is
            iterated and each item is expected to be a dict.

    Returns:
        A list with one dict per entity that has a ``'name'`` key and whose
        ``'@type'`` equals ``'File'``. Each dict has the keys ``'name'``,
        ``'sizeMB'`` (``contentSize`` divided by 1e6 and truncated to an int)
        and ``'downloadUrl'``. ``'sizeMB'`` and ``'downloadUrl'`` are the
        string ``'unknown'`` when the value is missing; ``'sizeMB'`` is also
        ``'unknown'`` when ``contentSize`` cannot be converted to an int.
    """
    datafiles = []
    for entity in metadata['@graph']:
        # .get() so that a named entity without '@type' is skipped rather
        # than raising KeyError
        if 'name' not in entity or entity.get('@type') != 'File':
            continue

        try:
            # in MB
            size_mb = int(int(entity['contentSize']) / 1e6)
        except (KeyError, TypeError, ValueError):
            # missing, None or non-numeric contentSize
            size_mb = 'unknown'

        datafiles.append({
            'name': entity['name'],
            'sizeMB': size_mb,
            'downloadUrl': entity.get('downloadUrl', 'unknown'),
        })

    return datafiles

In [21]:
from IPython.display import display, HTML

In [22]:
# Collect every data file listed in the crate metadata
datafiles = extract_all_datafiles(met_json)

# Sanity checks: no entry may lack its size or its download URL
assert not any(entry['sizeMB'] == 'unknown' for entry in datafiles)
assert not any(entry['downloadUrl'] == 'unknown' for entry in datafiles)

# (total number of files, number of files smaller than 100 MB)
len(datafiles), sum(1 for entry in datafiles if entry['sizeMB'] < 100)

(60, 53)

* this means 53 of the 60 outputs are smaller than 100 MB, so I can work with them directly

In [23]:
# Show all data files whose name contains the substring 'fast' (case-insensitive)
[k for k in datafiles if 'fast' in k['name'].lower()]

[{'name': 'FASTP analysis of raw sequence data',
  'sizeMB': 0,
  'downloadUrl': 'https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/99/d78d57cf8c0ebc38e15162b0afe37f'},
 {'name': 'FASTA formatted contig sequences',
  'sizeMB': 92,
  'downloadUrl': 'https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/b4/1de79ad2ea5e565064d74362b76563'}]

In [24]:
# Names of all data files found in the crate
[entry['name'] for entry in datafiles]

['MetaGOflow YAML configuration file',
 'FASTP analysis of raw sequence data',
 'FASTA formatted contig sequences',
 'MetaGOflow configuration in YAML',
 'Overlapped coding sequences',
 'Trimmed forward reads',
 'Unfiltered merged reads',
 'Trimmed reverse reads',
 'Trimmed reverse reads QC summary',
 'Protein coding nucleotide sequences',
 'QC summary of merged reads',
 'Trimmed forward reads QC summary',
 'MOTUs',
 'Protein coding amino acid sequences',
 'Merged reads',
 'Geno Ontology summary statistics',
 'InterProScan summary statistics',
 'Kegg Ontology summary statistics',
 'ORF summary statistics',
 'Pfam summary statistcs',
 'Merged contigs CDS I5 summary',
 'Merged contigs HMM summary',
 'Merged contigs GO summary',
 'Merged contigs InterProScan slim',
 'Merged contigs InterProScan',
 'Merged contigs KO summary',
 'Merged contigs PFAM summary',
 'RDF 1.2 Turtle triples of the functional analyses results',
 'Eggnog emapper summary',
 'RNA prediction for 5_8S',
 'RNA prediction

In [25]:
# Show the complete metadata entity (all keys) of the file named
# 'FASTA formatted contig sequences'
[k for k in met_json['@graph'] if k['@type'] == 'File' and k['name'] == 'FASTA formatted contig sequences']

[{'@id': './final.contigs.fa.bz2',
  '@type': 'File',
  'name': 'FASTA formatted contig sequences',
  'description': 'These are the assembled contig sequences from the merged reads in FASTA format',
  'downloadUrl': 'https://s3.mesocentre.uca.fr/mgf-data-products/files/md5/b4/1de79ad2ea5e565064d74362b76563',
  'encodingFormat': 'application/x-bzip2',
  'contentSize': '92742645'}]

In [None]:
# Look up the entity with the default name of extract_data_by_name
# and download the file behind its downloadUrl
data = extract_data_by_name(met_json)
r = requests.get(data['downloadUrl'])
print(r.status_code)

# display(HTML(...)) and a bare HTML(...) as the last expression look the same;
# the plots of the report are generated by scripts and will not display here
# display(HTML(r.content.decode('utf-8')))
HTML(r.content.decode('utf-8'))

In [None]:
# Open the download URL in a new browser tab.
# Works locally; questionable on Google Colab.
import webbrowser
webbrowser.open_new_tab(data['downloadUrl'])