## Example from MGnify summary data

1. Study MGYS00006608, 16S rRNA amplicon sequencing from the Ocean Sampling Day (OSD) campaign June 2018 (ERP124424_taxonomy_abundances_SSU_v5.0)
2. Study MGYS00006607, 16S rRNA amplicon sequencing from the Ocean Sampling Day (OSD) campaign June 2019 (ERP124431_taxonomy_abundances_SSU_v5.0)
3. Study MGYS00000492, Amplicon sequencing of Tara Oceans DNA samples corresponding to size fractions for prokaryotes or protist. (ERP003634_taxonomy_abundances_SSU_v5.0)
4. Study MGYS00006680, SOLA sampling point Raw sequence reads (SRP237882_taxonomy_abundances_SSU_v5.0)
5. Study MGYS00006682, Vertical stratification of environmental DNA in the open ocean captures ecological patterns and behavior of deep-sea fishes (SRP334933_taxonomy_abundances_SSU_v5.0)
6. Study MGYS00006678, Dataset on spatiotemporal variation of microbial plankton communities in the Baltic Sea (ERP140185_taxonomy_abundances_SSU_v5.0)
7. Study MGYS00006675, 16S rRNA gene amplicon time-series in Blanes Bay Microbial Observatory (BBMO) (ERP122219_taxonomy_abundances_SSU_v5.0)
8. Study MGYS00003725, Arctic microbiome along Svalbard Cross Shelf transects (ERP106348_taxonomy_abundances_SSU_v5.0)
9. Study MGYS00006686, Environmental DNA and zooplankton samples taken at Helgoland Roads in June 2019 (ERP144826_taxonomy_abundances_SSU_v5.0)
10. Study MGYS00006714, Regional and vertical patterns in microbial communities across Fram Strait (2015-2019) (ERP151329_taxonomy_abundances_SSU_v5.0)


In [1]:
# This needs to be repeated here for the Panel dashboard to work, which is unexpected
# TODO: report as possible bug
import sys
import os
import io
import warnings
import holoviews as hv

from functools import partial
warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn
from dotenv import load_dotenv
load_dotenv()

from mgo.udal import UDAL

# All low level functions are imported from the momics package
from momics.loader import load_parquets_udal
from momics.metadata import get_metadata_udal, enhance_metadata
import momics.plotting as pl
from momics.panel_utils import (
    diversity_select_widgets, create_indicators_diversity,
    serve_app, close_server,
)
from momics.diversity import (
    beta_diversity_parametrized,
)

from momics.taxonomy import (
    fill_taxonomy_placeholders,
    pivot_taxonomic_data,
    remove_high_taxa,
    prevalence_cutoff_taxonomy,
    prevalence_cutoff,
)

## Loading EMO-BON (meta)data

In [2]:
# Locate the repository root: a fixed checkout path when running on Google
# Colab, otherwise the parent of the current working directory.
running_on_colab = 'google.colab' in str(get_ipython())
root_folder = os.path.abspath('/content/momics-demos' if running_on_colab else '../')

# Sub-folders of the repository root used by this notebook.
assets_folder, data_folder = (
    os.path.join(root_folder, name) for name in ('assets', 'data')
)

In [3]:
@pn.cache()
def get_data():
    """Return the result of ``load_parquets_udal()``, memoised through ``pn.cache``."""
    parquet_tables = load_parquets_udal()
    return parquet_tables

# Load metadata
@pn.cache()
def get_full_metadata():
    """Return the result of ``get_metadata_udal()``, memoised through ``pn.cache``."""
    metadata_table = get_metadata_udal()
    return metadata_table

@pn.cache()
def get_valid_samples():
    """Read ``data/shipment_b1b2_181.csv`` below ``root_folder`` into a DataFrame.

    The result is memoised through ``pn.cache``.
    """
    csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

In [4]:
# Metadata exactly as returned by the cached loader
raw_metadata = get_full_metadata()

# Combine the metadata with the table of valid samples
# (181 samples, judging by the CSV file name)
valid_samples = get_valid_samples()
full_metadata = enhance_metadata(raw_metadata, valid_samples)

# Load the data tables
mgf_parquet_dfs = get_data()

In [5]:
# Keep only the 'ssu' table: take an independent copy of it so that the name
# holding the complete collection of tables can be dropped afterwards.
ssu = mgf_parquet_dfs['ssu'].copy()

# NOTE(review): this only unbinds the name, so re-running this cell without
# re-running the loading cell first raises NameError.
del mgf_parquet_dfs

### MGnify: Get study metadata
- Each study contains many samples, so the metadata need to be collected per sample.

In [67]:
from jsonapi_client import Session as APISession
import requests
from tqdm import tqdm
import time

In [None]:
# function to get metadata for MGnify studies
def get_mgnify_metadata(study_id):
    """Fetch the sample metadata of one MGnify study as a flat DataFrame.

    Every sample returned by ``studies/{study_id}/samples`` becomes one row
    made of the sample ``id``, its ``attributes`` and the key/value pairs
    listed under ``attributes["sample-metadata"]``. When a ``sample-metadata``
    key equals an attribute name, the ``sample-metadata`` value wins.

    Args:
        study_id: MGnify study accession, e.g. ``'MGYS00006608'``.

    Returns:
        pandas.DataFrame with one row per sample plus a ``study`` column
        holding ``study_id``. A study without samples yields an empty frame
        that still carries the ``study`` column.
    """
    # Collect plain dict records and build the frame once at the end; creating
    # and concatenating one single-row DataFrame per sample is slow and makes
    # column dtypes depend on row-by-row upcasting.
    records = []
    with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as session:
        sample_resources = session.iterate(f'studies/{study_id}/samples?page_size=1000')
        for resource in tqdm(sample_resources):
            sample_json = resource.json
            attributes = sample_json["attributes"]

            # 1. Flatten the sample-metadata list into a dictionary
            #    (a missing or null list is treated as empty).
            metadata_fields = {
                item.get("key"): item.get("value", None)
                for item in (attributes.get("sample-metadata") or [])
            }

            # 2. All other attributes, None values included.
            attributes_fields = {
                k: v for k, v in attributes.items()
                if k != "sample-metadata"  # already unpacked separately
            }

            # 3. Merge everything, including the top-level id.
            records.append({
                "id": sample_json.get("id"),
                **attributes_fields,
                **metadata_fields,
            })

    # 4. One DataFrame for the whole study.
    df = pd.DataFrame(records)
    df['study'] = study_id
    return df


In [65]:
# MGnify study accessions whose sample metadata is fetched below.
all_studies = [
    'MGYS00006608',  # Ocean Sampling Day (OSD) campaign, June 2018
    'MGYS00006607',  # Ocean Sampling Day (OSD) campaign, June 2019
    'MGYS00000492',  # Tara Oceans amplicon sequencing
    'MGYS00006680',  # SOLA sampling point
    'MGYS00006682',  # Vertical stratification of environmental DNA in the open ocean
    'MGYS00006678',  # Microbial plankton communities in the Baltic Sea
    'MGYS00006675',  # Blanes Bay Microbial Observatory (BBMO) time-series
    'MGYS00003725',  # Arctic microbiome along Svalbard Cross Shelf transects
    'MGYS00006686',  # Helgoland Roads eDNA and zooplankton, June 2019
    'MGYS00006714',  # Microbial communities across Fram Strait (2015-2019)
]

In [71]:
# Fetch the sample metadata study by study. A failing study is reported and
# skipped so that one bad request does not discard the others.
metadata_list = []
failed_studies = []
for position, study in enumerate(all_studies):
    if position:
        time.sleep(1)  # pause between requests to avoid hitting the API too fast
    try:
        metadata_list.append(get_mgnify_metadata(study))
    except Exception as e:
        failed_studies.append(study)
        print(f"Error fetching metadata for {study}: {e}")

# Make partial results visible instead of leaving them buried in the log.
if failed_studies:
    print(
        f"Metadata missing for {len(failed_studies)} of {len(all_studies)} "
        f"studies: {failed_studies}"
    )

# pd.concat would fail on an empty list with an unhelpful message.
if not metadata_list:
    raise ValueError("No MGnify metadata could be fetched for any of the requested studies")

# Concatenate all DataFrames into one
metadata = pd.concat(metadata_list, ignore_index=True)

INFO | jsonapi_client.session | Entering session
0it [00:00, ?it/s]INFO | jsonapi_client.session | Fetching document from url ParseResult(scheme='https', netloc='www.ebi.ac.uk', path='/metagenomics/api/v1/studies/MGYS00006608/samples', params='', query='page_size=1000', fragment='')
0it [00:00, ?it/s]INFO | jsonapi_client.session | Fetching document from url ParseResult(scheme='https', netloc='www.ebi.ac.uk', path='/metagenomics/api/v1/studies/MGYS00006608/samples', params='', query='page_size=1000', fragment='')
62it [00:00, 81.21it/s]
INFO | jsonapi_client.session | Exiting session
INFO | jsonapi_client.session | Committing dirty resources
INFO | jsonapi_client.session | Entering session
0it [00:00, ?it/s]INFO | jsonapi_client.session | Fetching document from url ParseResult(scheme='https', netloc='www.ebi.ac.uk', path='/metagenomics/api/v1/studies/MGYS00006607/samples', params='', query='page_size=1000', fragment='')
48it [00:01, 41.32it/s]
INFO | jsonapi_client.session | Exiting se

In [75]:
# Dimensions (rows, columns) of the combined metadata table
metadata.shape

(2009, 56)

In [76]:
# Step 1: Count the non-null values of every column
non_null_counts = metadata.count()

# Step 2: Column names sorted by that count, best-populated first
ordered_columns = list(non_null_counts.sort_values(ascending=False).index)

# Step 3: Same table with the columns in that order
df_ordered = metadata.loc[:, ordered_columns]

# Step 4 (optional): Show info
df_ordered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2009 entries, 0 to 2008
Data columns (total 56 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   id                                               2009 non-null   object 
 1   longitude                                        2009 non-null   float64
 2   biosample                                        2009 non-null   object 
 3   latitude                                         2009 non-null   float64
 4   accession                                        2009 non-null   object 
 5   sample-desc                                      2009 non-null   object 
 6   sample-alias                                     2009 non-null   object 
 7   sample-name                                      2009 non-null   object 
 8   geographic location (longitude)                  2009 non-null   object 
 9   study                         

In [78]:
# Number of rows (samples) contributed by each study
metadata['study'].value_counts()

study
MGYS00006678    665
MGYS00006686    357
MGYS00006680    283
MGYS00006675    249
MGYS00006714    205
MGYS00003725     81
MGYS00006608     62
MGYS00006682     52
MGYS00006607     48
MGYS00000492      7
Name: count, dtype: int64

In [136]:
# Preview the first rows of the combined metadata table
metadata.head()

Unnamed: 0,id,longitude,biosample,latitude,accession,sample-desc,environment-biome,environment-feature,environment-material,sample-name,...,sample collection device,sample status,sampling station,size fraction lower threshold,size fraction upper threshold,salinity,"geographic location (country and/or sea,region)",target gene,geo-loc-name,investigation type
0,ERS5150811,14.34,SAMEA7392422,40.8,ERS5150811,surface water sample from the Tyrrhenian Sea,marine biome [ENVO:00000447],microbial community [PCO:1000004],sea water [ENVO:00002149],32513:f016dd88-3b2f-48a1-b740-b5a51edf1232,...,,,,,,,,,,
1,ERS5150815,-3.89,SAMEA7392426,48.67,ERS5150815,surface water sample from the English Channel,estuarine biome [ENVO:01000020],microbial community [PCO:1000004],sea water [ENVO:00002149],32509:f016dd88-3b2f-48a1-b740-b5a51edf1232,...,,,,,,,,,,
2,ERS5150818,-9.31,SAMEA7392429,53.24,ERS5150818,surface water sample from the North Atlantic O...,marine biome [ENVO:00000447],microbial community [PCO:1000004],sea water [ENVO:00002149],32506:f016dd88-3b2f-48a1-b740-b5a51edf1232,...,,,,,,,,,,
3,ERS5150819,-60.0,SAMEA7392430,43.94,ERS5150819,surface water sample from the Northwest Atlant...,neritic epipelagic zone biome [ENVO:01000042],microbial community [PCO:1000004],sea water [ENVO:00002149],32505:f016dd88-3b2f-48a1-b740-b5a51edf1232,...,,,,,,,,,,
4,ERS5150823,12.58,SAMEA7392434,55.73,ERS5150823,surface water sample from the Baltic Sea,marine biome [ENVO:00000447],microbial community [PCO:1000004],sea water [ENVO:00002149],32501:f016dd88-3b2f-48a1-b740-b5a51edf1232,...,,,,,,,,,,


In [81]:
# Persist the combined MGnify metadata as a parquet file in the data folder
mgnify_metadata_path = os.path.join(data_folder, 'mgnify_metadata.parquet')
metadata.to_parquet(mgnify_metadata_path, index=False)

### ENA: EMO-BON project name is PRJEB51688
- This project has to be queried at ENA because it is not available on MGnify.


In [128]:
# Step 1: Get all available fields for 'read_run'
fields_url = "https://www.ebi.ac.uk/ena/portal/api/returnFields"
fields_params = {
    "result": "read_run",
    "format": "json"
}
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
fields_response = requests.get(fields_url, params=fields_params, timeout=60)
# Stop here on an HTTP error instead of failing later while parsing the body.
fields_response.raise_for_status()
fields = [f["columnId"] for f in fields_response.json()]

# Step 2: Search the 'read_run' records of the study, requesting every field
base_url = "https://www.ebi.ac.uk/ena/portal/api/search"

# Define query parameters
params = {
    "result": "read_run",  # result type
    "query": '"study_accession=PRJEB51688"',  # search condition
    "fields": ",".join(fields),
    # NOTE(review): caps the result at 1000 rows; confirm the study never exceeds that.
    "limit": 1000,  # limit results
    "format": "json"  # return as JSON
}

# Send GET request
response = requests.get(base_url, params=params, timeout=120)

# Fail fast: merely printing the error would leave `df` undefined (or stale
# from an earlier run), which the following cells would then pick up.
if not response.ok:
    raise RuntimeError(f"Error: {response.status_code} {response.text}")

data = response.json()
df = pd.DataFrame(data)

In [130]:
import numpy as np
# Turn empty strings into NaN, then drop the columns that hold no value at all
df = df.replace("", np.nan)
df_cleaned = df.dropna(axis="columns", how="all")

In [131]:
# Column overview of the ENA table after dropping the all-empty columns
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 78 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   run_accession                      188 non-null    object
 1   bam_aspera                         188 non-null    object
 2   bam_bytes                          188 non-null    object
 3   bam_ftp                            188 non-null    object
 4   bam_galaxy                         188 non-null    object
 5   bam_md5                            188 non-null    object
 6   base_count                         188 non-null    object
 7   broad_scale_environmental_context  188 non-null    object
 8   center_name                        188 non-null    object
 9   checklist                          188 non-null    object
 10  collection_date                    188 non-null    object
 11  collection_date_end                188 non-null    object
 12  collecti

In [134]:
# Inspect the 'tax_lineage' column of the cleaned ENA table
df_cleaned['tax_lineage']

0       1;2787823;12908;408169;410657;412755
1      1;2787823;12908;408169;410657;1874687
2      1;2787823;12908;408169;410657;1874687
3      1;2787823;12908;408169;410657;1874687
4      1;2787823;12908;408169;410657;1874687
                       ...                  
183    1;2787823;12908;408169;410657;1874687
184    1;2787823;12908;408169;410657;1874687
185     1;2787823;12908;408169;410657;412755
186     1;2787823;12908;408169;410657;412755
187     1;2787823;12908;408169;410657;412755
Name: tax_lineage, Length: 188, dtype: object

### Merge the two
- Check which column names the two tables have in common.

In [135]:
# Column names present in both the MGnify and the ENA table
set(metadata.columns) & set(df_cleaned.columns)

{'depth', 'temperature'}