In [49]:
from mgnify_oop import AbundanceTable, TaxonomyTable

from pathlib import Path
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Tab-separated taxonomy abundance file, located relative to the user's home directory.
mgnify_tsv = Path.home() / "coding/emo-bon/momics-demos/data/mgnify_data/ERP003634_taxonomy_abundances_SSU_v5.0.tsv"
mgnify = AbundanceTable.from_csv(mgnify_tsv, sep="\t")

INFO:mgnify_oop:Detected source as 'mgnify' based on data format.
INFO:mgnify_oop:raw MGNify data -> standardizing.
INFO:mgnify_oop:raw MGNify data -> standardizing.


In [61]:
mgnify.df

source material ID,ERR2010979,ERR315850,ERR315851,ERR315852,ERR315853,ERR315854,ERR315855
taxonomic_concat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sk__Archaea,4,0,0,0,0,0,0
sk__Archaea;k__;p__Euryarchaeota;c__Candidatus_Poseidoniia,295,0,0,0,0,0,0
sk__Archaea;k__;p__Euryarchaeota;c__Thermoplasmata,261,0,0,0,0,0,0
sk__Archaea;k__;p__Thaumarchaeota;c__;o__Nitrosopumilales;f__Nitrosopumilaceae,3,0,0,0,0,0,0
sk__Bacteria,939,202,962,741,669,327,424
...,...,...,...,...,...,...,...
sk__Eukaryota;k__;p__;c__Polycystinea;o__Spumellaria,3,0,0,0,0,0,0
sk__Eukaryota;k__;p__Bacillariophyta,1,0,0,0,0,0,0
sk__Eukaryota;k__Metazoa;p__Arthropoda;c__Malacostraca,0,1,0,0,0,0,0
sk__Eukaryota;k__Metazoa;p__Chordata;c__Appendicularia;o__Copelata,1,0,0,0,0,0,0


In [66]:
len(mgnify.df), mgnify.df.columns

(497,
 Index(['ERR2010979', 'ERR315850', 'ERR315851', 'ERR315852', 'ERR315853',
        'ERR315854', 'ERR315855'],
       dtype='object', name='source material ID'))

In [63]:
# Read the tab-separated abundance file with plain pandas (no index column is set).
raw_tsv_path = Path.home() / "coding/emo-bon/momics-demos/data/mgnify_data/ERP003634_taxonomy_abundances_SSU_v5.0.tsv"
df = pd.read_csv(raw_tsv_path, sep="\t")

In [64]:
df.index

RangeIndex(start=0, stop=497, step=1)

In [65]:
df.columns

Index(['#SampleID', 'ERR2010979', 'ERR315850', 'ERR315851', 'ERR315852',
       'ERR315853', 'ERR315854', 'ERR315855'],
      dtype='object')

In [8]:
df['#SampleID']

0                                            sk__Archaea
1      sk__Archaea;k__;p__Euryarchaeota;c__Candidatus...
2      sk__Archaea;k__;p__Euryarchaeota;c__Thermoplas...
3      sk__Archaea;k__;p__Thaumarchaeota;c__;o__Nitro...
4                                           sk__Bacteria
                             ...                        
492    sk__Eukaryota;k__;p__;c__Polycystinea;o__Spume...
493                 sk__Eukaryota;k__;p__Bacillariophyta
494    sk__Eukaryota;k__Metazoa;p__Arthropoda;c__Mala...
495    sk__Eukaryota;k__Metazoa;p__Chordata;c__Append...
496                       sk__Eukaryota;k__Viridiplantae
Name: #SampleID, Length: 497, dtype: object

In [9]:
def is_mgnify_raw(df: pd.DataFrame) -> bool:
    """Return True when ``df`` has the layout of a raw (unprocessed) MGnify table.

    A frame qualifies only if all of the following hold:

    * it has a ``#SampleID`` column,
    * its index is a default ``pd.RangeIndex``,
    * the ``#SampleID`` values are unique, and
    * every ``#SampleID`` value is a string starting with ``"sk__"``.

    Args:
        df: The table to inspect. It is not modified.

    Returns:
        ``True`` if every condition above is met, ``False`` otherwise. The
        function never raises for a missing or non-string ``#SampleID`` column;
        those cases simply yield ``False``.
    """
    # Check for the column first: indexing a missing column raises KeyError,
    # which would turn a format probe into a crash on any other table format.
    if "#SampleID" not in df.columns:
        return False
    if not isinstance(df.index, pd.RangeIndex):
        return False

    sample_ids = df["#SampleID"]
    if not sample_ids.is_unique:
        return False

    try:
        # na=False makes missing entries count as "does not start with sk__"
        # instead of propagating NaN into the all() reduction.
        has_sk_prefix = sample_ids.str.startswith("sk__", na=False)
    except AttributeError:
        # The .str accessor is unavailable on non-string columns (e.g. numeric IDs).
        return False
    return bool(has_sk_prefix.all())

In [10]:
is_mgnify_raw(df)

True

In [11]:
df['#SampleID'].is_unique

True

In [12]:
bool(df['#SampleID'].str.startswith("sk__").all())

True

## Loading EMO BON data

In [13]:
import os

In [14]:
import warnings
import holoviews as hv
from skbio.stats.ordination import pcoa

warnings.filterwarnings('ignore')

import pandas as pd
import panel as pn

# from mgo.udal import UDAL

# All low level functions are imported from the momics package
import momics.plotting as pl
from momics.panel_utils import tax_finder_selector

from momics.diversity import (
    beta_diversity_parametrized,
    find_taxa_in_table,
)
from momics.utils import load_and_clean

In [15]:
# Absolute path of the folder three levels above the current working directory.
root_folder = os.path.abspath('../../../')
# Sub-folder of the root named 'assets'.
assets_folder = os.path.join(root_folder, 'assets')

def get_valid_samples(csv_path: "str | os.PathLike | None" = None) -> pd.DataFrame:
    """Load the table of valid samples from a CSV file.

    Args:
        csv_path: Location of the CSV to read. When omitted, the function
            reads ``data/shipment_b1b2_181.csv`` under the notebook-level
            ``root_folder``, exactly as it did before this parameter existed.

    Returns:
        The CSV contents as parsed by ``pd.read_csv`` with default options.
    """
    if csv_path is None:
        # Resolve the default at call time so that callers passing an explicit
        # path do not depend on the global ``root_folder`` being defined.
        csv_path = os.path.join(root_folder, 'data/shipment_b1b2_181.csv')
    return pd.read_csv(csv_path)

valid_samples = get_valid_samples()

In [16]:
# High level function from the momics.utils module
# load_and_clean is imported from momics.utils; it yields the metadata table
# together with a mapping of data tables (indexed by keys such as 'ssu' below).
full_metadata, mgf_parquet_dfs = load_and_clean(valid_samples=valid_samples)

# Sorted names of the metadata columns whose dtype is object or boolean.
categorical_metadata = full_metadata.select_dtypes(include=['object', "boolean"])
categorical_columns = sorted(categorical_metadata.columns)

# Sorted names of the metadata columns whose dtype is int64 or float64.
numerical_metadata = full_metadata.select_dtypes(include=['int64', 'float64'])
numerical_columns = sorted(numerical_metadata.columns)

In [17]:
ssu= mgf_parquet_dfs['ssu']

In [18]:
ssu.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,abundance,superkingdom,kingdom,phylum,class,order,family,genus,species
source material ID,ncbi_tax_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
EMOBON_BPNS_So_5,2157,7.0,Archaea,,,,,,,
EMOBON_BPNS_So_5,1801616,1.0,Archaea,,Candidatus_Woesearchaeota,,,,,
EMOBON_BPNS_So_5,28890,1.0,Archaea,,Euryarchaeota,,,,,
EMOBON_BPNS_So_5,183968,1.0,Archaea,,Euryarchaeota,Thermococci,,,,
EMOBON_BPNS_So_5,192989,3.0,Archaea,,Nanoarchaeota,,,,,


In [19]:
isinstance(ssu.index, pd.MultiIndex)

True

In [42]:
s = TaxonomyTable(ssu)
s.source

INFO:mgnify_oop:Detected source as 'emobon_processed' based on data format.


'emobon_processed'

In [43]:
reset = s.df.reset_index()
reset.columns

Index(['source material ID', 'ncbi_tax_id', 'abundance', 'superkingdom',
       'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
      dtype='object')

In [44]:
abundance_table = s.to_abundance_table()

ERROR:mgnify_oop:Error checking MGNify raw format: '#SampleID'


In [45]:
abundance_table.df.head()

Unnamed: 0_level_0,source material ID,EMOBON_AAOT_Wa_1,EMOBON_AAOT_Wa_2,EMOBON_AAOT_Wa_22,EMOBON_AAOT_Wa_26,EMOBON_AAOT_Wa_27,EMOBON_AAOT_Wa_41,EMOBON_AAOT_Wa_42,EMOBON_AAOT_Wa_46,EMOBON_AAOT_Wa_47,EMOBON_AAOT_Wa_6,...,EMOBON_VB_Wa_4,EMOBON_VB_Wa_41,EMOBON_VB_Wa_42,EMOBON_VB_Wa_43,EMOBON_VB_Wa_44,EMOBON_VB_Wa_5,EMOBON_VB_Wa_93,EMOBON_VB_Wa_94,EMOBON_VB_Wa_96,EMOBON_VB_Wa_97
ncbi_tax_id,taxonomic_concat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2,2;sk__Bacteria;k__;p__;c__;o__;f__;g__;s__;,1256,1783,1920,895,536,848,461,534,304,1172,...,765,769,1106,1449,996,978,456,552,1908,1120
6,6;sk__Bacteria;k__;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Xanthobacteraceae;g__Azorhizobium;s__;,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,10;sk__Bacteria;k__;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Cellvibrionaceae;g__Cellvibrio;s__;,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,16;sk__Bacteria;k__;p__Proteobacteria;c__Betaproteobacteria;o__Nitrosomonadales;f__Methylophilaceae;g__Methylophilus;s__;,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,18;sk__Bacteria;k__;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfuromonadales;f__Desulfuromonadaceae;g__Pelobacter;s__;,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
abundance_table.has_tax_ids

True

In [54]:
mgnify.df.columns

Index(['ERR2010979', 'ERR315850', 'ERR315851', 'ERR315852', 'ERR315853',
       'ERR315854', 'ERR315855'],
      dtype='object')

In [56]:
# Label the column axis "source material ID", the same name carried by the
# columns Index of `abundance_table.df`, so the two tables line up.
# NOTE(review): this mutates `mgnify.df` in place, and the later re-load of
# `mgnify` already shows this name in its columns output, so this cell may now
# be redundant; confirm before keeping it.
mgnify.df.columns.name = "source material ID"

In [57]:
mgnify.df.columns

Index(['ERR2010979', 'ERR315850', 'ERR315851', 'ERR315852', 'ERR315853',
       'ERR315854', 'ERR315855'],
      dtype='object', name='source material ID')

In [58]:
len(mgnify.df.columns)

7

In [59]:
abundance_table.df.columns[:5]

Index(['EMOBON_AAOT_Wa_1', 'EMOBON_AAOT_Wa_2', 'EMOBON_AAOT_Wa_22',
       'EMOBON_AAOT_Wa_26', 'EMOBON_AAOT_Wa_27'],
      dtype='object', name='source material ID')