# Taxonomy tables 

Make a table that has rows as taxa, columns as samples, and values as abundance. \
Because the taxonomic classification is hierarchical we need to sum abundance from the rank and all lower ranks. \
Let's say we had this: 


| Order     | Family      | Genus     | Species     | Abundance| 
| ----------| ------------|-----------| ------------|----------|   
| Bryales   | Bryaceae    | Bryum     | capillare   | 10       |
| Bryales   | Bryaceae    | Bryum     |             | 23       |
| Bryales   | Bryaceae    |           |             | 45       |
| Bryales   |             |           |             | 123      |  

So the abundances we want to record for each rank are: \
Order   Bryales   123+45+23+10 \
Family  Bryaceae  45+23+10 \
Genus   Bryum     23+10 \
Species capillare 10

Do this for all named taxa at all ranks

Also normalise data within each Superkingdom 

## Load the tables

In [1]:
import os
import io
import math
import duckdb
import numpy
from duckdb import CatalogException, BinderException
import pandas as pd
from pandasql import sqldf
from minio import Minio, S3Error
from timeit_decorator import timeit
    
# MinIO client shared by every download in this notebook.
# NOTE(review): the endpoint and credentials used to be hard-coded only. They can
# now be supplied through the environment; the literals below remain solely as
# backward-compatible defaults and should be rotated and removed from the notebook.
client = Minio(
    os.environ.get("EMOBON_MINIO_ENDPOINT", "10.4.1.4:9000"),
    secure=False,
    access_key=os.environ.get("EMOBON_MINIO_ACCESS_KEY", "PapyfVxlHhHD63nJnB0W"),
    secret_key=os.environ.get(
        "EMOBON_MINIO_SECRET_KEY", "7Z2q9uD44CS2HEBGhbvJi0nhLcvjgffZwG9wqJ1j"
    ),
)

def get_object(bucket_name, file_format, file_name, verbose=True):
    """Download one object through the module-level ``client`` into a DataFrame.

    Args:
        bucket_name: Bucket to read from.
        file_format: ``"parquet"`` or ``"csv"``; selects the pandas reader.
        file_name: Name (key) of the object inside the bucket.
        verbose: When true, print the request and a confirmation message.

    Returns:
        The object's contents as a ``pandas.DataFrame``.

    Raises:
        ValueError: If ``file_format`` is neither ``"parquet"`` nor ``"csv"``.
            Raised before anything is downloaded.
        Exception: Any error raised by the client while fetching or reading the
            object propagates unchanged.
    """
    if verbose:
        print(f"{bucket_name=} - {file_format=} - {file_name=}")

    readers = {
        "parquet": lambda buf: pd.read_parquet(buf, engine='pyarrow'),
        "csv": pd.read_csv,
    }
    if file_format not in readers:
        raise ValueError(f"Unknown {file_format=}")

    response = client.get_object(bucket_name, file_name)
    try:
        buffer = io.BytesIO(response.read())
    finally:
        # Cleanup only: runs whether or not read() succeeded, and only once a
        # response actually exists, so a failed download is never masked.
        response.close()
        response.release_conn()

    df = readers[file_format](buffer)
    if verbose:
        print(f"Downloaded {file_name} into dataframe")
    return df

In [3]:
# MGF parquet tables to dataframes
# Every object listed in the bucket is downloaded and parsed as parquet.
bucket_name = "emo-bon-tables"
objects = client.list_objects(bucket_name, recursive=True)
mgf_parquet_dfs = {}
for obj in objects:
    # Key = second-to-last dot-separated piece of the object name
    # (e.g. "LSU" for "LSU.parquet"); it is reused below as the DuckDB table name.
    # NOTE(review): an object name without any "." would raise IndexError here.
    name = obj.object_name.split(".")[-2]
    df = get_object(bucket_name, "parquet", obj.object_name, verbose=False)
    mgf_parquet_dfs[name] = df

# Sample metadata
# Get the latest Batch combined logsheets file
# Remember we are downloading from MinIO
batch_file = "Batch1and2_combined_logsheets_2024-09-06.csv"
# The name is first bound to the (bucket, format, file) argument tuple and then
# rebound to the downloaded DataFrame.
sample_metadata = ("emo-bon-metadata-tables", "csv", batch_file)
sample_metadata = get_object(*sample_metadata, verbose=False)

# Observatory metadata - from the GoogleSheets
# Same pattern: argument tuple first, then the downloaded DataFrame.
observatory_metadata = ("emo-bon-metadata-tables", "csv", "Observatory_combined_logsheets_validated.csv")
observatory_metadata = get_object(*observatory_metadata, verbose=False)

# Into duckdb
# Drop each table independently so the cell can be re-run from any state: with a
# single try/except around all DROPs, the first missing table skipped the remaining
# drops and the CREATE TABLE statements below then failed on tables that still existed.
# The tables are dropped (rather than CREATE OR REPLACE) on purpose: the table names
# SAMPLE_METADATA / OBS_METADATA differ only in case from the DataFrame variables, so
# the old table must be gone for "FROM sample_metadata" to pick up the DataFrame.
for table_name in ["SAMPLE_METADATA", "OBS_METADATA", *mgf_parquet_dfs]:
    duckdb.sql(f"DROP TABLE IF EXISTS {table_name}")

duckdb.sql("CREATE TABLE SAMPLE_METADATA AS SELECT * FROM sample_metadata")
# NOTE(review): the COUNT(*) results are not shown mid-cell; only the last
# expression of a notebook cell is echoed.
duckdb.sql("SELECT COUNT(*) FROM SAMPLE_METADATA")
duckdb.sql("CREATE TABLE OBS_METADATA AS SELECT * FROM observatory_metadata")
duckdb.sql("SELECT COUNT(*) FROM OBS_METADATA")
for table_name in mgf_parquet_dfs:
    # `df` must stay a variable with exactly this name: the SQL text refers to it.
    df = mgf_parquet_dfs[table_name]
    cmd = f"CREATE TABLE {table_name} AS SELECT * FROM df"
    duckdb.sql(cmd)

duckdb.sql("SHOW TABLES")


┌─────────────────┐
│      name       │
│     varchar     │
├─────────────────┤
│ LSU             │
│ OBS_METADATA    │
│ SAMPLE_METADATA │
│ SSU             │
│ go              │
│ go_slim         │
│ ips             │
│ ko              │
│ pfam            │
└─────────────────┘

## Sum abundances per taxon in LSU, pivot, and take sqrt()

In [4]:
# Need to load imports and LSU table above

# TODO: split into superkingdoms

# The rank name "order" interferes with ORDER being a reserved word in SQL, so the column is renamed
RANKS = ["superkingdom", "kingdom", "phylum", "class", "order_rank", "family", "genus", "species"]

# Rename the 'order' column on first read; on a re-run it is already called order_rank
try:
    duckdb.sql("ALTER TABLE LSU RENAME COLUMN 'order' TO order_rank")
except BinderException as e:
    # Only the "column 'order' no longer exists" case is expected here. Match on the
    # column name instead of asserting the full message: asserts are stripped under
    # -O and the exact wording may vary between DuckDB versions. Anything else is a
    # real error and is re-raised.
    if '"order"' not in str(e):
        raise
        
# Build one wide table per rank: rows are taxa, columns are samples (ref_code),
# values are the abundance summed over all LSU rows that carry that taxon name.
dfs = {}
for rank in RANKS:
    QUERY = f"""
    PIVOT (
    SELECT
    ref_code,
    {rank},
    sum(abundance) as sum_abundance,
    FROM LSU 
    WHERE {rank} <> '' AND {rank} IS NOT NULL
    GROUP BY {rank}, ref_code
    ORDER BY {rank}, sum(abundance) DESC
    )
    ON ref_code
    USING sum(sum_abundance)
    ORDER BY {rank}
    """
    df = duckdb.sql(QUERY).to_df()
    # The rank is deliberately kept as an ordinary first column (not the index):
    # Method1/Method2 below read it as df[rank] and skip it by column name. The
    # previous bare df.set_index(rank) call discarded its result and did nothing.
    dfs[rank] = df

df_sk = dfs['superkingdom']
print("\nData frame superkingdom with summed values:")
print(sqldf("SELECT * FROM df_sk"))

#Normalisation before sqrtabs


# Method 1 - vectorization which is supposed to be quicker
# Take the sqrt of each value using apply() which IS NOT in-place
@timeit() # doesnt seem to work in Jupyter NB
def Method1(dfs):
    """Return new per-rank tables whose abundance values are square-rooted.

    ``dfs`` maps a rank name to a DataFrame. The column named after the rank is
    copied as-is; every column after the first is passed through ``numpy.sqrt``.
    The input frames are not modified.
    """
    sqrt_dfs = {}
    for rank in dfs:
        table = dfs[rank]
        print(f"Method 1: Converting data frame to sqrt(): {rank}")
        taxon_names = table[rank]
        rooted_values = table.iloc[:, 1:].apply(numpy.sqrt, axis=1)
        # axis=1 puts the two pieces side by side, i.e. joins them as columns
        sqrt_dfs[rank] = pd.concat([taxon_names, rooted_values], axis=1)
    return sqrt_dfs
# Must run before Method2: Method1 builds new frames from dfs, whereas Method2
# overwrites the values inside dfs in place.
sqrt_dfs = Method1(dfs)

# Method 2 - iterating over values, supposed to be slow
# Take the sqrt of each value in each rank dataframe in-place
@timeit()
def Method2(dfs):
    """Square-root every abundance value in place, one cell at a time.

    ``dfs`` maps a rank name to a DataFrame. The column named after the rank is
    left alone; each value in every other column is replaced by its
    ``numpy.sqrt``. The same dict (holding the same, now modified, frames) is
    returned.
    """
    for rank, table in dfs.items():
        print(f"Method 2: Converting data frame to sqrt(): {rank}")
        value_columns = [column for column in table if column != rank]
        for column in value_columns:
            for label, value in table[column].items():
                table.loc[label, column] = numpy.sqrt(value)
    return dfs
dfs = Method2(dfs)

# The labels were previously swapped: sqrt_dfs holds the Method1 result, while dfs
# holds the frames that Method2 modified in place.
# sqldf refers to the frames by variable name, so df_sk / sqrt_sk must stay bound
# under exactly these names.
print("\nMethod 1: Data superkingdom frame with sqrt values:")
sqrt_sk = sqrt_dfs['superkingdom']
print(sqldf("SELECT * FROM sqrt_sk"))

print("\nMethod 2: Data superkingdom frame with sqrt values:")
df_sk = dfs['superkingdom']
print(sqldf("SELECT * FROM df_sk"))


Data frame superkingdom with summed values:
  superkingdom  EMOBON00001  EMOBON00084  EMOBON00085  EMOBON00088  \
0      Archaea        620.0        366.0        267.0        627.0   
1     Bacteria      16850.0      10329.0      30259.0      13515.0   
2    Eukaryota        814.0        368.0        125.0       1068.0   

   EMOBON00089  EMOBON00092  EMOBON00093  EMOBON00094  EMOBON00095  ...  \
0        730.0        616.0         55.0       1432.0        710.0  ...   
1      13608.0      19502.0      20991.0      23520.0      12633.0  ...   
2        778.0        730.0       1257.0        662.0        322.0  ...   

   EMOBON00224  EMOBON00225  EMOBON00226  EMOBON00227  EMOBON00236  \
0        335.0        552.0        645.0        598.0       1403.0   
1      15351.0      22638.0      26152.0      26993.0      26645.0   
2      12376.0       1900.0        834.0       1202.0        976.0   

   EMOBON00237  EMOBON00238  EMOBON00239  EMOBON00242  EMOBON00243  
0       1203.0       13