In [1]:
import pandas as pd
from os import listdir
import numpy as np

In [2]:
def get_taxa_ranks(classification, taxa_rank="all"):
    """Split a ';'-separated lineage string into its taxonomic ranks.

    Each token is assumed to look like ``d__Bacteria``: the first character
    is the one-letter rank code and the name starts at the fourth character
    (the name may be empty, e.g. ``s__``).

    Parameters
    ----------
    classification : str
        Lineage string, e.g. ``"d__Bacteria;p__Bacteroidota;...;s__"``.
    taxa_rank : str, default "all"
        ``"all"`` returns every parsed rank; otherwise one of ``"domain"``,
        ``"phylum"``, ``"class"``, ``"order"``, ``"family"``, ``"genus"``,
        ``"species"``.

    Returns
    -------
    dict or str
        With ``taxa_rank="all"``: a dict mapping the one-letter rank code to
        the taxon name. Otherwise the taxon name for the requested rank.

    Raises
    ------
    AssertionError
        If ``taxa_rank`` is not ``"all"`` or one of the supported rank names.
    KeyError
        If the requested rank does not occur in ``classification``.
    """
    mapping = {
        "domain": 'd',
        "phylum": 'p',
        "class": 'c',
        "order": 'o',
        "family": 'f',
        "genus": 'g',
        "species": 's',
    }

    parsed = {}
    for token in classification.split(";"):
        token = token.strip()
        if not token:
            # A trailing or doubled ';' yields an empty token; skip it rather
            # than failing on token[0].
            continue
        # token[0] is the rank letter, token[3:] drops the "x__" prefix.
        parsed[token[0]] = token[3:]

    if taxa_rank == "all":
        return parsed

    # Every named rank resolves through the same lookup, so no per-rank branch.
    assert taxa_rank in mapping, f"unknown taxa_rank: {taxa_rank!r}"
    return parsed[mapping[taxa_rank]]
        

In [3]:
# Sample identifiers are "<site><depth>4": site is C, M or F and depth is
# C or D, giving CC4, CD4, MC4, MD4, FC4, FD4 in that order.
samples = [f"{site}{depth}4" for site in "CMF" for depth in "CD"]

In [26]:
# example = pd.read_csv(f"../results_coassembly_megahit/gtdbtk_results/CC4/CC4_bacteria.csv")
# for i in example.Classification:
#     print(i)

In [27]:
# Root directory holding one sub-folder of classification CSVs per sample.
base_folder = "../from_kbase/coassembly_megahit/gtdbtk_results/"

In [5]:
# Taxonomic ranks handled by this notebook, from broadest to most specific.
taxa_ranks = [
    "domain",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]

In [6]:
def _load_classification_table(csv_path):
    """Read one classification CSV and add a bin label plus one column per rank.

    The CSV must provide the "User Genome" and "Classification" columns.
    """
    table = pd.read_csv(csv_path)
    # Assumes "User Genome" looks like "<prefix>.<number>..."; the second
    # '.'-separated field is used as the bin number -- TODO confirm format.
    table["bins"] = table["User Genome"].map(lambda genome: f'Bin{genome.split(".")[1]}')
    for rank in taxa_ranks:
        # rank=rank binds the current rank to the lambda at definition time.
        table[rank] = table["Classification"].map(
            lambda lineage, rank=rank: get_taxa_ranks(lineage, rank)
        )
    return table


# rank -> {sample -> DataFrame with columns [rank, "bins"]}, where "bins" is
# the number of bins assigned to each taxon of that rank in that sample.
DF_TAXA_RANKS = {rank: {} for rank in taxa_ranks}

for s in samples:
    sample_dir = f"{base_folder}/{s}"
    # The archaea table is optional; the bacteria table is always expected.
    has_archaea = f"{s}_archaea.csv" in listdir(sample_dir)

    bac_df = _load_classification_table(f"{sample_dir}/{s}_bacteria.csv")

    if has_archaea:
        arc_df = _load_classification_table(f"{sample_dir}/{s}_archaea.csv")
        final = pd.concat([bac_df, arc_df], ignore_index=True)
    else:
        final = bac_df

    for tr in taxa_ranks:
        # Count bins per taxon; reset_index turns the taxon back into a column.
        DF_TAXA_RANKS[tr][s] = final.loc[:, [tr, "bins"]].groupby(by=tr).count().reset_index()



In [7]:
# Spot-check: number of bins per phylum for sample CC4.
DF_TAXA_RANKS["phylum"]["CC4"]

Unnamed: 0,phylum,bins
0,Acidobacteriota,8
1,Actinomycetota,7
2,Armatimonadota,3
3,Bacteroidota,29
4,Bdellovibrionota,2
5,Chloroflexota,4
6,Deinococcota,2
7,Gemmatimonadota,5
8,Myxococcota,4
9,Myxococcota_A,1


In [8]:
# Spot-check: number of bins per phylum for sample CC4 (repeats the inspection above).
DF_TAXA_RANKS["phylum"]["CC4"]

Unnamed: 0,phylum,bins
0,Acidobacteriota,8
1,Actinomycetota,7
2,Armatimonadota,3
3,Bacteroidota,29
4,Bdellovibrionota,2
5,Chloroflexota,4
6,Deinococcota,2
7,Gemmatimonadota,5
8,Myxococcota,4
9,Myxococcota_A,1


In [9]:
def adapt_df_for_PCA(df, pivot):
    """Transpose a taxa-by-sample table into samples-by-taxa and add sample metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per taxon. The first column is named ``pivot`` and holds the
        taxon names; every other column is named after a sample
        (e.g. ``"CC4"``) and holds that sample's counts.
    pivot : str
        Name of the taxon column in ``df``. After transposition the column
        with this name holds the sample names.

    Returns
    -------
    pandas.DataFrame
        One row per sample (index starting at 1), one column per taxon, plus
        ``sample``, ``sample_ids``, ``macro_sample``, ``macro_sample_ids``,
        ``macro_sample_location`` and ``coast_deep_shapes``.

    Raises
    ------
    KeyError
        If a sample name or its first letter is missing from the lookup
        tables below.
    """
    sample_id = {
        "CC4": 0,
        "CD4": 1,
        "MC4": 2,
        "MD4": 3,
        "FC4": 4,
        "FD4": 5
    }

    macro_sample_id = {
        "C": 0,
        "M": 1,
        "F": 2,
    }

    macro_sample_location = {
        "C": "Suna Canottieri",
        "M": "Teatro Maggiore",
        "F": "Fondo Toce",
    }

    # After the transpose each sample is a row; reset_index moves the old
    # column labels (pivot name + sample names) into a regular first column.
    transposed = df.T.reset_index()
    transposed = transposed.replace(np.nan, 0, regex=True)

    # Row 0 now holds the pivot name followed by the taxon names: use it as
    # the header and keep the remaining rows as data.
    new_header = transposed.iloc[0]
    # .copy() makes the result an independent frame, so the column
    # assignments below do not write into a slice (SettingWithCopyWarning).
    newdf = transposed.iloc[1:].copy()
    newdf.columns = new_header

    # Sample name is the part of the label before the first underscore.
    sample_names = newdf[pivot].map(lambda label: label.split("_")[0])
    newdf["sample"] = sample_names
    # Explicit indexing (not dict-map) so unknown samples raise KeyError.
    newdf["sample_ids"] = sample_names.map(lambda name: sample_id[name])

    # First letter of the label identifies the sampling site.
    site_codes = newdf[pivot].map(lambda label: label[0])
    newdf["macro_sample"] = site_codes
    newdf["macro_sample_ids"] = site_codes.map(lambda code: macro_sample_id[code])
    newdf["macro_sample_location"] = site_codes.map(lambda code: macro_sample_location[code])
    # Second letter of the sample name: "C" means coast, anything else deep.
    newdf["coast_deep_shapes"] = sample_names.map(
        lambda name: "coast" if name[1] == "C" else "deep"
    )

    return newdf

In [10]:
def merge_results_by_taxa(DF_TAXA_RANKS, taxa_level):
    """Combine the per-sample count tables of one rank into a single table.

    Parameters
    ----------
    DF_TAXA_RANKS : dict
        ``DF_TAXA_RANKS[taxa_level]`` maps each sample name to a DataFrame
        with a ``taxa_level`` column (taxon name) and a ``bins`` column
        (count). The input frames are left untouched.
    taxa_level : str
        One of "domain", "phylum", "class", "order", "family", "genus",
        "species".

    Returns
    -------
    pandas.DataFrame
        The result of ``adapt_df_for_PCA`` applied to the merged
        taxa-by-sample table, with missing counts filled with 0.

    Raises
    ------
    AssertionError
        If ``taxa_level`` is not one of the supported rank names.
    """
    assert taxa_level in ["domain", "phylum", "class", "order", "family", "genus", "species"], \
        f"unknown taxa_level: {taxa_level!r}"

    # Label empty / whitespace-only classifications. replace() returns new
    # frames, so the caller's tables are not modified.
    cleaned = {
        sample: counts.replace(r'^\s*$', "unclassified_", regex=True)
        for sample, counts in DF_TAXA_RANKS[taxa_level].items()
    }

    # Sorted union of every taxon seen in any sample: one row per taxon.
    unique_taxa = sorted({taxon for counts in cleaned.values() for taxon in counts[taxa_level]})
    merged = pd.DataFrame(unique_taxa, columns=[taxa_level])

    # Left-join each sample's counts as a column named after the sample;
    # taxa absent from a sample come out as NaN and are zero-filled below.
    for sample, counts in cleaned.items():
        merged = pd.merge(
            merged,
            counts.rename(columns={"bins": sample}),
            how="left",
            on=taxa_level,
        )

    return adapt_df_for_PCA(merged.fillna(0), pivot=taxa_level)
    

# Domain

In [11]:
# Merge the per-sample domain counts into one sample-by-domain table and display it.
domain_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="domain")
domain_merged

Unnamed: 0,domain,Archaea,Bacteria,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,2.0,95.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,0.0,21.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,16.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,3.0,28.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,4.0,71.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,17.0,FD4,5,F,2,Fondo Toce,deep


# Phylum

In [18]:
# Merge the per-sample phylum counts, export them as TSV, then display the table.
phylum_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="phylum")
phylum_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/phylum.tsv",
    sep="\t",
    header=True,
    index=False,
)
phylum_merged

Unnamed: 0,phylum,Acidobacteriota,Actinomycetota,Armatimonadota,Bacteroidota,Bdellovibrionota,Chloroflexota,Cyanobacteriota,Deinococcota,Desulfobacterota,...,Thermoplasmatota,Thermoproteota,Verrucomicrobiota,Zixibacteria,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,8.0,7.0,3.0,29.0,2.0,4.0,0.0,2.0,0.0,...,0.0,2.0,7.0,0.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,3.0,3.0,0.0,9.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,3.0,1.0,0.0,4.0,0.0,2.0,0.0,0.0,5.0,...,2.0,0.0,0.0,1.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,5.0,3.0,2.0,26.0,3.0,2.0,1.0,1.0,0.0,...,2.0,2.0,6.0,0.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,1.0,1.0,0.0,8.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,FD4,5,F,2,Fondo Toce,deep


# Class

In [19]:
# Merge the per-sample class counts, export them as TSV, then display the table.
class_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="class")
class_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/class.tsv",
    sep="\t",
    header=True,
    index=False,
)
class_merged

Unnamed: 0,class,Abditibacteria,Acidimicrobiia,Actinomycetia,Alphaproteobacteria,Aminicenantia,Anaerolineae,Armatimonadia,Bacteroidia,Bdellovibrionia,...,UBA9160,Verrucomicrobiae,Vicinamibacteria,WYAZ01,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,1.0,6.0,0.0,10.0,0.0,2.0,1.0,28.0,0.0,...,1.0,7.0,1.0,1.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,0.0,1.0,1.0,3.0,0.0,1.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,0.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,...,1.0,0.0,0.0,0.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,0.0,1.0,0.0,3.0,1.0,2.0,0.0,2.0,0.0,...,0.0,0.0,2.0,0.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,0.0,2.0,0.0,5.0,0.0,1.0,1.0,24.0,3.0,...,0.0,6.0,0.0,0.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,1.0,0.0,0.0,FD4,5,F,2,Fondo Toce,deep


# Order

In [20]:
# Merge the per-sample order counts, export them as TSV, then display the table.
order_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="order")
order_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/order.tsv",
    sep="\t",
    header=True,
    index=False,
)
order_merged

Unnamed: 0,order,AKYH767-A,Abditibacteriales,Absconditabacterales,Acidimicrobiales,Actinomycetales,Anaerolineales,Armatimonadales,BD1-5,Bacteroidales,...,Verrucomicrobiales,Vicinamibacterales,WYAZ01,unclassified_,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,1.0,1.0,0.0,6.0,0.0,1.0,1.0,1.0,0.0,...,5.0,1.0,1.0,0.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,3.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,...,3.0,0.0,0.0,0.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,FD4,5,F,2,Fondo Toce,deep


# Family

In [21]:
# Merge the per-sample family counts, export them as TSV, then display the table.
family_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="family")
family_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/family.tsv",
    sep="\t",
    header=True,
    index=False,
)
family_merged

Unnamed: 0,family,0-14-0-80-60-11,2013-40CM-41-45,Abditibacteriaceae,Absconditicoccaceae,Aestuariivirgaceae,Akkermansiaceae,Anaeromyxobacteraceae,Armatimonadaceae,BA12,...,Verrucomicrobiaceae,WLNW01,WYAZ01,unclassified_,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,0.0,1.0,1.0,0.0,3.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,1.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,FD4,5,F,2,Fondo Toce,deep


# Genus

In [22]:
# Merge the per-sample genus counts, export them as TSV, then display the table.
genus_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="genus")
genus_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/genus.tsv",
    sep="\t",
    header=True,
    index=False,
)
genus_merged

Unnamed: 0,genus,Aestuariivirga,Albidovulum,Aquisediminimonas,Archangium_A,Armatimonas,BA12,BJ22,Bog-950,CADEED01,...,WLNW01,ZC4RG19,ZC4RG30,unclassified_,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,2.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,18.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,12.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,10.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,FD4,5,F,2,Fondo Toce,deep


# Species

In [23]:
# Merge the per-sample species counts, export them as TSV, then display the table.
species_merged = merge_results_by_taxa(DF_TAXA_RANKS, taxa_level="species")
species_merged.to_csv(
    "./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/species.tsv",
    sep="\t",
    header=True,
    index=False,
)
species_merged

Unnamed: 0,species,CSP1-5 sp001443495,Nitrospira_F sp001464735,Nitrospira_F sp002083565,Sphingorhabdus_B lacus,UBA3362 sp013140975,unclassified_,sample,sample_ids,macro_sample,macro_sample_ids,macro_sample_location,coast_deep_shapes
1,CC4,0.0,0.0,1.0,1.0,1.0,94.0,CC4,0,C,0,Suna Canottieri,coast
2,CD4,0.0,1.0,0.0,0.0,1.0,19.0,CD4,1,C,0,Suna Canottieri,deep
3,MC4,0.0,0.0,0.0,0.0,0.0,16.0,MC4,2,M,1,Teatro Maggiore,coast
4,MD4,0.0,0.0,0.0,0.0,0.0,31.0,MD4,3,M,1,Teatro Maggiore,deep
5,FC4,1.0,0.0,0.0,0.0,0.0,74.0,FC4,4,F,2,Fondo Toce,coast
6,FD4,0.0,0.0,0.0,0.0,0.0,17.0,FD4,5,F,2,Fondo Toce,deep
