In [1]:
import pandas as pd
import os

# Root folder that every input/output path in this notebook is joined onto.
# NOTE(review): the leading "~" is passed through os.path.join unexpanded, so
# the paths presumably rely on pandas expanding it when reading/writing —
# confirm, or wrap in os.path.expanduser if other APIs ever consume DATA_DIR.
DATA_DIR = "~/Desktop/code/data/"

In [2]:
# Both FENIX21 inputs live in the same folder, so build that folder path once.
_fenix21_dir = os.path.join(DATA_DIR, 'MAGs', 'FENIX21')

# Bin table and environmental data for the FENIX21 dataset.
f_bins = pd.read_csv(os.path.join(_fenix21_dir, 'FENIX21-all.csv'))
f_env = pd.read_csv(os.path.join(_fenix21_dir, 'envdata.csv'))

# The fluid/sediment metal tables are currently not loaded:
#f_metals = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'fluid_metals.csv'))
#s_metals = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'sediment_metals.csv'))

## KNearestNeighbors taxonomy

In [3]:
# Model output (clustering + classification): keep only the columns used below
# and rename so that the *neighbor's* id becomes the join key "taxon_id", while
# the original "taxon_id" column (the bin name) is exposed as "bin".
model_columns = [
    "taxon_id",
    "Test Cluster",
    "KNN Classify",
    "RF Classify",
    "neighbor_media_id",
    "Media Cluster",
    "neighbor_taxon_id",
]
model_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'model', 'ml-model-output.csv'))[model_columns]
    .rename(
        columns={
            "taxon_id": "bin",
            "neighbor_taxon_id": "taxon_id",
            "Media Cluster": "media_cluster",
        }
    )
)

# BacDive table, from which the taxonomy ranks are taken.
bacdive_df = pd.read_csv(os.path.join(DATA_DIR, "bacdive", "bacdive-all.csv"), low_memory=False)
taxonomy_columns = ["taxon_id", "domain", "phylum", "class", "order", "family", "genus", "species"]

# Boolean mask over the BacDive rows whose taxon_id occurs among the model's neighbors.
neighbors_list = model_df["taxon_id"].to_list()
neighbors = bacdive_df["taxon_id"].isin(neighbors_list)

# Taxonomy columns for those rows only, with exact duplicate rows removed.
bd_taxonomy = bacdive_df.loc[neighbors, taxonomy_columns].drop_duplicates()

# Left join keeps every model row; a taxon_id carrying several distinct
# taxonomy rows yields one output row per taxonomy row.
model_neighbors = model_df.merge(bd_taxonomy, on="taxon_id", how="left")

model_neighbors.head()

Unnamed: 0,bin,Test Cluster,KNN Classify,RF Classify,neighbor_media_id,media_cluster,taxon_id,domain,phylum,class,order,family,genus,species
0,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J475,9.0,427754.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
1,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J475,9.0,664640.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
2,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,427754.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
3,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,53358.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
4,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,664640.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.


## Bin metadata

In [4]:
# Re-read the model output from disk and apply the column selection/renaming:
# "taxon_id" -> "bin", "neighbor_taxon_id" -> "taxon_id", "Media Cluster" -> "media_cluster".
model_df = (
    pd.read_csv(os.path.join(DATA_DIR, 'model', 'ml-model-output.csv'))[
        ["taxon_id", "Test Cluster", "KNN Classify", "RF Classify", "neighbor_media_id", "Media Cluster", "neighbor_taxon_id"]
    ]
    .rename(columns={"taxon_id": "bin", "neighbor_taxon_id": "taxon_id", "Media Cluster": "media_cluster"})
)

# Environmental data, with every column cast to str.
# NOTE(review): astype(str) also stringifies the numeric measurements and,
# depending on the pandas version, may turn missing values into literal text —
# confirm this is intended before doing numeric work on these columns.
f_env = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'envdata.csv')).astype(str)

# Site identifier/name plus the measurement columns carried forward.
env_columns = ["SiteID", "site_name", "waterType", "temp", "ph", "spc", "sal", "alk_tot"]
env_subset = f_env[env_columns]
env_subset

Unnamed: 0,SiteID,site_name,waterType,temp,ph,spc,sal,alk_tot
0,AS,Acqua sauna lido scoglio,,65.0,5.96,,36.0,
1,BA,Bagnone,,64.0,7.0,,,
2,CA,Acqua Cantani,Ca-HCO3,17.48,6.34,2.757,0.14,1200.0
3,CF,Terme Caracciolo Forte,Ca-HCO3,53.0,5.36,3.067,0.156,1400.0
4,CG,Capasso geyser,Na-Cl,47.2,6.51,9.78,0.528,1400.0
5,CP,Capasso parcheggio,,46.9,,9.932,,
6,FE,Sorgente Ferrata,Ca-HCO3,15.22,6.49,1.946,0.1,800.0
7,GA,Grotta dell'acqua,Na-Cl,32.6,6.41,,10.0,800.0
8,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
9,ML,Madonna dei Lattani,Ca-HCO3,15.0,8.6,0.22,0.011,200.0


In [5]:
# Merge the site-level environmental metadata onto the model output.
#
# The inputs are deliberately NOT aliased (`df1 = model_df`, `df2 = env_subset`):
# an alias makes every assignment below mutate the frames produced and displayed
# by earlier cells, and writing into `env_subset` (a column selection of `f_env`)
# is the pattern that raises pandas' SettingWithCopyWarning. `.assign` returns
# new frames instead, so `model_df` and `env_subset` are left untouched.

# Capture the two-letter site identifier that sits between "fasta" and
# "_F_"/"_S_" in the bin name. expand=False yields a Series (one capture group)
# rather than a one-column DataFrame; bins that do not match become NaN.
site_ids = model_df['bin'].str.extract(r'fasta([A-Z]{2})_(?:F|S)_extracted_bins', expand=False)

# Normalise identifiers on both sides (trim whitespace, upper-case) so they match.
df1 = model_df.assign(SiteID=site_ids.str.strip().str.upper())
df2 = env_subset.assign(SiteID=env_subset['SiteID'].str.strip().str.upper())

# Left join: every model row is kept; rows whose SiteID has no environmental
# record get missing values in the metadata columns.
benv = pd.merge(left=df1, right=df2, on="SiteID", how="left")

# Persist the merged table next to the model output.
benv.to_csv(os.path.join(DATA_DIR, "model", "output-metadata.csv"), index=False)
benv


Unnamed: 0,bin,Test Cluster,KNN Classify,RF Classify,neighbor_media_id,media_cluster,taxon_id,SiteID,site_name,waterType,temp,ph,spc,sal,alk_tot
0,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J475,9.0,427754.0,BA,Bagnone,,64.0,7.0,,,
1,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J475,9.0,664640.0,BA,Bagnone,,64.0,7.0,,,
2,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,427754.0,BA,Bagnone,,64.0,7.0,,,
3,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,53358.0,BA,Bagnone,,64.0,7.0,,,
4,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,171,64,64,J26,0.0,664640.0,BA,Bagnone,,64.0,7.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,130,1203a,9.0,646.0,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
2696,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,130,J84,9.0,2981779.0,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
2697,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,130,1203a,9.0,29370.0,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
2698,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,130,J14,8.0,29363.0,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
