In [1]:
import pandas as pd
import os

# Root directory for every input/output file used in this notebook.
# "~" is expanded once here so the path also works with APIs that do not
# expand it themselves (e.g. open(), os.path.exists()).
DATA_DIR = os.path.expanduser("~/Desktop/code/data/")

In [2]:
# All FENIX21 inputs live in the same folder; build that path once.
fenix_dir = os.path.join(DATA_DIR, 'MAGs', 'FENIX21')

f_bins = pd.read_csv(os.path.join(fenix_dir, 'FENIX21-all.csv'))
f_env = pd.read_csv(os.path.join(fenix_dir, 'envdata.csv'))

# Metal tables are currently not loaded:
#f_metals = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'fluid_metals.csv'))
#s_metals = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'sediment_metals.csv'))

## Taxonomy of training set

In [3]:
# Training output with clustering and classification
train_df = pd.read_csv(os.path.join(DATA_DIR, 'model', 'train-output.csv'))

# BacDive taxonomy information
bacdive_df = pd.read_csv(os.path.join(DATA_DIR, "bacdive", "bacdive-all.csv"), low_memory=False)
taxonomy_columns = ["taxon_id", "domain", "phylum", "class", "order", "family", "genus", "species"]

# Taxon ids present in the training set. Missing ids are dropped on purpose:
# both Series.isin and pd.merge treat NaN keys as equal to each other, so a NaN
# taxon_id on both sides would attach every id-less BacDive row to every
# id-less training row.
train_taxon_ids = train_df["taxon_id"].dropna().unique()

# Keep only the taxonomy rows for those ids, one row per distinct classification.
bd_taxonomy = (
    bacdive_df.loc[bacdive_df["taxon_id"].isin(train_taxon_ids), taxonomy_columns]
    .drop_duplicates()
)

# Left merge keeps every training row. Some taxon_id's carry several distinct
# classifications in BacDive, so a training row can appear more than once here.
train_taxa = pd.merge(left=train_df, right=bd_taxonomy, on="taxon_id", how="left")

train_taxa.to_csv(os.path.join(DATA_DIR, "model", "train-taxa.csv"), index=False)
train_taxa.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,KMeans Cluster,taxon_id,Media Cluster,domain,phylum,class,order,family,genus,species
0,8.840182,5.432602,5.119657,5.64779,4.938649,4.791024,5.036232,6.41101,6.226582,6.500916,...,156,100.0,9.0,Bacteria,Proteobacteria,Alphaproteobacteria,Caulobacterales,Caulobacteraceae,Phenylobacterium,Phenylobacterium panacis
1,8.840182,5.432602,5.119657,5.64779,4.938649,4.791024,5.036232,6.41101,6.226582,6.500916,...,156,100.0,9.0,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Mycobacteriaceae,Mycobacterium,Mycobacterium triplex
2,8.840182,5.432602,5.119657,5.64779,4.938649,4.791024,5.036232,6.41101,6.226582,6.500916,...,156,100.0,9.0,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Mycobacteriaceae,Mycobacterium,Mycobacterium branderi
3,8.687406,5.89516,5.106698,4.957048,4.595566,4.786066,4.946724,5.214825,4.453093,3.743618,...,45,100.0,9.0,Bacteria,Proteobacteria,Alphaproteobacteria,Caulobacterales,Caulobacteraceae,Phenylobacterium,Phenylobacterium panacis
4,8.687406,5.89516,5.106698,4.957048,4.595566,4.786066,4.946724,5.214825,4.453093,3.743618,...,45,100.0,9.0,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Mycobacteriaceae,Mycobacterium,Mycobacterium triplex


## Taxonomy of KNearestNeighbors

In [4]:
# Model output with clustering and classification
model_df = pd.read_csv(os.path.join(DATA_DIR, 'model', 'ml-model-output.csv'))
#model_df = model_df[["taxon_id", "Test Cluster", "KNN Classify", "RF Classify", "neighbor_media_id", "Media Cluster", "neighbor_taxon_id"]]
# After this rename, "bin" is the original taxon_id column of the model output
# and "taxon_id" is the neighbor's id, which is what gets joined to BacDive below.
model_df = model_df.rename(columns={"Media Cluster": "media_cluster", "taxon_id": "bin", "neighbor_taxon_id": "taxon_id"})

# BacDive taxonomy information
bacdive_df = pd.read_csv(os.path.join(DATA_DIR, "bacdive", "bacdive-all.csv"), low_memory=False)
taxonomy_columns = ["taxon_id", "domain", "phylum", "class", "order", "family", "genus", "species"]

# Neighbor ids referenced by the model output. Missing ids are dropped on
# purpose: both Series.isin and pd.merge treat NaN keys as equal to each other,
# so a NaN taxon_id on both sides would attach every id-less BacDive row to
# every row without a neighbor id.
neighbor_ids = model_df["taxon_id"].dropna().unique()

# Keep only the taxonomy rows for those ids, one row per distinct classification.
bd_taxonomy = (
    bacdive_df.loc[bacdive_df["taxon_id"].isin(neighbor_ids), taxonomy_columns]
    .drop_duplicates()
)

# Left merge keeps every model row. Some taxon_id's carry several distinct
# classifications in BacDive, so a model row can appear more than once here.
model_neighbors = pd.merge(left=model_df, right=bd_taxonomy, on="taxon_id", how="left")

model_neighbors.to_csv(os.path.join(DATA_DIR, "model", "test-taxa.csv"), index=False)
model_neighbors.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,neighbor_media_id,taxon_id,media_cluster,domain,phylum,class,order,family,genus,species
0,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,J475,427754.0,9.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
1,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,J475,664640.0,9.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
2,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,J26,427754.0,0.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
3,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,J26,53358.0,0.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.
4,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,J26,664640.0,0.0,Bacteria,Actinobacteria,Actinobacteria,Micrococcales,Brevibacteriaceae,Brevibacterium,Brevibacterium sp.


## Bin metadata

In [5]:
# Reload the model output so this section does not depend on earlier cells.
model_df = pd.read_csv(os.path.join(DATA_DIR, 'model', 'ml-model-output.csv'))
model_df = model_df.rename(columns={"Media Cluster": "media_cluster", "taxon_id": "bin", "neighbor_taxon_id": "taxon_id"})

f_env = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'envdata.csv'))
# Only the join key has to be text (it is cleaned with .str methods later).
# Casting the whole frame to str would turn every measurement into text and
# every missing value into the literal string "nan".
f_env = f_env.astype({"SiteID": str})

# .copy() makes env_subset an independent frame, so later column assignments
# do not write into a slice of f_env (SettingWithCopyWarning).
env_subset = f_env[["SiteID", "site_name", "waterType", "temp", "ph", "spc","sal", "alk_tot"]].copy()
env_subset

Unnamed: 0,SiteID,site_name,waterType,temp,ph,spc,sal,alk_tot
0,AS,Acqua sauna lido scoglio,,65.0,5.96,,36.0,
1,BA,Bagnone,,64.0,7.0,,,
2,CA,Acqua Cantani,Ca-HCO3,17.48,6.34,2.757,0.14,1200.0
3,CF,Terme Caracciolo Forte,Ca-HCO3,53.0,5.36,3.067,0.156,1400.0
4,CG,Capasso geyser,Na-Cl,47.2,6.51,9.78,0.528,1400.0
5,CP,Capasso parcheggio,,46.9,,9.932,,
6,FE,Sorgente Ferrata,Ca-HCO3,15.22,6.49,1.946,0.1,800.0
7,GA,Grotta dell'acqua,Na-Cl,32.6,6.41,,10.0,800.0
8,LS,Lido lo scoglio,Na-Cl,47.8,6.12,,21.0,600.0
9,ML,Madonna dei Lattani,Ca-HCO3,15.0,8.6,0.22,0.011,200.0


In [6]:
# Merging our metadata with the model output.
# Work on copies: plain assignment would only alias model_df / env_subset, so
# this cell would silently add/overwrite columns on frames that earlier cells
# already displayed (and trigger SettingWithCopyWarning on the env slice).
df1 = model_df.copy()
df2 = env_subset.copy()

# Using regex to capture the two-letter identifier embedded in the bin name.
# expand=False returns a Series (one capture group) instead of a 1-column frame.
# Bin names that do not match the pattern get NaN and therefore no metadata.
df1['SiteID'] = df1['bin'].str.extract(r'fasta([A-Z]{2})_(?:F|S)_extracted_bins', expand=False)

# Clean up identifiers to ensure they match
df1['SiteID'] = df1['SiteID'].str.strip().str.upper()
df2['SiteID'] = df2['SiteID'].str.strip().str.upper()

# Merge our dataframes (left join: every model row is kept)
benv = pd.merge(left=df1, right=df2, on="SiteID", how="left")

benv.to_csv(os.path.join(DATA_DIR, "model", "output-metadata.csv"), index=False)
benv.head()


Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,taxon_id,media_cluster,SiteID,site_name,waterType,temp,ph,spc,sal,alk_tot
0,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,427754.0,9.0,BA,Bagnone,,64.0,7.0,,,
1,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,664640.0,9.0,BA,Bagnone,,64.0,7.0,,,
2,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,427754.0,0.0,BA,Bagnone,,64.0,7.0,,,
3,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,53358.0,0.0,BA,Bagnone,,64.0,7.0,,,
4,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,664640.0,0.0,BA,Bagnone,,64.0,7.0,,,


## Visualization

In [7]:
# Reload the two tables written above from the shared model directory.
model_dir = os.path.join(DATA_DIR, "model")
train = pd.read_csv(os.path.join(model_dir, "train-taxa.csv"))
test = pd.read_csv(os.path.join(model_dir, "output-metadata.csv"))

In [8]:
# Preview the first five rows of the reloaded test table.
test.head(n=5)

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,taxon_id,media_cluster,SiteID,site_name,waterType,temp,ph,spc,sal,alk_tot
0,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,427754.0,9.0,BA,Bagnone,,64.0,7.0,,,
1,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,664640.0,9.0,BA,Bagnone,,64.0,7.0,,,
2,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,427754.0,0.0,BA,Bagnone,,64.0,7.0,,,
3,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,53358.0,0.0,BA,Bagnone,,64.0,7.0,,,
4,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,664640.0,0.0,BA,Bagnone,,64.0,7.0,,,


In [9]:
import plotly.express as px
import plotly.graph_objects as go

# Opacity slider settings: 11 positions, 0.0 to 0.5 in steps of 0.05.
slider_positions = 11
steps_per_unit = 20
train_opacity = 0.3  # opacity the training points are first drawn with

# Plot the training points
fig = px.scatter(
    data_frame=train,
    x="Component 1",
    y="Component 2",  # Change components to visualize each of the 30 dimensions
    color="Media Cluster",
    hover_data=["taxon_id", "Target", "KMeans Cluster"],
    opacity=train_opacity,
)

# Remember which traces hold training points BEFORE the test trace is added,
# so the slider restyles exactly those and never the test markers.
train_trace_ids = list(range(len(fig.data)))

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test["Component 1"],  # x coordinates
        y=test["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(color="black", size=5, opacity=1.0),
        text=test["taxon_id"],  # marker hover text
        name="Test set",  # otherwise shown as "trace 1"
    )
)

# Configure opacity slider: one "restyle" step per position. A scalar value is
# applied to every trace listed in train_trace_ids.
steps = []
for i in range(slider_positions):
    value = i / steps_per_unit
    steps.append(
        dict(
            method="restyle",
            args=[{"marker.opacity": value}, train_trace_ids],
            label=str(value),
        )
    )

sliders = [dict(
    # Start the handle on the step matching the opacity actually drawn above,
    # so the "Opacity:" readout is correct before the slider is touched.
    active=round(train_opacity * steps_per_unit),
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps,
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans Visualization, Training & Test Sets",
    template="plotly_white",
)

fig.show()

In [10]:
# TRAINING SET ONLY

# Opacity slider settings: 11 positions, 0.0 to 0.5 in steps of 0.05.
slider_positions = 11
steps_per_unit = 20
train_opacity = 0.1  # opacity the training points are first drawn with

fig = px.scatter(
    data_frame=train,
    x="Component 1",
    y="Component 2",  # Change components to visualize each of the 30 dimensions
    color="class",
    hover_data=["taxon_id", "Target", "KMeans Cluster"],
    opacity=train_opacity,
    title="Training Set",
)

# "class" is categorical, so Plotly Express emits one trace per class.
# The slider must restyle all of them, not just trace 0.
all_trace_ids = list(range(len(fig.data)))

# One "restyle" step per slider position; a scalar value is applied to every
# trace listed in all_trace_ids.
steps = []
for i in range(slider_positions):
    value = i / steps_per_unit
    steps.append(
        dict(
            method="restyle",
            args=[{"marker.opacity": value}, all_trace_ids],
            label=str(value),
        )
    )

sliders = [dict(
    # Start the handle on the step matching the opacity actually drawn above.
    active=round(train_opacity * steps_per_unit),
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps,
)]

fig.update_layout(
    sliders=sliders,
    template="plotly_white",
)

fig.show()

In [11]:
# TEST SET ONLY

def _test_scatter(color_by, title):
    """Scatter of the test set on Components 1 and 2, coloured by one column.

    color_by: name of the column in `test` used for marker colour.
    title: figure title.
    Returns the plotly figure (not yet shown).
    """
    return px.scatter(
        data_frame=test,
        x="Component 1",
        y="Component 2",  # Change components to visualize each of the 30 dimensions
        color=color_by,
        hover_data=["taxon_id", "RF Classify"],
        opacity=0.5,
        title=title,
        template="plotly_white",
    )


# Same view three times, coloured by site, temperature and pH.
fig1 = _test_scatter("SiteID", "SiteID")
fig2 = _test_scatter("temp", "Temperature")
fig3 = _test_scatter("ph", "pH")

for figure in (fig1, fig2, fig3):
    figure.show()

#TODO: sum or map across different major ions and trace elements