# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
import hdbscan
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
df = pd.read_csv(os.path.join(DATA_DIR, "ml_input.csv"), low_memory=False)

filter = df["ec"].str.contains("-")
df = df[~filter]

df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

df.head()

ec,taxon_id,media_id,1.1.1.1,1.1.1.100,1.1.1.102,1.1.1.103,1.1.1.107,1.1.1.108,1.1.1.11,1.1.1.110,...,7.6.2.12,7.6.2.13,7.6.2.14,7.6.2.15,7.6.2.16,7.6.2.2,7.6.2.5,7.6.2.7,7.6.2.8,7.6.2.9
0,1002526,J22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1004166,1a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1004261,J181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1004261,J455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1005925,J118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

In [4]:
# Split the datasets into training and test sets (can use train_test_split on the database set to test model validity as well)
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train["taxon_id"] # target variable
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test["taxon_id"] # target variable

# Pipeline

### PCA-UMAP-hdbscan

In [5]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=50)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_components=2,
        n_epochs=200, #200, 500, 5000
        random_state=42,
        n_jobs=1,
        n_neighbors=30, # as n_neighbors increases, large-scale structure is better revealed
        min_dist=0.35 # larger numbers reveal larger overarching structures, whereas lower shows clusters better
        )) 
])

# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

# Apply HDBSCAN clustering on the transformed training data
clusterer = hdbscan.HDBSCAN(
    min_samples=5, 
    min_cluster_size=10, 
    #gen_min_span_tree=True, 
    prediction_data=True # cluster_selection_method="leaf", for more fine-grained clustering
)
cluster_labels = clusterer.fit_predict(X_train_transformed)

In [6]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Clustering performance (UMAP and hdbscan)
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(f"Silhouette Score on training data: {silhouette_avg}")
# Silhouette score <=0.25 implies poor clustering, 0.25<x<=0.50 is fair, <50 is good

# Total number of hdbscan clusters on training set
print(clusterer.labels_.max(), "training clusters")

Total PCA explained variance: 0.6866034693233879
Silhouette Score on training data: 0.5650906562805176
211 training clusters


### Visualization

In [7]:
# Create DataFrames for plotting
train_df = pd.DataFrame(X_train_transformed, columns=['Component 1', 'Component 2'])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['Cluster'] = cluster_labels # only showing training clusters for now, mapping test over these
train_df['Media'] = df_train["media_id"]

test_df = pd.DataFrame(X_test_transformed, columns=['Component 1', 'Component 2'])
test_df['Set'] = 'Test'
target = list(y_test)
test_df['Target'] = target

test_df.head()

Unnamed: 0,Component 1,Component 2,Set,Target
0,3.894849,12.926424,Test,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,2.804447,14.827258,Test,KBase_derived_Bin.002.fastaBA_F_extracted_bins...
2,3.019445,13.854838,Test,KBase_derived_Bin.004.fastaBA_F_extracted_bins...
3,4.0157,12.670037,Test,KBase_derived_Bin.005.fastaBA_F_extracted_bins...
4,4.920488,12.279667,Test,KBase_derived_Bin.006.fastaBA_F_extracted_bins...


In [8]:
# Convert to numpy array for hdbscan and predict clustering
test_points = test_df[["Component 1", "Component 2"]].to_numpy()
test_labels, strengths = hdbscan.approximate_predict(clusterer, test_points)

# Optional constraints
filtered = train_df[train_df['Cluster'] >= 0]
#test_labels = clusterer.fit_predict(X_test_transformed)

In [12]:
# Plot the training points
fig2 = px.scatter(
    data_frame = filtered,
    x = "Component 1",
    y = "Component 2",
    color = "Cluster",
    hover_data = ["Target", "Media"],
    opacity = 0.03,
    template="plotly_white",
    title="hdbscan approximate_predict() mapping"
) 

# Add the test points as a scatter trace
fig2.add_trace(
    go.Scatter(
        x=test_points[:, 0],  # x coordinates
        y=test_points[:, 1],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["Target"]  # hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]
fig2.update_layout(
    sliders=sliders,
    title="hdbscan approximate_predict() mapping"
)

fig2.show()

# Notes

Test cluster prediction results appear similar at first glance between the two methods

Automate way of finding the nearest neighbors for test_points (and associated taxon/media info)