# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Classfication
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Loading output dataframe from data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), low_memory=False)

# Filter non-specific ec numbers (e.g., 1.1.1,-)
filter = df["ec"].str.contains("-")
df = df[~filter]

# Format for machine learning
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

### Split dataset

In [3]:
# Split the datasets into training and test sets
TARGET = "media_id" # target label
RANDOM_SEED = 47 # seed for random_state

# Test case with MAGs
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train[TARGET] 
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test[TARGET]

# Model

### Pipeline

In [4]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=120)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_epochs=200, #200 recommended for large datasets, higher => stricter clustering
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=30, 
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Cluster

In [6]:
# Cluster
clusterer = KMeans(n_clusters=181, random_state=RANDOM_SEED) #177
cluster_labels = clusterer.fit_predict(X_train_transformed)
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [7]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
#print(f"PCA explained variance: {explained_variance}") # variance of each component
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Further dimensionality reduction and clustering performance
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(clusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good
#84, 177, 68

Total PCA explained variance: 0.8422592524038068
181 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6751219034194946


# Classifier

In [8]:
y2_train = cluster_labels
y2_test = test_clusters

### Random Forest

In [9]:
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)

rfc.fit(X_train_transformed, y2_train)
print(rfc.predict(X_test_transformed))

[113  71 152 121  92  71  92 172  71  92  71  71   4   4  92  71  92  71
  92  87  92 120  71]


In [10]:
report = classification_report(
    y_true=y2_test,
    y_pred=rfc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=rfc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("RF ROC AUC score:", roc_score)

              precision    recall  f1-score   support

           4       1.00      1.00      1.00         2
          71       1.00      1.00      1.00         8
          87       1.00      1.00      1.00         1
          92       0.86      0.86      0.86         7
         113       0.00      0.00      0.00         0
         120       1.00      1.00      1.00         1
         121       1.00      1.00      1.00         1
         152       0.00      0.00      0.00         1
         172       1.00      0.50      0.67         2

    accuracy                           0.87        23
   macro avg       0.76      0.71      0.72        23
weighted avg       0.91      0.87      0.88        23

RF ROC AUC score: 0.988243123336291


### Nearest Neighbors

In [11]:
knc= KNeighborsClassifier(n_neighbors=10)
knc.fit(X_train_transformed, cluster_labels)
distances, indices = knc.kneighbors(X_test_transformed)

print(knc.predict(X_test_transformed))

[ 92  71 172 121  92  71  92 172  71  92  71  71   4 141  92  71  92  71
  92  87 152 120  71]


In [12]:
report = classification_report(
    y_true=y2_test,
    y_pred=knc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=knc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("KNN ROC AUC score:", roc_score)

              precision    recall  f1-score   support

           4       1.00      0.50      0.67         2
          71       1.00      1.00      1.00         8
          87       1.00      1.00      1.00         1
          92       1.00      1.00      1.00         7
         120       1.00      1.00      1.00         1
         121       1.00      1.00      1.00         1
         141       0.00      0.00      0.00         0
         152       1.00      1.00      1.00         1
         172       1.00      1.00      1.00         2

    accuracy                           0.96        23
   macro avg       0.89      0.83      0.85        23
weighted avg       1.00      0.96      0.97        23

KNN ROC AUC score: 1.0


# Outputs

### Train, clustering labels

In [13]:
# Training data
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['KMeans Cluster'] = cluster_labels
train_df['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_df['Media Cluster'] = train_df['Target'].map(target_label_mapping)

#train_df.to_csv(os.path.join(DATA_DIR, "model", "train-output.csv"), index=False)
train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 26,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,Media Cluster
0,9.027959,5.426506,5.157255,5.02194,4.524093,3.421949,6.394276,5.207511,4.979474,4.193174,...,6.732959,5.856911,4.497073,5.060361,4.723148,Training,J346,154,100.0,1.0
1,9.446127,5.664748,5.035409,5.375591,4.214466,2.810515,4.689091,4.274476,3.751784,5.147243,...,4.482231,5.401375,5.734932,5.247356,5.099955,Training,J386,140,100.0,3.0
2,8.974957,5.659627,5.003708,4.691166,4.667992,2.894567,3.300177,2.47141,6.203967,5.672856,...,2.277288,3.982201,5.541786,5.923797,7.999502,Training,J233,150,100225.0,1.0
3,9.211626,5.348445,5.902422,4.90643,6.35361,2.988715,4.788762,3.662444,5.830168,5.116533,...,6.152011,4.932049,4.74909,2.373029,3.804562,Training,J22,109,1002526.0,0.0
4,8.318399,5.891038,4.665109,5.149699,4.37878,2.585904,4.386128,4.057523,4.404146,5.120682,...,1.691148,3.693504,3.821874,4.992656,4.654844,Training,1a,51,1004166.0,1.0


### Test, classifier labels

In [14]:
# Test data
test_df = pd.DataFrame(X_test_transformed, columns=[f"Component {i+1}" for i in range(X_test_transformed.shape[1])])
test_df['Set'] = 'Test'
test_df['Target'] = list(df_test['media_id'])
test_df['KMeans Cluster'] = test_clusters
test_df['taxon_id'] = list(df_test['taxon_id'])

# Add classifier labels 
test_df["KNN Classify"] = knc.predict(X_test_transformed)
test_df["RF Classify"] = rfc.predict(X_test_transformed)

#test_df.to_csv(os.path.join(DATA_DIR, "model", "test-output.csv"), index=False)
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,KMN Classify,RF Classify
0,9.027029,5.836047,5.036773,4.963749,4.595442,2.52441,4.57854,3.563184,4.6314,5.481608,...,3.984802,4.859235,5.536463,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113
1,8.844609,5.664591,5.048285,4.745102,4.740118,2.894334,3.385616,2.349714,6.179421,5.700723,...,4.172782,5.571566,5.900666,7.845344,Test,unknown,71,KBase_derived_Bin.002.fastaBA_F_extracted_bins...,71,71
2,8.926155,6.078753,4.759858,5.24028,4.557077,2.392918,4.057384,4.404721,4.703854,4.981048,...,3.322117,4.750574,5.280197,6.204131,Test,unknown,172,KBase_derived_Bin.004.fastaBA_F_extracted_bins...,172,152
3,9.36294,5.828105,5.038566,4.838264,4.636667,2.581568,4.511452,3.787064,4.917755,5.679792,...,3.594023,4.65411,5.469277,6.655752,Test,unknown,121,KBase_derived_Bin.005.fastaBA_F_extracted_bins...,121,121
4,9.013338,5.865384,4.973617,4.992002,4.728854,2.513944,4.223456,4.026782,4.908994,5.326186,...,3.722124,4.995512,5.413904,6.48635,Test,unknown,92,KBase_derived_Bin.006.fastaBA_F_extracted_bins...,92,92


### ID nearest neighbors

In [15]:
# Add nearest neighbor indices to the test dataframe
knn_df = test_df.copy()
knn_df = knn_df.rename(columns={"KMeans Cluster": "Test Cluster"})
neighbors = pd.DataFrame(indices) # Indices of nearest neighbors from the training set
knn_df['indices'] = neighbors[neighbors.columns[0:]].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
knn_df["indices"] = knn_df["indices"].str.split(';')
knn_df = knn_df.explode("indices")

# Add original indices as a column for the train dataframe
train_index = train_df.copy()
train_index = train_index.rename(columns={"KMeans Cluster": "Train Cluster"})
train_index["indices"] = df_train.index
train_index["indices"] = train_index["indices"].astype(str)

# Merge to label nearest neighbors
left = knn_df.copy()
right = train_index[["indices", "Target", "taxon_id"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id", "Cluster": "neighbor_cluster"})
merged = pd.merge(left, right, on="indices", how="left")

#merged.to_csv(os.path.join(DATA_DIR, "ml-model-output.csv"), index=False)
merged

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 30,Set,Target,Test Cluster,taxon_id,KMN Classify,RF Classify,indices,neighbor_media_id,neighbor_taxon_id
0,9.027029,5.836047,5.036773,4.963749,4.595442,2.524410,4.57854,3.563184,4.631400,5.481608,...,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113,6617,J739,515264.0
1,9.027029,5.836047,5.036773,4.963749,4.595442,2.524410,4.57854,3.563184,4.631400,5.481608,...,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113,798,J142,1306.0
2,9.027029,5.836047,5.036773,4.963749,4.595442,2.524410,4.57854,3.563184,4.631400,5.481608,...,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113,339,J22,113288.0
3,9.027029,5.836047,5.036773,4.963749,4.595442,2.524410,4.57854,3.563184,4.631400,5.481608,...,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113,4165,J436,265470.0
4,9.027029,5.836047,5.036773,4.963749,4.595442,2.524410,4.57854,3.563184,4.631400,5.481608,...,6.288519,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,92,113,6580,J475,506594.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,8.833779,5.612067,5.050017,4.781399,4.769324,2.857274,3.42761,2.391053,6.067067,5.665295,...,7.770132,Test,unknown,71,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,71,71,6387,J242,47482.0
226,8.833779,5.612067,5.050017,4.781399,4.769324,2.857274,3.42761,2.391053,6.067067,5.665295,...,7.770132,Test,unknown,71,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,71,71,8377,J151,943940.0
227,8.833779,5.612067,5.050017,4.781399,4.769324,2.857274,3.42761,2.391053,6.067067,5.665295,...,7.770132,Test,unknown,71,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,71,71,5630,J231,381.0
228,8.833779,5.612067,5.050017,4.781399,4.769324,2.857274,3.42761,2.391053,6.067067,5.665295,...,7.770132,Test,unknown,71,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,71,71,6159,J198,43776.0


In [16]:
#TODO: extract more information from nearest neighbors and use to predict components/concentrations

# Visualization

### n dimensions

In [17]:
# Plot the training points
fig = px.scatter(
    data_frame = train_df,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_df["Component 1"],  # x coordinates
        y=test_df["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["Target"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()

### 2 dimensions

#### 2D Pipeline

In [18]:
# copy of model pipeline reduced to two dimensions, ***LEADS TO DISCONNECT IN DIMENSIONALITY REDUCTION STEPS***
visualization = Pipeline([
    ('scaler', StandardScaler()),   
    ('pca', PCA(n_components=120)),   
    ('umap', umap.UMAP(              
        metric="euclidean",
        n_epochs=200, 
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=2, # reduced for visualization
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

# Fit and transform the pipeline on the training and test data
X_train_embedding = visualization.fit_transform(X_train) # fit pipeline and transform training data
X_test_embedding = visualization.transform(X_test) # transform test data

# Cluster
vclusterer = KMeans(n_clusters=177, random_state=RANDOM_SEED)
vcluster_labels = vclusterer.fit_predict(X_train_embedding)
vtest_clusters = vclusterer.predict(X_test_embedding)

# Metrics to compare to 30 dimensions
vpca = visualization.named_steps['pca']
vexplained_variance = vpca.explained_variance_ratio_
vsilhouette_avg = silhouette_score(X_train_embedding, vcluster_labels)
print(f"Total PCA explained variance: {vexplained_variance.sum()}")
print(vclusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {vsilhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good


Graph is not fully connected, spectral embedding may not work as expected.



Total PCA explained variance: 0.8421924085483412
177 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6528218984603882


#### Visualization

In [19]:
# Training data
train_2d = pd.DataFrame(X_train_embedding, columns=[f"Component {i+1}" for i in range(X_train_embedding.shape[1])])
train_2d['Set'] = 'Training'
train_2d['Target'] = y_train
train_2d['KMeans Cluster'] = cluster_labels
train_2d['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_2d['Media Cluster'] = train_2d['Target'].map(target_label_mapping)

# Test data
test_2d = pd.DataFrame(X_test_embedding, columns=[f"Component {i+1}" for i in range(X_test_embedding.shape[1])])
test_2d['Set'] = 'Test'
test_2d['Target'] = list(df_test['media_id'])
test_2d['KMeans Cluster'] = test_clusters
test_2d['taxon_id'] = list(df_test['taxon_id'])

In [20]:
test_2d.head()

Unnamed: 0,Component 1,Component 2,Set,Target,KMeans Cluster,taxon_id
0,-4.072868,6.97809,Test,unknown,92,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,1.213786,9.399409,Test,unknown,71,KBase_derived_Bin.002.fastaBA_F_extracted_bins...
2,-0.273757,7.07022,Test,unknown,172,KBase_derived_Bin.004.fastaBA_F_extracted_bins...
3,-4.677022,7.700779,Test,unknown,121,KBase_derived_Bin.005.fastaBA_F_extracted_bins...
4,-1.474609,6.674373,Test,unknown,92,KBase_derived_Bin.006.fastaBA_F_extracted_bins...


In [22]:
# Plot the training points
fig = px.scatter(
    data_frame = train_2d,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_2d["Component 1"],  # x coordinates
        y=test_2d["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_2d["taxon_id"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()