# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Classfication
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Loading output dataframe from data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), low_memory=False)

# Filter non-specific ec numbers (e.g., 1.1.1,-)
filter = df["ec"].str.contains("-")
df = df[~filter]

# Format for machine learning
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

### Split dataset

In [15]:
# Split the datasets into training and test sets
TARGET = "media_id" # target label
RANDOM_SEED = 47 # seed for random_state

# Test case with MAGs
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train[TARGET] 
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test[TARGET]

# Model

### Pipeline

In [4]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=120)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_epochs=200, #200 recommended for large datasets, higher => stricter clustering
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=30, 
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Cluster

In [6]:
# Cluster
clusterer = KMeans(n_clusters=177, random_state=RANDOM_SEED)
cluster_labels = clusterer.fit_predict(X_train_transformed)
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [7]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
#print(f"PCA explained variance: {explained_variance}") # variance of each component
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Further dimensionality reduction and clustering performance
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(clusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good

Total PCA explained variance: 0.842228889284389
177 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6815912127494812


# Classifier

In [8]:
y2_train = cluster_labels
y2_test = test_clusters

### Random Forest

In [9]:
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)

rfc.fit(X_train_transformed, y2_train)
print(rfc.predict(X_test_transformed))

[ 97 149  81 129  81 149 109 172 149 109  87 149  87  87 109 149 109 149
  81  22  81 169 149]


In [10]:
report = classification_report(
    y_true=y2_test,
    y_pred=rfc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=rfc.predict_proba(X_test_transformed),
    labels=np.unique(y_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("RF ROC AUC score:", roc_score)

              precision    recall  f1-score   support

          22       0.00      0.00      0.00         0
          81       0.75      0.60      0.67         5
          87       0.33      1.00      0.50         1
          97       1.00      1.00      1.00         1
         109       0.50      0.50      0.50         4
         118       0.00      0.00      0.00         1
         129       1.00      1.00      1.00         1
         149       1.00      1.00      1.00         7
         169       1.00      1.00      1.00         1
         172       1.00      0.50      0.67         2

    accuracy                           0.74        23
   macro avg       0.66      0.66      0.63        23
weighted avg       0.79      0.74      0.75        23

RF ROC AUC score: 0.9721467391304348


### Nearest Neighbors

In [11]:
knc= KNeighborsClassifier(n_neighbors=5)
knc.fit(X_train_transformed, cluster_labels)
distances, indices = knc.kneighbors(X_test_transformed)

print(knc.predict(X_test_transformed))

[ 97 149  81 129  81 149 109 172 149 109  87 149  87  87  81 149  81 149
  81  22  81 169 149]


In [12]:
report = classification_report(
    y_true=y_test,
    y_pred=knc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y_test,
    y_score=knc.predict_proba(X_test_transformed),
    labels=np.unique(y_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("KNN ROC AUC score:", roc_score)

              precision    recall  f1-score   support

          22       0.00      0.00      0.00         0
          81       0.83      1.00      0.91         5
          87       0.33      1.00      0.50         1
          97       1.00      1.00      1.00         1
         109       1.00      0.50      0.67         4
         118       0.00      0.00      0.00         1
         129       1.00      1.00      1.00         1
         149       1.00      1.00      1.00         7
         169       1.00      1.00      1.00         1
         172       1.00      0.50      0.67         2

    accuracy                           0.83        23
   macro avg       0.72      0.70      0.67        23
weighted avg       0.89      0.83      0.83        23

KNN ROC AUC score: 0.9565217391304348


### Visualization

Model is 30 dimensions, can re-run model in 2-3 dimensions to visualize (thesis figure)

OR

We can also just visualize 2-3 of the components at a time (notebook)

In [20]:
# Training data
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['KMeans Cluster'] = cluster_labels
train_df['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_df['Media Cluster'] = train_df['Target'].map(target_label_mapping)

#train_df.to_csv(os.path.join(DATA_DIR, "model", "train-output.csv"), index=False)
train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 26,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,Media Cluster
0,8.985844,5.396324,4.927162,4.984013,5.114626,4.062961,5.06572,4.435867,3.611043,2.966594,...,3.85932,5.74562,3.760801,3.956754,4.953303,Training,J346,154,100.0,1.0
1,9.259938,5.684223,5.453715,4.694323,5.187022,3.36623,4.988073,4.725137,5.696577,4.23043,...,6.05152,4.826473,4.961215,5.244405,4.352324,Training,J386,99,100.0,3.0
2,8.71656,5.605093,5.252487,4.286055,5.277818,3.134952,4.758636,4.57386,7.989222,7.56279,...,6.807353,4.886649,6.481108,7.615997,3.623943,Training,J233,47,100225.0,1.0
3,8.83963,5.195937,3.337063,5.668203,5.210046,2.811143,5.313169,4.431442,6.882724,7.866201,...,4.679963,5.116502,4.88445,4.035846,5.098993,Training,J22,165,1002526.0,0.0
4,9.050777,5.699296,6.232972,4.834792,5.275085,3.01541,4.935047,5.283919,6.356308,5.161341,...,7.114043,5.015615,6.865211,8.742841,3.690391,Training,1a,52,1004166.0,1.0


In [21]:
# Test data
test_df = pd.DataFrame(X_test_transformed, columns=[f"Component {i+1}" for i in range(X_test_transformed.shape[1])])
test_df['Set'] = 'Test'
test_df['Target'] = list(df_test['media_id'])
test_df['KMeans Cluster'] = test_clusters
test_df['taxon_id'] = list(df_test['taxon_id'])

# Add classifier labels 
test_df["KMN Classify"] = knc.predict(X_test_transformed)
test_df["RF Classify"] = rfc.predict(X_test_transformed)

#test_df.to_csv(os.path.join(DATA_DIR, "model", "test-output.csv"), index=False)
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,KMN Classify,RF Classify
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,5.859639,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97
1,8.506029,5.557426,5.424253,4.205349,5.199097,3.010877,4.844694,4.670259,7.754748,7.483955,...,5.034447,6.582025,7.497613,3.524251,Test,unknown,149,KBase_derived_Bin.002.fastaBA_F_extracted_bins...,149,149
2,8.858207,5.362713,5.77571,4.733593,5.155171,2.959213,4.80786,5.590268,6.061358,5.76166,...,5.432663,6.550792,7.767291,3.503449,Test,unknown,172,KBase_derived_Bin.004.fastaBA_F_extracted_bins...,81,81
3,9.122627,5.346426,5.843764,4.398993,5.217549,2.972898,4.897097,5.433899,5.875397,5.552927,...,5.362934,6.836047,7.884975,2.873525,Test,unknown,129,KBase_derived_Bin.005.fastaBA_F_extracted_bins...,129,129
4,8.824898,5.370995,5.691679,4.623533,5.157261,2.952721,4.85017,5.524247,6.015277,5.569561,...,5.317761,6.428502,7.576149,3.429394,Test,unknown,81,KBase_derived_Bin.006.fastaBA_F_extracted_bins...,81,81


In [24]:
# Plot the training points
fig = px.scatter(
    data_frame = train_df,
    x = "Component 1",
    y = "Component 2",
    color = "KMeans Cluster",
    hover_data = ["taxon_id", "Target", "Media Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_df["Component 1"],  # x coordinates
        y=test_df["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["Target"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()

### ID Nearest Neighbors

In [48]:
# Add nearest neighbor indices to test dataframe
knn_df = test_df.copy()
knn_df = knn_df.rename(columns={"KMeans Cluster": "Test Cluster"})
neighbors = pd.DataFrame(indices) # Indices of nearest neighbors from the training set
knn_df['indices'] = neighbors[neighbors.columns[0:]].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)

knn_df["indices"] = knn_df["indices"].str.split(';')
knn_df = knn_df.explode("indices")

print(len(knn_df))
knn_df.head()

115


Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 28,Component 29,Component 30,Set,Target,Test Cluster,taxon_id,KMN Classify,RF Classify,indices
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,2967
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,4301
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,214
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,2397
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.70797,6.882111,6.487458,...,6.945245,7.635668,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,1062


In [49]:
# Add indices as column for the training dataframe
train_index = train_df.copy()
train_index = train_index.rename(columns={"KMeans Cluster": "Train Cluster"})
train_index["indices"] = df_train.index
train_index["indices"] = train_index["indices"].astype(str)

train_index.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,Train Cluster,taxon_id,Media Cluster,indices
0,8.985844,5.396324,4.927162,4.984013,5.114626,4.062961,5.06572,4.435867,3.611043,2.966594,...,5.74562,3.760801,3.956754,4.953303,Training,J346,154,100.0,1.0,0
1,9.259938,5.684223,5.453715,4.694323,5.187022,3.36623,4.988073,4.725137,5.696577,4.23043,...,4.826473,4.961215,5.244405,4.352324,Training,J386,99,100.0,3.0,1
2,8.71656,5.605093,5.252487,4.286055,5.277818,3.134952,4.758636,4.57386,7.989222,7.56279,...,4.886649,6.481108,7.615997,3.623943,Training,J233,47,100225.0,1.0,2
3,8.83963,5.195937,3.337063,5.668203,5.210046,2.811143,5.313169,4.431442,6.882724,7.866201,...,5.116502,4.88445,4.035846,5.098993,Training,J22,165,1002526.0,0.0,3
4,9.050777,5.699296,6.232972,4.834792,5.275085,3.01541,4.935047,5.283919,6.356308,5.161341,...,5.015615,6.865211,8.742841,3.690391,Training,1a,52,1004166.0,1.0,4


In [50]:
# Merge our dataframes to provide labels
left = knn_df.copy()
right = train_index[["indices", "Target", "taxon_id"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id", "Cluster": "neighbor_cluster"})
merged = pd.merge(left, right, on="indices", how="left")

#merged.to_csv(os.path.join(DATA_DIR, "ml-model-output.csv"), index=False)
merged

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 30,Set,Target,Test Cluster,taxon_id,KMN Classify,RF Classify,indices,neighbor_media_id,neighbor_taxon_id
0,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.707970,6.882111,6.487458,...,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,2967,J74,1922.0
1,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.707970,6.882111,6.487458,...,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,4301,J699,2734.0
2,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.707970,6.882111,6.487458,...,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,214,J346,1095776.0
3,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.707970,6.882111,6.487458,...,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,2397,J346,180542.0
4,7.700302,5.244044,5.549472,3.681372,4.385915,2.525389,4.484209,4.707970,6.882111,6.487458,...,2.857181,Test,unknown,97,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,97,97,1062,J346,1393122.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,8.427994,5.461604,5.494020,4.238135,5.133826,2.868487,4.800919,4.647846,7.621198,7.351358,...,3.382763,Test,unknown,149,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,149,149,537,J280,1226327.0
111,8.427994,5.461604,5.494020,4.238135,5.133826,2.868487,4.800919,4.647846,7.621198,7.351358,...,3.382763,Test,unknown,149,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,149,149,3260,J350,2025510.0
112,8.427994,5.461604,5.494020,4.238135,5.133826,2.868487,4.800919,4.647846,7.621198,7.351358,...,3.382763,Test,unknown,149,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,149,149,2940,J280,191610.0
113,8.427994,5.461604,5.494020,4.238135,5.133826,2.868487,4.800919,4.647846,7.621198,7.351358,...,3.382763,Test,unknown,149,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,149,149,679,J206,127891.0


# Two-dimensions