# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
# Base directory of the data files; the cells below join it with relative
# sub-paths via os.path.join before reading.
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Classification
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Load the output dataframe produced by data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), low_memory=False)

# Drop non-specific EC numbers (e.g., 1.1.1.-).
# The mask has its own name so the builtin `filter` is not shadowed; na=False
# keeps the mask strictly boolean when "ec" has missing values (otherwise `~`
# fails on it), and regex=False matches the dash literally.
is_partial_ec = df["ec"].str.contains("-", regex=False, na=False)
df = df[~is_partial_ec]

# Format for machine learning: one row per (taxon_id, media_id) pair, one column
# per EC number, each cell holding how often that combination occurs.
# The counts column is named explicitly so the pivot below does not depend on
# value_counts() naming its result "count" (older pandas leaves it unnamed).
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index(name="count")
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Split into training and test sets (database set vs. MAGs): rows whose media_id
# contains "unknown" form the test set, everything else is training data.
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

### Split dataset

In [3]:
# Constants shared by the modelling cells
RANDOM_SEED = 47     # seed passed as random_state
TARGET = "media_id"  # target label column

# Test case with MAGs: the features are every column except the two identifier
# columns; the labels are the TARGET column of the same frames.
X_train, X_test = (
    part.drop(columns=["taxon_id", "media_id"]) for part in (df_train, df_test)
)
y_train, y_test = df_train[TARGET], df_test[TARGET]

# Model

### Pipeline

In [4]:
# Keeping every preprocessing step inside one sklearn Pipeline helps prevent
# data leakage; add further steps to the list below.
umap_step = umap.UMAP(
    n_components=30,
    n_neighbors=40,
    min_dist=0.4,
    metric="euclidean",
    n_epochs=200,  # 200 recommended for large datasets, higher => stricter clustering
    random_state=RANDOM_SEED,
    n_jobs=1,
)

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),    # Scale data (optional)
    ("pca", PCA(n_components=120)),  # Reduce dimensionality with PCA
    ("umap", umap_step),             # Further reduce with UMAP
])

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data.
# Only the training set is used for fitting; the test set is then passed through
# the already-fitted steps with transform().
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Cluster

In [6]:
# Cluster the reduced training data, then assign each test point to one of the
# fitted clusters.
clusterer = KMeans(random_state=RANDOM_SEED, n_clusters=181) #177
clusterer.fit(X_train_transformed)
cluster_labels = clusterer.labels_
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [7]:
# Performance of the first reduction stage (PCA): share of the input variance
# kept by the retained components. Print `explained_variance` itself to see the
# per-component values.
pca = pipeline["pca"]
explained_variance = pca.explained_variance_ratio_
total_explained = explained_variance.sum()
print(f"Total PCA explained variance: {total_explained}")

# Performance of the full reduction plus clustering.
# Silhouette: <0.25 implies poor clustering, 0.25<x<0.50 is fair, >0.50 is good
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
n_training_clusters = clusterer.labels_.max()+1
print(n_training_clusters, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}")
#84, 177, 68

Total PCA explained variance: 0.842233442838392
181 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6864229440689087


# Classifier

In [15]:
# The classifiers below are trained on, and scored against, the KMeans cluster
# assignments rather than the media_id labels.
y2_train, y2_test = cluster_labels, test_clusters

### Random Forest

In [16]:
# Random forest trained to reproduce the KMeans cluster of a reduced sample
rfc = RandomForestClassifier(
    random_state=RANDOM_SEED,
    n_jobs=-1,
    n_estimators=100,
).fit(X_train_transformed, y2_train)

# Predicted cluster for every test sample
print(rfc.predict(X_test_transformed))

[166  60 166 166   2  60   2 166  60   2  60  60 160 166 166  60 166  60
 112 166 166 136  60]


In [17]:
# Score the random forest against the KMeans assignment of the test samples
rf_test_labels = rfc.predict(X_test_transformed)
rf_test_proba = rfc.predict_proba(X_test_transformed)

report = classification_report(
    y_true=y2_test,
    y_pred=rf_test_labels,
    zero_division=0.0,  # Default: set to "warn", or float {0.0, 1.0}
)

# One-vs-one ROC AUC, weighted; `labels` lists every cluster seen in training
roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=rf_test_proba,
    multi_class="ovo",
    average="weighted",
    labels=np.unique(y2_train),
)

print(report)
print("RF ROC AUC score:", roc_score)

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         0
          60       1.00      1.00      1.00         8
         112       1.00      0.14      0.25         7
         136       1.00      1.00      1.00         1
         160       1.00      1.00      1.00         1
         166       0.56      0.83      0.67         6

    accuracy                           0.70        23
   macro avg       0.76      0.66      0.65        23
weighted avg       0.88      0.70      0.68        23

RF ROC AUC score: 0.9932712215320911


### Nearest Neighbors

In [29]:
# k-nearest-neighbors classifier fitted on the same reduced training data and
# cluster labels
knc = KNeighborsClassifier(n_neighbors=10).fit(X_train_transformed, cluster_labels)

# Distances to, and indices of, the nearest training samples of each test sample
distances, indices = knc.kneighbors(X_test_transformed)

# Predicted cluster for every test sample
print(knc.predict(X_test_transformed))

[166  60 166 166   2  60   2 166  60   2  60  60 160 166   2  60 166  60
 112 166 166 136  60]


In [30]:
# Score the nearest-neighbors classifier against the KMeans assignment of the
# test samples
knn_test_labels = knc.predict(X_test_transformed)
knn_test_proba = knc.predict_proba(X_test_transformed)

report = classification_report(
    y_true=y2_test,
    y_pred=knn_test_labels,
    zero_division=0.0,  # Default: set to "warn", or float {0.0, 1.0}
)

# One-vs-one ROC AUC, weighted; `labels` lists every cluster seen in training
roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=knn_test_proba,
    multi_class="ovo",
    average="weighted",
    labels=np.unique(y2_train),
)

print(report)
print("KNN ROC AUC score:", roc_score)

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         0
          60       1.00      1.00      1.00         8
         112       1.00      0.14      0.25         7
         136       1.00      1.00      1.00         1
         160       1.00      1.00      1.00         1
         166       0.62      0.83      0.71         6

    accuracy                           0.70        23
   macro avg       0.77      0.66      0.66        23
weighted avg       0.90      0.70      0.70        23

KNN ROC AUC score: 0.9736671842650104


# Outputs

### Train, clustering labels

In [31]:
# Training data: the reduced components plus bookkeeping columns
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
# train_df gets a fresh 0..n-1 index, whereas y_train and df_train keep the row
# labels of the frame they were sliced from. Assigning those Series directly
# would align on labels and misplace or blank values whenever the labels are not
# exactly 0..n-1, so the values are assigned by position instead.
train_df['Target'] = y_train.to_numpy()
train_df['KMeans Cluster'] = cluster_labels
train_df['taxon_id'] = df_train["taxon_id"].to_numpy()

# Media cluster labels: look up each target medium's cluster from the
# media-clusters file (targets missing from the file map to NaN)
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_df['Media Cluster'] = train_df['Target'].map(target_label_mapping)

#train_df.to_csv(os.path.join(DATA_DIR, "model", "train-output.csv"), index=False)
train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 26,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,Media Cluster
0,8.902985,6.311824,5.316379,5.1378,4.203104,3.404017,3.439919,6.720155,5.574666,3.797663,...,4.519817,5.875326,5.67431,4.63647,5.21877,Training,J346,152,100.0,1.0
1,8.922006,6.046684,4.744275,5.378962,4.545406,2.476492,4.640085,5.626589,5.783877,6.293915,...,5.851049,5.255988,5.467108,4.63105,4.573239,Training,J386,154,100.0,3.0
2,8.907426,5.947838,5.515216,4.735327,4.963597,2.512275,5.892787,3.863709,3.047846,5.497529,...,5.406521,2.281028,3.29653,4.206986,1.211805,Training,J233,60,100225.0,1.0
3,8.528702,6.46521,5.520042,3.489518,4.083768,2.170034,5.162029,3.853132,2.351385,4.91607,...,5.100926,4.72,5.041985,4.712763,3.835999,Training,J22,158,1002526.0,0.0
4,9.348657,6.346339,5.889155,5.136734,4.124396,1.470808,6.186804,4.25995,4.489815,6.489859,...,5.260788,2.060838,3.147772,4.676081,4.058699,Training,1a,81,1004166.0,1.0


### Test, classifier labels

In [32]:
# Test data: reduced components, bookkeeping columns and the label predicted by
# each classifier. Values taken from df_test are converted to plain lists so
# they are assigned by position.
test_df = pd.DataFrame(
    X_test_transformed,
    columns=[f"Component {k}" for k in range(1, X_test_transformed.shape[1] + 1)],
).assign(**{
    "Set": "Test",
    "Target": df_test["media_id"].tolist(),
    "KMeans Cluster": test_clusters,
    "taxon_id": df_test["taxon_id"].tolist(),
    # Classifier labels
    "KMN Classify": knc.predict(X_test_transformed),
    "RF Classify": rfc.predict(X_test_transformed),
})

#test_df.to_csv(os.path.join(DATA_DIR, "model", "test-output.csv"), index=False)
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,KMN Classify,RF Classify
0,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.389772,3.723034,4.849397,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166
1,8.812132,6.015193,5.503372,4.71763,4.969475,2.45269,5.655346,3.828663,2.829012,5.63935,...,2.333013,3.299282,4.554641,1.146146,Test,unknown,60,KBase_derived_Bin.002.fastaBA_F_extracted_bins...,60,60
2,8.846539,6.085261,5.779021,4.714541,4.6665,2.184908,5.701271,4.297401,5.19326,5.660457,...,2.477576,3.586415,4.7695,2.233284,Test,unknown,112,KBase_derived_Bin.004.fastaBA_F_extracted_bins...,166,166
3,8.825863,6.086555,5.733594,4.675618,4.68762,2.191548,5.642962,4.377346,5.275274,5.68234,...,2.528645,3.641576,4.767701,2.30409,Test,unknown,112,KBase_derived_Bin.005.fastaBA_F_extracted_bins...,166,166
4,8.877181,5.975276,5.536195,4.701238,4.633231,2.264041,5.651309,4.370574,5.198583,5.628085,...,2.831897,3.66568,4.876712,2.488525,Test,unknown,112,KBase_derived_Bin.006.fastaBA_F_extracted_bins...,2,2


### ID nearest neighbors

In [33]:
# Attach the nearest-neighbor indices to the test dataframe and explode them so
# the result has one row per (test sample, neighbor) pair. Indices are stored as
# strings because they are the merge key below.
knn_df = test_df.rename(columns={"KMeans Cluster": "Test Cluster"})
knn_df["indices"] = pd.Series(
    [[str(position) for position in row] for row in indices],
    index=knn_df.index,
)
knn_df = knn_df.explode("indices")

# Lookup table for the training samples.
# kneighbors() reports each neighbor as a row POSITION in the data the classifier
# was fitted on (X_train_transformed, which train_df mirrors row for row), not as
# a df_train index label. The key is therefore the row position; using
# df_train.index would only be correct when those labels happen to be 0..n-1.
train_index = train_df.rename(columns={"KMeans Cluster": "Train Cluster"})
train_index["indices"] = [str(position) for position in range(len(train_index))]

# Merge to label nearest neighbors with their media and taxon ids.
# NOTE(review): the neighbor's "Train Cluster" is not carried into the merge; add
# it to the selection below if it is needed downstream.
left = knn_df.copy()
right = train_index[["indices", "Target", "taxon_id"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id"})
merged = pd.merge(left, right, on="indices", how="left")

#merged.to_csv(os.path.join(DATA_DIR, "ml-model-output.csv"), index=False)
merged

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 30,Set,Target,Test Cluster,taxon_id,KMN Classify,RF Classify,indices,neighbor_media_id,neighbor_taxon_id
0,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166,7921,J237,75385.0
1,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166,6631,J70,51669.0
2,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166,5666,J156,38402.0
3,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166,2863,J256,1901.0
4,8.799496,6.152494,5.832331,4.717033,4.694175,2.113044,5.447878,4.507904,4.947254,5.843478,...,2.075917,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,166,166,2864,J70,1901.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,8.803839,6.023494,5.490750,4.778646,4.939512,2.380363,5.568592,3.863344,2.701969,5.725461,...,1.106536,Test,unknown,60,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,60,60,537,J280,1226327.0
226,8.803839,6.023494,5.490750,4.778646,4.939512,2.380363,5.568592,3.863344,2.701969,5.725461,...,1.106536,Test,unknown,60,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,60,60,2605,J309,185294.0
227,8.803839,6.023494,5.490750,4.778646,4.939512,2.380363,5.568592,3.863344,2.701969,5.725461,...,1.106536,Test,unknown,60,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,60,60,8247,J280,873191.0
228,8.803839,6.023494,5.490750,4.778646,4.939512,2.380363,5.568592,3.863344,2.701969,5.725461,...,1.106536,Test,unknown,60,KBase_derived_Bin.028.fastaBA_F_extracted_bins...,60,60,3351,J1039,2055893.0


In [34]:
#TODO: extract more information from nearest neighbors and use to predict components/concentrations

# Visualization

### n dimensions

In [35]:
# Training points, colored by their KMeans cluster
fig = px.scatter(
    train_df,
    x="Component 1",
    y="Component 2",  # Change components to visualize each of the 30 dimensions
    color="KMeans Cluster",
    opacity=0.3,
    hover_data=["taxon_id", "Target", "Media Cluster"],
)

# Test points, drawn on top as a separate trace of opaque black markers
test_markers = go.Scatter(
    x=test_df["Component 1"],  # x coordinates
    y=test_df["Component 2"],  # y coordinates
    mode="markers",
    marker={"color": "black", "size": 5, "opacity": 1.0},
    text=test_df["Target"],  # marker hover text
)
fig.add_trace(test_markers)

# Opacity slider for the training trace: 11 stops from 0.0 to 1.0 in steps of 0.1.
# Each stop is i/10, so the labels cover the full opacity range and stop 3 is
# exactly 0.3, the opacity the training trace is first drawn with.
steps = [
    dict(
        method="restyle",
        args=[{"marker.opacity": [i / 10]}, [0]],  # Adjust opacity of the first trace only
        label=str(i / 10),
    )
    for i in range(11)
]
sliders = [dict(
    active=3,  # initial stop, corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

# Attach the slider, set the title and theme, then render the figure
fig.update_layout(
    title="KMeans",
    template="plotly_white",
    sliders=sliders,
).show()

### 2 dimensions

#### 2D Pipeline

In [36]:
# Copy of the model pipeline with UMAP reduced to two output dimensions, used for
# plotting. ***LEADS TO DISCONNECT IN DIMENSIONALITY REDUCTION STEPS***
umap_2d = umap.UMAP(
    n_components=2,  # reduced for visualization
    n_neighbors=40,
    min_dist=0.4,
    metric="euclidean",
    n_epochs=200,
    random_state=RANDOM_SEED,
    n_jobs=1,
)
visualization = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=120)),
    ("umap", umap_2d),
])

# Fit on the training data only, then project the test data
X_train_embedding = visualization.fit_transform(X_train)
X_test_embedding = visualization.transform(X_test)

# Cluster the 2-D embedding.
# NOTE(review): 177 clusters here vs. 181 in the model cell; confirm this is intended.
vclusterer = KMeans(random_state=RANDOM_SEED, n_clusters=177)
vclusterer.fit(X_train_embedding)
vcluster_labels = vclusterer.labels_
vtest_clusters = vclusterer.predict(X_test_embedding)

# Metrics to compare to 30 dimensions
vpca = visualization["pca"]
vexplained_variance = vpca.explained_variance_ratio_
vsilhouette_avg = silhouette_score(X_train_embedding, vcluster_labels)
print(f"Total PCA explained variance: {vexplained_variance.sum()}")
print(vclusterer.labels_.max()+1, "KMeans training clusters")
# Silhouette: <0.25 implies poor clustering, 0.25<x<0.50 is fair, >0.50 is good
print(f"Average Silhouette Score on KMeans clusters: {vsilhouette_avg}")


Graph is not fully connected, spectral embedding may not work as expected.



Total PCA explained variance: 0.8421925689547426
177 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6967038512229919


#### Visualization

In [37]:
# Training data: the 2-D embedding plus bookkeeping columns
train_2d = pd.DataFrame(X_train_embedding, columns=[f"Component {i+1}" for i in range(X_train_embedding.shape[1])])
train_2d['Set'] = 'Training'
# train_2d gets a fresh 0..n-1 index, whereas y_train and df_train keep the row
# labels of the frame they were sliced from. Assigning those Series directly
# would align on labels and misplace or blank values whenever the labels are not
# exactly 0..n-1, so the values are assigned by position instead.
train_2d['Target'] = y_train.to_numpy()
# NOTE(review): these are the labels from the model's KMeans (cluster_labels),
# not the 2-D clustering (vcluster_labels); confirm that is the intended coloring.
train_2d['KMeans Cluster'] = cluster_labels
train_2d['taxon_id'] = df_train["taxon_id"].to_numpy()

# Media cluster labels: look up each target medium's cluster from the
# media-clusters file (targets missing from the file map to NaN)
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_2d['Media Cluster'] = train_2d['Target'].map(target_label_mapping)

# Test data: values from df_test are converted to lists so they are assigned by
# position
test_2d = pd.DataFrame(X_test_embedding, columns=[f"Component {i+1}" for i in range(X_test_embedding.shape[1])])
test_2d['Set'] = 'Test'
test_2d['Target'] = list(df_test['media_id'])
test_2d['KMeans Cluster'] = test_clusters
test_2d['taxon_id'] = list(df_test['taxon_id'])

In [39]:
# Preview the first rows of the 2-D test frame
test_2d.head()

Unnamed: 0,Component 1,Component 2,Set,Target,KMeans Cluster,taxon_id
0,0.057472,-4.731206,Test,unknown,166,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,8.9175,-0.723673,Test,unknown,60,KBase_derived_Bin.002.fastaBA_F_extracted_bins...
2,-0.385404,-4.385252,Test,unknown,112,KBase_derived_Bin.004.fastaBA_F_extracted_bins...
3,-0.565663,-4.463836,Test,unknown,112,KBase_derived_Bin.005.fastaBA_F_extracted_bins...
4,-0.508184,-4.250774,Test,unknown,112,KBase_derived_Bin.006.fastaBA_F_extracted_bins...


In [41]:
# Training points in the 2-D embedding, colored by their KMeans cluster
fig = px.scatter(
    train_2d,
    x="Component 1",
    y="Component 2",  # the 2-D embedding only has these two components
    color="KMeans Cluster",
    opacity=0.3,
    hover_data=["taxon_id", "Target", "Media Cluster"],
)

# Test points, drawn on top as a separate trace of opaque black markers
test_markers_2d = go.Scatter(
    x=test_2d["Component 1"],  # x coordinates
    y=test_2d["Component 2"],  # y coordinates
    mode="markers",
    marker={"color": "black", "size": 5, "opacity": 1.0},
    text=test_2d["taxon_id"],  # marker hover text
)
fig.add_trace(test_markers_2d)

# Opacity slider for the training trace: 11 stops from 0.0 to 1.0 in steps of 0.1.
# Each stop is i/10, so the labels cover the full opacity range and stop 3 is
# exactly 0.3, the opacity the training trace is first drawn with.
steps = [
    dict(
        method="restyle",
        args=[{"marker.opacity": [i / 10]}, [0]],  # Adjust opacity of the first trace only
        label=str(i / 10),
    )
    for i in range(11)
]
sliders = [dict(
    active=3,  # initial stop, corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

# Attach the slider, set the title and theme, then render the figure
fig.update_layout(
    title="KMeans",
    template="plotly_white",
    sliders=sliders,
).show()