# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Classfication
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Loading output dataframe from data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), low_memory=False)

# Filter non-specific ec numbers (e.g., 1.1.1,-)
filter = df["ec"].str.contains("-")
df = df[~filter]

# Format for machine learning
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

### Split dataset

In [3]:
# Split the datasets into training and test sets
TARGET = "media_id" # target label
RANDOM_SEED = 47 # seed for random_state

# Test case with MAGs
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train[TARGET] 
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test[TARGET]

# Model

### Pipeline

In [4]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=120)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_epochs=200, #200 recommended for large datasets, higher => stricter clustering
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=30, 
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Cluster

In [6]:
# Cluster
clusterer = KMeans(n_clusters=180, random_state=RANDOM_SEED) #173, 177, 181
cluster_labels = clusterer.fit_predict(X_train_transformed)
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [7]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
#print(f"PCA explained variance: {explained_variance}") # variance of each component
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Further dimensionality reduction and clustering performance
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(clusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good
#84, 177, 68

Total PCA explained variance: 0.8422015229034369
180 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6886022686958313


# Classifier

In [8]:
y2_train = cluster_labels
y2_test = test_clusters

### Random Forest

In [9]:
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)

rfc.fit(X_train_transformed, y2_train)
print(rfc.predict(X_test_transformed))

[ 64 179 151 179 179  24 104  24 179  24  24  24 116 116 116  47 151 151
 130  24 104  47  24 116 179 152 151 179 179  24  47 151 173 114 151  24
 116 152  24  87  24  61  87 151  24 179  24 116  24 179  24  47 116 152
 171 179 135  87 151 151 116  24  87 179 116  47 116 116 152 116  24 152
 179 151  24 104  24 104 116 179 179 179  24 116  47  24 116  24  47 163
 151  61 179 151 152  24 179 171 151 104  47  47 143  24 179 116 116  24
  24 127  87  24  24  24 104 116 116 151 151  47 151  47 104  74 179  24
 179 179 130 116  61 179 116 152  47  47 127  24 151 179 151 116 151 114
 152  87  47 179  47 151 116  24  24 179 179 116  47  24 151 104  24 116
 152 179 163  47 104  47 179  24  47 179 130 114 179  24 151  24 104 104
 151 151 116  24  24  24 116 179 179 179 116  24 151  24  47 151 116  47
 151 130 116  24 135  24  47  47 104 179 179 179 143  24  47 179 151 116
  61 130  24  47 151 179  47  47  47  47 151  47  24 116  90  47  24  47
 116 179  47 151  47 130 116  24 151  24  24 116 13

In [10]:
report = classification_report(
    y_true=y2_test,
    y_pred=rfc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=rfc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("RF ROC AUC score:", roc_score)

              precision    recall  f1-score   support

          24       0.60      0.89      0.71        35
          47       1.00      1.00      1.00        34
          61       1.00      1.00      1.00         4
          64       0.00      0.00      0.00         0
          74       1.00      0.50      0.67         2
          87       1.00      1.00      1.00         6
          90       1.00      1.00      1.00         1
         104       1.00      0.80      0.89        15
         114       0.33      0.50      0.40         2
         116       0.91      0.62      0.74        52
         127       1.00      1.00      1.00         2
         130       1.00      1.00      1.00         7
         135       0.50      1.00      0.67         1
         143       1.00      1.00      1.00         2
         151       0.62      1.00      0.77        20
         152       0.89      0.80      0.84        10
         163       1.00      0.67      0.80         3
         171       0.50    

### Nearest Neighbors

In [11]:
knc= KNeighborsClassifier(n_neighbors=10)
knc.fit(X_train_transformed, cluster_labels)
distances, indices = knc.kneighbors(X_test_transformed)

print(knc.predict(X_test_transformed))

[ 64 179 151 179 179  24 104  24 179 127 116 152 116 116 104  47  24 151
 130 116 104  47  24 116 179 152 151 151 179 116  47 151 173  47 151 116
 116 152  24  87 116  61  87 179 116 179  24 116  24 179 116  47 116 152
 171 179  74  87 151 151 104 151  87 151 116  47 116 116 152 116  24 152
 179 151  24 104 152 104 116 179 179 179 116 116  47 116 116 116  47 163
 151  61 179 151 152 116 179 171 151 104  47  47  47  24 179 116 116  24
 116 127  87 116 179  24 104 116 116 151 151  47 151  47 104  74 179 151
 179 179 130 116  61 179 116 152  47  47 127  24 151 179 151 116 151 114
 152  87  47 179  47 151 116 116  24 179 179 116  47 152 151 104  24 116
 152 179 163  47 104  47 179 116  47 179 130 171 179 116 179 116 104 104
 151 151 116  24  24 116 116 179 179 179 116 116 151  24  47  61 116  47
 151 130 116 116 135 151  47  47 104 179 179 179 143  24  47 179 151 104
  61 130  24  47 151 179  47  47  47  47 151  47  24 179  90  47 116  47
 104 179  47 151  47 130 116  24 179  24  24 116 13

In [12]:
report = classification_report(
    y_true=y2_test,
    y_pred=knc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=knc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("KNN ROC AUC score:", roc_score)

              precision    recall  f1-score   support

          24       1.00      0.69      0.81        35
          47       0.94      1.00      0.97        34
          61       0.80      1.00      0.89         4
          64       0.00      0.00      0.00         0
          74       1.00      1.00      1.00         2
          87       1.00      1.00      1.00         6
          90       1.00      1.00      1.00         1
         104       0.94      1.00      0.97        15
         114       0.00      0.00      0.00         2
         116       0.96      0.94      0.95        52
         127       0.67      1.00      0.80         2
         130       1.00      1.00      1.00         7
         135       1.00      1.00      1.00         1
         143       1.00      0.50      0.67         2
         151       0.59      0.95      0.73        20
         152       0.83      1.00      0.91        10
         163       1.00      0.67      0.80         3
         171       0.67    

# Outputs

### Train, clustering labels

In [13]:
# Training data
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['KMeans Cluster'] = cluster_labels
train_df['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_df['Media Cluster'] = train_df['Target'].map(target_label_mapping)

train_df.to_csv(os.path.join(DATA_DIR, "model", "train-output.csv"), index=False)
train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 26,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,Media Cluster
0,8.840182,5.432602,5.119658,5.64779,4.938649,4.791024,5.036232,6.41101,6.226582,6.500916,...,5.448195,4.885312,5.197779,4.412052,4.51098,Training,J346,156,100.0,9.0
1,8.687406,5.89516,5.106698,4.957048,4.595566,4.786066,4.946724,5.214825,4.453093,3.743618,...,4.026071,4.854664,4.765376,5.323591,5.619465,Training,J386,45,100.0,9.0
2,8.413919,5.727504,5.539958,4.281321,5.613963,5.370376,5.431298,5.425304,1.472727,5.735713,...,3.808705,4.374386,4.460837,8.781687,5.271106,Training,J233,47,100225.0,2.0
3,7.994631,6.327445,6.075785,5.278591,3.837756,6.217298,5.445611,5.136603,2.375182,6.673851,...,4.713346,7.495386,5.238845,6.101686,4.671212,Training,J22,141,1002526.0,0.0
4,10.425395,4.425355,6.571729,4.039037,3.463284,5.701714,4.16772,3.809626,4.497019,2.354812,...,2.423817,4.897007,4.141469,5.040603,4.073425,Training,1a,157,1004166.0,9.0


### Test, classifier labels

In [14]:
# Test data
test_df = pd.DataFrame(X_test_transformed, columns=[f"Component {i+1}" for i in range(X_test_transformed.shape[1])])
test_df['Set'] = 'Test'
test_df['Target'] = list(df_test['media_id'])
test_df['KMeans Cluster'] = test_clusters
test_df['taxon_id'] = list(df_test['taxon_id'])

# Add classifier labels 
test_df["KNN Classify"] = knc.predict(X_test_transformed)
test_df["RF Classify"] = rfc.predict(X_test_transformed)

test_df.to_csv(os.path.join(DATA_DIR, "model", "test-output.csv"), index=False)
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,KNN Classify,RF Classify
0,8.837785,5.424089,5.648731,5.000338,4.70537,4.575614,5.449598,5.880421,2.030343,5.689766,...,4.138461,4.203472,8.111569,5.076855,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64
1,9.199794,5.177211,5.455368,5.079551,4.57778,5.171197,4.726129,4.172611,4.693117,4.187629,...,4.811035,5.101124,6.792866,5.117544,Test,unknown,179,KBase_derived_Bin.001.fastaBA_S_extracted_bins...,179,179
2,9.172503,5.208464,5.436316,5.054221,4.62185,5.140619,4.72953,4.166208,4.701252,4.266988,...,4.809591,5.071842,6.819775,5.131373,Test,unknown,179,KBase_derived_Bin.001.fastaCF_F_extracted_bins...,151,151
3,9.195164,5.167337,5.48694,5.014349,4.609909,5.130751,4.705239,4.183858,4.598471,4.236225,...,4.744719,5.044061,6.780718,5.073137,Test,unknown,179,KBase_derived_Bin.001.fastaCG_S_extracted_bins...,179,179
4,9.247421,5.098809,5.511006,5.025043,4.619584,5.15224,4.660439,4.098059,4.685573,4.118848,...,4.739097,5.058743,6.693886,5.015133,Test,unknown,179,KBase_derived_Bin.001.fastaFE_F_extracted_bins...,179,179


### ID nearest neighbors

In [22]:
# Add nearest neighbor indices to the test dataframe
knn_df = test_df.copy()
knn_df = knn_df.rename(columns={"KMeans Cluster": "Test Cluster"})
neighbors = pd.DataFrame(indices) # Indices of nearest neighbors from the training set
knn_df['indices'] = neighbors[neighbors.columns[0:]].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
knn_df["indices"] = knn_df["indices"].str.split(';')
knn_df = knn_df.explode("indices")

# Add original indices as a column for the train dataframe
train_index = train_df.copy()
train_index = train_index.rename(columns={"KMeans Cluster": "Train Cluster"})
train_index["indices"] = df_train.index
train_index["indices"] = train_index["indices"].astype(str)

# Merge to label nearest neighbors
left = knn_df.copy()
right = train_index[["indices", "Target", "taxon_id", "Media Cluster"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id", "Cluster": "neighbor_cluster"})
merged = pd.merge(left, right, on="indices", how="left")

merged.to_csv(os.path.join(DATA_DIR, "model", "ml-model-output.csv"), index=False)
merged

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Set,Target,Test Cluster,taxon_id,KNN Classify,RF Classify,indices,neighbor_media_id,neighbor_taxon_id,Media Cluster
0,8.837785,5.424089,5.648731,5.000338,4.705370,4.575614,5.449598,5.880421,2.030343,5.689766,...,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64,6065,J475,427754.0,9.0
1,8.837785,5.424089,5.648731,5.000338,4.705370,4.575614,5.449598,5.880421,2.030343,5.689766,...,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64,7447,J475,664640.0,9.0
2,8.837785,5.424089,5.648731,5.000338,4.705370,4.575614,5.449598,5.880421,2.030343,5.689766,...,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64,6064,J26,427754.0,0.0
3,8.837785,5.424089,5.648731,5.000338,4.705370,4.575614,5.449598,5.880421,2.030343,5.689766,...,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64,6702,J26,53358.0,0.0
4,8.837785,5.424089,5.648731,5.000338,4.705370,4.575614,5.449598,5.880421,2.030343,5.689766,...,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,64,64,7446,J26,664640.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2465,8.915476,5.313042,5.594480,5.176408,4.555811,4.962924,5.135685,5.423875,3.764184,4.753271,...,Test,unknown,130,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,7311,1203a,646.0,9.0
2466,8.915476,5.313042,5.594480,5.176408,4.555811,4.962924,5.135685,5.423875,3.764184,4.753271,...,Test,unknown,130,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,4847,J84,2981779.0,9.0
2467,8.915476,5.313042,5.594480,5.176408,4.555811,4.962924,5.135685,5.423875,3.764184,4.753271,...,Test,unknown,130,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,4720,1203a,29370.0,9.0
2468,8.915476,5.313042,5.594480,5.176408,4.555811,4.962924,5.135685,5.423875,3.764184,4.753271,...,Test,unknown,130,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,130,130,4711,J14,29363.0,8.0


In [16]:
#TODO: extract more information from nearest neighbors and use to predict components/concentrations

# Visualization

### n dimensions

In [17]:
# Plot the training points
fig = px.scatter(
    data_frame = train_df,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_df["Component 1"],  # x coordinates
        y=test_df["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["taxon_id"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()

### 2 dimensions

#### 2D Pipeline

In [18]:
# copy of model pipeline reduced to two dimensions, ***LEADS TO DISCONNECT IN DIMENSIONALITY REDUCTION STEPS***
visualization = Pipeline([
    ('scaler', StandardScaler()),   
    ('pca', PCA(n_components=120)),   
    ('umap', umap.UMAP(              
        metric="euclidean",
        n_epochs=200, 
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=2, # reduced for visualization
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

# Fit and transform the pipeline on the training and test data
X_train_embedding = visualization.fit_transform(X_train) # fit pipeline and transform training data
X_test_embedding = visualization.transform(X_test) # transform test data

# Cluster
vclusterer = KMeans(n_clusters=180, random_state=RANDOM_SEED)
vcluster_labels = vclusterer.fit_predict(X_train_embedding)
vtest_clusters = vclusterer.predict(X_test_embedding)

# Metrics to compare to 30 dimensions
vpca = visualization.named_steps['pca']
vexplained_variance = vpca.explained_variance_ratio_
vsilhouette_avg = silhouette_score(X_train_embedding, vcluster_labels)
print(f"Total PCA explained variance: {vexplained_variance.sum()}")
print(vclusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {vsilhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good


Graph is not fully connected, spectral embedding may not work as expected.



Total PCA explained variance: 0.8421879882037544
177 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6629546880722046


#### Visualization

In [19]:
# Training data
train_2d = pd.DataFrame(X_train_embedding, columns=[f"Component {i+1}" for i in range(X_train_embedding.shape[1])])
train_2d['Set'] = 'Training'
train_2d['Target'] = y_train
train_2d['KMeans Cluster'] = cluster_labels
train_2d['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_2d['Media Cluster'] = train_2d['Target'].map(target_label_mapping)

# Test data
test_2d = pd.DataFrame(X_test_embedding, columns=[f"Component {i+1}" for i in range(X_test_embedding.shape[1])])
test_2d['Set'] = 'Test'
test_2d['Target'] = list(df_test['media_id'])
test_2d['KMeans Cluster'] = test_clusters
test_2d['taxon_id'] = list(df_test['taxon_id'])

In [20]:
test_2d.head()

Unnamed: 0,Component 1,Component 2,Set,Target,KMeans Cluster,taxon_id
0,-2.87673,-2.937334,Test,unknown,171,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,-2.5954,-3.899901,Test,unknown,179,KBase_derived_Bin.001.fastaBA_S_extracted_bins...
2,-2.521282,-3.933876,Test,unknown,179,KBase_derived_Bin.001.fastaCF_F_extracted_bins...
3,-2.585663,-4.223344,Test,unknown,179,KBase_derived_Bin.001.fastaCG_S_extracted_bins...
4,-2.522401,-4.091096,Test,unknown,179,KBase_derived_Bin.001.fastaFE_F_extracted_bins...


In [21]:
# Plot the training points
fig = px.scatter(
    data_frame = train_2d,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_2d["Component 1"],  # x coordinates
        y=test_2d["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_2d["taxon_id"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()