# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Classfication
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Loading output dataframe from data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), low_memory=False)

# Filter non-specific ec numbers (e.g., 1.1.1,-)
filter = df["ec"].str.contains("-")
df = df[~filter]

# Format for machine learning
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

### Split dataset

In [3]:
# Split the datasets into training and test sets
TARGET = "media_id" # target label
RANDOM_SEED = 47 # seed for random_state

# Test case with MAGs
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train[TARGET] 
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test[TARGET]

# Model

### Pipeline

In [4]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=120)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_epochs=200, #200 recommended for large datasets, higher => stricter clustering
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=30, 
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Cluster

In [6]:
# Cluster
clusterer = KMeans(n_clusters=175, random_state=RANDOM_SEED) #173, 177, 181
cluster_labels = clusterer.fit_predict(X_train_transformed)
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [7]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
#print(f"PCA explained variance: {explained_variance}") # variance of each component
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Further dimensionality reduction and clustering performance
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(clusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good
#84, 177, 68

Total PCA explained variance: 0.8422104585700301
175 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6916720271110535


# Classifier

In [8]:
y2_train = cluster_labels
y2_test = test_clusters

### Random Forest

In [9]:
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)

rfc.fit(X_train_transformed, y2_train)
print(rfc.predict(X_test_transformed))

[ 47  93 162  93  71  47 162 162 162  47  71  71 108  71  93 113 162  71
  47  71 108 113 162  71  71 144 162  71  71  71 113 162 109 109 162 162
 162 144 162 162  71 162  71 162  71 162  71  71 162  93  71 113 162  71
  47  47 109  93 162 162 108 162  71  47  71 113 108  71  47  71  93 144
  93 162  71 108 144 108 108  93  93 162 108  71 113  71  71  71 113 142
 162 162  71 162  71  71 162 109 162   2 113 113  62  71  93 108 162  71
 108 162 162  71  47 162  71  71 162 162 162 162 162 113  71  93  47 144
  93  93  47 174 162  93 108 144 113 113 162 162 162 162 162  71 162  52
 144 105 162  47 113 162 108 162 162  71 162  71 113 162 162  71  71  71
 162  47 142 113  71 113 162 162 113  93 112 162  71 108  93  71  71  71
 162 162 162  71  47  71  93  93  71  71  71 162 162 144 113 162 162 113
  71  47  71  71 112  93 113 162 174 162 162  71  62 162 113  93 162  71
 162  47 162 113 162  93 113 113 113 113 162 113  71  71 161 113  71 113
  71 162 113 162 113  47  71 162 162 144 162 162  4

In [10]:
report = classification_report(
    y_true=y2_test,
    y_pred=rfc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=rfc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("RF ROC AUC score:", roc_score)

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         0
          47       0.78      0.93      0.85        15
          52       1.00      1.00      1.00         1
          62       1.00      1.00      1.00         2
          71       0.79      0.96      0.87        51
          88       0.00      0.00      0.00         1
          93       1.00      0.57      0.72        37
         105       1.00      1.00      1.00         1
         108       0.85      1.00      0.92        11
         109       0.50      1.00      0.67         2
         112       0.50      1.00      0.67         1
         113       1.00      1.00      1.00        31
         142       1.00      0.50      0.67         4
         144       1.00      1.00      1.00         9
         161       1.00      0.50      0.67         2
         162       0.92      0.93      0.93        76
         174       1.00      0.67      0.80         3

    accuracy              

### Nearest Neighbors

In [11]:
knc= KNeighborsClassifier(n_neighbors=10)
knc.fit(X_train_transformed, cluster_labels)
distances, indices = knc.kneighbors(X_test_transformed)

print(knc.predict(X_test_transformed))

[ 47  93  47  93  93  47 162 162  71  47  71  93 108  71  93 113 162  47
  47  71 108 113 162  71  93 144  93  47 108  47 113 162 142 109 162 162
 162 144  47  93  71 162  71  93  71  93  71  93 162  93  71 113 162  71
  47  93 109  93  47 162 108  47  71  47  71 113  71  71  47  71  93 144
  93  47  71 108 144 108 108  93  93  93 108  71 113  71  71  93 113 142
 162 162  93 162  93  71  47 109 162 161 113 113  62 162  93 108  93  71
 108  93  93  71  47 162  93  71 162 162 162 162 162 113  93  93  47  71
  93  93  47 108 162  93  71 144 113 113 162  93  93  93  93  71 162  52
 144 105 162  47 113 162 108 162 162  71 162  93 113 162  47 108  71  71
  47  47 142 113  71 113 162  47  62  93 112 162  93  71  93  71  71  71
 162  93  47 162  47  71  93  93  71  93  71  47  93 144 113  88 162 113
  93  47  71  47  47  93 113 162 174  93  93  93  62 162 113  93  93  71
 162  47 162  62 162  93 113 113 113 113 162 113 162  71 161 113  71 113
  93  93 113 162 113  47  71 162 162  71 162  93  4

In [12]:
report = classification_report(
    y_true=y2_test,
    y_pred=knc.predict(X_test_transformed),
    zero_division=0.0 # Default: set to "warn", or float {0.0, 1.0}
)

roc_score = metrics.roc_auc_score(
    y_true=y2_test,
    y_score=knc.predict_proba(X_test_transformed),
    labels=np.unique(y2_train),
    average="weighted",
    multi_class="ovo"
)

print(report)
print("KNN ROC AUC score:", roc_score)

              precision    recall  f1-score   support

          47       0.45      1.00      0.62        15
          52       1.00      1.00      1.00         1
          62       0.50      1.00      0.67         2
          71       0.89      0.76      0.82        51
          88       1.00      1.00      1.00         1
          93       0.62      0.95      0.75        37
         105       1.00      1.00      1.00         1
         108       0.77      0.91      0.83        11
         109       0.67      1.00      0.80         2
         112       1.00      1.00      1.00         1
         113       1.00      0.94      0.97        31
         142       1.00      0.75      0.86         4
         144       1.00      0.78      0.88         9
         161       1.00      1.00      1.00         2
         162       0.98      0.62      0.76        76
         174       1.00      0.33      0.50         3

    accuracy                           0.79       247
   macro avg       0.87   

# Outputs

### Train, clustering labels

In [22]:
# Training data
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['KMeans Cluster'] = cluster_labels
train_df['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_df['Media Cluster'] = train_df['Target'].map(target_label_mapping)

train_df.to_csv(os.path.join(DATA_DIR, "model", "train-output.csv"), index=False)
train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 26,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,Media Cluster
0,8.66586,5.577676,5.371725,5.590428,4.462435,3.454598,5.416646,4.121308,4.994847,4.627648,...,4.564337,4.558545,5.42695,5.449899,4.724612,Training,J346,152,100.0,9.0
1,8.68999,5.846525,4.868929,5.488967,4.65727,2.008369,5.241614,5.676119,5.701665,3.389722,...,4.270833,5.498604,5.228493,4.609472,4.734795,Training,J386,118,100.0,9.0
2,9.09754,5.762683,5.813603,4.016942,5.071388,2.454074,5.308674,5.561325,6.254646,6.693429,...,6.122455,6.201872,5.613096,1.017154,3.272479,Training,J233,113,100225.0,2.0
3,8.803185,6.111124,5.162669,3.758887,6.443524,3.235219,5.243846,5.465508,6.954449,7.301729,...,5.402921,4.436521,3.516246,3.657414,6.896559,Training,J22,73,1002526.0,0.0
4,9.02465,6.179308,5.997299,3.909567,5.068938,2.089074,5.581003,6.203505,5.237926,4.415408,...,6.618696,5.366315,4.820214,5.380837,5.503313,Training,1a,90,1004166.0,9.0


### Test, classifier labels

In [23]:
# Test data
test_df = pd.DataFrame(X_test_transformed, columns=[f"Component {i+1}" for i in range(X_test_transformed.shape[1])])
test_df['Set'] = 'Test'
test_df['Target'] = list(df_test['media_id'])
test_df['KMeans Cluster'] = test_clusters
test_df['taxon_id'] = list(df_test['taxon_id'])

# Add classifier labels 
test_df["KNN Classify"] = knc.predict(X_test_transformed)
test_df["RF Classify"] = rfc.predict(X_test_transformed)

test_df.to_csv(os.path.join(DATA_DIR, "model", "test-output.csv"), index=False)
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 27,Component 28,Component 29,Component 30,Set,Target,KMeans Cluster,taxon_id,KNN Classify,RF Classify
0,9.19728,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.818435,5.084758,3.106649,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47
1,9.078587,5.848104,5.210234,4.419587,5.071416,2.282931,5.310214,5.968658,5.375305,5.474864,...,5.425525,4.931853,3.149422,5.275607,Test,unknown,93,KBase_derived_Bin.001.fastaBA_S_extracted_bins...,93,93
2,9.129475,5.856022,5.268515,4.503699,5.10458,2.220196,5.376308,6.049508,5.323339,5.308051,...,5.473971,4.91183,3.231877,5.416367,Test,unknown,162,KBase_derived_Bin.001.fastaCF_F_extracted_bins...,47,162
3,9.040198,5.87412,5.188167,4.33495,5.084061,2.228595,5.293644,5.931281,5.510087,5.425838,...,5.378965,4.921168,3.32128,5.159742,Test,unknown,93,KBase_derived_Bin.001.fastaCG_S_extracted_bins...,93,93
4,9.113511,5.839798,5.236561,4.422174,5.096156,2.201019,5.335394,6.03117,5.389693,5.353369,...,5.441777,4.940167,3.262278,5.315568,Test,unknown,93,KBase_derived_Bin.001.fastaFE_F_extracted_bins...,93,71


### ID nearest neighbors

In [24]:
# Add nearest neighbor indices to the test dataframe
knn_df = test_df.copy()
knn_df = knn_df.rename(columns={"KMeans Cluster": "Test Cluster"})
neighbors = pd.DataFrame(indices) # Indices of nearest neighbors from the training set
knn_df['indices'] = neighbors[neighbors.columns[0:]].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
knn_df["indices"] = knn_df["indices"].str.split(';')
knn_df = knn_df.explode("indices")

# Add original indices as a column for the train dataframe
train_index = train_df.copy()
train_index = train_index.rename(columns={"KMeans Cluster": "Train Cluster"})
train_index["indices"] = df_train.index
train_index["indices"] = train_index["indices"].astype(str)

# Merge to label nearest neighbors
left = knn_df.copy()
right = train_index[["indices", "Target", "taxon_id"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id", "Cluster": "neighbor_cluster"})
merged = pd.merge(left, right, on="indices", how="left")

merged.to_csv(os.path.join(DATA_DIR, "ml-model-output.csv"), index=False)
merged

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 30,Set,Target,Test Cluster,taxon_id,KNN Classify,RF Classify,indices,neighbor_media_id,neighbor_taxon_id
0,9.197280,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47,1516,J303,1520.0
1,9.197280,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47,339,J22,113288.0
2,9.197280,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47,4165,J436,265470.0
3,9.197280,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47,1383,J1258,1485545.0
4,9.197280,5.825243,5.394021,4.466215,5.150658,2.001808,5.536055,6.155151,5.887779,5.133421,...,5.032887,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,47,47,193,141c,1080712.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2465,9.160114,5.838995,5.062997,4.203086,5.148105,2.185012,5.470611,5.987193,5.684276,4.849582,...,5.361901,Test,unknown,47,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,47,47,4844,1203a,2981779.0
2466,9.160114,5.838995,5.062997,4.203086,5.148105,2.185012,5.470611,5.987193,5.684276,4.849582,...,5.361901,Test,unknown,47,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,47,47,5450,J84,363265.0
2467,9.160114,5.838995,5.062997,4.203086,5.148105,2.185012,5.470611,5.987193,5.684276,4.849582,...,5.361901,Test,unknown,47,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,47,47,5447,1203a,363265.0
2468,9.160114,5.838995,5.062997,4.203086,5.148105,2.185012,5.470611,5.987193,5.684276,4.849582,...,5.361901,Test,unknown,47,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,47,47,5449,J14,363265.0


In [16]:
#TODO: extract more information from nearest neighbors and use to predict components/concentrations

# Visualization

### n dimensions

In [17]:
# Plot the training points
fig = px.scatter(
    data_frame = train_df,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_df["Component 1"],  # x coordinates
        y=test_df["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["taxon_id"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()

### 2 dimensions

#### 2D Pipeline

In [18]:
# copy of model pipeline reduced to two dimensions, ***LEADS TO DISCONNECT IN DIMENSIONALITY REDUCTION STEPS***
visualization = Pipeline([
    ('scaler', StandardScaler()),   
    ('pca', PCA(n_components=120)),   
    ('umap', umap.UMAP(              
        metric="euclidean",
        n_epochs=200, 
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_components=2, # reduced for visualization
        n_neighbors=40, 
        min_dist=0.4 
        )) 
]) 

# Fit and transform the pipeline on the training and test data
X_train_embedding = visualization.fit_transform(X_train) # fit pipeline and transform training data
X_test_embedding = visualization.transform(X_test) # transform test data

# Cluster
vclusterer = KMeans(n_clusters=177, random_state=RANDOM_SEED)
vcluster_labels = vclusterer.fit_predict(X_train_embedding)
vtest_clusters = vclusterer.predict(X_test_embedding)

# Metrics to compare to 30 dimensions
vpca = visualization.named_steps['pca']
vexplained_variance = vpca.explained_variance_ratio_
vsilhouette_avg = silhouette_score(X_train_embedding, vcluster_labels)
print(f"Total PCA explained variance: {vexplained_variance.sum()}")
print(vclusterer.labels_.max()+1, "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {vsilhouette_avg}") # <0.25 implies poor clustering, 0.25<x<0.50 is fair, >50 is good


Graph is not fully connected, spectral embedding may not work as expected.



Total PCA explained variance: 0.8422117131388719
177 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.6532658338546753


#### Visualization

In [19]:
# Training data
train_2d = pd.DataFrame(X_train_embedding, columns=[f"Component {i+1}" for i in range(X_train_embedding.shape[1])])
train_2d['Set'] = 'Training'
train_2d['Target'] = y_train
train_2d['KMeans Cluster'] = cluster_labels
train_2d['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "Media Cluster"})
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['Media Cluster']))
train_2d['Media Cluster'] = train_2d['Target'].map(target_label_mapping)

# Test data
test_2d = pd.DataFrame(X_test_embedding, columns=[f"Component {i+1}" for i in range(X_test_embedding.shape[1])])
test_2d['Set'] = 'Test'
test_2d['Target'] = list(df_test['media_id'])
test_2d['KMeans Cluster'] = test_clusters
test_2d['taxon_id'] = list(df_test['taxon_id'])

In [20]:
test_2d.head()

Unnamed: 0,Component 1,Component 2,Set,Target,KMeans Cluster,taxon_id
0,-3.137385,3.773154,Test,unknown,47,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,-3.709665,2.131238,Test,unknown,93,KBase_derived_Bin.001.fastaBA_S_extracted_bins...
2,-4.043037,2.007607,Test,unknown,162,KBase_derived_Bin.001.fastaCF_F_extracted_bins...
3,-3.127633,3.892563,Test,unknown,93,KBase_derived_Bin.001.fastaCG_S_extracted_bins...
4,-4.610725,1.721261,Test,unknown,93,KBase_derived_Bin.001.fastaFE_F_extracted_bins...


In [21]:
# Plot the training points
fig = px.scatter(
    data_frame = train_2d,
    x = "Component 1",
    y = "Component 2", # Change components to visualize each of the 30 dimensions
    color = "Media Cluster",
    hover_data = ["taxon_id", "Target", "KMeans Cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_2d["Component 1"],  # x coordinates
        y=test_2d["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_2d["taxon_id"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()