# Model Inputs

### Packages

In [1]:
# General
import pandas as pd
import numpy as np
import pandas as pd
import os
DATA_DIR = "~/Desktop/code/data/"

# Pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

from sklearn.decomposition import PCA 
import umap
import hdbscan

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Visualization
import plotly.express as px
import plotly.graph_objects as go

### Dataset

In [2]:
# Loading output dataframe from data.ipynb
df = pd.read_csv(os.path.join(DATA_DIR, "ml_input.csv"), low_memory=False)

# Filter non-specific ec numbers (e.g., 1.1.1,-)
filter = df["ec"].str.contains("-")
df = df[~filter]

# Format for machine learning
df = df[["taxon_id", "media_id", "ec"]].value_counts().reset_index()
df = df.pivot(index=["taxon_id", "media_id"], columns="ec", values="count")
df = df.fillna(0.0).reset_index()

# Filtering into training and test sets (database set vs. MAGs)
bins = df["media_id"].str.contains("unknown")
df_train = df[~bins]
df_test = df[bins]

# Model

### Training and test sets

In [3]:
# Split the datasets into training and test sets
TARGET = "media_id" # target label
RDM = 47 # seed for random_state

# Test case with MAGs
X_train = df_train.drop(["taxon_id", "media_id"], axis=1)
y_train = df_train[TARGET]
X_test = df_test.drop(["taxon_id", "media_id"], axis=1)
y_test = df_test[TARGET]

### Pipeline

In [4]:
# sklearn pipeline helps prevent data leakage; incorporate individual steps here
pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Scale data (optional)
    ('pca', PCA(n_components=120)),   # Reduce dimensionality with PCA
    ('umap', umap.UMAP(              # Further reduce with UMAP
        metric="euclidean",
        n_components=20, #20 for clustering, 2 for visualization
        n_epochs=200, #200 recommended for large datasets, higher => stricter clustering
        random_state=RDM,
        n_jobs=1,
        n_neighbors=50,
        min_dist=0.9
        )) 
])

### Fit and transform

In [5]:
# Fit and transform the pipeline on the training and test data
X_train_transformed = pipeline.fit_transform(X_train) # fit pipeline and transform training data
X_test_transformed = pipeline.transform(X_test) # transform test data

### Optimizing n_clusters

In [6]:
# Silhouette coefficient method
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Test a range of clusters for their silhouette coefficients
clusters = []
for n_cluster in range(20, 100):
    kmeans = KMeans(n_clusters=n_cluster).fit(X_train_transformed)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X_train_transformed, label, metric='euclidean')
    clusters.append({'n_clusters': n_cluster, 'coefficient': sil_coeff})

# Select the maximum coefficient
clusters = pd.DataFrame(clusters)
c = clusters.iloc[clusters["coefficient"].argmax()]["n_clusters"]
s = clusters["coefficient"].max()
n = int(c)

print("{} clusters returns a maximum Silhouette Coefficient of {}".format(n-1, s))

96 clusters returns a maximum Silhouette Coefficient of 0.5658345818519592


In [7]:
# Cluster
clusterer = KMeans(n_clusters=n, random_state=RDM)
cluster_labels = clusterer.fit_predict(X_train_transformed)
test_clusters = clusterer.predict(X_test_transformed)

### Metrics

In [8]:
# Initial dimensionality reduction performance (PCA)
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
print(f"Total PCA explained variance: {explained_variance.sum()}")

# Further dimensionality reduction and clustering performance
silhouette_avg = silhouette_score(X_train_transformed, cluster_labels)
print(clusterer.labels_.max(), "KMeans training clusters")
print(f"Average Silhouette Score on KMeans clusters: {silhouette_avg}")
    # Silhouette score <=0.25 implies poor clustering, 0.25<x<=0.50 is fair, <50 is good

Total PCA explained variance: 0.8272554799373779
96 KMeans training clusters
Average Silhouette Score on KMeans clusters: 0.5572218298912048


### Visualization

In [9]:
# Training data
train_df = pd.DataFrame(X_train_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
train_df['Set'] = 'Training'
train_df['Target'] = y_train
train_df['Cluster'] = cluster_labels # only showing training clusters for now, mapping test over these
train_df['taxon_id'] = df_train["taxon_id"]

# Media cluster labels
media_clusters = pd.read_csv(os.path.join(DATA_DIR, "media-clusters.csv"))
media_clusters = media_clusters[["media_id", "Cluster"]]
media_clusters = media_clusters.rename(columns={"media_id": "Target", "Cluster": "mm_cluster"})

# Map labels
target_label_mapping = dict(zip(media_clusters['Target'], media_clusters['mm_cluster']))
train_df['mm_cluster'] = train_df['Target'].map(target_label_mapping)

train_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 16,Component 17,Component 18,Component 19,Component 20,Set,Target,Cluster,taxon_id,mm_cluster
0,10.195477,3.345171,23.74659,5.355264,0.982935,8.091534,6.23688,8.222627,-1.873895,5.554212,...,2.77207,5.488577,5.470057,6.482717,4.284708,Training,J22,10,1002526,0.0
1,13.480032,5.653246,5.777202,4.238627,-1.206559,4.905827,2.986797,5.058038,8.932267,1.217893,...,0.523725,5.120173,6.542607,3.555301,3.634867,Training,1a,0,1004166,1.0
2,6.991783,4.242228,5.043715,2.475263,9.000949,7.156883,6.048267,6.165906,3.411207,-0.892551,...,3.278044,11.585267,2.723828,2.740336,7.729725,Training,J181,20,1004261,1.0
3,6.897001,4.382188,5.186857,2.512902,8.934326,7.260574,5.892082,6.313311,3.454671,-0.823334,...,3.218682,11.584191,2.864377,2.882525,7.828901,Training,J455,20,1004261,
4,10.177027,3.628377,23.859009,5.475954,1.260935,8.217946,6.429015,8.080694,-1.913876,5.264847,...,2.656863,5.32882,5.779419,6.330937,4.326303,Training,J118,10,1005925,1.0


In [12]:
# Test data
test_df = pd.DataFrame(X_test_transformed, columns=[f"Component {i+1}" for i in range(X_train_transformed.shape[1])])
test_df['Set'] = 'Test'
test_df['Target'] = list(df_test['media_id'])
test_df['Cluster'] = test_clusters
test_df['taxon_id'] = list(df_test['taxon_id'])
test_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 15,Component 16,Component 17,Component 18,Component 19,Component 20,Set,Target,Cluster,taxon_id
0,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,2.286815,2.048378,5.543658,5.306262,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...
1,11.080207,6.65892,5.378292,3.23779,5.882771,4.952508,5.167218,4.685688,2.65424,2.490207,...,2.732099,2.241842,5.60913,5.180329,1.435082,6.140223,Test,unknown,58,KBase_derived_Bin.002.fastaBA_F_extracted_bins...
2,11.382848,6.692156,5.259451,3.50948,5.150709,5.003079,5.252345,4.549433,2.859113,2.499007,...,2.2911,2.314097,5.148008,5.209519,1.457379,5.571975,Test,unknown,78,KBase_derived_Bin.004.fastaBA_F_extracted_bins...
3,11.451366,6.842658,5.638997,3.396361,5.238123,5.405813,5.52753,4.295044,2.896203,2.673939,...,2.077215,2.332489,5.250798,5.095497,1.136878,5.967654,Test,unknown,78,KBase_derived_Bin.005.fastaBA_F_extracted_bins...
4,11.347152,6.550403,5.185513,3.543113,5.408257,5.187567,5.449027,4.699819,2.99785,2.398331,...,2.277669,1.910573,5.53055,5.344505,1.670141,6.105617,Test,unknown,78,KBase_derived_Bin.006.fastaBA_F_extracted_bins...


In [13]:
# Plot the training points
fig = px.scatter(
    data_frame = train_df,
    x = "Component 1",
    y = "Component 2",
    color = "Cluster",
    hover_data = ["taxon_id", "Target", "mm_cluster"],
    opacity = 0.3
) 

# Add the test points as a scatter trace
fig.add_trace(
    go.Scatter(
        x=test_df["Component 1"],  # x coordinates
        y=test_df["Component 2"],  # y coordinates
        mode='markers',
        marker=dict(
            color='black',  
            size=5,  
            opacity=1.0 
        ),
        text=test_df["Target"]  # marker hover text
    )
)

# Opacity slider
steps = []
for i in range(11):  # 0 to 1 in steps of 0.1
    step = dict(
        method="restyle",
        args=[{"marker.opacity": [i/20]}, [0]],  # Adjust opacity of the first trace only
        label=str(i/20)
    )
    steps.append(step)
sliders = [dict(
    active=3,  # initial value corresponding to opacity=0.3
    currentvalue={"prefix": "Opacity: "},
    pad={"t": 50},
    steps=steps
)]

fig.update_layout(
    sliders=sliders,
    title="KMeans",
    template="plotly_white",
)

fig.show()

# Classifier

### Nearest Neighbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train_transformed, cluster_labels)

print(neigh.predict(X_test_transformed))
distances, indices = neigh.kneighbors(X_test_transformed)

[58 58 15 54 58 58 78 15 58 15 58 58 12 78 78 58 15 58 78 58 78 78 58]


In [15]:
# Add nearest neighbor indices to test dataframe
output_df = test_df.copy()
nn_df = pd.DataFrame(indices)
output_df['indices'] = nn_df[nn_df.columns[0:]].apply(lambda x: ';'.join(x.dropna().astype(str)), axis=1)
#output_df["KNN_label"] = neigh.predict(X_test_transformed)

# Explode the indices
output_df["indices"] = output_df["indices"].str.split(';')
output_df = output_df.explode("indices")

# Add indices as column for the training dataframe
train_index = train_df.copy()
train_index["indices"] = train_index.index
train_index["indices"] = train_index["indices"].astype(str)

# Merge our dataframes to provide labels
left = output_df.copy()
right = train_index[["indices", "Target", "Cluster", "taxon_id"]]
right = right.rename(columns={"Target": "neighbor_media_id", "taxon_id": "neighbor_taxon_id", "Cluster": "neighbor_cluster"})
merged = pd.merge(left, right, on="indices", how="left")



#merged.to_csv(os.path.join(DATA_DIR, "ml-model-output.csv"), index=False)
merged.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,...,Component 19,Component 20,Set,Target,Cluster,taxon_id,indices,neighbor_media_id,neighbor_cluster,neighbor_taxon_id
0,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,1899,J49,58,1848
1,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,1898,J48,58,1848
2,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,5179,J443,58,57497
3,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,1470,J27,78,1708
4,11.178419,6.563054,5.299446,3.450693,5.54717,5.255319,5.362567,4.639151,2.894568,2.523374,...,1.714843,5.908508,Test,unknown,78,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,1081,339a,72,1535


### Random Forest

In [16]:
y_train = cluster_labels
y_test = test_clusters

In [17]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RDM
)

model.fit(X_train_transformed, y_train)

In [18]:
report = classification_report(
    y_true=y_test,
    y_pred=model.predict(X_test_transformed),
    zero_division="warn"
)

print(report)

              precision    recall  f1-score   support

          12       1.00      1.00      1.00         1
          54       0.00      0.00      0.00         0
          58       0.78      1.00      0.88         7
          72       0.00      0.00      0.00         1
          78       1.00      0.79      0.88        14

    accuracy                           0.83        23
   macro avg       0.56      0.56      0.55        23
weighted avg       0.89      0.83      0.85        23




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



In [19]:
from sklearn import metrics

roc_score = metrics.roc_auc_score(
    y_true=y_test,
    y_score=model.predict_proba(X_test_transformed),
    labels=np.unique(y_train),
    average="weighted",
    multi_class="ovo"
)

print("ROC AUC score:", roc_score)

ROC AUC score: 0.9645445134575569
