# Predict media from taxa

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
import umap

from sklearn import metrics
from sklearn.model_selection import train_test_split

import os
DATA_DIR = "~/Desktop/code/data/"

In [2]:
data_df = pd.read_csv(os.path.join(DATA_DIR, "media2ec-explode.csv"), low_memory=False)

data_df = data_df.dropna(subset='component_id', how='any')
data_df['component_id'] = data_df['component_id'].astype(int)

data_df = data_df[["taxon_id", "media_id", "component_id"]]\
    .value_counts()\
    .reset_index()

data_df = data_df.pivot(
    index=["taxon_id", "media_id"],
    columns="component_id",
    values="count"
)
data_df = data_df.fillna(0.0).reset_index()
data_df.head()

component_id,taxon_id,media_id,4,18,33,37,46,47,60,68,...,240,337,430,497,506,616,688,754,1353,1359
0,24.0,J22,8.0,2.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24.0,J26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33.0,J167,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33.0,J306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,51.0,J443,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model

### Strategy for splitting

In [3]:
TARGET_VAR = "taxon_id"
RANDOM_SEED = 47

X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(["taxon_id", "media_id"], axis=1),
    data_df[TARGET_VAR],
    test_size=0.2,
    random_state=RANDOM_SEED
)

### Dimensionality reduction

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, silhouette_score
import umap
import numpy as np

# Define the pipeline
pipeline = Pipeline([
    ('umap', umap.UMAP( 
        metric="euclidean",
        n_epochs=200,
        random_state=RANDOM_SEED,
        n_jobs=1
    ))
])

# Define parameter grid with correct step names
param_distributions = {
    'umap__n_components': [2, 4, 6, 8, 10, 20],
    'umap__n_neighbors': [5, 10, 20, 30, 40, 50], 
    'umap__min_dist': [0, 0.2, 0.4, 0.6, 0.8, 0.99]
}

# Custom scorer function
def umap_silhouette(X, y):
    embedding = X  # X here is already the transformed data by UMAP
    return silhouette_score(embedding, y)

# Wrap the custom scorer
scorer = make_scorer(umap_silhouette, greater_is_better=True)

# Initialize RandomizedSearchCV
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    scoring=scorer
)

# Fit the search to your data
search.fit(X_train, y_train)

# Retrieve the best parameters
best_params = search.best_params_
best_model = search.best_estimator_

print("Best Parameters:", best_params)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Best Parameters: {'umap__n_neighbors': 30, 'umap__n_components': 10, 'umap__min_dist': 0.4}


In [5]:
# Define the pipeline
reducer = umap.UMAP(
        metric="euclidean",
        n_epochs=500,
        random_state=RANDOM_SEED,
        n_jobs=1,
        n_neighbors=30,
        n_components=10,
        min_dist=0.4
    )

reduced = reducer.fit_transform(X_train)
reduced_df = pd.DataFrame(reduced, columns=[f"Component {i+1}" for i in range(reduced.shape[1])])

reduced_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10
0,-1.773724,13.579668,3.121752,-0.064388,10.343656,8.856904,6.277681,9.24208,2.20315,3.557396
1,-2.63001,-3.7035,8.822997,2.021324,5.997718,-0.941044,5.681618,9.451844,5.813793,1.553398
2,-1.843176,13.483267,3.120849,-0.03789,10.300418,8.939775,6.300568,9.165531,2.090676,3.52162
3,3.028028,-0.327541,4.283557,3.880607,4.224166,3.252759,5.283535,10.281306,7.387136,3.712514
4,-1.789177,13.412326,3.061722,-0.025458,10.267249,8.859686,6.407624,9.19383,2.177684,3.67598


### Clustering

In [6]:
# Silhouette coefficient method
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Test a range of clusters for their silhouette coefficients
clusters = []
for n_cluster in range(10, 200):
    kmeans = KMeans(n_clusters=n_cluster).fit(reduced)
    label = kmeans.labels_
    sil_coeff = silhouette_score(reduced, label, metric='euclidean')
    clusters.append({'n_clusters': n_cluster, 'coefficient': sil_coeff})

# Select the maximum coefficient
clusters = pd.DataFrame(clusters)
c = clusters.iloc[clusters["coefficient"].argmax()]["n_clusters"]
s = clusters["coefficient"].max()
n = int(c)

print("{} clusters returns a maximum Silhouette Coefficient of {}".format(n, s))



12 clusters returns a maximum Silhouette Coefficient of 0.5956470370292664




In [7]:
cluster = KMeans(
    n_clusters=n,
    random_state=RANDOM_SEED
)

reduced_df["Cluster"] = cluster.fit_predict(reduced_df.values)
reduced_df["Cluster"] = reduced_df["Cluster"].astype(str)
reduced_df[["taxon_id", "media_id"]] = data_df.loc[X_train.index, ["taxon_id", "media_id"]].values

reduced_df.to_csv(os.path.join(DATA_DIR, "model", "media-clusters.csv"), index=False)
reduced_df.head()



Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Component 5,Component 6,Component 7,Component 8,Component 9,Component 10,Cluster,taxon_id,media_id
0,-1.773724,13.579668,3.121752,-0.064388,10.343656,8.856904,6.277681,9.24208,2.20315,3.557396,10,134375.0,J27
1,-2.63001,-3.7035,8.822997,2.021324,5.997718,-0.941044,5.681618,9.451844,5.813793,1.553398,5,1448267.0,J26
2,-1.843176,13.483267,3.120849,-0.03789,10.300418,8.939775,6.300568,9.165531,2.090676,3.52162,10,344163.0,J26
3,3.028028,-0.327541,4.283557,3.880607,4.224166,3.252759,5.283535,10.281306,7.387136,3.712514,11,29429.0,J22
4,-1.789177,13.412326,3.061722,-0.025458,10.267249,8.859686,6.407624,9.19383,2.177684,3.67598,10,43057.0,J198


#### Visualize the results

In [8]:
# Plot the training points
fig = px.scatter(
    data_frame = reduced_df,
    x = "Component 1",
    y = "Component 2",
    color = "Cluster",
    hover_data = ["taxon_id", "media_id"],
    opacity = 0.3
) 

fig.update_layout(
    title="Media Clusters",
    template="plotly_white"
)

fig.show()