In [1]:
# Add higher directory to python modules path

import sys

sys.path.append("..")

In [2]:
import os

import pandas as pd

import plotly.express as px

from sklearn.cluster import HDBSCAN, AgglomerativeClustering

import umap

In [15]:
DATA_DIR = "../data/ml/"
CLUSTER_FILE = "ml_input-taxa2component-donato.csv"

RANDOM_SEED = 666

# UMAP parameters
UMAP_METRIC = "euclidean"
UMAP_SEED = 42

In [16]:
data_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        CLUSTER_FILE
    )
)

# WARNING: This preprocessing should be done in previous steps!
data_df = data_df.rename(columns={"Media ID": "media_id"})
data_df["taxon_id"] = data_df["taxon_id"].astype(str).str.replace(".0", "0")

data_df.head()

Unnamed: 0,Components,taxon_id,media_id,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,...,Trace element SL-11,Trace element SL-6,Trace element Wolfe's,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract
0,0,536350,709,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,5240,269,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,330590,150a,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,9200,882,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,6497460,104a,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [5]:
X_train = data_df.drop(["Components", "taxon_id", "media_id"], axis=1)
X_train.head()

Unnamed: 0,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,CaCl 2 x 2 H 2 O,CaCl2,Casamino acids (BD BBL),...,Trace element SL-11,Trace element SL-6,Trace element Wolfe's,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


## 1. Predict taxon from media components

### Using binary data

In [6]:
# Add cluster label
cluster = AgglomerativeClustering(
    n_clusters=3,
    linkage="ward"
)
reduced_df_binary = X_train.copy()
reduced_df_binary["Cluster"] = cluster.fit_predict(reduced_df_binary)
reduced_df_binary["Cluster"] = reduced_df_binary["Cluster"].astype(str)

In [7]:
# Add ID columns
reduced_df_binary[["taxon_id", "media_id"]] = \
    data_df[["taxon_id", "media_id"]].values

reduced_df_binary.head()

Unnamed: 0,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,CaCl 2 x 2 H 2 O,CaCl2,Casamino acids (BD BBL),...,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract,Cluster,taxon_id,media_id
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,536350,709
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5240,269
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,330590,150a
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,2,9200,882
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,6497460,104a


### Using continuous data

#### Map binary features to a continuous embedding space using UMAP (or another algorithm)

In [8]:
reducer = umap.UMAP(
    metric=UMAP_METRIC,
    n_components=4,
    n_epochs=5000,
    random_state=UMAP_SEED,
    n_jobs=1
)

reduced_df = reducer.fit_transform(X_train)
reduced_df = pd.DataFrame(
    reduced_df,
    columns=[f"Component {i+1}" for i in range(reduced_df.shape[1])]
)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


#### Perform the clustering

In [9]:
# Add cluster label
cluster = HDBSCAN(
    min_cluster_size=10,
    min_samples=None,
    metric=UMAP_METRIC,
    n_jobs=-1
)
reduced_df["Cluster"] = cluster.fit_predict(reduced_df)
reduced_df["Cluster"] = reduced_df["Cluster"].astype(str)

In [10]:
# Add ID columns
reduced_df[["taxon_id", "media_id"]] = \
    data_df[["taxon_id", "media_id"]].values

reduced_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Cluster,taxon_id,media_id
0,-0.351901,2.414721,-15.072645,49.442924,0,536350,709
1,-0.144096,2.163689,-14.829208,49.19672,0,5240,269
2,-0.147368,2.694492,-14.889048,48.937775,0,330590,150a
3,0.42677,10.763753,-10.55787,45.669064,1,9200,882
4,1.114709,3.485331,-14.437037,49.309772,0,6497460,104a


#### Visualise the results

In [11]:
fig = px.scatter_3d(
    data_frame=reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Component 4",
    hover_data=[
        "taxon_id",
        "media_id"
    ],
    template="plotly_white"
)
fig.show()

In [12]:
fig = px.scatter_3d(
    data_frame=reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Cluster",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_data=[
        "taxon_id",
        "media_id"
    ],
    template="plotly_white"
)
fig.show()

### Compare both approaches

In [22]:
reduced_df_merged = pd.merge(
    left=reduced_df_binary[["taxon_id", "media_id", "Cluster"]]\
        .rename(columns={"Cluster": "cluster_bin"}),
    right=reduced_df[["taxon_id", "media_id", "Cluster"]]\
        .rename(columns={"Cluster": "cluster_con"}),
    on=["taxon_id", "media_id"],
    how="left"
)

reduced_df_merged

Unnamed: 0,taxon_id,media_id,cluster_bin,cluster_con
0,536350,709,2,0
1,5240,269,2,0
2,330590,150a,2,0
3,9200,882,2,1
4,6497460,104a,2,0
5,841560,519,0,0
6,14540,259,2,0
7,6930750,1211,0,0
8,9790,172,0,0
9,11080,87,2,0


In [23]:
# Choose approach and save clusterd categories for further ML
reduced_df_merged = reduced_df_merged\
    .rename(columns={"cluster_con": "media_id_cluster"})\
    .drop("cluster_bin", axis=1)

reduced_df_merged.to_csv(
    os.path.join(
        DATA_DIR,
        f"{os.path.splitext(CLUSTER_FILE)[0]}_media_cluster.csv"
    )
)

## 2. Predict media from taxon's EC numbers

In [14]:
# TODO