In [27]:
# Add higher directory to python modules path

import sys

sys.path.append("..")

In [28]:
import os

import pandas as pd

import plotly.express as px

from sklearn.cluster import HDBSCAN, AgglomerativeClustering

import umap

In [29]:
DATA_DIR = "../data/"
CLUSTER_FILE = "ml_input-taxa2component-donato.csv"

RANDOM_SEED = 666

# UMAP parameters
UMAP_METRIC = "euclidean"
UMAP_SEED = 42

In [30]:
data_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "ml",
        CLUSTER_FILE
    )
)

# WARNING: This preprocessing should be done in previous steps!
data_df = data_df.rename(columns={"Media ID": "media_id"})
data_df["taxon_id"] = data_df["taxon_id"].astype(str).str.replace(".0", "0")

data_df.head()

Unnamed: 0,Components,taxon_id,media_id,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,...,Trace element SL-11,Trace element SL-6,Trace element Wolfe's,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract
0,0,536350,709,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,5240,269,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,330590,150a,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,9200,882,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,6497460,104a,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [31]:
X_train = data_df.drop(["Components", "taxon_id", "media_id"], axis=1)
X_train.head()

Unnamed: 0,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,CaCl 2 x 2 H 2 O,CaCl2,Casamino acids (BD BBL),...,Trace element SL-11,Trace element SL-6,Trace element Wolfe's,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


## 1. Predict taxon from media components

### Using binary data

In [32]:
# Add cluster label
cluster = AgglomerativeClustering(
    n_clusters=3,
    linkage="ward"
)
reduced_df_binary = X_train.copy()
reduced_df_binary["Cluster"] = cluster.fit_predict(reduced_df_binary)
reduced_df_binary["Cluster"] = reduced_df_binary["Cluster"].astype(str)

In [33]:
# Add ID columns
reduced_df_binary[["taxon_id", "media_id"]] = \
    data_df[["taxon_id", "media_id"]].values

reduced_df_binary.head()

Unnamed: 0,(NH 4 ) 2 SO 4,(NH 4 ) acetate or Na-pyruvate,0.1 N H 2 SO 4,0.1M Tris/HCl at pH 7.5,Allen’s trace element solution (see medium 88),Artificial sea water,Ca(NO 3 ) 2,CaCl 2 x 2 H 2 O,CaCl2,Casamino acids (BD BBL),...,Trypticase peptone,Tryptone,Vitamin solution 141,Vitamin solution 1412,Vitamin solution 603,VOSO 4 x 5 H 2 O solution (0.01% w/v),Yeast Extract,Cluster,taxon_id,media_id
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,536350,709
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5240,269
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,330590,150a
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,2,9200,882
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,6497460,104a


### Using continuous data

#### Map binary features to a continuous embedding space using UMAP (or another algorithm)

In [34]:
reducer = umap.UMAP(
    metric=UMAP_METRIC,
    n_components=4,
    n_epochs=5000,
    random_state=UMAP_SEED,
    n_jobs=1
)

reduced_df = reducer.fit_transform(X_train)
reduced_df = pd.DataFrame(
    reduced_df,
    columns=[f"Component {i+1}" for i in range(reduced_df.shape[1])]
)

#### Perform the clustering

In [35]:
# Add cluster label
cluster = HDBSCAN(
    min_cluster_size=10,
    min_samples=None,
    metric=UMAP_METRIC,
    n_jobs=-1
)
reduced_df["Cluster"] = cluster.fit_predict(reduced_df)
reduced_df["Cluster"] = reduced_df["Cluster"].astype(str)

In [36]:
# Add ID columns
reduced_df[["taxon_id", "media_id"]] = \
    data_df[["taxon_id", "media_id"]].values

reduced_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Cluster,taxon_id,media_id
0,-0.351901,2.414721,-15.072645,49.442924,0,536350,709
1,-0.144096,2.163689,-14.829208,49.19672,0,5240,269
2,-0.147368,2.694492,-14.889048,48.937775,0,330590,150a
3,0.42677,10.763753,-10.55787,45.669064,1,9200,882
4,1.114709,3.485331,-14.437037,49.309772,0,6497460,104a


#### Visualise the results

In [37]:
fig = px.scatter_3d(
    data_frame=reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Component 4",
    hover_data=[
        "taxon_id",
        "media_id"
    ],
    template="plotly_white"
)
fig.show()

In [38]:
fig = px.scatter_3d(
    data_frame=reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Cluster",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_data=[
        "taxon_id",
        "media_id"
    ],
    template="plotly_white"
)
fig.show()

### Compare both approaches

In [39]:
reduced_df_merged = pd.merge(
    left=reduced_df_binary[["taxon_id", "media_id", "Cluster"]]\
        .rename(columns={"Cluster": "cluster_bin"}),
    right=reduced_df[["taxon_id", "media_id", "Cluster"]]\
        .rename(columns={"Cluster": "cluster_con"}),
    on=["taxon_id", "media_id"],
    how="left"
)

In [40]:
# Choose approach and save clusterd categories for further ML
reduced_df_merged = reduced_df_merged\
    .rename(columns={"cluster_con": "media_id_cluster"})\
    .drop("cluster_bin", axis=1)

reduced_df_merged.to_csv(
    os.path.join(
        DATA_DIR,
        "ml",
        f"{os.path.splitext(CLUSTER_FILE)[0]}_media_cluster.csv"
    )
)

## 2. Predict media from taxon's EC numbers

### Data preprocessing (should already be stored in a table)

In [41]:
komodo_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "komodo",
        "komodo_taxa.txt"
    ),
    sep="\t"
)

# Transform taxon_id column to string for plotting
komodo_df["taxon_id"] = komodo_df["taxon_id"].astype(str).str.replace(".0", "")

# Add genus column
komodo_df["genus"] = komodo_df["organism_name"].str.split(" ").str[0]

# Get media column as a category
komodo_df["dsmz_id"] = komodo_df["dsmz_id"].astype("category")
komodo_df["media_code"] = komodo_df["dsmz_id"].cat.codes

komodo_df

Unnamed: 0,dsmz_id,taxon_id,organism_name,media_name,genus,media_code
0,6268,,,Substrate for DSM 6268,,8044
1,9849,46125,Abiotrophia defectiva,PYG-MEDIUM (modified),Abiotrophia,8611
2,14247,291968,Acaricomes phytoseiuli,TRYPTICASE SOY Yeast extract medium,Acaricomes,880
3,23669,,Acetatifactor muris,For DSM 23669,Acetatifactor,5114
4,5522,2382,Acetitomaculum ruminis,ACETITOMACULUM medium | METHANOBACTERIUM medium,Acetitomaculum,7890
...,...,...,...,...,...,...
8623,473,120045,Zymomonas mobilis subsp. mobilis,ZYMOMONAS medium,Zymomonas,7701
8624,22645,120044,Zymomonas mobilis subsp. pomaceae,ZYMOMONAS medium,Zymomonas,4683
8625,7201,86958,Zymophilus paucivorans,MEDIUM 58 MODIFIED FOR DSM 7201,Zymophilus,8275
8626,20765,86959,Zymophilus raffinosivorans,MEDIUM 58 MODIFIED FOR DSM 20765,Zymophilus,3910


In [42]:
uniprot_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "uniprot",
        "komodo_taxon_to_uniprot_ec.csv"
    )
)

uniprot_df["taxonId"] = uniprot_df["taxonId"].astype(str)
uniprot_df = uniprot_df.rename(columns={
    "taxonId": "taxon_id",
    "ecNumbers": "ec_numbers"
})

# Drop all data points that do not contain an EC number
uniprot_df = uniprot_df.dropna(subset="ec_numbers")
uniprot_df

Unnamed: 0,entryType,primaryAccession,uniProtkbId,taxon_id,fullName,ec_numbers
1,UniProtKB unreviewed (TrEMBL),A0A929MPR1,A0A929MPR1_ABIDE,46125,CTP synthase,6.3.4.2
3,UniProtKB unreviewed (TrEMBL),A0A929QT52,A0A929QT52_ABIDE,46125,Aspartate-semialdehyde dehydrogenase,1.2.1.11
5,UniProtKB unreviewed (TrEMBL),A0A929MMJ5,A0A929MMJ5_ABIDE,46125,Dihydroorotate dehydrogenase,1.3.-.-
6,UniProtKB unreviewed (TrEMBL),A0A929MMW4,A0A929MMW4_ABIDE,46125,Glycerol-3-phosphate dehydrogenase [NAD(P)+],1.1.1.94
7,UniProtKB unreviewed (TrEMBL),A0A929MMY6,A0A929MMY6_ABIDE,46125,Lipid II isoglutaminyl synthase (glutamine-hyd...,6.3.5.13
...,...,...,...,...,...,...
8467,UniProtKB unreviewed (TrEMBL),A0A1H9CYL9,A0A1H9CYL9_9GAMM,355243,Lipoyl synthase,2.8.1.8
8468,UniProtKB unreviewed (TrEMBL),A0A1H9CZ15,A0A1H9CZ15_9GAMM,355243,Serine hydroxymethyltransferase,2.1.2.1
8470,UniProtKB unreviewed (TrEMBL),A0A1H9D344,A0A1H9D344_9GAMM,355243,NAD-dependent protein deacylase,2.3.1.286
8471,UniProtKB unreviewed (TrEMBL),B5A820,B5A820_AMYAL,76020,DNA topoisomerase (ATP-hydrolyzing),5.6.2.2


In [43]:
data_df = pd.merge(
    left=komodo_df,
    right=uniprot_df,
    on="taxon_id",
    how="inner"
)

# Explode EC numbers
data_df["ec_numbers"] = data_df["ec_numbers"].str.split("|")
data_df = data_df.explode("ec_numbers")

data_df = data_df[["taxon_id", "media_code", "ec_numbers"]]\
    .value_counts()\
    .reset_index()

data_df = data_df.pivot(
    index=["taxon_id", "media_code"],
    columns="ec_numbers",
    values="count"
)
data_df = data_df.fillna(0.0).reset_index()
data_df

ec_numbers,taxon_id,media_code,1.-.-.-,1.1.-.-,1.1.1.100,1.1.1.133,1.1.1.135,1.1.1.205,1.1.1.25,1.1.1.262,...,7.2.1.1,7.2.2.14,7.2.2.6,7.2.2.7,7.2.3.1,7.3.2.1,7.3.2.2,7.4.2.5,7.4.2.8,7.6.2.-
0,100469,1377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,102226,7013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1034,8324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1035,8327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103621,1319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,94136,7818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,95160,7096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,96473,495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
385,990712,5276,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Strategy for splitting

In [None]:
from sklearn.model_selection import train_test_split

# TODO: check correct split!
X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(["taxon_id", "media_code"], axis=1),
    data_df["media_code"],
    test_size=0.2,
    random_state=RANDOM_SEED
)

### Clustering

In [None]:
reducer = umap.UMAP(
    metric=UMAP_METRIC,
    n_components=4,
    n_epochs=5000,
    random_state=UMAP_SEED,
    n_jobs=1
)

reduced_df = reducer.fit_transform(X_train)
reduced_df = pd.DataFrame(
    reduced_df,
    columns=[f"Component {i+1}" for i in range(reduced_df.shape[1])]
)

#### Map binary features to a continuous embedding space using UMAP (or another algorithm)

### Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    zero_division="warn"
)

print(report)

              precision    recall  f1-score   support

         126       0.00      0.00      0.00       1.0
         283       0.00      0.00      0.00       1.0
         284       0.00      0.00      0.00       0.0
         329       0.00      0.00      0.00       0.0
         357       0.00      0.00      0.00       1.0
         385       0.00      0.00      0.00       0.0
         450       0.00      0.00      0.00       1.0
         609       0.00      0.00      0.00       0.0
         879       0.00      0.00      0.00       1.0
         893       0.00      0.00      0.00       0.0
         903       0.00      0.00      0.00       1.0
         984       0.00      0.00      0.00       0.0
         988       0.00      0.00      0.00       0.0
         990       0.00      0.00      0.00       1.0
        1121       0.00      0.00      0.00       0.0
        1224       0.00      0.00      0.00       0.0
        1226       0.00      0.00      0.00       1.0
        1247       0.00    


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



### TODO

* **How to standardise and integrate homologs:**
    * [Homology database](https://www.ncbi.nlm.nih.gov/guide/homology/)
    * BLASTP (local; all vs all)
    * Sequence 100% identity + BLASTP
    * Sequence non-100% identity + BLASTP
    * UMAP + NN

**IMPORTANT** We need the standardisation to have a matrix of genes x media instead of genomes x media