In [1]:
# Add higher directory to python modules path

import sys

sys.path.append("..")

In [2]:
import os

import pandas as pd

import plotly.express as px

In [3]:
DATA_DIR = "../data/"

RANDOM_SEED = 666

## Media

In [4]:
komodo_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "komodo",
        "komodo.tsv"
    ),
    sep="\t"
)

# Transform taxon ID column to string for plotting
komodo_df["Taxon ID"] = komodo_df["Taxon ID"].astype(str).str.replace(".0", "")

# Add genus column
komodo_df["Genus"] = komodo_df["Organism Name"].str.split(" ").str[0]

# Get media column as a category
komodo_df["Media"] = komodo_df["Media"].astype("category")
komodo_df["media_code"] = komodo_df["Media"].cat.codes

komodo_df

Unnamed: 0,Organism DSMZ ID,Taxon ID,Organism Name,Media,Genus,media_code
0,6268,,,Substrate for DSM 6268,,2312
1,9849,46125,Abiotrophia defectiva,PYG-MEDIUM (modified),Abiotrophia,2099
2,14247,291968,Acaricomes phytoseiuli,TRYPTICASE SOY Yeast extract medium,Acaricomes,2450
3,23669,,Acetatifactor muris,For DSM 23669,Acetatifactor,729
4,5522,2382,Acetitomaculum ruminis,ACETITOMACULUM medium | METHANOBACTERIUM medium,Acetitomaculum,13
...,...,...,...,...,...,...
8623,473,120045,Zymomonas mobilis subsp. mobilis,ZYMOMONAS medium,Zymomonas,2505
8624,22645,120044,Zymomonas mobilis subsp. pomaceae,ZYMOMONAS medium,Zymomonas,2505
8625,7201,86958,Zymophilus paucivorans,MEDIUM 58 MODIFIED FOR DSM 7201,Zymophilus,1657
8626,20765,86959,Zymophilus raffinosivorans,MEDIUM 58 MODIFIED FOR DSM 20765,Zymophilus,1654


## EC numbers

In [5]:
uniprot_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "uniprot",
        "komodo_taxon_to_uniprot_ec.csv"
    )
)

uniprot_df["taxonId"] = uniprot_df["taxonId"].astype(str)
uniprot_df = uniprot_df.rename(columns={"taxonId": "Taxon ID"})

uniprot_df

Unnamed: 0,entryType,primaryAccession,uniProtkbId,Taxon ID,fullName,ecNumbers
0,UniProtKB reviewed (Swiss-Prot),Q8GR69,DLTC_ABIDE,46125,D-alanyl carrier protein,
1,UniProtKB unreviewed (TrEMBL),A0A929MPR1,A0A929MPR1_ABIDE,46125,CTP synthase,6.3.4.2
2,UniProtKB unreviewed (TrEMBL),A0A929MUP8,A0A929MUP8_ABIDE,46125,Bifunctional protein GlmU,
3,UniProtKB unreviewed (TrEMBL),A0A929QT52,A0A929QT52_ABIDE,46125,Aspartate-semialdehyde dehydrogenase,1.2.1.11
4,UniProtKB unreviewed (TrEMBL),A0A929QTX2,A0A929QTX2_ABIDE,46125,Coenzyme A biosynthesis bifunctional protein C...,
...,...,...,...,...,...,...
8471,UniProtKB unreviewed (TrEMBL),B5A820,B5A820_AMYAL,76020,DNA topoisomerase (ATP-hydrolyzing),5.6.2.2
8472,UniProtKB unreviewed (TrEMBL),F8UBY7,F8UBY7_AMYAL,76020,DNA repair protein RecN,
8473,UniProtKB unreviewed (TrEMBL),Q45HM6,Q45HM6_AMYAL,76020,,
8474,UniProtKB unreviewed (TrEMBL),B5A821,B5A821_9PSEU,102226,DNA topoisomerase (ATP-hydrolyzing),5.6.2.2


In [6]:
# Drop all data points that do not contain an EC number
uniprot_df = uniprot_df.dropna(subset="ecNumbers")
uniprot_df

Unnamed: 0,entryType,primaryAccession,uniProtkbId,Taxon ID,fullName,ecNumbers
1,UniProtKB unreviewed (TrEMBL),A0A929MPR1,A0A929MPR1_ABIDE,46125,CTP synthase,6.3.4.2
3,UniProtKB unreviewed (TrEMBL),A0A929QT52,A0A929QT52_ABIDE,46125,Aspartate-semialdehyde dehydrogenase,1.2.1.11
5,UniProtKB unreviewed (TrEMBL),A0A929MMJ5,A0A929MMJ5_ABIDE,46125,Dihydroorotate dehydrogenase,1.3.-.-
6,UniProtKB unreviewed (TrEMBL),A0A929MMW4,A0A929MMW4_ABIDE,46125,Glycerol-3-phosphate dehydrogenase [NAD(P)+],1.1.1.94
7,UniProtKB unreviewed (TrEMBL),A0A929MMY6,A0A929MMY6_ABIDE,46125,Lipid II isoglutaminyl synthase (glutamine-hyd...,6.3.5.13
...,...,...,...,...,...,...
8467,UniProtKB unreviewed (TrEMBL),A0A1H9CYL9,A0A1H9CYL9_9GAMM,355243,Lipoyl synthase,2.8.1.8
8468,UniProtKB unreviewed (TrEMBL),A0A1H9CZ15,A0A1H9CZ15_9GAMM,355243,Serine hydroxymethyltransferase,2.1.2.1
8470,UniProtKB unreviewed (TrEMBL),A0A1H9D344,A0A1H9D344_9GAMM,355243,NAD-dependent protein deacylase,2.3.1.286
8471,UniProtKB unreviewed (TrEMBL),B5A820,B5A820_AMYAL,76020,DNA topoisomerase (ATP-hydrolyzing),5.6.2.2


In [7]:
fig = px.bar(
    data_frame=uniprot_df.value_counts("Taxon ID").reset_index(),
    x="Taxon ID",
    y="count"
)
fig.show()

In [8]:
data_df = pd.merge(
    left=komodo_df,
    right=uniprot_df,
    on="Taxon ID",
    how="inner"
)

# Explode EC numbers
data_df["ecNumbers"] = data_df["ecNumbers"].str.split("|")
data_df = data_df.explode("ecNumbers")

data_df = data_df[["Taxon ID", "media_code", "ecNumbers"]]\
    .value_counts()\
    .reset_index()

data_df = data_df.pivot(
    index=["Taxon ID", "media_code"],
    columns="ecNumbers",
    values="count"
)
data_df = data_df.fillna(0.0).reset_index()
data_df

ecNumbers,Taxon ID,media_code,1.-.-.-,1.1.-.-,1.1.1.100,1.1.1.133,1.1.1.135,1.1.1.205,1.1.1.25,1.1.1.262,...,7.2.1.1,7.2.2.14,7.2.2.6,7.2.2.7,7.2.3.1,7.3.2.1,7.3.2.2,7.4.2.5,7.4.2.8,7.6.2.-
0,100469,314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,102226,825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1034,125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1035,126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,103621,314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,94136,1414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
382,95160,825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,96473,1991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,990712,121,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Strategy for splitting

In [9]:
from sklearn.model_selection import train_test_split

# TODO: check correct split!
X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(["Taxon ID", "media_code"], axis=1),
    data_df["media_code"],
    test_size=0.2,
    random_state=RANDOM_SEED
)

X_train, X_test, y_train, y_test

(ecNumbers  1.-.-.-  1.1.-.-  1.1.1.100  1.1.1.133  1.1.1.135  1.1.1.205  \
 287            0.0      0.0        0.0        0.0        0.0        0.0   
 375            0.0      0.0        0.0        0.0        0.0        0.0   
 7              0.0      0.0        0.0        0.0        0.0        0.0   
 376            0.0      0.0        0.0        0.0        0.0        0.0   
 76             0.0      0.0        0.0        0.0        0.0        0.0   
 ..             ...      ...        ...        ...        ...        ...   
 222            0.0      0.0        0.0        0.0        0.0        0.0   
 91             0.0      0.0        0.0        0.0        0.0        0.0   
 70             0.0      0.0        0.0        0.0        0.0        0.0   
 318            0.0      0.0        0.0        0.0        0.0        0.0   
 236            0.0      0.0        0.0        0.0        0.0        0.0   
 
 ecNumbers  1.1.1.25  1.1.1.262  1.1.1.267  1.1.1.27  ...  7.2.1.1  7.2.2.14  \
 287  

## RF classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    zero_division="warn"
)

print(report)

              precision    recall  f1-score   support

          -1       0.67      0.67      0.67         3
           1       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          20       1.00      1.00      1.00         1
          29       0.00      0.00      0.00         0
          37       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         1
          42       0.00      0.00      0.00         1
          69       0.00      0.00      0.00         1
         121       0.43      0.75      0.55         4
         123       0.00      0.00      0.00         1
         125       0.00      0.00      0.00         0
         126       0.00      0.00      0.00         1
         133       0.00      0.00      0.00         0
         136       0.00      0.00      0.00         0
         146       0.00      0.00      0.00         1
         148       0.00    


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



## Clustering

In [16]:
from sklearn.cluster import AgglomerativeClustering


clustering = AgglomerativeClustering(
    n_clusters=10,
    metric="euclidean",
    linkage="ward"
)
clustering.fit_predict(X_train)

array([5, 2, 3, 1, 2, 6, 5, 2, 5, 2, 6, 1, 2, 2, 5, 6, 2, 3, 2, 3, 5, 3,
       6, 5, 1, 5, 2, 2, 6, 5, 2, 5, 2, 5, 5, 5, 1, 2, 8, 5, 6, 8, 5, 1,
       5, 5, 0, 5, 5, 3, 5, 6, 5, 5, 2, 3, 3, 5, 5, 8, 3, 6, 4, 5, 5, 8,
       4, 3, 2, 3, 1, 5, 2, 2, 3, 2, 0, 5, 6, 3, 2, 2, 8, 3, 2, 6, 6, 2,
       6, 0, 2, 8, 7, 2, 0, 8, 2, 1, 5, 2, 6, 5, 6, 5, 7, 5, 2, 8, 5, 2,
       3, 8, 2, 2, 5, 6, 2, 3, 3, 3, 1, 9, 5, 9, 8, 8, 6, 6, 8, 5, 5, 5,
       5, 5, 5, 0, 5, 5, 6, 3, 3, 2, 3, 7, 7, 4, 5, 8, 8, 4, 5, 6, 5, 5,
       2, 6, 2, 2, 2, 8, 8, 0, 5, 5, 6, 8, 2, 5, 7, 5, 6, 8, 2, 5, 5, 2,
       4, 6, 5, 5, 1, 8, 8, 6, 2, 6, 2, 8, 3, 0, 2, 3, 2, 2, 2, 7, 5, 1,
       7, 2, 5, 8, 6, 5, 5, 2, 4, 6, 5, 0, 2, 2, 2, 8, 6, 2, 3, 1, 5, 3,
       9, 6, 8, 5, 5, 0, 0, 5, 2, 2, 2, 2, 5, 6, 8, 2, 5, 2, 2, 6, 4, 4,
       3, 2, 1, 8, 7, 8, 3, 2, 3, 2, 2, 2, 2, 5, 7, 2, 3, 8, 6, 2, 5, 3,
       1, 2, 5, 2, 8, 8, 6, 2, 3, 2, 3, 5, 5, 3, 1, 6, 1, 3, 2, 5, 5, 6,
       7, 5, 1, 6, 1, 2, 6, 2, 8, 1, 2, 2, 6, 2, 2,

### TODO

* **How to standardise and integrate homologs:**
    * [Homology database](https://www.ncbi.nlm.nih.gov/guide/homology/)
    * BLASTP (local; all vs all)
    * Sequence 100% identity + BLASTP
    * Sequence non-100% identity + BLASTP
    * UMAP + NN

**IMPORTANT** We need the standardisation to have a matrix of genes x media instead of genomes x media