# Predict media from taxa

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
import umap

import os
# Directory that holds the input CSV read below and receives the exported
# model table.
# NOTE(review): os.path.join does not expand "~"; this relies on the consumer
# of the joined path (pandas here) expanding it — confirm if the path is ever
# passed to another library.
DATA_DIR = "~/Desktop/code/data/"

In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed shared by the train/test split, KMeans, the random forest and the
# permutation-importance shuffling.
RANDOM_SEED = 666

# --- Target ----------------------------------------------------------------
# Column of the pivoted table used as the initial target in the split.
TARGET_VAR = "media_id"

# --- UMAP parameters -------------------------------------------------------
# UMAP uses its own seed, separate from RANDOM_SEED.
UMAP_SEED = 42
UMAP_METRIC = "euclidean"

In [3]:
# Load the long-format media/EC table (one row per annotation).
raw_df = pd.read_csv(os.path.join(DATA_DIR, "media2ec-explode.csv"), low_memory=False)

# Keep only rows that carry a KEGG EC annotation and use a shorter column name.
ec_df = (
    raw_df
    .dropna(subset=["ec_KEGG"])
    .rename(columns={"ec_KEGG": "ec"})
)

# Drop EC values containing a dash (presumably partially specified classes
# such as "1.1.1.-" — confirm). The match is literal, not a regex, and the
# mask gets its own name so the builtin `filter` is not shadowed.
has_dash = ec_df["ec"].str.contains("-", regex=False)
ec_df = ec_df[~has_dash]

# Count how often each (taxon, medium, EC) triple occurs. The count column is
# named explicitly so the pivot below does not depend on the pandas version
# (pandas < 2.0 names it 0, pandas >= 2.0 names it "count").
ec_counts = (
    ec_df[["taxon_id", "media_id", "ec"]]
    .value_counts()
    .reset_index(name="count")
)

# Wide table: one row per (taxon, medium) pair, one column per EC number,
# with 0.0 where the pair has no annotation for that EC number.
# NOTE(review): some taxon_id values are stringified lists of dicts
# ("[{'NCBI tax id': ...") rather than plain IDs; they are kept as-is here
# and probably need cleaning upstream.
data_df = (
    ec_counts
    .pivot(
        index=["taxon_id", "media_id"],
        columns="ec",
        values="count"
    )
    .fillna(0.0)
    .reset_index()
)
data_df

ec,taxon_id,media_id,1.1.1.1,1.1.1.11,1.1.1.115,1.1.1.118,1.1.1.119,1.1.1.121,1.1.1.14,1.1.1.15,...,2.7.2.14,2.8.1.6,3.2.1.1,3.3.2.10,3.5.1.12,3.5.1.22,3.5.1.39,3.5.1.50,3.5.1.92,6.2.1.11
0,100,J349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1003997,J1052,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100468,J13,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1005944,J346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1005945,J346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,"[{'NCBI tax id': 985, 'Matching level': 'speci...",J155,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2876,"[{'NCBI tax id': 98672, 'Matching level': 'spe...",J28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2877,"[{'NCBI tax id': 990998, 'Matching level': 'st...",514f,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2878,"[{'NCBI tax id': 997349, 'Matching level': 'st...",J14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model

### Strategy for splitting

In [4]:
from sklearn.model_selection import train_test_split

# Features are every EC-count column, i.e. everything except the two ID columns.
features = data_df.drop(columns=["taxon_id", "media_id"])
target = data_df[TARGET_VAR]

# Hold out 30% of the (taxon, medium) rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

### Dimensionality reduction

In [5]:
# The reducer is fitted on the training features only; the held-out rows are
# projected later with reducer.transform.
reducer = umap.UMAP(
    n_components=4,
    metric=UMAP_METRIC,
    n_epochs=5000,
    n_jobs=1,
    random_state=UMAP_SEED,
)

# Embed the training rows and wrap the array in a frame with one named
# column per UMAP component ("Component 1" ... "Component 4").
train_embedding = reducer.fit_transform(X_train)
component_names = [
    f"Component {k}" for k in range(1, train_embedding.shape[1] + 1)
]
reduced_df = pd.DataFrame(train_embedding, columns=component_names)
reduced_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4
0,14.400808,16.690296,-42.231613,3.546525
1,14.366984,0.021583,3.274948,-11.788797
2,14.390467,16.631023,-42.256676,3.534129
3,14.275699,0.048408,3.279351,-11.74447
4,14.257394,0.857795,3.285041,-11.761877


### Clustering

In [6]:
# TODO: add silhouette score
# NOTE: using KMeans since the other classes do not provide a predict method
cluster = KMeans(
    n_clusters=4,
    random_state=RANDOM_SEED
)

# Fit on the UMAP component columns only. On a first run these are the only
# columns present; selecting them explicitly keeps the cell re-runnable after
# "Cluster", "taxon_id" and "media_id" have been added to reduced_df (passing
# the whole frame would then feed labels and IDs into KMeans).
component_cols = [
    col for col in reduced_df.columns if str(col).startswith("Component ")
]
reduced_df["Cluster"] = cluster.fit_predict(reduced_df[component_cols].to_numpy())

# Store the labels as strings so plotly treats them as discrete categories.
reduced_df["Cluster"] = reduced_df["Cluster"].astype(str)



In [7]:
# Add ID columns
# reduced_df was built from a bare array and has a fresh 0..n-1 index, while
# X_train keeps the row labels of data_df. The IDs are therefore looked up by
# X_train's labels and assigned positionally (as a plain array).
id_columns = ["taxon_id", "media_id"]
train_ids = data_df.loc[X_train.index, id_columns]
reduced_df[id_columns] = train_ids.to_numpy()

reduced_df.head()

Unnamed: 0,Component 1,Component 2,Component 3,Component 4,Cluster,taxon_id,media_id
0,14.400808,16.690296,-42.231613,3.546525,2,1849278,J612
1,14.366984,0.021583,3.274948,-11.788797,0,58172,465h
2,14.390467,16.631023,-42.256676,3.534129,2,"[{'NCBI tax id': 1076594, 'Matching level': 's...",J710
3,14.275699,0.048408,3.279351,-11.74447,0,158877,J12
4,14.257394,0.857795,3.285041,-11.761877,0,576611,J346


In [8]:
# Write the training embedding, cluster labels and IDs to disk (no index column).
model_table_path = os.path.join(DATA_DIR, "media-model.csv")
reduced_df.to_csv(model_table_path, index=False)

#### Visualise the results

In [9]:
# 3-D scatter of the first three UMAP components; the fourth component is
# encoded as a continuous colour. Hovering shows the taxon and medium IDs.
hover_columns = ["taxon_id", "media_id"]

fig = px.scatter_3d(
    reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Component 4",
    hover_data=hover_columns,
    template="plotly_white",
)
fig.show()

In [10]:
# Same 3-D view, now coloured by the KMeans cluster label. The label column
# holds strings, so plotly uses the discrete pastel palette.
hover_columns = ["taxon_id", "media_id"]
cluster_palette = px.colors.qualitative.Pastel

fig = px.scatter_3d(
    reduced_df,
    x="Component 1",
    y="Component 2",
    z="Component 3",
    color="Cluster",
    color_discrete_sequence=cluster_palette,
    hover_data=hover_columns,
    template="plotly_white",
)
fig.show()

#### Replace the target labels with cluster assignments

In [11]:
# NOTE: from here on y_train / y_test hold KMeans cluster labels; the
# media_id values produced by the train/test split are overwritten.

# Training labels: reuse the assignments KMeans made when it was fitted on the
# training embedding. Re-embedding X_train with reducer.transform is costly
# and is not guaranteed to reproduce the fitted embedding, so the labels
# could otherwise disagree with the "Cluster" column stored in reduced_df.
y_train = cluster.labels_.copy()

# Held-out labels: project the unseen rows into the fitted UMAP space and
# assign each one to its nearest cluster centre.
y_test = cluster.predict(reducer.transform(X_test))

y_train

array([2, 0, 2, ..., 0, 0, 0])

### Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest on the original EC-count features; n_jobs=-1 asks for all
# available cores.
rf_params = dict(
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

### Performance

#### Feature importances

In [13]:
from sklearn.inspection import permutation_importance

# Number of shuffles per feature. With a single repeat the reported standard
# deviation is always 0, which makes the error bars below meaningless; several
# repeats give a usable spread at a proportional increase in runtime.
N_PERMUTATION_REPEATS = 5

# Importance of each feature = drop in the model's score on the held-out set
# when that feature's column is shuffled.
result_importances = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=N_PERMUTATION_REPEATS,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

# One row per feature, in the column order of the matrix that was permuted.
feat_importances = pd.DataFrame({
    "Feature name": X_test.columns,
    "Importance (mean accuracy decrease)": result_importances.importances_mean,
    "Importance (std)": result_importances.importances_std
})

# Sort them by mean
feat_importances = feat_importances\
    .sort_values("Importance (mean accuracy decrease)")

fig = px.bar(
    data_frame=feat_importances,
    x="Feature name",
    y="Importance (mean accuracy decrease)",
    error_y="Importance (std)",
    color="Importance (mean accuracy decrease)",
    width=950,
    height=800,
    template="plotly_white"
)
# Rendered as a static PNG rather than an interactive figure.
fig.show("png")

#### Metrics

In [None]:
# Predicted cluster label for every held-out row.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 as plain text; printed so the table keeps
# its line breaks.
report = classification_report(y_test, y_pred, zero_division="warn")
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       679
           1       0.97      1.00      0.98        29
           2       1.00      1.00      1.00        61
           3       0.99      1.00      0.99        95

    accuracy                           1.00       864
   macro avg       0.99      1.00      0.99       864
weighted avg       1.00      1.00      1.00       864



In [None]:
from sklearn import metrics

# TODO: get PR-REC AUC score
# TODO: plot both ROC and PR-REC curves

# Class-membership probabilities for the held-out rows (one column per class).
test_proba = model.predict_proba(X_test)

# Multi-class ROC AUC: one-vs-one pairs, weighted average.
roc_score = metrics.roc_auc_score(
    y_test,
    test_proba,
    average="weighted",
    multi_class="ovo",
)

print("ROC AUC score:", roc_score)

ROC AUC score: 0.9999930641372162
