In [None]:
import pandas as pd
import seaborn as sn
import numpy as np
import plotly.graph_objects as go
import tensorflow as tf
import joblib
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report, accuracy_score
%matplotlib inline
sn.set_context("paper")

In [None]:
def model_eval(model,data,dim):
    """Predict with ``model`` and return hard one-hot predictions plus top scores.

    Parameters
    ----------
    model : object exposing ``predict(data)``
        ``predict`` must return one row of per-class scores per sample
        (a 2-D array-like).
    data : input passed unchanged to ``model.predict``.
    dim : int
        Depth of the one-hot encoding (number of classes).

    Returns
    -------
    yhat_oh : float32 tensor with one one-hot row per sample, set at the
        arg-max class of that sample.
    yhat_prob : list with the winning score of each sample, rounded to
        three decimals.
    """
    scores = np.asarray(model.predict(data))
    # One vectorised arg-max / max over the class axis instead of a Python
    # loop (and one tf.one_hot call) per sample.
    winners = np.argmax(scores, axis=1)
    yhat_prob = list(np.round(np.max(scores, axis=1), 3))
    # tf.one_hot produces float32 by default, matching the previous dtype.
    yhat_oh = tf.one_hot(winners, depth=dim)
    return yhat_oh, yhat_prob

def one_value_metrics(ENCODER,TRAIN,TEST,MODELDIR,ARCH, DB):
    """Build one summary row (sizes + test accuracy) for a saved model.

    Parameters
    ----------
    ENCODER : fitted encoder exposing ``transform`` and ``categories_``.
        ``transform(...)`` must return an object with ``.toarray()``
        (presumably a sparse one-hot matrix -- confirm against how the
        encoder was fitted).
    TRAIN, TEST : paths to CSV files with ``Class`` and ``Sequence`` columns.
    MODELDIR : directory holding the saved Keras models.
    ARCH : name of the model inside ``MODELDIR`` to load.
    DB : label copied verbatim into the returned row.

    Returns
    -------
    list
        ``[ARCH, DB, db_size, negative_size, l_train, l_test, acc]`` where
        ``negative_size`` counts rows whose ``Class`` is ``"NonR"`` over both
        splits and ``acc`` is ``accuracy_score`` on the test split rounded to
        three decimals.
    """
    needed_cols = ["Class", "Sequence"]
    with tf.device("cpu"):
        # Only the two columns that are actually used are parsed; the train
        # split is needed for its counts only and is released right away.
        train = pd.read_csv(TRAIN, usecols=needed_cols)
        n_sample_train = int((train["Class"] == "NonR").sum())
        l_train = len(train)
        del train

        test = pd.read_csv(TEST, usecols=needed_cols)
        n_sample_test = int((test["Class"] == "NonR").sum())
        l_test = len(test)

        negative_size = n_sample_train + n_sample_test
        db_size = l_train + l_test

        # Each sequence becomes a space-separated string of its characters;
        # the frame itself is left untouched.
        spaced_sequences = test["Sequence"].map(" ".join)
        X_test = tf.convert_to_tensor(spaced_sequences.to_list())
        y_test = ENCODER.transform(test["Class"].to_numpy().reshape(-1,1)).toarray()

    model = tf.keras.models.load_model(f"{MODELDIR}/{ARCH}")
    yhat,_ = model_eval(model, X_test, len(ENCODER.categories_[0]))
    acc = accuracy_score(y_test, yhat)
    return [ARCH, DB, db_size, negative_size, l_train, l_test, np.round(acc,3)]

def get_clf_report(ENCODER,MODELDIR,TEST,ARCH,SUFFIX):
    """Per-class precision / recall / F1 table for several saved models.

    Parameters
    ----------
    ENCODER : fitted encoder exposing ``transform`` and ``categories_``;
        ``categories_[0]`` supplies the class names used as report labels.
    MODELDIR : directory holding the saved Keras models.
    TEST : path to a CSV file with ``Class`` and ``Sequence`` columns.
    ARCH : sequence of model names inside ``MODELDIR``. Every entry is
        evaluated (previously exactly the first three were hard-coded).
        Must not be empty, otherwise ``pd.concat`` raises ``ValueError``.
    SUFFIX : text appended to every row label as ``" (<SUFFIX>)"``.

    Returns
    -------
    pandas.DataFrame
        One row per class (index sorted), and for each architecture three
        columns named ``"<arch>-precision"``, ``"<arch>-recall"`` and
        ``"<arch>-f1-score"``, rounded to two decimals.
    """
    test = pd.read_csv(TEST, usecols=["Class", "Sequence"])
    # Each sequence becomes a space-separated string of its characters.
    X_test = tf.convert_to_tensor(test["Sequence"].map(" ".join).to_list())
    y_test = ENCODER.transform(test["Class"].to_numpy().reshape(-1,1)).toarray()

    class_names = ENCODER.categories_[0]
    n_classes = len(class_names)

    per_arch = []
    for arch in ARCH:
        model = tf.keras.models.load_model(f"{MODELDIR}/{arch}")
        yhat, _ = model_eval(model, X_test, n_classes)
        report = classification_report(
            y_test, yhat, target_names=class_names, output_dict=True
        )
        # Rows 0-2 are precision / recall / f1-score (support is dropped).
        # The last four columns are dropped as well -- presumably the
        # aggregate "avg" entries of the report; TODO confirm.
        per_arch.append(
            pd.DataFrame(report).iloc[:3, :-4].round(2).T.add_prefix(f"{arch}-")
        )

    df = pd.concat(per_arch, axis = 1)
    df = df.rename(index = {"macrolide-lincosamide-streptogramin":"MLS"})
    df = df.add_suffix(f" ({SUFFIX})", axis = 0)
    return df.sort_index()


ACCURACY

NCRD

In [None]:

# NCRD base variant (DB label "NCRD"): encoder, train/test splits, model dir.
ENCODER = joblib.load("../data/NCRD/ncrd95-ma_clf_enc.joblib")
TRAIN = "../data/NCRD/TrainNcrd95-ma.csv"
TEST = "../data/NCRD/TestNcrd95-ma.csv"
MODELDIR = "../models/Fase5-NCRD"

# One summary row per architecture; Keras global state is reset before and
# after the three evaluations.
tf.keras.backend.clear_session()
CNN, CNNS, GRU = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "NCRD")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()

In [None]:
# Per-class precision / recall / F1 of the three architectures on the current
# TEST split. The un-reindexed report is also bound to `clf_positives`: the
# combined-report cell further down concatenates that name, and nothing else
# in the visible notebook assigns it.
clf_positives = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Only ARPs")

# Fix the row order used for the radar traces. Labels missing from the report
# would show up as all-NaN rows.
clf_report = clf_positives.reindex([
    "MLS (Only ARPs)",
    "aminoglycoside (Only ARPs)",
    "beta_lactam (Only ARPs)",
    "chloramphenicol (Only ARPs)",
    "glycopeptide (Only ARPs)",
    "macrolide (Only ARPs)",
    "multidrug (Only ARPs)",
    "phosphonic acid (Only ARPs)",
    "rifamcyn (Only ARPs)",
    "tetracycline (Only ARPs)"
    ])

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure()
for row_label in clf_report.index:
    fig.add_trace(go.Scatterpolar(
        r = clf_report.loc[row_label].to_numpy(),
        theta = categories,
        name = row_label))

fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
# NOTE(review): this cell writes to "../../figs" while most other cells use
# "../figs" -- confirm which directory is intended.
fig.write_image("../../figs/ncrd-report-onlyAPRs.svg")

In [None]:
# NCRD "align" variant (DB label "NCRD-uniprot-align"): encoder, splits, models.
ENCODER = joblib.load("../data/NCRD/ncrd95-uniprot-ma_clf_enc.joblib")
TRAIN = "../data/NCRD/TrainNcrd95-ma-align.csv"
TEST = "../data/NCRD/TestNcrd95-ma-align.csv"
MODELDIR = "../models/Fase5-NCRD-align"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_align, CNNS_align, GRU_align = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "NCRD-uniprot-align")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()



In [None]:
# Per-class precision / recall / F1 of the three architectures on the current
# TEST split. The un-reindexed report is also bound to `clf_align`: the
# combined-report cell further down concatenates that name, and nothing else
# in the visible notebook assigns it.
clf_align = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Align")

# Fix the row order used for the radar traces. Labels missing from the report
# would show up as all-NaN rows.
clf_report = clf_align.reindex([
    "MLS (Align)",
    "aminoglycoside (Align)",
    "beta_lactam (Align)",
    "chloramphenicol (Align)",
    "glycopeptide (Align)",
    "macrolide (Align)",
    "multidrug (Align)",
    "phosphonic acid (Align)",
    "rifamcyn (Align)",
    "tetracycline (Align)",
    "NonR (Align)"
    ])

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure()
for row_label in clf_report.index:
    fig.add_trace(go.Scatterpolar(
        r = clf_report.loc[row_label].to_numpy(),
        theta = categories,
        name = row_label))

fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
fig.write_image("../figs/ncrd-report-Align.svg")

In [None]:
# NCRD "unalign" variant (DB label "NCRD-uniprot-unalign").
ENCODER = joblib.load("../data/NCRD/ncrd95-uniprot-ma_clf_enc.joblib")
TRAIN = "../data/NCRD/TrainNcrd95-ma-unalign.csv"
TEST = "../data/NCRD/TestNcrd95-ma-unalign.csv"
MODELDIR = "../models/Fase5-NCRD-neg-unalign"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_unalign, CNNS_unalign, GRU_unalign = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "NCRD-uniprot-unalign")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()


In [None]:
# Per-class precision / recall / F1 of the three architectures on the current
# TEST split. The un-reindexed report is also bound to `clf_unalign`: the
# combined-report cell further down concatenates that name, and nothing else
# in the visible notebook assigns it.
clf_unalign = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalign")

# Fix the row order used for the radar traces. Labels missing from the report
# would show up as all-NaN rows.
clf_report = clf_unalign.reindex([
    "MLS (Unalign)",
    "aminoglycoside (Unalign)",
    "beta_lactam (Unalign)",
    "chloramphenicol (Unalign)",
    "glycopeptide (Unalign)",
    "macrolide (Unalign)",
    "multidrug (Unalign)",
    "phosphonic acid (Unalign)",
    "rifamcyn (Unalign)",
    "tetracycline (Unalign)",
    "NonR (Unalign)"
    ])

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure()
for row_label in clf_report.index:
    fig.add_trace(go.Scatterpolar(
        r = clf_report.loc[row_label].to_numpy(),
        theta = categories,
        name = row_label))

fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
# NOTE(review): this cell writes to "../../figs" while most other cells use
# "../figs" -- confirm which directory is intended.
fig.write_image("../../figs/ncrd-report-Unalign.svg")

In [None]:
# NCRD "unalign 50%" variant (DB label "NCRD-uniprot-unalign (50%)").
ENCODER = joblib.load("../data/NCRD/ncrd95-uniprot-ma_clf_enc.joblib")
TRAIN = "../data/NCRD/TrainNcrd95-ma-unalign50.csv"
TEST = "../data/NCRD/TestNcrd95-ma-unalign50.csv"
MODELDIR = "../models/Fase5-NCRD-neg-unalign50"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_unalign50, CNNS_unalign50, GRU_unalign50 = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "NCRD-uniprot-unalign (50%)")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()


In [None]:
# Per-class precision / recall / F1 of the three architectures on the current
# TEST split. The un-reindexed report is also bound to `clf_unalign50`: the
# combined-report cell further down concatenates that name, and nothing else
# in the visible notebook assigns it.
clf_unalign50 = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalig 50%")

# Fix the row order used for the radar traces. Labels missing from the report
# would show up as all-NaN rows.
clf_report = clf_unalign50.reindex([
    "MLS (Unalig 50%)",
    "aminoglycoside (Unalig 50%)",
    "beta_lactam (Unalig 50%)",
    "chloramphenicol (Unalig 50%)",
    "glycopeptide (Unalig 50%)",
    "macrolide (Unalig 50%)",
    "multidrug (Unalig 50%)",
    "phosphonic acid (Unalig 50%)",
    "rifamcyn (Unalig 50%)",
    "tetracycline (Unalig 50%)",
    "NonR (Unalig 50%)"
    ])

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure()
for row_label in clf_report.index:
    fig.add_trace(go.Scatterpolar(
        r = clf_report.loc[row_label].to_numpy(),
        theta = categories,
        name = row_label))

fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
fig.write_image("../figs/ncrd-report-Unalign50.svg")

In [None]:
# NCRD "unalign 100%" variant (DB label "NCRD-uniprot-unalign (100%)").
ENCODER = joblib.load("../data/NCRD/ncrd95-uniprot-ma_clf_enc.joblib")
TRAIN = "../data/NCRD/TrainNcrd95-ma-unalign100.csv"
TEST = "../data/NCRD/TestNcrd95-ma-unalign100.csv"
MODELDIR = "../models/Fase5-NCRD-neg-unalign100"

# One summary row per architecture.
tf.keras.backend.clear_session()
CNN_unalign100, CNNS_unalign100, GRU_unalign100 = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "NCRD-uniprot-unalign (100%)")
    for arch in ("cnn", "cnn-same", "gru")
]
# Reset Keras global state afterwards as well, as every other metrics cell
# in this notebook does; this cell was the only NCRD one missing it.
tf.keras.backend.clear_session()



In [None]:
# Per-class precision / recall / F1 of the three architectures on the current
# TEST split. The un-reindexed report is also bound to `clf_unalign100`: the
# combined-report cell further down concatenates that name, and nothing else
# in the visible notebook assigns it.
clf_unalign100 = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalig 100%")

# Fix the row order used for the radar traces. Labels missing from the report
# would show up as all-NaN rows.
clf_report = clf_unalign100.reindex([
    "MLS (Unalig 100%)",
    "aminoglycoside (Unalig 100%)",
    "beta_lactam (Unalig 100%)",
    "chloramphenicol (Unalig 100%)",
    "glycopeptide (Unalig 100%)",
    "macrolide (Unalig 100%)",
    "multidrug (Unalig 100%)",
    "phosphonic acid (Unalig 100%)",
    "rifamcyn (Unalig 100%)",
    "tetracycline (Unalig 100%)",
    "NonR (Unalig 100%)"
    ])

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure()
for row_label in clf_report.index:
    fig.add_trace(go.Scatterpolar(
        r = clf_report.loc[row_label].to_numpy(),
        theta = categories,
        name = row_label))

fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
fig.write_image("../figs/ncrd-report-Unalign100.svg")

In [None]:
# Regroup the summary rows by architecture. Each tuple below is one dataset
# variant in (cnn, cnn-same, gru) order; zip() transposes them so every
# resulting list holds the five variants of a single architecture.
CNN_list, CNNS_list, GRU_list = (
    list(rows_for_arch)
    for rows_for_arch in zip(
        (CNN, CNNS, GRU),
        (CNN_align, CNNS_align, GRU_align),
        (CNN_unalign, CNNS_unalign, GRU_unalign),
        (CNN_unalign50, CNNS_unalign50, GRU_unalign50),
        (CNN_unalign100, CNNS_unalign100, GRU_unalign100),
    )
)

In [None]:
# Stack the per-architecture rows into one table (each sub-frame keeps its
# own 0..n-1 index) and export it without the index column.
NRDC_Metrics_df = pd.concat(
    [
        pd.DataFrame(
            arch_rows,
            columns = ["Model","DB","DB size","Negative examples","Train size", "Test size","Acc"],
        )
        for arch_rows in (CNN_list, CNNS_list, GRU_list)
    ],
    axis = 0,
)
NRDC_Metrics_df.to_csv("NCRD-Metrics.csv", index = False)

In [None]:
# Display the NCRD summary table built in the previous cell.
NRDC_Metrics_df

In [None]:
# Stack the per-variant classification reports row-wise. All five `clf_*`
# names must already be bound by earlier cells.
clf_concat = pd.concat(
    [
        clf_positives,
        clf_align,
        clf_unalign,
        clf_unalign50,
        clf_unalign100,
    ],
    axis = "index",
)

In [None]:
# Clustered heatmap of the combined report: rows are clustered, columns keep
# their order (col_cluster=False).
# The figure is saved through the ClusterGrid returned by seaborn rather than
# through `plt`, because pyplot is not imported in this notebook's import
# cell. ClusterGrid.savefig defaults to a tight bounding box.
cluster_grid = sn.clustermap(clf_concat.sort_index(), col_cluster = False, figsize = (6,13), cmap = "coolwarm")
cluster_grid.savefig("../figs/clf-report-all.svg")

HMD

positives

In [None]:
# HMD base variant (DB label "HMD"): encoder, train/test splits, model dir.
ENCODER = joblib.load("../data/HMD/hmd-ma_clf_enc.joblib")
TRAIN = "../data/HMD/TrainHMD-ma.csv"
TEST = "../data/HMD/TestHMD-ma.csv"
MODELDIR = "../models/Fase4-HMD"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN, CNNS, GRU = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "HMD")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()

# Per-class precision / recall / F1 of the three architectures.
clf_report = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Only ARPs")
# Also bind the report to `clf_positives`: the combined-report cell further
# down concatenates that name, and nothing else in this HMD section assigns it.
clf_positives = clf_report


In [None]:
# Radar chart of the current report: one trace per class, one spoke per
# "<arch>-<metric>" column. The legend is hidden for this figure.
categories = clf_report.columns.to_list()
fig = go.Figure(
    data = [
        go.Scatterpolar(
            r = clf_report.loc[row_label].to_numpy(),
            theta = categories,
            name = row_label,
        )
        for row_label in clf_report.index
    ]
)
fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = False,
)

fig.show()
fig.write_image("../figs/report-onlyAPRs.svg")

negatives align

In [None]:
# HMD "align" variant (DB label "HMD-uniprot-align"): encoder, splits, models.
ENCODER = joblib.load("../data/HMD/uniprot-hmd-ma_clf_enc.joblib")
TRAIN = "../data/HMD/TrainHMD-ma-align.csv"
TEST = "../data/HMD/TestHMD-ma-align.csv"
MODELDIR = "../models/Fase4-HMD-align"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_align, CNNS_align, GRU_align = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "HMD-uniprot-align")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()

# Per-class precision / recall / F1 of the three architectures.
clf_report = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Align.")
# Also bind the report to `clf_align`: the combined-report cell further down
# concatenates that name, and nothing else in this HMD section assigns it.
clf_align = clf_report

In [None]:
# Put the report rows in a fixed display order. Re-running is harmless: the
# same reindex is applied again at the top of the following cell.
clf_report = clf_report.reindex(
    index = [
        "MLS (Align.)",
        "aminoglycoside (Align.)",
        "bacitracin (Align.)",
        "beta_lactam (Align.)",
        "multidrug (Align.)",
        "polymyxin (Align.)",
        "NonR (Align.)",
    ]
)


In [None]:
# Put the report rows in a fixed display order before plotting.
clf_report = clf_report.reindex(
    index = [
        "MLS (Align.)",
        "aminoglycoside (Align.)",
        "bacitracin (Align.)",
        "beta_lactam (Align.)",
        "multidrug (Align.)",
        "polymyxin (Align.)",
        "NonR (Align.)",
    ]
)

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure(
    data = [
        go.Scatterpolar(
            r = clf_report.loc[row_label].to_numpy(),
            theta = categories,
            name = row_label,
        )
        for row_label in clf_report.index
    ]
)
fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
fig.write_image("../figs/report-align.svg")

negatives unalign

In [None]:
# HMD "unalign" variant (DB label "HMD-uniprot-unalign").
ENCODER = joblib.load("../data/HMD/uniprot-hmd-ma_clf_enc.joblib")
TRAIN = "../data/HMD/TrainHMD-ma-unalign.csv"
TEST = "../data/HMD/TestHMD-ma-unalign.csv"
MODELDIR = "../models/Fase4-HMD-unalign"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_unalign, CNNS_unalign, GRU_unalign = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "HMD-uniprot-unalign")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()

# Per-class precision / recall / F1 of the three architectures.
clf_report = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalign.")
# Also bind the report to `clf_unalign`: the combined-report cell further down
# concatenates that name, and nothing else in this HMD section assigns it.
clf_unalign = clf_report

In [None]:
# Display the classification report computed in the previous cell.
clf_report

In [None]:
# Put the report rows in a fixed display order before plotting.
clf_report = clf_report.reindex(
    index = [
        "MLS (Unalign.)",
        "aminoglycoside (Unalign.)",
        "bacitracin (Unalign.)",
        "beta_lactam (Unalign.)",
        "multidrug (Unalign.)",
        "polymyxin (Unalign.)",
        "NonR (Unalign.)",
    ]
)

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure(
    data = [
        go.Scatterpolar(
            r = clf_report.loc[row_label].to_numpy(),
            theta = categories,
            name = row_label,
        )
        for row_label in clf_report.index
    ]
)
fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)

fig.show()
# NOTE(review): this cell writes to "../../figs" while the other HMD cells
# use "../figs" -- confirm which directory is intended.
fig.write_image("../../figs/report-unalign.svg")

negatives unalign 50

In [None]:
# HMD "unalign 50%" variant (DB label "HMD-uniprot-unalign (50%)").
ENCODER = joblib.load("../data/HMD/uniprot-hmd-ma_clf_enc.joblib")
TRAIN = "../data/HMD/TrainHMD-ma-unalign50.csv"
TEST = "../data/HMD/TestHMD-ma-unalign50.csv"
MODELDIR = "../models/Fase4-HMD-unalign50"

# One summary row per architecture, with Keras global state reset around them.
tf.keras.backend.clear_session()
CNN_unalign50, CNNS_unalign50, GRU_unalign50 = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "HMD-uniprot-unalign (50%)")
    for arch in ("cnn", "cnn-same", "gru")
]
tf.keras.backend.clear_session()

# Per-class precision / recall / F1 of the three architectures.
clf_report = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalig 50%")
# Also bind the report to `clf_unalign50`: later cells display and concatenate
# that name, and nothing else in this HMD section assigns it.
clf_unalign50 = clf_report

In [None]:
# Put the report rows in a fixed display order before plotting.
clf_report = clf_report.reindex(
    index = [
        "MLS (Unalig 50%)",
        "aminoglycoside (Unalig 50%)",
        "bacitracin (Unalig 50%)",
        "beta_lactam (Unalig 50%)",
        "multidrug (Unalig 50%)",
        "polymyxin (Unalig 50%)",
        "NonR (Unalig 50%)",
    ]
)

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure(
    data = [
        go.Scatterpolar(
            r = clf_report.loc[row_label].to_numpy(),
            theta = categories,
            name = row_label,
        )
        for row_label in clf_report.index
    ]
)
fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)
fig.show()
fig.write_image("../figs/report-unalign50.svg")

In [None]:
# Display the "unalign 50%" report; `clf_unalign50` must already be bound by
# an earlier cell for this to run.
clf_unalign50

negatives unalign 100

In [None]:
# HMD "unalign 100%" variant (DB label "HMD-uniprot-unalign (100%)").
ENCODER = joblib.load("../data/HMD/uniprot-hmd-ma_clf_enc.joblib")
TRAIN = "../data/HMD/TrainHMD-ma-unalign100.csv"
TEST = "../data/HMD/TestHMD-ma-unalign100.csv"
MODELDIR = "../models/Fase4-HMD-unalign100"

# One summary row per architecture.
tf.keras.backend.clear_session()
CNN_unalign100, CNNS_unalign100, GRU_unalign100 = [
    one_value_metrics(ENCODER, TRAIN, TEST, MODELDIR, arch, "HMD-uniprot-unalign (100%)")
    for arch in ("cnn", "cnn-same", "gru")
]
# Reset Keras global state before loading the models again for the report,
# as the other HMD metrics cells do; this cell was missing the call.
tf.keras.backend.clear_session()

# Per-class precision / recall / F1 of the three architectures.
clf_report = get_clf_report(ENCODER,MODELDIR,TEST,["cnn","cnn-same","gru"],"Unalign 100%")
# Also bind the report to `clf_unalign100`: the combined-report cell further
# down concatenates that name, and nothing else in this HMD section assigns it.
clf_unalign100 = clf_report

In [None]:
# Put the report rows in a fixed display order before plotting.
clf_report = clf_report.reindex(
    index = [
        "MLS (Unalign 100%)",
        "aminoglycoside (Unalign 100%)",
        "bacitracin (Unalign 100%)",
        "beta_lactam (Unalign 100%)",
        "multidrug (Unalign 100%)",
        "polymyxin (Unalign 100%)",
        "NonR (Unalign 100%)",
    ]
)

# Radar chart: one trace per class, one spoke per "<arch>-<metric>" column.
categories = clf_report.columns.to_list()
fig = go.Figure(
    data = [
        go.Scatterpolar(
            r = clf_report.loc[row_label].to_numpy(),
            theta = categories,
            name = row_label,
        )
        for row_label in clf_report.index
    ]
)
fig.update_layout(
    polar = {"radialaxis": {"visible": True, "color": "black"}},
    showlegend = True,
)
fig.show()
fig.write_image("../figs/report-unalign100.svg")

In [None]:
# Regroup the summary rows by architecture. Each tuple below is one dataset
# variant in (cnn, cnn-same, gru) order; zip() transposes them so every
# resulting list holds the five variants of a single architecture.
CNN_list, CNNS_list, GRU_list = (
    list(rows_for_arch)
    for rows_for_arch in zip(
        (CNN, CNNS, GRU),
        (CNN_align, CNNS_align, GRU_align),
        (CNN_unalign, CNNS_unalign, GRU_unalign),
        (CNN_unalign50, CNNS_unalign50, GRU_unalign50),
        (CNN_unalign100, CNNS_unalign100, GRU_unalign100),
    )
)

In [None]:
# Stack the per-architecture rows into one table; each sub-frame keeps its
# own 0..n-1 index.
HMD_Metrics_df = pd.concat(
    [
        pd.DataFrame(
            arch_rows,
            columns = ["Model","DB","DB size","Negative examples","Train size", "Test size","Acc"],
        )
        for arch_rows in (CNN_list, CNNS_list, GRU_list)
    ],
    axis = 0,
)
# The CSV export is currently disabled:
# HMD_Metrics_df.to_csv("HMD-metrics.csv",index = False)

In [None]:
# Stack the per-variant classification reports row-wise. All five `clf_*`
# names must already be bound by earlier cells.
clf_concat = pd.concat(
    [
        clf_positives,
        clf_align,
        clf_unalign,
        clf_unalign50,
        clf_unalign100,
    ],
    axis = "index",
)

In [None]:
# Clustered heatmap of the combined report: rows are clustered, columns keep
# their order (col_cluster=False).
# The figure is saved through the ClusterGrid returned by seaborn rather than
# through `plt`, because pyplot is not imported in this notebook's import
# cell. ClusterGrid.savefig defaults to a tight bounding box.
cluster_grid = sn.clustermap(clf_concat.sort_index(), col_cluster = False, figsize = (6,13), cmap = "coolwarm")
# NOTE(review): the directory here is "../fig", whereas the other cells write
# to "../figs"; the NCRD section also saves a file named
# "clf-report-all.svg". Confirm the intended directory and file name.
cluster_grid.savefig("../fig/clf-report-all.svg")