# EnZymClass metrics

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef
from scipy.stats import ttest_ind
import itertools
import sys
import numpy as np

In [5]:
def get_precision(y, yhat, label=3):
    """Return the precision for class ``label``, rounded to 2 decimals.

    ``y`` holds the true labels and ``yhat`` the predicted labels. The score
    is micro-averaged over the single-element label list ``[label]``.
    """
    score = precision_score(y, yhat, labels=[label], average='micro')
    return round(score, 2)


def get_recall(y, yhat, label=3):
    """Return the recall for class ``label``, rounded to 2 decimals.

    ``y`` holds the true labels and ``yhat`` the predicted labels. The score
    is micro-averaged over the single-element label list ``[label]``.
    """
    score = recall_score(y, yhat, labels=[label], average='micro')
    return round(score, 2)


def get_accuracy(y, yhat):
    """Return the accuracy of ``yhat`` against ``y``, rounded to 2 decimals."""
    score = accuracy_score(y, yhat)
    return round(score, 2)


def get_mcc(y, yhat):
    """Return the Matthews correlation coefficient, rounded to 2 decimals."""
    score = matthews_corrcoef(y, yhat)
    return round(score, 2)


def get_metrics(val_iter):
    """Return ``(precision, recall, accuracy, mcc)`` for one validation run.

    ``val_iter`` is unpacked positionally into each metric helper, so it is
    expected to be a ``(y, yhat)`` pair.
    """
    # Order matters: callers index the result as prec, recall, acc, MCC.
    metric_fns = (get_precision, get_recall, get_accuracy, get_mcc)
    return tuple(metric_fn(*val_iter) for metric_fn in metric_fns)

In [8]:
en_pred_file = "../data/results/ensemble_preds.csv"

# Read the predictions file once and close it deterministically. Previously
# the file was opened twice and neither handle was ever closed.
with open(en_pred_file) as pred_fh:
    pred_lines = pred_fh.readlines()

# Within the first 20000 lines, even-indexed lines (0, 2, 4, ...) are taken as
# the true labels and odd-indexed lines (1, 3, 5, ...) as the predictions.
valid_true_iter = itertools.islice(pred_lines, 0, 20000, 2)
valid_pred_iter = itertools.islice(pred_lines, 1, 20000, 2)

# Each line is a comma-separated list of integer class labels.
valid_true = [list(map(int, v.strip("\n").split(","))) for v in valid_true_iter]
valid_pred = [list(map(int, v.strip("\n").split(","))) for v in valid_pred_iter]

In [10]:
# Build one (precision, recall, accuracy, MCC) tuple per (true, predicted)
# pair of validation label lists.
mets = [get_metrics(run_pair) for run_pair in zip(valid_true, valid_pred)]

In [11]:
# Split the per-run metric tuples into one list per metric, in the tuple's
# positional order: precision, recall, accuracy, MCC.
prec, rec, acc, mcc = ([run[col] for run in mets] for col in range(4))

# Similarity model metrics

In [13]:
similarity_data = "../similarity/results/model_sims.csv"

prec_sim, rec_sim, acc_sim, mcc_sim = [], [], [], []

# Column order in the CSV: the first four fields of every row are appended to
# precision, recall, accuracy and MCC respectively.
sim_columns = (prec_sim, rec_sim, acc_sim, mcc_sim)

with open(similarity_data, "r") as f:
    for row in f:
        # Every comma-separated field on the row is parsed as a float.
        fields = [float(tok) for tok in row.strip().split(",")]
        for idx, column in enumerate(sim_columns):
            # Rounded to 2 decimals, matching the get_* metric helpers.
            column.append(round(fields[idx], 2))

# T-test between EnZymClass and Similarity model metrics

In [27]:
# Two-sample t-test (equal variances assumed) on the first 1000 values of
# each metric: similarity model first, EnZymClass second.
metric_pairs = (
    (acc_sim, acc),
    (prec_sim, prec),
    (rec_sim, rec),
    (mcc_sim, mcc),
)
tobj_acc, tobj_prec, tobj_rec, tobj_mcc = (
    ttest_ind(sim_vals[:1000], enz_vals[:1000], equal_var=True)
    for sim_vals, enz_vals in metric_pairs
)

print("***Ttest Results***")
# Printed in the order accuracy, precision, recall, MCC.
for tobj in (tobj_acc, tobj_prec, tobj_rec, tobj_mcc):
    print(round(tobj.statistic, 3), tobj.pvalue)

***Ttest Results***
-33.284 1.2160011429315447e-193
-18.955 8.240325963485261e-74
-24.595 6.615318198431384e-117
-35.606 1.880889960590534e-215
