In [1]:
# imports
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [20]:
# Sanity check: load the CRC_meta prediction table (tab-separated) and display
# the mean 'score' over the rows whose 'model_contig_len' equals 150.
res = pd.read_csv('results/CRC_meta/CRC_meta.fa_gt1bp_dvfpred.txt', sep="\t")
res.loc[res['model_contig_len'].eq(150), 'score'].mean()

0.4346967635788023

In [None]:
# parameters

# Root directory; prediction files are read from <results_dir>/<genome>/.
results_dir = 'results'

# Genome identifiers. The part before the first underscore ('h' or 'v') is
# what the analysis cells below use to decide the class of each genome.
genomes = [
    'h_human-3',
    'v_sars-cov-2',
]

# Contig lengths; each one selects a per-genome prediction file
# named '<genome>_<length>.fasta_gt1bp_dvfpred.txt'.
contig_lengths = [
    25, 50, 100, 150,
    300, 500, 1000, 3000,
]

# Values matched against the 'model_contig_len' column of each prediction file.
model_types = [
    150, 300, 500, 1000,
]

In [None]:
# compute average scores for each genome
#
# ave_scores maps each genome name to an array of shape
# (len(contig_lengths), len(model_types)). Entry [i, j] is the mean 'score' of
# the rows with model_contig_len == model_types[j] in the prediction file for
# contig_lengths[i]. For 'h' genomes the value stored is 1 - mean, for 'v'
# genomes it is the mean itself.
#
# Raises ValueError if a genome name does not start with 'h_' or 'v_'.
ave_scores = {}
for g in genomes:
    # The prefix before the first underscore selects how the mean is reported;
    # validate it once per genome instead of once per (length, model) pair.
    prefix = g.split('_')[0]
    if prefix not in ('h', 'v'):
        raise ValueError(f'Unknown identifier: {prefix} (should be h or v)')

    ave_scores[g] = np.zeros((len(contig_lengths), len(model_types)))
    for i, l in enumerate(contig_lengths):
        # Prediction files are tab-separated (same format as the file read in
        # the test cell above), so the separator must be given explicitly.
        res = pd.read_csv(
            os.path.join(results_dir, g, f'{g}_{l}.fasta_gt1bp_dvfpred.txt'),
            sep="\t",
        )
        for j, m in enumerate(model_types):
            mean_score = res.loc[res['model_contig_len'] == m, 'score'].mean()
            # Write into this genome's array. Indexing the dict itself with
            # (i, j) would only add a tuple key and leave the array all zeros.
            ave_scores[g][i, j] = 1 - mean_score if prefix == 'h' else mean_score

    print('----------------------------------------------------')
    print(g)
    print(ave_scores[g])
    print()

In [None]:
# compute classification metrics
#
# For every contig length, pool the scores of all genomes and all model types,
# label 'h' genomes as 0 and 'v' genomes as 1, and store Accuracy, Precision,
# Recall, F1 and AUROC in metrics[length].
#
# Raises ValueError if a genome name does not start with 'h_' or 'v_'.

# Cut-off used to turn a continuous score into a hard 0/1 prediction.
# NOTE(review): 0.5 assumes 'score' is a probability-like value in [0, 1]
# for the positive ('v') class -- confirm this is the intended threshold.
score_threshold = 0.5

metrics = {}
for l in contig_lengths:
    score_chunks = []
    label_chunks = []
    for g in genomes:
        prefix = g.split('_')[0]
        if prefix == 'h':
            label = 0
        elif prefix == 'v':
            label = 1
        else:
            raise ValueError(f'Unknown identifier: {prefix} (should be h or v)')

        # Prediction files are tab-separated (same format as the file read in
        # the test cell above), so the separator must be given explicitly.
        res = pd.read_csv(
            os.path.join(results_dir, g, f'{g}_{l}.fasta_gt1bp_dvfpred.txt'),
            sep="\t",
        )

        for m in model_types:
            m_res = res.loc[res['model_contig_len'] == m, 'score'].to_numpy()
            score_chunks.append(m_res)
            label_chunks.append(np.full(len(m_res), label))

    # Concatenate once per length instead of growing arrays with np.append.
    y_score = np.concatenate(score_chunks)
    y_true = np.concatenate(label_chunks)
    # The threshold-based metrics need hard class labels; passing the raw
    # continuous scores makes scikit-learn raise a ValueError.
    y_pred = (y_score > score_threshold).astype(int)

    # Create the per-length entry in one go (it did not exist before, so
    # item assignment on metrics[l] would raise KeyError).
    metrics[l] = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        # AUROC is a ranking metric, so it takes the continuous scores.
        'AUROC': roc_auc_score(y_true, y_score),
    }

print(metrics)