In [8]:
# imports
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# parameters
# Root folder of the prediction outputs; files are looked up as
# <results_dir>/<genome>/<genome>_<contig_length>.fasta_gt1bp_dvfpred.txt
results_dir = 'results'

# Genome identifiers. The text before the first '_' must be 'h' or 'v':
# the metric cells label 'h' genomes as class 0 and 'v' genomes as class 1.
genomes = [
    'h_human-3',
    'v_sars-cov-2',
]

# Input contig lengths to evaluate (one results file per genome and length).
# Uncomment the next line for a quick single-length run.
# contig_lengths = [1000]
contig_lengths = [
    25, 50, 100, 150,
    300, 500, 1000, 3000,
]

# Values matched against the 'model_contig_len' column of each results file.
model_types = [
    150, 300, 500, 1000,
]

In [28]:
# compute classification metrics
#
# Builds `metrics[f'Input: {l}'][f'Model: {m}']` for every input contig length
# `l` and model type `m`. Each leaf dict holds:
#   * 'Genome score: <genome>' - mean score of that genome's rows for model `m`
#     (reported as 1 - mean for 'h' genomes, so higher is better for both kinds)
#   * 'Accuracy', 'Precision', 'Recall', 'F1' - computed on scores thresholded
#     at `threshold`, pooled over all genomes
#   * 'AUROC' - computed on the raw pooled scores
# Raises ValueError if a genome identifier does not start with 'h_' or 'v_'.

threshold = 0.5  # a score strictly above this counts as a positive (class 1) prediction
label_by_prefix = {'h': 0.0, 'v': 1.0}  # ground-truth class implied by the genome-name prefix

metrics = {}
for l in contig_lengths:
    # A results file depends only on (genome, input length), so load each one
    # once per length instead of once per model type. Validate the identifier
    # first so a typo fails before any file is touched.
    results_by_genome = {}
    for g in genomes:
        prefix = g.split('_')[0]
        if prefix not in label_by_prefix:
            raise ValueError(f'Unknown identifier: {prefix} (should be h or v)')
        results_by_genome[g] = pd.read_csv(
            os.path.join(results_dir, g, f'{g}_{l}.fasta_gt1bp_dvfpred.txt'), sep="\t"
        )

    metrics[f'Input: {l}'] = {}
    for m in model_types:
        model_metrics = {}
        score_parts = []
        label_parts = []

        for g in genomes:
            res = results_by_genome[g]
            m_res = res[res['model_contig_len'] == m]['score']
            label = label_by_prefix[g.split('_')[0]]

            mean_score = m_res.mean()
            # For class-0 genomes a low score is the correct outcome, so report 1 - mean.
            model_metrics[f'Genome score: {g}'] = mean_score if label == 1.0 else 1 - mean_score

            score_parts.append(m_res.to_numpy(dtype=float))
            label_parts.append(np.full(len(m_res), label))

        # Pool all genomes once; repeated np.append would re-copy the arrays each time.
        y_pred = np.concatenate(score_parts) if score_parts else np.empty((0))
        y_true = np.concatenate(label_parts) if label_parts else np.empty((0))
        y_label = y_pred > threshold

        model_metrics['Accuracy'] = accuracy_score(y_true, y_label)
        model_metrics['Precision'] = precision_score(y_true, y_label)
        model_metrics['Recall'] = recall_score(y_true, y_label)
        model_metrics['F1'] = f1_score(y_true, y_label)
        model_metrics['AUROC'] = roc_auc_score(y_true, y_pred)

        metrics[f'Input: {l}'][f'Model: {m}'] = model_metrics

metrics

{'Input: 1000': {'Model: 150': {'Genome score: h_human-3': 0.5769636034965515,
   'Genome score: v_sars-cov-2': 0.47995015382766726,
   'Accuracy': 0.6,
   'Precision': 1.0,
   'Recall': 0.2,
   'F1': 0.33333333333333337,
   'AUROC': 1.0},
  'Model: 300': {'Genome score: h_human-3': 0.4633482217788696,
   'Genome score: v_sars-cov-2': 0.6666653752326965,
   'Accuracy': 0.8,
   'Precision': 0.7142857142857143,
   'Recall': 1.0,
   'F1': 0.8333333333333333,
   'AUROC': 0.76},
  'Model: 500': {'Genome score: h_human-3': 0.36159304976463313,
   'Genome score: v_sars-cov-2': 0.9316253781318664,
   'Accuracy': 0.7,
   'Precision': 0.625,
   'Recall': 1.0,
   'F1': 0.7692307692307693,
   'AUROC': 0.6799999999999999},
  'Model: 1000': {'Genome score: h_human-3': 0.7743676940910519,
   'Genome score: v_sars-cov-2': 0.8632304072380066,
   'Accuracy': 0.8,
   'Precision': 0.8,
   'Recall': 0.8,
   'F1': 0.8000000000000002,
   'AUROC': 0.8}}}

In [25]:
# Per-genome grid of mean scores: rows follow contig_lengths, columns follow
# model_types. The same per-genome means are already stored under the
# 'Genome score: ...' keys of `metrics`; this array form is kept for later use.
# 'h' genomes are reported as 1 - mean, 'v' genomes as the plain mean.
ave_scores = {}
for g in genomes:
    prefix = g.split('_')[0]
    n_rows, n_cols = len(contig_lengths), len(model_types)
    ave_scores[g] = np.zeros((n_rows, n_cols))

    for i, l in enumerate(contig_lengths):
        results_path = os.path.join(results_dir, g, f'{g}_{l}.fasta_gt1bp_dvfpred.txt')
        res = pd.read_csv(results_path, sep="\t")

        for j, m in enumerate(model_types):
            if prefix not in ('h', 'v'):
                raise ValueError(f'Unknown identifier: {g.split("_")[0]} (should be h or v)')
            mean_score = res.loc[res['model_contig_len'] == m, 'score'].mean()
            ave_scores[g][i, j] = mean_score if prefix == 'v' else 1 - mean_score

    # Genome name, its score grid, then a blank separator line.
    print(g, ave_scores[g], '', sep='\n')

h_human-3
[[0.5769636  0.46334822 0.36159305 0.77436769]]

v_sars-cov-2
[[0.47995015 0.66666538 0.93162538 0.86323041]]

