In [None]:
import os

# Work from the project directory.
# NOTE(review): presumably this is what lets the local evaluator module import
# and the relative file names used in later cells resolve — confirm before
# moving the notebook to another machine.
PROJECT_DIR = '/pi/zhiping.weng-umw/data/ramirezc/tissue_specific'
os.chdir(PROJECT_DIR)

import pickle
from typing import Dict, Tuple, Set, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyfaidx import Fasta

# NOTE(review): the module name is spelled "conssensus" — confirm it matches the file on disk.
from conssensus_evaluator import ConsensusSpliceSiteEvaluator

In [None]:
# Build the evaluator from a GENCODE GTF and a consensus FASTA.
# NOTE(review): both values are "path/to/..." placeholders — point them at real files.
evaluator_inputs = {
    "gencode_gtf": "path/to/gencode.v47.gtf",
    "consensus_fasta": "path/to/consensus.fa",
}
evaluator = ConsensusSpliceSiteEvaluator(**evaluator_inputs)

In [None]:
# Ask the evaluator for the expressed genes in the quantification table.
# min_tpm is presumably a lower TPM cutoff — TODO confirm against the evaluator.
# NOTE(review): the TSV path is a placeholder.
quant_table_path = "path/to/quantifications.tsv"
expressed_genes = evaluator.filter_expressed_genes(min_tpm=2.0, quant_tsv=quant_table_path)

In [None]:
# Derive acceptor / donor ground truth for the expressed genes.
# A pickle file name is handed to the evaluator; presumably it is used to
# serialize (and possibly reload) the result — TODO confirm.
ground_truth_pair = evaluator.parse_gencode(
    ground_truth_file="ground_truth.pkl",
    expressed_genes=expressed_genes,
)
ground_truth_acceptor, ground_truth_donor = ground_truth_pair

In [None]:
# Produce acceptor / donor predictions via the evaluator.
# The pickle file name is presumably where the predictions are serialized — TODO confirm.
prediction_pair = evaluator.generate_spliceai_predictions(predictions_file="predictions.pkl")
pred_acceptor, pred_donor = prediction_pair

In [None]:
# Score each site type against its ground truth. Every call is unpacked into
# four values: precision, recall, AUPRC and top-k (per the target names).

# Acceptor sites
acceptor_metrics = evaluator.calculate_metrics(ground_truth_acceptor, pred_acceptor)
acceptor_precision, acceptor_recall, acceptor_auprc, acceptor_top_k = acceptor_metrics

# Donor sites
donor_metrics = evaluator.calculate_metrics(ground_truth_donor, pred_donor)
donor_precision, donor_recall, donor_auprc, donor_top_k = donor_metrics

In [None]:
# Summarise the acceptor / donor scores, plot both precision-recall curves on
# one figure, save it as a PNG, and print the numbers.

# Average the two site types into single summary values
mean_auprc = (acceptor_auprc + donor_auprc) / 2
mean_topk = (acceptor_top_k + donor_top_k) / 2

# Plot precision-recall curves on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(acceptor_recall, acceptor_precision, label=f'Acceptor (AUPRC={acceptor_auprc:.3f})')
ax.plot(donor_recall, donor_precision, label=f'Donor (AUPRC={donor_auprc:.3f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curves\nMean AUPRC: {mean_auprc:.3f}, Mean Top-k: {mean_topk:.3f}')
ax.legend()
ax.grid(True)

# Write the file before show(): with some backends the figure is no longer
# available for saving once it has been shown.
fig.savefig("auprc_topk_spliceai.png", dpi=300)
plt.show()

# Print results (the donor line reads donor_top_k, the name the metrics cell defines)
print(f"Acceptor AUPRC: {acceptor_auprc:.4f}, Top-k: {acceptor_top_k:.4f}")
print(f"Donor AUPRC: {donor_auprc:.4f}, Top-k: {donor_top_k:.4f}")
print(f"Mean AUPRC: {mean_auprc:.4f}, Mean Top-k: {mean_topk:.4f}")