In [1]:
import os
from pathlib import Path

import pandas as pd

from src.config import load_monomer_names_helper

# Locations default to the original author's layout; set NERPA_DIR and/or
# PARAS_RESULTS_TSV in the environment to run the notebook on another machine
# without editing this cell.
nerpa_dir = Path(os.environ.get('NERPA_DIR', '/home/ilianolhin/git/nerpa2/'))
monomer_names_cfg = nerpa_dir / 'configs/monomers_config.yaml'
monomer_names_helper = load_monomer_names_helper(monomer_names_cfg, nerpa_dir)

# Tab-separated table of PARAS results; the cells below read its
# `id`, `substrate`, `score` and `correct` columns.
paras_results_tsv = Path(os.environ.get('PARAS_RESULTS_TSV',
                                        '/data/paras_dataset_results.tsv'))
paras_results = pd.read_csv(paras_results_tsv, sep='\t')
paras_results.head()


Unnamed: 0,id,substrate,score,correct
0,A05163.A1,"2,3-dihydroxybenzoic acid",0.0,False
1,A05163.A1,"2,4-diaminobutyric acid",0.0,False
2,A05163.A1,2-aminoadipic acid,0.01,False
3,A05163.A1,2-aminoisobutyric acid,0.0,False
4,A05163.A1,"3,5-dihydroxyphenylglycine",0.0,False


In [10]:
# compute calibration function
from math import log
from src.training.hmm_parameters.step_function import (
    fit_step_function,
    create_step_function
)

def get_residue(paras_name: str) -> str:
    """Return the residue that a PARAS substrate name resolves to.

    The name is parsed by the notebook-level ``monomer_names_helper`` with the
    'paras' name format, and the ``residue`` attribute of the parsed result is
    returned.
    """
    parsed = monomer_names_helper.parsed_name(paras_name, name_format='paras')
    return parsed.residue

# Collapse the per-substrate rows into one row per (id, residue): scores of
# substrates sharing a residue are summed, and the pair counts as correct if
# any of its substrates was marked correct.
paras_results['residue'] = paras_results['substrate'].map(get_residue)
condensed = (paras_results
             .groupby(['id', 'residue'])
             # String aggregation names keep the current behaviour and avoid the
             # FutureWarning pandas emits for the builtins `sum` / `any`.
             .agg({'score': 'sum', 'correct': 'any'})
             .reset_index())

def log_safe(x):
    """Natural logarithm of `x`; any input that is not strictly positive gives -inf."""
    if x > 0:
        return log(x)
    # Zero, negative and NaN inputs all fail the `x > 0` test and land here.
    return float('-inf')

# Pair the log of every condensed score with its correctness label; these
# (log-score, correct) tuples are the input of the step-function fit.
log_score_correctness = [
    (log_safe(score), correct)
    for score, correct in zip(condensed['score'], condensed['correct'])
]
step_function_steps = fit_step_function(
    log_score_correctness,
    step_range=1000,
    num_bins=20,
)
print(step_function_steps)

# Turn the fitted steps into a callable used by the following cells.
step_function = create_step_function(step_function_steps)


  .agg({'score': sum, 'correct': any})


[0.001, 0.013, 0.018, 0.042, 0.042, 0.102, 0.102, 0.102, 0.203, 0.203, 0.203, 0.203, 0.847, 0.937, 0.937, 0.937, 0.937, 0.968, 0.968, 0.991]


In [9]:
# compute default distribution
from src.monomer_names_helper import UNKNOWN_RESIDUE
# Depends on `condensed` and `step_function` from the calibration cell, so that
# cell has to be executed first.
# NOTE(review): the step function is fitted on log-scores, while it is applied
# here to the raw scores — confirm which domain `step_function` expects.
default_scores = (condensed
                  .assign(score=condensed["score"].map(step_function))
                  .groupby("residue")["score"]
                  .mean()).to_dict()
# The mass not claimed by any known residue is assigned to UNKNOWN_RESIDUE.
# An UNKNOWN_RESIDUE entry coming out of the groupby is left out of the total:
# it is overwritten below, and counting it would make the final distribution
# sum to less than 1.
total = sum(score for residue, score in default_scores.items()
            if residue != UNKNOWN_RESIDUE)
# A total above 1 would give the unknown residue a negative probability.
assert total <= 1.0 + 1e-9, (
    f'calibrated per-residue means sum to {total}, which exceeds 1')
default_scores[UNKNOWN_RESIDUE] = max(0.0, 1.0 - total)
print(default_scores)

{'Aad': 0.004381746513038205, 'Aib': 0.007446634323832626, 'Ala': 0.0680888417222559, 'Arg': 0.018565797453001822, 'Asn': 0.028409945421467554, 'Asp': 0.027054881746513038, 'Bza': 0.023086112795633718, 'Cys': 0.029563371740448756, 'Dab': 0.01945027289266222, 'Gln': 0.02736082474226804, 'Glu': 0.017150394178289872, 'Gly': 0.053672225591267435, 'His': 0.005399636143117041, 'Hpg': 0.027091267434808972, 'Ile': 0.0361746513038205, 'Leu': 0.09462704669496665, 'Lys': 0.014043966040024258, 'Orn': 0.024078532443905398, 'Phe': 0.031186779866585808, 'Pip': 0.006620679199514858, 'Pro': 0.04003608247422681, 'Ser': 0.0781006670709521, 'Thr': 0.08122255912674349, 'Trp': 0.02419526986052153, 'Tyr': 0.044217404487568224, 'Val': 0.07266312916919344, 'bAla': 0.007682838083687083, 'dHpg': 0.01270497271073378, 'unknown': 0.07572346876895075}
