In [1]:
from pathlib import Path
import yaml
import ipynbname

nerpa_dir = Path(ipynbname.path()).parent.parent
nerpa_config = yaml.safe_load((nerpa_dir / 'configs/config.yaml').open())
print(nerpa_config)


{'antismash_config': 'configs/antismash_config.yaml', 'antismash_processing_config': 'configs/antismash_processing_config.yaml', 'specificity_prediction_config': 'configs/specificity_prediction_config.yaml', 'rban_config': 'configs/rban_config.yaml', 'rban_processing_config': 'configs/rban_processing_config.yaml', 'hmm_scoring_config': 'configs/hmm_scoring_config.yaml', 'matching_config': 'configs/matching_config.yaml', 'cpp_matcher_exec': 'build/hmm_nrp_matcher', 'monomers_config': 'configs/monomers_config.yaml', 'output_config': 'configs/output_config.yaml', 'default_results_root_dirname': 'nerpa_results'}


In [2]:
from src.monomer_names_helper import MonomerNamesHelper
import pandas as pd

# 1. Load monomer names helper
monomers_cfg = yaml.safe_load((nerpa_dir / nerpa_config['monomers_config']).open('r'))
monomers_table_tsv = nerpa_dir / monomers_cfg['monomer_names_table']
monomer_names_helper = MonomerNamesHelper(pd.read_csv(monomers_table_tsv, sep='\t'),
                                          monomers_cfg['supported_residues'],
                                          monomers_cfg['pks_names'])


In [3]:
from src.antismash_parsing.antismash_name_mappings import KNOWN_SUBSTRATES
from src.monomer_names_helper import MonomerNamesHelper, MonomerResidue
from typing import Dict
from src.data_types import LogProb
from math import log


def convert_as7_specificities_to_nerpa(residue_score: Dict[str, float],  # as_long -> probability
                                       monomer_names_helper: MonomerNamesHelper) -> Dict[MonomerResidue, LogProb]:
    def paras_name_core(paras_name: str) -> str:
        match paras_name:
            case '3-(2-nitrocyclopropylalanine)': return 'alanine'
            case '3S-methylaspartic acid branched': return '3S-methylaspartic acid'
            case _: return paras_name.split('-')[-1]


    nerpa_residue_score = {residue: -float('inf')
                           for residue in monomer_names_helper.supported_residues}
    for paras_name, prob in residue_score.items():
        try:
            as_short = next(substrate.short
                            for substrate in KNOWN_SUBSTRATES
                            if paras_name_core(paras_name) in substrate.long)
        except StopIteration:
            print(f'Unknown substrate: {paras_name}')
            raise
        monomer_residue = monomer_names_helper.parsed_name(as_short, name_format='antismash').residue
        if prob > 1e-7 and log(prob) > nerpa_residue_score[monomer_residue]:
            nerpa_residue_score[monomer_residue] = log(prob)
    return nerpa_residue_score


In [4]:
from src.training.hmm_infer_emission_params import BGC_Module_Info
from typing import Dict, Tuple
from collections import defaultdict
import re
import yaml

def parse_aa34_fasta(file_path: Path) -> Dict[Tuple[str, int], str]:
    if not file_path.exists():
        bgc_id = file_path.parent.name
        print(f'BGC {bgc_id} has no aa34 fasta')
        return defaultdict(lambda: '')

    aa34_for_adomain = {}
    with open(file_path, 'r') as file:
        identifier, domain = None, None
        sequence = ""

        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if identifier and domain:
                    aa34_for_adomain[(identifier, domain)] = sequence

                match = re.match(r">(\S+)\|domain_(\d+)\|", line)
                if match:
                    identifier, domain = match.groups()
                    domain = int(domain)
                    sequence = ""
            else:
                sequence += line

        if identifier and domain:
            aa34_for_adomain[(identifier, domain)] = sequence

    return aa34_for_adomain


def parse_adomain_specificities(file_path: Path) -> Dict[Tuple[str, int], Dict[str, float]]:
    adomain_specificities = {}

    with file_path.open("r") as file:
        headers = file.readline().strip().split("\t")  # Skip header row

        for line in file:
            parts = line.strip().split("\t")
            identifier, domain_num = parts[0].split("|")[0], int(parts[0].split("|")[1].split("_")[1])

            substrate_confidences = {
                parts[i]: float(parts[i + 1])
                for i in range(1, len(parts), 2)
                if i + 1 < len(parts)
            }

            adomain_specificities[(identifier, domain_num)] = substrate_confidences

    return adomain_specificities


paras_results_dir = nerpa_dir / 'paras/antismash7.1_nrps'
paras_predictions = []
for paras_pred_for_bgc in paras_results_dir.iterdir():
    # get signatures
    genome_id = paras_pred_for_bgc.name
    aa34_for_adomain = parse_aa34_fasta(paras_pred_for_bgc / 'run_extended_signatures.fasta')
    adomain_specificities = parse_adomain_specificities(paras_pred_for_bgc / 'run_paras_results.txt')
    for gene_id, adomain_idx in aa34_for_adomain.keys():
        aa34 = aa34_for_adomain[(gene_id, adomain_idx)]
        residue_score = adomain_specificities[(gene_id, adomain_idx)]
        residue_score = convert_as7_specificities_to_nerpa(residue_score, monomer_names_helper)
        bgc_module_info = BGC_Module_Info(genome_id,
                                          gene_id,
                                          adomain_idx - 1,  # to 0-based indexing
                                          aa34,
                                          residue_score).to_dict()
        paras_predictions.append(bgc_module_info)

print(paras_predictions[:2])
yaml.dump(paras_predictions, (nerpa_dir / 'paras/paras_predictions.yaml').open('w'))

paras_supported_residues = list(set.union(*(set(paras_prediction['residue_score'].keys())
                                       for paras_prediction in paras_predictions)))
print(paras_supported_residues)
yaml.dump(paras_supported_residues, (nerpa_dir / 'paras/paras_supported_residues.yaml').open('w'))


BGC BGC0000295 has no aa34 fasta
BGC BGC0002609 has no aa34 fasta
BGC BGC0002614 has no aa34 fasta
BGC BGC0002228 has no aa34 fasta
BGC BGC0002227 has no aa34 fasta
BGC BGC0002073 has no aa34 fasta
BGC BGC0003080 has no aa34 fasta
BGC BGC0002866 has no aa34 fasta
BGC BGC0002087 has no aa34 fasta
BGC BGC0002608 has no aa34 fasta
BGC BGC0003084 has no aa34 fasta
BGC BGC0001003 has no aa34 fasta
BGC BGC0001616 has no aa34 fasta
BGC BGC0001094 has no aa34 fasta
BGC BGC0002683 has no aa34 fasta
BGC BGC0001002 has no aa34 fasta
BGC BGC0003079 has no aa34 fasta
BGC BGC0001521 has no aa34 fasta
BGC BGC0002426 has no aa34 fasta
BGC BGC0002499 has no aa34 fasta
BGC BGC0002496 has no aa34 fasta
BGC BGC0002892 has no aa34 fasta
BGC BGC0003085 has no aa34 fasta
BGC BGC0001031 has no aa34 fasta
BGC BGC0002458 has no aa34 fasta
BGC BGC0003082 has no aa34 fasta
BGC BGC0002531 has no aa34 fasta
[{'genome_id': 'BGC0002625', 'gene_id': 'MCF2151708.1', 'a_domain_idx': 0, 'aa34': 'RWMTFDVSVWEWHFICSGEHNLYGP