In [7]:
import yaml
from pathlib import Path
# Monomer statistics loaded from YAML; the 'residue_frequencies' entry maps
# a residue name to its frequency.
norine_stats = yaml.safe_load(Path('data/norine_monomers_info.yaml').read_text())

# Residue names whose position in the frequency ranking is reported below.
paras_residues = [
    'Aad', 'Ala', 'Arg', 'Asn', 'Asp', 'Bza', 'Cys', 'Dab', 'Gln',
    'Glu', 'Gly', 'His', 'Hpg', 'Ile', 'Leu', 'Lys', 'Orn', 'Phe',
    'Pip', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val', 'bAla', 'dHpg',
]

# (residue, frequency) pairs ordered from the most to the least frequent.
all_res_freqs = sorted(
    norine_stats['residue_frequencies'].items(),
    key=lambda name_and_freq: name_and_freq[1],
    reverse=True,
)
print('All residues frequencies:')
print(all_res_freqs)

# Position of every residue in that ranking (0 = most frequent).
res_idx = {name: rank for rank, (name, _freq) in enumerate(all_res_freqs)}
print('Paras residues indices:')
# Raises KeyError if a listed residue is absent from the statistics.
print(sorted([(res_idx[name], name) for name in paras_residues]))


All residues frequencies:
[('unknown', 0.21105308964316796), ('Leu', 0.10008703220191471), ('Ala', 0.07441253263707572), ('Val', 0.07154046997389034), ('Pro', 0.0658833768494343), ('Ser', 0.04142732811140122), ('Gln', 0.03951261966927763), ('Gly', 0.03498694516971279), ('Asp', 0.03080939947780679), ('Thr', 0.028807658833768495), ('Ile', 0.028720626631853787), ('Phe', 0.02863359442993908), ('Tyr', 0.025587467362924284), ('Glu', 0.020974760661444734), ('Asn', 0.0206266318537859), ('Orn', 0.018711923411662314), ('Dab', 0.017493472584856395), ('Hpg', 0.014708442123585727), ('Trp', 0.01453437771975631), ('Abu', 0.013228894691035683), ('aThr', 0.012358572671888599), ('Arg', 0.011662315056570931), ('Lys', 0.010008703220191472), ('Iva', 0.008181026979982594), ('Hiv', 0.0067014795474325504), ('aIle', 0.0067014795474325504), ('bAla', 0.005831157528285466), ('dHpg', 0.004525674499564839), ('Cys', 0.004264577893820714), ('Lac', 0.0039164490861618795), ('Pen', 0.003568320278503046), ('Dpr', 0.00339

In [6]:
import pandas as pd
from pathlib import Path

# Tab-separated table with compound information; the 'id' column holds the
# compound identifiers.
df = pd.read_table(Path('/home/ilianolhin/git/nerpa2/data/compound_info_table.tsv'), sep='\t')

# Keep only MIBiG compounds, i.e. rows whose id starts with 'BGC'.
# na=False treats a missing id as "not MIBiG"; without it a NaN id yields a
# NaN entry in the mask and the boolean indexing raises.
df_mibig = df[df['id'].str.startswith('BGC', na=False)]

def unique_bgc_ids(df: pd.DataFrame) -> set:
    """
    Collect the distinct BGC IDs present in the 'id' column.

    The BGC ID of a row is the part of its 'id' string before the first '.'
    (the whole string when it contains no '.').
    """
    return {compound_id.split('.')[0] for compound_id in df['id']}

# MIBiG compounds with at least 3 monomers, counted two ways: by the
# 'recognized_monomers' column and by the 'nerpa2_supported_monomers' column.
df_mibig_with_rban_monomers = df_mibig.loc[df_mibig['recognized_monomers'].ge(3)]
df_mibig_with_nerpa_monomers = df_mibig.loc[df_mibig['nerpa2_supported_monomers'].ge(3)]

# Report distinct BGC IDs rather than rows: several rows can share one BGC ID.
num_rban_bgc_ids = len(unique_bgc_ids(df_mibig_with_rban_monomers))
print(f'BGC IDs with at least 3 rBAN recognized_monomers: {num_rban_bgc_ids}')
num_nerpa_bgc_ids = len(unique_bgc_ids(df_mibig_with_nerpa_monomers))
print(f'BGC IDs with at least 3 nerpa2_supported_monomers: {num_nerpa_bgc_ids}')


BGC IDs with at least 3 rBAN recognized_monomers: 452
BGC IDs with at least 3 nerpa2_supported_monomers: 426


In [1]:
from src.matching.match_type import Match
from pathlib import Path
import shutil

nerpa_dir = Path('/home/ilianolhin/git/nerpa2/')

# The approved matches live in a single text file. Records are separated by
# an empty line; chunks that contain only whitespace are skipped.
approved_matches_txt = nerpa_dir / 'test_data/approved_matches/approved_matches.txt'
matches_strs = [chunk
                for chunk in approved_matches_txt.read_text().split('\n\n')
                if chunk.strip()]
approved_matches = list(map(Match.from_str, matches_strs))

# Distinct NRP ids among the approved matches, and the BGC ids derived from
# them (the part of each NRP id before the first '.').
approved_matches_nrp_ids = {approved.nrp_variant_id.nrp_id
                            for approved in approved_matches}
approved_matches_bgc_ids = {full_id.split('.')[0]
                            for full_id in approved_matches_nrp_ids}

print(f'Approved matches nrp ids: {len(approved_matches_nrp_ids)}')
print(f'Approved matches bgc ids: {len(approved_matches_bgc_ids)}')


Approved matches nrp ids: 234
Approved matches bgc ids: 145


In [6]:
# Copy the antiSMASH result JSON of every approved BGC id into
# test_data/approved_matches/antismash_jsons. Files that already exist at the
# destination are left untouched.
antismash_results_all = Path('/home/ilianolhin/programs/antismash7/mibig_results')
antismash_jsons_dir = nerpa_dir / 'test_data/approved_matches/antismash_jsons'
# shutil.copy does not create missing parent directories, so make sure the
# destination directory exists (e.g. on a fresh checkout).
antismash_jsons_dir.mkdir(parents=True, exist_ok=True)
# Sorted so the copy order (and the first failure, if a source file is
# missing) is the same on every run.
for mibig_id in sorted(approved_matches_bgc_ids):
    antismash_results_json = antismash_results_all / mibig_id / f'{mibig_id}.json'
    dst = antismash_jsons_dir / f'{mibig_id}.json'
    if not dst.exists():
        shutil.copy(antismash_results_json, dst)



In [7]:
# collect rban results for the approved nrp ids
import json
# Directory with one rBAN result file per NRP id, named '<nrp_id>.json'.
rban_results_all = Path('/home/ilianolhin/git/nerpa2_old/training/training/rban_records')

# Iterate in sorted order: approved_matches_nrp_ids is a set of strings, whose
# iteration order can differ between kernel sessions, which would make the
# record order in merged.json change from run to run.
records = []
for nrp_id in sorted(approved_matches_nrp_ids):
    rban_results_json = rban_results_all / f'{nrp_id}.json'
    records.append(json.loads(rban_results_json.read_text(encoding='utf-8')))

dest = nerpa_dir / 'test_data/approved_matches/rban_records/merged.json'
# open(..., 'w') does not create missing parent directories.
dest.parent.mkdir(parents=True, exist_ok=True)

# All records are written as a single JSON list.
with open(dest, 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=4)