1. Split MIBiG compounds into 5 groups for cross-validation based on BigScape families.

In [1]:
import os
import shlex
import sys
from math import ceil
from pathlib import Path

import pandas as pd

# Root of the Nerpa repository. The NERPA_DIR environment variable overrides the
# default, so the notebook can be run against another checkout without editing it.
nerpa_dir = Path(os.environ.get('NERPA_DIR', '/home/ilianolhin/git/nerpa2'))

mibig_bgcs_table = pd.read_csv(nerpa_dir / 'scripts/build_mibig_info_table/mibig_bgcs_info.tsv', sep='\t')
# Keep only the rows flagged as approved matches. The .copy() makes the
# sample_group column assignment below operate on an independent frame.
mibig_compounds_approved = mibig_bgcs_table[mibig_bgcs_table["in_approved_matches"] == True].copy()

n_missing_families = mibig_compounds_approved["bigscape_families"].isna().sum()
if n_missing_families > 0:
    print('Warning! Missing bigscape_families values present.')
    # groupby skips NaN keys, so these rows are absent from family_to_bgcs and
    # end up with a missing (<NA>) sample_group after the .map() below.
    print(f'  {n_missing_families} row(s) have no BigScape family and will not be assigned to a sample group.')

# Aggregate BGCs by BigScape family using groupby
family_to_bgcs = mibig_compounds_approved.groupby("bigscape_families")["bgc_id"].apply(list).to_dict()

n_groups = 5  # Number of groups to split the families into
total_bgcs = sum(len(bgcs) for bgcs in family_to_bgcs.values())
target_group_size = ceil(total_bgcs / n_groups)

# Fill the groups one after another: whole families are appended to the current
# group until it holds at least target_group_size BGCs. A family is never split,
# so a group may overshoot the target.
family_iter = iter(family_to_bgcs.items())
family_to_sample_group = {}

for group_idx in range(n_groups):
    group_size = 0
    while group_size < target_group_size and (family := next(family_iter, None)) is not None:
        family_name, bgcs = family
        family_to_sample_group[family_name] = group_idx
        group_size += len(bgcs)

# Every family must have been assigned to a group.
assert len(family_to_sample_group) == len(family_to_bgcs), "Not all families are assigned to a group"

# Overshooting early groups can exhaust the families before the last groups are
# reached. The later cells expect data for every group index, so fail here.
empty_groups = sorted(set(range(n_groups)) - set(family_to_sample_group.values()))
assert not empty_groups, f"No families were assigned to sample group(s) {empty_groups}"

# Assign sample group to each row (nullable Int64 keeps rows without a family as <NA>)
mibig_compounds_approved["sample_group"] = (mibig_compounds_approved["bigscape_families"]
                                            .map(family_to_sample_group)
                                            .astype('Int64'))

# display mibig_compounds_approved
print(mibig_compounds_approved.head())

        bgc_id  num_a_domains origins bigscape_families   compound_id  \
15  BGC0000296              6  mibig3          NRPS_277  BGC0000296.0   
27  BGC0000305             11  mibig3  NRPS_285_mix_285  BGC0000305.0   
28  BGC0000306              6  mibig3  NRPS_286_mix_286  BGC0000306.0   
29  BGC0000307              9  mibig3  NRPS_287_mix_287  BGC0000307.0   
34  BGC0000310             12  mibig3  NRPS_290_mix_290  BGC0000310.0   

    num_recognized_nodes  iso_class_idx  in_approved_matches  sample_group  
15                    11           3006                 True             3  
27                    11           1782                 True             3  
28                     6            368                 True             3  
29                     9           3138                 True             3  
34                    12            612                 True             3  


2. Form subsets of approved matches with MIBiG compounds for cross-validation.

In [7]:
from itertools import chain
from src.matching.match_type import Match
# Parse the approved matches; the file stores one match per blank-line-separated block.
approved_matches_txt = nerpa_dir / 'test_data/approved_matches/approved_matches.txt'
approved_matches_strs = approved_matches_txt.read_text().split('\n\n')
approved_matches = (Match.from_str(match_str)
                    for match_str in approved_matches_strs
                    if match_str.strip())
# Index the matches by compound id. If several matches share an nrp_id,
# the last one in the file is the one that is kept.
approved_matches_per_compound = {match.nrp_variant_id.nrp_id: match
                                 for match in approved_matches}

benchmarking_dir = nerpa_dir / 'benchmarking'
(benchmarking_dir / 'approved_matches_subsets').mkdir(parents=True, exist_ok=True)

training_subset_paths = [benchmarking_dir / 'approved_matches_subsets' / f'training_subset_{i}.txt'
                         for i in range(n_groups)]
testing_subset_paths = [benchmarking_dir / 'approved_matches_subsets' / f'testing_subset_{i}.txt'
                        for i in range(n_groups)]

# sample_group -> list of compound ids (rows with a missing sample_group are skipped by groupby)
subsets = mibig_compounds_approved.groupby('sample_group')['compound_id'].apply(list).to_dict()

# Every grouped compound needs an approved match, otherwise building the testing
# subsets below would fail with a bare KeyError.
grouped_compound_ids = set(chain.from_iterable(subsets.values()))
compounds_without_match = grouped_compound_ids - approved_matches_per_compound.keys()
assert not compounds_without_match, \
    f"Compounds in subsets without an approved match: {compounds_without_match}"

# NOTE(review): the original cell carried a disabled assertion requiring the two
# id sets to be equal. Only the direction needed by the loop is enforced above;
# the other direction is reported, presumably it is expected to be non-empty.
matches_without_group = approved_matches_per_compound.keys() - grouped_compound_ids
if matches_without_group:
    print(f'Note: {len(matches_without_group)} approved match(es) belong to no sample group; '
          'they are part of every training subset.')

# Iterate over all group indices (not only the groups present in `subsets`) so
# that each training/testing file is rewritten on every run.
for group_idx in range(n_groups):
    compound_ids = subsets.get(group_idx, [])
    held_out_ids = set(compound_ids)  # constant-time membership tests for the training filter

    testing_subset = [approved_matches_per_compound[nrp_id]
                      for nrp_id in compound_ids]
    training_subset = [match
                       for nrp_id, match in approved_matches_per_compound.items()
                       if nrp_id not in held_out_ids]

    # Testing file: one compound id per line.
    testing_subset_paths[group_idx].write_text(
        '\n'.join(match.nrp_variant_id.nrp_id for match in testing_subset))

    # Training file: full match records, in the blank-line-separated format read above.
    training_subset_paths[group_idx].write_text(
        '\n\n'.join(map(str, training_subset)))

3. Train Nerpa on each training subset to create the configs for that subset.

In [6]:
import subprocess
(benchmarking_dir / 'training_results').mkdir(parents=True, exist_ok=True)
training_results_paths = [benchmarking_dir / 'training_results' / f'subset_{i}'
                          for i in range(n_groups)]
for subset_idx in range(n_groups):
    # Pass the arguments as a list instead of a shell string: paths containing
    # spaces or shell metacharacters are then handed over verbatim.
    # sys.executable runs the script with the interpreter of this kernel.
    command = [sys.executable, str(nerpa_dir / 'train_nerpa.py'),
               '--approved-matches', str(training_subset_paths[subset_idx]),
               '--output-dir', str(training_results_paths[subset_idx])]
    # shlex.join renders the list as a copy-pasteable shell command line.
    print(f'Running command: {shlex.join(command)}')
    # check=True raises CalledProcessError if training exits with a non-zero status.
    subprocess.run(command, check=True)

Running command: python /home/ilianolhin/git/nerpa2/train_nerpa.py --approved-matches /home/ilianolhin/git/nerpa2/benchmarking/approved_matches_subsets/training_subset_0.txt --output-dir /home/ilianolhin/git/nerpa2/benchmarking/training_results/subset_0
Loading approved matches
Loading BGC variants
Inferring edge parameters from counts...
Building step function...
Calculating modifications frequencies...
Calculating PKS probability...
Unknown residue predictions correctness: [(0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, True), (0.008, False), (0.008, True), (0.008, True), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, False), (0.008, True)]
Posterior mean      = 0.2019
95 % credible band  = (0.0670, 0.3828)
Running command: python /home/ilianolhin/git/nerpa2/train_nerpa.py --approved-matches /home/ilianolhin/

4. Prepare inputs for Nerpa: the trained configs, the antiSMASH paths, and the test compounds of each subset.

In [11]:
import shutil

inputs_dir = benchmarking_dir / 'cross_validation_inputs'
inputs_dir.mkdir(parents=True, exist_ok=True)
# Directory with one antiSMASH result folder per BGC id. The
# ANTISMASH_MIBIG_RESULTS_DIR environment variable overrides the default location.
as_results_on_mibig_dir = Path(os.environ.get('ANTISMASH_MIBIG_RESULTS_DIR',
                                              '/home/ilianolhin/programs/antismash7/as_results_mibig4_nrps'))
compounds_table = pd.read_csv(nerpa_dir / 'data/pnrpdb2rc1_summary.tsv', sep='\t')

for subset_idx in range(n_groups):
    subset_input_dir = inputs_dir / f'subset_{subset_idx}'
    # 1. Copy configs from training results
    source = training_results_paths[subset_idx] / 'new_configs'
    dest = subset_input_dir / 'configs'
    dest.mkdir(parents=True, exist_ok=True)
    shutil.copytree(source, dest, dirs_exist_ok=True)

    # 2. Load nrp_ids for testing (one id per line, blank lines ignored)
    nrp_ids = [nrp_id.strip()
               for nrp_id in testing_subset_paths[subset_idx].read_text().split('\n')
               if nrp_id.strip()]
    # The BGC id is the part of the compound id before the first dot.
    bgc_ids = {nrp_id.split('.')[0] for nrp_id in nrp_ids}

    # 3. Create a file with antismash paths.
    # Sorted, because set iteration order of strings differs between interpreter
    # runs and would otherwise make the file content order non-reproducible.
    antismash_paths = [as_results_on_mibig_dir / bgc_id
                       for bgc_id in sorted(bgc_ids)]
    with open(subset_input_dir / 'antismash_paths.txt', 'w') as f:
        f.write('\n'.join(str(path) for path in antismash_paths))

    # 4. Create a file with MIBiG compounds
    # All test compounds must be present in compounds_table.
    unknown_nrp_ids = set(nrp_ids) - set(compounds_table['ID'])
    assert not unknown_nrp_ids, \
        f"Not all nrp_ids are in compounds_table: {sorted(unknown_nrp_ids)}"

    mibig_compounds = compounds_table[compounds_table['ID'].isin(nrp_ids)]
    mibig_compounds.to_csv(subset_input_dir / 'compounds.tsv',
                           sep='\t', index=False)



In [21]:
def is_from_mibig_norine(nrp_id: str) -> bool:
    """Return True if nrp_id begins with the prefix 'BGC' or the prefix 'NOR'."""
    # str.startswith accepts a tuple and matches when any of the prefixes matches.
    return nrp_id.startswith(('BGC', 'NOR'))

def at_least_3_recognized_monomers(nrp_id: str, stats_table: "pd.DataFrame | None" = None) -> bool:
    """Check if the nrp_id has at least 3 recognized monomers.

    Args:
        nrp_id: compound identifier, compared against the 'compound_id' column.
        stats_table: table with 'compound_id' and 'num_recognized_nodes' columns.
            When omitted, the notebook-level compound_stats_table is used; it is
            looked up at call time, so it may be defined after this function.

    Returns:
        True if the first row whose compound_id equals nrp_id has
        num_recognized_nodes >= 3; False otherwise, including when no row matches.
    """
    if stats_table is None:
        stats_table = compound_stats_table
    # Single .loc lookup (row mask + column) instead of chained indexing.
    values = stats_table.loc[stats_table['compound_id'] == nrp_id, 'num_recognized_nodes'].values
    # bool(...) turns the numpy.bool_ produced by the comparison into a plain bool,
    # matching the declared return type.
    return bool(values.size > 0 and values[0] >= 3)

# Table queried by at_least_3_recognized_monomers through its global name.
compound_stats_table = pd.read_csv(
    nerpa_dir / 'scripts/build_mibig_info_table/rban_graphs_filtered.tsv',
    sep='\t',
)

# Keep the rows of compounds_table whose ID passes both predicates.
mibig_norine_compounds = compounds_table[
    compounds_table['ID'].apply(
        lambda compound_id: is_from_mibig_norine(compound_id)
        and at_least_3_recognized_monomers(compound_id)
    )
]

# Save the filtered table as a TSV without the index column.
mibig_norine_compounds.to_csv(
    nerpa_dir / 'data' / 'mibig_norine_compounds.tsv',
    sep='\t',
    index=False,
)

5. Run Nerpa on each subset.

In [26]:
results_dir = benchmarking_dir / 'cross_validation_results'
results_dir.mkdir(parents=True, exist_ok=True)

for subset_idx in range(n_groups):
    subset_input_dir = inputs_dir / f'subset_{subset_idx}'
    # Pass the arguments as a list instead of a shell string: paths containing
    # spaces or shell metacharacters are then handed over verbatim.
    # sys.executable runs the script with the interpreter of this kernel.
    command = [
        sys.executable, str(nerpa_dir / 'nerpa.py'),
        '--antismash-paths-file', str(subset_input_dir / 'antismash_paths.txt'),
        '--rban-json', str(nerpa_dir / 'data/mibig_norine_rban_preprocessed.json'),
        '--configs-dir', str(subset_input_dir / 'configs'),
        '--process-hybrids',
        '--max-num-matches-per-bgc', '10',
        '--max-num-matches', '1000',
        '--output-dir', str(results_dir / f'subset_{subset_idx}'),
        '--force-output-dir',
        '--fast-matching',
        '--threads', '5',
        '--skip-molecule-drawing',
    ]
    # shlex.join renders the list as a copy-pasteable shell command line.
    print(f'Running command: {shlex.join(command)}')
    # check=True raises CalledProcessError if Nerpa exits with a non-zero status.
    subprocess.run(command, check=True)

Running command: python /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_0/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_0/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-matches 1000 --output-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0 --force-output-dir --fast-matching --threads 5 --skip-molecule-drawing

Started with command: /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_0/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_0/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-match

[17:55:50] Can't kekulize mol.  Unkekulized atoms: 6 7 8 11 12
[17:55:50] non-ring atom 1 marked aromatic
[17:55:50] non-ring atom 0 marked aromatic
[17:55:50] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:55:50] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[17:55:50] Explicit valence for atom # 0 C, 5, is greater than permitted
[17:55:50] non-ring atom 6 marked aromatic
[17:55:50] non-ring atom 6 marked aromatic
[17:55:50] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 10 marked aromatic
[17:55:50] non-ring atom 4 marked aromatic
[17:55:50] non-ring atom 4 marked aromatic
[17:55:50] non-ring atom 0 marked aromatic
[17:55:50] Explicit valence for atom # 0 C, 5, is greater than


=== Resolving NRP-PK hybrid monomers candidates

== Running: java -jar /home/ilianolhin/git/nerpa2/external_tools/rBAN/rBAN-1.0.jar -inputFile /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/hybrid_monomers_input.json -outputFolder /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/ -outputFileName hybrid_monomers_output.json -monomersDB /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/rban_monomers_db.json

  Process completed successfully. Compounds analysed: 294

== Done



results will be in /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/NRP_Variants

Getting NRP variants took 7.0718 seconds

Generating linearizations took 0.0083 seconds

Parsing HMMs from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/hmms.json
Parsing NRP linearizations from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/hmms.json
Matching 53 HMMs agains

  dp[next_state][disc_new_path_lp] += number_of_paths
  dp[next_state][disc_new_path_lp] += number_of_paths



RESULTS:
  Main report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/report.tsv
  HTML report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/report.html
  Detailed reports are saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/matches_details
Writing results took 4.5099 seconds
  Log is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_0/nerpa.log

Finished: 2025-06-11 17:56:11
Elapsed time: 0:00:25.920359

Thank you for using Nerpa!
Running command: python /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_1/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_1/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-mat

[17:56:18] Can't kekulize mol.  Unkekulized atoms: 6 7 8 11 12
[17:56:18] non-ring atom 1 marked aromatic
[17:56:18] non-ring atom 0 marked aromatic
[17:56:19] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:56:19] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[17:56:19] Explicit valence for atom # 0 C, 5, is greater than permitted
[17:56:19] non-ring atom 6 marked aromatic
[17:56:19] non-ring atom 6 marked aromatic
[17:56:19] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 10 marked aromatic
[17:56:19] non-ring atom 4 marked aromatic
[17:56:19] non-ring atom 4 marked aromatic
[17:56:19] non-ring atom 0 marked aromatic
[17:56:19] Explicit valence for atom # 0 C, 5, is greater than


=== Resolving NRP-PK hybrid monomers candidates

== Running: java -jar /home/ilianolhin/git/nerpa2/external_tools/rBAN/rBAN-1.0.jar -inputFile /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/hybrid_monomers_input.json -outputFolder /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/ -outputFileName hybrid_monomers_output.json -monomersDB /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/rban_monomers_db.json

  Process completed successfully. Compounds analysed: 294

== Done



results will be in /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/NRP_Variants

Getting NRP variants took 10.4620 seconds

Generating linearizations took 0.0088 seconds

Parsing HMMs from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/hmms.json
Parsing NRP linearizations from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/hmms.json
Matching 53 HMMs again

  dp[next_state][disc_new_path_lp] += number_of_paths
  dp[next_state][disc_new_path_lp] += number_of_paths



RESULTS:
  Main report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/report.tsv
  HTML report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/report.html
  Detailed reports are saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/matches_details
Writing results took 3.9544 seconds
  Log is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_1/nerpa.log

Finished: 2025-06-11 17:56:41
Elapsed time: 0:00:28.928256

Thank you for using Nerpa!
Running command: python /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_2/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_2/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-mat

[17:56:51] Can't kekulize mol.  Unkekulized atoms: 6 7 8 11 12
[17:56:51] non-ring atom 1 marked aromatic
[17:56:51] non-ring atom 0 marked aromatic
[17:56:51] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:56:51] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[17:56:51] Explicit valence for atom # 0 C, 5, is greater than permitted
[17:56:51] non-ring atom 6 marked aromatic
[17:56:51] non-ring atom 6 marked aromatic
[17:56:51] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 10 marked aromatic
[17:56:51] non-ring atom 4 marked aromatic
[17:56:51] non-ring atom 4 marked aromatic
[17:56:51] non-ring atom 0 marked aromatic
[17:56:51] Explicit valence for atom # 0 C, 5, is greater than


=== Resolving NRP-PK hybrid monomers candidates

== Running: java -jar /home/ilianolhin/git/nerpa2/external_tools/rBAN/rBAN-1.0.jar -inputFile /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/hybrid_monomers_input.json -outputFolder /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/ -outputFileName hybrid_monomers_output.json -monomersDB /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/rban_monomers_db.json

  Process completed successfully. Compounds analysed: 294

== Done



results will be in /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/NRP_Variants

Getting NRP variants took 9.9098 seconds

Generating linearizations took 0.0095 seconds

Parsing HMMs from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/hmms.json
Parsing NRP linearizations from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/hmms.json
Matching 47 HMMs agains

  dp[next_state][disc_new_path_lp] += number_of_paths
  dp[next_state][disc_new_path_lp] += number_of_paths



RESULTS:
  Main report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/report.tsv
  HTML report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/report.html
  Detailed reports are saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/matches_details
Writing results took 3.8709 seconds
  Log is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_2/nerpa.log

Finished: 2025-06-11 17:57:13
Elapsed time: 0:00:29.556972

Thank you for using Nerpa!
Running command: python /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_3/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_3/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-mat

[17:57:19] Can't kekulize mol.  Unkekulized atoms: 6 7 8 11 12
[17:57:19] non-ring atom 1 marked aromatic
[17:57:19] non-ring atom 0 marked aromatic
[17:57:19] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:57:19] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[17:57:19] Explicit valence for atom # 0 C, 5, is greater than permitted
[17:57:19] non-ring atom 6 marked aromatic
[17:57:19] non-ring atom 6 marked aromatic
[17:57:19] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 10 marked aromatic
[17:57:19] non-ring atom 4 marked aromatic
[17:57:19] non-ring atom 4 marked aromatic
[17:57:19] non-ring atom 0 marked aromatic
[17:57:19] Explicit valence for atom # 0 C, 5, is greater than


=== Resolving NRP-PK hybrid monomers candidates

== Running: java -jar /home/ilianolhin/git/nerpa2/external_tools/rBAN/rBAN-1.0.jar -inputFile /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/hybrid_monomers_input.json -outputFolder /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/ -outputFileName hybrid_monomers_output.json -monomersDB /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/rban_monomers_db.json

  Process completed successfully. Compounds analysed: 294

== Done



results will be in /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/NRP_Variants

Getting NRP variants took 7.0803 seconds

Generating linearizations took 0.0094 seconds

Parsing HMMs from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/hmms.json
Parsing NRP linearizations from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/hmms.json
Matching 52 HMMs agains

  dp[next_state][disc_new_path_lp] += number_of_paths
  dp[next_state][disc_new_path_lp] += number_of_paths



RESULTS:
  Main report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/report.tsv
  HTML report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/report.html
  Detailed reports are saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/matches_details
Writing results took 4.4987 seconds
  Log is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_3/nerpa.log

Finished: 2025-06-11 17:57:39
Elapsed time: 0:00:24.552692

Thank you for using Nerpa!
Running command: python /home/ilianolhin/git/nerpa2/nerpa.py --antismash-paths-file /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_4/antismash_paths.txt --rban-json /home/ilianolhin/git/nerpa2/data/mibig_norine_rban_preprocessed.json --configs-dir /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_inputs/subset_4/configs --process-hybrids --max-num-matches-per-bgc 10 --max-num-mat

[17:57:46] Can't kekulize mol.  Unkekulized atoms: 6 7 8 11 12
[17:57:46] non-ring atom 1 marked aromatic
[17:57:46] non-ring atom 0 marked aromatic
[17:57:46] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:57:46] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[17:57:46] Explicit valence for atom # 0 C, 5, is greater than permitted
[17:57:46] non-ring atom 6 marked aromatic
[17:57:46] non-ring atom 6 marked aromatic
[17:57:46] Can't kekulize mol.  Unkekulized atoms: 6 7 8 10 11
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 10 marked aromatic
[17:57:46] non-ring atom 4 marked aromatic
[17:57:46] non-ring atom 4 marked aromatic
[17:57:46] non-ring atom 0 marked aromatic
[17:57:46] Explicit valence for atom # 0 C, 5, is greater than


=== Resolving NRP-PK hybrid monomers candidates

== Running: java -jar /home/ilianolhin/git/nerpa2/external_tools/rBAN/rBAN-1.0.jar -inputFile /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/hybrid_monomers_input.json -outputFolder /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/ -outputFileName hybrid_monomers_output.json -monomersDB /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/rban_monomers_db.json

  Process completed successfully. Compounds analysed: 294

== Done



results will be in /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/NRP_Variants

Getting NRP variants took 6.7001 seconds

Generating linearizations took 0.0091 seconds

Parsing HMMs from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/hmms.json
Parsing NRP linearizations from /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/hmms.json
Matching 66 HMMs agains

  dp[next_state][disc_new_path_lp] += number_of_paths
  dp[next_state][disc_new_path_lp] += number_of_paths



RESULTS:
  Main report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/report.tsv
  HTML report is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/report.html
  Detailed reports are saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/matches_details
Writing results took 4.4717 seconds
  Log is saved to /home/ilianolhin/git/nerpa2/benchmarking/cross_validation_results/subset_4/nerpa.log

Finished: 2025-06-11 17:58:09
Elapsed time: 0:00:28.085012

Thank you for using Nerpa!
