1. Split MIBiG compounds into 5 groups for cross-validation based on BigScape families.

In [18]:
from math import ceil
from pathlib import Path
import pandas as pd

# Root of the local nerpa2 checkout; the input table below is resolved relative to it.
nerpa_dir = Path('/home/ilianolhin/git/nerpa2')
mibig_info_tsv = nerpa_dir / 'scripts/build_mibig_info_table/mibig_bgcs_info.tsv'

# Read the tab-separated MIBiG BGC table and keep an independent copy of only the
# rows flagged as present in the approved matches.
mibig_bgcs_table = pd.read_csv(mibig_info_tsv, sep='\t')
is_approved = mibig_bgcs_table["in_approved_matches"] == True
mibig_compounds_approved = mibig_bgcs_table.loc[is_approved].copy()

# Rows without a BigScape family are only reported here, they are not removed.
if mibig_compounds_approved["bigscape_families"].isna().any():
    print('Warning! Missing bigscape_families values present.')

# Map every BigScape family to the list of BGC ids that belong to it
# (groupby yields the families in sorted order and skips missing family values).
family_to_bgcs = {
    family_name: list(bgc_ids)
    for family_name, bgc_ids in mibig_compounds_approved.groupby("bigscape_families")["bgc_id"]
}

n_groups = 5  # Number of cross-validation groups
total_bgcs = sum(map(len, family_to_bgcs.values()))
target_group_size = ceil(total_bgcs / n_groups)

# Walk the families once, filling one group at a time: a group is closed as soon as
# it holds at least target_group_size BGCs, and a family is never split between groups.
family_to_sample_group = {}
current_group = 0
current_group_size = 0
for family_name, bgcs in family_to_bgcs.items():
    if current_group_size >= target_group_size:
        # The current group reached its target; continue with a fresh one.
        current_group += 1
        current_group_size = 0
    if current_group == n_groups:
        break  # every group is full; leftovers are caught by the assertion below
    family_to_sample_group[family_name] = current_group
    current_group_size += len(bgcs)

# Sanity check: every family must have received a group.
assert len(family_to_sample_group) == len(family_to_bgcs), "Not all families are assigned to a group"

# Look up each row's family in the family -> group mapping. The nullable 'Int64'
# dtype keeps the group indices integral even when a row has no mapped family.
sample_group_per_row = mibig_compounds_approved["bigscape_families"].map(family_to_sample_group)
mibig_compounds_approved["sample_group"] = sample_group_per_row.astype('Int64')

# Preview the first rows together with the new sample_group column.
print(mibig_compounds_approved.head())

        bgc_id  num_a_domains origins bigscape_families   compound_id  \
15  BGC0000296              6  mibig3          NRPS_277  BGC0000296.0   
27  BGC0000305             11  mibig3  NRPS_285_mix_285  BGC0000305.0   
28  BGC0000306              6  mibig3  NRPS_286_mix_286  BGC0000306.0   
29  BGC0000307              9  mibig3  NRPS_287_mix_287  BGC0000307.0   
34  BGC0000310             12  mibig3  NRPS_290_mix_290  BGC0000310.0   

    num_recognized_nodes  iso_class_idx  in_approved_matches  sample_group  
15                    11           3006                 True             3  
27                    11           1782                 True             3  
28                     6            368                 True             3  
29                     9           3138                 True             3  
34                    12            612                 True             3  


2. Split the approved matches into testing and training subsets, one pair per cross-validation group of MIBiG compounds.

In [19]:
from itertools import chain
from src.matching.match_type import Match
# Parse the approved matches file: records are separated by blank lines and
# whitespace-only chunks are skipped. The parsing itself is lazy.
approved_matches_txt = nerpa_dir / 'test_data/approved_matches/approved_matches.txt'
approved_matches_strs = approved_matches_txt.read_text().split('\n\n')
approved_matches = map(Match.from_str, filter(str.strip, approved_matches_strs))

# Index the matches by match.nrp_variant_id.nrp_id; if two matches share an id,
# the later one replaces the earlier one.
approved_matches_per_compound = {}
for approved_match in approved_matches:
    approved_matches_per_compound[approved_match.nrp_variant_id.nrp_id] = approved_match

# Make sure the output directory for the cross-validation subsets exists.
benchmarking_dir = nerpa_dir / 'benchmarking'
approved_matches_subsets_dir = benchmarking_dir / 'approved_matches_subsets'
approved_matches_subsets_dir.mkdir(parents=True, exist_ok=True)

# sample_group -> list of compound ids assigned to that group
# (rows whose sample_group is missing are left out by groupby).
subsets = mibig_compounds_approved.groupby('sample_group')['compound_id'].apply(list).to_dict()

# Non-fatal consistency check between the compounds that have an approved match and
# the compounds assigned to a cross-validation group. A mismatch is only reported, not
# raised: compounds missing from `subsets` never end up in a testing subset, and
# compounds missing from the approved matches have no match to look up.
compounds_in_subsets = set(chain.from_iterable(subsets.values()))
compounds_with_matches = set(approved_matches_per_compound.keys())
if compounds_in_subsets != compounds_with_matches:
    print("Warning! The set of approved matches does not match the set of MIBiG compounds in subsets.\n"
          f"subsets \\ approved_matches = {compounds_in_subsets - compounds_with_matches}\n"
          f"approved_matches \\ subsets = {compounds_with_matches - compounds_in_subsets}")

# For every cross-validation group write two files: the approved matches of the group's
# compounds (testing) and the approved matches of all remaining compounds (training).
subsets_dir = benchmarking_dir / 'approved_matches_subsets'

for group_idx, compound_ids in subsets.items():
    # Set gives constant-time membership tests in the training filter below;
    # the original list is still used for the testing subset to keep its order.
    held_out_ids = set(compound_ids)

    # Raises KeyError if a compound of this group has no approved match.
    testing_subset = [approved_matches_per_compound[nrp_id]
                      for nrp_id in compound_ids]
    training_subset = [match
                       for match in approved_matches_per_compound.values()
                       if match.nrp_variant_id.nrp_id not in held_out_ids]

    # Save both subsets; matches are serialized with str() and separated by a blank
    # line, the same separator the approved matches file is split on when it is read.
    testing_subset_path = subsets_dir / f'testing_subset_{group_idx}.txt'
    training_subset_path = subsets_dir / f'training_subset_{group_idx}.txt'
    testing_subset_path.write_text('\n\n'.join(map(str, testing_subset)))
    training_subset_path.write_text('\n\n'.join(map(str, training_subset)))