In [4]:
from pathlib import Path
import json

mibig3_gbks_dir = Path('/home/ilianolhin/programs/antismash7/mibig_gbk_3.1')
mibig3_jsons_dir = Path('/home/ilianolhin/programs/antismash7/mibig_json_3.1')
mibig4_gbks_dir = Path('/home/ilianolhin/programs/antismash7/mibig_gbk_4.0')
mibig4_jsons_dir = Path('/home/ilianolhin/programs/antismash7/mibig_json_4.0')

# Collect the ids of MIBiG 3.1 entries whose 'biosyn_class' value contains 'NRP'
# (presumably a list of class names, so hybrids are included -- verify against
# the MIBiG 3.1 schema) and that also have a GenBank file in mibig3_gbks_dir.
mibig3_bgcs_nrp_plus = set()
for mibig3_entry_json in mibig3_jsons_dir.iterdir():
    # Skip anything that is not a .json entry: a stray file or sub-directory
    # would otherwise make read_text()/json.loads() raise and abort the cell.
    if mibig3_entry_json.suffix != '.json':
        continue
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    mibig3_entry_data = json.loads(mibig3_entry_json.read_text(encoding='utf-8'))
    if 'NRP' not in mibig3_entry_data['cluster']['biosyn_class']:
        continue
    entry_id = mibig3_entry_json.stem
    # Only include entries that have a corresponding .gbk file
    if (mibig3_gbks_dir / f'{entry_id}.gbk').exists():
        mibig3_bgcs_nrp_plus.add(entry_id)

# Collect the ids of MIBiG 4.0 entries that list an 'NRPS' biosynthesis class
# and that also have a GenBank file in mibig4_gbks_dir.
mibig4_bgcs_nrp_plus = set()
for mibig4_entry_json in mibig4_jsons_dir.iterdir():
    # Skip anything that is not a .json entry: a stray file or sub-directory
    # would otherwise make read_text()/json.loads() raise and abort the cell.
    if mibig4_entry_json.suffix != '.json':
        continue
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    mibig4_entry_data = json.loads(mibig4_entry_json.read_text(encoding='utf-8'))
    # Each item of ['biosynthesis']['classes'] is a mapping with a 'class' key.
    biosynth_classes = [biosynth_class['class']
                        for biosynth_class in mibig4_entry_data['biosynthesis']['classes']]
    if 'NRPS' not in biosynth_classes:
        continue
    entry_id = mibig4_entry_json.stem
    # Only include entries that have a corresponding .gbk file
    if (mibig4_gbks_dir / f'{entry_id}.gbk').exists():
        mibig4_bgcs_nrp_plus.add(entry_id)


In [5]:
# Ids selected from MIBiG 4.0 that were not selected from MIBiG 3.1, in sorted order.
mibig4_wo_3 = sorted(mibig4_bgcs_nrp_plus.difference(mibig3_bgcs_nrp_plus))

# Text file that receives one GenBank path per line.
as_input_gbk_paths = Path('/home/ilianolhin/programs/antismash7/as_input_gbk_paths.txt')

# Paths have the form '<gbk directory name>/<id>.gbk', i.e. they are relative
# to the directory that contains mibig4_gbks_dir.
paths_to_mibig4_wo_3 = []
for mibig_id in mibig4_wo_3:
    paths_to_mibig4_wo_3.append(f'{mibig4_gbks_dir.name}/{mibig_id}.gbk')

with as_input_gbk_paths.open('w') as f:
    f.writelines(f'{path}\n' for path in paths_to_mibig4_wo_3)

In [6]:
import yaml
# Load the first variant from every .yaml file in the BGC_variants folder.
bgc_variants = []
# Sort the directory listing: iterdir() yields entries in arbitrary order, and
# the order of bgc_variants fixes the row order of anything built from it.
for bgc_variant_yaml in sorted(Path('./nerpa_results_pnrpdb2_vs_mibig34/BGC_variants').iterdir()):
    if bgc_variant_yaml.suffix != '.yaml':
        continue
    variants = yaml.safe_load(bgc_variant_yaml.read_text())
    if not variants:
        # An empty YAML document (None) or an empty list has no first variant to take.
        continue
    # only keep the first variant of each BGC (they all have the same origins and number of A domains)
    bgc_variants.append(variants[0])


In [7]:
# Create a table with the following columns:
# bgc_id, num_a_domains (len(bgc_variant['modules'])), origins (mibig3 or mibig4)
import pandas as pd
def get_bgc_id(bgc_variant):
    """Return the 'genome_id' nested inside a BGC variant record.

    The value is read from bgc_variant['bgc_variant_id']['bgc_id']['genome_id'];
    a KeyError is raised if any of these keys is missing.
    """
    bgc_id_record = bgc_variant['bgc_variant_id']['bgc_id']
    return bgc_id_record['genome_id']

# mibig4_wo_3 is a sorted list, so `x in mibig4_wo_3` scans it linearly for
# every variant; a set gives constant-time membership tests.
mibig4_only_ids = set(mibig4_wo_3)

# Resolve each variant's id once and reuse it for both the id and origin columns.
bgc_ids = [get_bgc_id(bgc_variant) for bgc_variant in bgc_variants]

data = {
    'bgc_id': bgc_ids,
    # One entry per element of bgc_variant['modules'].
    'num_a_domains': [len(bgc_variant['modules']) for bgc_variant in bgc_variants],
    # Every id that is not in mibig4_wo_3 is labelled 'mibig3'.
    'origins': ['mibig4' if bgc_id in mibig4_only_ids else 'mibig3'
                for bgc_id in bgc_ids],
}
df = pd.DataFrame(data)
# Save the DataFrame to a TSV file
df.to_csv('bgc_variants_mibig3_mibig4.tsv', sep='\t', index=False)