In [3]:
# Load BIG-SCAPE clustering
import pandas as pd
from pathlib import Path
# Root directory of one BiG-SCAPE run. Every sub-directory is treated as one BGC
# type and is expected to contain '<type>_clustering_c0.30.tsv'.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
bigscape_results_dir = Path('/home/ilianolhin/git/nerpa2/scripts/build_mibig_info_table/bigscape_clustering/network_files/2025-06-10_16-07-56_hybrids_glocal')
# Path.iterdir() yields entries in arbitrary (filesystem-dependent) order.
# Sorting makes the order of clustering_per_type -- and therefore the order of
# the parts in the labels built by bgc_id_to_combined_families -- reproducible.
bigscape_bgc_types = sorted(f.name
                            for f in bigscape_results_dir.iterdir()
                            if f.is_dir())
# A previously hard-coded alternative to the directory scan above was:
# ['NRPS', 'PKS-NRP_Hybrids', 'PKSI', 'PKSother', 'Terpene', 'mix', 'Others']

# Maps BGC type name -> clustering table (DataFrame) read from that type's TSV.
clustering_per_type = {}
for bgc_type in bigscape_bgc_types:
    # 'c0.30' is presumably the clustering cutoff of the run -- TODO confirm.
    clustering_path = bigscape_results_dir / bgc_type / f'{bgc_type}_clustering_c0.30.tsv'
    clustering_table = pd.read_csv(clustering_path, sep='\t')
    clustering_per_type[bgc_type] = clustering_table

def bgc_id_to_combined_families(bgc_id: str) -> str:
    """Build a combined BiG-SCAPE family label for a BGC.

    Every table in the module-level ``clustering_per_type`` mapping is searched
    for rows whose '#BGC Name' equals ``bgc_id``. For each table with a hit, the
    'Family Number' of the first matching row contributes one
    '<bgc_type>_<family_number>' part.

    Returns the parts joined with '_' in the iteration order of
    ``clustering_per_type``, or an empty string when no table contains the BGC.
    """
    label_parts = []
    for type_name, table in clustering_per_type.items():
        is_match = table['#BGC Name'] == bgc_id
        if not is_match.any():
            continue
        first_family = table.loc[is_match, 'Family Number'].values[0]
        label_parts.append(f'{type_name}_{first_family}')
    return '_'.join(label_parts)


In [9]:
from pathlib import Path

# BGC variants table, read from ./bgc_variants_mibig3_mibig4.tsv
bgc_variants_table = pd.read_csv('./bgc_variants_mibig3_mibig4.tsv', sep='\t')
# rBAN graphs table, read from ./rban_graphs_filtered.tsv
rban_graphs_table = pd.read_csv('./rban_graphs_filtered.tsv', sep='\t')

# Approved matches file: every line that (after stripping whitespace) starts
# with 'NRP:' contributes one NRP ID -- the stripped text between the first
# and the second ':' on that line.
approved_matches_path = Path('/home/ilianolhin/git/nerpa2/test_data/approved_matches/approved_matches.txt')
nrp_ids_approved = {
    stripped_line.split(':')[1].strip()
    for stripped_line in map(str.strip, approved_matches_path.read_text().splitlines())
    if stripped_line.startswith('NRP:')
}

def nrp_id_to_bgc_id(nrp_id: str) -> str:
    """Return the part of ``nrp_id`` that precedes its first '.'.

    NRP IDs are assumed to look like '<BGC ID>.<compound index>'; an ID without
    a '.' is returned unchanged.
    """
    bgc_part, _dot, _compound_idx = nrp_id.partition('.')
    return bgc_part

# Build one output row per rBAN compound whose ID starts with 'BGC' and whose
# BGC is present in bgc_variants_table, then write the result to
# 'mibig_bgcs_info.tsv'.

# Hoisted lookups: previously both tables were re-scanned several times for
# every output row. Keeping only the first occurrence of each key reproduces
# the former `.values[0]` semantics.
known_bgc_ids = set(bgc_variants_table['bgc_id'])
bgc_info = bgc_variants_table.drop_duplicates(subset='bgc_id').set_index('bgc_id')
rban_info = rban_graphs_table.drop_duplicates(subset='compound_id').set_index('compound_id')

# Explicit column list so the frame has the expected columns even when no
# compound qualifies (otherwise sorting by 'compound_id' raises KeyError).
output_columns = [
    'bgc_id',
    'num_a_domains',
    'origins',
    'bigscape_families',
    'compound_id',
    'num_recognized_nodes',
    'iso_class_idx',
    'in_approved_matches',
    'deprecated in mibig4',
]

data_rows = []
for compound_id in rban_graphs_table['compound_id']:
    # Skip missing IDs (NaN is a float and has no .startswith) and IDs that do
    # not start with 'BGC'.
    if not isinstance(compound_id, str) or not compound_id.startswith('BGC'):
        continue
    nrp_id = compound_id

    # Only keep compounds whose BGC appears in the BGC variants table.
    bgc_id = nrp_id_to_bgc_id(nrp_id)
    if bgc_id not in known_bgc_ids:
        continue

    bigscape_families_str = bgc_id_to_combined_families(bgc_id)
    if bigscape_families_str == '':
        print('Warning: BGC ID', bgc_id, 'is not in any BIG-SCAPE family')
        bigscape_families_str = 'None'

    data_rows.append({
        'bgc_id': bgc_id,
        'num_a_domains': bgc_info.at[bgc_id, 'num_a_domains'],
        'origins': bgc_info.at[bgc_id, 'origins'],
        'bigscape_families': bigscape_families_str,
        'compound_id': nrp_id,
        'num_recognized_nodes': rban_info.at[compound_id, 'num_recognized_nodes'],
        'iso_class_idx': rban_info.at[compound_id, 'iso_class_idx'],
        'in_approved_matches': nrp_id in nrp_ids_approved,
        # A BGC absent from every BiG-SCAPE clustering table is flagged as
        # deprecated in MIBiG 4.
        'deprecated in mibig4': bigscape_families_str == 'None',
    })

# Assemble the table, ordered by compound_id (non-mutating sort instead of inplace=True)
df_final = pd.DataFrame(data_rows, columns=output_columns).sort_values(by='compound_id')
# Save the final DataFrame to a TSV file
df_final.to_csv('mibig_bgcs_info.tsv', sep='\t', index=False)

# Sanity check: approved matches whose BGC has no BiG-SCAPE family (the recorded run printed an empty frame).
approved_without_family = df_final['in_approved_matches'].astype(bool) & (df_final['bigscape_families'] == 'None')
print(df_final[approved_without_family])

Empty DataFrame
Columns: [bgc_id, num_a_domains, origins, bigscape_families, compound_id, num_recognized_nodes, iso_class_idx, in_approved_matches, deprecated in mibig4]
Index: []
