In [1]:
# Load BIG-SCAPE clustering
import pandas as pd
def _strip_region_suffix(bgc_name):
    """Return the part of a BGC name that precedes its first dot."""
    return bgc_name.split('.')[0]


# Read the tab-separated clustering table.
nrps_clustering = pd.read_csv('./NRPS_clans_0.30_0.70.tsv', sep='\t')

# Names such as "BGC0000331.region001" are reduced to the bare "BGC0000331".
nrps_clustering['#BGC Name'] = nrps_clustering['#BGC Name'].apply(_strip_region_suffix)

# Rows that became fully identical after the renaming are collapsed into one.
nrps_clustering = nrps_clustering.drop_duplicates()

# After de-duplication every BGC must appear exactly once; otherwise the same
# BGC is listed with conflicting values and the table cannot be used as a lookup.
assert nrps_clustering['#BGC Name'].is_unique, "There are duplicate BGC IDs in the NRPS clustering table."


In [2]:
from pathlib import Path

# BGC variants table (tab-separated), read from the working directory.
bgc_variants_table = pd.read_csv('./bgc_variants_mibig3_mibig4.tsv', sep='\t')
# rBAN graphs table (tab-separated), read from the working directory.
rban_graphs_table = pd.read_csv('./rban_graphs_filtered.tsv', sep='\t')

# Approved matches file: only lines starting with "NRP:" (after stripping
# surrounding whitespace) are used; the ID is the text between the first and
# the second colon, stripped of whitespace.
approved_matches_path = Path('/home/ilianolhin/git/nerpa2/test_data/approved_matches/approved_matches.txt')
nrp_ids_approved = {
    stripped_line.split(':')[1].strip()
    for stripped_line in map(str.strip, approved_matches_path.read_text().splitlines())
    if stripped_line.startswith('NRP:')
}

def nrp_id_to_bgc_id(nrp_id: str) -> str:
    """Return the BGC ID embedded in an NRP ID.

    The BGC ID is everything before the first '.' in ``nrp_id`` (IDs are
    assumed to look like 'BGCID.variant'). An ID without a dot is returned
    unchanged.
    """
    bgc_id, _separator, _variant = nrp_id.partition('.')
    return bgc_id

def _first_row_lookup(table, key_column, value_columns):
    """Map each key to the requested values of the first row carrying that key.

    Args:
        table: DataFrame to index.
        key_column: Name of the column whose values become the dictionary keys.
        value_columns: List of column names to keep for each key.

    Returns:
        ``{key: {column: value, ...}, ...}``. When a key occurs in several
        rows, only the first such row is used (the same row that
        ``table.loc[table[key_column] == key, column].values[0]`` would pick).
    """
    first_rows = table.drop_duplicates(subset=key_column, keep='first')
    return first_rows.set_index(key_column)[value_columns].to_dict(orient='index')


# Build the lookups once, so the loop below performs dictionary lookups instead
# of re-scanning whole DataFrame columns several times for every compound.
clustering_by_bgc_id = _first_row_lookup(
    nrps_clustering, '#BGC Name', ['Clan Number', 'Family Number'])
variants_by_bgc_id = _first_row_lookup(
    bgc_variants_table, 'bgc_id', ['num_a_domains', 'origins'])
graphs_by_compound_id = _first_row_lookup(
    rban_graphs_table, 'compound_id', ['num_recognized_nodes', 'iso_class_idx'])

# Column order of the output table; also used to give an empty result a header.
output_columns = [
    'bgc_id',
    'num_a_domains',
    'origins',
    'bigscape_clan',
    'bigscape_family',
    'compound_id',
    'num_recognized_nodes',
    'iso_class_idx',
    'in_approved_matches',
]

data_rows = []
for compound_id in rban_graphs_table['compound_id']:
    # Only compounds whose ID starts with 'BGC' are considered; for those the
    # compound ID is used as the NRP ID.
    if not compound_id.startswith('BGC'):
        continue
    nrp_id = compound_id

    # Keep the compound only if its BGC is listed in the BGC variants table.
    bgc_id = nrp_id_to_bgc_id(nrp_id)
    variant_info = variants_by_bgc_id.get(bgc_id)
    if variant_info is None:
        continue

    clustering_info = clustering_by_bgc_id.get(bgc_id)
    if clustering_info is None:
        print('Warning: BGC ID', bgc_id, 'not found in NRPS clustering table.'
              ' It will be assigned to a placeholder clan number -1.')
        bigscape_clan, bigscape_family = -1, -1  # placeholder for BGCs not in NRPS clustering
    else:
        bigscape_clan = clustering_info['Clan Number']
        bigscape_family = clustering_info['Family Number']

    graph_info = graphs_by_compound_id[compound_id]
    data_rows.append({
        'bgc_id': bgc_id,
        'num_a_domains': variant_info['num_a_domains'],
        'origins': variant_info['origins'],
        'bigscape_clan': bigscape_clan,
        'bigscape_family': bigscape_family,
        'compound_id': nrp_id,
        'num_recognized_nodes': graph_info['num_recognized_nodes'],
        'iso_class_idx': graph_info['iso_class_idx'],
        'in_approved_matches': nrp_id in nrp_ids_approved
    })

# Create a DataFrame from the data rows. Passing the columns explicitly keeps
# the 'compound_id' column present even when no compound qualified, so the
# sort below does not fail and a header-only file is written in that case.
df_final = pd.DataFrame(data_rows, columns=output_columns)
# Sort the DataFrame by compound_id
df_final.sort_values(by='compound_id', inplace=True)
# Save the final DataFrame to a TSV file
df_final.to_csv('mibig_bgcs_info.tsv', sep='\t', index=False)

