1. Filter rban graphs to only those with at least 3 recognized nodes (not starting with 'X' and not lipid tails)

In [7]:
import yaml
rban_graphs = yaml.safe_load(open("./nerpa_results_pnrpdb2_vs_mibig34/rban_graphs.yaml"))

def num_recognized_nodes(graph):
    return sum(1 for node_idx, node_name in graph['nodes'].items()
               if not node_name.startswith('X')  # unrecognized nodes start with 'X'
               and not ':' in node_name)  # nodes with ':' are lipid tails

rban_graphs_filtered = [graph for graph in rban_graphs  # at least 3 "proper" nodes
                        if num_recognized_nodes(graph) >= 0]
print(f'{len(rban_graphs_filtered)}'
      f' out of {len(rban_graphs)} rban graphs have at least 0 recognized nodes')

9725 out of 9725 rban graphs have at least 0 recognized nodes


2. Find classes of isomorphic graphs.

In [8]:
from networkx import DiGraph, is_isomorphic

def rban_graph_to_nx_graph(graph):
    G = DiGraph()
    for node_idx, node_name in graph['nodes'].items():
        G.add_node(node_idx, name=node_name)
    for (u, v, bond_type) in graph['edges']:
        G.add_edge(u, v, bond_type=bond_type)
    return G

def graphs_isomorphic(graph1: DiGraph, graph2: DiGraph) -> bool:
    def node_match(n1, n2):
        name1, name2 = n1.get('name'), n2.get('name')
        return any([name1 == name2,  # same amino acid
                    name1.startswith('X') and name2.startswith('X'),  # both unknown
                    ':' in name1 and ':'])  # both lipid tails

    def edge_match(e1, e2):
        return e1.get('bond_type') == e2.get('bond_type')

    return is_isomorphic(graph1, graph2, node_match=node_match, edge_match=edge_match)

print(f'Finding isomorphic classes among {len(rban_graphs_filtered)} rban graphs')
free_iso_class_idx = 0
compound_id_to_nx_graph = {graph['compound_id']: rban_graph_to_nx_graph(graph)
                           for graph in rban_graphs_filtered}

compound_id_to_iso_idx = {}
for compound_id, nx_graph in compound_id_to_nx_graph.items():
    for prev_compound_id, iso_class_idx in compound_id_to_iso_idx.items():
        if graphs_isomorphic(compound_id_to_nx_graph[prev_compound_id],
                             compound_id_to_nx_graph[compound_id]):
            compound_id_to_iso_idx[compound_id] = iso_class_idx
            break
    else:
        compound_id_to_iso_idx[compound_id] = free_iso_class_idx
        free_iso_class_idx += 1

print(f'Found {len(set(compound_id_to_iso_idx.values()))} isomorphic classes among the filtered rban graphs')


Finding isomorphic classes among 9725 rban graphs
Found 5672 isomorphic classes among the filtered rban graphs


3. Load BGC variants from nerpa_results_pnrpdb2_vs_mibig34/BGC_variants

In [None]:
from pathlib import Path
import yaml
bgc_variants = []
for bgc_variant_yaml in Path('./nerpa_results_pnrpdb2_vs_mibig34/BGC_variants').iterdir():
    if bgc_variant_yaml.suffix == '.yaml':
        bgc_variant_data = yaml.safe_load(bgc_variant_yaml.read_text())
        # only keep the first variant of each BGC (they all have the same origins and number of A domains)
        if bgc_variant_data['bgc_variant_id']['variant_idx'] == 0:
            bgc_variants.append(bgc_variant_data)


3. Create a pd.DataFrame with the following columns: compound_id, num_recognized_nodes, iso_class_idx

In [9]:
# Create a DataFrame with the required columns
import pandas as pd
data = {
    'compound_id': list(compound_id_to_iso_idx.keys()),
    'num_recognized_nodes': [num_recognized_nodes(graph) for graph in rban_graphs_filtered],
    'iso_class_idx': list(compound_id_to_iso_idx.values())
}
df = pd.DataFrame(data)
# Save the DataFrame to a TSV file
df.to_csv('./nerpa_results_pnrpdb2_vs_mibig34/rban_graphs_filtered.tsv', sep='\t', index=False)