In [19]:
import yaml
from pathlib import Path
rban_graphs_yaml = Path('/home/ilianolhin/git/nerpa2/training/rban_graphs.yaml')
parsed_rban_records = yaml.safe_load(rban_graphs_yaml.read_text())

In [20]:
print(f'Total number of putative NRPs', len(parsed_rban_records))

Total number of putative NRPs 9254


In [30]:
print('Example record:')
print(next(record for record in parsed_rban_records if record['compound_id'].startswith('BGC0000963')))


Example record:
{'compound_id': 'BGC0000963.0', 'nodes': {1: 'aThr/Thr', 2: 'Ala', 3: 'Gal/Man/bGal/Glc', 4: 'X1', 5: 'OH-His', 6: 'dhCys', 7: 'dhCys', 8: 'X2', 9: 'bAla', 10: 'Asn', 11: 'X3', 12: 'X4'}, 'edges': [[2, 1, 'AMINO'], [4, 3, 'GLYCOSIDIC'], [3, 5, 'GLYCOSIDIC'], [7, 6, 'THIAZOLE_CYCLE'], [8, 5, 'AMINO'], [1, 9, 'AMINO'], [9, 7, 'THIAZOLE_CYCLE'], [10, 8, 'PYRIMIDINE_CYCLE'], [5, 2, 'AMINO'], [6, 11, 'AMINO'], [12, 10, 'NITROGEN_CARBON2']]}


In [22]:
from collections import defaultdict
import networkx as nx

def build_graph(record):
    graph = nx.DiGraph()
    graph.add_nodes_from((u, {'residue': res}) for u, res in record['nodes'].items())
    graph.add_edges_from((u, v, {'bondType': bondType}) for u, v, bondType in record['edges'])
    return graph

def graphs_equal(G1, G2):
    def node_equal(n1, n2):
        return n1['residue'] == n2['residue'] or \
            (n1['residue'].startswith('X') and n2['residue'].startswith('X'))
    return nx.is_isomorphic(G1, G2, node_match=lambda n1, n2: node_equal(n1, n2),
                            edge_match=lambda e1, e2: e1['bondType'] == e2['bondType'])
    
bgc_graphs = defaultdict(list)
for record in parsed_rban_records:
    bgc_id = record['compound_id'].split('.')[0]
    graph = build_graph(record)
    if bgc_id.startswith('BGC') and not any(graphs_equal(graph, bgc_graph) 
                                            for bgc_graph, compound_id in bgc_graphs[bgc_id]):
        bgc_graphs[bgc_id].append((graph, record['compound_id']))

In [23]:
print('Number of MIBiG BGCs with graphs:', len(bgc_graphs))

Number of MIBiG BGCs with graphs: 684


In [34]:
def identical_graph_sets(graphs1, graphs2):
    return all(any(graphs_equal(graph1, graph2) for graph2, compound_id2 in graphs2) for graph1, compound_id1 in graphs1)

filtered_bgcs = []
duplicates = []
for bgc_id, graphs in bgc_graphs.items():
    if not any(identical_graph_sets(bgc_graphs[bgc_id], bgc_graphs[other_bgc_id]) 
               for other_bgc_id in filtered_bgcs):
        filtered_bgcs.append(bgc_id)
    else:
        duplicates.append(bgc_id)

print('Number of unique BGCs (grouped by compounds monomer graphs):', len(filtered_bgcs))
print('Number of unique compounds:', sum(len(bgc_graphs[bgc_id]) for bgc_id in filtered_bgcs))
print('Duplicates:', sorted(duplicates))

Number of unique BGCs (grouped by compounds monomer graphs): 474
Number of unique compounds: 799
Duplicates: ['BGC0000064', 'BGC0000145', 'BGC0000150', 'BGC0000153', 'BGC0000287', 'BGC0000292', 'BGC0000293', 'BGC0000300', 'BGC0000303', 'BGC0000309', 'BGC0000327', 'BGC0000360', 'BGC0000373', 'BGC0000375', 'BGC0000384', 'BGC0000407', 'BGC0000409', 'BGC0000412', 'BGC0000414', 'BGC0000415', 'BGC0000418', 'BGC0000436', 'BGC0000440', 'BGC0000442', 'BGC0000444', 'BGC0000448', 'BGC0000453', 'BGC0000467', 'BGC0000816', 'BGC0000818', 'BGC0000893', 'BGC0000959', 'BGC0000961', 'BGC0000963', 'BGC0000967', 'BGC0000971', 'BGC0000973', 'BGC0000974', 'BGC0000976', 'BGC0000977', 'BGC0000978', 'BGC0000979', 'BGC0000980', 'BGC0000981', 'BGC0000983', 'BGC0000989', 'BGC0000990', 'BGC0000991', 'BGC0000994', 'BGC0000995', 'BGC0000996', 'BGC0000999', 'BGC0001001', 'BGC0001006', 'BGC0001011', 'BGC0001013', 'BGC0001015', 'BGC0001017', 'BGC0001022', 'BGC0001024', 'BGC0001026', 'BGC0001030', 'BGC0001031', 'BGC0001

In [25]:
import json
from pathlib import Path
pnrp_rban_json = Path('/home/ilianolhin/git/nerpa2/training/mibig_filtering/pnrpdb-mibig.rban.json') 
rban_records = json.loads(pnrp_rban_json.read_text())


In [26]:
print(next(record for record in rban_records if record['id'].startswith('BGC0000963')))

{'id': 'BGC0000963.0', 'isomericSmiles': 'CC1=C(N=C(N=C1N)C(CC(=O)N)NCC(C(=O)N)N)C(=O)NC(C(C2=CN=CN2)OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(C(O4)CO)O)OC(=O)N)O)C(=O)NC(C)C(C(C)C(=O)NC(C(C)O)C(=O)NCCC5=NC(=CS5)C6=NC(=CS6)C(=O)NCCC[S+](C)C)O', 'canonicalSmiles': None, 'coverage': 0.53125, 'correctness': 0.0, 'missingMonomers': 5, 'monomericGraph': {'monomericGraph': {'monomers': [{'monomer': {'index': 1, 'atoms': [66, 67, 68, 71, 69, 70, 72], 'bonds': [70, 71, 74, 72, 73, 75], 'monomer': {'id': 769, 'cid': '205', 'monomer': 'aThr/Thr', 'codes': ['aThr', 'D-aThr', 'D-Thr', 'Thr'], 'names': ['allo-Threonine', 'D-allo-Threonine', 'D-Threonine', 'Threonine'], 'smiles': 'CC(O)C(N)C(=O)O', 'mwHeavyAtoms': 109.98781786, 'isNew': False, 'isIdentified': True, 'compounds': None, 'compoundsCount': 0}}}, {'monomer': {'index': 2, 'atoms': [58, 59, 60, 61, 62, 95, 63, 64, 65], 'bonds': [62, 63, 64, 65, 100, 66, 67, 68], 'monomer': {'id': 7662, 'cid': '0', 'monomer': 'X0', 'codes': ['X0'], 'names': [], 'smiles'

In [27]:
smiles_for_bgcs = defaultdict(list)
for bgc_id in filtered_bgcs:
    for graph, compound_id in bgc_graphs[bgc_id]:
        try:
            smiles = next(record['isomericSmiles'] for record in rban_records if record['id'] == compound_id)
        except StopIteration:
            print('No SMILES for', compound_id)
            continue
        smiles_for_bgcs[bgc_id].append((compound_id, smiles))

print('Number of unique BGCs with SMILES:', len(smiles_for_bgcs))
print('Number of unique compounds:', sum(len(smiles_for_bgcs[bgc_id]) for bgc_id in smiles_for_bgcs))

Number of unique BGCs with SMILES: 474
Number of unique compounds: 799


In [28]:
for bgc_id in sorted(filtered_bgcs):
    if bgc_id not in smiles_for_bgcs:
        continue
    with open(f'/home/ilianolhin/git/nerpa2/training/bgc_compounds/{bgc_id}.tsv', 'w') as f:
        f.write('ID\tSMILES\n')
        for compound_id, smiles in smiles_for_bgcs[bgc_id]:
            f.write(f'{compound_id}\t{smiles}\n')


In [29]:
with open('bgc_ids.txt', 'w') as f:
    for bgc_id in sorted(filtered_bgcs):
        if smiles_for_bgcs[bgc_id]:
            f.write(f'{bgc_id}\n')