# Introduction

# Imports

In [2]:
from pathlib import Path
from harbor.plotting.ligands import plot_aligned_ligands

To avoid having this repo depend directly on the asapdiscovery repo, I'm commenting this out, but we'll use a test example from the COVID Moonshot molecules:
```
from asapdiscovery.data.testing.test_resources import fetch_test_file
mypath = fetch_test_file("Mpro_combined_labeled.sdf")
``` 

In [3]:
mypath = Path("../data/Mpro_combined_labeled.sdf")

## use p-series curated subset

In [20]:
# Overrides the `mypath` assigned in the earlier cell, so this is the file that gets loaded
# when the notebook is run top to bottom.
# NOTE(review): this path is relative to "./data" while the earlier one uses "../data" —
# confirm which working directory the notebook is expected to run from.
mypath = Path("./data/combined_3d.sdf")

I'm copying this code from the asapdiscovery repo.
Once that is conda-installable, I'll make it a dependency of this repo and use its tools for loading molecules.

# Load Molecules

In [5]:
from rdkit import Chem
# Open the SDF file at `mypath`; the supplier is turned into a plain list in the next cell.
mols = Chem.SDMolSupplier(str(mypath))

In [6]:
# Materialize the supplier into a list, dropping entries that RDKit could not parse
# (the supplier yields None for those) so later calls such as GetPropsAsDict() do not
# fail with an AttributeError on None.
mols = [mol for mol in mols if mol is not None]

In [7]:
import mols2grid

In [8]:
# define the grid to show the scaffolds
grid = mols2grid.display(mols)

MolGridWidget()

In [9]:
grid

# MCSS-based Clustering

In [10]:
from harbor.clustering.hierarchical import ClusterResults, ClusterCenter, HeirarchicalClustering
from openeye import oechem

In [11]:
mol: Chem.Mol = mols[0]
mol.GetPropsAsDict()

{'compound_name': 'EDG-MED-5d232de5-6',
 'provenance': '{"isomeric_smiles": "CN1CC[C@H](c2c1ccc(c2)Cl)C(=O)Nc3cncc4c3cccc4", "inchi": "InChI=1S/C20H18ClN3O/c1-24-9-8-16(17-10-14(21)6-7-19(17)24)20(25)23-18-12-22-11-13-4-2-3-5-15(13)18/h2-7,10-12,16H,8-9H2,1H3,(H,23,25)/t16-/m1/s1", "inchi_key": "NIZLDWOVHDTVKM-MRXNPFEDSA-N", "fixed_inchi": "InChI=1/C20H18ClN3O/c1-24-9-8-16(17-10-14(21)6-7-19(17)24)20(25)23-18-12-22-11-13-4-2-3-5-15(13)18/h2-7,10-12,16H,8-9H2,1H3,(H,23,25)/t16-/m1/s1/f/h23H", "fixed_inchikey": "NIZLDWOVHDTVKM-ZJHKTUGWNA-N"}',
 'data_format': 'sdf'}

In [16]:
# Convert every RDKit molecule into an OpenEye molecule by round-tripping through SMILES,
# recording each molecule's compound name in the same order as the converted molecules.
oemols = []
mol_ids = []
for rdkit_mol in mols:
    smiles = Chem.MolToSmiles(rdkit_mol)
    properties = rdkit_mol.GetPropsAsDict()
    compound_name = properties["compound_name"]
    # Use a dedicated name so the RDKit `mol` inspected in the previous cell is not
    # rebound to an OpenEye object.
    oemol = oechem.OEMol()
    # OESmilesToMol reports failure through its boolean return value; fail loudly rather
    # than silently passing an empty molecule on to the clustering.
    if not oechem.OESmilesToMol(oemol, smiles):
        raise ValueError(f"OpenEye could not parse the SMILES for {compound_name}: {smiles}")
    # Append both lists together so they can never get out of step.
    mol_ids.append(compound_name)
    oemols.append(oemol)

In [17]:
from harbor.clustering import hierarchical as h
from importlib import reload
reload(h)

<module 'harbor.clustering.hierarchical' from '/Users/alexpayne/Scientific_Projects/harbor/harbor/clustering/hierarchical.py'>

In [18]:
clusterer = h.HeirarchicalClustering(molecules=oemols, mol_ids=mol_ids)

In [19]:
# Run the clustering with hard-coded settings (max_iterations=50, cutoff=12).
# NOTE(review): the progress output recorded below shows the first pass alone taking
# about 14 minutes, so consider caching `clusters` rather than recomputing on every run.
# NOTE(review): the meaning and units of `cutoff` are not visible in this notebook —
# confirm against the clustering implementation.
clusters = clusterer.cluster(max_iterations=50, cutoff=12)

100%|██████████| 205/205 [14:17<00:00,  4.18s/it]


['0_EDG-MED-5d232de5-6', '0_MAT-POS-f2460aef-1', '0_MAT-POS-e69ad64a-2', '0_MAT-POS-50a80394-2', '0_BRU-CON-c4e3408a-1', '0_PET-UNK-7fb4f80a-2', '0_EDJ-MED-2f867453-1', '0_RAL-THA-4aa06b95-7', '0_MAT-POS-4223bc15-12', '0_EDJ-MED-968bafd9-1', '0_MAT-POS-853c0ffa-9', '0_ALP-POS-477dc5b7-4', '0_EDG-MED-971238d3-5', '0_EDG-MED-ba1ac7b9-19', '0_EDG-MED-0e5afe9d-1', '0_MAT-POS-90fd5f68-28', '0_ALP-POS-e0fe77e5-13', '0_MAT-POS-fce787c2-5', '0_EDG-MED-5d232de5-7', '0_MAT-POS-90fd5f68-7', '0_PET-UNK-29afea89-2', '0_MAT-POS-5cd9ea36-22', '0_MAT-POS-78e1d523-1', '0_RAL-THA-2d450e86-26', '0_MAT-POS-8293a91a-8', '0_LON-WEI-9739a092-6', '0_EDG-MED-ba1ac7b9-11', '0_PET-UNK-7fb4f80a-1', '0_EDG-MED-70ae9412-1', '0_VLA-UCB-34f3ed0c-11', '0_MAT-POS-de59a476-4', '0_MAT-POS-5cd9ea36-17', '0_EDG-MED-70ae9412-2', '0_EDG-MED-5d232de5-3', '0_EDJ-MED-c3ea9889-6', '0_PET-UNK-bb7ffe78-1', '0_ALP-POS-869ac754-1', '0_MAT-POS-5d65ec79-2', '0_MAT-POS-7174c657-5', '0_EDG-MED-b1ef7fe3-1', '0_EDJ-MED-43f8f7d6-4', '0_RUB

100%|██████████| 103/103 [02:45<00:00,  1.60s/it]


['1_0', '1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12', '1_13', '1_14', '1_15', '1_16', '1_17', '1_18', '1_19', '1_20', '1_21', '1_22', '1_23', '1_24', '1_25', '1_26', '1_27', '1_28', '1_29', '1_30', '1_31', '1_32', '1_33', '1_34', '1_35', '1_36', '1_37', '1_38', '1_39', '1_40', '1_41', '1_42', '1_43', '1_44', '1_45', '1_46', '1_47', '1_48', '1_49', '0_EDG-MED-5d232de5-6', '0_PET-UNK-7fb4f80a-2', '0_EDJ-MED-2f867453-1', '0_RAL-THA-4aa06b95-7', '0_EDG-MED-ba1ac7b9-19', '0_EDG-MED-0e5afe9d-1', '0_EDG-MED-5d232de5-7', '0_MAT-POS-78e1d523-1', '0_EDG-MED-ba1ac7b9-11', '0_PET-UNK-7fb4f80a-1', '0_MAT-POS-5cd9ea36-17', '0_EDJ-MED-c3ea9889-6', '0_ALP-POS-869ac754-1', '0_MAT-POS-7174c657-5', '0_RUB-POS-1325a9ea-4', '0_EDJ-MED-e4b030d8-11', '0_ALP-POS-8b8a49e1-4', '0_EDG-MED-ba1ac7b9-21', '0_MAT-POS-fce787c2-6', '0_JIN-POS-6dc588a4-6', '0_ALP-UNI-3735e77e-2', '0_MAT-POS-4223bc15-3', '0_ALP-UNI-8d415491-3', '0_EDJ-MED-705e09b8-1', '0_RAL-THA-2d450e86-1', '0_E

100%|██████████| 47/47 [00:31<00:00,  1.51it/s]


['2_0', '2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12', '2_13', '2_14', '2_15', '2_16', '2_17', '2_18', '2_19', '2_20', '1_0', '1_1', '1_3', '1_7', '1_8', '1_11', '1_13', '1_14', '1_15', '1_16', '1_20', '1_21', '1_23', '1_24', '1_34', '1_36', '1_37', '1_40', '1_41', '1_45', '0_EDG-MED-0e5afe9d-1', '0_EDG-MED-5d232de5-7', '0_MAT-POS-fce787c2-6', '0_EDJ-MED-12c4873b-2', '0_MAT-POS-86c60949-2', '0_MAT-POS-5cd9ea36-21']
[[ 0 18 23 ... 11 10 14]
 [18  0 10 ... 25 23 28]
 [23 10  0 ... 10 10 10]
 ...
 [11 25 10 ...  0 23 25]
 [10 23 10 ... 23  0 23]
 [14 28 10 ... 25 23  0]]
[ 6 46  6 43 14 45  0 40 40 32 38 17 46 46 19 37 38 37 40 14 45 17 19 14
  9 26 38 46 38 38 24 29  9  8 46 13 29 17 29 25  7 23  0  3  8  5 27]
[26 28 24 22 24 25 25 30 30 27 21 17 28 23 25 21 21 24 26 25 24 12 24 21
 25 21 21 32 20 21 26 21 27 30 23 23 21 24 21 16 30 21 23 22 26 25 32]
0
0 6 [0 2]
1
1 46 [ 1 12 13 27 34]
2
3
4
4 14 [ 4 19 23]
5
5 45 [ 5 20]
6
7
7 40 [ 7  8 18]
8
9


100%|██████████| 21/21 [00:06<00:00,  3.01it/s]


['3_0', '3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '2_1', '2_4', '2_11', '2_12', '2_13', '2_15', '1_8', '1_16', '1_34', '0_EDG-MED-0e5afe9d-1']
[[ 0 10 10 11 10 10 10 10 10 10 10 10 10  7 10 10 10 10 10 10  9]
 [10  0 13 13 13 21 13 21 13 21 13 13 13 17 13 13 21 21 13  9  9]
 [10 13  0 23 23 13 24 13 23 13 23 23 23  9 23 22 13 13 23 18 20]
 [11 13 23  0 22 13 23 13 24 13 24 24 22  9 24 22 13 13 22 18 19]
 [10 13 23 22  0 13 24 13 22 13 22 22 24  9 22 23 13 13 25 19 20]
 [10 21 13 13 13  0 13 21 13 21 13 13 13 17 13 13 21 21 13  9  9]
 [10 13 24 23 24 13  0 13 23 13 23 23 24  9 23 22 13 13 24 18 21]
 [10 21 13 13 13 21 13  0 13 21 13 13 13 17 13 13 21 21 13  9  9]
 [11 13 23 25 22 13 23 13  0 13 25 28 22  9 28 23 13 13 22 19 19]
 [10 21 13 13 13 21 13 21 13  0 13 13 13 17 13 13 21 21 13  9  9]
 [11 13 23 29 22 13 23 13 25 13  0 25 22  9 25 23 13 13 22 19 19]
 [11 13 23 25 22 13 23 13 28 13 25  0 22  9 28 23 13 13 22 19 19]
 [10 13 23 22 24 13 24 13 22 13 22 2

100%|██████████| 6/6 [00:00<00:00, 18.60it/s]


['4_0', '4_1', '4_2', '4_3', '4_4', '1_34']
[[ 0 13 13 13 13  9]
 [13  0 23 23 23 18]
 [13 23  0 22 24 18]
 [13 23 22  0 22 19]
 [13 23 24 22  0 19]
 [ 9 18 18 19 19  0]]
[4 4 4 1 2 4]
[13 23 24 23 24 19]
0
0 4 [0 1 2 5]
1
1 4 [0 1 2 5]
2
2 4 [0 1 2 5]
3
4
5


100%|██████████| 4/4 [00:00<00:00, 34.79it/s]


['5_0', '4_0', '4_1', '4_3']
[[ 0 13 23 22]
 [13  0 13 13]
 [23 13  0 23]
 [22 13 23  0]]
[2 3 3 2]
[23 13 23 23]
0
0 2 [0 3]
1
1 3 [1 2]
2
3


100%|██████████| 2/2 [00:00<00:00, 71.75it/s]


['6_0', '4_0']
[[ 0 13]
 [13  0]]
[1 0]
[13 13]
0
1


In [21]:
len(clusters)

3

In [33]:
clusters

{'1_38': ClusterCenter(cluster_id='1_38', children=[ClusterCenter(cluster_id='0_JAN-GHE-5a013bed-2', children=['JAN-GHE-5a013bed-2'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x154244de0> >, height=0), ClusterCenter(cluster_id='0_MAT-POS-bfb445d4-2', children=['MAT-POS-bfb445d4-2'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x1542471e0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x1543d2790> >, height=1),
 '3_0': ClusterCenter(cluster_id='3_0', children=[ClusterCenter(cluster_id='2_0', children=[ClusterCenter(cluster_id='1_2', children=[ClusterCenter(cluster_id='0_MAT-POS-50a80394-2', children=['MAT-POS-50a80394-2'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x1541ab4e0> >, height=0), ClusterCenter(cluster_id='0_MAT-POS-50a80394-1', children=['MAT-POS-50a80394-1'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x154247e40> >, height=0)],

In [34]:
def get_descendents(cluster):
    """Collect the leaf-level clusters found beneath ``cluster``.

    A leaf is a cluster that has at least one plain string among its ``children``;
    any non-string child is treated as a nested cluster and searched recursively.

    Parameters
    ----------
    cluster
        Object with a ``children`` list holding strings and/or nested clusters.

    Returns
    -------
    list
        Leaf clusters in depth-first order. A leaf is listed once for every place
        it occurs in the tree, so a leaf reachable through two different parents
        appears twice. A leaf holding several strings is still listed only once
        per occurrence, so callers reading ``leaf.children[0]`` do not see the
        same ID repeated for a single leaf.
    """
    descendents = []
    leaf_recorded = False
    for child in cluster.children:
        if isinstance(child, str):
            # Record this cluster the first time a string child is seen; further
            # string children of the same cluster must not add it again.
            if not leaf_recorded:
                descendents.append(cluster)
                leaf_recorded = True
        else:
            descendents.extend(get_descendents(child))
    return descendents

In [35]:
from harbor.plotting import ligands as l
reload(l)

<module 'harbor.plotting.ligands' from '/Users/alexpayne/Scientific_Projects/harbor/harbor/plotting/ligands.py'>

In [36]:
clusters.keys()

dict_keys(['1_38', '3_0', '7_0'])

In [37]:
len(mol_ids)

205

In [38]:
len(set(mol_ids))

205

In [39]:
# Plot each top-level cluster against its MCS molecule and collect the IDs of every
# molecule that ended up in one of those clusters.
ids_found = []
for cluster_id, cluster in clusters.items():
    print(f"Cluster {cluster_id}")
    descendents = get_descendents(cluster)
    print(f"Children: {len(descendents)}")
    # Use cluster-local names so the notebook-level `mols` (the RDKit molecules loaded
    # earlier) and `mol` are not overwritten with OpenEye objects.
    cluster_mols = []
    for desc in descendents:
        oemol = desc.repr
        # Title each molecule with the ID stored as the leaf's first child.
        oemol.SetTitle(desc.children[0])
        cluster_mols.append(oemol)
    l.plot_ligands_with_mcs(filename=f"cluster_{cluster_id}.png", mols=cluster_mols, mcs_mol=cluster.repr, reference="largest")
    ids_found.extend([desc.children[0] for desc in descendents])

Cluster 1_38
Children: 2
2 molecules to plot
['JAN-GHE-5a013bed-2', 'MAT-POS-bfb445d4-2']
[20 20]
Generating a figure with 1 rows and 2 columns
['MAT-POS-bfb445d4-2']
1 2
Cluster 3_0
Children: 7
7 molecules to plot
['MAT-POS-50a80394-2', 'MAT-POS-50a80394-1', 'ALP-POS-133e7cd9-2', 'EDG-MED-b1ef7fe3-1', 'EDJ-MED-8bb691af-8', 'MAT-POS-1bed62cf-3', 'MAT-POS-c7726e07-5']
[37 36 36 33 32 32 31]
Generating a figure with 3 rows and 3 columns




['MAT-POS-50a80394-1', 'ALP-POS-133e7cd9-2', 'EDG-MED-b1ef7fe3-1', 'EDJ-MED-8bb691af-8', 'MAT-POS-1bed62cf-3', 'MAT-POS-c7726e07-5']
1 2
1 3
2 1
2 2
2 3
3 1
Cluster 7_0
Children: 38
38 molecules to plot
['MAT-POS-e119ab4f-3', 'MAT-POS-4223bc15-23', 'EDJ-MED-7889e8da-3', 'MAT-POS-2e8b2191-11', 'MAT-POS-2e8b2191-10', 'EDJ-MED-43f8f7d6-4', 'EDJ-MED-43f8f7d6-6', 'MAT-POS-2e8b2191-12', 'EDG-MED-ee636701-1', 'EDJ-MED-e69ed63d-13', 'EDJ-MED-e69ed63d-1', 'MAT-POS-5cd9ea36-22', 'MAT-POS-af1eef35-2', 'MAT-POS-5cd9ea36-21', 'MAT-POS-4223bc15-11', 'EDJ-MED-1981ceba-4', 'EDJ-MED-1981ceba-2', 'EDJ-MED-1981ceba-3', 'EDG-MED-5d232de5-3', 'RAL-THA-4aa06b95-1', 'RAL-THA-8416115c-5', 'RAL-THA-8416115c-13', 'MAT-POS-86c60949-2', 'EDG-MED-5d232de5-1', 'MAT-POS-96f51285-5', 'ALP-POS-869ac754-1', 'EDJ-MED-e4b030d8-11', 'ALP-UNI-3735e77e-2', 'MAT-POS-fce787c2-5', 'MAT-POS-1f3f1a6f-1', 'MAT-POS-de59a476-4', 'MAT-POS-de59a476-2', 'MAT-POS-fce787c2-6', 'MAT-POS-90fd5f68-13', 'MAT-POS-90fd5f68-14', 'PET-UNK-7fb4f

In [40]:
# Gather the first child ID of every leaf beneath every top-level cluster into one flat list.
total_mols_found = []
for cluster_id, cluster in clusters.items():
    total_mols_found += (desc.children[0] for desc in get_descendents(cluster))

In [41]:
len(total_mols_found)

47

In [42]:
len(set(total_mols_found))

47

In [43]:
set(ids_found)

{'ADA-UCB-6c2cb422-1',
 'ALP-POS-133e7cd9-2',
 'ALP-POS-6479a3a9-2',
 'ALP-POS-869ac754-1',
 'ALP-UNI-3735e77e-2',
 'EDG-MED-5d232de5-1',
 'EDG-MED-5d232de5-3',
 'EDG-MED-b1ef7fe3-1',
 'EDG-MED-ee636701-1',
 'EDJ-MED-1981ceba-2',
 'EDJ-MED-1981ceba-3',
 'EDJ-MED-1981ceba-4',
 'EDJ-MED-43f8f7d6-4',
 'EDJ-MED-43f8f7d6-6',
 'EDJ-MED-7889e8da-3',
 'EDJ-MED-8bb691af-8',
 'EDJ-MED-e4b030d8-11',
 'EDJ-MED-e69ed63d-1',
 'EDJ-MED-e69ed63d-13',
 'JAN-GHE-5a013bed-2',
 'MAT-POS-1bed62cf-3',
 'MAT-POS-1f3f1a6f-1',
 'MAT-POS-2e8b2191-10',
 'MAT-POS-2e8b2191-11',
 'MAT-POS-2e8b2191-12',
 'MAT-POS-4223bc15-11',
 'MAT-POS-4223bc15-23',
 'MAT-POS-50a80394-1',
 'MAT-POS-50a80394-2',
 'MAT-POS-5cd9ea36-21',
 'MAT-POS-5cd9ea36-22',
 'MAT-POS-86c60949-2',
 'MAT-POS-90fd5f68-13',
 'MAT-POS-90fd5f68-14',
 'MAT-POS-96f51285-5',
 'MAT-POS-af1eef35-2',
 'MAT-POS-bfb445d4-2',
 'MAT-POS-c7726e07-5',
 'MAT-POS-de59a476-2',
 'MAT-POS-de59a476-4',
 'MAT-POS-e119ab4f-3',
 'MAT-POS-fce787c2-5',
 'MAT-POS-fce787c2-6',


In [44]:
set(mol_ids) - set(ids_found)

{'ALP-POS-1cbc2fae-1',
 'ALP-POS-1cbc2fae-2',
 'ALP-POS-477dc5b7-4',
 'ALP-POS-477dc5b7-5',
 'ALP-POS-64a710fa-1',
 'ALP-POS-6f6ae286-3',
 'ALP-POS-6f6ae286-5',
 'ALP-POS-8b8a49e1-4',
 'ALP-POS-9c80c481-1',
 'ALP-POS-a577c8a2-1',
 'ALP-POS-c3a96089-4',
 'ALP-POS-ce760d3f-2',
 'ALP-POS-ce760d3f-8',
 'ALP-POS-e0fe77e5-13',
 'ALP-POS-ecbed2ba-12',
 'ALP-POS-fe871b40-11',
 'ALP-UNI-3735e77e-1',
 'ALP-UNI-8d415491-1',
 'ALP-UNI-8d415491-3',
 'ALP-UNI-8d415491-6',
 'ALP-UNI-8e43a71e-8',
 'BEN-BAS-c2bc0d80-7',
 'BEN-DND-c852c98b-10',
 'BEN-DND-c852c98b-5',
 'BRU-CON-c4e3408a-1',
 'BRU-THA-92256091-17',
 'EDG-MED-0e5afe9d-1',
 'EDG-MED-10fcb19e-1',
 'EDG-MED-5d232de5-6',
 'EDG-MED-5d232de5-7',
 'EDG-MED-5d232de5-8',
 'EDG-MED-70ae9412-1',
 'EDG-MED-70ae9412-2',
 'EDG-MED-971238d3-1',
 'EDG-MED-971238d3-4',
 'EDG-MED-971238d3-5',
 'EDG-MED-ba1ac7b9-11',
 'EDG-MED-ba1ac7b9-13',
 'EDG-MED-ba1ac7b9-19',
 'EDG-MED-ba1ac7b9-21',
 'EDJ-MED-12c4873b-2',
 'EDJ-MED-12c4873b-5',
 'EDJ-MED-2f867453-1',
 '

# Why are there still duplicates?

In [38]:
from collections import Counter

In [40]:
count_dict = Counter(total_mols_found)

In [54]:
# Keep only the IDs that were counted more than once.
bigger_than_one_count_dict = dict(
    (mol_id, n_seen) for mol_id, n_seen in count_dict.items() if n_seen > 1
)

In [55]:
len(bigger_than_one_count_dict)

229

In [56]:
# Look at the most recently inserted duplicate (the entry popitem() would return) without
# removing it, so re-running this cell always selects the same entry and the dict keeps
# its full contents.
_id, _count = list(bigger_than_one_count_dict.items())[-1]

In [67]:
_count

2

## which clusters is this one in?

In [80]:
_id

'TRY-UNI-714a760b-20'

In [81]:
# Map each top-level cluster ID to the first-child IDs of all leaves beneath it.
desc_id_dict = {}
for cluster_id, cluster in clusters.items():
    desc_id_dict[cluster_id] = [desc.children[0] for desc in get_descendents(cluster)]

In [91]:
owners = [cluster_id for cluster_id, ids in desc_id_dict.items() if _id in ids]

In [92]:
owners

['18_2']

## get descendents 

In [95]:
# Start at the first top-level cluster that contains `_id` and record which of its
# direct children still contain that ID.
cluster0 = clusters[owners[0]]
previous_owners = []
for child in cluster0.children:
    leaf_ids = [leaf.children[0] for leaf in get_descendents(child)]
    if _id in leaf_ids:
        previous_owners.append(child.cluster_id)

In [96]:
previous_owners

['5_29']

In [97]:
cluster0.children

[ClusterCenter(cluster_id='17_3', children=[ClusterCenter(cluster_id='15_2', children=[ClusterCenter(cluster_id='14_2', children=[ClusterCenter(cluster_id='12_2', children=[ClusterCenter(cluster_id='11_2', children=[ClusterCenter(cluster_id='10_5', children=[ClusterCenter(cluster_id='9_5', children=[ClusterCenter(cluster_id='8_5', children=[ClusterCenter(cluster_id='7_6', children=[ClusterCenter(cluster_id='6_9', children=[ClusterCenter(cluster_id='5_13', children=[ClusterCenter(cluster_id='4_20', children=[ClusterCenter(cluster_id='3_31', children=[ClusterCenter(cluster_id='2_48', children=[ClusterCenter(cluster_id='1_81', children=[ClusterCenter(cluster_id='0_EDG-MED-0da5ad92-18', children=['EDG-MED-0da5ad92-18'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197dc6c0> >, height=0), ClusterCenter(cluster_id='0_JAN-GHE-83b26c96-22', children=['JAN-GHE-83b26c96-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197f61f0> >, h

In [102]:
# Step down into the second child of `cluster0` and repeat the ownership check one level lower.
cluster1 = cluster0.children[1]
previous_owners = []
for child in cluster1.children:
    leaf_ids = [leaf.children[0] for leaf in get_descendents(child)]
    if _id in leaf_ids:
        previous_owners.append(child.cluster_id)
print(cluster1.children)
print(previous_owners)

[ClusterCenter(cluster_id='3_35', children=[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >,

In [106]:
# Step down into the first child of `cluster1` and repeat the ownership check one level lower.
cluster2 = cluster1.children[0]
previous_owners = []
for child in cluster2.children:
    leaf_ids = [leaf.children[0] for leaf in get_descendents(child)]
    if _id in leaf_ids:
        previous_owners.append(child.cluster_id)
print(cluster2.children)
print(previous_owners)

[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <

In [107]:
# Step down into the first child of `cluster2` and repeat the ownership check one level lower.
cluster3 = cluster2.children[0]
previous_owners = []
for child in cluster3.children:
    leaf_ids = [leaf.children[0] for leaf in get_descendents(child)]
    if _id in leaf_ids:
        previous_owners.append(child.cluster_id)
print(cluster3.children)
print(previous_owners)

[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7

In [110]:
cluster3.children[0].children[0].children[0]

'TRY-UNI-714a760b-20'

In [112]:
cluster3.children[1].children[1].children[0]

'TRY-UNI-714a760b-20'

In [116]:
og_count_dict = Counter(mol_ids)
og_count_dict['TRY-UNI-714a760b-20']

1

In [118]:
cluster3.children[0]

ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1)

In [119]:
# list of og pairs

In [120]:
pairs = [(0, 100), (2, 427), (3, 135), (4, 19), (5, 33), (8, 367), (10, 110), (12, 358), (14, 120), (15, 494), (17, 575), (18, 280), (20, 512), (21, 22), (23, 240), (24, 27), (25, 515), (26, 521), (28, 529), (30, 201), (31, 121), (32, 281), (35, 123), (37, 260), (38, 56), (39, 278), (42, 524), (43, 46), (44, 528), (45, 84), (47, 229), (49, 534), (51, 149), (53, 179), (59, 81), (61, 377), (62, 385), (63, 174), (64, 371), (65, 549), (72, 104), (75, 255), (77, 88), (78, 382), (82, 410), (89, 458), (98, 533), (101, 376), (102, 193), (105, 522), (106, 164), (113, 392), (114, 547), (115, 449), (116, 345), (124, 446), (125, 348), (126, 321), (127, 402), (128, 39), (130, 235), (133, 347), (136, 542), (138, 567), (139, 231), (144, 18), (147, 571), (150, 157), (153, 434), (159, 535), (161, 217), (166, 543), (169, 113), (171, 180), (172, 384), (175, 177), (176, 296), (182, 190), (187, 511), (189, 466), (197, 419), (198, 133), (207, 527), (210, 546), (213, 304), (223, 305), (225, 237), (228, 351), (233, 573), (239, 303), (241, 259), (249, 496), (253, 562), (257, 372), (263, 413), (266, 431), (267, 399), (269, 313), (273, 381), (274, 415), (283, 460), (284, 532), (288, 412), (291, 505), (292, 435), (293, 423), (297, 561), (300, 342), (302, 363), (308, 482), (309, 545), (310, 501), (314, 484), (316, 456), (326, 574), (339, 273), (341, 368), (344, 380), (360, 187), (364, 375), (370, 520), (389, 25), (393, 498), (394, 480), (404, 554), (409, 176), (418, 572), (424, 455), (428, 452), (433, 439), (437, 166), (441, 472), (444, 326), (447, 468), (451, 552), (462, 565), (467, 297), (475, 550), (488, 292), (506, 293)]