# Introduction

# Imports

In [7]:
from pathlib import Path
from harbor.plotting.ligands import plot_aligned_ligands

In order to avoiding having this repo depend directly on the asapdiscovery repo, I'm going to comment this out, but we'll use a test example from the covid moonshot molecules:
```
from asapdiscovery.data.testing.test_resources import fetch_test_file
mypath = fetch_test_file("Mpro_combined_labeled.sdf")
``` 

In [8]:
mypath = Path("../data/Mpro_combined_labeled.sdf")

## use p-series curated subset

In [9]:
mypath = Path("../data/combined_3d.sdf")

I'm copying this code from the asapdiscovery repo.
Once that is conda installable, I'll make that a dep of this repo and use those tools for loading molecules

# Load Molecules

In [225]:
from rdkit import Chem
mols = Chem.SDMolSupplier(str(mypath))

In [226]:
mols = [mol for mol in mols]

In [227]:
import mols2grid

In [228]:
# define the grid to show the scaffolds
grid = mols2grid.display(mols)

In [229]:
grid

# MCSS-based Clustering

In [230]:
from harbor.clustering.hierarchical import ClusterResults, ClusterCenter, HeirarchicalClustering
from openeye import oechem

In [231]:
mol: Chem.Mol = mols[0]
mol.GetPropsAsDict()

{'compound_name': 'EDG-MED-5d232de5-6',
 'provenance': '{"isomeric_smiles": "CN1CC[C@H](c2c1ccc(c2)Cl)C(=O)Nc3cncc4c3cccc4", "inchi": "InChI=1S/C20H18ClN3O/c1-24-9-8-16(17-10-14(21)6-7-19(17)24)20(25)23-18-12-22-11-13-4-2-3-5-15(13)18/h2-7,10-12,16H,8-9H2,1H3,(H,23,25)/t16-/m1/s1", "inchi_key": "NIZLDWOVHDTVKM-MRXNPFEDSA-N", "fixed_inchi": "InChI=1/C20H18ClN3O/c1-24-9-8-16(17-10-14(21)6-7-19(17)24)20(25)23-18-12-22-11-13-4-2-3-5-15(13)18/h2-7,10-12,16H,8-9H2,1H3,(H,23,25)/t16-/m1/s1/f/h23H", "fixed_inchikey": "NIZLDWOVHDTVKM-ZJHKTUGWNA-N"}',
 'data_format': 'sdf'}

In [232]:
oemols = []
mol_ids = []
for rdkit_mol in mols[20:50]:
    smiles = Chem.MolToSmiles(rdkit_mol)
    properties = rdkit_mol.GetPropsAsDict()
    mol_ids.append(properties["compound_name"])
    mol = oechem.OEMol()
    oechem.OESmilesToMol(mol, smiles)
    oemols.append(mol)

# testing

In [104]:
from harbor.clustering.hierarchical import mcs_wrapper
# Make initial clusters
clusters = [
    ClusterCenter.from_mol(mol_id, mol)
    for mol_id, mol in zip(mol_ids, oemols)
]

# keep track of the molecule outliers
cluster_records = {}
# generate n x n matrix of MCS values
mcs_matrix = mcs_wrapper(clusters)

100%|██████████| 30/30 [00:08<00:00,  3.45it/s]


In [141]:
len(clusters)

30

In [142]:
np.shape(mcs_matrix)

(30, 30)

In [165]:
mtx = mcs_matrix.copy()

In [166]:
mtx[:, 0]

array([26, 22, 23, 13, 10, 13, 24, 13, 24, 10, 13, 22, 24, 23, 24, 13, 24,
       24, 13, 10, 22, 13, 13, 24, 22, 21, 10, 14, 24, 22])

In [194]:
from enum import Enum, auto
class ClusterStatus(Enum):
    unassigned = 0
    success = 1
    single = 2
    outlier = 3

In [198]:
mtx = mcs_matrix.copy()
success_array = np.array([ClusterStatus.unassigned]*len(clusters))
np.fill_diagonal(mtx, 0)
pairs = []
for i, cluster in enumerate(clusters):

    # skip if already assigned
    if success_array[i] != ClusterStatus.unassigned:
        continue

    # get largest mcs
    mcs_array = mtx[i]
    max_mcs = mcs_array.max()

    # if max mcs < cutoff, add to outliers
    if max_mcs < 12:
        success_array[i] = ClusterStatus.outlier

    # get all potential matches
    potential_matches = np.where(mcs_array == max_mcs)[0]
    
    fail_to_find = True
    for j in potential_matches:

        # potential match can't have already been assigned
        if success_array[j] != ClusterStatus.unassigned:
            continue

        # get the max mcs of the potential match
        mcs_array = mtx[j]
        sweetheart_max_mcs = mcs_array.max()

        # only match if the max mcs of the potential match is <= to our mcs
        if max_mcs >= sweetheart_max_mcs:
            fail_to_find = False
            break
    
    if fail_to_find:
        success_array[i] = ClusterStatus.single
    else:
        pairs.append((i, j))
        success_array[i] = ClusterStatus.success
        success_array[j] = ClusterStatus.success
        

In [200]:
cluster_array = np.array(clusters)

In [203]:
singles = cluster_array[success_array == ClusterStatus.single]

In [205]:
singles

array([ClusterCenter(cluster_id='0_RAL-THA-2d450e86-26', children=['RAL-THA-2d450e86-26'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d1304e1c0> >, height=0),
       ClusterCenter(cluster_id='0_MAT-POS-5d65ec79-2', children=['MAT-POS-5d65ec79-2'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d122ce790> >, height=0),
       ClusterCenter(cluster_id='0_RAL-THA-2d450e86-30', children=['RAL-THA-2d450e86-30'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d122ce250> >, height=0),
       ClusterCenter(cluster_id='0_MAT-POS-dd3ad2b5-2', children=['MAT-POS-dd3ad2b5-2'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d122ce130> >, height=0),
       ClusterCenter(cluster_id='0_MAT-POS-fb82b63d-1', children=['MAT-POS-fb82b63d-1'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d122cdbc0> >, height=0),
       ClusterCenter(cluster_id='0_ALP-POS-8b8a49e1

In [204]:
outliers = cluster_array[success_array == ClusterStatus.outlier]

In [206]:
outliers

array([], dtype=object)

In [256]:
pairs

[(0, 14),
 (1, 11),
 (2, 13),
 (4, 9),
 (5, 7),
 (6, 28),
 (8, 12),
 (10, 15),
 (16, 23),
 (18, 21),
 (19, 26),
 (20, 29)]

In [241]:
from harbor.clustering import hierarchical as h
from importlib import reload
reload(h)

<module 'harbor.clustering.hierarchical' from '/home/feanor/harbor/harbor/clustering/hierarchical.py'>

In [242]:
clusterer = h.HeirarchicalClustering(molecules=oemols, mol_ids=mol_ids)

In [243]:
clusterer

HeirarchicalClustering(molecules=[<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d11148e70> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d112203f0> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d11358390> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d1135aa30> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d11359350> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d11359980> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d111498f0> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d1114ae20> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d1114a7f0> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d112055c0> >, <oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7f5d11205590> >, <oechem.OEMol; proxy of <Swig Object of ty

In [244]:
clusters = clusterer.cluster(max_iterations=10, cutoff=12)

100%|██████████| 30/30 [00:14<00:00,  2.05it/s]


['0_PET-UNK-29afea89-2', '0_MAT-POS-5cd9ea36-22', '0_MAT-POS-78e1d523-1', '0_RAL-THA-2d450e86-26', '0_MAT-POS-8293a91a-8', '0_LON-WEI-9739a092-6', '0_EDG-MED-ba1ac7b9-11', '0_PET-UNK-7fb4f80a-1', '0_EDG-MED-70ae9412-1', '0_VLA-UCB-34f3ed0c-11', '0_MAT-POS-de59a476-4', '0_MAT-POS-5cd9ea36-17', '0_EDG-MED-70ae9412-2', '0_EDG-MED-5d232de5-3', '0_EDJ-MED-c3ea9889-6', '0_PET-UNK-bb7ffe78-1', '0_ALP-POS-869ac754-1', '0_MAT-POS-5d65ec79-2', '0_MAT-POS-7174c657-5', '0_EDG-MED-b1ef7fe3-1', '0_EDJ-MED-43f8f7d6-4', '0_RUB-POS-1325a9ea-4', '0_RAL-THA-2d450e86-30', '0_EDJ-MED-e4b030d8-11', '0_MAT-POS-dd3ad2b5-2', '0_MAT-POS-fb82b63d-1', '0_MAT-POS-1bed62cf-3', '0_ALP-POS-8b8a49e1-4', '0_EDG-MED-ba1ac7b9-21', '0_MAT-POS-e119ab4f-3']
[[ 0 22 23 13 10 13 24 13 24 11 13 22 24 23 24 13 24 24 13 10 22 13 13 24
  22 21 10 14 24 22]
 [22  0 23 13 10 13 22 13 22 10 13 31 22 23 22 13 23 22 13 11 25 13 13 23
  25 23 11 13 22 25]
 [23 23  0 13 10 13 23 13 23 10 13 23 23 24 23 13 24 23 13 10 23 13 13 24
  23 22

100%|██████████| 18/18 [00:02<00:00,  8.36it/s]


['1_0', '1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '0_RAL-THA-2d450e86-26', '0_MAT-POS-5d65ec79-2', '0_RAL-THA-2d450e86-30', '0_MAT-POS-dd3ad2b5-2', '0_MAT-POS-fb82b63d-1', '0_ALP-POS-8b8a49e1-4']
[[ 0 22 23 10 13 24 24 13 13 24 10 22 13 24 13 22 21 14]
 [22  0 23 10 13 22 22 13 13 23 11 25 13 22 13 25 23 13]
 [23 23  0 10 13 23 23 13 13 24 10 23 13 23 13 23 22 14]
 [11 10 10  0 10 11 11 10 10 11 17 10 10 11 10 10 10 11]
 [13 13 13 10  0 13 13 21 21 13 10 13 20 13 20 13 13  7]
 [24 22 23 10 13  0 25 13 13 24 10 22 13 27 13 22 21 14]
 [24 22 23 10 13 25  0 13 13 24 10 22 13 25 13 22 21 14]
 [13 13 13 10 21 13 13  0 21 13 10 13 20 13 20 13 13  7]
 [13 13 13 10 21 13 13 21  0 13 10 13 20 13 20 13 13  7]
 [24 23 24 10 13 24 24 13 13  0 10 23 13 24 13 23 22 15]
 [10 10 10 17 10 10 10 10 10 10  0 15 10 10 10 10 10 10]
 [22 24 23 10 13 22 22 13 13 23 15  0 13 22 13 24 23 13]
 [13 13 13 10 20 13 13 20 20 13 10 13  0 13 19 13 13  6]
 [24 22 23 10 13 27 25 13 

100%|██████████| 13/13 [00:00<00:00, 19.03it/s]


['2_0', '2_1', '2_2', '2_3', '2_4', '1_2', '1_6', '1_8', '0_RAL-THA-2d450e86-26', '0_RAL-THA-2d450e86-30', '0_MAT-POS-dd3ad2b5-2', '0_MAT-POS-fb82b63d-1', '0_ALP-POS-8b8a49e1-4']
[[ 0 22 10 13 24 23 24 13 13 13 22 21 14]
 [22  0 10 13 22 23 22 13 13 13 24 23 13]
 [10 10  0 10 10 10 10 10 10 10 10 10  7]
 [13 13 10  0 13 13 13 21 20 20 13 13  7]
 [24 22 10 13  0 23 25 13 13 13 22 21 14]
 [23 23 10 13 23  0 23 13 13 13 23 22 14]
 [24 22 10 13 25 23  0 13 13 13 22 21 14]
 [13 13 10 21 13 13 13  0 20 20 13 13  7]
 [13 13 10 20 13 13 13 20  0 19 13 13  6]
 [13 13 10 20 13 13 13 20 19  0 13 13  6]
 [22 25 10 13 22 23 22 13 13 13  0 23 13]
 [21 23 10 13 21 22 21 13 13 13 23  0 12]
 [14 13  7  7 14 14 14  7  6  6 13 12  0]]
0 [4 6]
0 4 24 25 24
0 6 24 25 24
1 [10]
1 10 24 25 25
2 [ 0  1  3  4  5  6  7  8  9 10 11]
2 3 10 21 10
2 4 10 25 10
2 5 10 23 10
2 6 10 25 10
2 7 10 21 10
2 8 10 20 10
2 9 10 20 10
2 11 10 23 10
3 [7]
3 7 21 21 21
4 [6]
4 6 25 25 25
5 [ 0  1  4  6 10]
8 [3 7]
9 [3 7]
11 [

100%|██████████| 10/10 [00:00<00:00, 31.41it/s]


['3_0', '3_1', '3_2', '2_0', '2_2', '1_2', '0_RAL-THA-2d450e86-26', '0_RAL-THA-2d450e86-30', '0_MAT-POS-fb82b63d-1', '0_ALP-POS-8b8a49e1-4']
[[ 0 13 22 22 10 23 13 13 23 13]
 [13  0 13 13 10 13 20 20 13  7]
 [22 13  0 24 10 23 13 13 21 14]
 [22 13 24  0 10 23 13 13 21 14]
 [10 10 10 10  0 10 10 10 10  7]
 [23 13 23 23 10  0 13 13 22 14]
 [13 20 13 13 10 13  0 19 13  6]
 [13 20 13 13 10 13 19  0 13  6]
 [23 13 21 21 10 22 13 13  0 12]
 [13  7 14 14  7 14  6  6 12  0]]
0 [5 8]
0 5 23 23 23
1 [6 7]
1 6 20 20 20
2 [3]
2 3 24 24 24
4 [0 1 2 3 5 6 7 8]
4 7 10 20 10
4 8 10 23 10
7 [1]
8 [0]
9 [2 3 5]
[<ClusterStatus.success: 1> <ClusterStatus.success: 1>
 <ClusterStatus.success: 1> <ClusterStatus.success: 1>
 <ClusterStatus.single: 2> <ClusterStatus.success: 1>
 <ClusterStatus.success: 1> <ClusterStatus.single: 2>
 <ClusterStatus.single: 2> <ClusterStatus.single: 2>]


100%|██████████| 7/7 [00:00<00:00, 50.14it/s]


['4_0', '4_1', '4_2', '2_2', '0_RAL-THA-2d450e86-30', '0_MAT-POS-fb82b63d-1', '0_ALP-POS-8b8a49e1-4']
[[ 0 13 22 10 13 22 13]
 [13  0 13 10 19 13  6]
 [22 13  0 10 13 21 14]
 [10 10 10  0 10 10  7]
 [13 19 13 10  0 13  6]
 [22 13 21 10 13  0 12]
 [13  6 14  7  6 12  0]]
0 [2 5]
0 2 22 22 22
1 [4]
1 4 19 19 19
3 [0 1 2 4 5]
3 5 10 22 10
5 [0]
6 [2]
[<ClusterStatus.success: 1> <ClusterStatus.success: 1>
 <ClusterStatus.success: 1> <ClusterStatus.single: 2>
 <ClusterStatus.success: 1> <ClusterStatus.single: 2>
 <ClusterStatus.single: 2>]


100%|██████████| 5/5 [00:00<00:00, 82.39it/s]


['5_0', '5_1', '2_2', '0_MAT-POS-fb82b63d-1', '0_ALP-POS-8b8a49e1-4']
[[ 0 13 10 21 12]
 [13  0 10 13  5]
 [10 10  0 10  7]
 [21 13 10  0 12]
 [12  6  7 12  0]]
0 [3]
0 3 21 21 21
1 [0 3]
2 [0 1 3]
4 [0 3]
[<ClusterStatus.success: 1> <ClusterStatus.single: 2>
 <ClusterStatus.single: 2> <ClusterStatus.success: 1>
 <ClusterStatus.single: 2>]


100%|██████████| 4/4 [00:00<00:00, 131.26it/s]


['6_0', '5_1', '2_2', '0_ALP-POS-8b8a49e1-4']
[[ 0 13 10 11]
 [13  0 10  5]
 [10 10  0  7]
 [11  6  7  0]]
0 [1]
0 1 13 13 13
2 [0 1]
3 [0]
[<ClusterStatus.success: 1> <ClusterStatus.success: 1>
 <ClusterStatus.single: 2> <ClusterStatus.single: 2>]


100%|██████████| 3/3 [00:00<00:00, 205.74it/s]


['7_0', '2_2', '0_ALP-POS-8b8a49e1-4']
[[ 0 10  6]
 [10  0  7]
 [ 6  7  0]]
0 [1]
0 1 10 10 10
2 [1]
[<ClusterStatus.success: 1> <ClusterStatus.success: 1>
 <ClusterStatus.single: 2>]


100%|██████████| 2/2 [00:00<00:00, 245.53it/s]

['8_0', '0_ALP-POS-8b8a49e1-4']
[[0 6]
 [6 0]]
0 [1]
0 1 6 6 6
[<ClusterStatus.success: 1> <ClusterStatus.success: 1>]





In [245]:
def get_descendents(cluster):
    descendents = []
    for child in cluster.children:
        if isinstance(child, str):
            descendents.append(cluster)
        else:
            descendents.extend(get_descendents(child))
    return descendents

In [246]:
from harbor.plotting import ligands as l
reload(l)

<module 'harbor.plotting.ligands' from '/home/feanor/harbor/harbor/plotting/ligands.py'>

In [247]:
clusters.keys()

dict_keys(['9_0'])

In [248]:
len(mol_ids)

30

In [249]:
len(set(mol_ids))

30

In [250]:
ids_found = []
for cluster_id, cluster in clusters.items():
    print(f"Cluster {cluster_id}")
    descendents = get_descendents(cluster)
    print(f"Children: {len(descendents)}")
    mols = []
    for desc in descendents:
        mol = desc.repr
        mol.SetTitle(desc.children[0])
        mols.append(mol)
    l.plot_ligands_with_mcs(filename=f"cluster_{cluster_id}.png", mols=mols, mcs_mol=cluster.repr, reference="largest")
    ids_found.extend([desc.children[0] for desc in descendents])

Cluster 9_0
Children: 30
30 molecules to plot
['MAT-POS-5cd9ea36-22', 'MAT-POS-5cd9ea36-17', 'EDJ-MED-43f8f7d6-4', 'MAT-POS-e119ab4f-3', 'MAT-POS-dd3ad2b5-2', 'MAT-POS-78e1d523-1', 'EDG-MED-5d232de5-3', 'EDG-MED-ba1ac7b9-11', 'EDG-MED-ba1ac7b9-21', 'MAT-POS-5d65ec79-2', 'EDG-MED-70ae9412-1', 'EDG-MED-70ae9412-2', 'PET-UNK-29afea89-2', 'EDJ-MED-c3ea9889-6', 'ALP-POS-869ac754-1', 'EDJ-MED-e4b030d8-11', 'MAT-POS-fb82b63d-1', 'LON-WEI-9739a092-6', 'PET-UNK-7fb4f80a-1', 'MAT-POS-de59a476-4', 'MAT-POS-7174c657-5', 'PET-UNK-bb7ffe78-1', 'RUB-POS-1325a9ea-4', 'RAL-THA-2d450e86-26', 'RAL-THA-2d450e86-30', 'MAT-POS-8293a91a-8', 'VLA-UCB-34f3ed0c-11', 'EDG-MED-b1ef7fe3-1', 'MAT-POS-1bed62cf-3', 'ALP-POS-8b8a49e1-4']
[34 31 33 32 28 25 28 39 39 29 34 32 26 27 26 27 25 34 26 28 39 23 22 22
 21 26 27 33 33 23]
Generating a figure with 5 rows and 6 columns
['EDG-MED-ba1ac7b9-21', 'MAT-POS-7174c657-5', 'MAT-POS-5cd9ea36-22', 'LON-WEI-9739a092-6', 'EDG-MED-70ae9412-1', 'EDJ-MED-43f8f7d6-4', 'EDG-MED-b1



In [251]:
total_mols_found = []
for cluster_id, cluster in clusters.items():
    total_mols_found.extend([desc.children[0] for desc in get_descendents(cluster)])

In [252]:
len(total_mols_found)

30

In [253]:
len(set(total_mols_found))

30

In [254]:
set(ids_found)

{'ALP-POS-869ac754-1',
 'ALP-POS-8b8a49e1-4',
 'EDG-MED-5d232de5-3',
 'EDG-MED-70ae9412-1',
 'EDG-MED-70ae9412-2',
 'EDG-MED-b1ef7fe3-1',
 'EDG-MED-ba1ac7b9-11',
 'EDG-MED-ba1ac7b9-21',
 'EDJ-MED-43f8f7d6-4',
 'EDJ-MED-c3ea9889-6',
 'EDJ-MED-e4b030d8-11',
 'LON-WEI-9739a092-6',
 'MAT-POS-1bed62cf-3',
 'MAT-POS-5cd9ea36-17',
 'MAT-POS-5cd9ea36-22',
 'MAT-POS-5d65ec79-2',
 'MAT-POS-7174c657-5',
 'MAT-POS-78e1d523-1',
 'MAT-POS-8293a91a-8',
 'MAT-POS-dd3ad2b5-2',
 'MAT-POS-de59a476-4',
 'MAT-POS-e119ab4f-3',
 'MAT-POS-fb82b63d-1',
 'PET-UNK-29afea89-2',
 'PET-UNK-7fb4f80a-1',
 'PET-UNK-bb7ffe78-1',
 'RAL-THA-2d450e86-26',
 'RAL-THA-2d450e86-30',
 'RUB-POS-1325a9ea-4',
 'VLA-UCB-34f3ed0c-11'}

In [255]:
set(mol_ids) - set(ids_found)

set()

# Why are there still duplicates?

In [46]:
from collections import Counter

In [47]:
count_dict = Counter(total_mols_found)

In [48]:
bigger_than_one_count_dict = {k:v for k,v in count_dict.items()
                              if v > 1}

In [49]:
len(bigger_than_one_count_dict)

0

In [50]:
_id, _count = bigger_than_one_count_dict.popitem()

KeyError: 'popitem(): dictionary is empty'

In [51]:
_count

NameError: name '_count' is not defined

## which clusters is this one in?

In [None]:
_id

'TRY-UNI-714a760b-20'

In [None]:
desc_id_dict = {cluster_id: [desc.children[0] for desc in get_descendents(cluster)]
                             for cluster_id, cluster in clusters.items()}

In [None]:
owners = [cluster_id for cluster_id, ids in desc_id_dict.items() if _id in ids]

In [None]:
owners

['18_2']

## get descendents 

In [None]:
cluster0 = clusters[owners[0]]
previous_owners = [cluster.cluster_id for cluster in cluster0.children
                   if _id in [desc.children[0] for desc in get_descendents(cluster)]]

In [None]:
previous_owners

['5_29']

In [None]:
cluster0.children

[ClusterCenter(cluster_id='17_3', children=[ClusterCenter(cluster_id='15_2', children=[ClusterCenter(cluster_id='14_2', children=[ClusterCenter(cluster_id='12_2', children=[ClusterCenter(cluster_id='11_2', children=[ClusterCenter(cluster_id='10_5', children=[ClusterCenter(cluster_id='9_5', children=[ClusterCenter(cluster_id='8_5', children=[ClusterCenter(cluster_id='7_6', children=[ClusterCenter(cluster_id='6_9', children=[ClusterCenter(cluster_id='5_13', children=[ClusterCenter(cluster_id='4_20', children=[ClusterCenter(cluster_id='3_31', children=[ClusterCenter(cluster_id='2_48', children=[ClusterCenter(cluster_id='1_81', children=[ClusterCenter(cluster_id='0_EDG-MED-0da5ad92-18', children=['EDG-MED-0da5ad92-18'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197dc6c0> >, height=0), ClusterCenter(cluster_id='0_JAN-GHE-83b26c96-22', children=['JAN-GHE-83b26c96-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197f61f0> >, h

In [None]:
cluster1 = cluster0.children[1]
previous_owners = [cluster.cluster_id for cluster in cluster1.children
                   if _id in [desc.children[0] for desc in get_descendents(cluster)]]
print(cluster1.children)
print(previous_owners)

[ClusterCenter(cluster_id='3_35', children=[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >,

In [None]:
cluster2 = cluster1.children[0]
previous_owners = [cluster.cluster_id for cluster in cluster2.children
                   if _id in [desc.children[0] for desc in get_descendents(cluster)]]
print(cluster2.children)
print(previous_owners)

[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <

In [None]:
cluster3 = cluster2.children[0]
previous_owners = [cluster.cluster_id for cluster in cluster3.children
                   if _id in [desc.children[0] for desc in get_descendents(cluster)]]
print(cluster3.children)
print(previous_owners)

[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7

In [None]:
cluster3.children[0].children[0].children[0]

'TRY-UNI-714a760b-20'

In [None]:
cluster3.children[1].children[1].children[0]

'TRY-UNI-714a760b-20'

In [None]:
og_count_dict = Counter(mol_ids)
og_count_dict['TRY-UNI-714a760b-20']

1

In [None]:
cluster3.children[0]

ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1)

In [None]:
# list of og pairs

In [None]:
pairs = [(0, 100), (2, 427), (3, 135), (4, 19), (5, 33), (8, 367), (10, 110), (12, 358), (14, 120), (15, 494), (17, 575), (18, 280), (20, 512), (21, 22), (23, 240), (24, 27), (25, 515), (26, 521), (28, 529), (30, 201), (31, 121), (32, 281), (35, 123), (37, 260), (38, 56), (39, 278), (42, 524), (43, 46), (44, 528), (45, 84), (47, 229), (49, 534), (51, 149), (53, 179), (59, 81), (61, 377), (62, 385), (63, 174), (64, 371), (65, 549), (72, 104), (75, 255), (77, 88), (78, 382), (82, 410), (89, 458), (98, 533), (101, 376), (102, 193), (105, 522), (106, 164), (113, 392), (114, 547), (115, 449), (116, 345), (124, 446), (125, 348), (126, 321), (127, 402), (128, 39), (130, 235), (133, 347), (136, 542), (138, 567), (139, 231), (144, 18), (147, 571), (150, 157), (153, 434), (159, 535), (161, 217), (166, 543), (169, 113), (171, 180), (172, 384), (175, 177), (176, 296), (182, 190), (187, 511), (189, 466), (197, 419), (198, 133), (207, 527), (210, 546), (213, 304), (223, 305), (225, 237), (228, 351), (233, 573), (239, 303), (241, 259), (249, 496), (253, 562), (257, 372), (263, 413), (266, 431), (267, 399), (269, 313), (273, 381), (274, 415), (283, 460), (284, 532), (288, 412), (291, 505), (292, 435), (293, 423), (297, 561), (300, 342), (302, 363), (308, 482), (309, 545), (310, 501), (314, 484), (316, 456), (326, 574), (339, 273), (341, 368), (344, 380), (360, 187), (364, 375), (370, 520), (389, 25), (393, 498), (394, 480), (404, 554), (409, 176), (418, 572), (424, 455), (428, 452), (433, 439), (437, 166), (441, 472), (444, 326), (447, 468), (451, 552), (462, 565), (467, 297), (475, 550), (488, 292), (506, 293)]