# Introduction

# Imports

In [1]:
from pathlib import Path
from harbor.plotting.ligands import plot_aligned_ligands

In order to avoid having this repo depend directly on the asapdiscovery repo, I'm going to comment this out, but we'll use a test example from the COVID Moonshot molecules:
```
from asapdiscovery.data.testing.test_resources import fetch_test_file
mypath = fetch_test_file("Mpro_combined_labeled.sdf")
``` 

In [2]:
mypath = Path("../data/Mpro_combined_labeled.sdf")

I'm copying this code from the asapdiscovery repo.
Once that is conda-installable, I'll make it a dependency of this repo and use those tools for loading molecules.

# Load Molecules

In [3]:
from rdkit import Chem
mols = Chem.SDMolSupplier(str(mypath))

In [4]:
# Materialise the supplier into a plain list. RDKit's SDMolSupplier yields None
# for any SD record it cannot parse; drop those here so that later cells do not
# fail when calling RDKit functions on a None entry.
mols = [mol for mol in mols if mol is not None]

In [5]:
import mols2grid

In [6]:
# define the grid to show the scaffolds
grid = mols2grid.display(mols)

In [7]:
grid

# MCSS-based Clustering

In [8]:
from harbor.clustering.hierarchical import ClusterResults, ClusterCenter, HeirarchicalClustering
from openeye import oechem

In [9]:
mol: Chem.Mol = mols[0]
mol.GetPropsAsDict()

{'SMILES': 'ClC=1C=CC=2NCCC(C(=O)NC=3C=NC=C4C=CC=CC34)C2C1',
 'Dataset': 'Mpro-x12171_0A',
 'Compound_ID': 'ALP-POS-477dc5b7-2'}

In [10]:
# Convert every RDKit molecule into an OpenEye molecule through a SMILES
# round-trip. `oemols` and `mol_ids` are built in lockstep so they stay
# index-aligned for the clusterer.
oemols = []
mol_ids = []
for rdkit_mol in mols:
    smiles = Chem.MolToSmiles(rdkit_mol)
    properties = rdkit_mol.GetPropsAsDict()
    compound_id = properties["Compound_ID"]
    # Use a dedicated name so the notebook-level `mol` (an RDKit molecule
    # inspected in an earlier cell) is not silently replaced by an OEMol.
    oemol = oechem.OEMol()
    # OESmilesToMol signals failure through its return value; without this check
    # an unparsable SMILES would leave an empty molecule in the clustering input.
    if not oechem.OESmilesToMol(oemol, smiles):
        raise ValueError(
            f"OpenEye could not parse SMILES {smiles!r} for compound {compound_id!r}"
        )
    mol_ids.append(compound_id)
    oemols.append(oemol)

In [11]:
from harbor.clustering import hierarchical as h
from importlib import reload
reload(h)

<module 'harbor.clustering.hierarchical' from '/home/feanor/harbor/harbor/clustering/hierarchical.py'>

In [12]:
clusterer = h.HeirarchicalClustering(molecules=oemols, mol_ids=mol_ids)

In [13]:
clusters = clusterer.cluster(max_iterations=50, cutoff=12)

100%|██████████| 576/576 [24:55<00:00,  2.60s/it]


['0_ALP-POS-477dc5b7-2', '0_MAK-UNK-6435e6c2-8', '0_DAN-LON-a5fc619e-3', '0_EDJ-MED-76744c27-4', '0_MED-COV-4280ac29-31', '0_JOR-UNI-2fc98d0b-6', '0_AAR-POS-0daf6b7e-36', '0_NAU-LAT-8502cac5-6', '0_JAG-UCB-52b62a6f-11', '0_DUN-NEW-f8ce3686-23', '0_EDJ-MED-015fb6b4-2', '0_MAT-POS-fa06b69f-6', '0_LON-WEI-b8d98729-18', '0_BEN-DND-f2e727cd-5', '0_MAT-POS-c20a539d-4', '0_EDJ-MED-49816e9b-1', '0_PET-UNK-e44ffd04-1', '0_MAT-POS-fce787c2-6', '0_BEN-DND-93268d01-7', '0_DUN-NEW-f8ce3686-24', '0_VLA-UCB-50c39ae8-7', '0_EDJ-MED-6af13d92-1', '0_MAT-POS-916a2c5a-2', '0_AAR-POS-d2a4d1df-4', '0_JAN-GHE-83b26c96-9', '0_MAT-POS-bb423b95-2', '0_MAT-POS-968e8d9c-1', '0_TRY-UNI-714a760b-18', '0_AAR-POS-0daf6b7e-2', '0_AAR-POS-0daf6b7e-29', '0_TRY-UNI-2eddb1ff-2', '0_EDJ-MED-d08626de-3', '0_LON-WEI-9739a092-9 ', '0_JOR-UNI-2fc98d0b-12', '0_MAT-POS-90fd5f68-2', '0_MAT-POS-932d1078-3', '0_ALP-POS-477dc5b7-5', '0_AAR-POS-0daf6b7e-43', '0_DAR-DIA-23aa0b97-13', '0_RAL-THA-05e671eb-10', '0_MAT-POS-3ccb8ef6-1', '0



New clusters
['0_ALP-POS-477dc5b7-2', '0_ALP-UNI-8d415491-6', '0_DAN-LON-a5fc619e-3', '0_DAN-LON-a5fc619e-8', '0_EDJ-MED-76744c27-4', '0_EDJ-MED-968bafd9-1', '0_MED-COV-4280ac29-31', '0_DUN-NEW-f8ce3686-24', '0_JOR-UNI-2fc98d0b-6', '0_JOR-UNI-2fc98d0b-12', '0_JAG-UCB-52b62a6f-11', '0_LON-WEI-0a73fcb8-7', '0_EDJ-MED-015fb6b4-2', '0_MAT-POS-2e8b2191-11', '0_LON-WEI-b8d98729-18', '0_LON-WEI-b8d98729-8', '0_MAT-POS-c20a539d-4', '0_MAT-POS-90fd5f68-38', '0_EDJ-MED-49816e9b-1', '0_MAT-POS-6344a35d-1', '0_MAT-POS-fce787c2-6', '0_MAT-POS-fce787c2-3', '0_BEN-DND-93268d01-7', '0_ALP-POS-f13221e1-4', '0_VLA-UCB-50c39ae8-7', '0_MAT-POS-090737b9-1', '0_EDJ-MED-6af13d92-1', '0_MAT-POS-916a2c5a-2', '0_AAR-POS-d2a4d1df-4', '0_DUN-NEW-f8ce3686-14', '0_JAN-GHE-83b26c96-9', '0_TRY-UNI-714a760b-18', '0_MAT-POS-bb423b95-2', '0_VLA-UNK-82501c2c-1', '0_MAT-POS-968e8d9c-1', '0_JAG-UCB-119787ef-1', '0_AAR-POS-0daf6b7e-2', '0_LON-WEI-8f408cad-5', '0_TRY-UNI-2eddb1ff-2', '0_TRY-UNI-2eddb1ff-3', '0_EDJ-MED-d08626

100%|██████████| 412/412 [08:50<00:00,  1.29s/it]


['1_0', '1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12', '1_13', '1_14', '1_15', '1_16', '1_17', '1_18', '1_19', '1_20', '1_21', '1_22', '1_23', '1_24', '1_25', '1_26', '1_27', '1_28', '1_29', '1_30', '1_31', '1_32', '1_33', '1_34', '1_35', '1_36', '1_37', '1_38', '1_39', '1_40', '1_41', '1_42', '1_43', '1_44', '1_45', '1_46', '1_47', '1_48', '1_49', '1_50', '1_51', '1_52', '1_53', '1_54', '1_55', '1_56', '1_57', '1_58', '1_59', '1_60', '1_61', '1_62', '1_63', '1_64', '1_65', '1_66', '1_67', '1_68', '1_69', '1_70', '1_71', '1_72', '1_73', '1_74', '1_75', '1_76', '1_77', '1_78', '1_79', '1_80', '1_81', '1_82', '1_83', '1_84', '1_85', '1_86', '1_87', '1_88', '1_89', '1_90', '1_91', '1_92', '1_93', '1_94', '1_95', '1_96', '1_97', '1_98', '1_99', '1_100', '1_101', '1_102', '1_103', '1_104', '1_105', '1_106', '1_107', '1_108', '1_109', '1_110', '1_111', '1_112', '1_113', '1_114', '1_115', '1_116', '1_117', '1_118', '1_119', '1_120', '1_121', '1_122', '1

100%|██████████| 321/321 [05:03<00:00,  1.06it/s]


['2_0', '2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12', '2_13', '2_14', '2_15', '2_16', '2_17', '2_18', '2_19', '2_20', '2_21', '2_22', '2_23', '2_24', '2_25', '2_26', '2_27', '2_28', '2_29', '2_30', '2_31', '2_32', '2_33', '2_34', '2_35', '2_36', '2_37', '2_38', '2_39', '2_40', '2_41', '2_42', '2_43', '2_44', '2_45', '2_46', '2_47', '2_48', '2_49', '2_50', '2_51', '2_52', '2_53', '2_54', '2_55', '2_56', '2_57', '2_58', '2_59', '2_60', '2_61', '2_62', '2_63', '2_64', '2_65', '2_66', '2_67', '2_68', '2_69', '2_70', '2_71', '2_72', '2_73', '2_74', '2_75', '2_76', '2_77', '2_78', '2_79', '2_80', '2_81', '1_2', '1_3', '1_6', '1_10', '1_19', '1_22', '1_23', '1_27', '1_28', '1_29', '1_31', '1_34', '1_35', '1_37', '1_39', '1_46', '1_48', '1_49', '1_61', '1_66', '1_70', '1_73', '1_77', '1_79', '1_80', '1_84', '1_85', '1_87', '1_89', '1_90', '1_91', '1_94', '1_95', '1_101', '1_103', '1_105', '1_111', '1_116', '1_121', '1_126', '1_128', '1_129', '1_134', '1

100%|██████████| 264/264 [03:08<00:00,  1.40it/s]


['3_0', '3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12', '3_13', '3_14', '3_15', '3_16', '3_17', '3_18', '3_19', '3_20', '3_21', '3_22', '3_23', '3_24', '3_25', '3_26', '3_27', '3_28', '3_29', '3_30', '3_31', '3_32', '3_33', '3_34', '3_35', '3_36', '3_37', '3_38', '3_39', '3_40', '3_41', '3_42', '3_43', '3_44', '3_45', '3_46', '3_47', '3_48', '3_49', '3_50', '3_51', '3_52', '3_53', '3_54', '3_55', '3_56', '3_57', '3_58', '3_59', '3_60', '2_0', '2_4', '2_6', '2_11', '2_14', '2_20', '2_22', '2_26', '2_32', '2_34', '2_35', '2_36', '2_40', '2_41', '2_47', '2_49', '2_50', '2_52', '2_57', '2_59', '2_60', '2_61', '2_63', '2_66', '2_67', '2_72', '2_73', '2_75', '2_76', '2_77', '2_78', '2_80', '2_81', '1_6', '1_19', '1_22', '1_23', '1_28', '1_31', '1_34', '1_35', '1_37', '1_46', '1_61', '1_80', '1_84', '1_87', '1_90', '1_91', '1_94', '1_101', '1_103', '1_105', '1_111', '1_116', '1_121', '1_126', '1_128', '1_129', '1_134', '1_137', '1_139', '0_NAU-LAT-8502ca

100%|██████████| 212/212 [01:56<00:00,  1.82it/s]


['4_0', '4_1', '4_2', '4_3', '4_4', '4_5', '4_6', '4_7', '4_8', '4_9', '4_10', '4_11', '4_12', '4_13', '4_14', '4_15', '4_16', '4_17', '4_18', '4_19', '4_20', '4_21', '4_22', '4_23', '4_24', '4_25', '4_26', '4_27', '4_28', '4_29', '4_30', '4_31', '4_32', '4_33', '4_34', '4_35', '4_36', '4_37', '4_38', '4_39', '4_40', '4_41', '4_42', '4_43', '4_44', '4_45', '4_46', '4_47', '3_5', '3_11', '3_19', '3_20', '3_21', '3_23', '3_25', '3_26', '3_30', '3_35', '3_37', '3_47', '3_48', '3_50', '3_52', '3_54', '3_55', '3_56', '3_57', '3_59', '3_60', '2_20', '2_26', '2_32', '2_35', '2_36', '2_40', '2_47', '2_49', '2_52', '2_57', '2_59', '2_60', '2_63', '2_66', '2_73', '2_75', '2_76', '2_78', '2_80', '2_81', '1_22', '1_23', '1_28', '1_31', '1_34', '1_35', '1_46', '1_61', '1_80', '1_84', '1_90', '1_91', '1_103', '1_116', '1_121', '1_126', '1_129', '1_137', '1_139', '0_NAU-LAT-8502cac5-6', '0_BEN-DND-f2e727cd-5', '0_PET-UNK-e44ffd04-1', '0_MAT-POS-3ccb8ef6-1', '0_ADA-UCB-6c2cb422-1', '0_MAT-POS-590ac91e

100%|██████████| 169/169 [01:02<00:00,  2.70it/s]


['5_0', '5_1', '5_2', '5_3', '5_4', '5_5', '5_6', '5_7', '5_8', '5_9', '5_10', '5_11', '5_12', '5_13', '5_14', '5_15', '5_16', '5_17', '5_18', '5_19', '5_20', '5_21', '5_22', '5_23', '5_24', '5_25', '5_26', '5_27', '5_28', '5_29', '5_30', '5_31', '5_32', '5_33', '5_34', '5_35', '5_36', '5_37', '4_2', '4_3', '4_4', '4_6', '4_9', '4_21', '4_22', '4_24', '4_27', '4_35', '4_38', '4_44', '4_45', '3_5', '3_23', '3_25', '3_26', '3_30', '3_37', '3_48', '3_50', '3_54', '3_55', '3_56', '2_20', '2_26', '2_32', '2_35', '2_40', '2_49', '2_57', '2_59', '2_60', '2_63', '2_66', '2_73', '2_75', '2_76', '2_78', '2_80', '2_81', '1_22', '1_23', '1_28', '1_31', '1_34', '1_80', '1_84', '1_90', '1_103', '1_116', '1_121', '1_129', '1_137', '1_139', '0_NAU-LAT-8502cac5-6', '0_BEN-DND-f2e727cd-5', '0_PET-UNK-e44ffd04-1', '0_MAT-POS-3ccb8ef6-1', '0_ADA-UCB-6c2cb422-1', '0_MAT-POS-590ac91e-27', '0_EDJ-MED-c3ea9889-6', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-48', '0_MAT-POS-f7918075-8', '0_EDG-MED-0da5ad92-15'

100%|██████████| 136/136 [00:35<00:00,  3.81it/s]


['6_0', '6_1', '6_2', '6_3', '6_4', '6_5', '6_6', '6_7', '6_8', '6_9', '6_10', '6_11', '6_12', '6_13', '6_14', '6_15', '6_16', '6_17', '6_18', '6_19', '6_20', '6_21', '6_22', '6_23', '6_24', '6_25', '6_26', '6_27', '6_28', '6_29', '6_30', '5_9', '5_11', '5_20', '5_21', '5_24', '5_27', '5_28', '5_29', '5_31', '5_32', '5_33', '5_35', '4_2', '4_3', '4_6', '4_22', '4_24', '4_38', '4_44', '4_45', '3_5', '3_23', '3_26', '3_30', '3_37', '3_55', '3_56', '2_20', '2_35', '2_40', '2_49', '2_57', '2_59', '2_60', '2_63', '2_66', '2_73', '2_75', '1_23', '1_28', '1_31', '1_34', '1_80', '1_84', '1_90', '1_103', '1_116', '1_121', '1_129', '1_139', '0_BEN-DND-f2e727cd-5', '0_PET-UNK-e44ffd04-1', '0_MAT-POS-3ccb8ef6-1', '0_ADA-UCB-6c2cb422-1', '0_MAT-POS-590ac91e-27', '0_EDJ-MED-c3ea9889-6', '0_MAT-POS-bfd29aac-1', '0_MAT-POS-f7918075-8', '0_EDG-MED-0da5ad92-15', '0_EDG-MED-971238d3-1', '0_AAR-POS-0daf6b7e-27', '0_LON-WEI-8f408cad-7', '0_MIC-UNK-66895286-1', '0_EDG-MED-971238d3-4', '0_ALP-POS-9c80c481-1'

100%|██████████| 110/110 [00:19<00:00,  5.57it/s]


['7_0', '7_1', '7_2', '7_3', '7_4', '7_5', '7_6', '7_7', '7_8', '7_9', '7_10', '7_11', '7_12', '7_13', '7_14', '7_15', '7_16', '7_17', '7_18', '7_19', '7_20', '7_21', '7_22', '7_23', '7_24', '7_25', '7_26', '7_27', '7_28', '6_1', '6_6', '6_7', '6_13', '6_14', '6_15', '6_17', '6_18', '6_21', '6_22', '6_23', '6_24', '6_27', '6_28', '6_30', '5_9', '5_20', '5_24', '5_29', '5_31', '5_32', '4_2', '4_3', '4_6', '4_22', '4_24', '4_38', '4_44', '3_26', '3_30', '3_55', '3_56', '2_20', '2_35', '2_59', '2_60', '2_66', '1_28', '1_31', '1_80', '1_84', '1_90', '1_116', '1_129', '1_139', '0_MAT-POS-590ac91e-27', '0_EDJ-MED-c3ea9889-6', '0_MAT-POS-bfd29aac-1', '0_EDG-MED-0da5ad92-15', '0_AAR-POS-0daf6b7e-27', '0_LON-WEI-8f408cad-7', '0_EDG-MED-971238d3-4', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_LON-WEI-8f408cad-1', '0_AAR-POS-d2a4d1df-19', '0_GAB-REV-70cc3ca5-4', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_MAT-POS-c7771779-1', '0_RAL-THA-2d450e86-7', '0_MAT

100%|██████████| 86/86 [00:12<00:00,  6.65it/s]


['8_0', '8_1', '8_2', '8_3', '8_4', '8_5', '8_6', '8_7', '8_8', '8_9', '8_10', '8_11', '8_12', '8_13', '8_14', '8_15', '8_16', '8_17', '8_18', '8_19', '7_1', '7_15', '7_16', '7_17', '7_18', '7_20', '7_21', '7_26', '7_28', '6_1', '6_6', '6_13', '6_17', '6_18', '6_22', '6_23', '6_24', '6_27', '6_28', '5_9', '5_20', '5_24', '5_29', '5_31', '4_2', '4_6', '4_24', '4_38', '4_44', '3_26', '3_30', '3_55', '3_56', '2_20', '2_59', '2_66', '1_28', '1_31', '1_84', '1_90', '1_129', '0_MAT-POS-590ac91e-27', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_LON-WEI-8f408cad-1', '0_GAB-REV-70cc3ca5-4', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_RAL-THA-2d450e86-7', '0_MAT-POS-b5746674-38', '0_MIC-UNK-66895286-3', '0_ALP-UNI-8d415491-1', '0_GAB-REV-70cc3ca5-13', '0_MAT-POS-4223bc15-28', '0_ALP-POS-95b75b4d-5', '0_VLA-UCB-00f2c2b3-7', '0_EDJ-MED-12c4873b-2', '0_JAN-GHE-83b26c96-18', '0_AAR-POS-0daf6b7e-38', '0_ALP-POS-6479a3a9-2', '0_EDG-MED-0d

100%|██████████| 69/69 [00:07<00:00,  9.48it/s]


['9_0', '9_1', '9_2', '9_3', '9_4', '9_5', '9_6', '9_7', '9_8', '9_9', '9_10', '9_11', '9_12', '9_13', '9_14', '8_13', '8_14', '7_1', '7_16', '7_17', '7_18', '7_20', '7_21', '7_26', '6_1', '6_13', '6_17', '6_18', '6_22', '6_23', '6_28', '5_9', '5_24', '5_29', '5_31', '4_2', '4_6', '4_24', '4_38', '4_44', '3_26', '3_30', '3_55', '3_56', '2_20', '2_59', '2_66', '1_28', '1_31', '1_90', '1_129', '0_MAT-POS-590ac91e-27', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_LON-WEI-8f408cad-1', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_RAL-THA-2d450e86-7', '0_MAT-POS-b5746674-38', '0_MIC-UNK-66895286-3', '0_ALP-UNI-8d415491-1', '0_MAT-POS-4223bc15-28', '0_ALP-POS-95b75b4d-5', '0_VLA-UCB-00f2c2b3-7', '0_JAN-GHE-83b26c96-18', '0_EDG-MED-0da5ad92-12', '0_ALP-POS-95b75b4d-1']
[[ 0  3  6 ...  3  4  4]
 [ 2  0 21 ...  7  9  9]
 [ 5 21  0 ...  7  9  9]
 ...
 [ 5  7  7 ...  0  7  6]
 [ 3  9  9 ...  7  0 16]
 [ 3  9  9 ...  6 16  0]]
[46 14 63

100%|██████████| 57/57 [00:04<00:00, 11.94it/s]


['10_0', '10_1', '10_2', '10_3', '10_4', '10_5', '10_6', '10_7', '10_8', '10_9', '10_10', '10_11', '9_10', '9_12', '8_13', '7_16', '7_17', '7_18', '7_20', '7_21', '6_1', '6_17', '6_18', '6_23', '6_28', '5_9', '5_24', '5_29', '4_2', '4_6', '4_24', '4_38', '4_44', '3_26', '3_30', '3_55', '3_56', '2_20', '2_59', '1_28', '1_90', '1_129', '0_MAT-POS-590ac91e-27', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_LON-WEI-8f408cad-1', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_RAL-THA-2d450e86-7', '0_MAT-POS-b5746674-38', '0_ALP-UNI-8d415491-1', '0_ALP-POS-95b75b4d-5', '0_VLA-UCB-00f2c2b3-7', '0_JAN-GHE-83b26c96-18', '0_EDG-MED-0da5ad92-12']
[[ 0  2  6 ...  3  4  3]
 [ 2  0 21 ...  9  7  9]
 [ 5 21  0 ... 10  7  9]
 ...
 [ 3  7  7 ...  0  7 16]
 [ 5  7  7 ...  7  0  7]
 [ 3  9  9 ... 16  7  0]]
[23  2  3  2 11 56  7  6 32 25  4  4 52 56 49  9 56 56 24  4 28  4  4 33
 52  9 56 56  2 56 56 33  8  2  3 18 56 34 32 30 12  3 24  2 41 56 5

100%|██████████| 47/47 [00:03<00:00, 13.79it/s]


['11_0', '11_1', '11_2', '11_3', '11_4', '11_5', '11_6', '11_7', '11_8', '10_1', '10_10', '9_12', '8_13', '7_16', '7_17', '7_18', '7_21', '6_1', '6_17', '6_18', '6_23', '5_24', '5_29', '4_2', '4_6', '4_24', '3_30', '3_55', '3_56', '2_20', '2_59', '1_28', '1_90', '1_129', '0_MAT-POS-590ac91e-27', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_LON-WEI-8f408cad-1', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_RAL-THA-2d450e86-7', '0_MAT-POS-b5746674-38', '0_ALP-POS-95b75b4d-5', '0_VLA-UCB-00f2c2b3-7', '0_JAN-GHE-83b26c96-18']
[[ 0 13  9 ...  9 10  7]
 [13  0 16 ... 16 17  6]
 [ 9 16  0 ... 17 13  7]
 ...
 [ 9 16 17 ...  0 16  7]
 [ 7 17 13 ... 16  0  7]
 [ 7  6  7 ...  7  7  0]]
[33 42 44  0 21 46  0 34  0  0  1 44 41 39 44 16 42 23 19 18  0 44 16  0
 16 16  0  0 16 26  4 25 23  0  7  0  0 18 21 13 44 40  1  0  2 16  5]
[23 20 17 19 16 15 19 13 21 21 13 17 14 12 16 15 19 13 19 19 10 16 15 18
 16 14 23 12 16 22 16 12 14 23 13 22 

100%|██████████| 38/38 [00:02<00:00, 14.52it/s]


['12_0', '12_1', '12_2', '12_3', '12_4', '12_5', '12_6', '12_7', '12_8', '11_3', '11_6', '11_8', '10_1', '10_10', '9_12', '8_13', '7_17', '7_18', '7_21', '6_1', '5_29', '4_2', '4_6', '4_24', '3_55', '3_56', '2_20', '2_59', '1_28', '1_90', '0_MAT-POS-bfd29aac-1', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_MAT-POS-b5746674-38', '0_VLA-UCB-00f2c2b3-7']
[[ 0 13  7 ...  6 13  7]
 [13  0 16 ... 11 13 17]
 [ 9 16  0 ... 11  9 13]
 ...
 [ 6 11 11 ...  0  6 11]
 [13 13  9 ...  6  0 10]
 [ 7 17 13 ... 11 10  0]]
[11 18  1 18 30 10  4  1  0  0 30  0  8 37 18 35  1 18  1 21 18  0 18 37
  0 18  8 18 23 21  0  0 18 18 18 34  0 18]
[21 19 16 15  9 12  8 19 21 19 19 21 21 13 17 14 16 15 19 13 15 18 16 14
 11 16 19 15 12 14 20 18 14 15 15 15 13 18]
0
Pairs [(0, 11)]
Singles []
Outliers []
Ignore [0, 11]
1
1 18 [ 1  3 14 17 20 22 25 27 32 33 34 37]
Pairs [(0, 11), (1, 18)]
Singles []
Outliers []
Ignore [0, 11, 1, 18]
2
2 

100%|██████████| 31/31 [00:00<00:00, 44.01it/s]


['13_0', '13_1', '13_2', '13_3', '13_4', '13_5', '13_6', '12_2', '12_3', '12_5', '11_3', '10_10', '9_12', '8_13', '7_17', '7_18', '6_1', '5_29', '4_2', '4_6', '3_56', '2_20', '2_59', '1_28', '1_90', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_MAT-POS-b5746674-38']
[[ 0 12 13 21 17 20  7  7  9 10 19  9  7  7  9  7 11  9 17  7  9 17  9  9
  11 16  9  9  7  6 13]
 [12  0 18 12  7 11 14 14 15  9 10 12 16 11 14 14  6 15  9 15 16  9 15  9
   6  9 14 15 14 11 12]
 [13 18  0 13  9 12 13 15 14  9 12 12 15 10 15 13  5 14  9 14 15  9 15  8
   5  9 14 14 13 10 13]
 [21 12 13  0 17 20  7  7  9 10 19  9  7  7  9  7 11  9 17  7  9 17  9  9
  11 16  9  9  7  6 13]
 [17  7  9 17  0 18  5  9  9 11 12  5  7  7  9  7 11  9 15  7  9 18  9  7
  13 18  9  9  7  6  9]
 [20 11 12 20 18  0  7  9  9 11 18  7  9  7  9  7 11  9 17  7  9 18  9  9
  12 17  9  9  7  6 12]
 [ 7 14 13  7  6  7  0 12 12  7  7 10 13 11 12 12  6 12  7 13 14 

100%|██████████| 25/25 [00:00<00:00, 65.54it/s]


['14_0', '14_1', '14_2', '14_3', '14_4', '13_4', '13_5', '13_6', '11_3', '10_10', '9_12', '8_13', '7_17', '6_1', '5_29', '4_2', '2_20', '2_59', '1_90', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_PET-UNK-8df914d1-4', '0_JAG-UCB-a3ef7265-18', '0_MAT-POS-b5746674-38']
[[ 0 12  9  9  7 17 20  7 19  9  7  7  9 11  9 17 17  9 11 16  9  9  7  6
  13]
 [12  0 13 12 13  7 11 13 10 11 15 10 13  6 14  9  9 15  6  9 14 14 13 10
  12]
 [ 9 13  0 15 13  9  9 12  6  9 14 11 16  6 15  6  9 15  6  9 14 15 14 11
   9]
 [ 9 12 15  0 12  9  9 10  5  9 13 10 15  5 15  6  9 14  5  9 14 15 13 10
   9]
 [ 7 13 13 12  0  7  7 12  6  9 15 12 13  6 13  7  7 13  7  7 12 13 14 11
   7]
 [17  7  9  9  7  0 18  5 12  5  7  7  9 11  9 15 18  9 13 18  9  9  7  6
   9]
 [20 11  9  9  7 18  0  7 18  7  9  7  9 11  9 17 18  9 12 17  9  9  7  6
  12]
 [ 7 13 12 10 12  6  7  0  7 10 13 11 12  6 12  7  7 13  6  7 12 12 12 11
   7]
 [19 10  6  5  6 12 18  7  0  9  6  6  6 13  8 17 15  8 12 1

100%|██████████| 21/21 [00:00<00:00, 82.95it/s]


['15_0', '15_1', '15_2', '15_3', '14_3', '14_4', '13_4', '13_6', '11_3', '10_10', '8_13', '6_1', '5_29', '4_2', '2_20', '2_59', '1_90', '0_BAR-COM-0f94fc3d-59', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_MAT-POS-b5746674-38']
[[ 0  9  9  6  9  7 17  7 18  7  7 11  9 17 17  9 11 16  9  9 12]
 [ 9  0 13 10 12 13  7 12  7  9 10  5 14  8  9 15  5  9 14 14  9]
 [ 9 13  0 11 15 13  9 12  6  9 11  6 15  6  9 15  6  9 14 15  9]
 [ 6 10 11  0 10 11  6 11  6  8 12  7 10  6  6 10  6  6  9 10  6]
 [ 9 12 15 10  0 12  9 10  5  9 10  5 15  6  9 14  5  9 14 15  9]
 [ 7 13 13 11 12  0  7 12  6  9 12  6 13  7  7 13  7  7 12 13  7]
 [17  7  9  6  9  7  0  5 12  5  7 11  9 15 18  9 13 18  9  9  9]
 [ 7 12 12 11 10 12  6  0  7 10 11  6 12  7  7 13  6  7 12 12  7]
 [18  7  6  6  5  6 12  7  0  9  6 13  8 17 15  8 12 15  8  8 12]
 [ 7  9  9  8  9  9  6 10  9  0  8  6 11  7  7 10  6  7 10 12  9]
 [ 7 10 11 12 10 12  7 11  6  8  0  7 10  7  7 10  7  6  9 10  6]
 [11  5  6  6  5  6 11  6 13  6  6  0  

100%|██████████| 17/17 [00:00<00:00, 121.91it/s]


['16_0', '16_1', '16_2', '16_3', '16_4', '15_2', '14_3', '14_4', '13_6', '10_10', '6_1', '5_29', '4_2', '1_90', '0_DAR-DIA-842b4336-13', '0_DAR-DIA-23aa0b97-6', '0_MAT-POS-b5746674-38']
[[ 0  7  6 12 12  6  5  6  7  7 11  8 15 10  8  8 11]
 [ 7  0 10  7  7 13 12 13 12  9  5 14  8  5 14 14  9]
 [ 6 10  0  6  6 11 10 11 11  8  6 10  6  6  9 10  6]
 [12  7  6  0 17  9  9  6  5  5 11  9 14 12  9  9  9]
 [12  7  6 17  0  9  9  7  5  5 11  9 14 12  9  9  9]
 [ 6 13 11  9  9  0 15 13 12  9  6 15  6  6 14 15  9]
 [ 5 12 10  9  9 15  0 12 10  9  5 15  6  5 14 15  9]
 [ 6 13 11  6  7 13 12  0 12  9  6 13  7  7 12 13  7]
 [ 7 12 11  6  6 12 10 12  0 10  6 12  7  6 12 12  7]
 [ 7  9  8  6  6  9  9  9 10  0  6 11  7  6 10 12  9]
 [11  5  6 11 11  6  5  6  6  6  0  5 13 12  5  5  6]
 [ 8 14 10  9  9 15 15 13 12 11  5  0  9  5 14 15  9]
 [15  8  6 14 14  6  6  7  7  7 13  9  0 14  9  9  9]
 [10  3  6 12 12  6  3  7  3  3 12  3 14  0  3  3  6]
 [ 8 14  9  9  9 14 14 12 12 10  5 14  9  5  0 14  9]
 [ 8

100%|██████████| 11/11 [00:00<00:00, 230.96it/s]


['17_0', '17_1', '17_2', '17_3', '14_4', '13_6', '10_10', '6_1', '5_29', '1_90', '0_DAR-DIA-842b4336-13']
[[ 0  7 12  5  6  7  7 11  8 10  8]
 [ 7  0  7 12 12 11  9  4 14  4 14]
 [12  7  0  9  6  5  5 11  9 11  9]
 [ 5 12  9  0 12 10  9  5 15  5 14]
 [ 6 12  6 12  0 12  9  6 13  7 12]
 [ 7 11  6 10 12  0 10  6 12  6 12]
 [ 7  9  6  9  9 10  0  6 11  6 10]
 [11  4 11  5  6  6  6  0  5 12  5]
 [ 8 14  9 15 13 12 11  5  0  5 14]
 [10  3 11  3  7  3  3 12  3  0  3]
 [ 8 14  9 14 12 12 10  5 14  5  0]]
[ 2 10  0  8  8 10  8  9  3  7  8]
[12 14 12 15 13 12 11 12 15 12 14]
0
Pairs [(0, 2)]
Singles []
Outliers []
Ignore [0, 2]
1
1 10 [1 5]
Pairs [(0, 2), (1, 10)]
Singles []
Outliers []
Ignore [0, 2, 1, 10]
2
2
3
3 8 [ 3  4  6 10]
Pairs [(0, 2), (1, 10), (3, 8)]
Singles []
Outliers []
Ignore [0, 2, 1, 10, 3, 8]
4
4 8 [ 3  4  6 10]
Pairs [(0, 2), (1, 10), (3, 8)]
Singles [4]
Outliers []
Ignore [0, 2, 1, 10, 3, 8]
5
5 10 [1 5]
Pairs [(0, 2), (1, 10), (3, 8)]
Singles [4, 5]
Outliers []
Ignore [0, 

100%|██████████| 6/6 [00:00<00:00, 495.90it/s]


['18_0', '18_1', '18_2', '18_3', '14_4', '13_6']
[[ 0  5  5 10  6  5]
 [ 5  0 12  4 12 11]
 [ 5 12  0  5 12 10]
 [10  3  5  0  6  6]
 [ 6 12 12  6  0 12]
 [ 6 11 10  6 12  0]]
[3 4 4 0 5 4]
[10 12 12 10 12 12]
0
1
1 4 [1 2 5]
Pairs [(1, 4)]
Singles []
Outliers [0]
Ignore [1, 4]
2
2 4 [1 2 5]
Pairs [(1, 4)]
Singles [2]
Outliers [0]
Ignore [1, 4]
3
4
4
5
5 4 [1 2 5]
Pairs [(1, 4)]
Singles [2, 5]
Outliers [0, 3]
Ignore [1, 4]
Pairs [(1, 4)]
Singles [2, 5]
Outliers [0, 3]
Ignore [1, 4]
New clusters
['18_1', '14_4']
Singles
['17_3', '5_29', '4_24', '0_VLA-UCB-00f2c2b3-7']
Outliers
['17_0', '17_2', '6_1', '1_90']


100%|██████████| 3/3 [00:00<00:00, 1505.85it/s]

['19_0', '18_2', '13_6']
[[ 0 11 10]
 [11  0 10]
 [10 10  0]]
[1 0 1]
[11 11 10]
0
1
2
Pairs []
Singles []
Outliers [0, 1, 2]
Ignore []
New clusters
[]
Singles
[]
Outliers
['18_1', '14_4', '17_3', '5_29', '4_24', '0_VLA-UCB-00f2c2b3-7']





In [14]:
len(clusters)

102

In [15]:
def get_descendents(cluster):
    """Collect the leaf clusters reachable from ``cluster``.

    A cluster is treated as a leaf when one of its ``children`` is a plain
    string (a molecule ID rather than a nested cluster). Non-string children are
    descended into recursively.

    Parameters
    ----------
    cluster
        Any object exposing a ``children`` iterable whose items are either
        strings or further cluster-like objects.

    Returns
    -------
    list
        Leaf cluster objects in depth-first, child order. A leaf is reported
        once even if it holds several string children.
    """
    descendents = []
    leaf_recorded = False
    for child in cluster.children:
        if isinstance(child, str):
            # Record the leaf only once; appending per string child would
            # duplicate the same cluster object in the result.
            if not leaf_recorded:
                descendents.append(cluster)
                leaf_recorded = True
        else:
            descendents.extend(get_descendents(child))
    return descendents

In [20]:
from harbor.plotting import ligands as l
reload(l)

<module 'harbor.plotting.ligands' from '/home/feanor/harbor/harbor/plotting/ligands.py'>

In [24]:
clusters.keys()

dict_keys(['0_MAK-UNK-6435e6c2-8', '0_AAR-POS-0daf6b7e-36', '0_AAR-POS-0daf6b7e-28', '0_AAR-POS-d2a4d1df-9', '0_AAR-POS-0daf6b7e-5', '0_AAR-POS-f650c5f2-2', '0_VIR-GIT-7b3d3065-2', '0_AAR-POS-d2a4d1df-8', '0_AAR-POS-0daf6b7e-39', '0_MAK-UNK-6435e6c2-7', '0_WAR-XCH-72a8c209-5', '0_AAR-POS-0daf6b7e-37', '0_AAR-POS-d2a4d1df-18', '0_AAR-POS-d2a4d1df-12', '0_AAR-POS-0daf6b7e-40', '0_MAT-POS-7dfc56d9-1', '0_AAR-POS-0daf6b7e-45', '0_NAU-LAT-445f63e5-6', '0_AAR-POS-d2a4d1df-5', '0_AAR-POS-0daf6b7e-20', '0_AAR-POS-0daf6b7e-22', '0_AAR-POS-0daf6b7e-35', '0_AAR-RCN-748c104b-1', '0_ALP-POS-c59291d4-5', '0_TAT-ENA-80bfd3e5-7', '0_AAR-POS-0daf6b7e-24', '0_AAR-POS-d2a4d1df-15', '0_TOB-UNK-c2aba166-1', '0_AAR-POS-d2a4d1df-17', '0_AAR-POS-f650c5f2-3', '0_JAG-UCB-cedd89ab-1', '0_AAR-POS-d2a4d1df-31', '0_AAR-POS-d2a4d1df-21', '0_JAG-UCB-cedd89ab-2', '0_AAR-POS-0daf6b7e-34', '0_AAR-POS-d2a4d1df-26', '0_AAR-POS-0daf6b7e-46', '1_14', '1_18', '1_74', '1_96', '1_132', '0_AAR-POS-d2a4d1df-6', '0_AAR-POS-0daf6b

In [26]:
len(mol_ids)

576

In [27]:
len(set(mol_ids))

554

In [21]:
# For every cluster: gather its leaf clusters, title each leaf's representative
# molecule with its molecule ID, and write one PNG showing the leaves against
# the cluster's own representative (passed as the MCS molecule).
ids_found = []
for cluster_id, cluster in clusters.items():
    print(f"Cluster {cluster_id}")
    descendents = get_descendents(cluster)
    print(f"Children: {len(descendents)}")
    # Compute the leaf IDs once; they are used for both titles and bookkeeping.
    leaf_ids = [desc.children[0] for desc in descendents]
    # Use loop-specific names so the notebook-level `mols` / `mol` (the RDKit
    # molecules loaded earlier) are not overwritten by this cell.
    cluster_mols = []
    for desc, leaf_id in zip(descendents, leaf_ids):
        leaf_mol = desc.repr
        leaf_mol.SetTitle(leaf_id)
        cluster_mols.append(leaf_mol)
    l.plot_ligands_with_mcs(filename=f"cluster_{cluster_id}.png", mols=cluster_mols, mcs_mol=cluster.repr, reference="largest")
    ids_found.extend(leaf_ids)

Cluster 0_MAK-UNK-6435e6c2-8
Children: 1
1 molecules to plot
['MAK-UNK-6435e6c2-8']
[11]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-0daf6b7e-36
Children: 1
1 molecules to plot
['AAR-POS-0daf6b7e-36']
[15]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-0daf6b7e-28
Children: 1
1 molecules to plot
['AAR-POS-0daf6b7e-28']
[15]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-d2a4d1df-9
Children: 1
1 molecules to plot
['AAR-POS-d2a4d1df-9']
[15]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-0daf6b7e-5
Children: 1
1 molecules to plot
['AAR-POS-0daf6b7e-5']
[18]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-f650c5f2-2
Children: 1
1 molecules to plot
['AAR-POS-f650c5f2-2']
[7]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_VIR-GIT-7b3d3065-2
Children: 1
1 molecules to plot
['VIR-GIT-7b3d3065-2']
[17]
Generating a figure with 1 rows and 1 columns
[]
Cluster 0_AAR-POS-d2a4d1df-8
Ch



1 4
1 5
1 6
2 1
2 2
2 3
2 4
2 5
2 6
3 1
3 2
3 3
3 4
3 5
3 6
4 1
4 2
4 3
4 4
4 5
4 6
5 1
5 2
5 3
5 4
5 5
5 6
6 1
6 2
6 3
Cluster 12_6
Children: 11
11 molecules to plot
['VLA-UCB-29506327-1', 'VLA-UCB-34f3ed0c-11', 'BEN-BAS-c2bc0d80-7', 'VLA-UCB-29506327-1', 'VLA-UCB-29506327-1', 'VLA-UCB-29506327-1', 'VLA-UCB-34f3ed0c-11', 'MAT-POS-8293a91a-8', 'ALP-POS-ce760d3f-8', 'ALP-POS-ce760d3f-8', 'LON-WEI-8f408cad-1']
[27 27 28 27 27 27 27 26 24 24 16]
Generating a figure with 3 rows and 4 columns
['VLA-UCB-29506327-1', 'VLA-UCB-34f3ed0c-11', 'VLA-UCB-29506327-1', 'VLA-UCB-29506327-1', 'VLA-UCB-29506327-1', 'VLA-UCB-34f3ed0c-11', 'MAT-POS-8293a91a-8', 'ALP-POS-ce760d3f-8', 'ALP-POS-ce760d3f-8', 'LON-WEI-8f408cad-1']
1 2
1 3
1 4
2 1
2 2
2 3
2 4
3 1
3 2
3 3
Cluster 3_55
Children: 2
2 molecules to plot
['AAR-POS-0daf6b7e-41', 'MAT-POS-86c60949-2']
[17 28]
Generating a figure with 1 rows and 2 columns
['AAR-POS-0daf6b7e-41']
1 2
Cluster 12_5
Children: 8
8 molecules to plot
['MAT-POS-590ac91e-18', 'M



1 5
1 6
1 7
1 8




1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 14
3 15
3 16
3 17
3 18
3 19
3 20
3 21
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
4 10
4 11
4 12
4 13
4 14
4 15
4 16
4 17
4 18
4 19
4 20
4 21
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 18
5 19
5 20
5 21
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 10
6 11
6 12
6 13
6 14
6 15
6 16
6 17
6 18
6 19
6 20
6 21
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
7 10
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 20
7 21
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
8 10
8 11
8 12
8 13
8 14
8 15
8 16
8 17
8 18
8 19
8 20
8 21
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
9 10
9 11
9 12
9 13
9 14
9 15
9 16
9 17
9 18
9 19
9 20
9 21
10 1
10 2
10 3
10 4
10 5
10 6
10 7
10 8
10 9
10 10
10 11
10 12
10 13
10 14
10 15
10 16
10 17
10 18
10 19
10 20
10 21
11 1
11 2
11 3
11 4
11 5
11 6
11 7
11 8
11 9
11 10


In [32]:
# Gather the leaf molecule ID of every descendant across all clusters.
# Repeats are kept so the following cells can compare the raw count with the
# number of unique IDs.
total_mols_found = []
for cluster_id, cluster in clusters.items():
    leaves = get_descendents(cluster)
    total_mols_found += [leaf.children[0] for leaf in leaves]

In [33]:
len(total_mols_found)

1176

In [36]:
len(set(total_mols_found))

554

In [18]:
set(ids_found)

{'AAR-POS-0daf6b7e-1',
 'AAR-POS-0daf6b7e-10',
 'AAR-POS-0daf6b7e-14',
 'AAR-POS-0daf6b7e-15',
 'AAR-POS-0daf6b7e-16',
 'AAR-POS-0daf6b7e-18',
 'AAR-POS-0daf6b7e-2',
 'AAR-POS-0daf6b7e-20',
 'AAR-POS-0daf6b7e-21',
 'AAR-POS-0daf6b7e-22',
 'AAR-POS-0daf6b7e-23',
 'AAR-POS-0daf6b7e-24',
 'AAR-POS-0daf6b7e-25',
 'AAR-POS-0daf6b7e-27',
 'AAR-POS-0daf6b7e-28',
 'AAR-POS-0daf6b7e-29',
 'AAR-POS-0daf6b7e-30',
 'AAR-POS-0daf6b7e-32',
 'AAR-POS-0daf6b7e-33',
 'AAR-POS-0daf6b7e-34',
 'AAR-POS-0daf6b7e-35',
 'AAR-POS-0daf6b7e-36',
 'AAR-POS-0daf6b7e-37',
 'AAR-POS-0daf6b7e-38',
 'AAR-POS-0daf6b7e-39',
 'AAR-POS-0daf6b7e-4',
 'AAR-POS-0daf6b7e-40',
 'AAR-POS-0daf6b7e-41',
 'AAR-POS-0daf6b7e-42',
 'AAR-POS-0daf6b7e-43',
 'AAR-POS-0daf6b7e-44',
 'AAR-POS-0daf6b7e-45',
 'AAR-POS-0daf6b7e-46',
 'AAR-POS-0daf6b7e-5',
 'AAR-POS-0daf6b7e-6',
 'AAR-POS-0daf6b7e-7',
 'AAR-POS-0daf6b7e-8',
 'AAR-POS-5507155c-1',
 'AAR-POS-5507155c-2',
 'AAR-POS-d2a4d1df-1',
 'AAR-POS-d2a4d1df-10',
 'AAR-POS-d2a4d1df-11',
 '

In [19]:
set(mol_ids) - set(ids_found)

set()

# Why are there still duplicates?

In [38]:
from collections import Counter

In [40]:
# Tally how many times each ID occurs in total_mols_found.
count_dict = Counter(total_mols_found)

In [54]:
# Keep only the (ID, count) entries whose count exceeds one, i.e. the repeated IDs.
bigger_than_one_count_dict = dict(
    entry for entry in count_dict.items() if entry[1] > 1
)

In [55]:
# How many IDs occur more than once.
len(bigger_than_one_count_dict)

229

In [56]:
# Pick one repeated ID (and its count) to investigate.
# Read the most recently inserted entry -- the same one popitem() would return --
# WITHOUT removing it: popitem() mutates the dict, so re-running this cell would
# silently switch to a different molecule and leave the length shown earlier stale.
_id, _count = list(bigger_than_one_count_dict.items())[-1]

In [67]:
# Number of times the selected ID (_id) occurs in total_mols_found.
_count

2

## Which clusters is this one in?

In [80]:
# The repeated ID being investigated.
_id

'TRY-UNI-714a760b-20'

In [81]:
# Map each top-level cluster_id to the list of IDs found among its descendants.
desc_id_dict = {
    cid: [leaf.children[0] for leaf in get_descendents(center)]
    for cid, center in clusters.items()
}

In [91]:
# cluster_ids whose descendant ID list contains the ID under investigation.
owners = [cid for cid in desc_id_dict if _id in desc_id_dict[cid]]

In [92]:
# cluster_ids from desc_id_dict whose ID lists contain _id.
owners

['18_2']

## Get descendants

In [95]:
# Start from the first owning cluster and find which of its direct children
# have _id among their descendants.
cluster0 = clusters[owners[0]]
previous_owners = [
    child.cluster_id
    for child in cluster0.children
    if _id in [leaf.children[0] for leaf in get_descendents(child)]
]

In [96]:
# cluster_ids of cluster0's direct children whose descendants include _id.
previous_owners

['5_29']

In [97]:
# Show the direct children of cluster0 (the repr is long).
cluster0.children

[ClusterCenter(cluster_id='17_3', children=[ClusterCenter(cluster_id='15_2', children=[ClusterCenter(cluster_id='14_2', children=[ClusterCenter(cluster_id='12_2', children=[ClusterCenter(cluster_id='11_2', children=[ClusterCenter(cluster_id='10_5', children=[ClusterCenter(cluster_id='9_5', children=[ClusterCenter(cluster_id='8_5', children=[ClusterCenter(cluster_id='7_6', children=[ClusterCenter(cluster_id='6_9', children=[ClusterCenter(cluster_id='5_13', children=[ClusterCenter(cluster_id='4_20', children=[ClusterCenter(cluster_id='3_31', children=[ClusterCenter(cluster_id='2_48', children=[ClusterCenter(cluster_id='1_81', children=[ClusterCenter(cluster_id='0_EDG-MED-0da5ad92-18', children=['EDG-MED-0da5ad92-18'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197dc6c0> >, height=0), ClusterCenter(cluster_id='0_JAN-GHE-83b26c96-22', children=['JAN-GHE-83b26c96-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197f61f0> >, h

In [102]:
# Step down into child 1 of cluster0, then list which of its direct children
# still have _id among their descendants.
cluster1 = cluster0.children[1]
previous_owners = [
    child.cluster_id
    for child in cluster1.children
    if _id in [leaf.children[0] for leaf in get_descendents(child)]
]
print(cluster1.children, previous_owners, sep="\n")

[ClusterCenter(cluster_id='3_35', children=[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >,

In [106]:
# Step down into child 0 of cluster1, then list which of its direct children
# still have _id among their descendants.
cluster2 = cluster1.children[0]
previous_owners = [
    child.cluster_id
    for child in cluster2.children
    if _id in [leaf.children[0] for leaf in get_descendents(child)]
]
print(cluster2.children, previous_owners, sep="\n")

[ClusterCenter(cluster_id='2_55', children=[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <

In [107]:
# Step down into child 0 of cluster2, then list which of its direct children
# still have _id among their descendants.
cluster3 = cluster2.children[0]
previous_owners = [
    child.cluster_id
    for child in cluster3.children
    if _id in [leaf.children[0] for leaf in get_descendents(child)]
]
print(cluster3.children, previous_owners, sep="\n")

[ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1), ClusterCenter(cluster_id='1_138', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-19', children=['TRY-UNI-714a760b-19'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196166a0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7

In [110]:
# First path below cluster3: child 0 -> child 0 -> entry 0.
cluster3.children[0].children[0].children[0]

'TRY-UNI-714a760b-20'

In [112]:
# Second path below cluster3: child 1 -> child 1 -> entry 0.
cluster3.children[1].children[1].children[0]

'TRY-UNI-714a760b-20'

In [116]:
# Count occurrences of every ID in the original input list (mol_ids), then look up
# the ID under investigation. A count of 1 means the repeat is not present in the
# input list itself.
# Look up _id rather than a pasted literal so this check always follows the
# molecule selected earlier in the notebook.
og_count_dict = Counter(mol_ids)
og_count_dict[_id]

1

In [118]:
# Inspect the first direct child of cluster3.
cluster3.children[0]

ClusterCenter(cluster_id='1_104', children=[ClusterCenter(cluster_id='0_TRY-UNI-714a760b-20', children=['TRY-UNI-714a760b-20'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad197df7e0> >, height=0), ClusterCenter(cluster_id='0_TRY-UNI-714a760b-22', children=['TRY-UNI-714a760b-22'], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad196148d0> >, height=0)], repr=<oechem.OEMol; proxy of <Swig Object of type 'OEMolWrapper *' at 0x7fad19527bd0> >, height=1)

In [119]:
# List of the original ("og") pairs, hard-coded in the next cell

In [120]:
# Hard-coded list of (int, int) tuples.
# NOTE(review): presumably index pairs into the molecule list, pasted from an
# earlier run -- confirm where these came from and regenerate them in code.
pairs = [(0, 100), (2, 427), (3, 135), (4, 19), (5, 33), (8, 367), (10, 110), (12, 358), (14, 120), (15, 494), (17, 575), (18, 280), (20, 512), (21, 22), (23, 240), (24, 27), (25, 515), (26, 521), (28, 529), (30, 201), (31, 121), (32, 281), (35, 123), (37, 260), (38, 56), (39, 278), (42, 524), (43, 46), (44, 528), (45, 84), (47, 229), (49, 534), (51, 149), (53, 179), (59, 81), (61, 377), (62, 385), (63, 174), (64, 371), (65, 549), (72, 104), (75, 255), (77, 88), (78, 382), (82, 410), (89, 458), (98, 533), (101, 376), (102, 193), (105, 522), (106, 164), (113, 392), (114, 547), (115, 449), (116, 345), (124, 446), (125, 348), (126, 321), (127, 402), (128, 39), (130, 235), (133, 347), (136, 542), (138, 567), (139, 231), (144, 18), (147, 571), (150, 157), (153, 434), (159, 535), (161, 217), (166, 543), (169, 113), (171, 180), (172, 384), (175, 177), (176, 296), (182, 190), (187, 511), (189, 466), (197, 419), (198, 133), (207, 527), (210, 546), (213, 304), (223, 305), (225, 237), (228, 351), (233, 573), (239, 303), (241, 259), (249, 496), (253, 562), (257, 372), (263, 413), (266, 431), (267, 399), (269, 313), (273, 381), (274, 415), (283, 460), (284, 532), (288, 412), (291, 505), (292, 435), (293, 423), (297, 561), (300, 342), (302, 363), (308, 482), (309, 545), (310, 501), (314, 484), (316, 456), (326, 574), (339, 273), (341, 368), (344, 380), (360, 187), (364, 375), (370, 520), (389, 25), (393, 498), (394, 480), (404, 554), (409, 176), (418, 572), (424, 455), (428, 452), (433, 439), (437, 166), (441, 472), (444, 326), (447, 468), (451, 552), (462, 565), (467, 297), (475, 550), (488, 292), (506, 293)]