# Introduction

# Imports

In [1]:
from pathlib import Path
from harbor.plotting.ligands import plot_aligned_ligands

To avoid having this repo depend directly on the asapdiscovery repo, I've commented out the snippet below; we'll still use its test example from the COVID Moonshot molecules:
```
from asapdiscovery.data.testing.test_resources import fetch_test_file
mypath = fetch_test_file("Mpro_combined_labeled.sdf")
``` 

In [2]:
# Path to the SDF test file, given relative to the notebook's working directory.
# It has the same file name as the asapdiscovery test resource mentioned above.
mypath = Path("../data/Mpro_combined_labeled.sdf")

I'm copying this code from the asapdiscovery repo.
Once that is conda-installable, I'll make it a dependency of this repo and use its tools for loading molecules.

# Load Molecules

In [3]:
from rdkit import Chem
# Open the SDF with RDKit. SDMolSupplier takes a string path, hence str(mypath).
# The supplier is turned into a plain list in the next cell.
mols = Chem.SDMolSupplier(str(mypath))

In [4]:
# Materialise the supplier into a list so the molecules can be indexed and sliced.
# SDMolSupplier yields None for records RDKit cannot parse; drop those here so
# later cells never call RDKit functions on None.
mols = [mol for mol in mols if mol is not None]

In [5]:
import mols2grid

In [6]:
# Build the mols2grid view of the loaded molecules; it is rendered by the
# bare `grid` expression in the next cell.
grid = mols2grid.display(mols)

In [7]:
# Bare expression: the notebook displays the grid object as this cell's output.
grid

# MCSS-based Clustering

In [8]:
from harbor.clustering.hierarchical import ClusterResults, ClusterCenter, HeirarchicalClustering
from openeye import oechem

In [9]:
# Peek at the SD properties of the first molecule to see which fields exist
# (the "Compound_ID" field is used as the molecule identifier below).
mol: Chem.Mol = mols[0]
mol.GetPropsAsDict()

{'SMILES': 'ClC=1C=CC=2NCCC(C(=O)NC=3C=NC=C4C=CC=CC34)C2C1',
 'Dataset': 'Mpro-x12171_0A',
 'Compound_ID': 'ALP-POS-477dc5b7-2'}

In [10]:
# Convert the first 20 RDKit molecules to OpenEye molecules by round-tripping
# through SMILES. `oemols` and `mol_ids` are kept index-aligned: an ID is only
# recorded once its molecule has been built successfully.
oemols = []
mol_ids = []
for rdkit_mol in mols[:20]:
    smiles = Chem.MolToSmiles(rdkit_mol)
    properties = rdkit_mol.GetPropsAsDict()
    compound_id = properties["Compound_ID"]
    # Use a dedicated name so the RDKit `mol` from the previous cell is not
    # silently rebound to an OpenEye object.
    oemol = oechem.OEMol()
    # OESmilesToMol reports failure through its boolean return value; without
    # this check an unparsable SMILES would add an empty molecule to the list.
    if not oechem.OESmilesToMol(oemol, smiles):
        raise ValueError(
            f"OpenEye could not parse SMILES for {compound_id}: {smiles}"
        )
    oemols.append(oemol)
    mol_ids.append(compound_id)

In [11]:
from harbor.clustering import hierarchical as h
from importlib import reload
# Development convenience: re-execute the module so edits to hierarchical.py
# take effect without restarting the kernel. Names imported directly from the
# module in an earlier cell are not refreshed by this; access them via `h.`.
reload(h)

<module 'harbor.clustering.hierarchical' from '/home/feanor/harbor/harbor/clustering/hierarchical.py'>

In [12]:
# Build the clusterer from the reloaded module `h`, passing the OpenEye
# molecules and their matching IDs. "Heirarchical" is the class's actual
# spelling in harbor, not a typo in this cell.
clusterer = h.HeirarchicalClustering(molecules=oemols, mol_ids=mol_ids)

In [13]:
# Run the clustering. The result is used as a mapping of cluster ID -> cluster
# further down (it is iterated with `.items()`).
# NOTE(review): the meaning/units of `cutoff` and the stopping behaviour of
# `max_iterations` are not visible in this notebook - confirm against
# harbor.clustering.hierarchical. The call also prints verbose progress output.
clusters = clusterer.cluster(max_iterations=10, cutoff=15)

100%|██████████| 20/20 [00:03<00:00,  6.49it/s]


['0_ALP-POS-477dc5b7-2', '0_MAK-UNK-6435e6c2-8', '0_DAN-LON-a5fc619e-3', '0_EDJ-MED-76744c27-4', '0_MED-COV-4280ac29-31', '0_JOR-UNI-2fc98d0b-6', '0_AAR-POS-0daf6b7e-36', '0_NAU-LAT-8502cac5-6', '0_JAG-UCB-52b62a6f-11', '0_DUN-NEW-f8ce3686-23', '0_EDJ-MED-015fb6b4-2', '0_MAT-POS-fa06b69f-6', '0_LON-WEI-b8d98729-18', '0_BEN-DND-f2e727cd-5', '0_MAT-POS-c20a539d-4', '0_EDJ-MED-49816e9b-1', '0_PET-UNK-e44ffd04-1', '0_MAT-POS-fce787c2-6', '0_BEN-DND-93268d01-7', '0_DUN-NEW-f8ce3686-24']
[[ 0 10  8 23  9  9  7 14 21  9 23  7 11 23 21 15 12 21 11  9]
 [10  0  6 10  6  9  6  6 11  6 10  7 10 10 10  9  6 10 11  6]
 [ 8  6  0  8 16  9  9  8  8  7  9  5  7 11  8  7  8  8  7 16]
 [23 10 11  0 10  9  9 12 20 12 25 10 16 25 21 16 12 21 11 12]
 [ 9  4 16  9  0  9  9  7  7  7  9  5  6  9 10  6  7  9  7 18]
 [ 9  9  9  9  9  0  6  7  9  6  9  8  9  9  9 10  7  9  9  9]
 [ 7  6  9  7  9  6  0  7  7  7  7  5  7  9  7  6  7  7  7  9]
 [14  6  8 12  7  6  7  0 15  9 12  7  8 12 12  6 16 12  8  9]
 [20 11  

100%|██████████| 11/11 [00:00<00:00, 19.20it/s]


['1_0', '1_1', '1_2', '0_ALP-POS-477dc5b7-2', '0_DAN-LON-a5fc619e-3', '0_JAG-UCB-52b62a6f-11', '0_LON-WEI-b8d98729-18', '0_BEN-DND-f2e727cd-5', '0_MAT-POS-c20a539d-4', '0_EDJ-MED-49816e9b-1', '0_MAT-POS-fce787c2-6']
[[ 0 10 12 23 11 20 16 25 21 16 21]
 [ 8  0  7  8 16  7  7 10  8  6  8]
 [12  7  0 12  8 12  8 12 12  7 12]
 [23  8 12  0  8 21 11 23 21 15 21]
 [ 8 16  8  8  0  8  7 11  8  7  8]
 [19  7 12 20  8  0 16 19 17 14 17]
 [16  9  8 16 10 16  0 16 15 16 15]
 [24 10 12 23 11 20 16  0 21 16 21]
 [21  7 12 21  8 18 15 21  0 15 21]
 [16  6  7 15  7 14 16 16 15  0 15]
 [21  7 12 21  8 18 15 21 21 15  0]]
[ 7  4 10  7  1  3  9  0 10  7  8]
[25 16 12 23 16 20 16 24 21 16 21]
0
0 7 [0 3 9]
Pairs [(0, 7)]
Singles []
Outliers []
Ignore [0, 7]
1
Pairs [(0, 7), (1, 4)]
Singles []
Outliers []
Ignore [0, 7, 1, 4]
2
3
3 7 [0 3 9]
Pairs [(0, 7), (1, 4)]
Singles [3]
Outliers [2]
Ignore [0, 7, 1, 4]
4
4
5
6
Pairs [(0, 7), (1, 4)]
Singles [3, 5, 6]
Outliers [2]
Ignore [0, 7, 1, 4, 6]
7
7
8
8 10 [2 

100%|██████████| 7/7 [00:00<00:00, 18.08it/s]


['2_0', '2_1', '2_2', '0_ALP-POS-477dc5b7-2', '0_JAG-UCB-52b62a6f-11', '0_LON-WEI-b8d98729-18', '0_EDJ-MED-49816e9b-1']
[[ 0 10 21 23 20 16 16]
 [10  0  7  7  7  7  6]
 [21  7  0 21 18 15 15]
 [23  7 21  0 21 11 15]
 [19  7 17 20  0 16 14]
 [16  9 15 16 16  0 16]
 [16  6 15 15 14 16  0]]
[3 0 3 0 3 6 5]
[23 10 21 23 20 16 16]
0
0 3 [0 2 4]
Pairs [(0, 3)]
Singles []
Outliers []
Ignore [0, 3]
1
2
2 3 [0 2 4]
Pairs [(0, 3)]
Singles [2]
Outliers [1]
Ignore [0, 3]
3
3
4
4 3 [0 2 4]
Pairs [(0, 3)]
Singles [2, 4]
Outliers [1]
Ignore [0, 3]
5
Pairs [(0, 3), (5, 6)]
Singles [2, 4]
Outliers [1]
Ignore [0, 3, 5, 6]
6
6
Pairs [(0, 3), (5, 6)]
Singles [2, 4]
Outliers [1]
Ignore [0, 3, 5, 6]
New clusters
['2_0', '0_ALP-POS-477dc5b7-2', '0_LON-WEI-b8d98729-18', '0_EDJ-MED-49816e9b-1']
Singles
['0_MAT-POS-c20a539d-4', '0_MAT-POS-fce787c2-6', 'JAG-UCB-52b62a6f-11']
Outliers
['1_1', '0_DAN-LON-a5fc619e-3']


100%|██████████| 4/4 [00:00<00:00, 77.85it/s]


['3_0', '3_1', '2_2', '0_JAG-UCB-52b62a6f-11']
[[ 0 15 21 20]
 [15  0 15 14]
 [21 15  0 18]
 [19 14 17  0]]
[2 2 0 0]
[21 15 21 19]
0
0 2 [0 1]
Pairs [(0, 2)]
Singles []
Outliers []
Ignore [0, 2]
1
1 2 [0 1]
Pairs [(0, 2)]
Singles [1]
Outliers []
Ignore [0, 2]
2
2
3
3 0 [2 3]
Pairs [(0, 2)]
Singles [1, 3]
Outliers []
Ignore [0, 2]
Pairs [(0, 2)]
Singles [1, 3]
Outliers []
Ignore [0, 2]
New clusters
['3_0', '2_2']
Singles
['0_LON-WEI-b8d98729-18', '0_EDJ-MED-49816e9b-1', 'JAG-UCB-52b62a6f-11']
Outliers
[]


100%|██████████| 3/3 [00:00<00:00, 129.62it/s]


['4_0', '3_1', '0_JAG-UCB-52b62a6f-11']
[[ 0 15 18]
 [15  0 14]
 [17 14  0]]
[2 0 0]
[18 15 17]
0
Pairs [(0, 2)]
Singles []
Outliers []
Ignore [0, 2]
1
1 0 [1 2]
Pairs [(0, 2)]
Singles [1]
Outliers []
Ignore [0, 2]
2
2
Pairs [(0, 2)]
Singles [1]
Outliers []
Ignore [0, 2]
New clusters
['4_0', '0_JAG-UCB-52b62a6f-11']
Singles
['0_LON-WEI-b8d98729-18', '0_EDJ-MED-49816e9b-1']
Outliers
[]


100%|██████████| 2/2 [00:00<00:00, 693.10it/s]

['5_0', '3_1']
[[ 0 14]
 [14  0]]
[1 0]
[14 14]
0
1
Pairs []
Singles []
Outliers [0, 1]
Ignore []
New clusters
[]
Singles
[]
Outliers
['4_0', '0_JAG-UCB-52b62a6f-11', '0_LON-WEI-b8d98729-18', '0_EDJ-MED-49816e9b-1']





In [14]:
# Number of entries in the returned cluster mapping.
len(clusters)

10

In [15]:
def get_descendents(cluster):
    """Collect the clusters under ``cluster`` that directly hold string children.

    The hierarchy is walked depth-first, visiting ``children`` in order.
    Whenever a child is a ``str``, the cluster that owns it is added to the
    result (once per string child). Any non-string child is treated as a
    nested cluster and descended into.

    Parameters
    ----------
    cluster
        Object exposing a ``children`` iterable of strings and/or nested
        objects with the same shape.

    Returns
    -------
    list
        The owning clusters, in traversal order.
    """
    found = []

    def _walk(node):
        # Visit children in their stored order so the output order is stable.
        for member in node.children:
            if not isinstance(member, str):
                _walk(member)
            else:
                found.append(node)

    _walk(cluster)
    return found

In [25]:
from harbor.plotting import ligands as l
# Development convenience: re-execute the plotting module so local edits to
# ligands.py are picked up without restarting the kernel (`reload` was
# imported in an earlier cell).
reload(l)

<module 'harbor.plotting.ligands' from '/home/feanor/harbor/harbor/plotting/ligands.py'>

In [26]:
# For every cluster: report it, gather the clusters beneath it that directly
# hold molecule IDs, hand them to the plotting helper together with the
# cluster's own representative, and remember which IDs were reached.
ids_found = []
for cluster_id, cluster in clusters.items():
    print(f"Cluster {cluster_id}")
    descendents = get_descendents(cluster)
    print(f"Children: {len(descendents)}")

    leaf_reprs = [leaf.repr for leaf in descendents]
    l.plot_ligands_with_mcs(
        filename=f"cluster_{cluster_id}.png",
        mols=leaf_reprs,
        mcs_mol=cluster.repr,
    )

    # The first child of each collected cluster is taken as its ID.
    ids_found += [leaf.children[0] for leaf in descendents]

Cluster 0_MAK-UNK-6435e6c2-8
Children: 1
1 1
Cluster 0_JOR-UNI-2fc98d0b-6
Children: 1
1 1
Cluster 0_AAR-POS-0daf6b7e-36
Children: 1
1 1
Cluster 0_DUN-NEW-f8ce3686-23
Children: 1
1 1
Cluster 0_MAT-POS-fa06b69f-6
Children: 1
1 1
Cluster 0_BEN-DND-93268d01-7
Children: 1
1 1
Cluster 1_2
Children: 2
1 2
1 2
Cluster 2_1
Children: 3
1 3
1 2
1 3
Cluster 5_0
Children: 7
2 4
1 2
1 3
1 4
2 1
2 2
2 3
Cluster 3_1
Children: 2
1 2
1 2


In [18]:
# Unique IDs reached while walking the clusters in the plotting loop.
set(ids_found)

{'AAR-POS-0daf6b7e-36',
 'ALP-POS-477dc5b7-2',
 'BEN-DND-93268d01-7',
 'BEN-DND-f2e727cd-5',
 'DAN-LON-a5fc619e-3',
 'DUN-NEW-f8ce3686-23',
 'DUN-NEW-f8ce3686-24',
 'EDJ-MED-015fb6b4-2',
 'EDJ-MED-49816e9b-1',
 'EDJ-MED-76744c27-4',
 'JAG-UCB-52b62a6f-11',
 'JOR-UNI-2fc98d0b-6',
 'LON-WEI-b8d98729-18',
 'MAK-UNK-6435e6c2-8',
 'MAT-POS-c20a539d-4',
 'MAT-POS-fa06b69f-6',
 'MAT-POS-fce787c2-6',
 'MED-COV-4280ac29-31',
 'NAU-LAT-8502cac5-6',
 'PET-UNK-e44ffd04-1'}

In [19]:
# Sanity check: input IDs that were not reached through any cluster.
# An empty set means every input molecule was accounted for.
set(mol_ids) - set(ids_found)

set()

In [20]:
def get_row_col(i, max_cols, zero_indexed=True):
    """Map a flat index onto a (row, col) position in a grid filled row by row.

    Parameters
    ----------
    i
        Flat, zero-based position of the item.
    max_cols
        Number of columns per row.
    zero_indexed
        When true (the default) the returned row and column start at 0;
        otherwise both start at 1.

    Returns
    -------
    tuple
        ``(row, col)`` for the item.
    """
    # divmod yields the quotient (row) and remainder (column) in one step.
    row, col = divmod(i, max_cols)
    shift = 0 if zero_indexed else 1
    return row + shift, col + shift