In [2]:
from random import sample, seed
import numpy as np 
import pandas as pd

#utility functions : prepare the data 
from model_fp_selection.lib.utils import get_morgan_fp, get_rdkit_fp, convert_to_float
from model_fp_selection.lib.utils import prepare_df_morgan, prepare_df_rdkit, swap_identical_ligands
from model_fp_selection.lib.utils import drop_duplicates, average_duplicates, calc_desc

# RDkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, DataStructs, Draw, rdMolDescriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.rdDepictor import Compute2DCoords

# Visualization
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20})

import mols2grid
import requests
import os

# SCScorer for synthetic accessibility
from scscore.scscore.standalone_model_numpy import SCScorer
np.bool = np.bool_ # Required to avoid an error when running the model

# Combinatorial library generation
import itertools
from itertools import combinations_with_replacement

from tqdm.auto import tqdm
import time

np.random.seed(42)
seed(42)

In [3]:
predictions_fp = pd.read_csv("./data/Prediction_5k_pubchem_fp_id.csv")
predictions = pd.read_csv("./data/Prediction_5k_pubchem_desc_id.csv")

predictions_no_perm = pd.read_csv("./data/Prediction_5k_pubchem_desc_no_perm.csv")

complexes = pd.read_csv('./data/top6_A_ligands_5k_compounds.csv')
complexes['MOL1'] = complexes.L1.apply(Chem.MolFromSmiles)
complexes['MOL2'] = complexes.L2.apply(Chem.MolFromSmiles)

complexes['MOL3'] = complexes.L3.apply(Chem.MolFromSmiles)
complexes['SMILES'] = complexes.L1 + '.' + complexes.L2 + '.' + complexes.L3
complexes['mol'] = complexes['SMILES'].apply(Chem.MolFromSmiles)

In [4]:
pIC50_fp = predictions_fp.pIC50.tolist()
variance_fp = [np.sqrt(x) for x in predictions_fp.Vars.tolist()]

pIC50 = predictions.pIC50.tolist()
variance = [np.sqrt(x) for x in predictions.Vars.tolist()]

pIC50_no_perm = predictions_no_perm.pIC50.tolist()
variance_no_perm = [np.sqrt(x) for x in predictions_no_perm.Vars.tolist()]

In [5]:
complexes['ID'] = complexes.index

In [6]:
merged = pd.merge(complexes, predictions, on='ID')

In [7]:
merged = calc_desc(merged)

In [8]:
merged = merged.sort_values(by=['pIC50'], ascending=False)

### Tanimoto distance

Mean Tanimoto distance across the original dataset (on 'Descriptors'): 0.78.

In [9]:
def tanimoto_distance(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_squared_a = np.dot(vector1, vector1)
    norm_squared_b = np.dot(vector2, vector2)
    
    tanimoto_dist = 1 - dot_product / (norm_squared_a + norm_squared_b - dot_product)
    
    return tanimoto_dist

In [25]:
best_dict = {}

for cutoff in [0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]:
    selected_id = [merged['ID'].values.tolist()[0]]
    i=0
    while i < len(merged) and len(selected_id) < 100:
        fp1 = merged['Descriptors'][i]
        ID = merged['ID'][i]
        dist_i_to_selected = []
        for j in selected_id:
            fp2 = merged['Descriptors'][j]
            dist = (tanimoto_distance(fp1, fp2))
            if dist >= cutoff:
                dist_i_to_selected.append(dist)
        if len(dist_i_to_selected) == len(selected_id):
            selected_id.append(ID)
        i+=1
    
    best = merged[merged['ID'].isin(selected_id)]
    best_dict[cutoff] = best
    
    print(f'cutoff = {cutoff} : {len(best)} best compounds, mean prediction : {np.mean(best.pIC50)}, predictions above 5 : {len(best[best.pIC50 >= 5])}, predictions above 6 : {len(best[best.pIC50 >= 6])}')
    

cutoff = 0.005 : 100 best compounds, mean prediction : 4.89538543961353, predictions above 5 : 40, predictions above 6 : 2
cutoff = 0.01 : 100 best compounds, mean prediction : 4.865831668247032, predictions above 5 : 38, predictions above 6 : 2
cutoff = 0.02 : 100 best compounds, mean prediction : 4.834645119594922, predictions above 5 : 35, predictions above 6 : 2
cutoff = 0.05 : 100 best compounds, mean prediction : 4.6936876760115185, predictions above 5 : 28, predictions above 6 : 2
cutoff = 0.1 : 95 best compounds, mean prediction : 4.538683598375391, predictions above 5 : 21, predictions above 6 : 2
cutoff = 0.2 : 71 best compounds, mean prediction : 4.480042613364808, predictions above 5 : 14, predictions above 6 : 1
cutoff = 0.5 : 42 best compounds, mean prediction : 4.455245482841404, predictions above 5 : 7, predictions above 6 : 1


In [43]:
for cutoff in [0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]:
    selected_id = [merged['ID'].values.tolist()[0]]
    i=0
    while i < len(merged) and len(selected_id) < 100:
        fp1 = merged['Desc3'][i]
        ID = merged['ID'][i]
        dist_i_to_selected = []
        for j in selected_id:
            fp2 = merged['Desc3'][j]
            dist = (tanimoto_distance(fp1, fp2))
            if dist >= cutoff:
                dist_i_to_selected.append(dist)
        if len(dist_i_to_selected) == len(selected_id):
            selected_id.append(ID)
        i+=1
    
    best = merged[merged['ID'].isin(selected_id)]
    print(f'cutoff = {cutoff} : {len(best)} best compounds, mean prediction : {np.mean(best.pIC50)}, predictions above 5 : {len(best[best.pIC50 >= 5])}, predictions above 6 : {len(best[best.pIC50 >= 6])}')

cutoff = 0.005 : 100 best compounds, mean prediction : 4.905180683937371, predictions above 5 : 39, predictions above 6 : 2
cutoff = 0.01 : 100 best compounds, mean prediction : 4.930002668864563, predictions above 5 : 41, predictions above 6 : 2
cutoff = 0.02 : 100 best compounds, mean prediction : 4.836684429967511, predictions above 5 : 34, predictions above 6 : 2
cutoff = 0.05 : 100 best compounds, mean prediction : 4.525694377906978, predictions above 5 : 20, predictions above 6 : 1
cutoff = 0.1 : 80 best compounds, mean prediction : 4.540829306047498, predictions above 5 : 18, predictions above 6 : 1
cutoff = 0.2 : 60 best compounds, mean prediction : 4.497974440712566, predictions above 5 : 12, predictions above 6 : 1
cutoff = 0.5 : 38 best compounds, mean prediction : 4.452315816363213, predictions above 5 : 7, predictions above 6 : 1


### Cosine distance

Mean cosine distance across the original dataset (on 'Descriptors') : 0.06.

In [46]:
from scipy.spatial.distance import cosine

for cutoff in [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005]:
    selected_id = [merged['ID'].values.tolist()[0]]
    i=0
    while i < len(merged) and len(selected_id) < 100:
        fp1 = merged['Descriptors'][i]
        ID = merged['ID'][i]
        dist_i_to_selected = []
        for j in selected_id:
            fp2 = merged['Descriptors'][j]
            dist = (cosine(fp1, fp2))
            if dist >= cutoff:
                dist_i_to_selected.append(dist)
        if len(dist_i_to_selected) == len(selected_id):
            selected_id.append(ID)
        i+=1
    
    best = merged[merged['ID'].isin(selected_id)]
    print(f'cutoff = {cutoff} : {len(best)} best compounds, mean prediction : {np.mean(best.pIC50)}, predictions above 5 : {len(best[best.pIC50 >= 5])}, predictions above 6 : {len(best[best.pIC50 >= 6])}')

cutoff = 0.0001 : 100 best compounds, mean prediction : 4.734489005365092, predictions above 5 : 28, predictions above 6 : 2
cutoff = 0.0002 : 100 best compounds, mean prediction : 4.747200785158244, predictions above 5 : 31, predictions above 6 : 2
cutoff = 0.0005 : 100 best compounds, mean prediction : 4.754148997919875, predictions above 5 : 27, predictions above 6 : 4
cutoff = 0.001 : 90 best compounds, mean prediction : 4.724542624340576, predictions above 5 : 22, predictions above 6 : 3
cutoff = 0.002 : 64 best compounds, mean prediction : 4.743723127088633, predictions above 5 : 15, predictions above 6 : 2
cutoff = 0.005 : 36 best compounds, mean prediction : 4.882685287146877, predictions above 5 : 12, predictions above 6 : 3


In [47]:
from scipy.spatial.distance import cosine

for cutoff in [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005]:
    selected_id = [merged['ID'].values.tolist()[0]]
    i=0
    while i < len(merged) and len(selected_id) < 100:
        fp1 = merged['Desc3'][i]
        ID = merged['ID'][i]
        dist_i_to_selected = []
        for j in selected_id:
            fp2 = merged['Desc3'][j]
            dist = (cosine(fp1, fp2))
            if dist >= cutoff:
                dist_i_to_selected.append(dist)
        if len(dist_i_to_selected) == len(selected_id):
            selected_id.append(ID)
        i+=1
    
    best = merged[merged['ID'].isin(selected_id)]
    print(f'cutoff = {cutoff} : {len(best)} best compounds, mean prediction : {np.mean(best.pIC50)}, predictions above 5 : {len(best[best.pIC50 >= 5])}, predictions above 6 : {len(best[best.pIC50 >= 6])}')

cutoff = 0.0001 : 100 best compounds, mean prediction : 4.973485559280231, predictions above 5 : 46, predictions above 6 : 7
cutoff = 0.0002 : 92 best compounds, mean prediction : 5.045606347240308, predictions above 5 : 45, predictions above 6 : 8
cutoff = 0.0005 : 49 best compounds, mean prediction : 5.110565420324681, predictions above 5 : 26, predictions above 6 : 6
cutoff = 0.001 : 30 best compounds, mean prediction : 5.314176321153314, predictions above 5 : 20, predictions above 6 : 5
cutoff = 0.002 : 17 best compounds, mean prediction : 5.233613022230476, predictions above 5 : 10, predictions above 6 : 2
cutoff = 0.005 : 8 best compounds, mean prediction : 5.531772075705236, predictions above 5 : 7, predictions above 6 : 1
