In [1]:
import os
import sys
from scoring_functions import tanimoto
from utils import Variable, seq_to_smiles, unique
from rdkit import Chem
from rdkit.Chem import Draw
from scoring_functions import logP
import numpy as np
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from typing import List

In [2]:

import json
from scoring_functions import tanimoto, logP

In [3]:
# SMILES of the reference molecule. pass_rate() parses it and uses its QED as the
# bar a sampled molecule has to beat to count towards `qed_pass`.
# NOTE(review): presumably this is the structure behind the "Celebrex" name given
# to the tanimoto scorer — confirm that both refer to the same molecule.
query_structure = "C1(S(N)(=O)=O)=CC=C(N2C(C3=CC=C(C)C=C3)=CC(C(F)(F)F)=N2)C=C1"

In [4]:

def generate(molecules, number_samples: int, seed) -> List[str]:
    """Draw ``number_samples`` entries from ``molecules`` uniformly at random.

    Sampling uses the defaults of ``choice``, i.e. it is done with replacement,
    so the result may contain duplicates.

    Args:
        molecules: 1-D sequence to sample from (SMILES strings in this notebook).
        number_samples: Number of entries to draw.
        seed: Seed of the random generator; the same seed always yields the
            same sample for the same ``molecules``.

    Returns:
        A list holding the ``number_samples`` sampled entries.
    """
    # A private RandomState yields exactly the draws that seeding the global
    # NumPy generator would, but does not overwrite the global random state
    # that other cells may rely on.
    rng = np.random.RandomState(seed)
    return list(rng.choice(molecules, size=number_samples))

In [15]:
from utils import weighted_geometric_mean
def score(smi):
    """Combine logP, Tanimoto similarity and QED into one score for a SMILES.

    The SMILES is parsed, written back with ``Chem.MolToSmiles`` and that
    string is handed to the ``tanimoto("Celebrex")`` and ``logP()`` scorers;
    QED is computed on the parsed molecule. The three numbers are passed to
    ``weighted_geometric_mean`` as ``[logP, tanimoto, QED]`` together with the
    weights ``[10.0, 10.0, 1.0]``.

    NOTE(review): presumably weights and values are paired by position
    (logP 10, similarity 10, QED 1) — confirm against weighted_geometric_mean
    and against the intended weighting.

    Args:
        smi: SMILES string of the molecule to score.

    Returns:
        ``0`` if the SMILES is empty or cannot be parsed, otherwise whatever
        ``weighted_geometric_mean`` returns.
    """
    parsed = Chem.MolFromSmiles(smi)
    # Guard clause: unparsable or empty input gets the worst score.
    if parsed is None or len(smi) == 0:
        return 0

    rdkit_smi = Chem.MolToSmiles(parsed)

    qed_value = Descriptors.qed(parsed)

    # Each scorer takes a list of SMILES; only the first (sole) entry is needed.
    similarity_scorer = tanimoto("Celebrex")
    similarity_value = np.array(similarity_scorer([rdkit_smi]))[0]

    logp_scorer = logP()
    logp_value = np.array(logp_scorer([rdkit_smi]))[0]

    component_values = [logp_value, similarity_value, qed_value]
    component_weights = [10.0, 10.0, 1.0]
    return weighted_geometric_mean(component_values, component_weights)



In [6]:
# Molecule pool: one entry per line of the file, surrounding whitespace removed.
dist_file = "data/ChEMBL_filtered.smi"

with open(dist_file, 'r') as smiles_file:
    # Iterating the handle yields the same lines as readlines().
    smiles_list = [raw_line.strip() for raw_line in smiles_file]


  

In [18]:
def pass_rate(smis, seed, output_file):
    """Report how many molecules clear the similarity, logP and QED checks.

    For every SMILES in ``smis`` three checks are made:

    * similarity: the ``tanimoto("Celebrex")`` score is above 0.6;
    * logP: the ``logP()`` score is exactly 1.0;
    * QED: the molecule's QED is higher than the QED of ``query_structure``.

    The pass fractions (relative to ``len(smis)``) and the smallest/largest
    similarity and QED values seen are printed and stored under the
    ``"random_sample100_pass_rates"`` key of the JSON document in
    ``output_file``. Keys already present in that file are kept.

    A SMILES that is empty or cannot be parsed (the same rule ``score()``
    uses) fails all three checks and does not contribute to the bounds.

    Args:
        smis: Iterable of SMILES strings to analyse.
        seed: Only used to label the printed report.
        output_file: Path of the JSON file to update. It is created when it
            does not exist yet.
    """
    smis = list(smis)
    total = len(smis)

    # Loop-invariant work, done once instead of once per molecule.
    qed_mq = Descriptors.qed(Chem.MolFromSmiles(query_structure))
    # NOTE(review): assumes the scorers keep no state between calls, so one
    # instance can score every molecule — confirm against scoring_functions.
    similarity_scorer = tanimoto("Celebrex")
    logp_scorer = logP()

    sim_pass = 0
    logP_pass = 0
    qed_pass = 0
    sim_min, sim_max = 1, 0
    qed_min, qed_max = 1, 0

    for smi in smis:
        m = Chem.MolFromSmiles(smi)
        if m is None or len(smi) == 0:
            # Invalid input: counted in the denominator, never as a pass.
            continue

        # The scorers return one value per input SMILES; take the scalar so
        # comparisons and float() do not operate on 1-element arrays.
        score_tanimoto = np.array(similarity_scorer([smi]))[0]
        score_logp = np.array(logp_scorer([smi]))[0]
        qed_m = Descriptors.qed(m)

        sim_min = min(sim_min, score_tanimoto)
        sim_max = max(sim_max, score_tanimoto)
        qed_min = min(qed_min, qed_m)
        qed_max = max(qed_max, qed_m)

        if score_tanimoto > 0.6:
            sim_pass += 1
        if score_logp == 1.0:
            logP_pass += 1
        if qed_m > qed_mq:
            qed_pass += 1

    def _rate(count):
        # Fraction of all supplied molecules; 0.0 for an empty input.
        return count / total if total else 0.0

    results = {
        "sim_pass_0.6": float(_rate(sim_pass)),
        "sim_low_bound": float(sim_min),
        "sim_max_bound": float(sim_max),
        "logP_pass": float(_rate(logP_pass)),
        "qed_pass": float(_rate(qed_pass)),
        "qed_low_bound": float(qed_min),
        "qed_max_bound": float(qed_max),
    }

    print(f"seed:{seed}")
    print(f'random sample 100 from chembl_filtered  '
          f'sim_pass_0.6: {results["sim_pass_0.6"]:.3f} | '
          f'sim_low_bound: {results["sim_low_bound"]:.3f} | '
          f'sim_max_bound: {results["sim_max_bound"]:.3f} | '
          f'logP_pass: {results["logP_pass"]:.3f} | '
          f'qed_pass: {results["qed_pass"]:.3f} | '
          f'qed_low_bound: {results["qed_low_bound"]:.3f} | '
          f'qed_max_bound: {results["qed_max_bound"]:.3f} | '
          )

    # Merge into the existing document; start a fresh one if there is none.
    try:
        with open(output_file, "r") as infile:
            existing_data = json.load(infile)
    except FileNotFoundError:
        existing_data = {}
    existing_data["random_sample100_pass_rates"] = results
    with open(output_file, 'w') as outfile:
        json.dump(existing_data, outfile, indent=4)


In [9]:
def evaluation(smis, seed,output_file):
    """Score a set of molecules and write summary metrics to a JSON file.

    Every SMILES is scored with ``score()``. Three numbers are printed and
    written to ``output_file`` under the ``"metrics"`` key:

    * ``avg_top1``: the best score;
    * ``avg_top10``: the mean of the ten best scores;
    * ``avg_top100``: the mean of all supplied scores (the top-100 mean when
      exactly 100 molecules are passed, as the callers in this notebook do).

    ``output_file`` is overwritten; its parent directory is created if needed.

    Args:
        smis: Iterable of SMILES strings to score.
        seed: Not used by this function; kept so the signature matches
            ``pass_rate``.
        output_file: Path of the JSON file to write.

    Raises:
        ValueError: If ``smis`` is empty.
    """
    scores = [score(smile) for smile in smis]
    if not scores:
        raise ValueError("evaluation() needs at least one SMILES to score")

    # float(): score() returns the int 0 for invalid molecules, so an all-zero
    # batch would otherwise produce a NumPy integer that json.dump rejects.
    avg_top1 = float(np.max(scores))
    avg_top10 = float(np.mean(sorted(scores, reverse=True)[:10]))
    avg_top100 = float(np.mean(scores))

    print(f'random sample 100 from chembl_filtered  '
          f'avg_top1: {avg_top1:.3f} | '
          f'avg_top10: {avg_top10:.3f} | '
          f'avg_top100: {avg_top100:.3f} | '
          )

    data = {"metrics": {
        "avg_top1": avg_top1,
        "avg_top10": avg_top10,
        "avg_top100": avg_top100,
    }}

    # Make sure the target directory exists on a fresh checkout.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_file, 'w') as outfile:
        json.dump(data, outfile, indent=4)


In [24]:
# Random baseline: for every seed draw 100 molecules from the pool, then write
# the score metrics and the pass rates for that draw to one JSON file per seed.
# Further seeds that were tried: 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67,
# 71, 73, 79, 83, 89, 97.
seeds = [0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
for seed in seeds:
    random_100 = generate(smiles_list, 100, seed)
    output_file = os.path.join(f'results/random_sample100_Celebrex_{seed}.json')
    # evaluation() creates the file, pass_rate() then adds its section to it.
    evaluation(random_100, seed, output_file)
    pass_rate(random_100, seed, output_file)


random sample 100 from chembl_filtered  avg_top1: 0.745 | avg_top10: 0.688 | avg_top100: 0.257 | 
seed:0
random sample 100 from chembl_filtered  sim_pass_0.6: 0.000 | sim_low_bound: 0.057 | sim_max_bound: 0.564 | logP_pass: 0.450 | qed_pass: 0.210 | qed_low_bound: 0.041 | qed_max_bound: 0.943 | 
random sample 100 from chembl_filtered  avg_top1: 0.724 | avg_top10: 0.672 | avg_top100: 0.261 | 
seed:1
random sample 100 from chembl_filtered  sim_pass_0.6: 0.010 | sim_low_bound: 0.044 | sim_max_bound: 0.608 | logP_pass: 0.470 | qed_pass: 0.240 | qed_low_bound: 0.166 | qed_max_bound: 0.912 | 
random sample 100 from chembl_filtered  avg_top1: 0.749 | avg_top10: 0.665 | avg_top100: 0.199 | 
seed:2
random sample 100 from chembl_filtered  sim_pass_0.6: 0.000 | sim_low_bound: 0.046 | sim_max_bound: 0.586 | logP_pass: 0.340 | qed_pass: 0.250 | qed_low_bound: 0.171 | qed_max_bound: 0.929 | 
random sample 100 from chembl_filtered  avg_top1: 0.752 | avg_top10: 0.689 | avg_top100: 0.231 | 
seed:3
rand

In [27]:
# Reference run: score every molecule in the pool and evaluate the 100 best.
output_file = os.path.join('results/best100_' +  "Celebrex" + "_" + ".json")
smi_scores = []
for smi in smiles_list:
    smi_scores.append((smi, score(smi)))
# sorted() is stable, so molecules with equal scores keep their pool order.
best100 = sorted(smi_scores, key=lambda x: x[1], reverse=True)[:100]
beat_smi_100 = [smi for smi, _ in best100]

# This selection involves no randomness, so no seed applies. Passing None
# avoids depending on the `seed` variable left over from the sampling loop,
# which is undefined on a fresh kernel and would mislabel this report.
evaluation(beat_smi_100, None, output_file)
pass_rate(beat_smi_100, None, output_file)

random sample 100 from chembl_filtered  avg_top1: 0.990 | avg_top10: 0.988 | avg_top100: 0.958 | 
seed:19
random sample 100 from chembl_filtered  sim_pass_0.6: 1.000 | sim_low_bound: 0.840 | sim_max_bound: 1.000 | logP_pass: 1.000 | qed_pass: 0.340 | qed_low_bound: 0.359 | qed_max_bound: 0.807 | 
