In [1]:
import os
import sys
from scoring_functions import tanimoto
from utils import Variable, seq_to_smiles, unique
from rdkit import Chem
from rdkit.Chem import Draw
from scoring_functions import logP
import numpy as np
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from typing import List

In [12]:

import json
from scoring_functions import tanimoto, logP

In [2]:
# Reference molecule, written as a SMILES string. It is parsed with
# Chem.MolFromSmiles in pass_rate and passed to the tanimoto scorer in both
# score and pass_rate.
query_structure = "C1(S(N)(=O)=O)=CC=C(N2C(C3=CC=C(C)C=C3)=CC(C(F)(F)F)=N2)C=C1"

In [3]:

def generate(molecules, number_samples: int, seed) -> List[str]:
    """Draw a reproducible random sample, with replacement, from ``molecules``.

    Args:
        molecules: 1-D sequence of items to sample from (the notebook passes
            a list of SMILES strings).
        number_samples: Number of items to draw.
        seed: Seed for the random number generator; the same seed always
            produces the same sample.

    Returns:
        A list with ``number_samples`` entries converted to built-in Python
        types (plain ``str`` for string input).
    """
    # A private RandomState draws exactly what np.random.seed(seed) followed
    # by np.random.choice(...) would draw, but leaves NumPy's global random
    # state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    # .tolist() turns the np.str_ items into built-in str objects.
    return rng.choice(molecules, size=number_samples).tolist()

In [4]:
from utils import weighted_geometric_mean
def score(smi):
    """Score a SMILES string by combining logP, Tanimoto similarity and QED.

    Args:
        smi: SMILES string of the candidate molecule.

    Returns:
        0 if ``smi`` is empty or cannot be parsed by RDKit; otherwise the
        value returned by ``weighted_geometric_mean`` for the three component
        scores.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None or len(smi) == 0:
        return 0

    # Re-serialise through RDKit so the scoring functions receive RDKit's own
    # SMILES output for the parsed molecule.
    smi = Chem.MolToSmiles(mol)

    alpha = 10.0
    beta = 10.0
    gamma = 1.0

    score_qed = Descriptors.qed(mol)
    # Both scoring functions are called with a list of SMILES; take the
    # single result back out.
    score_tanimoto = np.array(tanimoto(query_structure)([smi]))[0]
    score_logp = np.array(logP()([smi]))[0]

    # NOTE(review): weights pair with values by position, i.e. alpha -> logP,
    # beta -> Tanimoto, gamma -> QED. Confirm that this pairing is intended.
    values = [score_logp, score_tanimoto, score_qed]
    weights = [alpha, beta, gamma]
    return weighted_geometric_mean(values, weights)



In [13]:
# Read the candidate pool: every line of the file, stripped of surrounding
# whitespace, becomes one entry.
dist_file = "data/ChEMBL_filtered.smi"

with open(dist_file, 'r') as smiles_file:
    smiles_list = [raw_line.strip() for raw_line in smiles_file]

# Seeds available for repeated runs; only `seed` below is used in this cell.
# Further candidates: 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97
seeds = [0, 1, 2, 3, 5, 7, 11, 13, 17, 19]
seed = 0
random_100 = generate(smiles_list, 100, seed)
  

In [11]:
# Results file for this query structure and seed, relative to the working
# directory.
# NOTE(review): the SMILES is embedded verbatim in the file name. SMILES may
# contain '/' and '\' (bond-direction markers), which would act as path
# separators; the current query contains neither.
output_file = f"random_sample100_{query_structure}_{seed}.json"

In [14]:
def pass_rate(smis, seed, output_file):
    """Compute similarity / logP / QED pass rates and store them as JSON.

    A molecule passes
      * similarity when its Tanimoto score against ``query_structure`` is
        above 0.6,
      * logP when the ``logP`` scoring function returns exactly 1.0,
      * QED when its QED is higher than the QED of ``query_structure``.

    The rates and the observed min/max similarity and QED are stored under
    the ``"random_sample100_pass_rates"`` key of ``output_file``. Other keys
    already present in that file are kept.

    Args:
        smis: Iterable of SMILES strings to evaluate.
        seed: Only printed, to label the run.
        output_file: Path of the JSON file to update. It is created when it
            does not exist yet.

    Raises:
        ValueError: If ``query_structure`` cannot be parsed by RDKit.
    """
    # Everything that depends only on the query is computed once instead of
    # once per molecule.
    mq = Chem.MolFromSmiles(query_structure)
    if mq is None:
        raise ValueError(
            f"query_structure is not a valid SMILES: {query_structure!r}"
        )
    qed_mq = Descriptors.qed(mq)
    similarity_fn = tanimoto(query_structure)
    logp_fn = logP()

    total = 0
    sim_pass = 0
    logP_pass = 0
    qed_pass = 0
    sim_min, sim_max = 1, 0
    qed_min, qed_max = 1, 0

    for smi in smis:
        total += 1
        m = Chem.MolFromSmiles(smi)
        if m is None:
            # Unparseable SMILES: still counted in the denominator, so it
            # fails every check, but it does not contribute to the bounds.
            continue

        # The scoring functions are called with a list of SMILES; extract
        # the single score as a plain float so comparisons and JSON output
        # work on scalars rather than 1-element arrays.
        score_tanimoto = float(np.array(similarity_fn([smi]))[0])
        score_logp = float(np.array(logp_fn([smi]))[0])
        qed_m = Descriptors.qed(m)

        sim_min = min(sim_min, score_tanimoto)
        sim_max = max(sim_max, score_tanimoto)
        qed_min = min(qed_min, qed_m)
        qed_max = max(qed_max, qed_m)

        if score_tanimoto > 0.6:
            sim_pass += 1
        # NOTE(review): exact float comparison; this relies on the logP
        # scoring function returning exactly 1.0 for a pass -- confirm.
        if score_logp == 1.0:
            logP_pass += 1
        if qed_m > qed_mq:
            qed_pass += 1

    print(f"seed:{seed}")

    def _rate(count):
        # Fraction of all inputs that passed; 0.0 when there were no inputs.
        return float(count / total) if total else 0.0

    # Merge into the existing results when the file is present; otherwise
    # start a fresh document.
    if os.path.exists(output_file):
        with open(output_file, "r") as infile:
            existing_data = json.load(infile)
    else:
        existing_data = {}

    existing_data["random_sample100_pass_rates"] = {
        "sim_pass_0.6": _rate(sim_pass),
        "sim_low_bound": float(sim_min),
        "sim_max_bound": float(sim_max),
        "logP_pass": _rate(logP_pass),
        "qed_pass": _rate(qed_pass),
        "qed_low_bound": float(qed_min),
        "qed_max_bound": float(qed_max),
    }
    with open(output_file, 'w') as outfile:
        json.dump(existing_data, outfile, indent=4)


In [15]:
def evaluation(smis, seed,output_file):
    """Score every SMILES in ``smis`` and write top-k summary metrics to JSON.

    The metrics are the best score (``avg_top1``), the mean of the 10 best
    scores (``avg_top10``) and the mean of the 100 best scores
    (``avg_top100``). They are printed and written under the ``"metrics"``
    key of ``output_file``.

    Args:
        smis: Iterable of SMILES strings; each one is passed to ``score``.
        seed: Not used by this function.
        output_file: Path of the JSON file to write. An existing file is
            overwritten.

    Raises:
        ValueError: If ``smis`` is empty.
    """
    scores = [score(smile) for smile in smis]
    if not scores:
        raise ValueError("evaluation() needs at least one SMILES string")

    ranked = sorted(scores, reverse=True)
    # float(...) keeps the values JSON-serialisable: score() returns the int
    # 0 for invalid molecules, and NumPy reductions over all-integer input
    # produce np.int64, which json.dump rejects.
    avg_top1 = float(ranked[0])
    avg_top10 = float(np.mean(ranked[:10]))
    # Mean of the 100 best scores; equals the mean of all scores when at
    # most 100 molecules are given.
    avg_top100 = float(np.mean(ranked[:100]))

    print(f'random sample 100 from chembl_filtered  '
                    f'avg_top1: {avg_top1:.3f} | '
                    f'avg_top10: {avg_top10:.3f} | '
                    f'avg_top100: {avg_top100:.3f} | '
        )

    data = {"metrics": {
                    "avg_top1": avg_top1,
                    "avg_top10": avg_top10,
                    "avg_top100": avg_top100,
                }}
    with open(output_file, 'w') as outfile:
        json.dump(data, outfile, indent=4)


In [16]:
# Score the sample and write avg_top1 / avg_top10 / avg_top100 to output_file.
# NOTE(review): the output saved with this cell is an AttributeError
# ('tanimoto' object has no attribute 'query_fp') preceded by
# "can't find this structure" -- check what tanimoto(...) expects as argument.
evaluation(random_100, seed, output_file)


can't find this structure


AttributeError: 'tanimoto' object has no attribute 'query_fp'

In [None]:
# Store the "random_sample100_pass_rates" entry for the same sample in
# output_file.
pass_rate(random_100, seed, output_file)