In [1]:
import os
import pandas as pd
import numpy as np
from rdkit import DataStructs
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.inchi import MolToInchiKey

# Root data folder: the cells below list it for "results_*" entries and
# read/write CSV files under its "ProcessedResults" subfolder.
DATA_PATH="../data/ReinventResults"

In [None]:
def process_results(file, batchnum=None, original_file="../data/OriginalData/series4_allsmiles.csv"):
    """Turn one batch of raw results into a cleaned, annotated table.

    Reads ``file`` (a CSV with at least the columns ``SMILES``, ``Scaffold``
    and ``total_score``), adds an ``eosID``, an ``InchiKey``, a ``MolWeight``
    and a ``Batch`` column, renames ``total_score`` to ``TotalScore`` and
    removes every row whose InChIKey also occurs in the reference set stored
    in ``original_file`` (column ``canonical``).

    Parameters
    ----------
    file : str
        Path of the CSV file to process.
    batchnum : int or str, optional
        Batch identifier used in the ``eosID`` ("EOS-<batch>-<00000>") and in
        the ``Batch`` column. When omitted, the module-level variable
        ``batchnum`` is used, which is how the function has historically been
        called from the notebook.
    original_file : str, optional
        CSV holding the reference SMILES in a ``canonical`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``eosID, Batch, SMILES, InchiKey, Scaffold, MolWeight,
        TotalScore``; rows shared with the reference set are dropped.

    Raises
    ------
    NameError
        If ``batchnum`` is neither passed nor defined at module level.
    """
    if batchnum is None:
        # Backward compatibility: older callers set a global instead of
        # passing the batch number explicitly.
        try:
            batchnum = globals()["batchnum"]
        except KeyError:
            raise NameError(
                "batchnum is not defined: pass it to process_results() "
                "or set a module-level 'batchnum' first"
            ) from None

    df = pd.read_csv(file)

    # create eos_id: "EOS-<batch>-<zero-padded row number>"
    df["eosID"] = [f"EOS-{batchnum}-{i:05}" for i in range(len(df))]

    # Parse every SMILES once and reuse the molecule for both descriptors.
    # NOTE(review): RDKit's MolFromSmiles is documented to return None for
    # SMILES it cannot parse, in which case the calls below raise; this
    # assumes the input only holds valid SMILES - confirm.
    mols = [MolFromSmiles(smi) for smi in df["SMILES"]]

    # add inchikey identifier
    df["InchiKey"] = [MolToInchiKey(mol) for mol in mols]

    # add molecular weight
    df["MolWeight"] = [MolWt(mol) for mol in mols]

    # change total_score style
    df = df.rename(columns={"total_score": "TotalScore"})

    # add batch of generation
    df["Batch"] = batchnum

    # select columns
    df = df[["eosID", "Batch", "SMILES", "InchiKey", "Scaffold", "MolWeight", "TotalScore"]]

    # check if there are compounds from the reference set; its SMILES have to
    # be converted to InChIKeys before they can be compared
    original = pd.read_csv(original_file)
    original_inchikeys = {
        MolToInchiKey(MolFromSmiles(smi)) for smi in original["canonical"]
    }

    in_original = df["InchiKey"].isin(original_inchikeys)
    duplicates = set(df.loc[in_original, "InchiKey"])  # distinct shared keys
    if not duplicates:
        print("there are no repeated Inchikeys from original dataset")
        return df

    print(str(len(duplicates))+" Inchikeys will be eliminated from processed results{}".format(str(batchnum)))
    return df[~in_original]

In [None]:
# Process every "results_<batch>" entry found in DATA_PATH and write one
# "processed<batch>.csv" file per batch into DATA_PATH/ProcessedResults.
result_entries = sorted(os.listdir(DATA_PATH))  # sorted: reproducible processing order

# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists (done after listing DATA_PATH so a wrong DATA_PATH still fails).
processed_dir = os.path.join(DATA_PATH, "ProcessedResults")
os.makedirs(processed_dir, exist_ok=True)

for results in result_entries:
    if "results_" in results:
        # The batch number is the text after "results_"; it is kept in the
        # module-level name `batchnum`, which process_results() relies on.
        batchnum=int(results.split("results_")[-1])
        results_file=os.path.join(DATA_PATH, results, "scaffold_memory.csv")
        df=process_results(results_file)
        df.to_csv(os.path.join(processed_dir, "processed{}.csv".format(batchnum)),index=False)

In [None]:
# Pool all "processed<batch>.csv" files into one table, drop molecules that
# appear in more than one batch and save the result as "all_batches.csv".
processed_dir = os.path.join(DATA_PATH, "ProcessedResults")

all_batches_dict={}
for file in os.listdir(processed_dir):
    if "processed" in file:
        # Batch number = text between "processed" and the file extension.
        # splitext removes the extension as a unit, whereas str.strip(".csv")
        # strips any of the characters ".", "c", "s", "v" from both ends.
        batchnum=file.split("processed")[-1]
        batchnum=int(os.path.splitext(batchnum)[0])
        df=pd.read_csv(os.path.join(processed_dir, file))
        all_batches_dict[batchnum]=df

if not all_batches_dict:
    # pd.concat would raise a ValueError on an empty list anyway; say why.
    raise ValueError("no processed*.csv files found in " + processed_dir)

# Concatenate in ascending batch order: drop_duplicates keeps the first
# occurrence, so the surviving row must not depend on os.listdir order.
all_batches_df = pd.concat(
    [all_batches_dict[key] for key in sorted(all_batches_dict)],
    ignore_index=True,
)
all_batches_df=all_batches_df.drop_duplicates(subset="InchiKey")
all_batches_df.to_csv(os.path.join(processed_dir, "all_batches.csv"), index=False)

### Calculate Tanimoto Similarity to series 4 compounds
Compare each newly generated molecule to every series 4 compound and keep the maximum similarity score.

In [13]:
# For every pooled molecule, find its highest Tanimoto similarity to any
# series 4 compound and store it in the "MaxSimToSeries4" column.
df = pd.read_csv(os.path.join(DATA_PATH, "ProcessedResults", "all_batches.csv"))
s4 = pd.read_csv("../data/OriginalData/series4_allsmiles.csv")

# New molecules: SMILES -> RDKit molecule -> fingerprint
smiles_list = list(df["SMILES"])
mols_list = list(map(MolFromSmiles, smiles_list))
fps_list = list(map(FingerprintMols.FingerprintMol, mols_list))

# Series 4 compounds: same three steps, starting from the "canonical" column
smiles_list_s4 = list(s4["canonical"])
mols_list_s4 = list(map(MolFromSmiles, smiles_list_s4))
fps_list_s4 = list(map(FingerprintMols.FingerprintMol, mols_list_s4))

# One value per new molecule: the best score over all series 4 fingerprints
similarity = [
    max(DataStructs.TanimotoSimilarity(new_fp, ref_fp) for ref_fp in fps_list_s4)
    for new_fp in fps_list
]

df["MaxSimToSeries4"] = similarity

# Write the table, now including the new column, back to the same file
df.to_csv(os.path.join(DATA_PATH, "ProcessedResults", "all_batches.csv"), index=False)


In [14]:
# Smallest value in the "MaxSimToSeries4" column of df
df["MaxSimToSeries4"].min()

0.41955445544554454