In [2]:
import os
import pandas as pd
import numpy as np
from rdkit import DataStructs
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.inchi import MolToInchiKey

DATA_PATH="data"

In [5]:
def process_results(file):
    df=pd.read_csv(file)
    
    #create eos_id
    num=[f"{i:05}" for i in range(len(df))]
    prefix="EOS-"
    batch=str(batchnum)
    eos_id=[prefix + batch +"-" + x for x in num]
    df["eosID"]=eos_id
    
    #add inchikey identifier
    inchikeys=[]
    smiles = [x for x in df['SMILES']]
    for smi in smiles:
        inchikey=MolToInchiKey(MolFromSmiles(smi))
        inchikeys += [inchikey]
    df["InchiKey"]=inchikeys 
    
    #add molecular weight
    molweights=[]
    smiles = [x for x in df['SMILES']]
    for smi in smiles:
        molweight=MolWt(MolFromSmiles(smi))
        molweights += [molweight]
    df["MolWeight"]=molweights
    
    #change total_score style
    df=df.rename(columns={"total_score":"TotalScore"})
    
    #add batch of generation 
    df["Batch"]=batchnum
    
    #select columns
    df=df[["eosID","Batch","SMILES","InchiKey","Scaffold","MolWeight","TotalScore"]]
    
    #check if there are compounds from original series 4 batch
    original=pd.read_csv("../osm-series4/data/all_smiles.csv", header=None)
    old_smiles=original[0].tolist()
    new_smiles=df["SMILES"].tolist()
    duplicates=list(set(old_smiles).intersection(new_smiles))
    if not duplicates:
        print("there are no repeated smiles from original dataset")
        return df
    else:
        print(str(len(duplicates))+" SMILES will be eliminated from processed results{}".format(str(batchnum)))
        df=df[~df["SMILES"].isin(original[0])]
        return df  

In [6]:
for results in os.listdir(DATA_PATH):
    if "results_" in results:
        batchnum=int(results.split("results_")[-1])
        results_file=os.path.join(DATA_PATH, results, "scaffold_memory.csv")
        df=process_results(results_file)
        df.to_csv(os.path.join(DATA_PATH, "processed_results", "processed{}.csv".format(batchnum)),index=False)

34 SMILES will be eliminated from processed results0
31 SMILES will be eliminated from processed results1
4 SMILES will be eliminated from processed results2
there are no repeated smiles from original dataset


FileNotFoundError: [Errno 2] No such file or directory: 'data\\results_04\\scaffold_memory.csv'

In [7]:
all_batches_dict={}
for file in os.listdir(os.path.join(DATA_PATH, "processed_results")):
    if "processed" in file:
        batchnum=file.split("processed")[-1]
        batchnum=int(batchnum.strip(".csv"))
        df=pd.read_csv(os.path.join(DATA_PATH, "processed_results", file))
        all_batches_dict[batchnum]=df
all_batches_df = pd.concat([df for df in all_batches_dict.values()], ignore_index=True)
all_batches_df=all_batches_df.drop_duplicates(subset="InchiKey")
all_batches_df.to_csv(os.path.join(DATA_PATH, "processed_results", "all_batches.csv"), index=False)

In [8]:
# Calculate similarity among compounds

%matplotlib inline
import matplotlib.pyplot as plt

df=pd.read_csv(os.path.join(DATA_PATH,"processed_results", "all_batches.csv"))

smiles_list = [x for x in df['SMILES']] #obtain list of smiles
mols_list=[MolFromSmiles(x) for x in smiles_list] #create list of mols
fps_list=[FingerprintMols.FingerprintMol(x) for x in mols_list] #create list of fingerprints

query, target, sim = [] , [], [] #create empty lists for query molecule, comparison molecule and similarity value

for fp in range(len(fps_list)-1):
    similarity= DataStructs.BulkTanimotoSimilarity(fps_list[fp], fps_list[fp+1:])
    for s in range(len(similarity)):
        query.append(smiles_list[fp])
        target.append(smiles_list[fp+1:][s])
        sim.append(similarity[s])
d = {'query':query, 'target':target, 'similarity':sim}
#df_sim = pd.DataFrame(data=d)
        
        
print(sim[:10])
plt.hist(sim[:10000])

ValueError: BitVects must be same length