In [1]:
import os
import pandas as pd

In [2]:
# Load the two input tables: sf3.csv into `hits`, drugrepurposinghub.csv into `all`.
# NOTE: `all` shadows Python's builtin all() for the rest of the session; the
# name is kept because later cells refer to it.
hits, all = (
    pd.read_csv(csv_path)
    for csv_path in ("data/sf3.csv", "data/drugrepurposinghub.csv")
)

In [8]:
#standardise hits
#
# Adds two columns to `hits`:
#   - "InChIKey":  InChIKey of the parsed input SMILES (None if RDKit cannot parse it)
#   - "smiles_ok": SMILES of the standardised molecule (None if parsing or
#                  standardisation fails)
# and builds `hits_ok`, the subset of rows with a usable standardised SMILES.

from rdkit import Chem
from standardiser import standardise

# Parse each SMILES exactly once; MolFromSmiles returns None for unparseable input.
mols = [Chem.MolFromSmiles(s) for s in hits["SMILES"]]

# MolToInchiKey cannot be called on None, so unparseable rows get a None key
# instead of aborting the whole cell.
hits["InChIKey"] = [
    Chem.MolToInchiKey(m) if m is not None else None
    for m in mols
]

print(hits.shape)
smiles = []
for s, mol in zip(hits["SMILES"], mols):
    s_ = None  # stays None unless standardisation fully succeeds
    if mol is None:
        print("No rdkit:", s)
    else:
        try:
            s_ = Chem.MolToSmiles(standardise.run(mol))
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop the cell.
            print("Non standard:", s)
    smiles.append(s_)

hits["smiles_ok"] = smiles
hits_ok = hits[hits["smiles_ok"].notna()]
print(hits_ok.shape)

(240, 19)
Non standard: CC[Hg]Sc1ccccc1C(O)=O
Non standard: O1[n+]2ccccc2S[Zn]11O[n+]2ccccc2S1
Non standard: CCCCCCc1ccc(O)cc1O.Nc1c2ccccc2nc2ccccc12
Non standard: CC(=O)OCC1=C2N3[C@H](SC1)[C@H](NC(=O)CCC[C@H](N)C(=O)O[Zn]OC2=O)C3=O |t:5|
Non standard: CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H]1[C@H](O[C@@H]2[C@@H](O)[C@H](O)[C@@H](NC(N)=N)[C@H](O)[C@H]2NC(N)=N)O[C@@H](C)[C@]1(O)CO.CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H]1[C@H](O[C@@H]2[C@@H](O)[C@H](O)[C@@H](NC(N)=N)[C@H](O)[C@H]2NC(N)=N)O[C@@H](C)[C@]1(O)CO
Non standard: Oc1ccc(cc1[N+]([O-])=O)[As](O)(O)=O
Non standard: O[Hg]c1c(O)c(Br)cc2c(-c3ccccc3C(O)=O)c3cc(Br)c(=O)cc3oc12
(233, 20)


In [11]:
# Display the `all` table; pandas abbreviates the middle rows in the rendered output.
all

Unnamed: 0,smiles,InChIKey,pubchem_cid,smiles_ok
0,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,CN1CCc2cccc3c2[C@H]1Cc1ccc(O)c(O)c1-3
1,COc1ccc(cc1OC1CCCC1)[C@@H]1CNC(=O)C1,HJORMJIFDVBMOB-LBPRGKRZSA-N,448055.0,COc1ccc([C@@H]2CNC(=O)C2)cc1OC1CCCC1
2,NC[C@H](CC(O)=O)c1ccc(Cl)cc1,KPYSYYIEGFHWSV-QMMMGPOBSA-N,6918881.0,NC[C@H](CC(=O)O)c1ccc(Cl)cc1
3,COc1ccc(cc1OC1CCCC1)[C@H]1CNC(=O)C1,HJORMJIFDVBMOB-GFCCVEGCSA-N,158758.0,COc1ccc([C@H]2CNC(=O)C2)cc1OC1CCCC1
4,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...
...,...,...,...,...
6706,Nc1nc2cc(Cl)ccc2o1,YGCODSQDUUUKIV-UHFFFAOYSA-N,6103.0,Nc1nc2cc(Cl)ccc2o1
6707,"O=C1N=C2C=CC=CN2C11Cc2ccccc2C1 |c:4,6,t:2|",QZWYXEBIQWJXAR-UHFFFAOYSA-N,10220323.0,O=C1N=C2C=CC=CN2C12Cc1ccccc1C2
6708,FC(F)c1nc2ccccc2n1-c1nc(nc(n1)N1CCOCC1)N1CCOCC1,HGVNLRPZOWWDKD-UHFFFAOYSA-N,11647372.0,FC(F)c1nc2ccccc2n1-c1nc(N2CCOCC2)nc(N2CCOCC2)n1
6709,OCCN1CCN(CC\C=C2\c3ccccc3Sc3ccc(Cl)cc23)CC1,WFPIAZLQTJBIFN-DVZOWYKESA-N,5311507.0,OCCN1CCN(CC/C=C2/c3ccccc3Sc3ccc(Cl)cc32)CC1


In [13]:
import random

# Single seed and sizes for this cell, gathered here so they are easy to tune.
SEED = 42
HITS_PER_SUBSET = 48          # hit rows placed in each output file
BACKGROUND_PER_SUBSET = 202   # rows drawn from `all` for each output file

# This only seeds Python's `random` module. pandas' DataFrame.sample does not
# consult it, so every sample() call below receives an explicit random_state.
random.seed(SEED)

all = all[["smiles_ok", "InChIKey"]]
hits = hits[["smiles_ok", "InChIKey"]]
# NOTE(review): `hits` (not `hits_ok`) is used here, so rows whose "smiles_ok"
# is missing are still distributed into the subsets — confirm this is intended.

# Reproducible shuffle of the hits.
hits_shuffled = hits.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split the shuffled hits into consecutive chunks of HITS_PER_SUBSET rows
# (the last chunk is shorter if the row count is not an exact multiple).
hits_splits = [
    hits_shuffled.iloc[start:start + HITS_PER_SUBSET]
    for start in range(0, len(hits_shuffled), HITS_PER_SUBSET)
]

datasets = []

for i, hits_subset in enumerate(hits_splits):
    # Candidate background rows: not one of this subset's hits (matched by
    # InChIKey) and with a usable standardised SMILES.
    not_a_hit = ~all["InChIKey"].isin(hits_subset["InChIKey"])
    has_smiles = all["smiles_ok"].notna()
    remaining_all = all[not_a_hit & has_smiles]

    # Draw the background rows; the seed varies per subset so each file gets a
    # different, but reproducible, selection.
    all_subset = remaining_all.sample(n=BACKGROUND_PER_SUBSET, random_state=SEED + i)

    # Combine hits and background, shuffle so the two groups are interleaved,
    # and expose the standardised SMILES under the column name "smiles".
    combined_df = (
        pd.concat([hits_subset, all_subset])
        .sample(frac=1, random_state=SEED + i)
        .reset_index(drop=True)
        .rename(columns={"smiles_ok": "smiles"})
    )

    # Save the dataset to a new CSV file (numbered from 1).
    combined_df.to_csv(f'data/drugrepurposinghub_subset_{i+1}.csv', index=False)

    # Keep the frame in memory as well.
    datasets.append(combined_df)

In [4]:
# Prepare a dataset for prediction: read the abaumannii_<subs> CSV, keep only
# its "smiles" column and write that column to data/<subs>.csv.
subs = "subset250_3"
df = pd.read_csv(f"data/abaumannii_{subs}.csv").loc[:, "smiles"]
df.to_csv(f"data/{subs}.csv", index=False)

In [18]:
subs = "subset250_1"

# Read one CSV per eos* identifier for the chosen subset. The tables are read
# in the order listed and bound to variables named after their identifier.
eos3804, eos4e41, eos43at, eos9ei3, eos2ta5, eos7d58 = (
    pd.read_csv(f"data/{subs}_{name}.csv")
    for name in ("eos3804", "eos4e41", "eos43at", "eos9ei3", "eos2ta5", "eos7d58")
)

In [19]:
# Fold all eos* tables into a single frame: start from eos3804 and outer-join
# each remaining table on the shared ("key", "input") pair, so rows present in
# only some of the tables are retained.
df = eos3804
for df_ in (eos4e41, eos43at, eos9ei3, eos2ta5, eos7d58):
    df = df.merge(df_, how="outer", on=["key", "input"])
df.head()

Unnamed: 0,key,input,activity,50uM_Inhibition,pic50,sa_score,probability,molecular_weight,logP,hydrogen_bond_acceptors,...,Caco2_Wang_drugbank_approved_percentile,Clearance_Hepatocyte_AZ_drugbank_approved_percentile,Clearance_Microsome_AZ_drugbank_approved_percentile,Half_Life_Obach_drugbank_approved_percentile,HydrationFreeEnergy_FreeSolv_drugbank_approved_percentile,LD50_Zhu_drugbank_approved_percentile,Lipophilicity_AstraZeneca_drugbank_approved_percentile,PPBR_AZ_drugbank_approved_percentile,Solubility_AqSolDB_drugbank_approved_percentile,VDss_Lombardo_drugbank_approved_percentile
0,ABNOVCYQWFZTPI-UHFFFAOYSA-N,OCCCNCC(F)(F)F,0.005055,0.002066,3.379324,2.356462,0.178605,157.135,0.5207,2.0,...,68.708802,17.642497,2.016285,48.662272,64.210934,51.76425,21.830167,7.754944,93.640946,94.649089
1,ABXFRPQXQHAVQQ-UHFFFAOYSA-N,CC(C)(C)C(O)C(=O)NCCC(=O)NCCc1ccccc1,0.051868,0.000195,3.04099,2.404939,0.042052,306.406,1.2586,3.0,...,62.737495,62.194649,65.684374,20.62815,29.313687,42.613416,47.072509,30.787127,64.249709,26.289259
2,ACGMQPXRVMJTKD-UHFFFAOYSA-N,N#CC1=C(N)Oc2c(ccc3ccccc23)C1c1ccc([N+](=O)[O-...,0.01561,0.012579,4.419067,2.783859,0.925431,343.342,3.96618,5.0,...,64.210934,79.37185,73.090345,45.443971,58.549826,87.088019,86.506398,94.610314,2.79178,55.098876
3,AEUBKXOYMKBMCS-IOEMPLOMSA-N,CC(C)C(O/N=C(\C(=O)N[C@@H]1C(=O)N(OS(=O)(=O)O)...,0.742665,0.894173,5.229795,4.823198,0.03806,561.555,-0.8171,15.0,...,15.238465,14.850717,59.364095,67.816983,35.052346,64.09461,38.464521,48.429624,55.176425,65.529275
4,AEUBKXOYMKBMCS-LVKILCFZSA-N,CC(C)[C@@H](O/N=C(\C(=O)N[C@@H]1C(=O)N(OS(=O)(...,0.733663,0.897862,5.192461,4.823198,0.038056,561.555,-0.8171,15.0,...,15.160915,14.734393,59.286545,68.359829,34.936022,64.443583,38.154323,48.429624,55.176425,65.296627


In [22]:
# Column labels of the merged frame as a plain Python list.
# NOTE(review): the output stored under this cell is a Series named
# CYP2C19_Veith, which an assignment cannot produce — it looks stale; re-run.
cols = df.columns.to_list()

0      0.009293
1      0.523910
2      0.640874
3      0.079830
4      0.060456
         ...   
245    0.489983
246    0.797460
247    0.772718
248    0.069174
249    0.058403
Name: CYP2C19_Veith, Length: 250, dtype: float64

In [11]:
# Keep only the eos3804 rows whose "activity" is strictly above the chosen cut-off.
# NOTE(review): no sorting is applied here, only filtering — add a sort_values
# call if a ranked order is needed.
ACTIVITY_CUTOFF = 0.3
eos3804 = eos3804.loc[eos3804["activity"] > ACTIVITY_CUTOFF]