In [38]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

In [10]:
def get_df_for_unsplit_dataset(unsplit_dataset_path: str, is_dr_separate: bool):
    """Read an unsplit dataset CSV, optionally keeping only rows that have a DR value.

    Args:
        unsplit_dataset_path: Path of the CSV file to read.
        is_dr_separate: When True the file is returned exactly as read. When
            False, rows whose 'DR' entry is missing are dropped.

    Returns:
        A DataFrame. In the filtered case the index is rebuilt with
        ``reset_index()``, so the previous row labels are kept as an extra
        'index' column.

    Raises:
        KeyError: if filtering is requested and the file has no 'DR' column.
    """
    full_df = pd.read_csv(unsplit_dataset_path)
    if is_dr_separate:
        return full_df

    has_dr_value = full_df['DR'].notna()
    return full_df[has_dr_value].reset_index()

In [24]:
# Input locations: the SD dataset CSV and the saved graph-embeddings .npy file.
# Both are absolute, machine-specific paths -- adjust them before running elsewhere.
sd_dataset_path = '/home/david/Projects_NEW/public-code-for-hts-paper/filtered_datasets/AID1259420-1259416/SD.csv'
embeddings_path = '/home/david/Projects_NEW/public-code-for-hts-paper/az-sd-dr-code/out_SD/1259420-1259416/2/VGAE/GCN/linear/NO-EDGES/SD_graph_embeddings.npy'

In [25]:
# Load the SD table and the embeddings saved for it.
sd_df = pd.read_csv(sd_dataset_path)

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects from the file, which can
# execute code on load -- only use it on embedding files you produced yourself.
# .item() extracts the single object stored in the loaded array; entry [1] of that object is
# used as the embedding matrix (presumably one row per molecule -- TODO confirm against the
# code that wrote SD_graph_embeddings.npy).
molecule_embeddings = np.load(embeddings_path, allow_pickle=True).item()[1]

In [29]:
# The embeddings are attached to the SD rows by position, so the two must have the same
# number of rows. Report both counts on failure to make a mismatch easy to diagnose.
assert len(sd_df) == molecule_embeddings.shape[0], (
    f'SD rows ({len(sd_df)}) and embedding rows ({molecule_embeddings.shape[0]}) differ'
)

In [27]:
# Store each embedding row as a plain Python list in a new 'Embeddings' column.
# NOTE(review): assumes row i of molecule_embeddings belongs to row i of sd_df;
# only the row counts are compared, not the ordering -- confirm.
sd_df['Embeddings'] = molecule_embeddings.tolist()

In [28]:
# Display the SD frame, which now carries the 'Embeddings' column.
sd_df

Unnamed: 0,CID,SD,SD Z-score,Activity,neut-smiles,Embeddings
0,46357461,7932.3100,57.214119,Inactive,O=C(Nc1ccc2c(c1)OCCO2)N1CCC2(CC1)Nc1ccccc1-n1c...,"[-0.23445597290992737, 1.4642359018325806, -0...."
1,44505432,4233.3000,30.480193,Inactive,COc1ccc(NC(=O)N(C)CC2OCc3cnnn3CCCC(=O)N(C(C)CO...,"[0.08541813492774963, 0.2121819257736206, -0.1..."
2,44620676,3773.4950,27.157035,Inactive,COc1ccc(NC(=O)N(C)CC2OCCCCC(C)Oc3ccc(NC(=O)c4c...,"[-0.28036564588546753, 3.083940267562866, -0.1..."
3,7202379,3329.6200,23.949009,Inactive,COc1ccc(-c2nc(C)c(CCNC(=O)c3ccco3)s2)cc1,"[-0.6440660953521729, 4.309677600860596, 0.000..."
4,7202384,3269.0000,23.510889,Inactive,COc1ccc(-c2nc(C)c(CCNC(=O)c3ccccc3)s2)cc1OC,"[-0.8308035135269165, 5.15700626373291, -0.211..."
...,...,...,...,...,...,...
59442,50806135,-128.5755,-1.044474,Active,Cc1ccc(C2c3[nH]c4ccccc4c3CCN2C(=O)c2cccc(F)c2)cc1,"[-0.13202843070030212, -0.7890793085098267, 0...."
59443,53058109,-129.4500,-1.050795,Active,COc1ccc2c(c1)NC1(CCN(C(=O)COc3cccc(C)c3)CC1)c1...,"[0.039738163352012634, 1.0495781898498535, 0.1..."
59444,46359928,-129.9825,-1.054643,Active,COc1ccc(C(=O)N2CCc3c([nH]c4ccccc34)C2c2ccc(F)c...,"[-0.17558176815509796, 1.3271557092666626, 0.0..."
59445,53030145,-130.4435,-1.057975,Active,COc1cc(Nc2nc(-c3nnc(-c4cccc(C)c4)o3)nc3ccccc23...,"[-0.1521126627922058, -0.8308700323104858, 0.8..."


## If SD and DR are in different .csv files

### Load the unsplit DR dataset file

In [31]:
# DR lives in its own CSV here, so the file is loaded unfiltered (is_dr_separate=True).
# The path is a plain string literal: it has no placeholders, so no f-string prefix is needed.
dr_df = get_df_for_unsplit_dataset('/home/david/Projects_NEW/public-code-for-hts-paper/filtered_datasets/AID1259420-1259416/DR.csv', is_dr_separate=True)

In [32]:
# Display the DR frame exactly as loaded from DR.csv.
dr_df

Unnamed: 0,CID,SD,SD Z-score,DR,Activity,neut-smiles
0,5281607,16.290450,0.002518,,Inconclusive,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12
1,5926982,14.315960,-0.011752,,Inconclusive,COc1ccc(C(=O)C=Cc2cc(C(=O)O)cc3c2OCOC3)c(O)c1
2,49673988,13.960105,-0.014324,,Inactive,CCOc1ccc(-c2nc(CN(c3ccc(OC)cc3)S(=O)(=O)c3cccs...
3,49674014,8.405705,-0.054468,5.138,Active,CCOc1ccc(N(Cc2nc(-c3ccc(OC)cc3)oc2C)S(=O)(=O)c...
4,4011,6.635230,-0.067263,,Inactive,CNCCCC12CCC(c3ccccc31)c1ccccc12
...,...,...,...,...,...,...
516,53058141,-107.487000,-0.892061,4.864,Active,Cc1cccc(OCC(=O)N2CCC3(CC2)Nc2ccc(Cl)cc2-n2cccc...
517,3122821,-108.038500,-0.896047,5.051,Active,CCCCOc1ccc2c(=O)c(-c3cc(C(=O)OC)oc3C)c(C(F)(F)...
518,53033099,-112.376000,-0.927395,,Inconclusive,CCOc1ccc(-c2nc3c(s2)CN(C(=O)Nc2cc(C)ccc2C)CC3)cc1
519,53033177,-114.700500,-0.944195,5.048,Active,COc1ccc(-c2nc3c(s2)CN(S(=O)(=O)c2ccccc2C)CC3)cc1


In [33]:
# Keep only the join key (CID) and the DR value, so the merge with sd_df
# does not bring in duplicate copies of the other columns.
dr_df = dr_df.loc[:, ['CID', 'DR']]

In [34]:
# Inner join on CID: keeps only the SD rows (with their embeddings) whose compound
# also appears in the DR frame, and appends that frame's remaining columns.
dr_with_embeddings_df = pd.merge(sd_df, dr_df, on='CID', how='inner')

In [35]:
# Display the merged frame (SD columns, Embeddings and DR).
dr_with_embeddings_df

Unnamed: 0,CID,SD,SD Z-score,Activity,neut-smiles,Embeddings,DR
0,5281607,16.290450,0.002518,Inactive,O=c1cc(-c2ccccc2)oc2cc(O)cc(O)c12,"[-0.19429828226566315, -0.21915075182914734, 0...",
1,5926982,14.315960,-0.011752,Inactive,COc1ccc(C(=O)C=Cc2cc(C(=O)O)cc3c2OCOC3)c(O)c1,"[-0.08719492703676224, -0.21458867192268372, 0...",
2,49673988,13.960105,-0.014324,Inactive,CCOc1ccc(-c2nc(CN(c3ccc(OC)cc3)S(=O)(=O)c3cccs...,"[-0.29587531089782715, -0.8521229028701782, 0....",
3,49674014,8.405705,-0.054468,Inactive,CCOc1ccc(N(Cc2nc(-c3ccc(OC)cc3)oc2C)S(=O)(=O)c...,"[-0.13461385667324066, -0.4412147104740143, 0....",5.138
4,4011,6.635230,-0.067263,Inactive,CNCCCC12CCC(c3ccccc31)c1ccccc12,"[0.11738598346710205, -0.24721387028694153, -0...",
...,...,...,...,...,...,...,...
516,53058141,-107.487000,-0.892061,Active,Cc1cccc(OCC(=O)N2CCC3(CC2)Nc2ccc(Cl)cc2-n2cccc...,"[-0.0646001547574997, 0.8963046073913574, 0.14...",4.864
517,3122821,-108.038500,-0.896047,Active,CCCCOc1ccc2c(=O)c(-c3cc(C(=O)OC)oc3C)c(C(F)(F)...,"[-0.17774514853954315, -2.4949324131011963, 1....",5.051
518,53033099,-112.376000,-0.927395,Active,CCOc1ccc(-c2nc3c(s2)CN(C(=O)Nc2cc(C)ccc2C)CC3)cc1,"[-0.2526424825191498, 0.26413822174072266, 1.8...",
519,53033177,-114.700500,-0.944195,Active,COc1ccc(-c2nc3c(s2)CN(S(=O)(=O)c2ccccc2C)CC3)cc1,"[-0.3883023262023926, 1.4514272212982178, 0.94...",5.048


In [None]:
# Output directory for the DR frame with embeddings (absolute, machine-specific path).
out_path = '/home/david/Projects_NEW/public-code-for-hts-paper/az-sd-dr-code/unsplit_DR_with_embeddings/AID1259420-1259416/'

# Create the directory tree if needed; an already existing directory is fine.
Path(out_path).mkdir(parents=True, exist_ok=True)

# Write the merged frame as DR.csv without the DataFrame index.
dr_with_embeddings_df.to_csv(Path(out_path) / 'DR.csv', index=False)

## If SD and DR are in the same .csv file

In [None]:
# Keep only the rows of sd_df that have a DR value.
# NOTE(review): requires sd_df to contain a 'DR' column, i.e. an SD file that also
# holds the DR measurements; the SD.csv displayed earlier in this notebook has none.
has_dr_value = sd_df['DR'].notna()
dr_with_embeddings_df = sd_df[has_dr_value]

In [None]:
# Output directory for the DR frame with embeddings (absolute, machine-specific path).
out_path = '/home/david/Projects_NEW/public-code-for-hts-paper/az-sd-dr-code/unsplit_DR_with_embeddings/AID1445/'

# Create the directory tree if needed; an already existing directory is fine.
Path(out_path).mkdir(parents=True, exist_ok=True)

# Write the filtered frame as DR.csv without the DataFrame index.
dr_with_embeddings_df.to_csv(Path(out_path) / 'DR.csv', index=False)