Load libraries

In [1]:
import pandas as pd
import json
import requests

Load data. hsdb-smiles is the non-bioavailable (non-BA) dataset; BA-Chembl-all-phases is the bioavailable (BA) dataset.

In [2]:
# Read the first three columns of the CSV, treating it as header-less and
# naming the columns ID / Name / Smiles.
df_hsdb = pd.read_csv(
    'hsbd-smiles.csv',
    header=None,
    usecols=[0, 1, 2],
    names=['ID', 'Name', 'Smiles'],
)
df_hsdb.head()

Unnamed: 0,ID,Name,Smiles
0,HSDB0001,HYOSCYAMINE,CN1C2CCC1CC(C2)OC(=O)C(CO)C3=CC=CC=C3
1,HSDB0002,OXYPEUCEDANIN,CC1(C(O1)COC2=C3C=CC(=O)OC3=CC4=C2C=CO4)C
2,HSDB0003,RETRONECINE,C1CN2CC=C(C2C1O)CO
3,HSDB0004,"1,1,1,2,3,3,3-HEPTAFLUOROPROPANE",C(C(F)(F)F)(C(F)(F)F)F
4,HSDB0005,"1,1,1,2-TETRACHLORO-2,2-DIFLUOROETHANE",C(C(Cl)(Cl)Cl)(F)(F)Cl


In [3]:
# Read the first three columns of the CSV, treating it as header-less and
# naming the columns ID / Name / Smiles.
df_chembl_ba = pd.read_csv(
    'BA-Chembl-all-phases.csv',
    header=None,
    usecols=[0, 1, 2],
    names=['ID', 'Name', 'Smiles'],
)
df_chembl_ba.head()

Unnamed: 0,ID,Name,Smiles
0,650400,"1-(3,4-DICHLOROPHENYL)-6-(METHOXYMETHYL)-3-AZA...",COC[C@@]12CNCC[C@]1(c1ccc(Cl)c(Cl)c1)C2
1,1741751,13-DEOXYDOXORUBICIN,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)...
2,508683,16.ALPHA.-BROMOEPIANDROSTERONE,C[C@]12CC[C@H]3[C@@H](CC[C@H]4C[C@@H](O)CC[C@@...
3,447768,[18F]D,CCN(CC)C(=O)Cc1c(-c2ccc(OCCF)cc2)nn2c(C)cc(C)nc12
4,448378,[18F]DPA-714,CCN(CC)C(=O)Cc1c(-c2ccc(OCC[18F])cc2)nn2c(C)cc...


Check the length of the datasets

In [4]:
# Row count of the ChEMBL frame.
df_chembl_ba.shape[0]

3858

In [5]:
# Row count of the HSDB frame.
df_hsdb.shape[0]

3550

Calculate the difference between the datasets

In [6]:
def dataframe_difference(df1, df2, which=None, output_path='diff.csv'):
    """Outer-merge two DataFrames on ``Smiles`` and tag where each row came from.

    The merge adds a ``_merge`` indicator column whose value is ``'left_only'``
    (SMILES found only in ``df1``), ``'right_only'`` (only in ``df2``) or
    ``'both'``. Other columns that share a name in both frames receive the
    pandas default ``_x`` / ``_y`` suffixes.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; each must contain a ``Smiles`` column.
    which : {None, 'left_only', 'right_only', 'both'}, optional
        If ``None`` (default) every row of the outer merge is returned, so the
        caller can filter on ``_merge`` itself. Otherwise only the rows whose
        ``_merge`` value equals ``which`` are returned.
    output_path : str or None, optional
        Where the returned frame is also written as CSV. Defaults to
        ``'diff.csv'``; pass ``None`` to skip writing. The file is overwritten
        on every call.

    Returns
    -------
    pandas.DataFrame
        The merged (and optionally filtered) frame, including ``_merge``.

    Raises
    ------
    ValueError
        If ``which`` is not ``None`` or one of the three indicator values.
    """
    valid_sides = ('left_only', 'right_only', 'both')
    if which is not None and which not in valid_sides:
        raise ValueError(
            "which must be None or one of %r, got %r" % (valid_sides, which)
        )

    comparison_df = df1.merge(df2,
                              on=['Smiles'],
                              indicator=True,
                              how='outer')
    if which is None:
        # Keep every merged row: callers select the side they need via '_merge'.
        diff_df = comparison_df
    else:
        # The origin of a row lives in the '_merge' indicator column, not in
        # 'Smiles' -- comparing 'Smiles' against these labels never matches.
        diff_df = comparison_df[comparison_df['_merge'] == which]

    if output_path is not None:
        diff_df.to_csv(output_path)
    return diff_df

Merge the two datasets on SMILES, then create a DataFrame with the entries that appear in both datasets

In [7]:
# Outer-merge HSDB (left) with ChEMBL (right) on Smiles; the result carries a
# '_merge' column marking the origin of every row.
df_merged = dataframe_difference(df1=df_hsdb, df2=df_chembl_ba)

In [8]:
# Boolean mask selecting the rows whose merge indicator is 'both'.
df_filter_both_merged = (df_merged['_merge'] == 'both')
df_both_merged = df_merged.loc[df_filter_both_merged]
len(df_both_merged)

20

In [9]:
# Drop the merge indicator so only the data columns remain.
df_merged_pure = df_both_merged.drop(columns=['_merge'])

In [10]:
# Show up to the first 20 shared entries.
df_merged_pure.iloc[:20]

Unnamed: 0,ID_x,Name_x,Smiles,ID_y,Name_y
261,HSDB0240,1-METHYL-2-PYRROLIDINONE,CN1CCCC1=O,10276.0,METHYLPYRROLIDONE
280,HSDB0257,1-OCTANOL,CCCCCCCCO,32815.0,OCTANOL
497,HSDB0457,2-DIMETHYLAMINOETHANOL,CN(C)CCO,171436.0,DIMETHYLAMINOETHANOL
819,HSDB0771,ACESULFAME,CC1=CC(=O)NS(=O)(=O)O1,292300.0,ACESULFAME
1395,HSDB1326,CREATINE,CN(CC(=O)O)C(=N)N,25249.0,CREATINE
1427,HSDB1354,CYCLOHEXANE,C1CCCCC1,16602.0,CYCLOHEXANE
1428,HSDB2025,HEXAMETHYLENE,C1CCCCC1,16602.0,CYCLOHEXANE
1620,HSDB1546,DIETHYL,CCCC,222085.0,BUTANE
1621,HSDB2468,N-BUTANE,CCCC,222085.0,BUTANE
1622,HSDB1547,DIETHYLAMINE,CCNCC,199817.0,DIETHYLAMINE


Create the negative (non-BA) dataset without the compounds that also appear in the BA dataset

In [11]:
# Merge HSDB against the shared entries; rows tagged 'left_only' are the HSDB
# rows whose Smiles is not among the shared ones.
df_merged_non_BA = dataframe_difference(df1=df_hsdb, df2=df_merged_pure)
df_filter_diff_non_BA = (df_merged_non_BA['_merge'] == 'left_only')
df_only_in_non_BA = df_merged_non_BA.loc[df_filter_diff_non_BA]
len(df_only_in_non_BA)

3530

Create the positive (BA) dataset without the compounds that also appear in the non-BA dataset

In [12]:
# Merge ChEMBL against the shared entries; rows tagged 'left_only' are the
# ChEMBL rows whose Smiles is not among the shared ones.
df_merged_BA = dataframe_difference(df1=df_chembl_ba, df2=df_merged_pure)
df_filter_diff_BA = (df_merged_BA['_merge'] == 'left_only')
df_only_in_BA = df_merged_BA.loc[df_filter_diff_BA]
len(df_only_in_BA)

3842

Save both datasets into files

In [14]:
# Columns written to each output file; the DataFrame index is not written.
header = ['ID', 'Name', 'Smiles']
df_only_in_non_BA.to_csv(
    'hsdb-smiles-no-repeat.csv',
    columns=header,
    index=False,
)
df_only_in_BA.to_csv(
    'BA-Chembl-4-phases-smiles-no-repeat.csv',
    columns=header,
    index=False,
)