In [19]:
import pandas as pd
import os

DATAPATH = "../data"

In [10]:
npa = pd.read_csv(os.path.join(DATAPATH, "original", "active_NP.csv"))
npi = pd.read_csv(os.path.join(DATAPATH, "original", "inactive_NP.csv"))
sda = pd.read_csv(os.path.join(DATAPATH, "original", "active_SD.csv"))
sdi = pd.read_csv(os.path.join(DATAPATH, "original", "inactive_SD.csv"))

In [11]:
print(npa.shape, npi.shape, sda.shape, sdi.shape)
npa = npa[~npa["smiles"].isna()]
npi = npi[~npi["smiles"].isna()]
sda = sda[~sda["smiles"].isna()]
sdi = sdi[~sdi["smiles"].isna()]
print(npa.shape, npi.shape, sda.shape, sdi.shape)

(356, 20) (39, 21) (423, 19) (50, 17)
(356, 20) (39, 21) (422, 19) (50, 17)


In [12]:
smi_npa = npa["smiles"]
smi_npi = npi["smiles"]
smi_sda = sda["smiles"]
smi_sdi = sdi["smiles"]
print(len(smi_npa), len(smi_npi), len(smi_sda), len(smi_sdi))

356 39 422 50


In [16]:
from rdkit import Chem

def parse_and_standardize(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None, None
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
        inchikey = Chem.inchi.MolToInchiKey(mol)
        return canonical_smiles, inchikey
    except:
        return None, None
npa[['canonical_smiles', 'inchikey']] = npa['smiles'].apply(lambda x: pd.Series(parse_and_standardize(x)))
npi[['canonical_smiles', 'inchikey']] = npi['smiles'].apply(lambda x: pd.Series(parse_and_standardize(x)))
sda[['canonical_smiles', 'inchikey']] = sda['smiles'].apply(lambda x: pd.Series(parse_and_standardize(x)))
sdi[['canonical_smiles', 'inchikey']] = sdi['smiles'].apply(lambda x: pd.Series(parse_and_standardize(x)))

# Remove rows without canonical SMILES
print("Before standardization:", npa.shape, npi.shape, sda.shape, sdi.shape)
npa_clean = npa.dropna(subset=['canonical_smiles']).copy()
npi_clean = npi.dropna(subset=['canonical_smiles']).copy()
sda_clean = sda.dropna(subset=['canonical_smiles']).copy()
sdi_clean = sdi.dropna(subset=['canonical_smiles']).copy()
print("After standardization:", npa_clean.shape, npi_clean.shape, sda_clean.shape, sdi_clean.shape)

# Duplicates
duplicates_in_npa = npa_clean[npa_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
duplicates_in_npi = npi_clean[npi_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
duplicates_in_sda = sdi_clean[sdi_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
duplicates_in_sdi = sdi_clean[sdi_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
print(f"Internal duplicates in npa: {duplicates_in_npa['canonical_smiles'].nunique()} unique duplicate SMILES")
print(f"Internal duplicates in npi: {duplicates_in_npi['canonical_smiles'].nunique()} unique duplicate SMILES")
print(f"Internal duplicates in sda: {duplicates_in_sda['canonical_smiles'].nunique()} unique duplicate SMILES")
print(f"Internal duplicates in sdi: {duplicates_in_sdi['canonical_smiles'].nunique()} unique duplicate SMILES")
duplicates_in_npa.to_csv(os.path.join(DATAPATH, "processed", 'duplicates_within_npa.csv'), index=False)
duplicates_in_npi.to_csv(os.path.join(DATAPATH, "processed",'duplicates_within_npi.csv'), index=False)
duplicates_in_sda.to_csv(os.path.join(DATAPATH, "processed", 'duplicates_within_sda.csv'), index=False)
duplicates_in_sdi.to_csv(os.path.join(DATAPATH, "processed",'duplicates_within_sdi.csv'), index=False)

#keep one occurrence of duplicate molecules within each set
npa_final = npa_clean.drop_duplicates(subset=['canonical_smiles'], keep='first') 
npi_final = npi_clean.drop_duplicates(subset=['canonical_smiles'], keep='first')
sda_final = sda_clean.drop_duplicates(subset=['canonical_smiles'], keep='first')
sdi_final = sdi_clean.drop_duplicates(subset=['canonical_smiles'], keep='first')

# Duplicates between sets
set1 = set(npa_final['canonical_smiles'])
set2 = set(npi_final['canonical_smiles'])
set3 = set(sda_final['canonical_smiles'])
set4 = set(sdi_final['canonical_smiles'])
# Intersection between NP Active and Inactive
common_smiles = set1.intersection(set2)
print("Duplicates Active/Inactive NP:", len(common_smiles))
common_smiles_npa = npa_final[npa_final['canonical_smiles'].isin(common_smiles)]
common_smiles_npi = npi_final[npi_final['canonical_smiles'].isin(common_smiles)]
common_smiles_npa.to_csv(os.path.join(DATAPATH, "processed", 'duplicates_npa_with_npi.csv'), index=False)
common_smiles_npi.to_csv(os.path.join(DATAPATH, "processed", 'duplicates_npi_with_npa.csv'), index=False)
# Intersection between SD Active and Inactive
common_smiles = set3.intersection(set4)
print("Duplicates Active/Inactive SD:", len(common_smiles))
# Intersection between NP and SD
np = set1.union(set2)
sd = set3.union(set4)
common_smiles = np.intersection(sd)
print("Duplicates NP/SD:", len(common_smiles))

# Remove the common SMILES between Active and Inactive NP from the Inactive set
npi_final = npi_final[~npi_final['canonical_smiles'].isin(common_smiles_npi['canonical_smiles'])]

#Save cleaned data
npa_final.to_csv(os.path.join(DATAPATH, "processed", "npa_clean.csv"), index=False)
npi_final.to_csv(os.path.join(DATAPATH, "processed", "npi_clean.csv"), index=False)
sda_final.to_csv(os.path.join(DATAPATH, "processed", "sda_clean.csv"), index=False)
sdi_final.to_csv(os.path.join(DATAPATH, "processed", "sdi_clean.csv"), index=False)

[11:22:23] SMILES Parse Error: syntax error while parsing: 1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O
[11:22:23] SMILES Parse Error: check for mistakes around position 1:
[11:22:23] 1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=
[11:22:23] ^
[11:22:23] SMILES Parse Error: Failed parsing SMILES '1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O' for input: '1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O'
[11:22:23] SMILES Parse Error: syntax error while parsing: CC(N=C[23])=NC(C(F)(F)F)=C@23CCC(N(@23)CC[6]=CN=C(C[13]()=CC=CC=C@13C[22]=NN=NN@22)C=C@7)=O
[11:22:23] SMILES Parse Error: check for mistakes around position 10:
[11:22:23] CC(N=C[23])=NC(C(F)(F)F)=C@23CCC(N(@23)CC
[11:22:23] ~~~~~~~~~^
[11:22:23] SMILES P

Before standardization: (356, 21) (39, 22) (422, 20) (50, 18)
After standardization: (355, 21) (39, 22) (418, 20) (50, 18)
Internal duplicates in npa: 10 unique duplicate SMILES
Internal duplicates in npi: 2 unique duplicate SMILES
Internal duplicates in sda: 0 unique duplicate SMILES
Internal duplicates in sdi: 0 unique duplicate SMILES
Duplicates Active/Inactive NP: 6
Duplicates Active/Inactive SD: 0
Duplicates NP/SD: 0


[11:22:23] SMILES Parse Error: syntax error while parsing: O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7
[11:22:23] SMILES Parse Error: check for mistakes around position 31:
[11:22:23] (C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H
[11:22:23] ~~~~~~~~~~~~~~~~~~~~^
[11:22:23] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7' for input: 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7'
[11:22:23] SMILES Parse Error: syntax error while parsing: O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O
[11:22:23] SMILES Parse Error: check for mistakes around position 16:
[11:22:23] O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)
[11:22:23] ~~~~~~~~~~~~~~~^
[11:22:23] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O' for input: 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O'
[11:22:23] SMILES Parse Error: syntax error while parsing: O=C(O)CN(C[17]=C(C)C=CC=C@17)C(C(C)CS)

In [17]:
#merge into one file for general analysis
npa_final["activity"] = 1
npi_final["activity"] = 0
sda_final["activity"] = 1
sdi_final["activity"] = 0

npa_final['category'] = 'natural'
npi_final['category'] = 'natural'
sda_final['category'] = 'synthetic'
sdi_final['category'] = 'synthetic'

final_df = pd.concat([
    npa_final[['id', 'canonical_smiles', 'inchikey', 'category', 'activity']],
    npi_final[['id', 'canonical_smiles', 'inchikey', 'category', 'activity']],
    sda_final[['id', 'canonical_smiles', 'inchikey', 'category', 'activity']],
    sdi_final[['id', 'canonical_smiles', 'inchikey', 'category', 'activity']]
], ignore_index=True)

final_df.to_csv(os.path.join(DATAPATH, "processed", "all_molecules.csv"), index=False)

print(f"Final dataset has {len(final_df)} molecules.")
print(len(set(final_df['canonical_smiles'])), "unique SMILES in the final dataset.")

Final dataset has 835 molecules.
835 unique SMILES in the final dataset.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npa_final["activity"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sda_final["activity"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npa_final['category'] = 'natural'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

In [18]:
#dataset for ersilia
final_df = pd.read_csv(os.path.join(DATAPATH, "processed", "all_molecules.csv"))
ersilia_df = final_df[['canonical_smiles']]
ersilia_df = ersilia_df.to_csv(os.path.join(DATAPATH, "processed", "all_smiles.csv"), index=False)

# Assay Specific Data

In [28]:
npa = pd.read_csv(os.path.join(DATAPATH, "original", "active_NP.csv"))
npi = pd.read_csv(os.path.join(DATAPATH, "original", "inactive_NP.csv"))
sda = pd.read_csv(os.path.join(DATAPATH, "original", "active_SD.csv"))
sdi = pd.read_csv(os.path.join(DATAPATH, "original", "inactive_SD.csv"))

In [31]:
npa["category"] = "natural"
npi['category'] = 'natural'
sda['category'] = 'synthetic'
sdi['category'] = 'synthetic'

all = pd.concat([npa, npi, sda, sdi], axis=0)
print(all.shape)
all = all[~all["smiles"].isna()]
print(all.shape)

def parse_and_standardize(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None, None
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
        inchikey = Chem.inchi.MolToInchiKey(mol)
        return canonical_smiles, inchikey
    except:
        return None, None
all[['canonical_smiles', 'inchikey']] = all['smiles'].apply(lambda x: pd.Series(parse_and_standardize(x)))
all = all.dropna(subset=['canonical_smiles']).copy()
print(all.shape)

(868, 27)
(867, 27)


[13:12:20] SMILES Parse Error: syntax error while parsing: 1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O
[13:12:20] SMILES Parse Error: check for mistakes around position 1:
[13:12:20] 1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=
[13:12:20] ^
[13:12:20] SMILES Parse Error: Failed parsing SMILES '1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O' for input: '1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(C(=C4)O)O)O)O)(C(=O)C5=CC(=C(C(=C5)O)O)O)O)O'
[13:12:21] SMILES Parse Error: syntax error while parsing: CC(N=C[23])=NC(C(F)(F)F)=C@23CCC(N(@23)CC[6]=CN=C(C[13]()=CC=CC=C@13C[22]=NN=NN@22)C=C@7)=O
[13:12:21] SMILES Parse Error: check for mistakes around position 10:
[13:12:21] CC(N=C[23])=NC(C(F)(F)F)=C@23CCC(N(@23)CC
[13:12:21] ~~~~~~~~~^
[13:12:21] SMILES P

(862, 28)


[13:12:21] SMILES Parse Error: syntax error while parsing: O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7
[13:12:21] SMILES Parse Error: check for mistakes around position 31:
[13:12:21] (C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H
[13:12:21] ~~~~~~~~~~~~~~~~~~~~^
[13:12:21] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7' for input: 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7'
[13:12:21] SMILES Parse Error: syntax error while parsing: O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O
[13:12:21] SMILES Parse Error: check for mistakes around position 16:
[13:12:21] O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)
[13:12:21] ~~~~~~~~~~~~~~~^
[13:12:21] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O' for input: 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O'
[13:12:21] SMILES Parse Error: syntax error while parsing: O=C(O)CN(C[17]=C(C)C=CC=C@17)C(C(C)CS)

In [47]:
all = all[["canonical_smiles", "inchikey", "category", "activity", "target"]]
all["activity"] = all["activity"].map({"Active": 1, "Inactive": 0})
ace = all[all["target"].str.contains("ACE", case=False, na=False)]
enos = all[all["target"].str.contains("ENOS", case=False, na=False)]
at = all[all["target"].str.contains("AT1", case=False, na=False)]
at1rec = all[all["target"].str.contains("AT1 Receptor|AT1R", case=False, na=False)]
ca = all[all["target"].str.contains("CA2", case=False, na=False)]
k = all[all["target"].str.contains("K+", case=False, na=False)]
print(len(ace), len(enos), len(at), len(at1rec), len(ca), len(k))

280 40 12 12 119 51


# Duplicates will be removed, Actives prioritised over inactives

In [48]:
print(len(ace[ace["activity"]==1]),len(ace[ace["activity"]==0]),len(ace[ace["category"]=="natural"]), len(ace[ace["category"]=="synthetic"]))
print(len(ca[ca["activity"]==1]),len(ca[ca["activity"]==0]), len(ca[ca["category"]=="natural"]), len(ca[ca["category"]=="synthetic"]))
print(len(k[k["activity"]==1]),len(k[k["activity"]==0]), len(k[k["category"]=="natural"]), len(k[k["category"]=="synthetic"]))

247 33 126 154
117 2 80 39
50 1 28 23


In [52]:
ace[ace.duplicated(subset=['canonical_smiles'], keep=False)]

Unnamed: 0,canonical_smiles,inchikey,category,activity,target
154,O=c1cc(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,IQPNAANSBPBGFQ-UHFFFAOYSA-N,natural,1,ACE
167,O=c1c(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]...,OVSQVDMCBVZWGM-QSOFNFLRSA-N,natural,1,ACE
3,O=c1cc(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,IQPNAANSBPBGFQ-UHFFFAOYSA-N,natural,0,ACE
9,O=c1c(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]...,OVSQVDMCBVZWGM-QSOFNFLRSA-N,natural,0,ACE
10,O=c1c(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]...,OVSQVDMCBVZWGM-QSOFNFLRSA-N,natural,0,ACE
195,CCOC(=O)[C@H](CCc1ccccc1)N[C@@H](C)C(=O)N1CCC[...,GBXSMTUPTTWBMN-XIRDDKMYSA-N,synthetic,1,ACE
305,CCOC(=O)[C@H](CCc1ccccc1)N[C@@H](C)C(=O)N1CCC[...,GBXSMTUPTTWBMN-XIRDDKMYSA-N,synthetic,1,ACE


In [54]:
#only ace of the three has duplicates
ace = ace.sort_values(by="activity", ascending=False)
ace= ace.drop_duplicates(subset="canonical_smiles", keep="first")

In [55]:
print(len(ace[ace["activity"]==1]),len(ace[ace["activity"]==0]),len(ace[ace["category"]=="natural"]), len(ace[ace["category"]=="synthetic"]))
print(len(ca[ca["activity"]==1]),len(ca[ca["activity"]==0]), len(ca[ca["category"]=="natural"]), len(ca[ca["category"]=="synthetic"]))
print(len(k[k["activity"]==1]),len(k[k["activity"]==0]), len(k[k["category"]=="natural"]), len(k[k["category"]=="synthetic"]))

246 30 123 153
117 2 80 39
50 1 28 23


In [57]:
ace.to_csv(os.path.join(DATAPATH, "processed", "ace.csv"), index=False)
ca.to_csv(os.path.join(DATAPATH, "processed", "ca.csv"), index=False)
k.to_csv(os.path.join(DATAPATH, "processed", "k.csv"), index=False)

In [63]:
print(len(k[(k["category"]=="natural")&(k["activity"]==1)]))

27
