In [1]:
import pandas as pd
import os


# Root data directory (relative path). The cells below read the input CSVs from
# its "original" subfolder and write their results into its "processed" subfolder.
DATAPATH = "../data"

In [24]:
# Load the two CSV sheets stored under <DATAPATH>/original into df1 and df2.
original_dir = os.path.join(DATAPATH, "original")
df1, df2 = (
    pd.read_csv(os.path.join(original_dir, sheet_name))
    for sheet_name in ("250425_sheet1.csv", "250425_sheet2.csv")
)

In [25]:
# Keep only the rows that carry a value in the "Smiles" column, printing the
# frame shapes before and after the filter.
print(df1.shape, df2.shape)
has_smiles_1 = df1["Smiles"].notna()
has_smiles_2 = df2["Smiles"].notna()
df1 = df1.loc[has_smiles_1]
df2 = df2.loc[has_smiles_2]
print(df1.shape, df2.shape)

(513, 21) (467, 18)
(490, 21) (465, 18)


In [26]:
# Original note: "Assume they are all active and this is a mistake".
# NOTE(review): presumably this means a blank "Activity" is treated as a
# data-entry mistake and the compound is still considered active - confirm.
# Consistent with that, df1/df2 themselves are left unfiltered here; df1_/df2_
# only hold the rows that do have an "Activity" value so they can be counted.
print(df1.shape, df2.shape)
df1_ = df1[~df1["Activity"].isna()]
df2_ = df2[~df2["Activity"].isna()]
# Report the filtered frames: printing df1/df2 a second time would merely repeat
# the line above and hide how many rows lack an "Activity" value.
print(df1_.shape, df2_.shape)

(490, 21) (465, 18)
(490, 21) (465, 18)


In [27]:
# Grab the raw "Smiles" column of each frame and print how many entries each has.
smi1, smi2 = df1["Smiles"], df2["Smiles"]
print(smi1.shape[0], smi2.shape[0])

490 465


In [28]:
from rdkit import Chem

def parse_and_standardize(smiles):
    """Parse a SMILES string and return its canonical SMILES and InChIKey.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule. Anything that is not a string
        (``None``, ``NaN``, numbers, ...) and empty / whitespace-only strings
        are treated as invalid input.

    Returns
    -------
    tuple
        ``(canonical_smiles, inchikey)`` when the molecule could be parsed,
        ``(None, None)`` otherwise. The function never raises for bad input so
        that it can be applied to a whole column.
    """
    # Reject missing / non-text cells explicitly instead of relying on the
    # exception handler below to hide the resulting type error.
    # NOTE(review): an empty string is also rejected here on the assumption that
    # it cannot describe a real compound - confirm this matches RDKit's handling.
    if not isinstance(smiles, str) or not smiles.strip():
        return None, None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # The parser could not interpret the string.
            return None, None
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
        inchikey = Chem.inchi.MolToInchiKey(mol)
    except Exception:
        # Best-effort cleaning: a molecule that cannot be converted is reported
        # as unparsable. `Exception` (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit propagating so the run can be stopped.
        return None, None
    return canonical_smiles, inchikey
# Attach the standardized representation of every SMILES as two new columns.
# df1/df2 were produced by boolean-mask filtering, so they are copied first:
# adding columns to such a slice can trigger pandas' SettingWithCopyWarning.
# The (canonical_smiles, inchikey) tuples are collected into a single DataFrame
# per frame instead of building one pd.Series per row.
standardized_columns = ['canonical_smiles', 'inchikey']
df1 = df1.copy()
df2 = df2.copy()
df1[standardized_columns] = pd.DataFrame(
    [parse_and_standardize(smi) for smi in df1['Smiles']],
    index=df1.index,  # keep row alignment with the filtered frame
    columns=standardized_columns,
)
df2[standardized_columns] = pd.DataFrame(
    [parse_and_standardize(smi) for smi in df2['Smiles']],
    index=df2.index,
    columns=standardized_columns,
)

# Discard every row whose SMILES could not be standardized (no canonical SMILES),
# printing the frame shapes before and after.
print("Before standardization:", df1.shape, df2.shape)
parsed_mask_1 = df1['canonical_smiles'].notna()
parsed_mask_2 = df2['canonical_smiles'].notna()
df1_clean = df1.loc[parsed_mask_1].copy()
df2_clean = df2.loc[parsed_mask_2].copy()
print("After standardization:", df1_clean.shape, df2_clean.shape)

# Duplicates
# Collect, for each frame, every row whose canonical SMILES occurs more than
# once (keep=False flags all occurrences, not only the repeats) and save them.
processed_dir = os.path.join(DATAPATH, "processed")
# This is the first write into the "processed" folder; create it if it is
# missing so the cell also works on a checkout that does not contain it yet.
os.makedirs(processed_dir, exist_ok=True)

duplicates_in_df1 = df1_clean[df1_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
duplicates_in_df2 = df2_clean[df2_clean.duplicated(subset=['canonical_smiles'], keep=False)].copy()
print(f"Internal duplicates in df1: {duplicates_in_df1['canonical_smiles'].nunique()} unique duplicate SMILES")
print(f"Internal duplicates in df2: {duplicates_in_df2['canonical_smiles'].nunique()} unique duplicate SMILES")
duplicates_in_df1.to_csv(os.path.join(processed_dir, 'duplicates_within_df1.csv'), index=False)
duplicates_in_df2.to_csv(os.path.join(processed_dir, 'duplicates_within_df2.csv'), index=False)

# Keep the first row of every canonical SMILES. The result is copied (as is done
# for df1_clean/df2_clean) so that it is an independent frame: a later cell adds
# a column to df1_final/df2_final, which on an uncopied slice can trigger
# pandas' SettingWithCopyWarning.
df1_final = df1_clean.drop_duplicates(subset=['canonical_smiles']).copy()
df2_final = df2_clean.drop_duplicates(subset=['canonical_smiles']).copy()

#Save cleaned data
df1_final.to_csv(os.path.join(DATAPATH, "processed", "df1_clean.csv"), index=False)
df2_final.to_csv(os.path.join(DATAPATH, "processed", "df2_clean.csv"), index=False)

# Duplicates between sets
# Canonical SMILES present in both de-duplicated frames; the matching rows of
# each frame are written to their own CSV and the overlap size is printed.
set1 = set(df1_final['canonical_smiles'])
set2 = set(df2_final['canonical_smiles'])
common_smiles = set1 & set2

shared_in_df1 = df1_final['canonical_smiles'].isin(common_smiles)
shared_in_df2 = df2_final['canonical_smiles'].isin(common_smiles)
common_smiles_df1 = df1_final.loc[shared_in_df1]
common_smiles_df2 = df2_final.loc[shared_in_df2]

processed_dir = os.path.join(DATAPATH, "processed")
common_smiles_df1.to_csv(os.path.join(processed_dir, 'duplicates_df1_with_df2.csv'), index=False)
common_smiles_df2.to_csv(os.path.join(processed_dir, 'duplicates_df2_with_df1.csv'), index=False)
print(f"Number of duplicates between df1 and df2: {len(common_smiles)}")

[12:33:09] SMILES Parse Error: syntax error while parsing: C1=C(C=C(C(=C1O)O)O)C(=O)OC[C@@H]2[C@H]([C@@H]([C@H([C@@H(O2)OC(=O)C3=CC(=C(C(=C3)O)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O)OC(=O)C5=CC(=C(C(=C5)O)O)O)OC(=O)C6=CC(=C(C(=C6)O)O)O
[12:33:09] SMILES Parse Error: Failed parsing SMILES 'C1=C(C=C(C(=C1O)O)O)C(=O)OC[C@@H]2[C@H]([C@@H]([C@H([C@@H(O2)OC(=O)C3=CC(=C(C(=C3)O)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O)OC(=O)C5=CC(=C(C(=C5)O)O)O)OC(=O)C6=CC(=C(C(=C6)O)O)O' for input: 'C1=C(C=C(C(=C1O)O)O)C(=O)OC[C@@H]2[C@H]([C@@H]([C@H([C@@H(O2)OC(=O)C3=CC(=C(C(=C3)O)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O)OC(=O)C5=CC(=C(C(=C5)O)O)O)OC(=O)C6=CC(=C(C(=C6)O)O)O'
[12:33:09] SMILES Parse Error: syntax error while parsing: CSCPPACGZOOCGX-UHFFFAOYSA-N
[12:33:09] SMILES Parse Error: Failed parsing SMILES 'CSCPPACGZOOCGX-UHFFFAOYSA-N' for input: 'CSCPPACGZOOCGX-UHFFFAOYSA-N'
[12:33:09] SMILES Parse Error: syntax error while parsing: 1[C@H](C([C@](C(C1(C(=O)O)O)(C(=O)C2=CC(=C(C(=C2)O)O)O)C(=O)C3=CC(=C(C(=C3)O)O)O)(C(=O)C4=CC(=C(

Before standardization: (490, 23) (465, 20)
After standardization: (487, 23) (459, 20)
Internal duplicates in df1: 63 unique duplicate SMILES
Internal duplicates in df2: 27 unique duplicate SMILES
Number of duplicates between df1 and df2: 2


[12:33:09] SMILES Parse Error: syntax error while parsing: O=C(C[3:S=I]HN(CCC@4)C(C[S=N]H(NC[S=N]H(CCC[16]=CC=CC=C@16)C(O)=O)C)=O)O
[12:33:09] SMILES Parse Error: Failed parsing SMILES 'O=C(C[3:S=I]HN(CCC@4)C(C[S=N]H(NC[S=N]H(CCC[16]=CC=CC=C@16)C(O)=O)C)=O)O' for input: 'O=C(C[3:S=I]HN(CCC@4)C(C[S=N]H(NC[S=N]H(CCC[16]=CC=CC=C@16)C(O)=O)C)=O)O'
[12:33:09] SMILES Parse Error: syntax error while parsing: O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7
[12:33:09] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7' for input: 'O=C(O)CN(C(C(C)CSC(C)=O)=O)C[6:S=N]HC[S=I](C[12])(H)CCC[S=I]@12(H)C@7'
[12:33:09] SMILES Parse Error: syntax error while parsing: O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O
[12:33:09] SMILES Parse Error: Failed parsing SMILES 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O' for input: 'O=C(O)CN(CCC[17]=CC=CO@17)C(C(C)CSC(C)=O)=O'
[12:33:09] SMILES Parse Error: syntax error while parsing:

In [30]:
#merge into one file

# Label each set and drop the compounds shared by both. Note that a shared
# compound is removed from BOTH frames, so it does not appear in the merged file.
# Filtering and then using .assign() builds new frames instead of writing a
# column into a de-duplicated slice in place, which avoids pandas'
# SettingWithCopyWarning and makes the cell safe to re-run.
output_columns = ['id', 'canonical_smiles', 'inchikey', 'category']

df1_final = (
    df1_final.loc[~df1_final['canonical_smiles'].isin(common_smiles)]
    .assign(category='natural')
)
df2_final = (
    df2_final.loc[~df2_final['canonical_smiles'].isin(common_smiles)]
    .assign(category='synthetic')
)

# Stack both sets with a fresh 0..n-1 index.
final_df = pd.concat(
    [df1_final[output_columns], df2_final[output_columns]],
    ignore_index=True,
)

final_df.to_csv(os.path.join(DATAPATH, "processed", "all_molecules.csv"), index=False)

print(f"Final dataset has {len(final_df)} molecules.")

Final dataset has 813 molecules.


In [2]:
#dataset for ersilia
# Reload the merged file from disk and export only the SMILES and id columns.
final_df = pd.read_csv(os.path.join(DATAPATH, "processed", "all_molecules.csv"))
ersilia_df = final_df[['canonical_smiles', 'id']]
# to_csv() returns None when given a path, so its result must not be assigned
# back to ersilia_df (that would replace the frame with None).
ersilia_df.to_csv(os.path.join(DATAPATH, "processed", "all_smiles.csv"), index=False)