In [1]:
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import MolFromSmiles, MolToSmiles
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import os
import pickle
from multiprocessing import Pool
from rdkit import RDLogger
import numpy as np
import itertools
RDLogger.DisableLog('rdApp.error')

In [2]:
# Load the standardized per-source datasets (pickled DataFrames) from
# ../../../data/standardized_data, resolved relative to the working directory.
# NOTE: unpickling can execute arbitrary code - only read pickle files you trust.

chemdiv_df = pd.read_pickle('../../../data/standardized_data/chemdiv_df.pkl')
enamine_df = pd.read_pickle('../../../data/standardized_data/enamine_rna_df.pkl')
enamine_protein_df = pd.read_pickle('../../../data/standardized_data/enamine_protein_df.pkl')
life_chemicals_df = pd.read_pickle('../../../data/standardized_data/life_chemicals_df.pkl')
robin_df = pd.read_pickle('../../../data/standardized_data/robin_df.pkl')



In [3]:
# Stack the per-source frames into two working frames with a fresh 0..n-1 index:
# rna_df  - every source except the Enamine protein set
# all_df  - every source
rna_frames = [chemdiv_df, enamine_df, life_chemicals_df, robin_df]
all_frames = [chemdiv_df, enamine_df, enamine_protein_df, life_chemicals_df, robin_df]

rna_df = pd.concat(rna_frames, ignore_index=True)
all_df = pd.concat(all_frames, ignore_index=True)

In [4]:
# Preview the concatenated frame (pandas shows only the first and last rows).
all_df

Unnamed: 0,source,mol,SMILES
0,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa95236f6a0>,O=C(Nc1ccc2ccccc2c1)c1ccc2c(c1)C(=O)N(c1cccc(N...
1,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fdd0>,O=C(CSc1nnc(-c2ccccc2Cl)n1-c1ccccc1)c1ccc2c(c1...
2,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9522d4130>,Cc1ccc(-n2c(=O)c3c4c(sc3n3c(SCC(=O)c5ccccc5)nn...
3,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812cd60>,O=C(Nc1ccc(C(=O)c2ccccc2)cc1)c1ccc(Oc2ccc(C(=O...
4,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fc90>,O=C(Nc1ccc(Oc2cccc(Oc3ccc(NC(=O)c4ccccc4Cl)cc3...
...,...,...,...
503222,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a20fe0>,C=CC(=O)Nc1cccc(Nc2nc(N[C@H]3CC[C@H](N(C)C)CC3...
503223,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa947361030>,N#C/C(C(=O)c1ccc(Cl)cc1Cl)=C1\NC(=O)c2ccc(Cl)c...
503224,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a21080>,C[C@H](N[C@H]1C[C@H]1c1ccccc1)c1ccc2c(c1)OCCO2
503225,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a210d0>,NCCC[C@@H](N)CC(=O)N[C@H]1CNC(=O)[C@@H]([C@@H]...


In [5]:
# Report how many molecules each frame holds before any de-duplication.
# "Total RNA" is the total without the Enamine protein set.
print(f'Total ALL: {len(all_df)}')
print(f'Total RNA: {len(rna_df)}')
print(f'Enamine Protein: {len(enamine_protein_df)}')
print(f'Chemdiv: {len(chemdiv_df)}')
print(f'Enamine: {len(enamine_df)}')
print(f'Life Chemicals: {len(life_chemicals_df)}')
print(f'Robin: {len(robin_df)}')


# Bookkeeping table with one row per source: size before de-duplication, size
# after, and the difference. Only 'before' is known here; 'after' and
# 'difference' start as zero placeholders and are filled in further down.
sizes_before = {
    'Enamine Protein': len(enamine_protein_df),
    'Chemdiv': len(chemdiv_df),
    'Enamine': len(enamine_df),
    'Life Chemicals': len(life_chemicals_df),
    'Robin': len(robin_df),
}
df_deduplication = pd.DataFrame(
    {'before': list(sizes_before.values()), 'after': 0, 'difference': 0},
    index=list(sizes_before),
)

df_deduplication

Total ALL: 503227
Total RNA: 43067
Enamine Protein: 460160
Chemdiv: 20000
Enamine: 15520
Life Chemicals: 5544
Robin: 2003


Unnamed: 0,before,after,difference
Enamine Protein,460160,0,0
Chemdiv,20000,0,0
Enamine,15520,0,0
Life Chemicals,5544,0,0
Robin,2003,0,0


In [6]:
# Tag every row with the library it came from, so the origin survives concatenation.
for frame, label in (
    (chemdiv_df, 'chemdiv'),
    (enamine_df, 'enamine'),
    (enamine_protein_df, 'enamine_protein'),
    (life_chemicals_df, 'life_chemicals'),
    (robin_df, 'robin'),
):
    frame['source'] = label


In [7]:
# Derive a canonical SMILES string from the RDKit molecule stored in each
# frame's 'mol' column and keep it in a new 'SMILES' column.
datasets = [chemdiv_df, enamine_df, enamine_protein_df, life_chemicals_df, robin_df]

for frame in datasets:
    frame['SMILES'] = frame['mol'].map(lambda mol: MolToSmiles(mol, canonical=True))


In [8]:
# Discard the earlier concatenations; the next cell rebuilds both frames from the per-source frames.
del rna_df, all_df

In [9]:
# Rebuild the stacked frames from the per-source frames:
# all_df  - every source
# rna_df  - every source except the Enamine protein set
all_frames = [chemdiv_df, enamine_df, enamine_protein_df, life_chemicals_df, robin_df]
rna_frames = [chemdiv_df, enamine_df, life_chemicals_df, robin_df]

all_df = pd.concat(all_frames, ignore_index=True)
rna_df = pd.concat(rna_frames, ignore_index=True)

In [10]:
# Per-column summary; comparing 'unique' with 'count' for SMILES shows how many duplicates exist.
all_df.describe()

Unnamed: 0,source,mol,SMILES
count,503227,503227,503227
unique,5,503227,498994
top,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a21120>,CCC(C)Nc1nc2c(c(=O)n(C)c(=O)n2C)n1CCCc1ccccc1
freq,460160,1,3


In [11]:
# import pandas as pd
# from rdkit import Chem
# from chembl_structure_pipeline import standardizer

# def standardize_smiles(smiles):
#     """Standardize SMILES using ChEMBL Structure Pipeline."""
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         standardized_mol = standardizer.standardize_mol(mol)
#         standardized_smiles = Chem.MolToSmiles(standardized_mol, canonical=True)
#         return standardized_smiles
#     return None

# def correct_smiles(smiles):
#     """Keep the longer part of the SMILES string, discarding salts."""
#     parts = smiles.split('.')
#     if len(parts) > 1:
#         largest_part = max(parts, key=lambda x: len(Chem.MolFromSmiles(x).GetAtoms()))
#         return largest_part
#     return smiles

# # Assuming all_df is your DataFrame containing molecules data
# # Remove the 'mol' column
# all_df.drop(columns=['mol'], inplace=True)

# # Initialize a list to store molecules with disconnected components
# disconnected_mols = []

# # Process each row in all_df
# for idx, row in all_df.iterrows():
#     smiles = row['SMILES']
#     source = row['source']

#     # Standardize and correct SMILES
#     standardized_smiles = standardize_smiles(smiles)
#     if standardized_smiles is None:
#         print(f"Invalid SMILES string at index {idx}: {smiles}")
#         continue

#     corrected_smiles = correct_smiles(standardized_smiles)
#     mol_from_smiles = Chem.MolFromSmiles(corrected_smiles)

#     if mol_from_smiles is None or mol_from_smiles.GetNumAtoms() == 0:
#         print(f"Invalid SMILES string after correction at index {idx}: {smiles}")
#         continue

#     num_components = Chem.GetMolFrags(mol_from_smiles, asMols=False, sanitizeFrags=False)
#     if len(num_components) > 1:
#         print(f"Disconnected components in molecule at index {idx}: {smiles}, from source {source}")
#         disconnected_mols.append({'Index': idx, 'SMILES': corrected_smiles, 'Source': source})
#         all_df.at[idx, 'SMILES'] = corrected_smiles  # Update SMILES with corrected version

# # Recompute 'mol' column from corrected SMILES
# all_df['mol'] = all_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

# # Convert the list to a DataFrame and save to JSON
# disconnected_mols_df = pd.DataFrame(disconnected_mols)
# disconnected_mols_df.to_json('disconnected_mols_df.json')

# # Optionally, print or return the DataFrame of disconnected molecules
# disconnected_mols_df

In [12]:
# Preview the rebuilt combined frame.
all_df

Unnamed: 0,source,mol,SMILES
0,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa95236f6a0>,O=C(Nc1ccc2ccccc2c1)c1ccc2c(c1)C(=O)N(c1cccc(N...
1,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fdd0>,O=C(CSc1nnc(-c2ccccc2Cl)n1-c1ccccc1)c1ccc2c(c1...
2,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9522d4130>,Cc1ccc(-n2c(=O)c3c4c(sc3n3c(SCC(=O)c5ccccc5)nn...
3,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812cd60>,O=C(Nc1ccc(C(=O)c2ccccc2)cc1)c1ccc(Oc2ccc(C(=O...
4,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fc90>,O=C(Nc1ccc(Oc2cccc(Oc3ccc(NC(=O)c4ccccc4Cl)cc3...
...,...,...,...
503222,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a20fe0>,C=CC(=O)Nc1cccc(Nc2nc(N[C@H]3CC[C@H](N(C)C)CC3...
503223,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa947361030>,N#C/C(C(=O)c1ccc(Cl)cc1Cl)=C1\NC(=O)c2ccc(Cl)c...
503224,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a21080>,C[C@H](N[C@H]1C[C@H]1c1ccccc1)c1ccc2c(c1)OCCO2
503225,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a210d0>,NCCC[C@@H](N)CC(=O)N[C@H]1CNC(=O)[C@@H]([C@@H]...


In [13]:
# Collect rows whose SMILES contains a '.', i.e. more than one disconnected
# fragment (typically a salt / counter-ion written next to the parent molecule).
# regex=False matches the dot literally; the previous pattern '\.' relied on an
# invalid escape sequence inside a non-raw string literal.
dissconnected_molecules = all_df[all_df['SMILES'].str.contains('.', regex=False)]

# Save the multi-fragment molecules to JSON for later inspection.
dissconnected_molecules.to_json('mols_with_salt.json')


In [14]:
# Show the multi-fragment rows found above (an empty frame means none were found).
dissconnected_molecules

Unnamed: 0,source,mol,SMILES


In [15]:
# from chembl_structure_pipeline import standardizer
# from rdkit import Chem

# def get_parent_mol(smiles):
#     """Remove salt from SMILES and return the parent molecule's SMILES."""
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         parent_mol, _ = standardizer.get_parent_mol(mol)  # Unpack the returned tuple
#         if parent_mol:  # Ensure parent_mol is not None
#             parent_smiles = Chem.MolToSmiles(parent_mol, canonical=True)
#             return parent_smiles
#     return None

# # Assuming dissconnected_molecules is a DataFrame containing molecules
# removed_salts = []
# for idx, row in dissconnected_molecules.iterrows():
#     smiles = row['SMILES']
#     source = row['source']
#     parent_smiles = get_parent_mol(smiles)
#     if parent_smiles is None:
#         print(f"Invalid SMILES string at index {idx}: {smiles}")
#         continue
#     removed_salts.append({'Index': idx, 'SMILES': parent_smiles, 'Source': source})
#     # Assuming you want to update all_df too
#     all_df.at[idx, 'SMILES'] = parent_smiles  # Update SMILES with corrected version

# # Process the removed_salts list as needed
# removed_salts_df = pd.DataFrame(removed_salts)
# removed_salts_df.to_json('removed_salts_df.json')



In [16]:
# Independent copy of the rows whose SMILES still contains a '.' (multi-fragment).
# regex=False matches the dot literally instead of relying on the invalid '\.'
# escape in a non-raw string literal.
salt_check = all_df[all_df['SMILES'].str.contains('.', regex=False)].copy()
salt_check


Unnamed: 0,source,mol,SMILES


In [17]:
# # from salt_check, remove the [O-][Cl+3]([O-])([O-])[O-] from the SMILES
# salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'[O-][Cl+3]([O-])([O-])[O-].', '')
# salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])[O-]', '')
# salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])O', '')


In [18]:
# Print every flagged SMILES on its own line (prints nothing when salt_check is empty).
for smiles in salt_check['SMILES'].tolist():
    print(smiles)

In [19]:
# Strip perchlorate counter-ions that appear as a separate '.'-joined SMILES
# fragment, whether they come before or after the parent molecule.
#
# regex=False is essential: these are literal SMILES fragments. Read as a regular
# expression, '[O-]' and '[Cl+3]' would be character classes and '.' a wildcard,
# so unrelated parts of a SMILES could be deleted. Older pandas releases treated
# the pattern as a regex by default, so the flag is stated explicitly.
# NOTE(review): only the SMILES text is edited; the 'mol' column is not rebuilt here.
for counter_ion in (
    '[O-][Cl+3]([O-])([O-])[O-].',
    '.[O-][Cl+3]([O-])([O-])[O-]',
    '.[O-][Cl+3]([O-])([O-])O',
):
    all_df['SMILES'] = all_df['SMILES'].str.replace(counter_ion, '', regex=False)

In [20]:
# Confirm no multi-fragment SMILES remain after stripping the counter-ions.
# regex=False matches the dot literally (the old '\.' was an invalid string escape).
all_df[all_df['SMILES'].str.contains('.', regex=False)]

Unnamed: 0,source,mol,SMILES


In [21]:
# Spot-check the first five rows of the combined frame.
all_df.head(5)

Unnamed: 0,source,mol,SMILES
0,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa95236f6a0>,O=C(Nc1ccc2ccccc2c1)c1ccc2c(c1)C(=O)N(c1cccc(N...
1,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fdd0>,O=C(CSc1nnc(-c2ccccc2Cl)n1-c1ccccc1)c1ccc2c(c1...
2,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9522d4130>,Cc1ccc(-n2c(=O)c3c4c(sc3n3c(SCC(=O)c5ccccc5)nn...
3,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812cd60>,O=C(Nc1ccc(C(=O)c2ccccc2)cc1)c1ccc(Oc2ccc(C(=O...
4,chemdiv,<rdkit.Chem.rdchem.Mol object at 0x7fa9a812fc90>,O=C(Nc1ccc(Oc2cccc(Oc3ccc(NC(=O)c4ccccc4Cl)cc3...


In [22]:
# # Initialize a list to store molecules with disconnected components
# disconnected_mols = []

# # Process each row in all_df
# for idx, row in all_df.iterrows():
#     smiles = row['SMILES']
#     source = row['source']

#     # Standardize and correct SMILES
#     standardized_smiles = standardize_smiles(smiles)
#     if standardized_smiles is None:
#         print(f"Invalid SMILES string at index {idx}: {smiles}")
#         continue

#     corrected_smiles = correct_smiles(standardized_smiles)
#     mol_from_smiles = Chem.MolFromSmiles(corrected_smiles)

#     if mol_from_smiles is None or mol_from_smiles.GetNumAtoms() == 0:
#         print(f"Invalid SMILES string after correction at index {idx}: {smiles}")
#         continue

#     num_components = Chem.GetMolFrags(mol_from_smiles, asMols=False, sanitizeFrags=False)
#     if len(num_components) > 1:
#         print(f"Disconnected components in molecule at index {idx}: {smiles}, from source {source}")
#         disconnected_mols.append({'Index': idx, 'SMILES': corrected_smiles, 'Source': source})
#         all_df.at[idx, 'SMILES'] = corrected_smiles  # Update SMILES with corrected version

# # Recompute 'mol' column from corrected SMILES
# all_df['mol'] = all_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

# # Convert the list to a DataFrame and save to JSON
# disconnected_mols_2_df = pd.DataFrame(disconnected_mols)
# disconnected_mols_2_df.to_json('disconnected_mols_2_df.json')

# # Optionally, print or return the DataFrame of disconnected molecules
# disconnected_mols_2_df

In [23]:
# Final check that no SMILES contains a '.' (multi-fragment entry).
# regex=False matches the dot literally (the old '\.' was an invalid string escape).
all_df[all_df['SMILES'].str.contains('.', regex=False)]

Unnamed: 0,source,mol,SMILES


In [24]:
# Split the combined frame back into one frame per value of the 'source' column.
source_labels = all_df['source']
chemdiv_df = all_df.loc[source_labels.eq('chemdiv')]
enamine_df = all_df.loc[source_labels.eq('enamine')]
enamine_protein_df = all_df.loc[source_labels.eq('enamine_protein')]
life_chemicals_df = all_df.loc[source_labels.eq('life_chemicals')]
robin_df = all_df.loc[source_labels.eq('robin')]

# Report the size of every per-source frame.
print(f'Chemdiv: {len(chemdiv_df)}')
print(f'Enamine: {len(enamine_df)}')
print(f'Enamine Protein: {len(enamine_protein_df)}')
print(f'Life Chemicals: {len(life_chemicals_df)}')
print(f'Robin: {len(robin_df)}')


Chemdiv: 20000
Enamine: 15520
Enamine Protein: 460160
Life Chemicals: 5544
Robin: 2003


In [25]:
# Display label -> per-source frame. The insertion order also fixes the
# row/column order of the pairwise matrix built below.
datasets = {
    'Enamine protein': enamine_protein_df,
    'Chemdiv': chemdiv_df,
    'Enamine': enamine_df,
    'Life Chemicals': life_chemicals_df,
    'ROBIN': robin_df,
}

# Remove duplicates within each dataset first, so a SMILES is counted at most
# once per source. Only the copies held in `datasets` change here; the *_df
# variables themselves are not rebound by this loop.
for dataset in datasets:
    datasets[dataset] = datasets[dataset].drop_duplicates(subset='SMILES', keep='first')

# Symmetric matrix of shared-SMILES counts between every pair of datasets,
# created directly as integer zeros (no object-dtype frame + fillna round trip).
df_dup_count = pd.DataFrame(0, index=list(datasets), columns=list(datasets))

# Iterate over all unordered pairs of datasets and count the shared SMILES.
for (name1, df1), (name2, df2) in itertools.combinations(datasets.items(), 2):
    # Rows of df1 whose SMILES also occurs in df2.
    duplicates = df1[df1['SMILES'].isin(df2['SMILES'])]

    # Store the count on both sides of the diagonal.
    df_dup_count.loc[name1, name2] = len(duplicates)
    df_dup_count.loc[name2, name1] = len(duplicates)

    print(f"Duplicates between {name1} and {name2}: {len(duplicates)}")

# Zero the diagonal explicitly (a dataset is never compared with itself).
# This replaces `df_dup_count.values[[np.arange(n)]*2] = 0`: indexing with a
# list of arrays is no longer read as a (row, col) pair by current NumPy - it
# selects whole rows - and `.values` is not guaranteed to be a writable view of
# the frame, so that statement could either do nothing or wipe the matrix.
for name in datasets:
    df_dup_count.loc[name, name] = 0

# Drop duplicates inside rna_df and enamine_protein_df (keep the first occurrence).
rna_df = rna_df.drop_duplicates(subset='SMILES', keep='first')
enamine_protein_df = enamine_protein_df.drop_duplicates(subset='SMILES', keep='first')

# SMILES present in both the RNA-side frame and the Enamine protein frame.
rna_prot_dup = rna_df[rna_df['SMILES'].isin(enamine_protein_df['SMILES'])]
print(f'Duplicates between RNA and Enamine Protein: {len(rna_prot_dup)}')
df_dup_count


Duplicates between Enamine protein and Chemdiv: 54
Duplicates between Enamine protein and Enamine: 3901
Duplicates between Enamine protein and Life Chemicals: 111
Duplicates between Enamine protein and ROBIN: 1
Duplicates between Chemdiv and Enamine: 11
Duplicates between Chemdiv and Life Chemicals: 20
Duplicates between Chemdiv and ROBIN: 7
Duplicates between Enamine and Life Chemicals: 112
Duplicates between Enamine and ROBIN: 2
Duplicates between Life Chemicals and ROBIN: 1
Duplicates between RNA and Enamine Protein: 4059


Unnamed: 0,Enamine protein,Chemdiv,Enamine,Life Chemicals,ROBIN
Enamine protein,0,54,3901,111,1
Chemdiv,54,0,11,20,7
Enamine,3901,11,0,112,2
Life Chemicals,111,20,112,0,1
ROBIN,1,7,2,1,0


In [26]:
# Sizes of the per-source frames after the previous cell. Of these five variables,
# only enamine_protein_df was rebound to a de-duplicated frame there.
print(f'Chemdiv: {len(chemdiv_df)}')
print(f'Enamine: {len(enamine_df)}')
print(f'Enamine Protein: {len(enamine_protein_df)}')
print(f'Life Chemicals: {len(life_chemicals_df)}')
print(f'Robin: {len(robin_df)}')

Chemdiv: 20000
Enamine: 15520
Enamine Protein: 460139
Life Chemicals: 5544
Robin: 2003


In [27]:
# # Save the number of duplicates to a pickle file
# df_dup_count.to_pickle('duplicate_counts_v4.pkl')
    
# # Save the number of duplicates to a text file
# duplicate_counts = df_dup_count.to_dict()
# with open('duplicate_counts_v4.txt', 'w') as f:
#     for key, value in duplicate_counts.items():
#         f.write(f'{key}: {value}\n')
    
# # save found duplicates to a pickle file
# duplicates = rna_df[rna_df['SMILES'].isin(enamine_protein_df['SMILES'])]
# duplicates.to_pickle('duplicates_v4.pkl')


In [28]:
# # Combine datasets into one DataFrame
# combined_df = pd.concat([enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df])

# # Sort datasets by size (largest to smallest)
# datasets = [enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df]
# sorted_datasets = sorted(datasets, key=lambda x: x.shape[0], reverse=True)

# # Finding all unique duplicates across datasets
# all_duplicates = combined_df[combined_df.duplicated('SMILES', keep=False)]

# # Remove duplicates in order from largest to smallest, except the smallest
# for i in range(len(sorted_datasets) - 1):  # Skip the smallest dataset
#     dataset = sorted_datasets[i]
#     duplicates_to_remove = all_duplicates[~all_duplicates['source'].isin([dataset['source'].iloc[0]])]['SMILES']
#     sorted_datasets[i] = dataset[~dataset['SMILES'].isin(duplicates_to_remove)]

# # Extracting the updated datasets
# enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df = sorted_datasets

# # Now the datasets are deduplicated in the desired order

## Remove all duplicates (every SMILES that occurs more than once is dropped from all datasets)





In [29]:
# Stack the per-source frames (original row labels are kept).
combined_df = pd.concat([enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df])

# Every SMILES that occurs more than once anywhere in the stacked frame.
# keep=False flags all occurrences, not only the repeats.
repeated_mask = combined_df.duplicated('SMILES', keep=False)
all_duplicates = combined_df.loc[repeated_mask, 'SMILES'].unique()

# For each source: drop every row whose SMILES is in that set, then drop any
# remaining within-frame repeats (keeping the first occurrence).
datasets = [
    frame[~frame['SMILES'].isin(all_duplicates)].drop_duplicates(subset='SMILES')
    for frame in (enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df)
]

# Rebind the per-source names to the cleaned frames.
enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df = datasets

In [30]:
# # find duplicates in enamine_protein_df and remove them

# enamine_protein_df = enamine_protein_df.drop_duplicates(subset='SMILES', keep='first')


In [31]:
# Source label -> cleaned per-source frame.
source_names = ['enamine_protein', 'chemdiv', 'enamine', 'life_chemicals', 'robin']
source_frames = [enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df]
datasets = dict(zip(source_names, source_frames))

# For every unordered pair of sources, report how many rows of the first frame
# have a SMILES that also occurs in the second.
for (name1, df1), (name2, df2) in itertools.combinations(datasets.items(), 2):
    shared_mask = df1['SMILES'].isin(df2['SMILES'])
    duplicates = df1[shared_mask]

    print(f"Duplicates between {name1} and {name2}: {len(duplicates)}")

Duplicates between enamine_protein and chemdiv: 0
Duplicates between enamine_protein and enamine: 0
Duplicates between enamine_protein and life_chemicals: 0
Duplicates between enamine_protein and robin: 0
Duplicates between chemdiv and enamine: 0
Duplicates between chemdiv and life_chemicals: 0
Duplicates between chemdiv and robin: 0
Duplicates between enamine and life_chemicals: 0
Duplicates between enamine and robin: 0
Duplicates between life_chemicals and robin: 0


In [32]:
# Record the post-de-duplication sizes (same row order as the table's index)
# and how many molecules each source lost.
sizes_after = [
    len(frame)
    for frame in (enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df)
]
df_deduplication['after'] = sizes_after
df_deduplication['difference'] = df_deduplication['before'].sub(df_deduplication['after'])
df_deduplication

Unnamed: 0,before,after,difference
Enamine Protein,460160,456080,4080
Chemdiv,20000,19908,92
Enamine,15520,11502,4018
Life Chemicals,5544,5308,236
Robin,2003,1992,11


In [33]:
# Order the cleaned per-source frames from largest to smallest (by row count).
datasets = [enamine_protein_df, chemdiv_df, enamine_df, life_chemicals_df, robin_df]
sorted_datasets = sorted(datasets, key=len, reverse=True)

# Stack them and look once more for any SMILES that occurs more than once.
combined_df = pd.concat(sorted_datasets)
repeated_rows = combined_df.duplicated('SMILES', keep=False)
duplicates_check = combined_df[repeated_rows]
print("No duplicates found." if duplicates_check.empty else "Duplicates still exist.")


No duplicates found.


In [34]:
# Which dataset contributes the most remaining duplicates?
# An empty Series means duplicates_check has no rows.
duplicates_check['source'].value_counts()


Series([], Name: count, dtype: int64)

In [35]:
# Make sure the output directory for de-duplicated data exists.
# exist_ok=True makes this a single idempotent call instead of a separate
# exists()/makedirs() pair, which can race if the directory appears in between.
# NOTE(review): nothing in the visible notebook writes into this directory, and
# its root ('../data') differs from the '../../../data' root used for the pickles
# - confirm the intended location.
os.makedirs('../data/no_duplicates/v2', exist_ok=True)

In [36]:
# Preview the de-duplicated, size-ordered stack; row labels are those carried over from all_df.
combined_df

Unnamed: 0,source,mol,SMILES
35520,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a457b0>,CCc1nn(C)cc1NC(=O)c1cnn(CC(=O)NC2CCCCC2)c1
35521,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45530>,O=C(Cc1coc2ccccc12)NC1(c2ncon2)CCOCC1
35522,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45760>,CC(C)(C)c1ccc(O)c(NC(=O)c2cccc(Oc3cnccn3)c2)c1
35523,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45440>,CC(NC(=O)Cc1ccc(Cl)s1)(C(N)=O)c1ccccc1
35524,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a46b10>,COCc1cccc(S(=O)(=O)Nc2cccc(-n3cn[nH]c3=O)c2C)c1
...,...,...,...
503222,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a20fe0>,C=CC(=O)Nc1cccc(Nc2nc(N[C@H]3CC[C@H](N(C)C)CC3...
503223,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa947361030>,N#C/C(C(=O)c1ccc(Cl)cc1Cl)=C1\NC(=O)c2ccc(Cl)c...
503224,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a21080>,C[C@H](N[C@H]1C[C@H]1c1ccccc1)c1ccc2c(c1)OCCO2
503225,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a210d0>,NCCC[C@@H](N)CC(=O)N[C@H]1CNC(=O)[C@@H]([C@@H]...


## v4: duplicates are removed completely from every dataset

In [37]:
# ECFP calculation based on RDKit's GetMorganFingerprintAsBitVect. The Morgan
# radius selects the fingerprint: radius 2 -> ECFP4, radius 3 -> ECFP6,
# radius 4 -> ECFP8, radius 5 -> ECFP10.

def compute_ECFP(df, smiles_column, radius, nBits=2048):
    """Add a Morgan (ECFP) bit-vector fingerprint column to ``df``.

    Every value of ``df[smiles_column]`` is parsed with ``MolFromSmiles`` and the
    resulting molecule is passed to ``GetMorganFingerprintAsBitVect``. The
    fingerprints are stored in a new column named ``'ECFP' + str(radius * 2)``
    (for example ``'ECFP6'`` for ``radius=3``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings. It is modified in place.
    smiles_column : str
        Name of the column that contains the SMILES strings.
    radius : int
        Morgan radius; the column suffix is twice this value.
    nBits : int, optional
        Length of the bit vector. Defaults to 2048, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra fingerprint column.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (``MolFromSmiles`` returned ``None``).
    """
    def _fingerprint(smiles):
        mol = MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending input instead of an opaque argument error
            # raised from inside the fingerprint call.
            raise ValueError(f'Cannot compute ECFP: invalid SMILES {smiles!r}')
        return GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

    df['ECFP' + str(radius * 2)] = df[smiles_column].apply(_fingerprint)

    return df

In [38]:
# Add an 'ECFP6' column (radius 3) to combined_df; the function modifies the frame
# in place and returns it, so the cell also displays the result.
compute_ECFP(combined_df, 'SMILES', 3)

Unnamed: 0,source,mol,SMILES,ECFP6
35520,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a457b0>,CCc1nn(C)cc1NC(=O)c1cnn(CC(=O)NC2CCCCC2)c1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
35521,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45530>,O=C(Cc1coc2ccccc12)NC1(c2ncon2)CCOCC1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ..."
35522,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45760>,CC(C)(C)c1ccc(O)c(NC(=O)c2cccc(Oc3cnccn3)c2)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
35523,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a45440>,CC(NC(=O)Cc1ccc(Cl)s1)(C(N)=O)c1ccccc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
35524,enamine_protein,<rdkit.Chem.rdchem.Mol object at 0x7fa946a46b10>,COCc1cccc(S(=O)(=O)Nc2cccc(-n3cn[nH]c3=O)c2C)c1,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
503222,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a20fe0>,C=CC(=O)Nc1cccc(Nc2nc(N[C@H]3CC[C@H](N(C)C)CC3...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
503223,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa947361030>,N#C/C(C(=O)c1ccc(Cl)cc1Cl)=C1\NC(=O)c2ccc(Cl)c...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
503224,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a21080>,C[C@H](N[C@H]1C[C@H]1c1ccccc1)c1ccc2c(c1)OCCO2,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
503225,robin,<rdkit.Chem.rdchem.Mol object at 0x7fa946a210d0>,NCCC[C@@H](N)CC(=O)N[C@H]1CNC(=O)[C@@H]([C@@H]...,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [43]:
# Check for repeated values in the SMILES column (rows after the first occurrence);
# an empty result means every SMILES is unique.
combined_df[combined_df.duplicated('SMILES')]

Unnamed: 0,source,mol,SMILES,ECFP6


In [40]:
# Split the fingerprinted frame back into one frame per value of 'source'.
source_labels = combined_df['source']
chemdiv_df = combined_df.loc[source_labels.eq('chemdiv')]
enamine_df = combined_df.loc[source_labels.eq('enamine')]
enamine_protein_df = combined_df.loc[source_labels.eq('enamine_protein')]
life_chemicals_df = combined_df.loc[source_labels.eq('life_chemicals')]
robin_df = combined_df.loc[source_labels.eq('robin')]


In [41]:
# Give every per-source frame a fresh 0..n-1 index.
# The frames are boolean-filtered slices of combined_df, so they are rebound to
# new frames rather than mutated with inplace=True; re-running the cell is safe.
chemdiv_df, enamine_df, enamine_protein_df, life_chemicals_df, robin_df = (
    frame.reset_index(drop=True)
    for frame in (chemdiv_df, enamine_df, enamine_protein_df, life_chemicals_df, robin_df)
)
    

In [42]:
# Directory that receives the ECFP pickles. Create exactly the directory that is
# written to below: the previous check created '../data/ECFP_datasets', while the
# files go to '../../../data/ECFP_datasets', so the writes failed whenever that
# second directory did not already exist.
ecfp_dir = '../../../data/ECFP_datasets'
os.makedirs(ecfp_dir, exist_ok=True)

# Save each per-source frame as a pickle file.
for source_name, frame in (
    ('chemdiv', chemdiv_df),
    ('enamine', enamine_df),
    ('enamine_protein', enamine_protein_df),
    ('life_chemicals', life_chemicals_df),
    ('robin', robin_df),
):
    frame.to_pickle(f'{ecfp_dir}/{source_name}_ECFP6_v4_dedup_all.pkl')
