In [1]:
import os
import pandas as pd
from rdkit.SimDivFilters import  MaxMinPicker
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.error')
from tqdm.auto import tqdm
tqdm.pandas()
from multiprocessing import Pool, cpu_count
import time
import pandas as pd
from rdkit import Chem

In [2]:
# Load the four `*_ECFP6_set2_dedup.pkl` datasets from
# data_dataset2/standardized_data/ECFP_datasets_set2/ (relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
(
    rna_binders_df,
    rna_non_binders_df,
    protein_binders_df,
    protein_non_binders_df,
) = map(
    pd.read_pickle,
    (
        'data_dataset2/standardized_data/ECFP_datasets_set2/rna_binders_ECFP6_set2_dedup.pkl',
        'data_dataset2/standardized_data/ECFP_datasets_set2/rna_non_binders_ECFP6_set2_dedup.pkl',
        'data_dataset2/standardized_data/ECFP_datasets_set2/protein_binders_ECFP6_set2_dedup.pkl',
        'data_dataset2/standardized_data/ECFP_datasets_set2/protein_non_binders_ECFP6_set2_dedup.pkl',
    ),
)


In [3]:
# (rows, columns) of each of the four loaded datasets, in load order.
tuple(
    frame.shape
    for frame in (rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df)
)

((1961, 4), (21883, 4), (2276, 4), (24945, 4))

In [4]:
# Combined row count of the RNA binders and RNA non-binders frames.
total_molecules_rna = sum(len(frame) for frame in (rna_binders_df, rna_non_binders_df))
total_molecules_rna

23844

In [5]:
# Create the output folder(s) for the diversity-picking results.
# Only the ECFP6 (set 2) folder is listed here; append further paths to create more.
directories = ['../data/diversity_picking/ECFP6_set2/']


for directory in directories:
    # exist_ok=True keeps the cell idempotent and avoids the race between
    # checking for the folder and creating it.
    os.makedirs(directory, exist_ok=True)



In [6]:
# Show the column labels of the RNA binders frame.
rna_binders_df.columns

Index(['mol', 'source', 'smiles', 'ECFP6'], dtype='object')

In [7]:
# Stack the four datasets into a single frame; ignore_index=True discards the
# per-dataset indices and numbers the rows 0..n-1.
all_df = pd.concat(
    objs=(rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df),
    ignore_index=True,
)
all_df.shape

(51065, 4)

In [8]:


def _is_missing(value):
    """Return True if ``value`` is None, pd.NA, a float NaN, or an empty string."""
    if value is None or value is pd.NA:
        return True
    if isinstance(value, float):
        return value != value  # NaN is the only float that is unequal to itself
    if isinstance(value, str):
        return value == ''
    # Any other object (molecule, fingerprint, array, ...) counts as present.
    # Deliberately no `== ''` here: that comparison is ambiguous for array-likes.
    return False


def check_mol_smiles_ecfp(df, fp_col='ECFP6'):
    """Remove rows whose molecule, SMILES or fingerprint is unusable.

    A row is considered invalid when any of the following holds:
      * ``mol`` is missing or reports zero atoms via ``GetNumAtoms()``;
      * ``smiles`` is missing/empty or ``Chem.MolFromSmiles`` returns None for it;
      * the fingerprint in ``fp_col`` is missing/empty.

    One diagnostic line is printed per failed check, then the invalid rows are
    dropped from ``df`` IN PLACE (callers may rely on this without rebinding).

    Args:
        df: DataFrame with 'mol', 'smiles' and ``fp_col`` columns.
        fp_col: Name of the fingerprint column to validate (default 'ECFP6').

    Returns:
        The same ``df`` object, after invalid rows have been removed.
    """
    invalid_indices = []
    messages = []

    # Iterating the columns directly avoids building a Series per row (iterrows)
    # and lets every row be validated exactly once.
    for idx, mol, smiles, ecfp in zip(df.index, df['mol'], df['smiles'], df[fp_col]):
        bad_mol = _is_missing(mol) or mol.GetNumAtoms() == 0
        # Missing values are caught before parsing so NaN never reaches RDKit.
        bad_smiles = _is_missing(smiles) or Chem.MolFromSmiles(smiles) is None
        bad_ecfp = _is_missing(ecfp)

        if not (bad_mol or bad_smiles or bad_ecfp):
            continue

        # Record each offending label once, however many checks failed.
        invalid_indices.append(idx)
        if bad_mol:
            messages.append(f"Invalid mol at index {idx}: {smiles}")
        if bad_smiles:
            messages.append(f"Invalid smiles at index {idx}: {smiles}")
        if bad_ecfp:
            messages.append(f"Invalid {fp_col} at index {idx}: {ecfp}")

    if invalid_indices:
        for message in messages:
            print(message)

        df.drop(index=invalid_indices, inplace=True)
        print("Invalid rows removed from the dataframe.")
    else:
        print("No invalid rows found.")

    return df


In [9]:
# Validate every row. The function drops invalid rows from all_df in place and
# returns that same frame; rebinding the name makes the data flow explicit.
all_df = check_mol_smiles_ecfp(all_df)
all_df

Invalid mol at index 8782: 
Invalid smiles at index 8782: 
Invalid rows removed from the dataframe.


Unnamed: 0,mol,source,smiles,ECFP6
0,<rdkit.Chem.rdchem.Mol object at 0x7f6910515850>,robin_b,CC(=O)c1ccc(Br)c(N)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,<rdkit.Chem.rdchem.Mol object at 0x7f68b0512ac0>,robin_b,O=C(NCCO)c1cc2ccccc2oc1=O,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,<rdkit.Chem.rdchem.Mol object at 0x7f68b052a070>,robin_b,Nc1ccc2oc(-c3ccccc3)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,<rdkit.Chem.rdchem.Mol object at 0x7f68b052a110>,robin_b,Nc1ccc(-c2nc3ccccc3o2)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,<rdkit.Chem.rdchem.Mol object at 0x7f68b052a1b0>,robin_b,Cc1nc2ccc(CCO)cc2s1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
51060,<rdkit.Chem.rdchem.Mol object at 0x7f68acea4680>,zinc_dark_m,Cc1nn(C)c2nc(N(C)C[C@H]3CC(=O)N(C4CCCC4)C3)sc12,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
51061,<rdkit.Chem.rdchem.Mol object at 0x7f68acea46d0>,zinc_dark_m,CCOC(=O)c1c(C)[nH]c(C(=O)[C@H](C)N2CCN(C(=O)c3...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
51062,<rdkit.Chem.rdchem.Mol object at 0x7f68acea4720>,zinc_dark_m,Cc1cc(C)nc(Nc2ccc([C@@H]3CN(C(=O)CCOc4ccccc4C)...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ..."
51063,<rdkit.Chem.rdchem.Mol object at 0x7f68acea4770>,zinc_dark_m,CC(C)C(=O)NN/C1=C(\C(=O)C(F)(F)F)CCCCCCCCCC1,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# List the distinct values of the 'source' column.
all_df['source'].unique()

array(['robin_b', 'robin_n', 'probes_drugs', 'zinc_dark_m'], dtype=object)

In [11]:
# Split all_df back into the four datasets using the 'source' column and reset
# each subset's index so it starts at 0 again.
(
    rna_binders_df,
    rna_non_binders_df,
    protein_binders_df,
    protein_non_binders_df,
) = (
    all_df.loc[all_df['source'] == source_label].reset_index(drop=True)
    for source_label in ('robin_b', 'robin_n', 'probes_drugs', 'zinc_dark_m')
)

In [12]:
# (rows, columns) of each dataset after the validation/split step.
tuple(
    frame.shape
    for frame in (rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df)
)

((1961, 4), (21882, 4), (2276, 4), (24945, 4))

In [13]:
def diversity_picking_MaxMin(df, colname, filename, n_picks=None, seed=42):
    """Pick rows with RDKit's MaxMinPicker and pickle the picked frame.

    The fingerprints in ``df[colname]`` are passed to
    ``MaxMinPicker.LazyBitVectorPick``. With the default ``n_picks=None`` every
    row is picked, so the result is the whole dataset in pick order.

    Args:
        df: DataFrame with 'mol', 'smiles', 'source' and 'ECFP6' columns, plus
            ``colname`` (which may itself be 'ECFP6'). It is not modified.
        colname: Column holding the fingerprints used for picking.
            NOTE(review): presumably RDKit bit vectors, as the picker method
            name suggests - confirm against the dataset builder.
        filename: Path the picked DataFrame is pickled to.
        n_picks: Number of rows to pick; None (default) picks all rows.
        seed: Seed forwarded to the picker (default 42).

    Returns:
        DataFrame with columns 'mol', 'smiles', ``colname``, 'ECFP6', 'source'
        (``colname`` appears once when it equals 'ECFP6'), rows in pick order,
        indexed by each row's position in the input frame.

    Raises:
        ValueError: if ``n_picks`` is negative or larger than ``len(df)``.
    """
    # Positional labels 0..n-1 so the pick positions double as index labels.
    df = df.reset_index(drop=True)

    pool_size = len(df)
    pick_size = pool_size if n_picks is None else n_picks
    if not 0 <= pick_size <= pool_size:
        raise ValueError(
            f"n_picks must be between 0 and {pool_size}, got {n_picks}"
        )

    if pick_size == 0:
        # Nothing to pick (e.g. empty input): skip the picker entirely.
        pick_positions = []
    else:
        picker = MaxMinPicker()
        # Convert the picker's return container to plain ints for .iloc.
        pick_positions = list(
            picker.LazyBitVectorPick(df[colname].values, pool_size, pick_size, seed=seed)
        )

    # dict.fromkeys keeps first-seen order and collapses colname == 'ECFP6'.
    output_cols = list(dict.fromkeys(['mol', 'smiles', colname, 'ECFP6', 'source']))
    picked_df = df[output_cols].iloc[pick_positions]

    picked_df.to_pickle(filename)
    return picked_df



In [14]:
# Run MaxMin diversity picking on each of the four datasets. Every result is
# pickled by diversity_picking_MaxMin; the last one is shown as the cell output.
picking_jobs = (
    (rna_binders_df, '../data/diversity_picking/ECFP6_set2/rna_binders_ECFP6_set2_diversity_picked.pkl'),
    (rna_non_binders_df, '../data/diversity_picking/ECFP6_set2/rna_non_binders_ECFP6_set2_diversity_picked.pkl'),
    (protein_binders_df, '../data/diversity_picking/ECFP6_set2/protein_binders_ECFP6_set2_diversity_picked.pkl'),
    (protein_non_binders_df, '../data/diversity_picking/ECFP6_set2/protein_non_binders_ECFP6_set2_diversity_picked.pkl'),
)
for dataset_df, output_path in picking_jobs:
    last_picked_df = diversity_picking_MaxMin(dataset_df, 'ECFP6', output_path)
last_picked_df



Unnamed: 0,mol,smiles,ECFP6,source
9342,<rdkit.Chem.rdchem.Mol object at 0x7f68ad9a28e0>,CCN[C@H]1CCCC[C@@H]1N(C)C,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
146,<rdkit.Chem.rdchem.Mol object at 0x7f68ad476160>,Nc1cn2cccnc2n1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
2237,<rdkit.Chem.rdchem.Mol object at 0x7f68aeb03560>,[O-][S@+](Cl)C(Cl)=C(Cl)Cl,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
14204,<rdkit.Chem.rdchem.Mol object at 0x7f68ae199e40>,O=S1(=O)OC(F)(F)[C@@]1(F)C(F)(F)C(F)(F)C(F)(F)...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
3880,<rdkit.Chem.rdchem.Mol object at 0x7f68ad4e78d0>,C1C[C@]2(CCSC2)CN1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
...,...,...,...,...
24585,<rdkit.Chem.rdchem.Mol object at 0x7f68ace85710>,COC(=O)c1ccc(CN2CCN3CCC[C@H]3[C@@H]2C2CCCCC2)cc1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
24641,<rdkit.Chem.rdchem.Mol object at 0x7f68ace86890>,CC[C@]1(C)CN(C(=O)N[C@H](C)[C@H]2CCOC2)CCO1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",zinc_dark_m
24681,<rdkit.Chem.rdchem.Mol object at 0x7f68ace87510>,CC(C)N1CCC(N2CCN(Cc3c[nH]nc3-c3cccc(F)c3)C[C@H...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
24726,<rdkit.Chem.rdchem.Mol object at 0x7f68ace68360>,COC(=O)c1ccccc1-c1ccc([C@@H]2[C@@H](c3ccccn3)N...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zinc_dark_m
