In [1]:
import os
import pandas as pd
from rdkit.SimDivFilters import  MaxMinPicker
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.error')
from tqdm.auto import tqdm
tqdm.pandas()
from multiprocessing import Pool, cpu_count
import time

In [2]:
# Load the pickled per-library DataFrames from ../data/ECFP_datasets/.
# Judging by the file names these hold de-duplicated ECFP6 fingerprints
# (one protein library, four RNA libraries) - confirm against the script that wrote them.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
enamine_protein = pd.read_pickle('../data/ECFP_datasets/enamine_protein_ECFP6_v4_dedup_all.pkl')
chemdiv_rna = pd.read_pickle('../data/ECFP_datasets/chemdiv_ECFP6_v4_dedup_all.pkl')
enamine_rna = pd.read_pickle('../data/ECFP_datasets/enamine_ECFP6_v4_dedup_all.pkl')
life_chemicals_rna = pd.read_pickle('../data/ECFP_datasets/life_chemicals_ECFP6_v4_dedup_all.pkl')
robin_rna = pd.read_pickle('../data/ECFP_datasets/robin_ECFP6_v4_dedup_all.pkl')

In [3]:
# Combined row count of the four RNA libraries; used later as the number of
# molecules to pick from the protein library.
total_molecules_rna = sum(
    len(library)
    for library in (chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna)
)


In [4]:
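# (Disabled) For each dataset, create a `source` column identifying which library a molecule came from.
# NOTE: the loaded tables appear to ship with a `source` column already (the life_chemicals
# preview below shows source == 'life_chemicals'), so enabling these lines would overwrite
# those labels with the names used here.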

# enamine_protein['source'] = 'enamine_protein'
# chemdiv_rna['source'] = 'chemdiv_rna'
# enamine_rna['source'] = 'enamine_rna'
# life_chemicals_rna['source'] = 'life_chemicals_rna'
# robin_rna['source'] = 'robin_rna'


In [5]:
# Preview the first rows to check which columns the loaded table provides.
life_chemicals_rna.head()

Unnamed: 0,source,SMILES,mol,ECFP6
0,life_chemicals,Cn1c(N2CCC(CO)CC2)cc(=O)n(C)c1=O,<rdkit.Chem.rdchem.Mol object at 0x7fa6101501d0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,life_chemicals,O=c1cccc2n1CC1CC2CN(C2CCOCC2)C1,<rdkit.Chem.rdchem.Mol object at 0x7fa60d0e0220>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,life_chemicals,CN1CCCN(c2ncccc2C#N)CC1,<rdkit.Chem.rdchem.Mol object at 0x7fa60d0e0270>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,life_chemicals,CN1CCN(C(=O)c2cc(C3CC3)on2)CC1,<rdkit.Chem.rdchem.Mol object at 0x7fa60d0e02c0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,life_chemicals,O=C(NCC1CCCO1)c1noc2c1CCCC2,<rdkit.Chem.rdchem.Mol object at 0x7fa60d0e0310>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# Create the output folders for the ECFP6 diversity-picking results:
# one for the per-library picks and one for the merged RNA set.
directories = ['../data/diversity_picking/ECFP6_v2/',
               '../data/diversity_picking/RNA/ECFP6_v2/',]

for directory in directories:
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(directory, exist_ok=True)



In [7]:
def diversity_picking_MaxMin(df, colname, filename, n_picks=None):
    """Run RDKit's MaxMin picker over one fingerprint table and pickle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the fingerprint column ``colname`` plus the ``SMILES``,
        ``mol`` and ``source`` columns.
    colname : str
        Name of the fingerprint column. Its values are handed unchanged to
        ``MaxMinPicker.LazyBitVectorPick`` (presumably RDKit bit vectors -
        confirm against the code that built the table).
    filename : str
        Path of the pickle file the picked table is written to.
    n_picks : int, optional
        Number of rows to pick. Defaults to ``len(df)``, in which case every
        row is returned and the result is the input reordered in pick order.

    Returns
    -------
    pandas.DataFrame
        Columns ``mol``, ``smiles`` (note the lower-case name), ``colname``
        and ``source``; rows are in pick order and keep their positional
        labels from the reset index.
    """
    # Work on a clean 0..n-1 index so picker positions and row labels coincide.
    df = df.reset_index(drop=True)

    pool_size = len(df)
    if n_picks is None:
        n_picks = pool_size

    fps = df[colname].values
    picker = MaxMinPicker()
    # Materialise the picker output as a plain list of ints so that the
    # positional lookups below do not depend on how pandas treats the
    # picker's own return container. The seed is fixed at 42.
    picks = list(picker.LazyBitVectorPick(fps, pool_size, n_picks, seed=42))

    picked_df = pd.DataFrame({
        'mol': df['mol'].iloc[picks],
        'smiles': df['SMILES'].iloc[picks],
        colname: df[colname].iloc[picks],
        'source': df['source'].iloc[picks],
    })

    picked_df.to_pickle(filename)
    return picked_df



In [8]:
def diversity_picking_MaxMin_protein(df, colname, filename, n_picks=None):
    """Pick a fixed-size diverse subset of a fingerprint table and pickle it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the fingerprint column ``colname`` plus the ``SMILES``,
        ``mol`` and ``source`` columns.
    colname : str
        Name of the fingerprint column. Its values are handed unchanged to
        ``MaxMinPicker.LazyBitVectorPick``; the output column keeps this name.
    filename : str
        Path of the pickle file the picked table is written to.
    n_picks : int, optional
        Number of rows to pick. When omitted, the module-level
        ``total_molecules_rna`` is used, as in the original implementation.

    Returns
    -------
    pandas.DataFrame
        Columns ``mol``, ``smiles`` (note the lower-case name), ``colname``
        and ``source``; rows are in pick order.
    """
    # Work on a clean 0..n-1 index so picker positions and row labels coincide.
    df = df.reset_index(drop=True)

    if n_picks is None:
        # Looked up at call time so a later change to the global is honoured.
        n_picks = total_molecules_rna

    fps = df[colname].values
    picker = MaxMinPicker()
    # The picker returns positions into `fps`; keep them as a plain list and
    # select with .iloc so the lookup is explicitly positional (the previous
    # series[picks] form was label-based and only worked thanks to the reset).
    picks = list(picker.LazyBitVectorPick(fps, len(df), n_picks, seed=42))

    picked_df = pd.DataFrame({
        'mol': df['mol'].iloc[picks],
        'smiles': df['SMILES'].iloc[picks],
        colname: df[colname].iloc[picks],
        'source': df['source'].iloc[picks],
    })

    picked_df.to_pickle(filename)
    return picked_df


In [9]:

# Work items for the per-library RNA runs. Each entry is a
# (DataFrame, fingerprint column name, output pickle path) tuple.
datasets = [
    (library_df, 'ECFP6', pickle_path)
    for library_df, pickle_path in (
        (robin_rna, '../data/diversity_picking/ECFP6_v2/robin_rna_ECFP6_MaxMin_v3.pkl'),
        (chemdiv_rna, '../data/diversity_picking/ECFP6_v2/chemdiv_rna_ECFP6_MaxMin_v3.pkl'),
        (enamine_rna, '../data/diversity_picking/ECFP6_v2/enamine_rna_ECFP6_MaxMin_v3.pkl'),
        (life_chemicals_rna, '../data/diversity_picking/ECFP6_v2/life_chemicals_rna_ECFP6_MaxMin_v3.pkl'),
    )
]

# Same tuple layout for the enamine_protein table.
protein = [
    (enamine_protein, 'ECFP6', '../data/diversity_picking/ECFP6_v2/enamine_protein_ECFP6_MaxMin_v3.pkl'),
]

In [10]:

# Number of worker processes handed to the multiprocessing pools below.
num_processors = 8   # fixed at 8; replace with cpu_count() to use every available core



In [11]:
def process_dataset(dataset):
    """Run ``diversity_picking_MaxMin`` on one work item and report its duration.

    ``dataset`` is a ``(DataFrame, fingerprint column, output path)`` tuple.
    The progress bar is advanced once, by the full row count, after picking
    has finished; the elapsed wall-clock time is attached as its postfix.
    Returns ``None``.
    """
    frame, fingerprint_col, pickle_path = dataset
    n_rows = len(frame)
    with tqdm(total=n_rows, desc=f"Processing {fingerprint_col}") as bar:
        started = time.time()
        diversity_picking_MaxMin(frame, fingerprint_col, pickle_path)
        seconds = time.time() - started
        bar.set_postfix({"Time": f"{seconds:.2f} s"})
        bar.update(n_rows)

def process_dataset_protein(dataset):
    """Run ``diversity_picking_MaxMin_protein`` on one work item and report its duration.

    ``dataset`` is a ``(DataFrame, fingerprint column, output path)`` tuple.
    The progress bar is advanced once, by the full row count, after picking
    has finished; the elapsed wall-clock time is attached as its postfix.
    Returns ``None``.
    """
    frame, fingerprint_col, pickle_path = dataset
    n_rows = len(frame)
    with tqdm(total=n_rows, desc=f"Processing {fingerprint_col}") as bar:
        started = time.time()
        diversity_picking_MaxMin_protein(frame, fingerprint_col, pickle_path)
        seconds = time.time() - started
        bar.set_postfix({"Time": f"{seconds:.2f} s"})
        bar.update(n_rows)

In [12]:
# Run the per-library RNA picks in parallel, one work item per dataset.
print('Processing RNA datasets')
# Never start more workers than there are work items (idle workers are just
# extra processes); keep at least one because Pool rejects a size of zero.
with Pool(processes=max(1, min(num_processors, len(datasets)))) as pool:
    # process_dataset returns None, so only the pickles written by the
    # workers carry results; nothing large is sent back to the parent.
    pool.map(process_dataset, datasets)
    print("Processing RNA datasets done")

Processing RNA datasets


Processing RNA datasets done


In [13]:
# Run the protein-library pick in a worker process.
print("Processing protein datasets")

# Never start more workers than there are work items (idle workers are just
# extra processes); keep at least one because Pool rejects a size of zero.
with Pool(processes=max(1, min(num_processors, len(protein)))) as pool:
    # process_dataset_protein returns None; the result is the pickle it writes.
    pool.map(process_dataset_protein, protein)
    print("Processing protein datasets done")

Processing protein datasets


Processing protein datasets done


In [14]:
# Merge the four RNA libraries into one table. ignore_index=True already gives
# the result a fresh 0..n-1 index, so no separate reset_index call is needed.
rna_df = pd.concat([chemdiv_rna, enamine_rna, robin_rna, life_chemicals_rna], ignore_index=True)


In [15]:
# Show (rows, columns) of the merged RNA table as a quick sanity check.
rna_df.shape

(38710, 4)

In [16]:
# Work items for the merged RNA run; same tuple layout as the per-library lists.
rna_datasets = [
    (rna_df, 'ECFP6', '../data/diversity_picking/RNA/ECFP6_v2/rna_ECFP6_MaxMin_v3.pkl'),  # each entry is (df, colname, filename)

]


def diversity_picking_MaxMin_rna_merged(df, colname,  filename):
    """Reorder ``df`` by MaxMin pick order on ``colname`` and pickle the result.

    The pick count equals the pool size, so every row of ``df`` is returned.
    All columns are kept under their original names and rows keep their
    original index labels. The picker seed is fixed at 42. The reordered
    frame is written to ``filename`` and also returned.
    """
    pool_size = len(df)
    picker = MaxMinPicker()
    # Arguments after the fingerprints: pool size, then number of rows to pick.
    pick_order = picker.LazyBitVectorPick(df[colname].values, pool_size, pool_size, seed=42)
    ordered_df = df.iloc[pick_order]
    ordered_df.to_pickle(filename)
    return ordered_df


def process_rna_dataset(dataset):
    """Run ``diversity_picking_MaxMin_rna_merged`` on one work item and report its duration.

    ``dataset`` is a ``(DataFrame, fingerprint column, output path)`` tuple.
    The progress bar is advanced once, by the full row count, after picking
    has finished; the elapsed wall-clock time is attached as its postfix.
    Returns ``None``.
    """
    frame, fingerprint_col, pickle_path = dataset
    n_rows = len(frame)
    with tqdm(total=n_rows, desc=f"Processing {fingerprint_col} (RNA)") as bar:
        started = time.time()
        diversity_picking_MaxMin_rna_merged(frame, fingerprint_col, pickle_path)
        seconds = time.time() - started
        bar.set_postfix({"Time": f"{seconds:.2f} s"})
        bar.update(n_rows)


# Run the merged-RNA pick in a worker process.
print("Processing RNA merged datasets")
# Never start more workers than there are work items (idle workers are just
# extra processes); keep at least one because Pool rejects a size of zero.
with Pool(processes=max(1, min(num_processors, len(rna_datasets)))) as pool:
    # process_rna_dataset returns None; the result is the pickle it writes.
    pool.map(process_rna_dataset, rna_datasets)
    print("Processing RNA merged datasets done")
    

Processing RNA merged datasets


Processing RNA merged datasets done
