In [9]:
# Import libraries

import pandas as pd
import numpy as np

from IPython.display import display

import pickle
import itertools

from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import MolFromSmiles

import random

from src.ligand_clustering_functions import bemis_murcko_clustering, butina_clustering, compound_k_means_clustering, get_decoys, compute_tanimoto

import os

In [2]:
def cluster_ligands(compound_list,fps,scaffolds,bm_clustering=True,butina_threshold=0.4,k_means_representatives=100):
    """
    Apply sequential clustering steps to a compound dataset in order to reduce
    redundancy and maximize structural diversity.

    The compound collection is passed, in order, through up to three helpers:
    ``bemis_murcko_clustering``, ``butina_clustering`` and
    ``compound_k_means_clustering``. Each step receives the output of the
    previous one.

    Parameters
    ----------
    compound_list : sequence
        Compounds to reduce. Must support ``len()``; presumably a list or
        pandas Series of ligand identifiers (verify against the helpers).
    fps : dict or list
        Fingerprints, forwarded to the Butina and k-means helpers.
    scaffolds : dict or list
        Scaffold assignments, forwarded to the Bemis-Murcko helper.
    bm_clustering : bool, optional
        Run the Bemis-Murcko step when truthy (default: True).
    butina_threshold : float, optional
        Threshold forwarded to the Butina step. The step is skipped when the
        value is >= 1 (default: 0.4).
    k_means_representatives : int or None, optional
        Maximum number of compounds to keep. When the collection is larger,
        k-means is run with this many clusters. A falsy value (None or 0)
        disables the step (default: 100).

    Returns
    -------
    The reduced compound collection, as returned by the last step applied
    (or ``compound_list`` unchanged when every step is skipped).
    """
    if bm_clustering:
        # Apply Bemis-Murcko clustering
        compound_list = bemis_murcko_clustering(compound_list,scaffolds)
    if butina_threshold < 1:
        # Apply Butina clustering. Set threshold == 1 to disable clustering.
        compound_list = butina_clustering(compound_list,fps,threshold=butina_threshold)
    if k_means_representatives and len(compound_list) > k_means_representatives:
        # Final k-means step: cap the collection at the requested number of
        # representatives (previously hard-coded to 100 regardless of the argument).
        compound_list = compound_k_means_clustering(
            compound_list,fps,n_clusters=k_means_representatives
        )
    return compound_list

def _build_pairs_frame(pairs, fps, label):
    """Build a DataFrame of ligand pairs with their Tanimoto value and a constant label `y`."""
    pairs = list(pairs)
    frame = pd.DataFrame(pairs, columns=['l1', 'l2'])
    frame['Tanimoto'] = compute_tanimoto(pairs, fps)
    frame['y'] = label
    return frame


def generate_compound_pairs(
    prot_list, 
    base_dataset, 
    fps, 
    scaffolds, 
    decoys, 
    n_decoys_per_lig=25, 
    decoys_proportion=2, 
    min_positives=25, 
    k_means_representatives=100, 
    butina_threshold=0.4,
    tanimoto_threshold=0.4,
    random_seed=10
):
    """
    Generate compound pairs (S and N) for a list of proteins, using clustering and decoys.

    For every protein, the actives are clustered; if enough remain, the
    inactives are clustered too and extended with decoys. S pairs are all
    unordered pairs of actives (y = 1); N pairs are all (active, negative)
    combinations (y = 0). Pairs whose Tanimoto value is not strictly below
    ``tanimoto_threshold`` are discarded.

    Parameters
    ----------
    prot_list : list
        List of protein identifiers to process.
    base_dataset : pd.DataFrame
        Dataset with columns: 'lig', 'prot', 'activity', etc.
    fps : dict or list
        Fingerprints for the ligands.
    scaffolds : dict or list
        Scaffold assignments for the ligands.
    decoys : dict or list
        Decoys assigned for each active ligand.
    n_decoys_per_lig : int, optional
        Number of decoys to find per active ligand (default: 25).
    decoys_proportion : float, optional
        Proportion of decoys to sample relative to the number of actives (default: 2).
        If fewer decoys are available than requested, all of them are used.
    min_positives : int, optional
        A protein is processed only when it has strictly more than this many
        active ligands after clustering (default: 25).
    k_means_representatives : int, optional
        Number of representatives for K-means clustering (default: 100).
    butina_threshold : float, optional
        Similarity threshold for Butina clustering (default: 0.4).
    tanimoto_threshold: float, optional
        Maximum Tanimoto similarity of compound pairs to train the models (default:0.4)
    random_seed : int, optional
        Seed for random search (default: 10). Note that this reseeds the
        global ``random`` module.

    Returns
    -------
    compound_pairs : pd.DataFrame
        DataFrame with columns ['prot', 'l1', 'l2', 'Tanimoto', 'y'] containing all N and S pairs.
        Empty (with those columns) when no protein qualifies.
    """
    random.seed(random_seed)
    columns = ['prot', 'l1', 'l2', 'Tanimoto', 'y']
    per_protein_frames = []

    for prot in prot_list:
        is_prot = base_dataset['prot'] == prot

        # Select positive ligands (actives) for the current protein
        l_pos = base_dataset[(base_dataset['activity'] == 1) & is_prot]['lig']
        l_pos = cluster_ligands(
            l_pos, fps, scaffolds,
            bm_clustering=True,
            butina_threshold=butina_threshold,
            k_means_representatives=k_means_representatives
        )

        # Skip proteins that do not have more than min_positives actives
        if len(l_pos) <= min_positives:
            continue

        # Select negative ligands (true inactives) for the current protein
        l_neg = base_dataset[(base_dataset['activity'] == 0) & is_prot]['lig']
        l_neg = cluster_ligands(
            l_neg, fps, scaffolds,
            bm_clustering=True,
            butina_threshold=butina_threshold,
            k_means_representatives=k_means_representatives
        )

        # Add decoys to the set of negatives
        decoys_l_neg = get_decoys(l_pos, decoys, scaffolds, n_decoys_per_lig=n_decoys_per_lig)
        if len(decoys_l_neg) > len(l_pos):
            # Sample proportionally to the number of actives, but never ask
            # random.sample for more items than the decoy pool holds (that
            # raises ValueError).
            n_sampled = min(int(decoys_proportion * len(l_pos)), len(decoys_l_neg))
            extra_negatives = random.sample(decoys_l_neg, n_sampled)
        else:
            # Otherwise, use all available decoys
            extra_negatives = decoys_l_neg
        # Build a plain list so the extension is a concatenation whatever
        # container type the clustering helper returned.
        l_neg = list(l_neg) + list(extra_negatives)

        # S pairs: all possible pairs of actives (marked as similar)
        s_pairs_df = _build_pairs_frame(itertools.combinations(l_pos, 2), fps, 1)
        # N pairs: every active against every inactive or decoy (marked as non-similar)
        n_pairs_df = _build_pairs_frame(itertools.product(l_pos, l_neg), fps, 0)

        # Combine positive and negative pairs
        total_pairs_prot = pd.concat([s_pairs_df, n_pairs_df], axis=0)
        total_pairs_prot['prot'] = prot

        # Filter out pairs with Tanimoto >= the specified threshold
        total_pairs_prot = total_pairs_prot[total_pairs_prot['Tanimoto'] < tanimoto_threshold]

        per_protein_frames.append(total_pairs_prot[columns])

    if not per_protein_frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end instead of growing a DataFrame inside the loop
    return pd.concat(per_protein_frames, axis=0)

def shuffle_and_save_chunks(
    df, 
    output_folder, 
    num_chunks=30, 
    prefix="chunk", 
    random_state=42
):
    """
    Shuffle the rows of a DataFrame and save it in evenly-sized chunks as CSV files.

    Exactly ``num_chunks`` files named ``{prefix}_1.csv`` ... ``{prefix}_{num_chunks}.csv``
    are written. Chunk sizes differ by at most one row: when the number of
    rows is not divisible by ``num_chunks``, the first chunks receive one
    extra row each. If there are fewer rows than chunks, the trailing files
    contain only the header.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to shuffle and split.
    output_folder : str
        Path to the directory where the CSV files will be saved.
        Created if it does not exist.
    num_chunks : int, optional
        Number of output chunks/files (default: 30). Must be >= 1.
    prefix : str, optional
        Prefix for output file names (default: "chunk").
    random_state : int, optional
        Seed for reproducible shuffling (default: 42).

    Raises
    ------
    ValueError
        If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")

    # Shuffle the DataFrame rows; the original index is discarded
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Every chunk gets base_size rows; the first `remainder` chunks get one more,
    # so no single file absorbs the whole remainder.
    base_size, remainder = divmod(len(df_shuffled), num_chunks)

    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Split and save each chunk
    start = 0
    for i in range(num_chunks):
        end = start + base_size + (1 if i < remainder else 0)

        # Save the corresponding slice as CSV (without the index column)
        df_shuffled.iloc[start:end].to_csv(
            os.path.join(output_folder, f"{prefix}_{i + 1}.csv"),
            index=False
        )
        start = end

In [3]:
# Load the input databases from the data directory
data_dir = 'data'


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle can execute arbitrary code on load; only open trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Protein-ligand base dataset
base_dataset = pd.read_csv(f'{data_dir}/prot_ligs_db.csv')
# Fingerprints dictionary
fps = _load_pickle(f'{data_dir}/comps_fps.pkl')
# Scaffolds dictionary
scaffolds = _load_pickle(f'{data_dir}/ligs_scaffolds.pkl')
# Decoys dictionary
decoys = _load_pickle(f'{data_dir}/decoys_dict.pkl')

## Overview of the databases

In [4]:
# Preview the first 20 rows of the base dataset (extracted from ChEMBL)
base_dataset.head(n=20)

Unnamed: 0,lig,prot,pchembl,comment,pfam,activity
0,CHEMBL514046,P37059,6.57,,PF00106,1.0
1,CHEMBL514046,P37059,6.6,,PF00106,1.0
2,CHEMBL79955,P11715,4.2,,PF00067,0.0
3,CHEMBL78395,P11715,4.3,,PF00067,0.0
4,CHEMBL410953,F5BCZ9,8.0,,PF02364,1.0
5,CHEMBL405670,F5BCZ9,9.0,,PF02364,1.0
6,CHEMBL437250,F5BCZ9,7.3,,PF02364,1.0
7,CHEMBL428614,F5BCZ9,7.82,,PF02364,1.0
8,CHEMBL438381,F5BCZ9,7.52,,PF02364,1.0
9,CHEMBL266048,F5BCZ9,8.0,,PF02364,1.0


In [11]:
# Per-Pfam summary: number of distinct proteins and distinct bioactive ligands

# Keep only the bioactive records (activity == 1.0)
bioactive = base_dataset.loc[base_dataset['activity'] == 1.0]

stats = (
    bioactive
    .groupby('pfam')
    # Count unique proteins and unique ligands within each Pfam family;
    # the output columns are given their final display names directly.
    .agg(**{
        'Protein Count': ('prot', 'nunique'),
        'Bioactive ligand count': ('lig', 'nunique'),
    })
    .reset_index()
    .rename(columns={'pfam': 'Pfam'})
    # Families with the most proteins come first
    .sort_values(by='Protein Count', ascending=False)
    .reset_index(drop=True)
)

# Render the statistics table
display(stats)

Unnamed: 0,Pfam,Protein Count,Bioactive ligand count
0,PF00001,487,97166
1,PF00069,379,41368
2,PF07714,153,46599
3,PF00520,95,9892
4,PF00089,81,12013
...,...,...,...
211,PF00266,1,3
212,PF00275,1,1
213,PF00756,1,1
214,PF01702,1,3


## Dataset generation

In [None]:
# Choose which proteins go into the training and test datasets.
# The default example uses the protein group of Pfam PF00413 (MPG) and holds out
# protein P08254 for testing. Change both values to suit other protein groups.

prot_test = ['P08254']

# Every distinct protein of the selected Pfam family, minus the held-out test proteins
pfam_prots = base_dataset.loc[base_dataset['pfam'] == 'PF00413', 'prot'].unique()
prot_train = [prot for prot in pfam_prots if prot not in prot_test]


In [None]:
# Build the compound pairs for the training proteins (default parameters)
os.makedirs('./train_datasets', exist_ok=True)

train_pairs = generate_compound_pairs(
    prot_list=prot_train,
    base_dataset=base_dataset,
    fps=fps,
    scaffolds=scaffolds,
    decoys=decoys,
)


In [None]:
# Build the compound pairs for the held-out test proteins (default parameters)
os.makedirs('./test_datasets', exist_ok=True)

test_pairs = generate_compound_pairs(
    prot_list=prot_test,
    base_dataset=base_dataset,
    fps=fps,
    scaffolds=scaffolds,
    decoys=decoys,
)

In [None]:
# Write the shuffled training pairs to disk as several CSV chunks.
# Use more chunks for larger datasets to keep memory usage down.

num_chunks = 10
shuffle_and_save_chunks(
    df=train_pairs,
    output_folder='./train_datasets/',
    num_chunks=num_chunks,
)


In [None]:
# Write the test pairs to a single CSV file (requires the test dataset cell to have been run)
test_pairs_path = './test_datasets/test_pairs.csv'
test_pairs.to_csv(test_pairs_path, index=False)