# Subset selection for analysis to determine the optimal subset size:

Originally, we generated a virtual library of 1 million compounds. However, for some generators, this process is very time-consuming. Therefore, we decided to explore different subset sizes—62 500, 125 000, 250 000, and 500 000—to identify the optimal size.

In [6]:
import pandas as pd
from itertools import combinations
import os

# Molpher generator:

For the Molpher generator, random selection has to operate on entire paths rather than on individual molecules, because Molpher creates a path between the starting molecule and the target molecule.

In [4]:
# Subsample the Molpher output path by path. A path is identified by its
# (id_input, id_output) pair and is always kept or dropped as a whole.


def random_selection(df, target, random_state=None):
    """
    Randomly pick rows of ``df`` (without replacement) until the sum of their
    'count' values reaches ``target``.

    Rows are drawn in a uniformly random order and taken one after another;
    the draw stops at the first row that brings the running sum to ``target``
    or above, so the returned sum usually overshoots ``target`` slightly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'count' column.
    target : int or float
        Value the summed 'count' has to reach.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for a reproducible draw.

    Returns
    -------
    (pandas.DataFrame, number)
        The selected rows (index reset to 0..n-1) and the sum of their
        'count' values. If all rows together stay below ``target`` every row
        is returned and the sum is smaller than ``target``; if ``target`` is
        not positive or ``df`` is empty, no row is returned and the sum is 0.
    """
    if target <= 0 or df.empty:
        return df.iloc[0:0].reset_index(drop=True), 0

    # One shuffle followed by a prefix cut yields the same distribution as
    # drawing single rows and rejecting repeats, but it cannot loop forever
    # once the pool is exhausted.
    shuffled = df.sample(frac=1, random_state=random_state)
    reached = (shuffled['count'].cumsum() >= target).to_numpy()
    n_rows = int(reached.argmax()) + 1 if reached.any() else len(shuffled)

    result_df = shuffled.iloc[:n_rows].reset_index(drop=True)
    return result_df, result_df['count'].sum()


def filter_selected_paths(df, selected):
    """
    Return the rows of ``df`` whose (id_input, id_output) pair occurs in
    ``selected``, in their original order and with a fresh 0..n-1 index.
    """
    key_columns = ['id_input', 'id_output']
    row_keys = pd.MultiIndex.from_frame(df[key_columns])
    wanted_keys = pd.MultiIndex.from_frame(selected[key_columns])
    return df[row_keys.isin(wanted_keys)].reset_index(drop=True)


# `__name__` is "__main__" inside a notebook kernel and when run as a script,
# so the loop below still executes there; the guard only keeps the helpers
# importable without touching the data files.
if __name__ == "__main__":
    receptor = 'Glucocorticoid_receptor'
    for type_split in ['dis', 'sim']:
        for number in [0, 1, 2, 3, 4]:

            # Loading and grouping do not depend on the target size, so they
            # are done once per input file.
            df = pd.read_csv(f"data/output_sets/{receptor}/Molpher/cOS_Molpher_{type_split}_{number}_all_columns.csv", header=None)
            df.columns = ['id_input', 'id_output', 'smiles_input', 'smiles_output', 'molpher', 'TC', 'count', 'time', 'scaf_input', 'scaf_output']

            # One row per path, keeping the first 'count' and 'molpher' value.
            # NOTE(review): 'count' is presumably the number of morphs on the
            # path - confirm; the check further down reports when it is not.
            grouped = df.groupby(['id_input', 'id_output']).agg({
                'count': 'first',
                'molpher': 'first'
            }).reset_index()

            for target in [300000]:

                # Pick random paths until their summed 'count' reaches the target
                result, total_sum = random_selection(grouped, target)

                print(f"Selected combination ({len(result)} rows):")
                print(f"Total count sum: {total_sum}")
                if total_sum < target:
                    print(f"WARNING: all paths together give {total_sum}, below the target {target}")

                # Keep every row of the original table that lies on a selected path
                filtered_df = filter_selected_paths(df, result)

                print("\nFiltering the original DataFrame based on id_input and id_output:")

                # Only the 'molpher' column is written out
                morphs = filtered_df['molpher']
                print(f"Number of selected morphs: {len(morphs)}")
                if len(morphs) != total_sum:
                    print(f"WARNING: {len(morphs)} morphs selected but the 'count' sum is {total_sum}; "
                          "the 'count' column does not match the rows per path")

                # Human-readable size label used in folder and file names
                target_str = {62500: '62.5k', 125000: '125k', 250000: '250k',
                              300000: '300k', 500000: '500k'}.get(target, str(target))

                # Save the selected 'molpher' values to a CSV file
                folder = f"data/output_sets/{receptor}/Molpher_{target_str}"
                os.makedirs(folder, exist_ok=True)
                output_path = f"{folder}/cOS_Molpher_{target_str}_{type_split}_{number}_one_column.csv"
                morphs.to_csv(output_path, index=False, header=False)
                print(f"Output saved to {output_path}\n")

Selected combination (2729 rows):
Total count sum: 300163

Filtering the original DataFrame based on id_input and id_output:
Number of selected morphs: 300163
Output saved to data/output_sets/Glucocorticoid_receptor/Molpher_300k/cOS_Molpher_300k_dis_0_one_column.csv

Selected combination (2668 rows):
Total count sum: 300137

Filtering the original DataFrame based on id_input and id_output:
Number of selected morphs: 336993
Output saved to data/output_sets/Glucocorticoid_receptor/Molpher_300k/cOS_Molpher_300k_dis_1_one_column.csv

Selected combination (2820 rows):
Total count sum: 300138

Filtering the original DataFrame based on id_input and id_output:
Number of selected morphs: 300138
Output saved to data/output_sets/Glucocorticoid_receptor/Molpher_300k/cOS_Molpher_300k_dis_2_one_column.csv

Selected combination (2546 rows):
Total count sum: 300095

Filtering the original DataFrame based on id_input and id_output:
Number of selected morphs: 300095
Output saved to data/output_sets/Gluc

# Other generators:
Random selection for the other generators.

In [8]:
# Draw fixed-size random subsets from the one-column SMILES files of the
# selected generators and store each subset in its own folder.


def sample_smiles(smiles, target, random_state=42):
    """
    Return ``target`` randomly chosen entries of ``smiles`` (without
    replacement), re-indexed from 0.

    Parameters
    ----------
    smiles : pandas.Series
        Pool to draw from.
    target : int
        Number of entries to draw.
    random_state : optional
        Seed forwarded to ``Series.sample``; the default 42 keeps the draw
        reproducible between runs.

    Raises
    ------
    ValueError
        If ``target`` exceeds the number of available entries.
    """
    if target > len(smiles):
        raise ValueError(
            f"Cannot sample {target} entries from a pool of only {len(smiles)}"
        )
    return smiles.sample(n=target, random_state=random_state).reset_index(drop=True)


# `__name__` is "__main__" inside a notebook kernel and when run as a script,
# so the loop below still executes there; the guard only keeps the helper
# importable without touching the data files.
if __name__ == "__main__":
    receptor = 'Leukocyte_elastase'
    # Un-comment the generators that should be processed
    for generator in [
                #"REINVENT",
                #"DrugEx_GT_epsilon_0.1",
                #"DrugEx_GT_epsilon_0.6",
                #"DrugEx_RNN_epsilon_0.1",
                #"DrugEx_RNN_epsilon_0.6",
                #"GB_GA_mut_r_0.01",
                "GB_GA_mut_r_0.5",
                #"addcarbon"
    ]:
        # Iterate through the splitting methods
        for type_split in ['dis', 'sim']:
            # Iterate through the dataset numbers
            for number in [0, 1, 2, 3, 4]:

                # The input file is the same for every target size, so it is
                # read once here instead of once per target.
                df = pd.read_csv(
                    f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_split}_{number}_one_column.csv",
                    header=None
                )
                df.columns = ['smiles']  # Assign column name

                # Iterate through the target sizes
                for target in [62500, 125000, 250000, 500000]:

                    # Randomly sample the specified number of molecules
                    random_smiles = sample_smiles(df['smiles'], target)

                    # Human-readable size label used in folder and file names
                    target_str = {62500: '62.5k', 125000: '125k', 250000: '250k',
                                  300000: '300k', 500000: '500k'}.get(target, str(target))

                    # Save the sampled molecules to a new CSV file
                    folder = f"data/output_sets/{receptor}/{generator}_{target_str}"
                    os.makedirs(folder, exist_ok=True)
                    random_smiles.to_csv(
                        f"{folder}/cOS_{generator}_{target_str}_{type_split}_{number}_one_column.csv",
                        index=False, header=False
                    )
