In [None]:
## Imports

import pandas as pd
import numpy as np
import os

In [None]:
import random

# Single seed shared by every stochastic step so the generated splits are reproducible.
seed = 42
random.seed(seed)
# The shuffling/sampling in this notebook is done by pandas, which draws from NumPy
# rather than from the stdlib generator seeded above. Seed NumPy's global generator
# as well so that any call made without an explicit `random_state` stays deterministic.
np.random.seed(seed)

# Data Splitting

In [None]:
# Endpoint name (also the data sub-directory) -> prefix of the dataset file name.
endpoints = dict(
    half_life='logHL',
    clearance='logCL',
)
# Feature-set names: part of the dataset file name and the output sub-directory.
features = [
    'ecfp4',
    'rdkit_ecfp4',
]

In [None]:
# Build 5 cross-validation folds per endpoint/feature so that every source
# contributes roughly 20% of its molecules to each fold, then write one TSV per
# (source, fold) under ../data/<endpoint>/cv_sets/<feature>/.
for endpoint in endpoints:
    for feature in features:
        print(f'\n\n ### {endpoint} --> {feature} ### \n')

        # Load data
        endpoint_df = pd.read_csv(os.path.join(os.getcwd(), '..', 'data', endpoint, f'{endpoints[endpoint]}_{feature}_dataset.tsv'), sep='\t')

        # Individual sources are the 'ref' values without a comma. A row belongs to a
        # source when its 'ref' contains the source name, so rows whose 'ref' lists
        # several sources are counted for each of them.
        # NOTE(review): this is plain substring matching; a source whose name is a
        # substring of another source's name would also match the latter -- confirm
        # this cannot happen with the actual 'ref' values.
        sources_list = [item for item in endpoint_df['ref'].unique().tolist() if ',' not in item]
        # Compute each source's rows once; they are reused for counting, fold
        # construction and export.
        source_frames = {
            source: endpoint_df.loc[endpoint_df['ref'].str.contains(source, regex=False)]
            for source in sources_list
        }
        source_counts = {source: len(frame) for source, frame in source_frames.items()}
        print(f'Molecule counts per source: {source_counts}')

        # Sort sources by number of molecules (smallest to largest)
        sources_list_sorted = sorted(source_counts, key=source_counts.get)
        print(f'Sorted sources: {sources_list_sorted}')

        # Get the set of molecules (inchikeys) for each fold
        folds_mol_set = {}
        for source_rank, source in enumerate(sources_list_sorted):
            source_data = source_frames[source]
            if source_rank == 0:  # smallest source: seeds the five folds
                # Shuffle the data
                source_data_shuffled = source_data.sample(frac=1, random_state=seed)
                # Split the shuffled inchikeys into 5 near-equal chunks. Splitting the
                # key array gives the same partition as splitting the frame and avoids
                # passing a DataFrame to np.array_split, which newer pandas warns about.
                inchikey_chunks = np.array_split(source_data_shuffled['inchikey'].to_numpy(), 5)
                for fold_number, chunk in enumerate(inchikey_chunks, start=1):
                    folds_mol_set[f'fold{fold_number}'] = set(chunk.tolist())
            else:  # other sources: top every fold up to ~20% of this source
                source_inchikeys = set(source_data['inchikey'].tolist())
                # Molecules already placed in some fold must not be assigned again
                excluded_mols = set().union(*folds_mol_set.values())
                source_data_filtered = source_data.loc[~source_data['inchikey'].isin(excluded_mols)]
                target_size = round(len(source_data) * 0.2)
                # Only values of existing keys are replaced below, so iterating over
                # the dict while updating it is safe.
                for fold, inchikey_set in folds_mol_set.items():
                    # Source molecules already included in the given fold
                    previous_mols = inchikey_set & source_inchikeys
                    # Add source molecules up to the corresponding proportion (20%)
                    n_mols_to_add = target_size - len(previous_mols)
                    if n_mols_to_add > 0:
                        if fold == 'fold5':
                            # The last fold takes everything that is still unassigned
                            n_mols_to_add = len(source_data_filtered)
                        else:
                            # Rounding can ask for more molecules than remain; clamp so
                            # that sample() does not raise ValueError.
                            n_mols_to_add = min(n_mols_to_add, len(source_data_filtered))
                        mols_to_add = source_data_filtered.sample(n=n_mols_to_add, random_state=seed)
                        added_inchikeys = set(mols_to_add['inchikey'].tolist())
                        # Remove added molecules from the remaining source data
                        source_data_filtered = source_data_filtered.loc[~source_data_filtered['inchikey'].isin(added_inchikeys)]
                        # Update fold inchikey set
                        folds_mol_set[fold] = inchikey_set | added_inchikeys
                # NOTE(review): if fold5 already holds its share of this source, any
                # molecules still left in source_data_filtered end up in no fold --
                # confirm this is intended.

        # Save each data source fold data in a TSV file
        cv_sets_directory = os.path.join(os.getcwd(), '..', 'data', endpoint, 'cv_sets')
        feature_directory = os.path.join(cv_sets_directory, feature)
        # Creates cv_sets/ and cv_sets/<feature>/ as needed; no error if they exist
        os.makedirs(feature_directory, exist_ok=True)
        for fold, inchikey_set in folds_mol_set.items():
            print(f'\n{fold}\n')
            for source, source_data in source_frames.items():
                source_fold_set = inchikey_set.intersection(source_data['inchikey'].tolist())
                source_fold = source_data.loc[source_data['inchikey'].isin(list(source_fold_set))]
                # Percentage of the source's rows represented (by unique inchikey) in this fold
                prop = (len(source_fold_set) / source_counts[source]) * 100
                print(f'{source}: {prop}')

                source_fold.to_csv(os.path.join(feature_directory, f'{source}_{fold}.tsv'), sep='\t', index=False)

## Scaling approach: Split Data (cross-validation)

In [None]:
# Endpoint name (also the data sub-directory) -> [dataset file-name prefix,
# 'ref' value of the source handled separately as "divergent" in the split below].
# NOTE: this rebinds `endpoints` from the earlier data-splitting section with a
# different value shape (list instead of str), so that earlier split cell must not
# be re-run after this one without re-running its own configuration cell first.
endpoints = dict(
    half_life=['logHL', 'Fan'],
    clearance=['logCL', 'Astrazeneca'],
)
# Feature-set names: part of the dataset file name and the output sub-directory.
features = [
    'ecfp4',
    'rdkit_ecfp4',
]

In [None]:
# For every endpoint/feature pair: separate the "divergent" source from the
# remaining ("homogenous") data, shuffle each part, cut it into 5 folds and write
# one TSV per fold under ../data/<endpoint>/cv_sets_scaling/<feature>/.
for endpoint in endpoints:
    # 'ref' value identifying the divergent source for this endpoint
    divergent_source = endpoints[endpoint][1]
    for feature in features:
        print(f'\n\n ### {endpoint} --> {feature} ### \n')

        # Load data
        endpoint_df = pd.read_csv(os.path.join(os.getcwd(), '..', 'data', endpoint, f'{endpoints[endpoint][0]}_{feature}_dataset.tsv'), sep='\t')

        # Exclude molecules shared between divergent and homogenous sources:
        # rows whose 'ref' mentions the divergent source but is not exactly that source.
        # regex=False gives literal substring matching, as in the first split cell.
        shared_mols = endpoint_df.loc[endpoint_df['ref'].str.contains(divergent_source, regex=False)]
        shared_mols = shared_mols.loc[shared_mols['ref'] != divergent_source]
        endpoint_df_filtered = endpoint_df.loc[~endpoint_df['inchikey'].isin(shared_mols['inchikey'].tolist())]
        print(f'Original dataset: {endpoint_df.shape}')
        print(f'Filtered dataset: {endpoint_df_filtered.shape}')

        # Select divergent data
        divergent_df = endpoint_df_filtered.loc[endpoint_df_filtered['ref'] == divergent_source]
        print(f'Divergent dataset: {divergent_df.shape}')
        # Select homogenous data and relabel every row with a common 'ref'.
        # .assign returns a new frame, so nothing is written into a slice of
        # endpoint_df_filtered (the pattern behind SettingWithCopyWarning).
        homogenous_df = endpoint_df_filtered.loc[endpoint_df_filtered['ref'] != divergent_source].assign(ref='Homogenous')
        print(f'Homogenous dataset: {homogenous_df.shape}')

        # Output location; creates cv_sets_scaling/ and cv_sets_scaling/<feature>/ as
        # needed and does not fail if they already exist.
        cv_sets_scaling_directory = os.path.join(os.getcwd(), '..', 'data', endpoint, 'cv_sets_scaling')
        output_directory = os.path.join(cv_sets_scaling_directory, feature)
        os.makedirs(output_directory, exist_ok=True)

        # Labels are passed explicitly instead of being read back from the 'ref'
        # column, so an empty frame or an empty fold no longer raises IndexError;
        # such a fold is written as a header-only file.
        for label, df in [(divergent_source, divergent_df), ('Homogenous', homogenous_df)]:
            print(f'\n{label} data')
            # Shuffle data
            df_shuffled = df.sample(frac=1, random_state=seed)
            # Split into 5 folds by row position. This is the same partition that
            # np.array_split yields on the frame itself, without passing a DataFrame
            # to NumPy (which newer pandas versions warn about).
            fold_positions = np.array_split(np.arange(len(df_shuffled)), 5)
            # Save each fold data in a TSV file
            for i, positions in enumerate(fold_positions):
                fold = df_shuffled.iloc[positions]
                print(f'Fold {i+1}: {len(fold)}')
                fold.to_csv(os.path.join(output_directory, f'{label}_fold{i+1}.tsv'), sep='\t', index=False)