In [None]:
## Imports

import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import pdist, squareform
import networkx as nx
from sklearn.model_selection import GroupKFold

In [None]:
# Reproducibility / cross-validation settings
seed = 42
n_splits = 5

# Minimum pairwise similarity for two molecules to be linked into one cluster
similarity_threshold = 0.95

# Endpoint name -> prefix of that endpoint's dataset file names
endpoints = dict(half_life='logHL', clearance='logCL')

# Endpoint name -> source that is split separately in the scaling approach
divergent_sources = dict(half_life='Fan', clearance='Astrazeneca')

# Descriptor sets; one dataset file is read per endpoint and descriptor set
features = ['ecfp4', 'rdkit_ecfp4']

# Precomputing molecule clusters

In [None]:
def get_molecule_clusters(similarity_matrix, threshold=0.95):
    """Return the connected components of the thresholded similarity graph.

    Two molecules are linked by an edge when their pairwise similarity is
    ``>= threshold``. A cluster is a connected component of that graph, so two
    members only need to be joined by a chain of similar neighbours; they do
    not have to be directly similar to each other.

    Parameters
    ----------
    similarity_matrix : array-like of shape (N, N)
        Pairwise similarities. Only the strict upper triangle (i < j) is read.
    threshold : float, default 0.95
        Minimum similarity required to link two molecules.

    Returns
    -------
    list of set of int
        Row indices of each cluster, sorted from largest to smallest cluster.
        Molecules that have no neighbour at or above the threshold are never
        added to the graph and therefore do not appear in the result; callers
        must treat missing indices as singletons.

    Raises
    ------
    ValueError
        If a non-empty ``similarity_matrix`` is not a square 2-D array.
    """
    similarity = np.asarray(similarity_matrix)

    # Nothing to cluster
    if similarity.size == 0:
        return []

    if similarity.ndim != 2 or similarity.shape[0] != similarity.shape[1]:
        raise ValueError(
            f'similarity_matrix must be a square 2-D array, got shape {similarity.shape}'
        )

    # Vectorised search for pairs (i < j) at or above the threshold. np.nonzero
    # yields the pairs in row-major order, i.e. the same order a nested
    # "for i / for j" loop would visit them, so node insertion order (and hence
    # the order of equally sized clusters) is unchanged.
    linked = np.triu(similarity >= threshold, k=1)
    rows, cols = np.nonzero(linked)

    # Build graph; .tolist() keeps node labels as plain Python ints
    G = nx.Graph()
    G.add_edges_from(zip(rows.tolist(), cols.tolist()))

    # Connected components, largest first (the sort is stable for ties)
    clusters = sorted(nx.connected_components(G), key=len, reverse=True)

    return clusters

In [None]:
# Build, for every endpoint, a lookup from InChIKey to similarity-cluster id
mol2cluster = {}
ecfp4_columns = [f'ECFP4_{bit}' for bit in range(1, 1025)]

for endpoint, file_prefix in endpoints.items():
    print(f'\n\n ### {endpoint} ###\n')

    # Load the ECFP4 descriptor table of this endpoint
    ecfp4_path = os.path.join(os.getcwd(), '..', 'data', endpoint, f'{file_prefix}_ecfp4_dataset.tsv')
    df = pd.read_csv(ecfp4_path, sep='\t')
    ikeys = df['inchikey'].to_numpy()
    fps = np.array(df[ecfp4_columns])

    # Pairwise similarity = 1 - Jaccard distance between fingerprint rows
    # NOTE(review): presumably the ECFP4_* columns hold 0/1 bits -- confirm,
    # since the Jaccard metric is meant for boolean vectors.
    print('Calculating similarity matrix...')
    distance_matrix = squareform(pdist(fps, 'jaccard'))
    similarity_matrix = 1 - distance_matrix

    # Connected components of the thresholded similarity graph
    print('Getting molecule clusters...')
    clusters = get_molecule_clusters(similarity_matrix, similarity_threshold)

    # The position of a cluster in the (size-sorted) list is its id.
    # Molecules absent from every cluster get no entry in the lookup.
    mol2cluster[endpoint] = {
        ikeys[member_ix]: cluster_id
        for cluster_id, members_ixs in enumerate(clusters)
        for member_ix in members_ixs
    }

print('Done!')

# Data Splitting

In [None]:
# Cluster-aware cross-validation: one test-fold file per endpoint, descriptor set,
# source and fold
for endpoint in endpoints:
    endpoint_dir = os.path.join(os.getcwd(), '..', 'data', endpoint)
    endpoint_clusters = mol2cluster[endpoint]

    for feature in features:
        print(f'\n\n ### {endpoint} --> {feature} ### \n')

        # Load the endpoint/descriptor table and shuffle its rows reproducibly
        # NOTE(review): GroupKFold assigns whole groups to folds -- confirm the
        # shuffle changes fold membership and not only the row order on disk.
        dataset_path = os.path.join(endpoint_dir, f'{endpoints[endpoint]}_{feature}_dataset.tsv')
        df = pd.read_csv(dataset_path, sep='\t')
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

        # Output directory for this endpoint/descriptor pair
        cv_sets_directory = os.path.join(endpoint_dir, 'cv_sets', feature)
        os.makedirs(cv_sets_directory, exist_ok=True)

        # Single sources are the 'ref' values without a comma; a molecule counts
        # for a source whenever the source name occurs in its 'ref' string
        # NOTE(review): substring matching would also hit longer source names
        # that contain this one -- confirm source names cannot overlap.
        sources_list = [ref for ref in df['ref'].unique().tolist() if ',' not in ref]
        source2nmols = {
            source: int(df['ref'].str.contains(source, regex=False).sum())
            for source in sources_list
        }

        # Group label per molecule: its cluster id when it has one, otherwise
        # the first block of its InChIKey
        cluster_list = []
        for ikey in df['inchikey']:
            if ikey in endpoint_clusters:
                cluster_list.append(str(endpoint_clusters[ikey]))
            else:
                cluster_list.append(str(ikey).split('-')[0])

        # Split so that every group ends up entirely inside one test fold
        splitter = GroupKFold(n_splits=n_splits)
        ixs = np.arange(len(df))
        fold_iterator = splitter.split(ixs, ixs, groups=cluster_list)
        for fold_number, (_, test_ixs) in enumerate(fold_iterator, start=1):

            fold = f'fold{fold_number}'
            print(f'\n{fold}\n')

            test_fold_df = df.iloc[test_ixs]

            # One file per source, plus the share of that source in this fold
            for source in sources_list:
                in_source = test_fold_df['ref'].str.contains(source, regex=False)
                source_fold = test_fold_df.loc[in_source]
                source_fold.to_csv(os.path.join(cv_sets_directory, f'{source}_{fold}.tsv'), sep='\t', index=False)

                print(source, round(len(source_fold)/source2nmols[source],2))
       
                

## Scaling approach: Split Data (cross-validation)

In [None]:
# Scaling approach: for every endpoint and descriptor set, split the data into a
# "Homogenous" subset (all sources except the divergent one) and the divergent
# source itself, then write cluster-aware test folds for each subset.
for endpoint in endpoints:
    divergent_source = divergent_sources[endpoint]
    endpoint_dir = os.path.join(os.getcwd(), '..', 'data', endpoint)

    # Iterate through features
    for feature in features:
        print(f'\n\n### {endpoint} --> {feature} ### \n')

        # Load data
        endpoint_df = pd.read_csv(os.path.join(endpoint_dir, f'{endpoints[endpoint]}_{feature}_dataset.tsv'), sep='\t')

        # Rows whose 'ref' is exactly the divergent source vs. rows that merely
        # mention it. regex=False matches the source name literally, consistent
        # with the source matching used for the per-source CV sets.
        only_divergent = endpoint_df['ref'] == divergent_source
        mentions_divergent = endpoint_df['ref'].str.contains(divergent_source, regex=False)

        # Molecules reported by the divergent source together with other sources
        shared_mols = set(endpoint_df.loc[mentions_divergent & ~only_divergent, 'inchikey'])

        # Get Homogenous and divergent sources
        homogenous_df = endpoint_df.loc[~only_divergent].copy()
        homogenous_df['ref'] = 'Homogenous'
        divergent_df = endpoint_df.loc[only_divergent].copy()

        # Remove shared molecules from both subsets
        homogenous_df = homogenous_df.loc[~homogenous_df['inchikey'].isin(shared_mols)].reset_index(drop=True)
        divergent_df = divergent_df.loc[~divergent_df['inchikey'].isin(shared_mols)].reset_index(drop=True)

        # Set output file system (shared by both subsets)
        cv_sets_scaling_directory = os.path.join(endpoint_dir, 'cv_sets_scaling', feature)
        os.makedirs(cv_sets_scaling_directory, exist_ok=True)

        # Each subset carries its output label explicitly; every row of a subset
        # has this same 'ref' value, so the file names are unchanged.
        labelled_subsets = [('Homogenous', homogenous_df), (divergent_source, divergent_df)]
        for df_label, subset_df in labelled_subsets:

            # Shuffle rows reproducibly before splitting
            df = subset_df.sample(frac=1, random_state=seed).reset_index(drop=True)

            # Group label per molecule: its cluster id when it has one,
            # otherwise the first block of its InChIKey
            cluster_list = [str(mol2cluster[endpoint][ikey]) if ikey in mol2cluster[endpoint] else str(ikey).split('-')[0] for ikey in df['inchikey']]

            # Generate splits keeping clusters together
            splitter = GroupKFold(n_splits=n_splits)
            ixs = np.arange(df.shape[0])
            for fold_ix, (_, test_ixs) in enumerate(splitter.split(ixs, ixs, groups=cluster_list)):

                # Save the test fold of this subset
                test_fold_df = df.iloc[test_ixs]
                test_fold_df.to_csv(os.path.join(cv_sets_scaling_directory, f'{df_label}_fold{fold_ix+1}.tsv'), sep='\t', index=False)
              
      