# Safety predictive models

We trained Gradient Boosting and XGBoost models, using the Optuna framework to carefully tune the classifier hyperparameters, for each combination of safety endpoint (999 Preferred Terms, i.e. side effects, in total) and input data modality: Cell Painting features, RDKit 1D descriptors, ECFP4 fingerprints, Mordred descriptors, and physicochemical properties.

For each descriptor type, we used the test set (20%) and different classification metrics for assessing the prediction model performance: Balanced Accuracy, Precision, Recall, F<sub>1</sub>-score, Area Under Curve-Receiver Operating Characteristic (AUC-ROC), Area Under Curve-Precision Recall (PRAUC or average precision, AP), and MCC (Matthews Correlation Coefficient).

In [None]:
# Standard Library Imports
from itertools import combinations_with_replacement

# Third-Party Imports
from scipy.stats import ks_2samp

# Local Imports
from src.utils import *

## Loading of data

In [None]:
# InChIKey of the compound that the loading cells below filter out of the non-PC datasets
missing_cpd = "FFINMCNLQNTKLU-UHFFFAOYSA-N"

In [None]:
# Build the complete dataset for each molecular representation
# Cell Painting features + PT binary matrix; the chained filter removes the
# compound that is missing from the PC dataset
cp_pt_data = create_complete_dataset(
    '1_data/CellPainting_data_feature_selection.csv',
    '5_data/PT_binary_matrix.csv',
).loc[lambda frame: frame['CPD_INCHIKEY'] != missing_cpd]
cp_pt_data.shape

In [None]:
# RDKit 1D descriptors + PT binary matrix; the chained filter removes the
# compound that is missing from the PC dataset
desc_pt_data = create_complete_dataset(
    '2_data/CPcompounds_1D_RDKit.tsv',
    '5_data/PT_binary_matrix.csv',
).loc[lambda frame: frame['CPD_INCHIKEY'] != missing_cpd]
desc_pt_data.shape

In [None]:
# ECFP4 fingerprints + PT binary matrix; the chained filter removes the
# compound that is missing from the PC dataset
ecfp4_pt_data = create_complete_dataset(
    '2_data/CPcompounds_ECFP4_1024.tsv',
    '5_data/PT_binary_matrix.csv',
).loc[lambda frame: frame['CPD_INCHIKEY'] != missing_cpd]
ecfp4_pt_data.shape

In [None]:
# Mordred descriptors + PT binary matrix; the chained filter removes the
# compound that is missing from the PC dataset
mordred_pt_data = create_complete_dataset(
    '2_data/CPcompounds_Mordred.tsv',
    '5_data/PT_binary_matrix.csv',
).loc[lambda frame: frame['CPD_INCHIKEY'] != missing_cpd]
mordred_pt_data.shape

In [None]:
# Physicochemical properties + PT binary matrix, with the 'pc_logBB' and
# 'pc_pgp' columns dropped
pc_pt_data = create_complete_dataset(
    '2_data/CPcompounds_physicochemical_properties.tsv',
    '5_data/PT_binary_matrix.csv',
).drop(columns=['pc_logBB', 'pc_pgp'])
pc_pt_data.shape

## Training and evaluating the individual predictive models

In [None]:
# Train and evaluate predictive models for each descriptor type
# Cell Painting features: all endpoints, 50 Optuna trials, every plot/verbose flag off
cp_results = se_model_training_and_evaluation(
    cp_pt_data,
    endpoint='all',
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='cp_safety_results.tsv',
)
cp_results

In [None]:
# RDKit 1D descriptors: all endpoints, 50 Optuna trials, every plot/verbose flag off
desc_results = se_model_training_and_evaluation(
    desc_pt_data,
    endpoint='all',
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='desc_safety_results.tsv',
)
desc_results

In [None]:
# ECFP4 fingerprints: all endpoints, 50 Optuna trials, every plot/verbose flag off
ecfp4_results = se_model_training_and_evaluation(
    ecfp4_pt_data,
    endpoint='all',
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='ecfp4_safety_results.tsv',
)
ecfp4_results

In [None]:
# Mordred descriptors: all endpoints, 50 Optuna trials, every plot/verbose flag off
mordred_results = se_model_training_and_evaluation(
    mordred_pt_data,
    endpoint='all',
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='mordred_safety_results.tsv',
)
mordred_results

In [None]:
# Physicochemical properties: all endpoints, 50 Optuna trials, every plot/verbose flag off
pc_results = se_model_training_and_evaluation(
    pc_pt_data,
    endpoint='all',
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='pc_safety_results.tsv',
)
pc_results

## Analysing the results

### Loading the results

In [None]:
# Re-load the per-modality result tables (tab-separated) from disk,
# in the order CP, RDKit 1D, ECFP4, Mordred, PC
individual_result_paths = (
    'data/5_data/cp_safety_results.tsv',
    'data/5_data/desc_safety_results.tsv',
    'data/5_data/ecfp4_safety_results.tsv',
    'data/5_data/mordred_safety_results.tsv',
    'data/5_data/pc_safety_results.tsv',
)
cp_results, desc_results, ecfp4_results, mordred_results, pc_results = (
    pd.read_csv(path, sep='\t') for path in individual_result_paths
)

### Medical Dictionary for Regulatory Activities (MedDRA) classification

Medical Dictionary for Regulatory Activities (MedDRA) classification is a comprehensive, standardized medical terminology used to facilitate the sharing of regulatory information about medical products. This classification system is hierarchical, from broad categories to precise terms: System Organ Class (SOC), High-Level Group Terms (HLGT), High-Level Terms (HLT), and Preferred Terms (PT). 

Subsequently, we use these MedDRA terms to perform enrichment analyses to identify over-represented medical themes within the identified side-effect sets.

In [None]:
# Run classify_pt_to_meddra on each modality's result table,
# in the order CP, RDKit 1D, ECFP4, Mordred, PC
cp_to_meddra, desc_to_meddra, ecfp4_to_meddra, mordred_to_meddra, pc_to_meddra = (
    classify_pt_to_meddra(modality_results)
    for modality_results in (cp_results, desc_results, ecfp4_results, mordred_results, pc_results)
)

### Enrichment analysis of HLTs, HLGTs, and SOCs

In [None]:
# PT codes (matched against the 'safety_endpoint' column) uniquely predicted by Cell Painting
unique_cp = [
    10007554, 10022998, 10028625, 10082214,
    10016173, 10039109, 10003693, 10057666,
    10065612, 10061117, 10038153, 10012601,
    10067584, 10001053, 10003481, 10034835,
]

# Foreground set: MedDRA classification of only the uniquely predicted PTs
unique_cp_to_meddra = classify_pt_to_meddra(
    cp_results.loc[cp_results['safety_endpoint'].isin(unique_cp)]
)

# Background set: MedDRA classification of every PT in the Cell Painting results
all_pt_to_meddra = classify_pt_to_meddra(cp_results)

# Perform the enrichment analysis of the foreground against the background.
# NOTE(review): only meddra_level='hlt' is run in this cell.
meddra_enrichment_analysis = fisher_exact_test_on_MedDRA_terms(
    unique_cp_to_meddra,
    all_pt_to_meddra,
    meddra_level='hlt',
)
meddra_enrichment_analysis

# EARLY DATA FUSION

The first strategy to combine information from different sources is early data fusion, where feature vectors from two or more modalities are simply concatenated into a single vector.

In the early fusion approach, we perform simple feature selection to reduce the dimensionality of the feature set, using the SelectKBest function to identify the 1,000 features most correlated with the label.

### Combining data modalities

In [None]:
# Combine descriptor types
# Fused input: RDKit 1D + Mordred + PC
desc_mordred_pc_pt_data = combine_data_sources(
    desc_pt_data,
    mordred_pt_data,
    pc_pt_data,
)
desc_mordred_pc_pt_data.shape

In [None]:
# Fused input: CP + ECFP4
cp_ecfp4_pt_data = combine_data_sources(
    cp_pt_data,
    ecfp4_pt_data,
)
cp_ecfp4_pt_data.shape

In [None]:
# Fused input: CP + (RDKit 1D, Mordred, PC), reusing the already-combined descriptor table
cp_desc_mordred_pc_pt_data = combine_data_sources(
    cp_pt_data,
    desc_mordred_pc_pt_data,
)
cp_desc_mordred_pc_pt_data.shape

In [None]:
# Fused input: ECFP4 + (RDKit 1D, Mordred, PC), reusing the already-combined descriptor table
ecfp4_desc_mordred_pc_pt_data = combine_data_sources(
    ecfp4_pt_data,
    desc_mordred_pc_pt_data,
)
ecfp4_desc_mordred_pc_pt_data.shape

In [None]:
# Fused input: CP + (ECFP4, RDKit 1D, Mordred, PC), reusing the already-combined table
cp_ecfp4_desc_mordred_pc_pt_data = combine_data_sources(
    cp_pt_data,
    ecfp4_desc_mordred_pc_pt_data,
)
cp_ecfp4_desc_mordred_pc_pt_data.shape

### Training and evaluating the early-fusion models

In [None]:
# Train and evaluate predictive models for each combination of descriptors
# Early fusion of RDKit 1D + Mordred + PC: all endpoints, k_features=1000,
# 50 Optuna trials, every plot/verbose flag off
desc_mordred_pc_results = se_early_fusion_model_training_and_evaluation(
    desc_mordred_pc_pt_data,
    endpoint='all',
    k_features=1000,
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='desc_mordred_pc_safety_results.tsv',
)
desc_mordred_pc_results

In [None]:
# Early fusion of CP + ECFP4: all endpoints, k_features=1000,
# 50 Optuna trials, every plot/verbose flag off
cp_ecfp4_results = se_early_fusion_model_training_and_evaluation(
    cp_ecfp4_pt_data,
    endpoint='all',
    k_features=1000,
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='cp_ecfp4_safety_results.tsv',
)
cp_ecfp4_results

In [None]:
# Early fusion of CP + RDKit 1D + Mordred + PC: all endpoints, k_features=1000,
# 50 Optuna trials, every plot/verbose flag off
cp_desc_mordred_pc_results = se_early_fusion_model_training_and_evaluation(
    cp_desc_mordred_pc_pt_data,
    endpoint='all',
    k_features=1000,
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='cp_desc_mordred_pc_safety_results.tsv',
)
cp_desc_mordred_pc_results

In [None]:
# Early fusion of ECFP4 + RDKit 1D + Mordred + PC: all endpoints, k_features=1000,
# 50 Optuna trials, every plot/verbose flag off
ecfp4_desc_mordred_pc_results = se_early_fusion_model_training_and_evaluation(
    ecfp4_desc_mordred_pc_pt_data,
    endpoint='all',
    k_features=1000,
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='ecfp4_desc_mordred_pc_safety_results.tsv',
)
ecfp4_desc_mordred_pc_results

In [None]:
# Early fusion of CP + ECFP4 + RDKit 1D + Mordred + PC: all endpoints, k_features=1000,
# 50 Optuna trials, every plot/verbose flag off.
# The results filename carries the same '_safety_results.tsv' suffix as the other
# runs, so it matches the file that the "Loading the results" cell reads back
# ('data/5_data/cp_ecfp4_desc_mordred_pc_safety_results.tsv').
cp_ecfp4_desc_mordred_pc_results = se_early_fusion_model_training_and_evaluation(
    cp_ecfp4_desc_mordred_pc_pt_data,
    endpoint='all',
    k_features=1000,
    check_endpoint_distribution=False,
    optuna_trials=50,
    verbose_optuna=False,
    plot_loss_model=False,
    plot_results=False,
    plot_feature_importance=False,
    results_filename='cp_ecfp4_desc_mordred_pc_safety_results.tsv',
)
cp_ecfp4_desc_mordred_pc_results

# LATE DATA FUSION

The second strategy to combine information from different sources is late data fusion, wherein each modality is used to train a separate model and then the prediction probabilities for a new sample are aggregated using different strategies.

In the late fusion approach, we employ six multi-modal fusion methods to aggregate probabilities from the classifiers trained on each modality separately: Average, Voting, Maximal, Weighted Average, Weighted Voting, Weighted Maximal.

In [None]:
# Perform late data fusion for each combination of descriptors
# Weighted-average late fusion of RDKit 1D + Mordred + PC over all endpoints
desc_mordred_pc_weighted_average_results = se_late_data_fusion(
    data_modalities=['desc', 'mordred', 'pc'],
    endpoint='all',
    fusion_method='weighted_average',
    save_results=True,
    results_filename='desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
)
desc_mordred_pc_weighted_average_results

In [None]:
# Weighted-average late fusion of CP + ECFP4 over all endpoints
cp_ecfp4_weighted_average_results = se_late_data_fusion(
    data_modalities=['cp', 'ecfp4'],
    endpoint='all',
    fusion_method='weighted_average',
    save_results=True,
    results_filename='cp_ecfp4_weighted_average_fusion_safety_results.tsv',
)
cp_ecfp4_weighted_average_results

In [None]:
# Weighted-average late fusion of CP + RDKit 1D + Mordred + PC over all endpoints
cp_desc_mordred_pc_weighted_average_results = se_late_data_fusion(
    data_modalities=['cp', 'desc', 'mordred', 'pc'],
    endpoint='all',
    fusion_method='weighted_average',
    save_results=True,
    results_filename='cp_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
)
cp_desc_mordred_pc_weighted_average_results

In [None]:
# Weighted-average late fusion of ECFP4 + RDKit 1D + Mordred + PC over all endpoints
ecfp4_desc_mordred_pc_weighted_average_results = se_late_data_fusion(
    data_modalities=['ecfp4', 'desc', 'mordred', 'pc'],
    endpoint='all',
    fusion_method='weighted_average',
    save_results=True,
    results_filename='ecfp4_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
)
ecfp4_desc_mordred_pc_weighted_average_results

In [None]:
# Weighted-average late fusion of CP + ECFP4 + RDKit 1D + Mordred + PC over all endpoints
cp_ecfp4_desc_mordred_pc_weighted_average_results = se_late_data_fusion(
    data_modalities=['cp', 'ecfp4', 'desc', 'mordred', 'pc'],
    endpoint='all',
    fusion_method='weighted_average',
    save_results=True,
    results_filename='cp_ecfp4_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
)
cp_ecfp4_desc_mordred_pc_weighted_average_results

## Analysing the Data Fusion results

### Loading the results

In [None]:
# Re-load the early-fusion result tables (tab-separated) from disk,
# one per descriptor combination, in the order they are unpacked below
early_fusion_result_paths = (
    'data/5_data/desc_mordred_pc_safety_results.tsv',
    'data/5_data/cp_ecfp4_safety_results.tsv',
    'data/5_data/cp_desc_mordred_pc_safety_results.tsv',
    'data/5_data/ecfp4_desc_mordred_pc_safety_results.tsv',
    'data/5_data/cp_ecfp4_desc_mordred_pc_safety_results.tsv',
)
(
    desc_mordred_pc_results_early_fusion,
    cp_ecfp4_results_early_fusion,
    cp_desc_mordred_pc_results_early_fusion,
    ecfp4_desc_mordred_pc_results_early_fusion,
    cp_ecfp4_desc_mordred_pc_results_early_fusion,
) = (pd.read_csv(path, sep='\t') for path in early_fusion_result_paths)

In [None]:
# Re-load the late-fusion (weighted average) result tables (tab-separated) from disk,
# one per descriptor combination, in the order they are unpacked below
late_fusion_result_paths = (
    'data/5_data/desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
    'data/5_data/cp_ecfp4_weighted_average_fusion_safety_results.tsv',
    'data/5_data/cp_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
    'data/5_data/ecfp4_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
    'data/5_data/cp_ecfp4_desc_mordred_pc_weighted_average_fusion_safety_results.tsv',
)
(
    desc_mordred_pc_results_late_fusion,
    cp_ecfp4_results_late_fusion,
    cp_desc_mordred_pc_results_late_fusion,
    ecfp4_desc_mordred_pc_results_late_fusion,
    cp_ecfp4_desc_mordred_pc_results_late_fusion,
) = (pd.read_csv(path, sep='\t') for path in late_fusion_result_paths)

### Statistical significance computations

We explore each pairwise combination of data sources (including individual descriptors, as well as early-fused and late-fused combinations) and perform a two-sample Kolmogorov-Smirnov (KS) test to assess whether the distributions of scores from the two subsets of models differ significantly.

In [None]:
# Construct the set of distributions.
# First map each display label to the results table it refers to ...
results_by_label = {
    'CP': cp_results,
    'RDKit 1D': desc_results,
    'ECFP4': ecfp4_results,
    'Mordred': mordred_results,
    'PC properties': pc_results,
    'RDKit 1D, Mordred, PC (early fusion)': desc_mordred_pc_results_early_fusion,
    'CP, ECFP4 (early fusion)': cp_ecfp4_results_early_fusion,
    'CP, RDKit 1D, Mordred, PC (early fusion)': cp_desc_mordred_pc_results_early_fusion,
    'ECFP4, RDKit 1D, Mordred, PC (early fusion)': ecfp4_desc_mordred_pc_results_early_fusion,
    'CP, ECFP4, RDKit 1D, Mordred, PC (early fusion)': cp_ecfp4_desc_mordred_pc_results_early_fusion,
    'RDKit 1D, Mordred, PC (late fusion)': desc_mordred_pc_results_late_fusion,
    'CP, ECFP4 (late fusion)': cp_ecfp4_results_late_fusion,
    'CP, RDKit 1D, Mordred, PC (late fusion)': cp_desc_mordred_pc_results_late_fusion,
    'ECFP4, RDKit 1D, Mordred, PC (late fusion)': ecfp4_desc_mordred_pc_results_late_fusion,
    'CP, ECFP4, RDKit 1D, Mordred, PC (late fusion)': cp_ecfp4_desc_mordred_pc_results_late_fusion,
}
# ... then pull the 'pr_auc_score' column of each table out as a plain list
prauc_distributions = {
    label: results['pr_auc_score'].tolist()
    for label, results in results_by_label.items()
}

# Two-sample KS test for every unordered pair of labels.
# combinations_with_replacement also yields each label paired with itself.
ks_test_results = []
for data1, data2 in combinations_with_replacement(prauc_distributions, 2):
    ks_statistic, p_value = ks_2samp(prauc_distributions[data1], prauc_distributions[data2])
    ks_test_results.append(
        {'data1': data1, 'data2': data2, 'ks_statistic': ks_statistic, 'p_value': p_value}
    )

# One row per tested pair
ks_test_results_df = pd.DataFrame(ks_test_results)
ks_test_results_df