# Cell painting morphological features predictive models

We used FLAML, a simple Python library for Automated Machine Learning (AutoML), to train individual regressors that predict Cell Painting morphological features from each of the four molecular representations: RDKit 1D descriptors, ECFP4 fingerprints, Mordred descriptors, and physicochemical properties.

For each molecular representation, we evaluated the performance of the prediction models on the test set (20%) using several regression metrics: R<sup>2</sup> score, Mean Absolute Error (MAE), Mean Squared Error (MSE), and Root Mean Squared Error (RMSE).

In [None]:
from src.utils import *

In [None]:
# Fixed seed for reproducibility; it is also applied to NumPy's global random
# state (np is expected to be provided by the star import from src.utils).
random_seed = 42
np.random.seed(seed=random_seed)

## Loading of data

In [None]:
# Build one complete dataset per molecular representation.
# Here: RDKit 1D descriptors, built from the Cell Painting data file and the
# RDKit descriptor file.
desc_cp_data = cp_create_complete_dataset(
    '1_data/CellPainting_data.csv',
    '2_data/CPcompounds_1D_RDKit.tsv',
)
# Preview the first rows of the resulting dataset
desc_cp_data.head()

In [None]:
# ECFP4 fingerprints: complete dataset built from the Cell Painting data file
# and the ECFP4 fingerprint file.
ecfp4_cp_data = cp_create_complete_dataset(
    '1_data/CellPainting_data.csv',
    '2_data/CPcompounds_ECFP4_1024.tsv',
)
# Preview the first rows of the resulting dataset
ecfp4_cp_data.head()

In [None]:
# Mordred descriptors: complete dataset built from the Cell Painting data file
# and the Mordred descriptor file.
mordred_cp_data = cp_create_complete_dataset(
    '1_data/CellPainting_data.csv',
    '2_data/CPcompounds_Mordred.tsv',
)
# Preview the first rows of the resulting dataset
mordred_cp_data.head()

In [None]:
# Physicochemical properties: complete dataset built from the Cell Painting
# data file and the physicochemical properties file.
pc_cp_data = cp_create_complete_dataset(
    '1_data/CellPainting_data.csv',
    '2_data/CPcompounds_physicochemical_properties.tsv',
)
# Preview the first rows of the resulting dataset
pc_cp_data.head()

## Training and Evaluating the predictive models

In [None]:
# Train and evaluate the predictive models, one descriptor type per cell.
# Here: RDKit 1D descriptors, all Cell Painting features, 80% training split.
# Plots and FLAML verbosity are switched off; results are saved under the given
# filename (presumably by the helper itself -- confirm in src.utils).
desc_results = cp_model_training_and_evaluation(
    desc_cp_data,
    CP_feature='all',
    train_split=0.8,
    verbose_flaml=False,
    plot_results=False,
    plot_feature_importance=False,
    save_results=True,
    results_filename='desc_cp_results_automl.tsv',
)
# Display the returned results
desc_results

In [None]:
# ECFP4 fingerprints: all Cell Painting features, 80% training split.
# Plots and FLAML verbosity are switched off; results are saved under the given
# filename (presumably by the helper itself -- confirm in src.utils).
ecfp4_results = cp_model_training_and_evaluation(
    ecfp4_cp_data,
    CP_feature='all',
    train_split=0.8,
    verbose_flaml=False,
    plot_results=False,
    plot_feature_importance=False,
    save_results=True,
    results_filename='ecfp4_cp_results_automl.tsv',
)
# Display the returned results
ecfp4_results

In [None]:
# Mordred descriptors: all Cell Painting features, 80% training split.
# Plots and FLAML verbosity are switched off; results are saved under the given
# filename (presumably by the helper itself -- confirm in src.utils).
mordred_results = cp_model_training_and_evaluation(
    mordred_cp_data,
    CP_feature='all',
    train_split=0.8,
    verbose_flaml=False,
    plot_results=False,
    plot_feature_importance=False,
    save_results=True,
    results_filename='mordred_cp_results_automl.tsv',
)
# Display the returned results
mordred_results

In [None]:
# Physicochemical properties: all Cell Painting features, 80% training split.
# Plots and FLAML verbosity are switched off; results are saved under the given
# filename (presumably by the helper itself -- confirm in src.utils).
pc_results = cp_model_training_and_evaluation(
    pc_cp_data,
    CP_feature='all',
    train_split=0.8,
    verbose_flaml=False,
    plot_results=False,
    plot_feature_importance=False,
    save_results=True,
    results_filename='pc_cp_results_automl.tsv',
)
# Display the returned results
pc_results

## Analysing the results

### Loading the results

In [None]:
# Read the saved, tab-separated model results back in, one table per molecular
# representation (RDKit 1D descriptors, ECFP4, Mordred, physicochemical).
desc_results, ecfp4_results, mordred_results, pc_results = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'data/3_data/desc_cp_results_automl.tsv',
        'data/3_data/ecfp4_cp_results_automl.tsv',
        'data/3_data/mordred_cp_results_automl.tsv',
        'data/3_data/pc_cp_results_automl.tsv',
    )
)

### Cell Painting features classification

To explore whether any Cell Painting feature or feature class is predicted better than the others, we categorized the collection of morphological features by:

* Compartment: Nuclei, Cells, Cytoplasm.
* Channel: DNA, RNA, AGP, Mito, ER. 
* Feature group: Texture, Intensity, RadialDistribution, Correlation, Granularity, AreaShape.

In [None]:
# Define the compartments, channels and feature groups used for the classification.
# NOTE: the (misspelled) name `Comparments` is kept in case later cells reference it.
Comparments = ['Nuclei', 'Cells', 'Cytoplasm']
Channels = ['DNA', 'RNA', 'AGP', 'Mito', 'ER']
FeatureGroups = ['Texture', 'Intensity', 'RadialDistribution', 'Correlation', 'Granularity', 'AreaShape']


def _classify_cp_feature(feature):
    """Return the classification name of a single Cell Painting feature.

    The feature name is split on '_' and its tokens are matched against the
    known compartments, channels and feature groups.

    Args:
        feature: Feature name, with its parts separated by underscores.

    Returns:
        * '<compartment>_AreaShape_None' for AreaShape features,
        * '<compartment>_<group>_<second-to-last token>_<last token>' when the
          feature does not name exactly one channel (the Correlation case),
        * '<compartment>_<group>_<channel>' when it names exactly one channel,
        * np.nan when the feature does not match exactly one compartment and
          exactly one feature group. This covers 'Location' features, since
          'Location' is not listed in FeatureGroups.
    """
    items = feature.split('_')

    # Tokens of the feature name that match a known compartment/channel/group
    feature_compartment = set(items) & set(Comparments)
    feature_channels = set(items) & set(Channels)
    feature_group = set(items) & set(FeatureGroups)

    # Features that cannot be classified unambiguously get a null name
    if len(feature_compartment) != 1 or len(feature_group) != 1:
        return np.nan

    compartment = next(iter(feature_compartment))
    group = next(iter(feature_group))

    if group == 'AreaShape':
        # AreaShape features are not tied to any channel
        return '_'.join([compartment, group, 'None'])
    if len(feature_channels) != 1:
        # when group == 'Correlation': the channel pair is given by the last two tokens
        return '_'.join([compartment, group, items[-2], items[-1]])
    return '_'.join([compartment, group, next(iter(feature_channels))])


# Add the new feature names as a column of the four dataframes.
# Each dataframe is classified from its OWN 'CP_feature' column, so the labels
# stay aligned with the rows even if the result files are not in the same order.
for _results_df in (desc_results, ecfp4_results, mordred_results, pc_results):
    _results_df.insert(
        2,
        'Feature_classification',
        [_classify_cp_feature(name) for name in _results_df['CP_feature'].tolist()],
    )

# Kept for backward compatibility: the classification names of the features
# listed in desc_results, in row order
new_classification_names = desc_results['Feature_classification'].tolist()