# 1. Data Preparation

### Loading Packages

In [127]:
import sqlite3 as sql
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

from scipy.stats import f_oneway
from statsmodels.multivariate.manova import MANOVA
import pingouin as pg
# also install necessary dependencies of the excel plugin of pandas

## Data: Bray et al. (2017)

The Cell Image Library has a “Human U2OS Cell: Compound Cell-Painting Experiment” project data that contains the images of 375 plates in 384-well format (More details: https://www.cellimagelibrary.org/pages/project_20269): 

- The images are of U2OS cells treated with each of over 30,000 known bioactive compounds.
- These cells are labeled with 6 labels that characterize seven organelles (the cell-painting assay).
- The data set is comprised of 988,994 fields of view.
- Each field was imaged in five channels (detection wavelengths), and each channel is stored as a separate, grayscale image file.
- As a result, there are approximately 5 million image files in 16-bit TIFF format.

Bray et al. (2017)’s research uses the raw image data of, and includes highly multiplexed measurements of cellular morphology of the tested compounds from the above Human U2OS Cell research. It includes:

- data files containing morphological features derived from each cell in each image (both at the single-cell level and population-averaged, i.e. per-well level),
- the image analysis workflows that generated the morphological features,
- quality-control metrics are provided as metadata,
- chemical annotations for the applied compound treatments.

- Description and files: http://gigadb.org/dataset/view/id/100351/Files_page/1
- Codebase: https://github.com/gigascience/paper-bray2017/tree/master


### Functions for SQLite Databases

In [107]:
# function for getting a dataframes' column list
def get_columns(df: pd.DataFrame):
    return df.columns.to_list()

# function for moving a column to a new position
def move_column(df, col_name, new_position):
    temp_col = df[col_name]
    df = df.drop(columns=[col_name])
    df.insert(new_position, col_name, temp_col)
    return df

# function for converting a column to numeric
def convert_to_numeric(df):
    for col in df.columns:
        # Attempt to convert the column to numeric, setting errors='ignore' keeps the original data if conversion fails
        df[col] = pd.to_numeric(df[col], errors='ignore')
    return df

# function for getting the memory size of an object
def memory_size(bytes_size: int) -> str:
    for unit in ['Bytes', 'KB', 'MB', 'GB', 'TB']:
        if bytes_size < 1024:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024
    return f"{bytes_size:.2f} PB"

In [3]:
# funtion for retrieving the sql database connector
def get_db_conn(db_path: str):
    try:
        conn = sql.connect(db_path) # connect to the SQLite database
        return conn
    except sql.Error as e:
        print(f"An error occurred while accessing the database: {e}")
        pass

# function for connecting a database object and returning its connector cursor
def get_db_cursor(db_path: str):
    try:
        conn = sql.connect(db_path) # connect to the SQLite database
        cursor = conn.cursor() # create a cursor object to execute SQL queries
        return cursor
    except sql.Error as e:
        print(f"An error occurred while accessing the database: {e}")
        pass

# function for viewing the available database tables
def get_db_tables(cursor: sql.Cursor):
    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") # cursor execution for retrieving the list of all tables
        db_tables = cursor.fetchall() # fetch the command output in a list object

        # view the list of tables
        temp = []
        for table in db_tables:
            temp.append(table[0])
        
        db_tables = temp
        return db_tables
    
    except sql.Error as e:
        print(f"An error occurred while accessing the database: {e}")
        return []

# function for extracting all available table columns in a dictionary
def get_db_columns(cursor: sql.Cursor):
    db_columns = {}
    
    for table in get_db_tables(cursor=cursor):
        cursor.execute(f"PRAGMA table_info({table});") # cursor call for retrieving table information
        cursor_output = cursor.fetchall() # cursor fetch operation to retrieve the SQL command output
        columns = [col[1] for col in cursor_output]
        db_columns[table] = columns

    for table in db_columns:
        print(f'{table}: {len(db_columns[table])} Columns')
        print(f'{table}: {db_columns[table]} Columns\n')

    return db_columns

# function for printing out the database table sizes
def get_db_size(cursor: sql.Cursor):
    for table in get_db_tables(cursor=cursor):
        # cursor call for executing SQL command 
        cursor.execute(f"SELECT COUNT(*) FROM {table};") 
        # fetch the first element of the cursor call output
        row_count = cursor.fetchone()[0] 
        
        # cursor command for retrieving column information
        cursor.execute(f"PRAGMA table_info({table});") 
        column_count = len(cursor.fetchall())
        
        # print the dimension and size information of each table
        print(f"{table} | Rows: {row_count} | Columns: {column_count} | Datapoints: {(row_count * column_count / 1000000):.1f}M")

def add_id_info(dataframes: dict):
    for table in dataframes.keys():
        if table == 'Image':
            # Split the FileName column into well information and picture number
            dataframes[table]['WellID'] = dataframes[table]['Image_FileName_CellOutlines'].str.split('--').str[0].str.split('_').str[0]
            dataframes[table]['FieldID'] = dataframes[table]['Image_FileName_CellOutlines'].str.split('--').str[0].str.split('_').str[1]
        else:
            # adding well ID and field ID to other tables
            dataframes[table] = dataframes[table].merge(dataframes['Image'][['TableNumber', 'WellID', 'FieldID']], on='TableNumber', how='left')
    
    return dataframes

def fix_columns(dataframes: dict):
    for table in dataframes:
        # change TableNumber to tableID in all of the tables
        dataframes[table].rename(columns={'TableNumber': 'TableID'}, inplace=True)

        # fix the order of the newly added columns
        dataframes[table] = move_column(dataframes[table], 'PlateID', 1)
        dataframes[table] = move_column(dataframes[table], 'WellID', 2)
        dataframes[table] = move_column(dataframes[table], 'FieldID', 3)

        # convert all numeric columns to float
        if table in ['Cells', 'Cytoplasm', 'Nuclei']:
            for eachColumn in dataframes[table].columns:
                if 'AreaShape' in eachColumn:
                    try:
                        # replace 'nan' strings with np.nan
                        dataframes[table][eachColumn] = dataframes[table][eachColumn].replace('nan', np.nan)
                        # attempt to convert the column to numeric type
                        dataframes[table][eachColumn] = pd.to_numeric(dataframes[table][eachColumn])
                        # print(f"Converted column '{eachColumn}' in table '{table}' to numeric.")
                    except ValueError:
                        # if conversion fails, leave the column unchanged
                        print(f"Could not convert column '{eachColumn}' in table '{table}'. Leaving it unchanged.")
    return dataframes

### Loading SQLite Databases

In [4]:
# list of plate IDs being extracted
analysis_mode = 'test'
plate_list = {'live': [24278, 24279, 24280, 24293, 24294, 24295, 24296, 24297, 24300, 24301, 24302, 24303],
              'test': [24278, 24279]}

# dictionary of required columns inside each of the sqlite database tables
df_columns = {}
df_columns['Image'] = ['TableNumber', 'Image_Count_Cells', 'Image_Count_Cytoplasm', 'Image_Count_Nuclei', 'Image_ExecutionTime_01LoadData', 'Image_ExecutionTime_02CorrectIlluminationApply', 'Image_ExecutionTime_03MeasureImageQuality', 'Image_ExecutionTime_04MeasureImageQuality', 'Image_ExecutionTime_06IdentifyPrimaryObjects', 'Image_ExecutionTime_07IdentifySecondaryObjects', 'Image_ExecutionTime_08IdentifyTertiaryObjects', 'Image_ExecutionTime_09MeasureCorrelation', 'Image_ExecutionTime_10MeasureGranularity', 'Image_ExecutionTime_11MeasureObjectIntensity', 'Image_ExecutionTime_12MeasureObjectNeighbors', 'Image_ExecutionTime_13MeasureObjectNeighbors', 'Image_ExecutionTime_14MeasureObjectNeighbors', 'Image_ExecutionTime_15MeasureObjectIntensityDistribution', 'Image_ExecutionTime_16MeasureObjectSizeShape', 'Image_ExecutionTime_17MeasureTexture', 'Image_ExecutionTime_18OverlayOutlines', 'Image_ExecutionTime_19OverlayOutlines', 'Image_ExecutionTime_20SaveImages', 'Image_ExecutionTime_21SaveImages', 'Image_FileName_CellOutlines', 'Image_FileName_IllumAGP', 'Image_FileName_IllumDNA', 'Image_FileName_IllumER', 'Image_FileName_IllumMito', 'Image_FileName_IllumRNA', 'Image_FileName_NucleiOutlines', 'Image_FileName_OrigAGP', 'Image_FileName_OrigDNA', 'Image_FileName_OrigER', 'Image_FileName_OrigMito', 'Image_FileName_OrigRNA']
df_columns['Nuclei'] = ['TableNumber', 'ImageNumber', 'ObjectNumber', 'Nuclei_AreaShape_Area', 'Nuclei_AreaShape_Center_X', 'Nuclei_AreaShape_Center_Y', 'Nuclei_AreaShape_Compactness', 'Nuclei_AreaShape_Eccentricity', 'Nuclei_AreaShape_EulerNumber', 'Nuclei_AreaShape_Extent', 'Nuclei_AreaShape_FormFactor', 'Nuclei_AreaShape_MajorAxisLength', 'Nuclei_AreaShape_MaxFeretDiameter', 'Nuclei_AreaShape_MaximumRadius', 'Nuclei_AreaShape_MeanRadius', 'Nuclei_AreaShape_MedianRadius', 'Nuclei_AreaShape_MinFeretDiameter', 'Nuclei_AreaShape_MinorAxisLength', 'Nuclei_AreaShape_Orientation', 'Nuclei_AreaShape_Perimeter', 'Nuclei_AreaShape_Solidity']
df_columns['Cytoplasm'] = ['TableNumber', 'ImageNumber', 'ObjectNumber', 'Cytoplasm_AreaShape_Area', 'Cytoplasm_AreaShape_Center_X', 'Cytoplasm_AreaShape_Center_Y', 'Cytoplasm_AreaShape_Compactness', 'Cytoplasm_AreaShape_Eccentricity', 'Cytoplasm_AreaShape_EulerNumber', 'Cytoplasm_AreaShape_Extent', 'Cytoplasm_AreaShape_FormFactor', 'Cytoplasm_AreaShape_MajorAxisLength', 'Cytoplasm_AreaShape_MaxFeretDiameter', 'Cytoplasm_AreaShape_MaximumRadius', 'Cytoplasm_AreaShape_MeanRadius', 'Cytoplasm_AreaShape_MedianRadius', 'Cytoplasm_AreaShape_MinFeretDiameter', 'Cytoplasm_AreaShape_MinorAxisLength', 'Cytoplasm_AreaShape_Orientation', 'Cytoplasm_AreaShape_Perimeter', 'Cytoplasm_AreaShape_Solidity']
df_columns['Cells'] = ['TableNumber', 'ImageNumber', 'ObjectNumber', 'Cells_AreaShape_Area', 'Cells_AreaShape_Center_X', 'Cells_AreaShape_Center_Y', 'Cells_AreaShape_Compactness', 'Cells_AreaShape_Eccentricity', 'Cells_AreaShape_EulerNumber', 'Cells_AreaShape_Extent', 'Cells_AreaShape_FormFactor', 'Cells_AreaShape_MajorAxisLength', 'Cells_AreaShape_MaxFeretDiameter', 'Cells_AreaShape_MaximumRadius', 'Cells_AreaShape_MeanRadius', 'Cells_AreaShape_MedianRadius', 'Cells_AreaShape_MinFeretDiameter', 'Cells_AreaShape_MinorAxisLength', 'Cells_AreaShape_Orientation', 'Cells_AreaShape_Perimeter', 'Cells_AreaShape_Solidity']

# create a blank dictionary to save each table as a dataframe
dataframes = {}
# initialize the dictionary with creating each table with specified columns + plateID column
for table in df_columns.keys():
    dataframes[table] = pd.DataFrame(columns=df_columns[table] + ['PlateID'])

for plate in plate_list[analysis_mode]:

    # update path to the sqlite database
    db_path = '../Data/bray2017/'+str(plate)+'/extracted_features/'+str(plate)+'.sqlite'
    # update db connector and cursor
    conn = get_db_conn(db_path=db_path)
    cursor = get_db_cursor(db_path=db_path)

    for table in get_db_tables(cursor):
        # update the query with the next table name and column information
        query = f"SELECT {', '.join(df_columns[table])} FROM {table};"
        # extract the dataframe using query
        temp = pd.read_sql_query(query, conn)

        # add plateID information to newly extracted dataframe
        temp['PlateID'] = plate

        # append the fresh dataframe with the existing dataframe
        dataframes[table] = pd.concat([dataframes[table], temp], ignore_index=True)
        print(f"PlateID: {plate} - Table: {table} | Success")

# add wellID and fieldID information to all dataframes
dataframes = add_id_info(dataframes)

# fix the column orders and data types
dataframes = fix_columns(dataframes)

  dataframes[table] = pd.concat([dataframes[table], temp], ignore_index=True)


PlateID: 24278 - Table: Image | Success


  dataframes[table] = pd.concat([dataframes[table], temp], ignore_index=True)


PlateID: 24278 - Table: Nuclei | Success


  dataframes[table] = pd.concat([dataframes[table], temp], ignore_index=True)


PlateID: 24278 - Table: Cytoplasm | Success


  dataframes[table] = pd.concat([dataframes[table], temp], ignore_index=True)


PlateID: 24278 - Table: Cells | Success
PlateID: 24279 - Table: Image | Success
PlateID: 24279 - Table: Nuclei | Success
PlateID: 24279 - Table: Cytoplasm | Success
PlateID: 24279 - Table: Cells | Success


  dataframes[table][eachColumn] = dataframes[table][eachColumn].replace('nan', np.nan)
  dataframes[table][eachColumn] = dataframes[table][eachColumn].replace('nan', np.nan)


### (Optional) Exporting CSV

In [5]:
# # set up the query parameters
# # table_name = 'Image'
# row_limit = 3

# for table in db_tables:
#     print(f"\n{table}")

#     cursor.execute(f"PRAGMA table_info({table});") # cursor call for retrieving table information
#     columns = cursor.fetchall()
#     column_names = [col[1] for col in columns]
#     print(column_names)

#     query = f"SELECT * FROM {table} LIMIT {row_limit};"  # set up the query using parameters
#     cursor.execute(query) # execute the cursor call using the query
#     rows = cursor.fetchall() # fetch the output of the cursor
#     for row in rows:
#         print(row)

# sampleDataframes = {} # create a blank dictionary to save each table as a dataframe, with a table_name key
# row_limit = 10

# for table in db_tables: # loop through all tables 
    
#     query = f"SELECT * FROM {table} LIMIT {row_limit};"  # set up the query using parameters
#     df = pd.read_sql_query(query, conn) # use the open connector to pull table into

#     sampleDataframes[table] = df # store the sample df in the dictionary
#     print(f"Table {table} has been saved.")

# for table_name, df in sampleDataframes.items():
#     # Export each DataFrame to a CSV file
#     df.to_csv(f"{table_name}.csv", index=False)  # index=False avoids writing row numbers
#     print(f"Exported {table_name} to {table_name}.csv")


In [6]:
# close the connection when the sqlite database processing is done
conn.close()

## Data: Seal et al. (2024)

Ola linked the compounds identified in the BioMorph study with the metadata of the Cell Painting dataset. She linked 603 compounds, which results in 5025 wells spread over 94 different plates.
- CPD_NAME: specific compounds
- Metadata_Plate: specific plates
- Metadata_Well: specific wells

### Endpoint Definitions

1. Apoptosis Up:
- Apoptosis is the process of programmed cell death, where cells die in a controlled manner as part of normal development or in response to damage.
- "**Apoptosis up**" means an increase in the rate of apoptosis in response to a compound, suggesting the compound is inducing cell death via the apoptotic pathway. This could be important in cancer treatments, where the goal is to promote the death of harmful cells.
 2. Cytotoxicity BLA:
- Cytotoxicity refers to the toxic effect a compound has on cells, leading to cell damage or death.
- BLA stands for Beta-Lactamase assay, a biochemical assay often used to detect cytotoxicity. The "**Cytotoxicity BLA**" endpoint indicates cell death or damage measured through the Beta-Lactamase assay.
 3. Cytotoxicity SRB:
- Similar to Cytotoxicity BLA, this measures cell toxicity, but using a different assay.
- SRB stands for Sulforhodamine B, a dye that binds to cellular proteins, and it’s commonly used to measure cell density and viability. The "**Cytotoxicity SRB**" endpoint measures the cytotoxic effect of compounds based on the amount of protein-bound SRB dye, indicating cell death or reduced viability.
 4. ER Stress:
- ER stress refers to stress in the Endoplasmic Reticulum (ER), a cell organelle involved in protein folding and secretion. When misfolded proteins accumulate, ER stress triggers the Unfolded Protein Response (UPR).
- The "ER stress" endpoint indicates that a compound is causing stress in the ER, potentially leading to apoptosis or other cellular dysfunctions.
 5. Heat Shock:
- Heat shock refers to the stress response of cells to elevated temperatures, which results in the production of heat shock proteins (HSPs) that help protect cells from damage.
- The "Heat Shock" endpoint suggests the compound is inducing a cellular response similar to what happens when cells are exposed to heat or other stresses, typically leading to the production of HSPs.
 6. Microtubule Up:
- Microtubules are part of the cell's cytoskeleton and are crucial for cell division and intracellular transport.
- "Microtubule up" indicates an increase in microtubule stabilization or polymerization due to the compound. Compounds that affect microtubules can disrupt cell division, making this endpoint important in cancer research (e.g., chemotherapy drugs like taxanes target microtubules).
 7. Mitochondrial Disruption Up:
- Mitochondria are the energy-producing organelles in cells, and mitochondrial disruption can lead to cell death or dysfunction.
- "Mitochondrial disruption up" indicates an increase in mitochondrial dysfunction, which can lead to cellular energy depletion and apoptosis. This endpoint is used to measure the impact of a compound on mitochondrial health.
 8. Oxidative Stress Up:
- Oxidative stress occurs when there is an imbalance between the production of reactive oxygen species (ROS) and the cell’s ability to detoxify them, leading to cellular damage.
- "Oxidative stress up" means the compound is causing an increase in oxidative stress, which can damage DNA, proteins, and lipids, potentially leading to cell death.
 9. Proliferation Decrease:
- Proliferation refers to the growth and division of cells. A decrease in proliferation means that the cells are dividing more slowly or not at all.
- "Proliferation decrease" indicates that the compound is inhibiting cell growth. This endpoint is often used in cancer research to evaluate the efficacy of treatments designed to slow or stop the growth of tumor cells.

### Loading & Cleaning

In [90]:
# read the xlsx file
biomorph = pd.read_excel("../Data/olaBiomorph/603_compounds_metadata.xlsx")

# view a snippet of the original dataset
# biomorph.head()

# choose the important columns
biomorph_columns = ['Metadata_Plate',	
                    'Metadata_Well',	
                    'CPD_NAME',	'CPD_SAMPLE_ID',	
                    'apoptosis up',	
                    'cytotoxicity BLA',
                    'cytotoxicity SRB',	
                    'ER stress',	
                    'heat shock',	
                    'microtubule up',	
                    'mitochondrial disruption up',	
                    'oxidative stress up', 
                    'proliferation decrease']

# list of toxicity endpoints
endpoint_columns = ['apoptosis up', 'cytotoxicity BLA', 'cytotoxicity SRB', 'ER stress', 
                    'heat shock', 'microtubule up', 'mitochondrial disruption up', 
                    'oxidative stress up', 'proliferation decrease']

# remove the unnecessary columns
biomorph = biomorph.loc[:, biomorph_columns]

# improve the column names
biomorph.rename(columns={'Metadata_Plate': 'PlateID'}, inplace=True)
biomorph.rename(columns={'Metadata_Well': 'WellID'}, inplace=True)

# remove the unused plate information
biomorph = biomorph[biomorph.PlateID.isin(plate_list[analysis_mode])]

# create a column with total endpoint sum
biomorph['total_endpoints'] = biomorph[endpoint_columns].sum(axis=1)

# add a column with a tuple list of total endpoint activities
biomorph['endpoint_combination'] = biomorph[endpoint_columns].apply(tuple, axis=1)

# quick inspection
print(biomorph.shape)
print(biomorph.info())

# print the snippet of the cleaned dataset
biomorph.head(10)

(198, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 103 to 300
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   PlateID                      198 non-null    int64 
 1   WellID                       198 non-null    object
 2   CPD_NAME                     198 non-null    object
 3   CPD_SAMPLE_ID                198 non-null    object
 4   apoptosis up                 198 non-null    int64 
 5   cytotoxicity BLA             198 non-null    int64 
 6   cytotoxicity SRB             198 non-null    int64 
 7   ER stress                    198 non-null    int64 
 8   heat shock                   198 non-null    int64 
 9   microtubule up               198 non-null    int64 
 10  mitochondrial disruption up  198 non-null    int64 
 11  oxidative stress up          198 non-null    int64 
 12  proliferation decrease       198 non-null    int64 
 13  total_endpoints             

Unnamed: 0,PlateID,WellID,CPD_NAME,CPD_SAMPLE_ID,apoptosis up,cytotoxicity BLA,cytotoxicity SRB,ER stress,heat shock,microtubule up,mitochondrial disruption up,oxidative stress up,proliferation decrease,total_endpoints,endpoint_combination
103,24278,a03,olmesartan medoxomil,SA59556,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
104,24278,a06,citropten,SA59278,0,0,0,0,0,0,1,0,0,1,"(0, 0, 0, 0, 0, 0, 1, 0, 0)"
105,24278,a09,bromperidol,SA83338,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
106,24278,a10,leflunomide,SA792771,1,1,0,1,1,0,0,1,0,5,"(1, 1, 0, 1, 1, 0, 0, 1, 0)"
107,24278,a11,suxibuzone,SA58544,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
108,24278,a18,nitrofural,SA58398,1,0,0,0,0,1,0,1,1,4,"(1, 0, 0, 0, 0, 1, 0, 1, 1)"
109,24278,a20,ioxaglic acid,SA83731,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
110,24278,a21,albendazole,SA58834,1,0,0,0,1,0,0,0,0,2,"(1, 0, 0, 0, 1, 0, 0, 0, 0)"
111,24278,b01,benperidol,SA83335,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
112,24278,b05,thiamphenicol,SA59442,0,1,0,0,0,0,0,0,0,1,"(0, 1, 0, 0, 0, 0, 0, 0, 0)"


# 2. Explatory Data Analysis

## Table Snippets

In [8]:
print(dataframes['Image'].info())
dataframes['Image'].head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4567 entries, 0 to 4566
Data columns (total 39 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   TableID                                                   4567 non-null   object 
 1   PlateID                                                   4567 non-null   object 
 2   WellID                                                    4567 non-null   object 
 3   FieldID                                                   4567 non-null   object 
 4   Image_Count_Cells                                         4567 non-null   float64
 5   Image_Count_Cytoplasm                                     4567 non-null   float64
 6   Image_Count_Nuclei                                        4567 non-null   float64
 7   Image_ExecutionTime_01LoadData                            4567 non-null   float64
 8   Image_ExecutionTim

Unnamed: 0,TableID,PlateID,WellID,FieldID,Image_Count_Cells,Image_Count_Cytoplasm,Image_Count_Nuclei,Image_ExecutionTime_01LoadData,Image_ExecutionTime_02CorrectIlluminationApply,Image_ExecutionTime_03MeasureImageQuality,...,Image_FileName_IllumDNA,Image_FileName_IllumER,Image_FileName_IllumMito,Image_FileName_IllumRNA,Image_FileName_NucleiOutlines,Image_FileName_OrigAGP,Image_FileName_OrigDNA,Image_FileName_OrigER,Image_FileName_OrigMito,Image_FileName_OrigRNA
0,0702991209138712afb02ac7ea637f71,24278,a01,s1,58.0,58.0,58.0,4.33,0.05,6.79,...,24278_IllumDNA.mat,24278_IllumER.mat,24278_IllumMito.mat,24278_IllumRNA.mat,a01_s1--nuclei_outlines.png,cdp2bioactives_a01_s1_w46d2c0547-8e3b-440e-a85...,cdp2bioactives_a01_s1_w1bfb15712-b306-40fd-a77...,cdp2bioactives_a01_s1_w2edcec6dc-b1e3-4ffc-80d...,cdp2bioactives_a01_s1_w5d4e4b98c-0f39-4db9-91b...,cdp2bioactives_a01_s1_w336f7b0bc-6ae8-4667-a6a...
1,e2238b50acc3114c310acbf4c68bd114,24278,a01,s2,35.0,35.0,35.0,4.29,0.03,5.95,...,24278_IllumDNA.mat,24278_IllumER.mat,24278_IllumMito.mat,24278_IllumRNA.mat,a01_s2--nuclei_outlines.png,cdp2bioactives_a01_s2_w4da2481a5-d23f-4aae-b35...,cdp2bioactives_a01_s2_w1bd0b9bc7-0d8b-48ed-b04...,cdp2bioactives_a01_s2_w2abeb5a62-b570-447f-97f...,cdp2bioactives_a01_s2_w57540fec0-d693-46a8-bd6...,cdp2bioactives_a01_s2_w3b3fb2060-153d-4096-af5...
2,b983ac6d8cc9a5ed9b713585e32ac4ae,24278,a01,s3,26.0,26.0,26.0,4.35,0.03,5.3,...,24278_IllumDNA.mat,24278_IllumER.mat,24278_IllumMito.mat,24278_IllumRNA.mat,a01_s3--nuclei_outlines.png,cdp2bioactives_a01_s3_w48ffcfde9-24ae-486c-844...,cdp2bioactives_a01_s3_w17290b03d-9255-40d6-898...,cdp2bioactives_a01_s3_w268c116dd-b84b-4a22-94d...,cdp2bioactives_a01_s3_w5b3744705-89fc-4e6e-9c7...,cdp2bioactives_a01_s3_w3334a07f0-46b6-4e3d-8d8...
3,726ad7ac7c4813097cc3aab610c143b4,24278,a01,s4,55.0,55.0,55.0,4.7,0.04,6.84,...,24278_IllumDNA.mat,24278_IllumER.mat,24278_IllumMito.mat,24278_IllumRNA.mat,a01_s4--nuclei_outlines.png,cdp2bioactives_a01_s4_w46dbd1f1e-f3cd-4590-ba2...,cdp2bioactives_a01_s4_w1d07860e9-2432-4233-96b...,cdp2bioactives_a01_s4_w2a8a9c55a-b3b7-4f4d-b70...,cdp2bioactives_a01_s4_w5752fd3e8-0997-4cf1-8a8...,cdp2bioactives_a01_s4_w3e9947106-9c5d-428a-b8a...
4,9a0fed6eaf4ee63b89adc60e02bfbb58,24278,a01,s5,21.0,21.0,21.0,4.25,0.03,6.44,...,24278_IllumDNA.mat,24278_IllumER.mat,24278_IllumMito.mat,24278_IllumRNA.mat,a01_s5--nuclei_outlines.png,cdp2bioactives_a01_s5_w41bba91bb-81ff-4862-af3...,cdp2bioactives_a01_s5_w1ac6937d8-cbbe-40d4-899...,cdp2bioactives_a01_s5_w20f38032f-7e71-41cc-94c...,cdp2bioactives_a01_s5_w518bc5494-c49b-4c7f-a94...,cdp2bioactives_a01_s5_w373e8a6f0-ac34-4be4-8b1...


In [9]:
print(dataframes['Cells'].info())
dataframes['Cells'].head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455432 entries, 0 to 455431
Data columns (total 24 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   TableID                           455432 non-null  object 
 1   PlateID                           455432 non-null  object 
 2   WellID                            455432 non-null  object 
 3   FieldID                           455432 non-null  object 
 4   ImageNumber                       455432 non-null  object 
 5   ObjectNumber                      455432 non-null  object 
 6   Cells_AreaShape_Area              455432 non-null  int64  
 7   Cells_AreaShape_Center_X          455432 non-null  float64
 8   Cells_AreaShape_Center_Y          455432 non-null  float64
 9   Cells_AreaShape_Compactness       455411 non-null  float64
 10  Cells_AreaShape_Eccentricity      455411 non-null  float64
 11  Cells_AreaShape_EulerNumber       455432 non-null  f

Unnamed: 0,TableID,PlateID,WellID,FieldID,ImageNumber,ObjectNumber,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,...,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,Cells_AreaShape_MeanRadius,Cells_AreaShape_MedianRadius,Cells_AreaShape_MinFeretDiameter,Cells_AreaShape_MinorAxisLength,Cells_AreaShape_Orientation,Cells_AreaShape_Perimeter,Cells_AreaShape_Solidity
0,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,1,1634,43.0,0.0,1.509013,...,70.397234,80.709355,21.023796,7.075857,6.082763,40.703919,36.222035,63.738571,230.324,0.836447
1,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,2,2999,619.0,28.0,1.487299,...,97.565456,112.507778,20.0,7.41562,6.708204,51.680329,42.534725,-34.521891,327.642,0.796441
2,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,3,2471,219.0,33.0,1.126171,...,68.268046,70.710678,20.615528,7.682926,7.0,51.342921,49.118568,-73.795204,243.636,0.853099
3,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,4,1951,265.0,32.0,1.383061,...,75.198694,81.492331,17.691806,6.372729,5.830952,37.10312,34.684289,-28.019719,217.738,0.91339
4,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,5,1619,82.0,11.0,1.11982,...,54.504681,58.591808,18.681542,7.630574,7.0,41.751358,40.508727,-70.130892,187.254,0.893241


In [10]:
print(dataframes['Nuclei'].info())
dataframes['Nuclei'].head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455432 entries, 0 to 455431
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   TableID                            455432 non-null  object 
 1   PlateID                            455432 non-null  object 
 2   WellID                             455432 non-null  object 
 3   FieldID                            455432 non-null  object 
 4   ImageNumber                        455432 non-null  object 
 5   ObjectNumber                       455432 non-null  object 
 6   Nuclei_AreaShape_Area              455432 non-null  int64  
 7   Nuclei_AreaShape_Center_X          455432 non-null  float64
 8   Nuclei_AreaShape_Center_Y          455432 non-null  float64
 9   Nuclei_AreaShape_Compactness       455432 non-null  float64
 10  Nuclei_AreaShape_Eccentricity      455432 non-null  float64
 11  Nuclei_AreaShape_EulerNumber       4554

Unnamed: 0,TableID,PlateID,WellID,FieldID,ImageNumber,ObjectNumber,Nuclei_AreaShape_Area,Nuclei_AreaShape_Center_X,Nuclei_AreaShape_Center_Y,Nuclei_AreaShape_Compactness,...,Nuclei_AreaShape_MajorAxisLength,Nuclei_AreaShape_MaxFeretDiameter,Nuclei_AreaShape_MaximumRadius,Nuclei_AreaShape_MeanRadius,Nuclei_AreaShape_MedianRadius,Nuclei_AreaShape_MinFeretDiameter,Nuclei_AreaShape_MinorAxisLength,Nuclei_AreaShape_Orientation,Nuclei_AreaShape_Perimeter,Nuclei_AreaShape_Solidity
0,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,1,668,52.0,20.0,1.036701,...,33.056429,33.600595,13.0,4.922154,4.242641,25.491175,25.944878,50.159928,100.076,0.948864
1,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,2,890,640.0,23.0,1.261758,...,47.799265,46.615448,12.041595,5.001021,4.472136,23.574758,23.961891,-34.081184,122.7,0.957504
2,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,3,917,220.0,25.0,1.18388,...,45.106195,42.059482,12.165525,5.04324,4.472136,27.496545,27.015068,-34.750027,127.356,0.921145
3,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,4,699,265.0,28.0,1.028651,...,33.465476,33.600595,13.416408,5.127288,4.472136,26.162951,26.708074,-42.291306,98.904,0.958848
4,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,5,862,79.0,30.0,1.235444,...,45.684766,43.680659,12.369317,4.86148,4.472136,26.667468,24.993011,73.200746,126.114,0.909283


In [11]:
print(dataframes['Cytoplasm'].info())
dataframes['Cytoplasm'].head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455432 entries, 0 to 455431
Data columns (total 24 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   TableID                               455432 non-null  object 
 1   PlateID                               455432 non-null  object 
 2   WellID                                455432 non-null  object 
 3   FieldID                               455432 non-null  object 
 4   ImageNumber                           455432 non-null  object 
 5   ObjectNumber                          455432 non-null  object 
 6   Cytoplasm_AreaShape_Area              455432 non-null  int64  
 7   Cytoplasm_AreaShape_Center_X          455432 non-null  float64
 8   Cytoplasm_AreaShape_Center_Y          455432 non-null  float64
 9   Cytoplasm_AreaShape_Compactness       455411 non-null  float64
 10  Cytoplasm_AreaShape_Eccentricity      455411 non-null  float64
 11  

Unnamed: 0,TableID,PlateID,WellID,FieldID,ImageNumber,ObjectNumber,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,Cytoplasm_AreaShape_Compactness,...,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MaxFeretDiameter,Cytoplasm_AreaShape_MaximumRadius,Cytoplasm_AreaShape_MeanRadius,Cytoplasm_AreaShape_MedianRadius,Cytoplasm_AreaShape_MinFeretDiameter,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Perimeter,Cytoplasm_AreaShape_Solidity
0,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,1,1082,34.0,0.0,3.082407,...,83.722955,80.709355,14.142136,3.867136,3.0,40.703919,38.264106,64.718032,323.744,0.553878
1,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,2,2257,608.0,43.0,2.15423,...,101.018066,112.507778,13.892444,4.497375,3.605551,51.680329,46.289561,-37.449278,443.272,0.599389
2,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,3,1707,221.0,52.0,1.911851,...,77.918782,70.710678,13.453624,4.171019,3.605551,51.342921,47.131865,-73.961135,362.336,0.589332
3,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,4,1368,290.0,24.0,2.533414,...,87.519053,81.492331,10.630146,3.582114,3.0,37.10312,33.834025,-27.396395,310.986,0.640449
4,0702991209138712afb02ac7ea637f71,24278,a01,s1,1,5,905,87.0,0.0,2.404373,...,60.685292,58.591808,12.0,3.816261,3.0,41.751358,43.015571,-67.190934,303.642,0.49931


## Cell Counts

In [91]:
# calculate the cells counts of each plate, each well and each field
cells_per_plate = dataframes['Cells'].groupby('PlateID').size().reset_index(name='Number_of_Cells')
cells_per_well = dataframes['Cells'].groupby(['PlateID', 'WellID']).size().reset_index(name='Number_of_Cells')
cells_per_field = dataframes['Cells'].groupby(['PlateID', 'WellID', 'FieldID']).size().reset_index(name='Number_of_Cells')

biomorph = biomorph.merge(right=cells_per_well[['PlateID', 'WellID', 'Number_of_Cells']],
                          on=['PlateID', 'WellID'],
                          how='left')

biomorph = move_column(df=biomorph, col_name='Number_of_Cells', new_position=2)
biomorph

Unnamed: 0,PlateID,WellID,Number_of_Cells,CPD_NAME,CPD_SAMPLE_ID,apoptosis up,cytotoxicity BLA,cytotoxicity SRB,ER stress,heat shock,microtubule up,mitochondrial disruption up,oxidative stress up,proliferation decrease,total_endpoints,endpoint_combination
0,24278,a03,434,olmesartan medoxomil,SA59556,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
1,24278,a06,372,citropten,SA59278,0,0,0,0,0,0,1,0,0,1,"(0, 0, 0, 0, 0, 0, 1, 0, 0)"
2,24278,a09,418,bromperidol,SA83338,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
3,24278,a10,358,leflunomide,SA792771,1,1,0,1,1,0,0,1,0,5,"(1, 1, 0, 1, 1, 0, 0, 1, 0)"
4,24278,a11,384,suxibuzone,SA58544,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,24279,p13,559,chlorpropamide,SA58227,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
194,24279,p14,611,nicorandil,SA83824,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
195,24279,p17,661,prednisolone,SA83046,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"
196,24279,p21,580,chlorzoxazone,SA82767,0,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)"


In [53]:
# # Cell count distribution of wells
# cells_per_well.hist(column=['Number_of_Cells'], bins=25, grid=True, alpha=0.6, color='skyblue')
# plt.title('Number of Cells Per Well')
# plt.xlabel('Cell Count')
# plt.ylabel('# of Wells')
# plt.tight_layout()
# plt.show()

## Well Statistics

Aggregating FOVs of each well, compute the mean and standard deviation of cells and nuclei area:

In [92]:
# calculate the aggregated cell and nuclei metrics per well
cell_agg_per_well = dataframes['Cells'].groupby('WellID').agg(Cells_AreaShape_Area_Mean=('Cells_AreaShape_Area', 'mean'),
                                               Cells_AreaShape_Area_Std=('Cells_AreaShape_Area', 'std'),
                                               Cells_AreaShape_Compactness_Mean=('Cells_AreaShape_Compactness', 'mean'),
                                               Cells_AreaShape_Compactness_Std=('Cells_AreaShape_Compactness', 'std')).reset_index()

nuclei_agg_per_well = dataframes['Nuclei'].groupby('WellID').agg(Nuclei_AreaShape_Area_Mean=('Nuclei_AreaShape_Area', 'mean'),
                                                                      Nuclei_AreaShape_Area_Std=('Nuclei_AreaShape_Area', 'std'),
                                                                      Nuclei_AreaShape_Compactness_Mean=('Nuclei_AreaShape_Compactness', 'mean'),
                                                                      Nuclei_AreaShape_Compactness_Std=('Nuclei_AreaShape_Compactness', 'std')).reset_index()

# merge aggregated metrics with the biomorph endpoints table
biomorph_well_stats = biomorph.merge(cell_agg_per_well, on='WellID', how='left')
biomorph_well_stats = biomorph_well_stats.merge(nuclei_agg_per_well, on='WellID', how='left')
biomorph_well_stats

Unnamed: 0,PlateID,WellID,Number_of_Cells,CPD_NAME,CPD_SAMPLE_ID,apoptosis up,cytotoxicity BLA,cytotoxicity SRB,ER stress,heat shock,...,total_endpoints,endpoint_combination,Cells_AreaShape_Area_Mean,Cells_AreaShape_Area_Std,Cells_AreaShape_Compactness_Mean,Cells_AreaShape_Compactness_Std,Nuclei_AreaShape_Area_Mean,Nuclei_AreaShape_Area_Std,Nuclei_AreaShape_Compactness_Mean,Nuclei_AreaShape_Compactness_Std
0,24278,a03,434,olmesartan medoxomil,SA59556,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2905.286538,1549.740928,1.418624,0.366025,785.850962,301.191691,1.134156,0.118731
1,24278,a06,372,citropten,SA59278,0,0,0,0,0,...,1,"(0, 0, 0, 0, 0, 0, 1, 0, 0)",3025.858862,1468.401471,1.397274,0.351147,784.751641,301.440508,1.126503,0.106892
2,24278,a09,418,bromperidol,SA83338,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2975.467480,1627.964996,1.335697,0.283565,866.280488,370.335688,1.114013,0.099582
3,24278,a10,358,leflunomide,SA792771,1,1,0,1,1,...,5,"(1, 1, 0, 1, 1, 0, 0, 1, 0)",3026.172340,1732.385448,1.359984,0.350452,844.092553,347.917417,1.124097,0.112744
4,24278,a11,384,suxibuzone,SA58544,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2673.789668,1350.417949,1.338160,0.303323,820.436347,337.929728,1.120367,0.111256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,24279,p13,559,chlorpropamide,SA58227,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",3172.182703,1781.926680,1.325406,0.268323,863.526486,348.114797,1.116706,0.107185
194,24279,p14,611,nicorandil,SA83824,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2826.798303,1424.674478,1.322074,0.255763,841.606975,309.754558,1.120756,0.100744
195,24279,p17,661,prednisolone,SA83046,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2925.992747,1398.597473,1.367770,0.303513,822.224841,284.222039,1.129370,0.109216
196,24279,p21,580,chlorzoxazone,SA82767,0,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",3070.371429,1685.035718,1.318786,0.304451,887.367488,353.520299,1.116026,0.116439


## Field Statistics

Because the approach above resulted in less-than-ideal number of instances, let's aggregate for each FOV instead of each well:

In [93]:
# populate the endpoint dataframe with all available fields of each well
biomorph_field_stats = biomorph.merge(dataframes['Cells'][['PlateID', 'WellID', 'FieldID']].drop_duplicates(), 
                     on=['PlateID', 'WellID'], 
                     how='left')
biomorph_field_stats = move_column(df=biomorph_field_stats, col_name='FieldID', new_position=2)

# calculate mean and standard deviation for 'Cells_AreaShape_Area' and 'Cells_AreaShape_Compactness' in cells_df by FieldID
cell_agg_per_field = dataframes['Cells'].groupby(['PlateID', 'WellID', 'FieldID']).agg(
    Cells_AreaShape_Area_Mean=('Cells_AreaShape_Area', 'mean'),
    Cells_AreaShape_Area_Std=('Cells_AreaShape_Area', 'std'),
    Cells_AreaShape_Compactness_Mean=('Cells_AreaShape_Compactness', 'mean'),
    Cells_AreaShape_Compactness_Std=('Cells_AreaShape_Compactness', 'std')
).reset_index()

# merge the aggregated metrics from cells_df into endpoint dataframe
biomorph_field_stats = biomorph_field_stats.merge(cell_agg_per_field, on=['PlateID', 'WellID', 'FieldID'], how='left')

# calculate mean and standard deviation for 'Cells_AreaShape_Area' and 'Cells_AreaShape_Compactness' in nuclei_df by FieldID
nuclei_agg_per_field = dataframes['Nuclei'].groupby(['PlateID', 'WellID', 'FieldID']).agg(
    Nuclei_AreaShape_Area_Mean=('Nuclei_AreaShape_Area', 'mean'),
    Nuclei_AreaShape_Area_Std=('Nuclei_AreaShape_Area', 'std'),
    Nuclei_AreaShape_Compactness_Mean=('Nuclei_AreaShape_Compactness', 'mean'),
    Nuclei_AreaShape_Compactness_Std=('Nuclei_AreaShape_Compactness', 'std')).reset_index()

# merge the aggregated metrics from nuclei_df into endpoint dataframe
biomorph_field_stats = biomorph_field_stats.merge(nuclei_agg_per_field, on=['PlateID', 'WellID', 'FieldID'], how='left')

# add the cell counts for each field of each well
biomorph_field_stats = biomorph_field_stats.drop('Number_of_Cells', axis=1)
biomorph_field_stats = biomorph_field_stats.merge(cells_per_field, on=['PlateID', 'WellID', 'FieldID'], how='left')
biomorph_field_stats = move_column(df=biomorph_field_stats, col_name='Number_of_Cells', new_position=3)
biomorph_field_stats

Unnamed: 0,PlateID,WellID,FieldID,Number_of_Cells,CPD_NAME,CPD_SAMPLE_ID,apoptosis up,cytotoxicity BLA,cytotoxicity SRB,ER stress,...,total_endpoints,endpoint_combination,Cells_AreaShape_Area_Mean,Cells_AreaShape_Area_Std,Cells_AreaShape_Compactness_Mean,Cells_AreaShape_Compactness_Std,Nuclei_AreaShape_Area_Mean,Nuclei_AreaShape_Area_Std,Nuclei_AreaShape_Compactness_Mean,Nuclei_AreaShape_Compactness_Std
0,24278,a03,s1,97,olmesartan medoxomil,SA59556,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2697.989691,2184.240555,1.362183,0.320399,801.216495,353.681714,1.122623,0.097678
1,24278,a03,s2,110,olmesartan medoxomil,SA59556,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2446.090909,1270.143419,1.369504,0.283888,798.418182,348.722502,1.133376,0.111364
2,24278,a03,s3,49,olmesartan medoxomil,SA59556,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",3826.408163,2494.539377,1.454695,0.437254,854.448980,374.762095,1.136081,0.106685
3,24278,a03,s4,74,olmesartan medoxomil,SA59556,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",3483.702703,2104.109617,1.546692,0.533525,821.567568,385.582643,1.148037,0.172811
4,24278,a03,s5,61,olmesartan medoxomil,SA59556,0,0,0,0,...,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",3414.852459,1483.216065,1.573668,0.464460,739.688525,224.409042,1.110631,0.080053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1176,24279,p24,s2,95,desoxycortone,SA83656,0,0,0,0,...,2,"(0, 0, 0, 0, 1, 0, 0, 1, 0)",2957.989474,1261.509477,1.345114,0.348745,840.852632,243.283740,1.118499,0.099514
1177,24279,p24,s3,116,desoxycortone,SA83656,0,0,0,0,...,2,"(0, 0, 0, 0, 1, 0, 0, 1, 0)",2349.034483,736.978508,1.274119,0.229126,828.474138,244.858982,1.138506,0.107688
1178,24279,p24,s4,90,desoxycortone,SA83656,0,0,0,0,...,2,"(0, 0, 0, 0, 1, 0, 0, 1, 0)",2702.655556,1189.304661,1.396498,0.420630,782.822222,245.917479,1.129670,0.105230
1179,24279,p24,s5,141,desoxycortone,SA83656,0,0,0,0,...,2,"(0, 0, 0, 0, 1, 0, 0, 1, 0)",2018.617021,1044.170119,1.302185,0.223155,778.475177,363.424891,1.117391,0.110494


## Compound & Endpoint Combination

In [94]:
print(f"The number of samples with no endpoint activity: {biomorph[biomorph['total_endpoints'] == 0].shape[0]}")
print(f"The number of samples with a singular endpoint activity: {biomorph[biomorph['total_endpoints'] == 1].shape[0]}")
print(f"The number of samples with a multiple endpoint activities: {biomorph[biomorph['total_endpoints'] > 1].shape[0]}")

The number of samples with no endpoint activity: 110
The number of samples with a singular endpoint activity: 40
The number of samples with a multiple endpoint activities: 48


In [95]:
# retrieve the occurrence counts of endpoint activities
endpoint_combination_counts = biomorph['endpoint_combination'].value_counts()

# group by endpoint combination and then count unique compounds in each group
endpoint_compound_counts = biomorph.groupby('endpoint_combination')['CPD_NAME'].nunique()
endpoint_compound_counts = endpoint_compound_counts.sort_values(ascending=False)

print(f'Endpoint Combination Counts:\n{endpoint_combination_counts}\n')
print(f'Endpoint Compound Counts:\n{endpoint_compound_counts}')

Endpoint Combination Counts:
endpoint_combination
(0, 0, 0, 0, 0, 0, 0, 0, 0)    110
(0, 1, 0, 0, 0, 0, 0, 0, 0)     14
(1, 0, 0, 0, 0, 0, 0, 0, 0)     14
(1, 0, 1, 1, 0, 0, 0, 1, 1)      4
(0, 0, 0, 0, 0, 0, 1, 0, 0)      4
(0, 0, 0, 0, 0, 0, 0, 1, 0)      4
(1, 1, 0, 1, 1, 0, 0, 0, 0)      4
(1, 1, 1, 1, 1, 0, 1, 1, 1)      4
(1, 0, 0, 0, 1, 0, 0, 0, 0)      4
(1, 1, 0, 0, 0, 0, 0, 0, 0)      2
(1, 1, 1, 1, 1, 0, 0, 1, 1)      2
(1, 0, 1, 0, 0, 0, 1, 1, 1)      2
(0, 0, 0, 1, 1, 0, 0, 1, 0)      2
(1, 0, 0, 1, 0, 0, 0, 0, 0)      2
(0, 0, 0, 0, 0, 0, 0, 0, 1)      2
(0, 0, 1, 0, 0, 0, 1, 1, 1)      2
(0, 0, 0, 1, 0, 0, 0, 0, 0)      2
(1, 1, 0, 1, 0, 0, 1, 1, 1)      2
(1, 0, 0, 0, 0, 0, 1, 1, 1)      2
(1, 1, 0, 0, 1, 0, 0, 0, 0)      2
(0, 1, 0, 1, 1, 0, 0, 0, 0)      2
(1, 0, 0, 1, 1, 0, 0, 0, 0)      2
(0, 0, 0, 0, 0, 0, 1, 0, 1)      2
(1, 0, 1, 1, 1, 1, 1, 1, 1)      2
(1, 0, 0, 0, 0, 1, 0, 1, 1)      2
(1, 1, 0, 1, 1, 0, 0, 1, 0)      2
(0, 0, 0, 0, 1, 0, 0, 1, 0)      2
Name:

In [96]:
# all of the compounds only results in a single endpoint combination
for eachCompound in biomorph['CPD_NAME'].unique():
    if len(biomorph[biomorph['CPD_NAME'] == eachCompound]['endpoint_combination'].unique()) != 1:
        print(f"{eachCompound} has multiple occurring endpoint combinations")

In [97]:
# calculate total occurrences of each compound
compound_counts = biomorph['CPD_NAME'].value_counts().reset_index()
compound_counts.columns = ['CPD_NAME', 'Total_Occurrences']

# find the most frequent endpoint combination for each compound
most_common_combinations = biomorph.groupby(['CPD_NAME', 'endpoint_combination']).size().reset_index(name='Count')
most_common_combinations = most_common_combinations.sort_values(['CPD_NAME', 'Count'], ascending=[True, False])

# get the top combination for each compound
most_common_combinations = most_common_combinations.drop_duplicates(subset='CPD_NAME', keep='first')

# merge with total occurrences and calculate decimal percentage
compound_summary = pd.merge(compound_counts, most_common_combinations, on='CPD_NAME')
compound_summary['Decimal_Percentage'] = compound_summary['Count'] / compound_summary['Total_Occurrences']

# rename columns for clarity
compound_summary.columns = ['CPD_NAME', 'Total_Occurrences', 'Most_Common_Combination', 'Occurrence_with_Combination', 'Decimal_Percentage']

# sort by total occurrences in descending order
compound_summary = compound_summary.sort_values(by='Total_Occurrences', ascending=False).reset_index(drop=True)
compound_summary

Unnamed: 0,CPD_NAME,Total_Occurrences,Most_Common_Combination,Occurrence_with_Combination,Decimal_Percentage
0,olmesartan medoxomil,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
1,etofylline,2,"(1, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
2,bromperidol,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
3,leflunomide,2,"(1, 1, 0, 1, 1, 0, 0, 1, 0)",2,1.0
4,suxibuzone,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
...,...,...,...,...,...
94,chlorpropamide,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
95,nicorandil,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
96,prednisolone,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0
97,chlorzoxazone,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0)",2,1.0


## Variance Analysis

**What's our problem?**
- We need to understand whether **every site for a well** is as representative for the *end-points* as the next, or if there’s a good deal of heterogeneity if we were to consider the data on a site-level vs a well level.
- Therefore, the goal is to measure **heterogeneity across fields** within each well.
- If there is high variability across fields, it suggests that a single well-level average may not capture all the underlying variability, which could justify a more granular analysis at the site level.

**What are the options?**
1. I can use **ANOVA (Analysis of Variance)** test to see whether there are statistically significant differences in the means of a variable (in this case, cell metrics like area) across multiple groups (fields within each well).
    - I could have compared the cell-feature means of each field inside a well, but then I'd have very few number of instances (at most 6) for statistical tests.
    - That's why I directly used single-cell metrics instead of field-level aggregates for ANOVA, which helps to improve the statistical power of the test.
    - ***Adding endpoint/compound into the test***: Since the endpoint combination is the same across all fields in each well, it won’t vary within the well. This makes it less relevant for ANOVA, as I'm not *comparing across different endpoint labels* but rather *testing consistency within the fields assigned the same endpoint*.
        - If, however, **significant heterogeneity within fields** for wells labeled with the same endpoint combination is observed, this would indicate that some fields may not represent that endpoint as consistently as others. In such cases, I should follow up by **examining whether certain endpoint combinations show more heterogeneity** across fields than others, which could highlight specific conditions that are less consistent across fields.
    - **Interpretation**: Low p-value (< 0.05) indicates statistically significant differences in a feature between fields within a well. This suggests that field-level heterogeneity exists, supporting the need for site- or single-cell level analysis. High p-value suggests that field-level averages are more similar, implying less heterogeneity across fields within the well.

#### ANOVA: Single Feature

In [120]:
# initialize a list for the results
results = []

# iterate over each (unique) PlateID and WellID combination
for plate_id, well_id in dataframes['Cells'][['PlateID', 'WellID']].drop_duplicates().values:

    # filter the current PlateID & WellID combination
    well_data = dataframes['Cells'][(dataframes['Cells']['PlateID'] == plate_id) & 
                                    (dataframes['Cells']['WellID'] == well_id)]
    
    # get area measurements and sample size (cell count)
    area_values_per_field = [
        well_data[well_data['FieldID'] == field]['Cells_AreaShape_Area'].dropna().values 
        for field in well_data['FieldID'].unique()]
    
    # cell count for each field
    cell_counts_per_field = [len(values) for values in area_values_per_field]  
    
    # check if we have more than one field with data to perform ANOVA
    if len(area_values_per_field) > 1:
        # Perform one-way ANOVA
        p_value = f_oneway(*area_values_per_field).pvalue
        total_cell_count = sum(cell_counts_per_field)
        
        # add results to the list
        results.append({
            'PlateID': plate_id,
            'WellID': well_id,
            'p_value': p_value,
            'total_cell_count': total_cell_count
        })

# convert results to a dataframe for easier handling
anova_results_df = pd.DataFrame(results)
anova_results_df['p_value'] = anova_results_df['p_value'].round(5)
anova_results_df

Unnamed: 0,PlateID,WellID,p_value,total_cell_count
0,24278,a01,0.98912,228
1,24278,a02,0.00000,403
2,24278,a03,0.00002,434
3,24278,a04,0.09431,199
4,24278,a05,0.00000,283
...,...,...,...,...
760,24279,p20,0.00000,602
761,24279,p21,0.00000,580
762,24279,p22,0.00000,725
763,24279,p23,0.00000,351


#### ANOVA: Multiple Features

In [122]:
# list of features for ANOVA analysis input
features = ['Cells_AreaShape_Area', 'Cells_AreaShape_Compactness', 'Nuclei_AreaShape_Area', 'Nuclei_AreaShape_Compactness']

# list to collect results
results = []

# parse through each well of inside each plate
for plate_id, well_id in dataframes['Cells'][['PlateID', 'WellID']].drop_duplicates().values:

    # take a subset of the current plate and well
    well_data_cells = dataframes['Cells'][(dataframes['Cells']['PlateID'] == plate_id) & 
                                          (dataframes['Cells']['WellID'] == well_id)]
    well_data_nuclei = dataframes['Nuclei'][(dataframes['Nuclei']['PlateID'] == plate_id) & 
                                            (dataframes['Nuclei']['WellID'] == well_id)]
    
    # do it for each feature one by one
    for feature in features:

        # gather feature values for each field, depending on the type of feature
        if feature.startswith('Cells'):
            values_per_field = [well_data_cells[well_data_cells['FieldID'] == field][feature].dropna().values 
                                for field in well_data_cells['FieldID'].unique()]
        elif feature.startswith('Nuclei'):
            values_per_field = [well_data_nuclei[well_data_nuclei['FieldID'] == field][feature].dropna().values 
                                for field in well_data_nuclei['FieldID'].unique()]
        
        # only proceed if there are multiple fields with data
        if len(values_per_field) > 1:

            # perform ANOVA and calculate p-value
            p_value = f_oneway(*values_per_field).pvalue
            total_cell_count = sum(len(values) for values in values_per_field)

            # append the result for this feature
            results.append({
                'PlateID': plate_id,
                'WellID': well_id,
                'Feature': feature,
                'p_value': p_value,
                'total_cell_count': total_cell_count
            })

# convert the results to dataframe
anova_results_df = pd.DataFrame(results)
anova_results_df['p_value'] = anova_results_df['p_value'].round(5)
anova_results_df

Unnamed: 0,PlateID,WellID,Feature,p_value,total_cell_count
0,24278,a01,Cells_AreaShape_Area,0.98912,228
1,24278,a01,Cells_AreaShape_Compactness,0.65685,228
2,24278,a01,Nuclei_AreaShape_Area,0.74065,228
3,24278,a01,Nuclei_AreaShape_Compactness,0.23325,228
4,24278,a02,Cells_AreaShape_Area,0.00000,403
...,...,...,...,...,...
3055,24279,p23,Nuclei_AreaShape_Compactness,0.78866,351
3056,24279,p24,Cells_AreaShape_Area,0.00000,614
3057,24279,p24,Cells_AreaShape_Compactness,0.00000,614
3058,24279,p24,Nuclei_AreaShape_Area,0.49722,614


### MANOVA & Normality Testing

Although **MANOVA** has **multivariate normality** requirement, which means that the distribution of each feature and all combinations of features should be normally distributed within each group (in your case, within each field in a well). This assumption ensures accurate results in MANOVA, as it affects the validity of p-values.

This requirement can be tested by (i) Shapiro-Wilk or Kolmogorov-Smirnov Test, and (ii) Mardia’s Test:
- **Shapiro-Wilk or Kolmogorov-Smirnov Test**: Tests each feature individually within each field. It tests normality, but don’t capture interactions between features.
- **Mardia’s Test**: More suited for testing multivariate normality across features simultaneously, making it ideal before applying MANOVA.

In [147]:
# list to store MANOVA results
manova_results = []

# parse through each well & plate combination
for plate_id, well_id in dataframes['Cells'][['PlateID', 'WellID']].drop_duplicates().values:

    # subset the respective cell and nuclei data
    well_data_cells = dataframes['Cells'][(dataframes['Cells']['PlateID'] == plate_id) & 
                                          (dataframes['Cells']['WellID'] == well_id)]
    well_data_nuclei = dataframes['Nuclei'][(dataframes['Nuclei']['PlateID'] == plate_id) & 
                                            (dataframes['Nuclei']['WellID'] == well_id)]
    
    # merge cell and nuclei data but with outer join
    well_data = well_data_cells.merge(well_data_nuclei, on=['PlateID', 'WellID', 'FieldID', 'ImageNumber', 'ObjectNumber'], how='outer')
    
    # now that we have a specific well, iterate over each field within this well
    for field_id in well_data['FieldID'].unique():

        # subset the feature data
        field_data = well_data[well_data['FieldID'] == field_id][[
            'Cells_AreaShape_Area', 'Cells_AreaShape_Compactness',
            'Nuclei_AreaShape_Area', 'Nuclei_AreaShape_Compactness']].dropna()
        
        if len(field_data) > 3:
            # execute Mardia's multivariate normality test for feature data
            mardia_test = pg.multivariate_normality(field_data, alpha=0.05)

            # extract normality result
            mardia_normal = mardia_test[0]  # TRUE if normal
            mardia_p_value = mardia_test[1]  # p-value from Mardia's test
            
            # check if multivariate normality is met
            if mardia_normal:
                
                # perform MANOVA with FieldID as the grouping factor
                manova = MANOVA.from_formula('Cells_AreaShape_Area + Cells_AreaShape_Compactness + Nuclei_AreaShape_Area + Nuclei_AreaShape_Compactness ~ FieldID', data=well_data)
                manova_result = manova.mv_test()

                # IMPORTANT:
                # Mardia test is applied to each field to understand 
                # whether any of the fields inside the well is not suited 
                # for multivariate normality, but MANOVA test is performed for each well (not each field).
                
                # extract MANOVA p-value (for example, using Wilks' Lambda)
                manova_p_value = manova_result.results['FieldID']['stat']['Pr > F'].iloc[0]
            
            # set to None if multivariate normality is not met
            else:
                manova_p_value = None 

            # append combined results to the list
            manova_results.append({
                'PlateID': plate_id,
                'WellID': well_id,
                'FieldID': field_id,
                'Mardia_H_normal': mardia_normal,
                'Mardia_p_value': mardia_p_value,
                'MANOVA_p_value': manova_p_value
            })

# convert results to dataframe
manova_results_df = pd.DataFrame(manova_results)
manova_results_df

Unnamed: 0,PlateID,WellID,FieldID,Mardia_H_normal,Mardia_p_value,MANOVA_p_value
0,24278,a01,s1,2.968566,6.189672e-24,6.121132e-02
1,24278,a01,s2,1.410125,2.554677e-06,6.121132e-02
2,24278,a01,s3,1.258537,5.721446e-05,6.121132e-02
3,24278,a01,s4,3.555454,5.979282e-30,6.121132e-02
4,24278,a01,s5,0.883602,5.037093e-02,6.121132e-02
...,...,...,...,...,...,...
4540,24279,p24,s2,2.926578,2.919691e-25,4.491441e-36
4541,24279,p24,s3,2.167438,4.386713e-16,4.491441e-36
4542,24279,p24,s4,4.362480,2.199671e-41,4.491441e-36
4543,24279,p24,s5,7.229774,1.007719e-75,4.491441e-36


# 3. Classifier Modeling

- Since each endpoint represents a separate type of activity, building ***separate classifiers*** for each endpoint (like apoptosis up) is a reasonable approach. This way, each classifier can be optimized for the characteristics of its specific endpoint. However, if these endpoints are often activated together, a multi-label classification model could be beneficial, as it learns patterns across the endpoints simultaneously. **Next: The potential for multi-label will be investigated.**

- Including ***compound names*** could be useful since certain compounds are known to induce specific responses, but encoding must be done carefully. **Next: Categorical data encoding will be investigated.**
- Scaling using ***StandardScaler*** is done to mitigate potential sensitivity problems that might occur in feature-scale-sensitive model types. Random forest is not one of them, but still it is good practice.
- Since each WellID has multiple FieldIDs, treating them as groups using ***StratifiedGroupKFold*** ensures that fields from the same well don’t appear in both training and test sets, preventing data leakage and preserving the independence of samples.

In [55]:
# list of predictor columns
predictor_columns = [
    'Number_of_Cells', 'Cells_AreaShape_Area_Mean', 'Cells_AreaShape_Area_Std',
    'Cells_AreaShape_Compactness_Mean', 'Cells_AreaShape_Compactness_Std',
    'Nuclei_AreaShape_Area_Mean', 'Nuclei_AreaShape_Area_Std',
    'Nuclei_AreaShape_Compactness_Mean', 'Nuclei_AreaShape_Compactness_Std'
]

X = biomorph[predictor_columns]
y = biomorph['apoptosis up']
groups = biomorph['WellID']  # grouping by WellID to ensure fields stay together in splits

# initialize StratifiedGroupKFold
skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# initialize random forest pipeline with scaling included
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('classifier', RandomForestClassifier(random_state=42))])

# list of scores for evaluation
accuracies = []
roc_aucs = []

# start the stratification of group cross-validation
for train_index, test_index in skf.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # fit the pipeline (that includes both scaler and classifier) on training data
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    accuracies.append(accuracy)
    roc_aucs.append(roc_auc)

print(f"Accuracy List: {accuracies}")
print(f"ROC AUC List: {roc_aucs}")

print(f"Mean Accuracy: {sum(accuracies) / len(accuracies):.2f}")
print(f"Mean ROC AUC: {sum(roc_aucs) / len(roc_aucs):.2f}")

Accuracy List: [0.6293103448275862, 0.6916666666666667, 0.7017543859649122, 0.7142857142857143, 0.8245614035087719]
ROC AUC List: [0.5326704545454546, 0.5659722222222221, 0.6282051282051282, 0.5429526748971193, 0.5367476851851851]
Mean Accuracy: 0.71
Mean ROC AUC: 0.56
