# This is a tutorial to evaluate the profiles of Cell Painting data

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.colors import ListedColormap, BoundaryNorm, Normalize
# Restore any missing NumPy scalar alias (np.float, np.int, np.bool, np.str)
# before cytominer_eval is imported, to avoid the numpy/cytominer_eval
# AttributeError. Aliases that already exist are left untouched.
for _alias, _builtin in (("float", float), ("int", int), ("bool", bool), ("str", str)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)
from cytominer_eval import evaluate
from copairs import map
from copairs.matching import assign_reference_index

In [None]:
# Read the profile table (metadata + feature columns) produced upstream.
df_level4b = pd.read_csv(filepath_or_buffer="../outputs/df_level4b.csv")

In [44]:
# Arguments shared by the evaluation calls below.
# Columns prefixed with "Metadata_" are metadata; every other column is
# treated as a feature.
meta_features = [col for col in df_level4b.columns if col.startswith("Metadata_")]
features = [col for col in df_level4b.columns if col not in meta_features]
replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]
# Distinct plate identifiers, used by the optional per-plate computations.
plates = df_level4b["Metadata_Plate"].unique()

# Evaluate replicates

In [45]:
# Replicate reproducibility: profiles sharing the same Metadata_JCP2022 value
# are treated as replicates and compared with Pearson correlation.
# The call returns two objects because the median-correlation table is
# requested alongside the summary score.
# NOTE(review): the 0.95 quantile is presumably taken on the non-replicate
# correlation distribution - confirm against the cytominer_eval docs.
percent_matching_chem, corr_df_chem = evaluate(
    profiles=df_level4b,
    features=features,
    meta_features=meta_features,
    replicate_groups=["Metadata_JCP2022"],
    operation="replicate_reproducibility",
    similarity_metric="pearson",
    replicate_reproducibility_quantile=0.95,
    replicate_reproducibility_return_median_cor=True,
)

In [None]:
# Export the per-group median correlation table.
# NOTE(review): the index is written here, unlike the other exports in this
# notebook which pass index=False - confirm this is intended.
corr_df_chem.to_csv(path_or_buf="../outputs/correlation.csv")

# Evaluate grit

In [48]:
# Build a per-profile identifier by joining the source, plate and well values
# with underscores ("<source>_<plate>_<well>").
_id_parts = [
    df_level4b[_column].astype(str)
    for _column in ("Metadata_Source", "Metadata_Plate", "Metadata_Well")
]
df_level4b["Metadata_Source_Plate_Well"] = (
    _id_parts[0] + "_" + _id_parts[1] + "_" + _id_parts[2]
)

In [49]:
# Register the new identifier column as metadata so the evaluate() calls that
# receive `meta_features` see it as a metadata column.
# The membership check keeps the list free of duplicates when this cell is
# re-run, or when `meta_features` was rebuilt after the column was created.
if "Metadata_Source_Plate_Well" not in meta_features:
    meta_features.append("Metadata_Source_Plate_Well")

In [50]:
# Column roles for the grit computation: each profile is identified by its
# source/plate/well id and grouped with its replicates by Metadata_JCP2022.
grit_replicate_groups = {
    "profile_col": "Metadata_Source_Plate_Well",
    "replicate_group_col": "Metadata_JCP2022",
}

# Set to True to compute grit plate by plate, each plate against its own DMSO
# wells. The default computes grit once over the whole dataset.
GRIT_PER_PLATE = False

grit_scores = []
if GRIT_PER_PLATE:
    for _plate in plates:
        plate_profiles = df_level4b[df_level4b["Metadata_Plate"] == _plate]
        # Control perturbations: ids of the DMSO wells on this plate.
        control_wells_grit = (
            plate_profiles.query("Metadata_Name == 'DMSO'")
            .Metadata_Source_Plate_Well.unique()
            .tolist()
        )
        grit_results = evaluate(
            profiles=plate_profiles,
            features=features,
            meta_features=meta_features,
            replicate_groups=grit_replicate_groups,
            operation="grit",
            similarity_metric="pearson",
            grit_replicate_summary_method="median",
            grit_control_perts=control_wells_grit,
        ).assign(plate=_plate)
        # Collect every plate's result (inside the loop), not just the last one.
        grit_scores.append(grit_results)
    grit_scores_df = pd.concat(grit_scores).reset_index(drop=True)
else:
    # Control perturbations: ids of every DMSO well in the dataset.
    control_wells_grit = (
        df_level4b.query("Metadata_Name == 'DMSO'")
        .Metadata_Source_Plate_Well.unique()
        .tolist()
    )
    grit_results = evaluate(
        profiles=df_level4b,
        features=features,
        meta_features=meta_features,
        replicate_groups=grit_replicate_groups,
        operation="grit",
        similarity_metric="pearson",
        grit_replicate_summary_method="median",
        grit_control_perts=control_wells_grit,
    )
    grit_scores_df = grit_results


In [None]:
# Export the grit scores without the DataFrame index.
grit_scores_df.to_csv(path_or_buf="../outputs/grit.csv", index=False)

# Evaluate mAP : Activity of perturbation

In [53]:
# By plates
# Optional per-plate variant of the activity mAP computation, disabled by
# default. When enabled it fills activity_ap_df / activity_map_df with the
# concatenated per-plate results; note that the whole-dataset cell that
# follows assigns the same two names.
MAP_PER_PLATE = False
if MAP_PER_PLATE:
    reference_col = "Metadata_reference_index"

    # positive pairs are replicates of the same treatment
    pos_sameby = ["Metadata_JCP2022", reference_col]
    pos_diffby = []
    neg_sameby = []
    # negative pairs are replicates of different treatments
    neg_diffby = ["Metadata_JCP2022", reference_col]

    activity_ap_list = []
    activity_map_list = []
    # NOTE(review): a plate without valid pairs may make copairs raise -
    # confirm and handle explicitly if per-plate mode is used.
    for _plate in plates:
        df_activity = assign_reference_index(
            df_level4b[df_level4b["Metadata_Plate"] == _plate],
            "Metadata_Name == 'DMSO'",  # condition to get reference profiles (neg controls)
            reference_col=reference_col,
            default_value=-1,
        )

        metadata = df_activity.filter(regex="^Metadata")
        profiles = df_activity[features].values

        activity_ap = map.average_precision(
            metadata, profiles, pos_sameby, pos_diffby, neg_sameby, neg_diffby
        )
        activity_ap = activity_ap.query("Metadata_Name != 'DMSO'")  # remove DMSO
        activity_ap_list.append(activity_ap)

        activity_map = map.mean_average_precision(
            activity_ap, pos_sameby, null_size=1000000, threshold=0.05, seed=0
        )
        activity_map_list.append(activity_map)
        print("Pairs found in plate :", _plate)
    print('Done.')

    activity_ap_df = pd.concat(activity_ap_list).reset_index(drop=True)
    activity_map_df = pd.concat(activity_map_list).reset_index(drop=True)
    # Add the transformed p-value to the concatenated table, so every plate
    # gets the column (not only the last plate's intermediate result).
    activity_map_df["-log10(p-value)"] = -np.log10(activity_map_df["corrected_p_value"])

In [54]:
# Activity mAP over the whole dataset.
reference_col = "Metadata_reference_index"

# Pair definitions passed to copairs.
# Positive pairs: replicates of the same treatment.
pos_sameby = ["Metadata_JCP2022", reference_col]
pos_diffby = []
# Negative pairs: replicates of different treatments.
neg_sameby = []
neg_diffby = ["Metadata_JCP2022", reference_col]

# Tag the profiles using the DMSO wells as reference (negative controls);
# the result is stored in `reference_col`, with -1 as the default value.
df_activity = assign_reference_index(
    df_level4b,
    "Metadata_Name == 'DMSO'",
    reference_col=reference_col,
    default_value=-1,
)

# Split the table into its metadata columns and the feature matrix.
metadata = df_activity.filter(regex="^Metadata")
profiles = df_activity[features].values

activity_ap = map.average_precision(
    metadata, profiles, pos_sameby, pos_diffby, neg_sameby, neg_diffby
)
# Drop the DMSO rows before aggregating.
activity_ap_df = activity_ap.query("Metadata_Name != 'DMSO'")

activity_map_df = map.mean_average_precision(
    activity_ap_df, pos_sameby, null_size=1000000, threshold=0.05, seed=0
)

                                                   

In [None]:
# Export the per-profile average precision and the aggregated mAP tables,
# both without the DataFrame index.
activity_ap_df.to_csv(path_or_buf="../outputs/average_precision.csv", index=False)
activity_map_df.to_csv(path_or_buf="../outputs/mean_average_precision.csv", index=False)