# This is a tutorial to work with cell painting data

Cell Painting data processing comes in different levels:

| Data Description                              | Level  |
| :--------------------------------------------- | :------ |
| Images                                         | Level 1 |
| Single-cell profiles (SQLite)                  | Level 2 |
| Aggregated profiles with metadata information  | Level 3 |
| Normalized aggregated profiles                 | Level 4a |
| Normalized and feature-selected profiles       | Level 4b |
| Consensus profiles                             | Level 5 |

In [235]:
import pandas as pd
from pycytominer import annotate, normalize, feature_select, consensus

# Load data

## Load CellProfiler plates data

In [236]:
# Read the well-level CellProfiler profiles of each plate (one parquet file per
# plate) and stack them into a single DataFrame.
data_list = []
plates = ["BR00127145", "BR00127146", "BR00127147", "BR00127148", "BR00127149"]
for plate in plates:
    data_tmp = pd.read_parquet('../00_input/' + plate + '.parquet')
    print(plate, "has shape", data_tmp.shape)
    data_list.append(data_tmp)
# ignore_index=True renumbers the rows 0..n-1 instead of repeating each plate's own index.
df_cellprofiler = pd.concat(data_list, ignore_index=True)
# Report the shape of the concatenated frame (the original referenced an undefined `df`).
print("Aggregated Data has shape ", df_cellprofiler.shape)

BR00127145 has shape (384, 4765)
BR00127146 has shape (384, 4765)
BR00127147 has shape (382, 4765)
BR00127148 has shape (384, 4765)
BR00127149 has shape (384, 4765)
Aggregated Data has shape  (1918, 4765)


In [237]:
# Preview the first rows: Metadata_* identifier columns followed by the CellProfiler features.
df_cellprofiler.head()

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,source_4,BR00127145,A01,4254.899902,8091.399902,572.119995,583.98999,482.950012,495.049988,527.059998,...,92.257004,89.775002,85.719002,86.375999,85.484001,86.625,87.264,89.302002,86.958,89.915001
1,source_4,BR00127145,A02,4784.100098,8854.400391,589.049988,608.909973,498.410004,514.130005,543.289978,...,96.736,97.602997,92.363998,92.103996,92.021004,92.017998,92.512001,94.092003,92.139999,93.738998
2,source_4,BR00127145,A03,4107.0,7763.700195,582.25,564.059998,494.73999,477.140015,537.960022,...,99.619003,99.348,93.452003,94.043999,93.553001,93.907997,94.482002,97.015999,94.483002,96.976997
3,source_4,BR00127145,A04,4183.600098,7986.299805,599.619995,572.669983,510.799988,485.290009,554.849976,...,109.550003,109.330002,102.18,102.339996,102.010002,102.440002,102.830002,105.480003,102.970001,105.519997
4,source_4,BR00127145,A05,4222.5,7984.399902,568.159973,554.690002,479.619995,465.660004,523.640015,...,113.559998,110.690002,104.790001,105.480003,104.580002,105.410004,106.290001,109.459999,106.32,109.330002


In [238]:
# (rows, columns) of the concatenated plate profiles.
df_cellprofiler.shape

(1918, 4765)

## Load JUMP compounds annotations 

In [239]:
# Compound table: Metadata_JCP2022 compound IDs with their InChIKey, InChI and SMILES.
df_compounds_infos = pd.read_csv("../00_input/compound.csv")

In [240]:
# Preview the first rows of the compound table.
df_compounds_infos.head()

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_SMILES
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...
1,JCP2022_000002,AAAJHRMBUHXWLD-UHFFFAOYSA-N,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,O=C1NCCCN1Cc1ccc(Cl)cc1
2,JCP2022_000004,AAANUZMCJQUYNX-UHFFFAOYSA-N,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2
3,JCP2022_000005,AAAQFGUYHFJNHI-UHFFFAOYSA-N,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...
4,JCP2022_000006,AAAROXVLYNJINN-UHFFFAOYSA-N,"InChI=1S/C16H20N6O/c1-16(2,3)22-13(10-5-6-10)7...",Cn1cc(-c2noc(-c3cc(C4CC4)n(C(C)(C)C)n3)n2)cn1


In [241]:
# (rows, columns) of the compound table.
df_compounds_infos.shape

(115796, 4)

## Load compounds wells links

In [242]:
# Plate map: links each well (Metadata_Source, Metadata_Plate, Metadata_Well) to a Metadata_JCP2022 compound ID.
df_compounds_wells_links = pd.read_csv('../00_input/source4_batch13_target2.csv')

In [243]:
# Preview the first rows of the plate map.
df_compounds_wells_links.head()

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_4,BR00127145,A01,JCP2022_043547
1,source_4,BR00127145,A02,JCP2022_050797
2,source_4,BR00127145,A03,JCP2022_050997
3,source_4,BR00127145,A04,JCP2022_108326
4,source_4,BR00127145,A05,JCP2022_033924


In [244]:
# (rows, columns) of the plate map; the row count should match the number of profiled wells.
df_compounds_wells_links.shape

(1918, 4)

## Load compounds annotations

In [245]:
# Control annotations: perturbation type (negcon/poscon), name and modality for the control compounds.
df_compounds_annotations = pd.read_csv("../00_input/perturbation_control.csv")

In [246]:
# Preview the first rows of the control annotations.
df_compounds_annotations.head()

Unnamed: 0,Metadata_JCP2022,Metadata_pert_type,Metadata_Name,Metadata_modality
0,JCP2022_033924,negcon,DMSO,compound
1,JCP2022_037716,poscon,AMG900,compound
2,JCP2022_025848,poscon,dexamethasone,compound
3,JCP2022_046054,poscon,FK-866,compound
4,JCP2022_035095,poscon,LY2109761,compound


In [247]:
# (rows, columns) of the control annotations.
df_compounds_annotations.shape

(9, 4)

## Merge data

In [248]:
# Attach the compound ID (Metadata_JCP2022) to every well by joining the
# profiles with the plate map on the well coordinates.
well_keys = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
df_cellprofiler_compounds = df_cellprofiler.merge(
    df_compounds_wells_links,
    how="inner",  # keep only wells present in both tables
    on=well_keys,
)

In [249]:
# Sanity check: compare with df_cellprofiler.shape — same row count expected, plus the Metadata_JCP2022 column.
df_cellprofiler_compounds.shape

(1918, 4766)

In [250]:
# Add the chemical identifiers (InChIKey, InChI, SMILES) from df_compounds_infos,
# matching rows on the compound ID Metadata_JCP2022.
df_cellprofiler_compounds_infos = df_cellprofiler_compounds.merge(
    df_compounds_infos,
    how="inner",  # wells whose compound ID is absent from the compound table are dropped
    on=["Metadata_JCP2022"],
)

In [251]:
# Sanity check: an unchanged row count means every well's compound ID was found in the compound table.
df_cellprofiler_compounds_infos.shape

(1918, 4769)

In [252]:
# Add the control annotations (Metadata_pert_type, Metadata_Name, Metadata_modality)
# from df_compounds_annotations, matching rows on the compound ID Metadata_JCP2022.
df_cellprofiler_compounds_infos_annotate = df_cellprofiler_compounds_infos.merge(
    df_compounds_annotations,
    how="left",  # keep every well; compounds missing from the annotation table get NaN in the added columns
    on=["Metadata_JCP2022"],
)

In [253]:
# Sanity check: the left merge must not change the row count; only annotation columns are added.
df_cellprofiler_compounds_infos_annotate.shape

(1918, 4772)

In [254]:
# Level 3 DataFrame: well-level aggregated profiles with metadata annotations.
# Note: this is an alias of the merged frame, not a copy.
df_level3 = df_cellprofiler_compounds_infos_annotate

In [255]:
# (rows, columns) of the level 3 profiles.
df_level3.shape

(1918, 4772)

## Save

In [256]:
# Write the level 3 profiles to CSV; index=False leaves the row index out of the file.
df_level3.to_csv('../02_processed_data/df_level3.csv', index=False)

# Process data

## Aggregate (level 2 --> level 3)
In this JUMP example, the data are already aggregated at the well level (level 3).
If your Cell Painting dataset contains profiles at the single-cell level (level 2), you should first aggregate them with the `aggregate` function of pycytominer.

## Normalize (level 4a)

In [257]:
# Normalize each plate separately (also split by batch if the data span several batches),
# using that plate's DMSO wells as the reference samples.
df_level4a_list = []
for plate in plates:
    plate_mask = df_level3["Metadata_Plate"] == plate
    df_level4a_plate = normalize(
        profiles=df_level3.loc[plate_mask],
        # "infer": features are the CellProfiler columns prefixed with "Cells", "Nuclei" or "Cytoplasm".
        features="infer",
        # "infer": metadata columns are those whose name starts with "Metadata_".
        meta_features="infer",
        samples="Metadata_Name == 'DMSO'",
        method="mad_robustize",
    )
    df_level4a_list.append(df_level4a_plate)
    print("Plate", plate, "normalized")

# Stack the per-plate results back into one frame with a fresh 0..n-1 index.
df_level4a = pd.concat(df_level4a_list).reset_index(drop=True)

Plate BR00127145 normalized
Plate BR00127146 normalized
Plate BR00127147 normalized
Plate BR00127148 normalized
Plate BR00127149 normalized


In [258]:
# Write the level 4a (normalized) profiles to CSV; index=False leaves the row index out of the file.
df_level4a.to_csv('../02_processed_data/df_level4a.csv', index=False)

In [259]:
# Sanity check: the row count should equal df_level3's (one row per well).
df_level4a.shape

(1918, 3683)

## Feature selection (level 4b)

In [260]:
# Feature-selection operations passed to pycytominer's feature_select
# (see the pycytominer documentation for what each operation does and its defaults).
feature_select_opts = [
    "variance_threshold",
    "drop_na_columns",
    "correlation_threshold",
    "blocklist",
    "drop_outliers",
]

# Run the selection on the normalized profiles, using all samples.
df_level4b = feature_select(
    profiles=df_level4a,
    features="infer",
    samples="all",
    operation=feature_select_opts,
)

In [261]:
# Column-count difference before/after selection; this equals the number of dropped
# features assuming feature_select leaves the Metadata_ columns untouched.
n_removed = df_level4a.shape[1] - df_level4b.shape[1]
print(f"There were : {n_removed} features removed")

# Everything that is not a Metadata_ column is a retained feature.
all_selected_features = [col for col in df_level4b.columns if not col.startswith("Metadata_")]
print(f"There are : {len(all_selected_features)} selected features")

There were : 3118 features removed
There are : 555 selected features


In [262]:
# Write the level 4b (normalized + feature-selected) profiles to CSV; index=False leaves the row index out of the file.
df_level4b.to_csv('../02_processed_data/df_level4b.csv', index=False)

In [263]:
# (rows, columns): same wells as level 4a, with the selected features plus the Metadata_ columns.
df_level4b.shape

(1918, 565)

## Consensus signature (level 5)

In [264]:
# Collapse the replicate wells of each compound into a single consensus profile.
df_level5 = consensus(
    profiles=df_level4b,
    replicate_columns=["Metadata_JCP2022"],  # replicate identifier; add the concentration column if you have multiple concentrations
    features="infer",
    operation="modz",
)

  .apply(


In [265]:
# Write the level 5 (consensus) profiles to CSV; index=False leaves the row index out of the file.
df_level5.to_csv('../02_processed_data/df_level5.csv', index=False)

In [266]:
# (rows, columns): one row per distinct Metadata_JCP2022 value.
df_level5.shape

(302, 556)