## Process single cell morphology features for CellProfiler readouts - Comparisons

Compare the merged single-cell output of `pycytominer` and `pycytominer-transform`, using the CellProfiler SQLite output from NF1_SchwannCells_data as input.

In [9]:
import os
import pathlib
import warnings

import pandas as pd
from pycytominer import normalize
from pycytominer.cyto_utils import cells, output
from pycytominer_transform import convert

# Install a global filter whose action is "ignore" (no message, category or
# module restriction), so warnings raised by later cells are not displayed.
warnings.filterwarnings(action="ignore")

In [10]:
# Input locations: everything is read from the CellProfiler pipeline folder.
cp_dir = "../CellProfiler_pipelines"
output_dir = "data"
sql_file = "NF1_data.sqlite"

# The same SQLite database is addressed two ways: as a plain file path and as
# a "sqlite:///" URL built from that path.
single_cell_filepath = "/".join([cp_dir, "Analysis_Output", sql_file])
single_cell_file = "sqlite:///" + single_cell_filepath
platemap_file = "/".join([cp_dir, "Metadata", "platemap_NF1_CP.csv"])

# Output locations for the merged and the normalized single-cell profiles.
sc_output_file = pathlib.Path(output_dir) / "nf1_sc_cellprofiler.csv.gz"
sc_norm_output_file = pathlib.Path(output_dir) / "nf1_sc_norm_cellprofiler.csv.gz"

In [11]:
# Custom linking columns between compartments.
# Outer key: a compartment table; inner key: the compartment it is linked to;
# inner value: the column name used for that link.
linking_cols = dict(
    Per_Cytoplasm=dict(
        Per_Cells="Cytoplasm_Parent_Cells",
        Per_Nuclei="Cytoplasm_Parent_OrigNuclei",
    ),
    Per_Cells=dict(Per_Cytoplasm="Cells_Number_Object_Number"),
    Per_Nuclei=dict(Per_Cytoplasm="Nuclei_Number_Object_Number"),
)

In [12]:
# Read the platemap CSV into a DataFrame and display it.
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Het
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Het
4,E,6,E6,NF1,WT
5,E,7,E7,NF1,Het
6,F,6,F6,NF1,WT
7,F,7,F7,NF1,Het


In [13]:
# Instantiate the pycytominer SingleCells class.
# The arguments are gathered in one mapping so each setting can be read on its
# own line before the object is built.
single_cell_kwargs = {
    # database location, given as the "sqlite:///" URL form
    "sql_file": single_cell_file,
    # compartment tables and the custom columns linking them
    "compartments": ["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    "compartment_linking_cols": linking_cols,
    # image-level table and the columns taken from it
    "image_table_name": "Per_Image",
    "strata": ["Image_Metadata_Well", "Image_Metadata_Plate"],
    "merge_cols": ["ImageNumber"],
    "image_cols": "ImageNumber",
    "load_image_data": True,
}
sc = cells.SingleCells(**single_cell_kwargs)

In [14]:
# pycytominer
# Merge single cells without platemap annotation, exporting to parquet.
# The value returned by merge_single_cells is handed to read_parquet as the
# path, so the frame compared later is the one re-read from the parquet file
# (for precision in comparison) rather than an in-memory result.
pycytominer_merge_path = sc.merge_single_cells(
    sc_output_file="pycytominer_singlecells_merge.parquet",
    output_type="parquet",
)
pycytominer_sc_df_without_annotation = pd.read_parquet(path=pycytominer_merge_path)

pycytominer_sc_df_without_annotation.info()
pycytominer_sc_df_without_annotation.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149 entries, 0 to 148
Columns: 1050 entries, Metadata_ImageNumber to Nuclei_Texture_Variance_RFP_3_03_256
dtypes: float64(1028), int64(20), object(2)
memory usage: 1.2+ MB


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Image_Metadata_Well,Cytoplasm_Number_Object_Number,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,1,1,C6,1,10342.0,35133.0,641.0,370.0,494.0,131.0,...,1778.355949,1715.661141,306.13973,295.581509,310.469726,287.78839,496.084704,502.046808,490.259298,491.171009
1,1,1,C6,2,31676.0,100056.0,1164.0,482.0,900.0,103.0,...,366.696473,320.304744,312.669442,314.123609,330.563627,295.428066,99.874165,100.19489,104.700258,99.916735
2,1,1,C6,3,16007.0,54384.0,360.0,545.0,228.0,133.0,...,356.359632,379.334116,419.277399,366.291857,365.844449,341.137003,104.292865,102.844307,103.764869,103.749468
3,1,1,C6,4,26445.0,90468.0,783.0,526.0,531.0,167.0,...,784.257119,747.557748,390.160802,398.535455,394.923449,359.749244,213.883176,223.214126,225.159172,210.879537
4,4,1,C6,1,7300.0,16500.0,555.0,399.0,390.0,299.0,...,558.440195,495.532894,75.455753,72.548299,75.497862,70.903668,149.088921,146.259081,149.11093,149.976102


In [None]:
# pycytominer-transform
# perform merge without annotation and export
# to parquet format, reading the result
# from the parquet file for comparison
#
# The frame is stored under its own name. Assigning it to
# `pycytominer_sc_df_without_annotation` would overwrite the pycytominer
# result, leaving only one of the two frames this notebook sets out to compare.
pycytominer_transform_sc_df_without_annotation = pd.read_parquet(
    # the value returned by convert() is used as the parquet path to read back
    path=convert(
        # plain file path to the SQLite database (not the "sqlite:///" URL form)
        source_path=single_cell_filepath,
        dest_path="./pycytominer-transform_singlecells_merge.parquet",
        dest_datatype="parquet",
        merge=True,
        merge_chunk_size=100,
        preset="cellprofiler_sqlite",
    )
)
pycytominer_transform_sc_df_without_annotation.info()
pycytominer_transform_sc_df_without_annotation.head()

In [None]:
# Merge single cells across compartments, passing the platemap and the pair
# of columns to join it on.
anno_kwargs = dict(join_on=["Metadata_well_position", "Image_Metadata_Well"])
sc_df = sc.merge_single_cells(platemap=platemap_df, **anno_kwargs)

# Save level 2 data as a csv
output(df=sc_df, output_filename=sc_output_file)

# Report the frame's dimensions, then preview the first rows.
print(sc_df.shape)
sc_df.head()

In [7]:
# Normalize the merged single-cell profiles with the "standardize" method.
normalize_sc_df = normalize(profiles=sc_df, method="standardize")

# Write the normalized profiles to file.
output(df=normalize_sc_df, output_filename=sc_norm_output_file)

# Report the frame's dimensions, then preview the first rows.
print(normalize_sc_df.shape)
normalize_sc_df.head()

(149, 1054)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,4,1,...,3.14154,3.202273,-0.097356,-0.096165,-0.094202,-0.106456,3.337969,3.350528,3.278168,3.310371
1,C,6,NF1,WT,1,1,C6,2,5,2,...,0.315924,0.258633,-0.087971,-0.069493,-0.065539,-0.095377,0.314776,0.31392,0.34842,0.318693
2,C,6,NF1,WT,1,1,C6,3,7,3,...,0.295233,0.383161,0.065251,0.00555,-0.015212,-0.029087,0.348492,0.33394,0.341312,0.347999
3,C,6,NF1,WT,1,1,C6,4,8,4,...,1.151725,1.159965,0.023403,0.051931,0.026268,-0.002094,1.184695,1.243519,1.263751,1.167156
4,C,6,NF1,WT,4,1,C6,1,3,1,...,0.699723,0.628294,-0.428904,-0.416992,-0.429383,-0.420997,0.690298,0.662006,0.685883,0.701466


### Visualize basic count statistics

In [8]:
# Number of single cells per genotype.
sc_df["Metadata_genotype"].value_counts()

Het    116
WT      33
Name: Metadata_genotype, dtype: int64

In [9]:
# Cross-tabulate cell counts: genotype as rows, well as columns.
pd.crosstab(index=sc_df["Metadata_genotype"], columns=sc_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Het,0,12,0,14,0,44,0,46
WT,12,0,5,0,9,0,7,0
