# Compare cell size and reads per cell between default NS segmentation and trained model segmentation

#### This notebook summarizes reads per cell and cell size using custom segmentation and NS segmentation to directly compare them

In [1]:
import json
import numpy as np
from PIL import  Image
import io
import base64
import zlib
import cv2
from string import Template
import glob
import os
import tifffile
import math
from skimage import data, segmentation, util, measure
import pandas as pd
from os import listdir
from os.path import isfile, join
from matplotlib import pyplot as plt

## Obtain area of each cell in pixels

In [2]:
patients = ['P51', 'P52', 'P53', 'P56', 'P57', 'P58']

In [3]:
all_pts_dfs = []

for patient in patients:

    # loop through all mask files to get size of each cell
    mask_dir = '/data/Zhaolab/1_AMLCosMx/Final_scripts/2_Segmentation/3_NucMemMerging/' + patient + '_hybrid/labels_predicted_2_15_24/'
    masks = [f for f in listdir(mask_dir) if isfile(join(mask_dir, f))]
    masks.sort()

    all_fov_dfs = []

    for i in range(len(masks)):

        name = masks[i].replace('_normalized_cp_masks.tif', '')
        fov = int(name[-2:])
        # Read image and mask
        mask = tifffile.imread(mask_dir + masks[i])

        # Find centroid and area of each cell
        label_image = mask.astype(int)
        props = measure.regionprops_table(label_image,
                                   properties=['label', 'area', 'centroid'])
        data = pd.DataFrame(props)
        data = data.rename(columns={"centroid-0": "y", "centroid-1": "x"}) # this was corrected
        data['Patient'] = patient
        data['FOV'] = fov
        data['ID'] = data['Patient'] + '_FOV' + data['FOV'].astype(str).str.zfill(2) + '_cell_' + data['label'].astype(str)

        all_fov_dfs.append(data)

    # combine all cell size DFs from given patient    
    AllPt_cells = pd.concat(all_fov_dfs)
    AllPt_cells = AllPt_cells.set_index('ID')

    # read in RBC labels from all 3 timepoints
    tba = pd.read_csv('/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/1_IdentifyRBCs_2channel/' + patient + '/TPA/' + patient + 'A_RBCs.csv')
    tbb = pd.read_csv('/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/1_IdentifyRBCs_2channel/' + patient + '/TPB/' + patient + 'B_RBCs.csv')
    tbc = pd.read_csv('/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/1_IdentifyRBCs_2channel/' + patient + '/TPC/' + patient + 'C_RBCs.csv')
    rbcs = pd.concat([tba, tbb, tbc], ignore_index=True)

    # give each cell unique ID, set as index
    rbcs['ID'] = rbcs['Patient'] + '_FOV' + rbcs['FOV'].astype(str).str.zfill(2) + '_cell_' + rbcs['cell_ID'].astype(str)
    rbcs = rbcs.set_index('ID')

    # add column to identify all cells as RBCs
    rbcs['RBC'] = 1

    # merge RBC DF with all cell DF (0 means not RBC, 1 means RBC)
    AllPt_cells = AllPt_cells.merge(rbcs['RBC'], how='left', left_index=True, right_index=True)
    AllPt_cells['RBC'] = AllPt_cells['RBC'].fillna(0)
    AllPt_cells['RBC'] = AllPt_cells['RBC'].astype(int)
    
    # save single patient output to list
    all_pts_dfs.append(AllPt_cells)

In [4]:
# concatenate into one table for all patients
all_pt_data = pd.concat(all_pts_dfs)
all_pt_data

Unnamed: 0_level_0,label,area,y,x,Patient,FOV,RBC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P51_FOV01_cell_1,1,903,15.766334,74.070875,P51,1,0
P51_FOV01_cell_2,2,1122,14.835116,117.772727,P51,1,0
P51_FOV01_cell_3,3,2034,20.265487,218.211898,P51,1,0
P51_FOV01_cell_4,4,2534,41.141279,962.670481,P51,1,0
P51_FOV01_cell_5,5,1006,16.085487,1059.072565,P51,1,0
...,...,...,...,...,...,...,...
P58_FOV23_cell_7971,7971,596,3640.771812,4117.850671,P58,23,0
P58_FOV23_cell_7972,7972,359,3641.122563,4992.493036,P58,23,0
P58_FOV23_cell_7973,7973,188,3642.707447,3546.191489,P58,23,0
P58_FOV23_cell_7974,7974,323,3642.761610,3656.486068,P58,23,0


In [5]:
# get median area of all segmented cells
all_pt_data['area'].median()

1423.0

In [6]:
# get median area of all segmented cells (NO RBCs)
all_pt_data[all_pt_data['RBC'] == 0]['area'].median()

1487.0

## Apply size filter

In [7]:
cells_to_keep = all_pt_data[all_pt_data['area'] > 250]

In [8]:
cells_to_keep

Unnamed: 0_level_0,label,area,y,x,Patient,FOV,RBC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P51_FOV01_cell_1,1,903,15.766334,74.070875,P51,1,0
P51_FOV01_cell_2,2,1122,14.835116,117.772727,P51,1,0
P51_FOV01_cell_3,3,2034,20.265487,218.211898,P51,1,0
P51_FOV01_cell_4,4,2534,41.141279,962.670481,P51,1,0
P51_FOV01_cell_5,5,1006,16.085487,1059.072565,P51,1,0
...,...,...,...,...,...,...,...
P58_FOV23_cell_7966,7966,731,3638.218878,4327.127223,P58,23,0
P58_FOV23_cell_7971,7971,596,3640.771812,4117.850671,P58,23,0
P58_FOV23_cell_7972,7972,359,3641.122563,4992.493036,P58,23,0
P58_FOV23_cell_7974,7974,323,3642.761610,3656.486068,P58,23,0


In [9]:
# get median area of all segmented cells
cells_to_keep['area'].median()

1424.0

In [10]:
# get median area of all segmented cells (NO RBCs)
cells_to_keep[cells_to_keep['RBC'] == 0]['area'].median()

1489.0

In [11]:
# get median area of P51 segmented cells (NO RBCs)
#cells_to_keep_noRBC = cells_to_keep[cells_to_keep['RBC'] == 0]

for patient in patients:
    
    one_pt = cells_to_keep[cells_to_keep['Patient'] == patient]
    print(patient + ' size median (px): ' + str(one_pt['area'].median()))

P51 size median (px): 1478.0
P52 size median (px): 1314.0
P53 size median (px): 1380.0
P56 size median (px): 1759.0
P57 size median (px): 1285.0
P58 size median (px): 1476.0


In [12]:
cells_to_keep['area'].std()

981.1211657936857

## Convert pixels to microns squared

In [13]:
cells_to_keep['area_um2'] = 0.18*0.18*cells_to_keep['area']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cells_to_keep['area_um2'] = 0.18*0.18*cells_to_keep['area']


In [14]:
cells_to_keep

Unnamed: 0_level_0,label,area,y,x,Patient,FOV,RBC,area_um2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P51_FOV01_cell_1,1,903,15.766334,74.070875,P51,1,0,29.2572
P51_FOV01_cell_2,2,1122,14.835116,117.772727,P51,1,0,36.3528
P51_FOV01_cell_3,3,2034,20.265487,218.211898,P51,1,0,65.9016
P51_FOV01_cell_4,4,2534,41.141279,962.670481,P51,1,0,82.1016
P51_FOV01_cell_5,5,1006,16.085487,1059.072565,P51,1,0,32.5944
...,...,...,...,...,...,...,...,...
P58_FOV23_cell_7966,7966,731,3638.218878,4327.127223,P58,23,0,23.6844
P58_FOV23_cell_7971,7971,596,3640.771812,4117.850671,P58,23,0,19.3104
P58_FOV23_cell_7972,7972,359,3641.122563,4992.493036,P58,23,0,11.6316
P58_FOV23_cell_7974,7974,323,3642.761610,3656.486068,P58,23,0,10.4652


In [15]:
# get median area of all segmented cells
cells_to_keep['area_um2'].median()

46.1376

In [17]:
# get median area of P51 segmented cells (NO RBCs)
#cells_to_keep_noRBC = cells_to_keep[cells_to_keep['RBC'] == 0]

for patient in patients:
    
    one_pt = cells_to_keep[cells_to_keep['Patient'] == patient]
    print(patient + ' size median (um2): ' + str(one_pt['area_um2'].median()))

P51 size median (um2): 47.8872
P52 size median (um2): 42.5736
P53 size median (um2): 44.711999999999996
P56 size median (um2): 56.9916
P57 size median (um2): 41.634
P58 size median (um2): 47.822399999999995


In [16]:
cells_to_keep['area_um2'].std()

31.788325771715407

## Get cell sizes from NS segmentation

In [18]:
# loop through all metadata files
NS_metadata_dir = 'NS_cellseg_metadata/'
metafiles = [f for f in listdir(NS_metadata_dir) if isfile(join(NS_metadata_dir, f))]
metafiles.sort()

In [19]:
metafiles

['P51_R1158_S1_metadata_file.csv',
 'P52_R1149_S1_metadata_file.csv',
 'P53_R1149_S2_metadata_file.csv',
 'P56_R1158_S2_metadata_file.csv',
 'P57_R1158_S3_metadata_file.csv',
 'P58_R1149_S3_metadata_file.csv']

In [20]:
allNS_areas = []
for file in metafiles:
    
    patient = file[0:3]
    df = pd.read_csv(NS_metadata_dir + file)
    print(patient + ' size median (px): ' + str(df['Area'].median()))
    allNS_areas += df['Area'].tolist()

P51 size median (px): 1953.0
P52 size median (px): 1762.0
P53 size median (px): 1674.0
P56 size median (px): 2165.0
P57 size median (px): 1691.0
P58 size median (px): 1753.0


In [21]:
np.median(allNS_areas)

1799.0

In [22]:
len(allNS_areas)

593279

In [23]:
df_areas = pd.Series( (v for v in allNS_areas) )

In [24]:
df_areas.std()

839.4392774667988

## Convert pixels to microns squared

In [27]:
df_areas_um = df_areas*0.18*0.18

In [29]:
# get median area of all segmented cells
df_areas_um.median()

58.2876

In [30]:
df_areas_um.std()

27.197832589924285