# Prepare input for Linear Mixed Model (LMM)

This notebook summarizes the proportion of each cell type in rings of increasing radii around a given leukemia cell. To be used as input in running the GLMM.

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import math
import seaborn as sns
import tifffile
from os import listdir
from os.path import isfile, join
from skimage import data, util, measure

In [4]:
thresholds = list(range(0,50,5)) # um distance ranges

In [5]:
# path to cell typing results
pred_location = '/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/4_CelltypesByTranscript/FromGege/celltype_v8-EP.csv'

In [6]:
# path to cell-cell ditance matrices
dist_dir = '/data/Zhaolab/1_AMLCosMx/Final_scripts/5_SpatialAnalysis/1_MeasureCellDists/cell-cell_distances/'

In [7]:
# create dictionary of patient names
name_dirs = {'p1':'P51', 'p2':'P52', 'p3':'P53', 'p4':'P56', 'p5':'P57', 'p6':'P58'}
toGegeID = {'P51':'p1', 'P52':'p2', 'P53':'p3', 'P56':'p4', 'P57':'p5', 'P58':'p6'}

In [8]:
# read in celltype predictions
df = pd.read_csv(pred_location)

# reformat columns
df3 = pd.concat([df['x'], df['Unnamed: 0'].str.split('_', expand=True)], axis=1)
df3 = df3.rename(columns={"x": "celltype_detail", 0: "PtID", 3: "cell_id"})
df3['FOV'] = df3[1].apply(lambda x: x[-2:])
df3['patients'] = df3['PtID'].apply(lambda x: toGegeID[x])
df3 = df3.drop(columns=[1,2])
morph_predicted = df3[['patients', 'celltype_detail', 'FOV', 'cell_id']]
morph_predicted

Unnamed: 0,patients,celltype_detail,FOV,cell_id
0,p1,NoCellAssigned,01,0
1,p1,CD8TeffectorGZMK,01,1
2,p1,MonocytesCD16,01,2
3,p1,Mega,01,3
4,p1,Mega,01,4
...,...,...,...,...
625140,p6,ErythroidProgenitors,23,7962
625141,p6,MatureB,23,7963
625142,p6,ErythroidProgenitors,23,7966
625143,p6,Mega,23,7971


In [9]:
# define types to remove before further analysis
types_to_remove = ['NoCellAssigned','RBC','SmallCell','Unknown']
morph_predicted = morph_predicted[~morph_predicted['celltype_detail'].isin(types_to_remove)]
morph_predicted

Unnamed: 0,patients,celltype_detail,FOV,cell_id
1,p1,CD8TeffectorGZMK,01,1
2,p1,MonocytesCD16,01,2
3,p1,Mega,01,3
4,p1,Mega,01,4
5,p1,NK,01,5
...,...,...,...,...
625140,p6,ErythroidProgenitors,23,7962
625141,p6,MatureB,23,7963
625142,p6,ErythroidProgenitors,23,7966
625143,p6,Mega,23,7971


In [10]:
# load FOV metadata to add timepoint info
FOV_metadata = pd.read_csv('/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/FOV_metadata.csv', index_col=0)
FOV_metadata.loc['P57_FOV17'] = ['C', 'CR'] # add row for P57 FOV17

morph_predicted['Timepoint'] = ''
for i in range(morph_predicted.shape[0]):
    alt_pt_name = name_dirs[morph_predicted['patients'].iloc[i]]
    fov = morph_predicted['FOV'].iloc[i]
    sample = alt_pt_name + '_FOV' + fov
    morph_predicted['Timepoint'].iloc[i] = FOV_metadata['Timepoint'].loc[sample]
    
morph_predicted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  morph_predicted['Timepoint'] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  morph_predicted['Timepoint'].iloc[i] = FOV_metadata['Timepoint'].loc[sample]


Unnamed: 0,patients,celltype_detail,FOV,cell_id,Timepoint
1,p1,CD8TeffectorGZMK,01,1,A
2,p1,MonocytesCD16,01,2,A
3,p1,Mega,01,3,A
4,p1,Mega,01,4,A
5,p1,NK,01,5,A
...,...,...,...,...,...
625140,p6,ErythroidProgenitors,23,7962,C
625141,p6,MatureB,23,7963,C
625142,p6,ErythroidProgenitors,23,7966,C
625143,p6,Mega,23,7971,C


In [11]:
# get cell type names
class_column = 'celltype_detail'
cluster_labels = list(set(morph_predicted[class_column].tolist()))
cluster_labels.sort()

# read colors from csv
colors = pd.read_csv('/data/Zhaolab/1_AMLCosMx/Final_scripts/4_CellTyping/4_CelltypesByTranscript/color_v8.csv', index_col=0)
colors = colors.T
cluster_colors = {}
for celltype in cluster_labels:
    cluster_colors[celltype] = [colors['blue'].loc[celltype], colors['green'].loc[celltype], colors['red'].loc[celltype]]
    
cell_types = list(cluster_colors.keys())

In [12]:
cell_types

['B',
 'CD4Tmemory',
 'CD4Tnaive',
 'CD8TeffectorGZMH',
 'CD8TeffectorGZMK',
 'CD8Tnaive',
 'DC',
 'ErythroidProgenitors',
 'LeukemiaCell',
 'MatureB',
 'Mega',
 'MonocytesCD14',
 'MonocytesCD16',
 'MonocytesProgenitor',
 'NK',
 'Plasma',
 'ProgenitorB']

In [13]:
patients = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']

In [33]:
# loop through each FOV, one patient at a time, saving count of each cell type per distance ring
for patient in patients:
    
    # subset to one patient
    one_pt = morph_predicted[morph_predicted['patients'] == patient]
    
    # get FOV names
    fovs = list(set(one_pt['FOV'].tolist()))

    for fov in fovs:

        # subset to one FOV
        one_fov = one_pt[one_pt['FOV'] == fov]

        # get timepoint
        timepoint = list(set(one_fov['Timepoint'].tolist()))[0]

        # get sample name from patient, timepoint, and FOV
        sample = patient + '_' + timepoint + '_' + fov

        # define mask dir
        mask_dir = '/data/Zhaolab/1_AMLCosMx/Final_scripts/2_Segmentation/3_NucMemMerging/' + name_dirs[patient] + '_hybrid/labels_predicted_2_15_24/'
        masks = [f for f in listdir(mask_dir) if isfile(join(mask_dir, f))]
        masks.sort()
        for mask_file in masks: # find mask file for this FOV
            if fov in mask_file:
                # Read in mask
                mask = tifffile.imread(mask_dir + mask_file)
                # Find area of each cell
                label_image = mask.astype(int)
                props = measure.regionprops_table(label_image, properties=['label', 'area'])
                data = pd.DataFrame(props)
                data['FOV'] = fov
                data['label'] = data['label'].astype(str)
                data = data.set_index('label')
                break

        # merge cell size with celltype calls
        morph_predicted_fov = one_fov.merge(data['area'], how='left', left_on='cell_id', right_index=True)
        Leukemia_cell_ids = morph_predicted_fov[morph_predicted_fov['celltype_detail'] == 'LeukemiaCell']   

        # list of Leukemia cell IDs
        LL = Leukemia_cell_ids['cell_id'].astype(int).tolist()
        fov_LL_number = len(LL)
        
        # reformat Lk cell table
        Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
        Leukemia_cell_ids = Leukemia_cell_ids.set_index('cell_id')

        # read in distances computed between all cells
        min_dists = pd.read_csv(dist_dir + name_dirs[patient] + '_FOV' + fov + '_cell_contacts.csv', index_col=0)

        min_dists = min_dists.filter(items=LL, axis=0)
        min_dists = min_dists * 0.18 # convert from pixel distance to um distance

        for j in range(len(thresholds)):

            # df to hold counts of this all cell types at this distance
            cell_sum = pd.DataFrame(index=LL, columns=cell_types).fillna(0)

            # iterate over all cell types
            for celltype in cell_types:

                # get cell IDs of one celltype
                subset_ids = morph_predicted_fov[morph_predicted_fov['celltype_detail'] == celltype]
                celltype_ids = subset_ids['cell_id'].tolist()

                # subset distance DF to only leukemia to x celltype distances
                subset_dists = min_dists[celltype_ids]

                if thresholds[j] == 0:
                    min_dist_boolean = subset_dists == 0.0
                else:
                    min_dist_boolean = (subset_dists > thresholds[j-1]) & (subset_dists <= thresholds[j])

                min_dist_boolean = min_dist_boolean.astype(int)

                # each row is a leukemia cell, so sum all columns in each row to get neighboring cell type sum around each LK cell
                cell_sum_celltype = min_dist_boolean.sum(axis=1)

                # add to DF for this distance
                cell_sum[celltype] = cell_sum_celltype

            # save cell sizes as col in DF
            cell_sum_size = cell_sum.merge(Leukemia_cell_ids['area'], how='left', left_index=True, right_index=True)
            
            # save csv for this FOV/distance
            cell_sum_size.to_csv('LK_neighborhood_count/' + sample + '_' + str(thresholds[j]) + 'um_counts.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Leukemia_cell_ids['cell_id'] = Leukemia_cell_ids['cell_id'].a

### Export all distances for LME

In [34]:
# list files created above, cell type counts per distance ring per FOV
count_files = 'LK_neighborhood_count/'
dir_list = os.listdir(count_files)
dir_list.sort()

In [35]:
# define which patients are responders vs non-responders
resp = ['p2', 'p4', 'p5', 'p6']
nonresp = ['p1', 'p3']

In [36]:
# convert integer distance labels to true ranges they represent
dist_to_bin = {'0': '0um', '5': '0-5um', '10': '5-10um', '15': '10-15um', '20': '15-20um', '25': '20-25um', 
               '30': '25-30um', '35': '30-35um', '40': '35-40um', '45': '40-45um'}

In [39]:
# create empty dataframe to hold output from all FOVs
all_celltype_df = pd.DataFrame(columns=['leuk_ID', 'area', 'count', 'celltype', 'dist', 'patient', 'FOV', 'timepoint', 'Response'])

# loop though counts table for each FOV at each distance from leukemia cell
for file in dir_list:
    df = pd.read_csv(count_files + file, index_col=0)
    if df.shape[0] > 0: # skip if no leukemia cells present
        pt = file[0:2] # get patient ID
        fov = file[5:7] # get FOV#
        df['leuk_ID'] = pt + '_' + fov + '_' + df.index.astype(str) # create unique identifier for each leukemia cell
        df_melted = pd.melt(df, id_vars=['leuk_ID', 'area']) # reorient DF so each row belongs to unique leukemia/neighboring celltype combo
        df_melted = df_melted.rename(columns={'value': 'count'})
        df_melted = df_melted.rename(columns={'variable': 'celltype'})
        df_melted['dist'] = dist_to_bin[file[8:].replace('um_counts.csv', '')]
        df_melted['patient'] = pt
        df_melted['FOV'] = pt + '_' + fov
        df_melted['timepoint'] = file[3]
        if file[0:2] in resp: # add response group
            df_melted['Response'] = 'R'
        elif file[0:2] in nonresp:
            df_melted['Response'] = 'NR'
        # add to growing DF    
        all_celltype_df = pd.concat([all_celltype_df, df_melted], ignore_index=True)

In [40]:
all_celltype_df

Unnamed: 0,leuk_ID,area,count,celltype,dist,patient,FOV,timepoint,Response
0,p1_01_12,1493,0.0,B,0um,p1,p1_01,A,NR
1,p1_01_34,896,0.0,B,0um,p1,p1_01,A,NR
2,p1_01_45,1893,0.0,B,0um,p1,p1_01,A,NR
3,p1_01_231,1540,0.0,B,0um,p1,p1_01,A,NR
4,p1_01_267,2361,0.0,B,0um,p1,p1_01,A,NR
...,...,...,...,...,...,...,...,...,...
6812915,p6_23_4090,992,0.0,ProgenitorB,0-5um,p6,p6_23,C,R
6812916,p6_23_5074,940,0.0,ProgenitorB,0-5um,p6,p6_23,C,R
6812917,p6_23_6205,1363,0.0,ProgenitorB,0-5um,p6,p6_23,C,R
6812918,p6_23_6390,601,0.0,ProgenitorB,0-5um,p6,p6_23,C,R


In [41]:
# save as input to linear mixed model
all_celltype_df.to_csv('All_celltypes_counts_LMM.csv')