## Cluster TMAs

**Samples:** 
- NP8
- NP9

**Method**: 

- threshold and gate cell types

- marker expression in tumor/stroma

- leiden cluster on good biomarkers for celltypes


In [None]:
#load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import scipy
import scanpy as sc
import matplotlib as mpl
from matplotlib import cm
import math
# 0 disables matplotlib's "too many open figures" warning (figures are created in loops below)
mpl.rc('figure', max_open_warning = 0)
# project directory: holds the data/ folder, the ROI selections and the dated output folder
codedir = '/home/groups/graylab_share/Chin_Lab/ChinData/engje/Data/20200000/20200406_JP-TMAs'#os.getcwd()
import phenograph
# NOTE(review): the working directory is switched right before importing mplex_image,
# presumably so that the local package resolves from there - confirm
os.chdir('/home/groups/graylab_share/Chin_Lab/ChinData/engje/Data')
from mplex_image import visualize as viz, process, preprocess, normalize, mics, mpimage
# seed numpy's global random number generator
np.random.seed(126)

In [None]:
# echo the project directory configured in the first cell
codedir

In [None]:
#change to correct directory
# datadir: folder with the per-sample feature tables; later cells chdir here, so files
# written with relative names (cached .h5ad, clustering csv) end up in this folder
datadir = f'/home/groups/graylab_share/Chin_Lab/ChinData/engje/Data/20200000/20200406_JP-TMAs/NP_DCIS'

# s_date: name of the output sub-folder under codedir used for saved figures
s_date = '20240520'
os.chdir(codedir)
# create the dated output folder only on the first run
if not os.path.exists(s_date):
    os.mkdir(s_date)
%matplotlib inline

# Table of contents <a name="contents"></a>
2. [load data](#cluster)
3. [select rois](#load)
5. [single cell Umap](#l7) **start here**
6. [Leiden celltypes](#l8)
[annotate leiden celltypes](#lbar)
7. [Tissue means](#tissue)


# NP Tissues  <a name="cluster"></a>


filter markers, select ROIs


[contents](#contents)

In [None]:
# list the files available in the data directory
os.listdir(datadir)

In [None]:
#mean int
# Read, for every sample, the filtered mean-intensity table and the centroid table,
# then stack them into df_mi (intensities) and df_xy (coordinates + slide_scene).
os.chdir(datadir)
ls_sample = ['NP9-DCIS2','NP8-DCIS','NP8-IDC','NP9-IDC2']
# collect the per-sample frames and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows every iteration and starts
# from an empty frame, which newer pandas warns about
ls_df_mi = []
ls_df_xy = []
for s_sample in ls_sample:
    df_mi2 = pd.read_csv(f'{datadir}/features_{s_sample}_FilteredMeanIntensity.txt',
                         index_col=0, sep='\t')
    #df_mi2['slide_scene'] = [item.split('_cell')[0] for item in df_mi2.index]
    df_xy2 = pd.read_csv(f'{datadir}/features_{s_sample}_CentroidXY.csv',index_col=0)
    # cell ids look like '<slide>_<scene>_cell<N>'; everything before '_cell' names the scene
    df_xy2['slide_scene'] = [item.split('_cell')[0] for item in df_xy2.index]
    ls_df_mi.append(df_mi2)
    ls_df_xy.append(df_xy2)
    #break
# an empty sample list yields empty frames, as the original loop did
df_mi = pd.concat(ls_df_mi) if ls_df_mi else pd.DataFrame()
df_xy = pd.concat(ls_df_xy) if ls_df_xy else pd.DataFrame()

In [None]:
# total number of rows (cells) loaded across all samples
len(df_mi)

In [None]:
# inspect the raw feature column names before they are shortened
df_mi.columns

In [None]:
# Keep only the text before the first underscore of every column name,
# report how many columns there are, and display any name that now occurs twice.
df_mi.columns = df_mi.columns.str.split('_').str[0]
print(df_mi.shape[1])
b_duplicated = df_mi.columns.duplicated()
df_mi.columns[b_duplicated]

In [None]:
# attach the centroid table (coordinates, slide_scene) to the intensities by matching
# cell ids on the index; merge defaults to an inner join, so unmatched cells are dropped
# NOTE(review): re-running this cell merges df_xy a second time and yields _x/_y suffixed columns
df_mi = df_mi.merge(df_xy,left_index=True, right_index=True)
#df_mi.rename({'nuclei_area':'area','nuclei_eccentricity':'eccentricity'},axis=1,inplace=True)

In [None]:
# inspect the columns after shortening and merging
df_mi.columns

In [None]:
#save
# write the combined table once; an existing file is never overwritten, so delete it
# by hand to regenerate
#s_out = f'{codedir}/data/20211221_JP-TMA2_FilteredMeanIntensity.csv'
s_out = f'{codedir}/data/20240520_NP-Tissues_FilteredMeanIntensity.csv'
if not os.path.exists(s_out):
    print('saving csv')
    df_mi.to_csv(s_out)

# ROI selection <a name="load"></a>

select virtual cores 1.2mm diameter

[contents](#contents)

In [None]:
# Reload the combined table from disk and derive two helper columns:
# 'slide' (cell id up to the first underscore) and 'CK19_Pos' (CK19 above 1000).
s_out = f'{codedir}/data/20240520_NP-Tissues_FilteredMeanIntensity.csv'
df_mi = pd.read_csv(s_out, index_col=0)
df_mi['slide'] = df_mi.index.str.split('_').str[0]
df_mi['CK19_Pos'] = df_mi['CK19'].gt(1000)

In [None]:
## visualize
# a 1.2 mm (1200 um) diameter core
# radius in pixels = diameter in um * pixels per um / 2
# (only referenced by the commented-out circular-ROI selection further down)
radius = 1200*3.25/2 #3.25  pixel per um

def plot_pos(df_pos,ls_color,df_xy,s_scene=None):
    '''
    scatter plot of cell centroids: all cells of df_xy in silver, with the cells
    that are positive for each flag in ls_color drawn on top

    df_pos: dataframe indexed by cell id with one boolean column per entry of ls_color
    ls_color: list of boolean column names in df_pos; entry i is drawn in
        matplotlib cycle color C{i}
    df_xy: dataframe of the cells to draw, indexed by cell id, with DAPI_X and
        DAPI_Y columns
    s_scene: optional figure super-title. Default None uses the single value of
        df_xy.slide_scene when there is exactly one, else a notebook-level
        s_scene variable if one exists (the behaviour before this parameter was added)
    returns: (fig, ax); the axis title is the last entry of ls_color
    '''
    if s_scene is None:
        if 'slide_scene' in df_xy.columns and df_xy['slide_scene'].nunique() == 1:
            s_scene = df_xy['slide_scene'].iloc[0]
        else:
            # legacy fallback: earlier versions always read the notebook global
            s_scene = globals().get('s_scene')
    #plot
    fig, ax = plt.subplots(figsize=(5,5)) #figsize=(18,12)
    #plot negative cells
    ax.scatter(data=df_xy,x='DAPI_X',y='DAPI_Y',color='silver',s=1)
    for idx,s_color in enumerate(ls_color):
        #positive cells = positive cells based on threshold
        ls_pos_index = (df_pos[df_pos.loc[:,s_color]]).index
        df_color_pos = df_xy[df_xy.index.isin(ls_pos_index)]
        if len(df_color_pos)>=1:
            #plot positive cells
            ax.scatter(data=df_color_pos, x='DAPI_X',y='DAPI_Y',color=f'C{idx}',s=.25,alpha=0.8)
    ax.axis('equal')
    # image convention: y grows downwards
    ax.set_ylim(ax.get_ylim()[::-1])
    # guard: with an empty ls_color there is no loop variable to title the axis with
    if len(ls_color) > 0:
        ax.set_title(f'{ls_color[-1]}')
    if s_scene is not None:
        fig.suptitle(s_scene)
    return(fig, ax)


In [None]:
# load hand drawn ROIS
# Every file in NP9-ROI_selection whose name contains 'exclude' is read as a table
# indexed by cell id; the ROI name is the last '_'-separated token of the file name
# (extension removed).
s_roi_dir = f'{codedir}/NP9-ROI_selection'
ls_df_roi = []
# sorted(): os.listdir returns files in arbitrary order, which made the row order of
# df_roi (and everything derived from it) differ between machines/runs
for s_file in sorted(os.listdir(s_roi_dir)):
    if 'exclude' in s_file:
        df = pd.read_csv(f'{s_roi_dir}/{s_file}',index_col=0)
        s_roi = s_file.split('_')[-1].split('.')[0]
        df['ROI'] = s_roi
        ls_df_roi.append(df)
# single concatenation; no matching file gives an empty frame as before
df_roi = pd.concat(ls_df_roi) if ls_df_roi else pd.DataFrame()

In [None]:
# NOTE: the sorted list below is not displayed, because it is not the last expression of the cell
sorted(df_mi.slide_scene.unique())
# scene -> tissue type; only the un-commented scenes are processed by the cells below
d_type = {#'NP8-DCIS_scene001':'skip', #DCIS
 #'NP8-DCIS_scene002':'DCIS',
 #'NP8-DCIS_scene003':'DCIS',
 #'NP8-DCIS_scene004':'DCIS',
 #'NP8-DCIS_scene005':'DCIS',
 #'NP8-DCIS_scene006':'skip', #DCIS
 #'NP8-DCIS_scene007':'skip', #DCIS
 #'NP8-DCIS_scene008':'DCIS',
 #'NP8-DCIS_scene009':'DCIS',
 #'NP8-DCIS_scene010':'DCIS',
 #'NP8-IDC_scene001':'IDC',
 #'NP8-IDC_scene002':'IDC',
# 'NP8-IDC_scene003':'IDC',
# 'NP8-IDC_scene004':'skip', #IDC
# 'NP8-IDC_scene005':'skip', #IDC
 #'NP9-DCIS2_scene001':'Normal',
 #'NP9-DCIS2_scene002':'DCIS',
 #'NP9-DCIS2_scene003':'DCIS',
 'NP9-IDC2_scene001':'Normal',
 #'NP9-IDC2_scene002':'DCIS',
 'NP9-IDC2_scene003':'IDC'}


In [None]:
#plot CK19
# the break stops after the first entry of d_type, so only that scene is plotted;
# s_scene / s_type deliberately keep that first entry for the cells below
for s_scene, s_type in d_type.items():
    df_scene = df_mi[df_mi.slide_scene==s_scene]
    fig, ax = plot_pos(df_mi,['CK19_Pos'],df_scene)
    ax.set_title(f'{s_type} CK19+')
    break
# hand drawn rois for that scene
# (cells are matched by searching for the scene name inside the cell id)
df_roi_scene = df_roi[df_roi.index.str.contains(s_scene)]

In [None]:
# Work on a copy of the current scene and add one boolean column per hand-drawn ROI
# that is True for the cells listed in that ROI.
df_scene = df_mi.loc[df_mi.slide_scene == s_scene].copy()
for s_roi, df_one_roi in df_roi_scene.groupby('ROI', sort=False):
    ls_index = df_one_roi.index
    df_scene[s_roi] = df_scene.index.isin(ls_index)

In [None]:
# #select circular ROIs
# dd_center= { #'NP8-DCIS_scene001':{'DCIS1':(10000,7000)},
#     'NP8-IDC_scene001':{'Tumor1':(14000,8000),'Border1':(11000,3500),'Border2':(7000,8000)},
#      'NP8-IDC_scene002':{'Tumor1':(13500,7000),'Tumor2':(17000,4000),'Border1':(9500,8000)}, 
#     'NP8-IDC_scene003':{'Tumor1':(6000,5000),'Tumor2':(12500,7500),'Border1':(2000,6500),'Border2':(17000,6000)},
#     'NP9-DCIS2_scene001':{'Normal1':(2000,2000),'Normal2':(2500,7000),'Normal3':(11000,4000),'Normal4':(16000,5000)},
#     'NP9-IDC2_scene001':{'Normal1':(5500,8500),'Normal2':(3000,11700)},
#     'NP9-IDC2_scene003':{'Tumor1':(9500,18500),'Border1':(10500,14500)},
# }
# #slow #select circular ROIs
# dd_result = {}
# for s_scene, d_center in dd_center.items():
#     d_result={}
#     df_scene = df_mi[df_mi.slide_scene==s_scene].copy()
#     for idx, (s_roi, center) in enumerate(d_center.items()):
#         # Calculate Euclidean distance
#         for (index, Series) in df_scene.iterrows():
#             p = [Series.DAPI_X,Series.DAPI_Y]
#             dist =  math.dist(p, center)
#             df_scene.loc[index,'distance'] = dist
#         df_scene[f'ROI{idx}'] = df_scene.distance < radius
#         ls_index = df_scene[df_scene.loc[:,f'ROI{idx}']].index.tolist()
#         d_result.update({s_roi:ls_index})
#     dd_result.update({s_scene:d_result})
#     #break

In [None]:
#plot ROIS
# overlay CK19+ cells and the ROI flag columns added to df_scene in the previous cell
# NOTE(review): the ROI names are hard-coded; plot_pos fails with a KeyError if the
# current scene has no column with one of these names
fig, ax = plot_pos(df_scene,['CK19_Pos','Normal1','Normal2','Normal3','Normal4'],df_scene) #,'ROI2','ROI3','ROI0','ROI1'

In [None]:
# #save json
# import json
# with open('NP_ROI_data.json', 'w') as f:
#     json.dump(dd_result, f)

In [None]:
# #load json
# import json

# with open('NP_ROI_data.json') as f:
#     dd_result = json.load(f)
#     print(dd_result.keys())

In [None]:
# #filter
# df_mi['ROI'] = pd.NA
# for s_scene, d_result in dd_result.items():
#     for s_roi, ls_index in d_result.items():
#         df_mi.loc[ls_index,'ROI'] = f'{s_scene}_{s_roi}'

In [None]:
## filter hand drawn
# Label every cell that belongs to a hand-drawn ROI with '<scene>_<ROI name>';
# all other cells keep pd.NA and are dropped when the ROI table is saved.
df_mi['ROI'] = pd.NA
for s_roi in df_roi.ROI.unique():
    print(s_roi)
    ls_index = df_roi[df_roi.ROI==s_roi].index
    ls_cell_scene = [s_cell.split('_cell')[0] for s_cell in ls_index]
    # kept for the following cell, which displays s_scene (scene of the ROI's first cell)
    s_scene = ls_cell_scene[0]
    # Use each cell's own scene. Previously the scene of the first cell was applied to
    # the whole ROI, so an ROI name reused in several scenes (e.g. 'Normal1') was
    # labelled with the wrong scene for all but one of them.
    for s_roi_scene in dict.fromkeys(ls_cell_scene):
        ls_scene_cells = [s_cell for s_cell, s_cell_scene in zip(ls_index, ls_cell_scene)
                          if s_cell_scene == s_roi_scene]
        df_mi.loc[ls_scene_cells,'ROI'] = f'{s_roi_scene}_{s_roi}'

In [None]:
# scene name left over from the last ROI processed in the previous cell
s_scene

In [None]:
#save ROIS
# the second assignment overrides the first, so only the 20240522 NP9-IDC2 file is written
s_out = f'{codedir}/data/20240521_NP-Tissues-ROIs_FilteredMeanIntensity.csv'
s_out = f'{codedir}/data/20240522_NP9-IDC2-ROIs_FilteredMeanIntensity.csv'
# only cells inside an ROI are saved; an existing file is never overwritten
if not os.path.exists(s_out):
    print('saving csv')
    df_mi[df_mi.ROI.notna()].to_csv(s_out)


## Umap <a name="l7"></a>

umap projection of single cells

first use all markers, all subtypes, just to evaluate markers

[contents](#contents)

In [None]:
import scanpy as sc
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# from here on relative paths (cached .h5ad files, scanpy figures) resolve inside datadir
os.chdir(datadir)
#load ROIS
# the second assignment overrides the first: the NP9-IDC2 ROI table is the one loaded
s_out = f'{codedir}/data/20240521_NP-Tissues-ROIs_FilteredMeanIntensity.csv'
s_out = f'{codedir}/data/20240522_NP9-IDC2-ROIs_FilteredMeanIntensity.csv'
df_norm  = pd.read_csv(f'{s_out}',index_col=0) 


In [None]:
#ER+
# cells to exclude before clustering; currently an empty frame, so nothing is excluded
# (the csv-based and PD1-threshold exclusions are commented out)
df_exclude = pd.DataFrame() #pd.read_csv('exclude_JP-TMA-1_ER+_34markers_leiden0.6_20.csv',index_col=0)

ls_exclude = []
#ls_exclude_all = sorted(set(df_norm[df_norm.loc[:,'PD1'] > 1365].loc[:,'PD1'].index.tolist()).union(set(ls_exclude))) #PD1
#df_exclude = df_exclude.append(pd.DataFrame(index=ls_exclude_all,columns=df_exclude.columns))
df_exclude_er = df_exclude.copy()
# number of excluded cells
print(len(df_exclude_er))

In [None]:
# Build the AnnData object used for UMAP/Leiden from the ROI table (df_norm).
# number of features = len(ls_col), printed below
#[ 'NP8-IDC' 'NP9-IDC2']
# sample tag used in all output file names: file name of s_out up to '_Filtered'
s_sample = s_out.split('data/')[1].split('_Filtered')[0]
s_type = 'NP9-IDC2'#'NP8-IDC'# 'all'
# cell id -> ROI label, taken from the table loaded above. This used to read df_mi,
# which only exists if the ROI-selection section was run in the same session and
# may not match the saved ROI file.
d_roi = dict(zip(df_norm.index,df_norm.ROI))
#almost same features as IMC
ls_col = [ 'CD31', 'CD44', 'CD45','CD20','CD4','CD8',#'CD3',
        'CK14','CK5','EGFR', 'CK19',  'CK8','CK7','Ecad',  #
       'Ki67','CD68', 'aSMA','ColI','HER2', 'ER','PgR', #np8 is TNBC?
        'Vim'] #,'area'
print(len(ls_col))
# marker intensities of all non-excluded cells
adata = sc.AnnData(df_norm.loc[(~df_norm.index.isin(df_exclude.index)),ls_col])
# per-cell metadata derived from the cell id and the ROI label
adata.obs['slide_scene'] = [item.split('_cell')[0] for item in adata.obs.index]
adata.obs['ROI'] = adata.obs.index.map(d_roi)
# subtype = last token of the ROI label without its trailing digit, e.g. 'Tumor1' -> 'Tumor'
adata.obs['subtype'] =[item.split('_')[-1][0:-1] for item in adata.obs.ROI]
adata.obs['TMA'] = [item.split('_')[0] for item in adata.obs.index]
# #two subtypes
if s_type == 'all':
    adata = adata[adata.obs.subtype.isin(['Normal', 'Border', 'Tumor'])]
else:
    # keep only the cells of the selected slide
    adata = adata[adata.obs.TMA.isin([s_type])]
print(len(adata))

In [None]:
# sanity check: which slides (TMA) and ROI subtypes remain after filtering
print(adata.obs['TMA'].unique())
print(adata.obs['subtype'].unique())

In [None]:
#raw
# keep the unscaled values in adata.raw, run PCA and two diagnostic plots, then
# (optionally) scale every marker to unit variance without centering
b_scale = True
adata.raw = adata
#reduce dimensionality
sc.tl.pca(adata, svd_solver='auto')
fig,ax=plt.subplots(figsize=(3.5,5))
sc.pl.highest_expr_genes(adata, n_top=48,ax=ax,save=f'Expression_{len(ls_col)}.png')
plt.tight_layout()
sc.pl.pca_variance_ratio(adata,n_pcs=32, log=True)

if b_scale:
    # values above 20 after scaling are clipped to 20
    sc.pp.scale(adata, zero_center=False, max_value=20)
    # tag output files as scaled. Append the suffix only once: re-running this cell
    # used to produce '..._s_s' and thereby miss the cached result files.
    if not s_sample.endswith('_s'):
        s_sample = s_sample + '_s'

In [None]:
# number of markers used for clustering (appears in the output file names)
len(ls_col)

In [None]:
#umap
# Compute the neighbor graph + UMAP (or load them from the cached .h5ad), then plot
# the embedding colored by slide and by ROI subtype.
# The break at the end means only the first n_neighbors value (30) is processed;
# n_neighbors keeps that value for the clustering cells below.
for n_neighbors in [30,15]:
    results_file = f'{s_sample}_{n_neighbors}neighbors_{s_type}_{len(ls_col)}markers.h5ad'
    b_cached = os.path.exists(results_file)
    if not b_cached:
        print('calc umap')
        # calculate neighbors
        sc.pp.neighbors(adata, n_neighbors=n_neighbors) #, method='rapids'
        sc.tl.umap(adata)
    else:
        print('loading umap')
        adata = sc.read_h5ad(results_file)
        adata.obs['slide_scene'] = [item.split('_cell')[0] for item in adata.obs.index]
        # The cached file was written by this cell, so it normally already carries
        # TMA and subtype; rebuild them the same way as the setup cell if missing.
        # (This branch used to reference d_cyc_sub, d_replace and df_man, which are
        # not defined at this point, and failed with a NameError.)
        if 'TMA' not in adata.obs.columns:
            adata.obs['TMA'] = [item.split('_')[0] for item in adata.obs.index]
        if 'subtype' not in adata.obs.columns:
            adata.obs['subtype'] = [item.split('_')[-1][0:-1] for item in adata.obs.index.map(d_roi)]
    #save results

    #color by markers
    #figname = f"Umap{s_sample}_markers_{n_neighbors}neighbors_{len(ls_col)}markers.png"
    #sc.pl.umap(adata, color=ls_col,vmin='p1.5',vmax='p98.5',save=figname,ncols=6)
    #color by TMA
    figname = f"Umap{s_sample}_TMA_{n_neighbors}neighbors_{s_type}_{len(ls_col)}markers.png"
    fig,ax = plt.subplots(figsize=(3,2), dpi=200)
    sc.pl.umap(adata, color='TMA',wspace=.25,save=figname,ax=ax)

    figname = f"Umap{s_sample}_subtype_{n_neighbors}neighbors_{s_type}_{len(ls_col)}markers.png"
    fig,ax = plt.subplots(figsize=(3,2), dpi=200)
    sc.pl.umap(adata, color='subtype',wspace=.25,save=figname,ax=ax)

    # cache freshly computed results only
    if not b_cached:
        print("writing")
        adata.write(results_file)
    break


##  Cluster Leiden <a name="l8"></a>

cluster umap projection of single cells


[contents](#contents)

In [None]:
# slide selection currently in effect
s_type

In [None]:
#cluster
# Leiden clustering on the neighbor graph computed above (or load the cached result),
# followed by UMAP plots colored by cluster and by ROI subtype.
# n_neighbors is the value left by the UMAP cell.
#s_sample = '20240521_NP-Tissues-ROIs_s'#'
#n_neighbors = 30 #
n_markers = len(ls_col) #19 #
#s_type = 'NP9-IDC2'#'NP8-IDC'#'all' #
resolution = 0.6
results_file = f'{s_sample}_{n_neighbors}neighbors_{n_markers}markers_{s_type}_leiden{resolution}.h5ad'
if not os.path.exists(results_file):
    print('clustering')
    sc.tl.leiden(adata,resolution=resolution)
    adata.write(results_file)
else:
    print('loading leiden')
    adata = sc.read_h5ad(results_file)


# saved version without legend. A fresh axis is created first: this plot used to be
# drawn onto the `ax` left over from the previous cell (the subtype UMAP).
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_nolegend.png'
sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc=None)

# on-screen version with legend (not saved)
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
sc.pl.umap(adata, color='leiden',ax=ax)

figname = f"Umap{s_sample}_subtype_{n_neighbors}neighbors_{s_type}_{len(ls_col)}markers.png"
fig,ax = plt.subplots(figsize=(3,2), dpi=200)
sc.pl.umap(adata, color='subtype',wspace=.25,save=figname,ax=ax)

In [None]:
# sc.pl.umap(adata, color='Vim',vmin='p1.5',vmax='p98.5')
# sc.pl.umap(adata, color='ColI',vmin='p1.5',vmax='p98.5')
# sc.pl.umap(adata, color='Ecad',vmin='p1.5',vmax='p98.5')

In [None]:
#legend
# leiden UMAP with the default legend, shown only (no save= argument)
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
sc.pl.umap(adata, color='leiden',ax=ax)

## Annotate Leiden <a name="lbar"></a>

annotate epithelial, immune and stromal cell types

also visualize cluster results on tissue scatter plots. Additional visualization done using 20201018_JP-TMAs_napari.py,
to visualize ome.tiff created in 20201005_JP-TMA_Pipeline.py


[contents](#contents)

In [None]:
# relative file names below (cached .h5ad, scanpy figures) resolve inside datadir
os.chdir(f'{datadir}')

In [None]:
#load ROIS
# choose the ROI table to annotate by (un)commenting; the cell below derives
# s_sample, s_type and n_markers from this choice
s_out = f'{codedir}/data/20240521_NP-Tissues-ROIs_FilteredMeanIntensity.csv'
#s_out = f'{codedir}/data/20240522_NP9-IDC2-ROIs_FilteredMeanIntensity.csv'
df_norm  = pd.read_csv(f'{s_out}',index_col=0) 

In [None]:
#load leiden
# rebuild the name of the cached clustering result from the chosen ROI table:
# s_sample = file name of s_out up to '_Filtered' plus the '_s' (scaled) suffix
s_sample = s_out.split('_Filtered')[0].split('data/')[1] + '_s' #'20240521_NP-Tissues-ROIs_s'#''20240522_NP9-IDC2-ROIs_s'#
n_neighbors = 30 #
resolution = 0.6
# each ROI table is paired with one slide and the marker count it was clustered with
if s_sample == '20240521_NP-Tissues-ROIs_s':
    s_type = 'NP8-IDC'#
    n_markers =19# 21 #
else:
    s_type='NP9-IDC2'#'NP9-IDC2'#'all' #
    n_markers = 21 #
results_file = f'{s_sample}_{n_neighbors}neighbors_{n_markers}markers_{s_type}_leiden{resolution}.h5ad'
# the file must already exist (written by the clustering cell); there is no fallback here
adata = sc.read_h5ad(results_file)

In [None]:
# #unmarked stroma
# #df_str = pd.read_csv(f'annotated_JP-TMA1-1_unmarked_stroma_{s_type}.csv',index_col=0)
# #df_str = pd.read_csv(f'annotated_JP-TMAs_unmarked_stroma_{s_type}.csv',index_col=0)
# fig,ax = plt.subplots(figsize=(3,2),dpi=200)
# figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_legend.png'
# sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '))

# results_file

In [None]:
# ls_col = ['CD31','Ecad','ColI','Vim','CD45']
# figname = f"Umap_markers_{n_neighbors}neighbors_{len(ls_col)}markers_{s_type}.png"
# title=figname.split('.png')[0].replace('_',' ')
# sc.pl.umap(adata, color=ls_col,vmin='p1.5',vmax='p99',ncols=5,save=figname)

In [None]:
# name of the clustering result being annotated; it selects the branch in the next cell
results_file

In [None]:
# annotate
# Manual annotation of the leiden clusters, keyed by the exact result file name.
# Each branch defines:
#   ls_drop    - clusters left out of the clustermap
#   *_clust    - cluster ids per broad class (epithelial/tumor, stromal, immune,
#                endothelial, fibroblast)
#   d_named    - cluster id -> display name
# Some branches first split a cluster by a marker threshold (e.g. '4' -> '4b'/'4t').
df_p = adata.to_df()
df_p['leiden'] = adata.obs['leiden'].astype('str')

if results_file =='20240521_NP-Tissues-ROIs_s_30neighbors_21markers_NP9-IDC2_leiden0.6.h5ad':
    ls_drop = ['13','11','12'] #13 missing rounds, 3, 7 is AF
    df_p.loc[(df_p.leiden=='4') & (df_p.CD20>9),'leiden'] = '4b'
    df_p.loc[(df_p.leiden=='4') & (df_p.CD4>5.5),'leiden'] = '4t'
    tum_clust = ['5','0','2']
    str_clust = ['8','3','11','12','7']
    imm_clust = ['4','4t','4b'] 
    endo_clust = ['6']
    fb_clust = ['9','10','1']
    d_named = {'3':'AF','7':'AF','5':'Normal Breast','0':'ER+ tum.','2':'ER+ tum.','6':'endothelial','4':'macrophage',
              '4t':'CD4 T cell','4b':'B cell','9':'Vim+ FB','10':'ColI+ FB','1':'FB','8':'Quies. Str.',
              }
elif results_file =='20240521_NP-Tissues-ROIs_s_30neighbors_21markers_NP8-IDC_leiden0.6.h5ad':
    ls_drop = ['16','14','7','13','12']#11 take out PGR/ER #16 missing rounds, 14 and 13 stiching error, 7 AF including necrotic, 12 necrotic
    tum_clust = ['15']
    str_clust = [] 
    imm_clust = [] 
    endo_clust = []
    fb_clust = []
    d_named = {'15':'myoep','7':'AF','12':'necrotic'}
elif results_file =='20240521_NP-Tissues-ROIs_s_30neighbors_19markers_NP8-IDC_leiden0.6.h5ad':
    print('used')
    ls_drop = ['14','12','11','8'] #14 missing rounds, '12' unfortunate CD20 BG 11  AF including necrotic, 8 necrotic
    #drop scene003 tumor 1 clust 4- weird bg
    df_p.loc[(df_p.leiden=='6') & (df_p.CD20>11.5),'leiden'] = '6b'
    df_p.loc[(df_p.leiden=='6') & (df_p.CD4>7),'leiden'] = '6t'
    #df_p.loc[(df_p.leiden=='10') & (df_p.CD31>4),'leiden'] = '10e'
    tum_clust = ['13','9','3','0','1','4']
    str_clust = ['2'] 
    imm_clust = ['7','6','6b','6t'] 
    endo_clust = ['10']
    fb_clust = ['5']
    d_named = {'13':'myoep','11':'AF','10':'endothelial','9':'Prolif. tum.','8':'necrotic','7':'macrophage',
              '6':'CD8 T cell','6b':'B cell','6t':'CD4 T cell','5':'FB','3':'Luminal tum.','0':'Luminal tum.',
              '1':'Luminal tum.','2':'Quies. Str.','4':'Luminal tum.',}
elif results_file == '20240522_NP9-IDC2-ROIs_s_30neighbors_21markers_NP9-IDC2_leiden0.6.h5ad':
    print('also used')
    df_p.loc[(df_p.leiden=='3') & (df_p.CD20>9),'leiden'] = '3b'
    ls_drop = ['7','8','15','17','16','13','14'] #7 and 8 is AF, 15 is missing rounds, 16 and 17 is AF, 14 is af, 13 floating
    tum_clust = ['0','5','10']
    str_clust = ['9'] 
    imm_clust = ['12','3','3b'] #12 mac
    endo_clust = ['6']
    fb_clust = ['1','4','2','11',]
    d_named = {'10':'myoep','5':'Normal breast luminal','0':'ER+ Tumor','6':'endothelial','3b':'B cell','3':'CD4 T cell',
              '12':'macrophage','11':'Vim+ FB','4':'ColI++ FB','2':'ColI+ FB','1':'FB','9':'Quies. Str.'}
else:
    # not annotated yet: nothing is dropped, grouped or named. ls_drop and d_named
    # are set as well because the clustermap and the 'annotated' column use them
    # (they were missing here and caused a NameError for any new result file)
    print('new one')
    ls_drop = []
    tum_clust = []
    str_clust = [] 
    imm_clust = [] 
    endo_clust = []
    fb_clust = []
    d_named = {}

    
#df_p.groupby('leiden').mean().Ecad.sort_values(ascending=False).index
#set(df_p.groupby('leiden').mean().CK19.sort_values(ascending=False).index) - set(tum_clust + imm_clust )
#set(tum_clust)

In [None]:
# Clustermap of the mean marker value per leiden cluster (z-scored per marker),
# without the dropped clusters/markers. With b_annot the rows additionally get a
# color strip for the broad cell class and 'id: name' tick labels.
ls_drop_col = ['CD8','HER2','Ki67'] if s_type=='NP9-IDC2' else []
b_annot = False
# cluster id -> broad class; a cluster listed in several groups takes the last one
d_replace = {}
for ls_group, s_class in ((tum_clust,'epithelial'),(str_clust,'stromal'),(imm_clust,'immune'),
                          (endo_clust,'endothelial'),(fb_clust,'fibroblast')):
    d_replace.update(dict.fromkeys(ls_group, s_class))
b_keep_row = ~df_p.leiden.isin(ls_drop)
b_keep_col = ~df_p.columns.isin(ls_drop_col)
df_plot = df_p.loc[b_keep_row,b_keep_col].groupby('leiden').mean()

df_plot.index.name = f'leiden {resolution}'
d_color = dict(zip(['endothelial', 'epithelial', 'fibroblast', 'immune', 'stromal'],sns.color_palette()[0:5]))
# row_colors=None is seaborn's default, i.e. no color strip
row_colors = df_plot.index.astype('str').map(d_replace).map(d_color) if b_annot else None
g = sns.clustermap(df_plot,z_score=1,figsize=(9,7),cmap='viridis',row_colors=row_colors,
               vmin=-1.5,vmax=1.5)
# marker and cluster order as arranged by the dendrograms (cluster order is reused below)
marker_genes = df_plot.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()
categories_order = df_plot.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()
if b_annot:
    # zero-size bars only serve as legend handles for the class colors
    for label, class_color in d_color.items():
        g.ax_row_dendrogram.bar(0, 0, color=class_color,
                                label=label, linewidth=0)
    g.ax_row_dendrogram.legend(loc="right", ncol=1)
    g.ax_heatmap.set_ylabel('')
    labels = [f"{item}: {d_named[item]}" for item in categories_order]
    g.ax_heatmap.set_yticklabels(labels,rotation=0)
g.savefig(f'{codedir}/{s_date}/{s_sample}_clustermap_{n_neighbors}_{n_markers}markers_{s_type}_leiden{resolution}.png',dpi=200)


In [None]:
# horizontal bar chart of the number of cells per leiden cluster, ordered like the
# clustermap rows (categories_order reversed)
fig,ax=plt.subplots(figsize=(1.4,3.7),dpi=200)
# .count() gives the non-null count per column; column 1 is used as the cell count
# (assumes that column has no missing values - TODO confirm)
df_p.groupby('leiden').count().loc[categories_order[::-1]].iloc[:,1].plot(kind='barh',title='Cell No.',ax=ax,width=0.7)
plt.tight_layout()
fig.savefig(f'{codedir}/{s_date}/barplot_{s_sample}_{n_neighbors}neighbors_{len(marker_genes)}markers_leiden{resolution}_{s_type}.png')

In [None]:

# for one cluster (s_clust): count its cells per scene and show the 30 scenes with the most cells
# NOTE(review): raises a KeyError if this result file has no cluster with that id
s_clust = '8'
df_p['slide_scene'] = [item.split('_cell')[0] for item in df_p.index]
df_p.groupby(['leiden','slide_scene']).count().loc[s_clust].iloc[:,1].sort_values(ascending=False)[0:30]

In [None]:
from mplex_image import analyze
# coordinates for the spatial plots come from the ROI table (DAPI_X / DAPI_Y columns)
df_data = df_norm
# one long color tuple built from four matplotlib palettes (indexed by cluster position below)
colors = mpl.cm.tab10.colors + mpl.cm.Accent.colors + mpl.cm.tab20b.colors + mpl.cm.tab20c.colors

# presumably returns one boolean column per leiden label - verify against mplex_image.analyze
df_pos = analyze.celltype_to_bool(df_p,'leiden')
df_pos.columns = [str(item) for item in df_pos.columns]

In [None]:
# Spatial scatter plot per scene: all cells of the scene in silver, then the cells of
# every leiden cluster present in that scene on top, one color per cluster.
ls_clust = df_pos.columns.tolist()
for s_slide in sorted(set(df_p.slide_scene)):
    fig,ax = plt.subplots(figsize=(5,4.5),dpi=200)
    # background: every cell whose id contains the scene name
    df_scene = df_data[df_data.index.str.contains(s_slide)]
    ax.scatter(data=df_scene,x='DAPI_X',y='DAPI_Y',color='silver',s=0.1,label='')
    b_in_slide = df_p.slide_scene==s_slide
    for idxs, s_color in enumerate(ls_clust):
        s_color = str(s_color)
        ls_index = df_p[b_in_slide & (df_pos.loc[:,s_color])].index
        # clusters without cells in this scene get no legend entry
        if len(ls_index) < 1:
            continue
        ax.scatter(data=df_data.loc[ls_index],x='DAPI_X',y='DAPI_Y',label=f'{s_color}',s=0.1,color=colors[idxs])
    ax.set_title(f"{s_slide}", fontsize=16)
    ax.axis('equal')
    # image convention: y grows downwards
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.legend(markerscale=10,framealpha=.5,bbox_to_anchor=(1,1))
    #fig.savefig(f'{codedir}/{s_date}/{s_slide}_leiden{resolution}_scatterplot.png')

In [None]:
# highlight one group of clusters on the UMAP and print which fraction of all cells it covers
ls_merge = tum_clust# fb_clust #tum_clust #['0','12'] #16
# take over the labels from df_p, which include the manual sub-splits (e.g. '3b')
adata.obs.leiden = df_p.leiden
# 1 = cell belongs to one of the clusters in ls_merge, 0 = it does not
adata.obs['test'] = (adata.obs.leiden.isin(ls_merge)).replace({True:1,False:0})
sc.pl.umap(adata, color='test',title='tumor')
print(sum(adata.obs['test'])/len(adata))

In [None]:
# UMAP colored by the leiden labels including the manual sub-splits; saved with legend
adata.obs['leiden'] = df_p['leiden']
fig,ax = plt.subplots(figsize=(3,3),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}.png'
sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '))

In [None]:
# for s_merge in tum_clust:
#     ls_merge = [s_merge]
#     adata.obs['test'] = (adata.obs.leiden.isin(ls_merge)).replace({True:1,False:0})
#     sc.pl.umap(adata, color='test',title=s_merge)

## add celltypes

In [None]:
# three-class cell type: epithelial / stromal / immune
# (fibroblast and endothelial clusters count as stromal); clusters that are in no
# group stay NaN
df_p.loc[df_p.leiden.isin(tum_clust),'leidencelltype3'] = 'epithelial'
df_p.loc[df_p.leiden.isin(str_clust),'leidencelltype3'] = 'stromal'
df_p.loc[df_p.leiden.isin(fb_clust),'leidencelltype3'] = 'stromal'
df_p.loc[df_p.leiden.isin(endo_clust),'leidencelltype3'] = 'stromal'
df_p.loc[df_p.leiden.isin(imm_clust),'leidencelltype3'] = 'immune'
adata.obs['leidencelltype3'] = df_p.leidencelltype3
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
# '_' added before 'leidencelltype3' so file name and title follow the same pattern
# as the celltype4/celltype5 figures (it used to read '<s_type>leidencelltype3')
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_leidencelltype3.png'
sc.pl.umap(adata, color='leidencelltype3',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc='on data')

In [None]:
# five-class cell type: every cluster group keeps its own class; groups are applied
# in this order, so a cluster listed in two groups ends up with the later class
for ls_group, s_class in ((tum_clust,'epithelial'),
                          (str_clust,'stromal'),
                          (fb_clust,'fibroblast'),
                          (endo_clust,'endothelial'),
                          (imm_clust,'immune')):
    df_p.loc[df_p.leiden.isin(ls_group),'leidencelltype5'] = s_class
adata.obs['leidencelltype5'] = df_p.leidencelltype5
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_leidencelltype5.png'
s_title = figname.split('.png')[0].replace('_',' ')
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
sc.pl.umap(adata, color='leidencelltype5',ax=ax,save=figname,title=s_title)

In [None]:
#visualize
# UMAP colored by five lineage markers, color scale clipped to the 1.5th-99th percentile
# NOTE(review): this overwrites ls_col (the clustering marker list defined earlier);
# `title` is computed but not passed to the plot
ls_col = ['CD31','Ecad','Vim','ColI','CD45']
figname = f"Umap_markers_{n_neighbors}neighbors_{len(ls_col)}markers_{s_type}.png"
title=figname.split('.png')[0].replace('_',' ')
sc.pl.umap(adata, color=ls_col,vmin='p1.5',vmax='p99',ncols=5,save=figname)

In [None]:
#df_p.groupby('slide').leidencelltype5.value_counts(normalize=True).unstack().sort_values(by='epithelial')[25:40]

In [None]:
# four-class cell type: fibroblasts are merged into stromal, endothelial stays separate;
# groups are applied in this order, so a cluster listed twice takes the later class
for ls_group, s_class in ((tum_clust,'epithelial'),
                          (str_clust,'stromal'),
                          (fb_clust,'stromal'),
                          (endo_clust,'endothelial'),
                          (imm_clust,'immune')):
    df_p.loc[df_p.leiden.isin(ls_group),'leidencelltype4'] = s_class
adata.obs['leidencelltype4'] = df_p.leidencelltype4
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_leidencelltype4.png'
s_title = figname.split('.png')[0].replace('_',' ')
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
sc.pl.umap(adata, color='leidencelltype4',ax=ax,save=figname,title=s_title,legend_loc='on data')

In [None]:
# number of distinct cluster labels, manual sub-splits included
df_p.leiden.nunique()

In [None]:
# copy the ROI label stored in adata.obs onto the per-cell table by cell id
d_roi = dict(zip(adata.obs.index,adata.obs.ROI))
df_p['ROI'] = df_p.index.map(d_roi)

In [None]:
# #drop scene003 tumor 1 clust 4- weird bg - include as tumor
# if s_type == 'NP8-IDC':
#     df_p = df_p[df_p.ROI!='NP8-IDC_scene003_Tumor1']

In [None]:
# Translate cluster ids into display names (ids without a name become 'AF') and draw,
# per scene, all cells in silver with the cells of every annotated type on top.
df_p['annotated'] = df_p.leiden.map(d_named).fillna('AF')
colors = mpl.cm.tab10.colors + mpl.cm.Accent.colors #mpl.cm.tab20b.colors + mpl.cm.tab20c.colors

df_pos = analyze.celltype_to_bool(df_p,'annotated')
ls_clust = df_pos.columns.tolist()
for s_slide in sorted(set(df_p.slide_scene)):
    fig,ax = plt.subplots(figsize=(6,6),dpi=300)
    # background: every cell whose id contains the scene name
    df_scene = df_data[df_data.index.str.contains(s_slide)]
    ax.scatter(data=df_scene,x='DAPI_X',y='DAPI_Y',color='silver',s=0.1,label='')
    b_in_slide = df_p.slide_scene==s_slide
    for idxs, s_color in enumerate(ls_clust):
        s_color = str(s_color)
        ls_index = df_p[b_in_slide & (df_pos.loc[:,s_color])].index
        # cell types absent from this scene get no legend entry
        if len(ls_index) < 1:
            continue
        ax.scatter(data=df_data.loc[ls_index],x='DAPI_X',y='DAPI_Y',label=f'{s_color}',s=0.3,color=colors[idxs])
    ax.set_title(f"{s_slide}", fontsize=16)
    ax.axis('equal')
    # image convention: y grows downwards
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.legend(markerscale=10,framealpha=.5,bbox_to_anchor=(1,1))
    #fig.savefig(f'{codedir}/{s_date}/{s_slide}_leiden{resolution}_scatterplot.png')

In [None]:
#save
# per-cell table (marker values, leiden label, cell-type columns, ROI, annotation),
# written to the current working directory; an existing file is never overwritten
s_out = f'{s_sample}_LeidenClustering_neighbors{n_neighbors}_resolution{resolution}_markers{n_markers}_{s_type}.csv'
df_out = df_p
if not os.path.exists(s_out):
    print('saving csv')
    df_out.to_csv(s_out)

## Summarize Tissue Variables <a name="tissue"></a>

per patient means


[contents](#contents)

In [None]:
# the results_*.csv files read below are looked up in the current working directory
os.chdir(datadir)
#s_sample

In [None]:
# done
# #leiden cell types (fraction)
# n_neighbors=30
# resolution= 0.6

# s_type='NP8-IDC'#'NP9-IDC2'  #
# s_subtype = s_type
# if s_type == 'NP8-IDC':
#     s_sample='20240521_NP-Tissues-ROIs_s'#
#     n_markers=19
# else:
#     n_markers=21# 19#
#     s_sample = '20240522_NP9-IDC2-ROIs_s'#
# df_lei = pd.read_csv(f'{s_sample}_LeidenClustering_neighbors{n_neighbors}_resolution{resolution}_markers{n_markers}_{s_type}.csv',index_col=0)
# df_lei['slide_scene'] = [item.split('_cell')[0] for item in df_lei.index]
# df_lei['leidencelltype2'] = df_lei.leidencelltype3.replace({'tumor':'epithelial','endothelial':'stromal','immune':'stromal'})
# df_lei['celltype1'] = 'all'
# df_lei['countme'] = True

# #leiden
# for s_celltype in ['leidencelltype2','celltype1']: #'celltype3','celltype',
#     for s_cell in df_lei.loc[:,s_celltype].unique():
#         df_cell = df_lei.loc[df_lei.loc[:,s_celltype]==s_cell]
#         df_prop = viz.prop_positive(df_cell,s_cell='annotated',s_grouper='ROI')
#         s_out = f'results_{s_sample}_LeidenClustering_{n_neighbors}_{n_markers}_{resolution}_byPatient_by{s_celltype}_in{s_cell}_{s_type}.csv'
#         df_prop.fillna(0).to_csv(s_out)
#         print(s_out)

## compare tumor, border and normal

In [None]:
from statannotations.Annotator import Annotator
import statsmodels
def annotated_stripplot_hue(df,x,y,hue,figsize,hue_order=None,b_correct=True,s=2):
    '''
    strip plot of y per x category, split by hue, with a mean line per group and,
    for every category, a p-value annotation comparing the first two hue levels
    (independent t-test), default FDR corrected

    df: long-format dataframe
    x: column holding the categories on the x axis
    y: numeric column to plot
    hue: column that splits every category into groups
    figsize: (width, height) handed to plt.subplots
    hue_order: sequence of hue levels, only the first two are compared;
        default None uses the levels in order of appearance in df
    b_correct: True (default) annotates Benjamini-Hochberg corrected values as
        'FDR=...', False annotates the raw values as 'p=...'
    s: marker size of the strip plot
    returns: (fig, ax)
    raises: ValueError if fewer than two hue levels are available
    '''
    # imported here because a plain `import statsmodels` does not load the
    # stats.multitest submodule
    from statsmodels.stats.multitest import multipletests

    order = df.loc[:,x].unique()
    # `is None`: comparing an array-like hue_order with == is ambiguous
    if hue_order is None:
        hue_order = df.loc[:,hue].unique()
    if len(hue_order) < 2:
        raise ValueError(f'need two levels of {hue} to compare, got {list(hue_order)}')
    plotting = {"data":df,"x":x,"y":y,"order":order,
               "hue":hue,"hue_order":hue_order}
    # one comparison per x category: first hue level vs second hue level
    pairs = [((item,hue_order[0]),(item,hue_order[1])) for item in order]

    fig,ax = plt.subplots(dpi=200,figsize=figsize)
    # throw-away figure: the tests are run there so that the default star
    # annotations never appear on the real axis
    f,a = plt.subplots()
    sns.stripplot(**plotting,dodge=True,ax=ax,s=s,alpha=0.7)
    # boxplot reduced to its mean line
    sns.boxplot(**plotting,ax=ax,showmeans=True,medianprops={'visible': False},
                                   whiskerprops={'visible': False},meanline=True,showcaps=False,
                           meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
    annot = Annotator(a,pairs,**plotting,verbose=False)
    # stripplot and boxplot both add legend entries; keep one set
    handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(handles[0:len(hue_order)],legend_labels[0:len(hue_order)],bbox_to_anchor=(1,1),title=hue)
    annot.configure(test='t-test_ind',text_format="star")
    # the test is applied once (it used to be run twice)
    a, test_results = annot.apply_test().annotate()
    plt.close(f)
    annot = Annotator(ax,pairs,**plotting,verbose=False)
    # x category -> raw p-value
    d_pval = {}
    for res in test_results:
        d_pval[res.data.group1[0]] = res.data.pvalue
    pvalues = [d_pval[item] for item in order]
    reject, corrected, __, __ = multipletests(pvalues,method='fdr_bh')
    formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(pvalues)]
    if b_correct:
        formatted_pvalues = [f'FDR={pvalue:.2}' for pvalue in list(corrected)]
    # custom texts are given in the same order as pairs
    annot.set_custom_annotations(formatted_pvalues)
    annot.annotate()
    return(fig, ax)

def name_celltype(str):
    '''
    turn an annotated cell type name into an axis tick label

    str: cell type name (the parameter name shadows the builtin; kept for compatibility)
    returns: the name with long names abbreviated, title-cased, and the
        abbreviations that title-casing lower-cases (CD, ColI, ER+) restored
    '''
    s_label = str
    # abbreviations applied before title-casing
    for s_old, s_new in (('Normal breast luminal','Norm. breast'), ('myoep','myoep.')):
        s_label = s_label.replace(s_old, s_new)
    s_label = s_label.title()
    # repairs applied after title-casing
    for s_old, s_new in (('Cd','CD'), ('Coli','ColI'), ('Er+','ER+'), ('Fb','Fb.')):
        s_label = s_label.replace(s_old, s_new)
    return(s_label)

In [None]:
%matplotlib inline
# For every slide and every results_*.csv of that slide in the working directory:
# strip plot of the cell type fractions per ROI, comparing Tumor with Border and
# (not for NP8-IDC) Tumor with Normal ROIs.
# the directory listing does not change inside the loops; sorted for a stable order
ls_result_file = sorted(s_file for s_file in os.listdir() if 'results_' in s_file)
# savefig does not create missing folders
os.makedirs('figures', exist_ok=True)
for s_type in ['NP9-IDC2','NP8-IDC',]:
    df_file = pd.DataFrame(index=ls_result_file)
    for hue_order in [('Tumor','Border'),('Tumor','Normal')]:
        sns.set_palette('muted')
        if hue_order==('Tumor','Normal'):
            # custom two-color palette for the Tumor vs Normal comparison
            pal = ('#4878d0','#6acc64')
            sns.set_palette(pal)
        for s_file in df_file[df_file.index.str.contains(s_type)].index:
            df=pd.read_csv(s_file,index_col=0)
            # compartment = text between '_in' and the next '_' of the file name
            s_compartment = s_file.split('_in')[1].split('_')[0]
            df_long = df.unstack().reset_index().rename({'level_0':'celltype',0:'Fraction'},axis=1)
            # ROI type = last token of the ROI label without its trailing digit
            df_long['Type'] = [item.split('_')[-1][0:-1] for item in df_long.ROI]
            if s_compartment == 'epithelial':
                figsize=(5,3)
            elif s_compartment == 'stromal':
                figsize=(8,3)
                if s_type=='NP9-IDC2':
                    figsize=(9,3)
            else:
                #continue
                figsize=(12,3)
            if not s_compartment=='nan':
                fig,ax = annotated_stripplot_hue(df_long,'celltype','Fraction','Type',figsize=figsize,
                                                 hue_order=hue_order,b_correct=True,s=8)
                ax.set_ylabel(f'Fraction in {s_compartment}')
                labels = ax.get_xticklabels()
                ax.set_xticklabels([name_celltype(item.get_text()) for item in labels])
                ax.set_title(s_type)
                ax.set_xlabel('')
                plt.tight_layout()
                # the compared groups are part of the file name: without them the
                # Tumor-vs-Normal figure overwrote the Tumor-vs-Border figure
                fig.savefig(f'figures/stripplot_{s_type}_{s_compartment}_{hue_order[0]}-vs-{hue_order[1]}.png')
        # NP8-IDC: only the Tumor vs Border comparison is made
        if s_type == 'NP8-IDC':
            break
    #     break
    #break

In [None]:
# # run this on re-registered images to create crops
# '/home/groups/graylab_share/Chin_Lab/ChinData/Cyclic_Workflow/cmIF_2019-02-01_NPDCIS2/RegisteredImages/ReRegistered-NP9-IDC2'
# from skimage import io, transform
# import skimage
# import numpy as np
# s_file = 'ReRegistered-R1_PCNA.CD8.PD1.CK19_NP9-IDC2-Scene-003_c5_ORG.tif'
# img = io.imread(s_file)
# a_rescale = skimage.exposure.rescale_intensity(img,in_range=(0,1.5*np.quantile(img,0.9999)))
# a_crop = (a_rescale/256).astype(np.uint8)
# a_crop = transform.rescale(a_crop, scale=0.5,preserve_range=True,anti_aliasing=True)
# io.imsave(f'Cropped/{s_file}',a_crop)