## Cluster TMAs

**Samples:** 
- Basel

**Method**:

- use mesmer segmentation data

- leiden cluster on good biomarkers for celltypes

In [None]:
# Load libraries and configure the session (working directories, random seed).
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import scipy
import scanpy as sc
import matplotlib as mpl
from matplotlib import cm
# set the figure.max_open_warning rc parameter to 0 (many figures are opened below)
mpl.rc('figure', max_open_warning = 0)
# root folder of the project; metadata, segmentation output, data/ and results are addressed relative to it
codedir = '/home/groups/graylab_share/OMERO.rdsStore/engje/Data/IMC_Data_publication'#os.getcwd()
import phenograph
# change directory before importing mplex_image
# NOTE(review): presumably the package is importable only from this folder — confirm
os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data')
from mplex_image import visualize as viz, process, preprocess, normalize, mpimage
# seed NumPy's global random number generator
np.random.seed(1221)

In [None]:
# Date stamp used as the name of the output folder (created inside codedir)
# that later cells save figures into via f'{codedir}/{s_date}/...'.
s_date = '20220114'
os.chdir(codedir)
# create the dated folder on first run only
if not os.path.exists(s_date):
    os.mkdir(s_date)
%matplotlib inline

# Table of contents <a name="contents"></a>
1. [Load annotation](#loadold)
2. [load data](#cluster)
3. [load thresholds](#load)
  [gating](#gating)  [colocalization](#coloc)
4. [bg subtract](#leiden)
5. [single cell Umap](#l7)
6. [Leiden celltypes](#l8)
[annotate leiden celltypes](#lbar)
7. [Tissue means](#tissue)
8. [single variable survival](#surv)
9. [subtyping survival](#subt)

# Load annotation <a name="loadold"></a>

[contents](#contents)

In [None]:
# Read the Basel patient metadata and report how many distinct patients there are
# overall and per clinical subtype.
df_a = pd.read_csv(f'{codedir}/BaselTMA/Basel_PatientMetadata.csv')
#basel has 285 patients with overall survival
n_patient = len(df_a.PID.unique())
print(n_patient)
# keep only the first row seen for each PID so every patient is counted once
df_patient = df_a[~df_a.PID.duplicated()]
se_subtype_count = df_patient.groupby('clinical_type').PID.count()
print(se_subtype_count)
print(se_subtype_count.sum())
# exploratory snippets kept for reference
#df_a.columns
#s_col = 'diseasestatus' # == tumor 'Count_Cells' #
#s_id = 238
#df_a.loc[df_a.PID==s_id,[s_col,'TMALocation']]
#df_a.loc[:,s_col].unique()
#df_a[~df_a.PID.duplicated()]
#df_a.OSmonth.dropna()

In [None]:
# Load the staining panel table shared by the Basel and Zurich TMAs and preview it.
#no survival data for zurich
# (the string literal below is disabled code that loaded the Zurich metadata
#  and counted missing OSmonth values)
'''
df_z = pd.read_csv(f'{codedir}/ZurichTMA/Zuri_PatientMetadata.csv',index_col=0)
df_z.OSmonth.isna().sum()
'''
df_s = pd.read_csv(f'{codedir}/Basel_Zuri_StainingPanel.csv')
#df_s.loc[~df_s.loc[:,'Antibody Clone'].isna()].Target
df_s.head()

# Basel TMA  <a name="cluster"></a>

Basel TMA: 279 patients with overall survival and a clinical subtype



| clinical_type | no. |
| --- | ----------- |
| HR+HER2+   |   30 |
| HR+HER2-   |  176 |
| HR-HER2+   |   24 |
| TripleNeg  |   49 |


[contents](#contents)

In [None]:
# Read the per-cell feature table (mean intensity, centroid, shape) written by the
# segmentation step and discard every cell that has a missing value in any column.
segdir = f'{codedir}/Segmentation'
s_features = f'{segdir}/features_IMC_MeanIntensity_Centroid_Shape.csv'
df_mi = pd.read_csv(s_features, index_col=0).dropna()

In [None]:
# show all feature column names in alphabetical order
sorted(df_mi.columns)

In [None]:
#filter subcellular location: pick one compartment (cytoplasm or nuclei) per marker

#150 cells are na in the markers: probably too small for the expansion (get borders erased them)
#os.listdir(f'{codedir}/BaselTMA')
# NOTE: this sorted() result is discarded (only the last expression of a cell is displayed)
sorted(df_mi.columns)
# marker intensities plus nuclear area, eccentricity and centroid coordinates
ls_marker = ['CD20_cytoplasm','CD31_cytoplasm','CD3_cytoplasm','CD44_cytoplasm','CD45_cytoplasm',
             'CD68_cytoplasm','CarbonicAnhydraseIX_cytoplasm', 'Cytokeratin19_cytoplasm','Cytokeratin5_cytoplasm',
              'Cytokeratin7_cytoplasm','Cytokeratin818_cytoplasm','DAPI1_nuclei','EGFR_cytoplasm','ER_nuclei','Ecad_cytoplasm',
             'EpCAM_cytoplasm','Erk12_nuclei','Fibronectin_cytoplasm','GATA3_nuclei','HistoneH3_nuclei','Keratin14_cytoplasm',
             'Ki67_nuclei','ProgesteroneReceptorAB_nuclei','S6_cytoplasm','SMA_cytoplasm', ##'RabbitIgGHL_cell' now ER_nuclei,
             'Slug_nuclei','Sox9_nuclei','Twist_nuclei','Vimentin_cytoplasm','area_nuclei','bCatenin_nuclei',
              'cMyc_nuclei', 'centroid-0_nuclei', 'centroid-1_nuclei','cerbB2Her2_cytoplasm','cleavedPARP_nuclei',
            'eccentricity_nuclei','mTOR_cytoplasm','p53_nuclei','panCytokeratin_cytoplasm',
            ]
# NOTE: also discarded; only len(ls_marker) below is shown as the cell output
sorted(df_mi.columns)
len(ls_marker)

In [None]:
# Reference notes kept as a string literal (nothing here is executed as code):
# the channels/features listed as excluded in Jackson et al. and the markers used there.
'''exclude_channels = ["ImageId" ,"CellId" ,"In115 115InIn115Di","Xe134 134XeXe134Di","Hg202 202HgHg202Di","Pb204 204PbPb204Di",
                     "Pb206 206PbPb206Di","ArAr80 80ArArArAr80Di", "10311239Ru96Di Rutheni","10311240Ru98Di Rutheni",
                     "10311241Ru99Di Rutheni", "10311242Ru100Di Rutheni","10311243Ru101Di Rutheni", "10311244Ru102Di Rutheni",
                     "10311245Ru104Di Rutheni","Xe126 126XeXe126Di","I127 127II127Di","Xe131 131XeXe131Di","Pb207 207PbPb207Di",
                     "Pb208 208PbPb208Di","EulerNumber","MajorAxisLength","MinorAxisLength", "Orientation",
                     "10331253Ir191Di Iridium","Perimeter","Solidity", # excluded in Jackson et al
                   'Extent', 'Percent_Touching', 'Number_Neighbors', #I excluded these three
                   ]
#29 markers used in jackson et al
ls_jackson = ['iridium', 'histone', 'phosphorylated histone', 'CK14', 'CK5', 'CK8/18', 'CK19', 'CK7', 'pan-CK',
'E/P-cadherin', 'ER', 'PR', 'HER2', 'GATA3', 'SMA', 'vimentin', 'fibronectin', 'vWF/ CD31',
'CD44', 'CD45', 'CD68', 'CD3', 'CD20', 'cleaved caspase 3/cleaved PARP',
'carbonic anhydrase', 'phosphorylated S6', 'Ki67', 'p53', 'EGFR', 'area', 'eccentricity']
'''

In [None]:
#len(df_data.Marker.unique()) #34 in watershed

In [None]:
#df_test = df_data.loc[df_data.id.str.contains('BaselTMA_SP41_257'),['id','mc_counts','Marker']]
#df_test.pivot_table(index="id", columns='Marker',  values='mc_counts')
#df_wide = df_data.loc[:,['id','mc_counts','Marker']].pivot_table(index="id",columns='Marker',values='mc_counts')

In [None]:
#watershed df_wide.shape (855668, 34)
#mesmer df_mi.loc[:,ls_marker].shape (618811, 40)

In [None]:
## add slide scene
# The scene key is the part of the cell id before '_cell', with the word "scene" removed.
ls_scene = []
for s_cell_id in df_mi.index:
    s_prefix = s_cell_id.split('_cell')[0]
    ls_scene.append(s_prefix.replace("scene",""))
df_mi['slide_scene'] = ls_scene
# preview the selected markers next to the new key
df_mi.loc[:, ls_marker + ['slide_scene']].head()

In [None]:
#add core
# Build a lookup from the slide_scene key (the last two '_'-separated fields of the
# core name) to the full core name, then label every cell with one vectorized map.
# This gives the same labels as masking the table once per core, without rescanning
# all cells for every core. Cells whose scene matches no core get NaN.
d_scene_core = {}
for s_core in df_a.core.unique():
    ls_part = s_core.split('_')
    d_scene_core[ls_part[-2] + '_' + ls_part[-1]] = s_core
df_mi['core'] = df_mi.slide_scene.map(d_scene_core)
# .copy() makes df_wide independent of df_mi, so the in-place column rename
# applied to df_wide afterwards never acts on a slice of df_mi
df_wide = df_mi.loc[:,ls_marker+['slide_scene','core']].copy()

In [None]:
#rename columns
# Shorten the long marker names and expose the nuclear centroid as DAPI_Y / DAPI_X.
d_replace = {
    # centroid coordinates
    'centroid-0_nuclei': 'DAPI_Y',
    'centroid-1_nuclei': 'DAPI_X',
    # receptors
    'cerbB2Her2_cytoplasm': 'HER2_cytoplasm',
    'ProgesteroneReceptorAB_nuclei': 'PgR_nuclei',
    # keratins
    'panCytokeratin_cytoplasm': 'panCK_cytoplasm',
    'Cytokeratin19_cytoplasm': 'CK19_cytoplasm',
    'Cytokeratin5_cytoplasm': 'CK5_cytoplasm',
    'Cytokeratin7_cytoplasm': 'CK7_cytoplasm',
    'Cytokeratin818_cytoplasm': 'CK8_cytoplasm',
    'Keratin14_cytoplasm': 'CK14_cytoplasm',
    # remaining markers
    'CarbonicAnhydraseIX_cytoplasm': 'CAIX_cytoplasm',
    'Fibronectin_cytoplasm': 'FN_cytoplasm',
    'HistoneH3_nuclei': 'HH3_nuclei',
    'cleavedPARP_nuclei': 'CC3cPARP_nuclei',
    'Vimentin_cytoplasm': 'Vim_cytoplasm',
    'CD31_cytoplasm': 'CD31vWF_cytoplasm',
}
df_wide.rename(columns=d_replace, inplace=True)

In [None]:
# preview the renamed table
df_wide.head()

In [None]:
# confirm the column names after renaming
df_wide.columns

In [None]:
#save
# Write the wide per-cell table once; an existing file is never overwritten.
#s_out = f'{codedir}/data/20220110_BaselTMA_MeanIntensity.csv'
#s_out = f'{codedir}/data/20220112_BaselTMA_MeanIntensity.csv'
s_out = f'{codedir}/data/20220114_BaselTMA_MeanIntensity.csv'
b_saved = os.path.exists(s_out)
if not b_saved:
    print('saving csv')
    df_wide.to_csv(s_out)

## Umap <a name="l7"></a>

umap projection of single cells

[contents](#contents)

In [None]:
# Reload the saved per-cell table, strip the compartment suffix from the marker names
# and attach clinical subtype and patient id through the core name.
os.chdir(f'{codedir}/data')
s_sample = '20220114_BaselTMA'#'20220112_BaselTMA'#'20220110_BaselTMA'#'20211221_BaselTMA'
df_norm = pd.read_csv(f'{s_sample}_MeanIntensity.csv',index_col=0)
#rename columns
# helper expression for drafting d_rename; its value is not stored
dict(zip(df_norm.columns,[item.split('_')[0] for item in df_norm.columns]))
d_rename = {'CD20_cytoplasm': 'CD20', 'CD31vWF_cytoplasm': 'CD31', 'CD3_cytoplasm': 'CD3', 'CD44_cytoplasm': 'CD44',
 'CD45_cytoplasm': 'CD45', 'CD68_cytoplasm': 'CD68', 'CAIX_cytoplasm': 'CAIX', 'CK19_cytoplasm': 'CK19',
 'CK5_cytoplasm': 'CK5', 'CK7_cytoplasm': 'CK7', 'CK8_cytoplasm': 'CK8', 'DAPI1_nuclei': 'DAPI1',
 'EGFR_cytoplasm': 'EGFR', 'Ecad_cytoplasm': 'Ecad', 'EpCAM_cytoplasm': 'EpCAM', 'Erk12_nuclei': 'Erk12',
 'FN_cytoplasm': 'FN', 'GATA3_nuclei': 'GATA3', 'HH3_nuclei': 'HH3', 'CK14_cytoplasm': 'CK14',
 'Ki67_nuclei': 'Ki67', 'PgR_nuclei': 'PgR', #'RabbitIgGHL_cell': 'RbtIgG',
            'S6_cytoplasm': 'S6', 'ER_nuclei': 'ER',
 'SMA_cytoplasm': 'SMA', 'Slug_nuclei': 'Slug', 'Sox9_nuclei': 'Sox9', 'Twist_nuclei': 'Twist',
 'Vim_cytoplasm': 'Vim', 'area_nuclei': 'area', 'bCatenin_nuclei': 'bCatenin', 'cMyc_nuclei': 'cMyc',
 'HER2_cytoplasm': 'HER2', 'CC3cPARP_nuclei': 'CC3_cPARP', 'eccentricity_nuclei': 'eccentricity', 'mTOR_cytoplasm': 'mTOR',
 'p53_nuclei': 'p53', 'panCK_cytoplasm': 'panCK', #'slide_scene': 'slide', #'DAPI_Y': 'DAPI', #'DAPI_X': 'DAPI',
 'core': 'core'}
df_norm.rename(columns=d_rename,inplace=True)
#add annotation
d_core_subtype = dict(zip(df_a.core,df_a.clinical_type))
d_core_patient = dict(zip(df_a.core,df_a.PID))
df_norm['subtype'] = df_norm.core.map(d_core_subtype)
df_norm['Patient'] = df_norm.core.map(d_core_patient)
# drop cells with any missing value (including those without a subtype or patient)
df_norm = df_norm.dropna()

In [None]:
# list the columns available for clustering and annotation
df_norm.columns

## cluster all cells

TNBC and ER+

In [None]:
# Marker panel used for clustering: 20 markers plus nuclear area (21 features).
# Earlier 33/37/38-feature panels were superseded by this list; markers left out on
# purpose are noted inline.
ls_col = [ 'CD31', 'CD44', 'CD45','CD20','CD3',#'Area',  'nCD3' noisy,#'CAIX' weird,
        'CK14','CK5','EGFR', 'CK19',  'CK8','CK7','panCK','Ecad',  #'Eccentricity','?CD68','weakCK7',
       'Ki67','ER','PgR','CD68','FN', 'SMA', #'Slug',#'GATA3',weak and bright noise 
        'Vim','area'] #'cMyc','p53','pmTOR',  'pHH3', 'pS6' 'Twist',
# cells previously flagged as necrotic are left out of the clustering input
df_exclude = pd.read_csv('exclude_necrotic_BaselTMA_TripleNeg.csv',index_col=0)
print(len(ls_col))
adata = sc.AnnData(df_norm.loc[~df_norm.index.isin(df_exclude.index),ls_col])
# per-cell annotation; the Series are aligned to adata.obs by cell id
adata.obs['TMA'] = 'Basel'
adata.obs['slide_scene'] = df_norm.core
adata.obs['Patient'] = df_norm.Patient
adata.obs['subtype'] = df_norm.subtype.replace({'HR+HER2-':'ER+', 'TripleNeg':'TNBC'})
#two subtypes
# .copy() turns the subset into a standalone AnnData rather than a view of the full
# object, since the following cells modify it (raw, PCA, scaling, neighbors)
adata = adata[adata.obs.subtype.isin(['TNBC', 'ER+'])].copy()


In [None]:
# check which subtype labels remain after filtering
adata.obs['subtype'].unique()

In [None]:
#preprocess
# Keep the current matrix in adata.raw, run PCA, plot diagnostics, then optionally scale.
# NOTE(review): this cell is not idempotent — each run with b_scale=True scales adata
# again and appends another '_s' to s_sample.
b_scale = True
adata.raw = adata
#reduce dimensionality
# PCA runs on the matrix before scaling (scaling only happens at the end of the cell)
sc.tl.pca(adata, svd_solver='auto')
fig,ax=plt.subplots(figsize=(3.5,5))
sc.pl.highest_expr_genes(adata, n_top=48,ax=ax,save=f'{s_sample}_Expression_{len(ls_col)}.png')
plt.tight_layout()
# NOTE(review): n_pcs=37 is larger than the number of features in ls_col — confirm intended
sc.pl.pca_variance_ratio(adata,n_pcs=37, log=True)
if b_scale:
    # scale without centering and clip values at 20
    sc.pp.scale(adata, zero_center=False, max_value=20)
    # the '_s' suffix marks file names derived from scaled data
    s_sample = s_sample + '_s'

In [None]:
# Name of the cached UMAP result for the current sample, neighbor count and panel size.
n_neighbors = 30
results_file = f'{s_sample}_{n_neighbors}neighbors_{len(ls_col)}markers.h5ad'
# display the file name
results_file #= f'{s_sample}_{n_neighbors}neighbors_{len(ls_col)}markers.h5ad'

In [None]:
#umap
# Compute (or reload from cache) the neighbor graph and UMAP embedding, then save
# UMAP plots colored by each marker and by TMA, subtype and Patient.
#s_sample = '20220112_BaselTMA'#'20220110_BaselTMA'#'20211221_BaselTMA'
# same relabelling as used when adata was first built, so cached and freshly
# computed objects carry identical subtype labels
d_subtype = {'HR+HER2-':'ER+', 'TripleNeg':'TNBC'}
for n_neighbors in [30,15]:
    results_file = f'{s_sample}_{n_neighbors}neighbors_{len(ls_col)}markers.h5ad'
    if not os.path.exists(results_file):
        print('calc umap')
        # calculate neighbors 
        sc.pp.neighbors(adata, n_neighbors=n_neighbors) #, method='rapids'
        sc.tl.umap(adata)
        #save results
        adata.write(results_file)
    else:
        print('loading umap')
        adata = sc.read_h5ad(results_file)
        adata.obs['TMA'] = 'Basel'
        adata.obs['slide_scene'] = df_norm.core
        adata.obs['subtype'] = df_norm.subtype.replace(d_subtype)
        adata.obs['Patient'] = df_norm.Patient#.astype('int').astype('str')
    #color by markers
    figname = f"Umap_{s_sample}_markers_{n_neighbors}neighbors_{len(ls_col)}markers.png"
    sc.pl.umap(adata, color=ls_col,vmin='p1.5',vmax='p98.5',save=figname,ncols=6)
    #color by TMA, subtype and patient (one small figure each)
    for s_color in ['TMA','subtype','Patient']:
        figname = f"Umap_{s_sample}_{s_color}_{n_neighbors}neighbors_{len(ls_col)}markers.png"
        fig,ax = plt.subplots(figsize=(3,2), dpi=200)
        sc.pl.umap(adata, color=s_color,wspace=.25,save=figname,ax=ax)

    # only the first setting (30 neighbors) is run; remove the break to also run 15
    break


In [None]:
# Leiden clustering on the neighbor graph, cached on disk by sample, neighbor count,
# panel size, subtype set and resolution.
s_type = '2sub'
resolution = 0.6
n_markers = 21#len(ls_col) #20
results_file = f'{s_sample}_{n_neighbors}neighbors_{n_markers}markers_{s_type}_leiden{resolution}.h5ad'
if os.path.exists(results_file):
    print('loading leiden')
    adata = sc.read_h5ad(results_file)
else:
    print('clustering')
    sc.tl.leiden(adata,resolution=resolution)
    adata.write(results_file)

#fig,ax = plt.subplots(figsize=(3,2),dpi=200)
#figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_legend.png'
#sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '))

# UMAP colored by cluster, without a legend; the title is the file name with spaces
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}.png'
s_title = figname.split('.png')[0].replace('_',' ')
sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=s_title,legend_loc=None)
#

In [None]:
#small cells
#adata.obs['test'] = (adata.to_df().Area > 10).replace({True:1,False:0})
#sc.pl.umap(adata, color='test',title='merged')

## cluster within subtype

Skipped: ER+ and TNBC cells are clustered together (see above).

## Annotate Leiden <a name="lbar"></a>

Annotate epithelial, immune and stromal cell types.

also visualize cluster results on tissue scatter plots.


[contents](#contents)

In [None]:
#load
# Reload a saved Leiden result; the parameters below select which cached file is read.
os.chdir(f'{codedir}/data')
s_sample = '20220114_BaselTMA_s' #'20220112_BaselTMA' #'20220110_BaselTMA' #'20211221_BaselTMA'
n_neighbors = 30 #
n_markers = 21 # 29 #
s_type = '2sub'#'TripleNeg'#'HR+HER2-'#
resolution = 0.6 #23c # 0.4 #18c #  0.5 # 20c
results_file = f'{s_sample}_{n_neighbors}neighbors_{n_markers}markers_{s_type}_leiden{resolution}.h5ad'
adata = sc.read_h5ad(results_file)

In [None]:
# UMAP colored by Leiden cluster, this time with the default legend.
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_legend.png'
# the plot title is the file name with underscores replaced by spaces
sc.pl.umap(adata, color='leiden',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '))

# display which cached result is loaded
results_file

In [None]:
#annotate
# Assign Leiden cluster ids to epithelial (tum_clust), stromal (str_clust), immune
# (imm_clust) and endothelial (endo_clust) groups. The assignment is hand-curated for
# each saved clustering result and selected by the result file name.
# NOTE(review): if results_file matches none of the names below, tum_clust etc. stay
# undefined and the last two lines of this cell raise NameError.
ls_drop = []
# expression matrix as a DataFrame plus the cluster label of each cell
df_p = adata.to_df()
df_p['leiden'] = adata.obs['leiden']
if results_file =='20220110_BaselTMA_30neighbors_19markers_TripleNeg_leiden0.4.h5ad':
    tum_clust = ['18', '9', '14', '3', '16', '5', '10', '4', '12', '13', '8','11', '6', ] #
    str_clust = ['17','1', '2', '7'] 
    imm_clust = ['15','0'] 
    endo_clust = ['7']
if results_file =='20220110_BaselTMA_30neighbors_18markers_TripleNeg_leiden0.4.h5ad':
    tum_clust = ['17', '9', '12', '3', '14', '8', '7', '4', '13', '10',] 
    str_clust = ['0', '15', '2', '6','11'] 
    imm_clust = ['1','18','16'] 
    endo_clust = ['5'] 
if results_file =='20220110_BaselTMA_30neighbors_18markers_TripleNeg_leiden0.5.h5ad':
    tum_clust = ['23', '11', '17', '3', '21', '12', '22', '15', '18', '10',
                  '4', '19', '16',  '24', '14','7',] 
    str_clust = ['0',  '13', '2', '20',  '6',   '9'] #7 might be epithelial
    imm_clust = ['1','5',] 
    endo_clust = ['8',]
    ls_drop = ['24']
if results_file == '20220112_BaselTMA_30neighbors_19markers_TripleNeg_leiden0.5.h5ad':
    tum_clust = ['23', '24', '7', '13', '22', '21', '16', '20', '18', '10',
                  '11', '17', '3', '5', '14', '15', '19',  '8']  #'9',?
    str_clust = ['0',  '12', '2', '6', '9'] 
    imm_clust = ['4'] 
    endo_clust = ['1']
    ls_drop = [] 
if results_file == '20220112_BaselTMA_30neighbors_20markers_TripleNeg_leiden0.5.h5ad':
    tum_clust = ['21', '23', '7', '12', '19', '18', '16', '8', '15', '10',
                  '22', '3', '14','5', '17', '4', '13']  #5 is proliferating
    str_clust = ['0', '11',  '2', '1', '20'] 
    imm_clust = ['6'] 
    endo_clust = ['9']
    ls_drop = [] 
if results_file =='20220114_BaselTMA_s_30neighbors_20markers_2sub_leiden0.5.h5ad':
    tum_clust = ['19', '10', '9',  '18', '13', '5', '22', '6', '8', '21',
                  '16', '12', '0', '11', '17','20','23',] # #'14', FN+ also some epithelial :/ #  '15' endo
    str_clust = ['1', '14', '3', '4', '7'] 
    imm_clust = ['24','2'] #24 is b cells, 2 is t cells
    endo_clust = ['15']
    ls_drop = [] 
if results_file =='20220114_BaselTMA_s_30neighbors_21markers_2sub_leiden0.6.h5ad':
    # the result used for the downstream analysis; besides the group assignment,
    # several clusters are merged or split by hand below
    print('good one')
    tum_clust = ['19', '13', '7', '6', '15', '21', '11', '9', '16', '23', '5',
                  '1', '10', '20','22','9b']
    str_clust = ['0', '12', '18',  '3', '4', '8']
    imm_clust = ['17','2']
    endo_clust = ['14']        
    ls_drop = [] 
    #CD3 bg in EGFR high
    print('subCD3')
    # cap CD3 at 3 within cluster 23
    df_p.loc[df_p.leiden=='23','CD3'] = np.clip(a=df_p.loc[df_p.leiden=='23','CD3'],a_min=0,a_max=3)
    #merge ER+
    ls_merge = ['22','7']
    df_p.loc[df_p.leiden.isin(ls_merge),'leiden'] = '7'
    #notes: Split basal and myoep, Split CD20 and Tcell, Combine luminal tumor
    #split cd3 out of 17
    # cells of cluster 17 with CD20 below 2 are moved to cluster 2
    df_p.loc[((df_p.leiden=='17') & (df_p.CD20<2)),'leiden'] = '2'
    #merge luminal t.
    ls_merge = ['13','6']
    df_p.loc[df_p.leiden.isin(ls_merge),'leiden'] = '6'
    #basal and myoep split
    # cells of cluster 9 whose CK14/SMA ratio exceeds 2 become the new cluster '9b'
    df_test = df_p.loc[(df_p.leiden=='9')][(df_p.loc[(df_p.leiden=='9'),'CK14']/df_p.loc[(df_p.leiden=='9'),'SMA']) > 2]
    # convert the label column to plain strings before assigning the new label '9b'
    df_p['leiden'] = df_p.leiden.astype('str')
    df_p.loc[df_test.index,'leiden'] = '9b'
    
# clusters ordered by mean panCK (this value is not displayed; only the last expression is)
df_p.groupby('leiden').mean().panCK.sort_values(ascending=False).index
# clusters that are in neither the epithelial nor the immune list
set(df_p.groupby('leiden').mean().CK19.sort_values(ascending=False).index) - set(tum_clust + imm_clust )
#set(tum_clust)

In [None]:
#clustermap
# Heatmap of mean marker values per Leiden cluster (z-scored per marker, Ward linkage).
# The resulting column and row order is kept for later plots.
b_annot = False #True

d_replace = {}
# keep the float32 marker columns plus the cluster label, then average per cluster
b_keep = (df_p.dtypes=='float32') | (df_p.columns=='leiden')
df_mean = df_p.loc[:,b_keep].groupby('leiden').mean()
if b_annot:
    # tag each cluster id with its cell-type group: ep / im / s / en
    for ls_clust, s_tag in [(tum_clust,'ep'),(imm_clust,'im'),(str_clust,'s'),(endo_clust,'en')]:
        d_replace.update({item: f'{item}\n{s_tag}' for item in ls_clust})
    df_plot = df_mean.dropna().drop(ls_drop).rename(d_replace)
else:
    df_plot = df_mean.drop(ls_drop).dropna()
df_plot.index.name = f'leiden {resolution}'
# drop markers whose cluster means sum to zero or less
b_signal = df_plot.sum() > 0
df_plot = df_plot.loc[:,b_signal]
g = sns.clustermap(df_plot,z_score=1,figsize=(6,8),cmap='viridis',dendrogram_ratio=0.1,
                   vmin=-2,vmax=2,cbar_pos=(.01, .9, .05, .1),method='ward')
s_figure = f'{codedir}/{s_date}/{s_sample}_clustermap_{n_neighbors}_{n_markers}markers_{s_type}_leiden{resolution}.png'
g.savefig(s_figure,dpi=200)
# marker (column) and cluster (row) order as arranged by the dendrograms
marker_genes = df_plot.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()
categories_order = df_plot.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()
#scatterplots
#sns.scatterplot(data=df_p.loc[df_p.leiden=='17'], x='CD3', y='CD20',s=1)
#df_test = (df_p.loc[(df_p.leiden=='9'),'CK14']/df_p.loc[(df_p.leiden=='9'),'SMA']) > 2
#print(df_test.sum())
#sns.scatterplot(data=df_p.loc[df_p.leiden=='9'], x='SMA', y='CK14',s=1,hue=df_test)

In [None]:
# Horizontal bar chart of the number of cells per cluster, listed in the reverse of
# the clustermap row order.
fig,ax=plt.subplots(figsize=(1.4,3.7),dpi=200)
df_count = df_p.groupby('leiden').count()
# any single column serves as the per-cluster count; the second one is used
se_cells = df_count.loc[categories_order[::-1]].iloc[:,1]
se_cells.plot(kind='barh',title='Cell No.',ax=ax,width=0.7)
plt.tight_layout()
s_figure = f'{codedir}/{s_date}/barplot_{s_sample}_{n_neighbors}neighbors_{len(marker_genes)}markers_leiden{resolution}_{s_type}.png'
fig.savefig(s_figure)

In [None]:
# Working notes from earlier clustering runs (cluster ids refer to those runs):
#TNBC 24 is CD68 hot pixels. 23 is real bright ck staining, based on 11: only 8 sample have appreciable ER...why??? because these are TN!, looks like rbt IgG is ER. use nuclear and rename/cluster
#21 is EGFR hi, real #20 is real, Vim hi, 19 real ck9 hi, 18 real ck14 hi #16 had high ER background, 14 has high ER bg but its not nuclear
#13 168_X10Y5 and 137_X9Y3 and 64_X13Y4 is necrotic
#nice cd45 111_X8Y6, ER+ nirm duct
## TNBC no hot 20? #18 24? 16? #20 is ER+ tumor, maybe also non-specific ER
##TNBC no necrotic 15-23 only 1 tissue
# derive the scene key from the cell id (text before '_cell', with "scene" removed)
df_p['core'] = [item.split('_cell')[0].replace('scene','') for item in df_p.index]
# cluster to inspect; s_clust is also read by the spatial scatter plot cell
s_clust = '9'
#df_p['core'] = df_p.merge(df_norm.loc[:,['core']],left_index=True,right_index=True).core
# the ten cores contributing the most cells to the selected cluster
df_p.groupby(['leiden','core']).count().loc[s_clust].iloc[:,1].sort_values(ascending=False)[0:10]

In [None]:
#df_exclude
# Disabled code kept as a string literal: it wrote the necrotic-cell exclusion list
# (cluster 13 in three named cores of an earlier clustering result) to csv.
'''
if results_file == '20220110_BaselTMA_30neighbors_18markers_TripleNeg_leiden0.5.h5ad':
    print('excluding')
    df_exclude = df_p[((df_p.leiden=='13') & (df_p.core.isin(['168_X10Y5','137_X9Y3','64_X13Y4'])))] # necrotic | (df_p.leiden=='24')
    df_exclude.to_csv('exclude_necrotic_BaselTMA_TripleNeg.csv')'''

In [None]:
# Highlight a chosen set of clusters on the UMAP (1 = in the set, 0 = not).
#s_scene = '107_X4Y7'
#adata.obs['test'] = (adata.obs.slide_scene.str.contains(s_scene)).replace({True:1,False:0})
#sc.pl.umap(adata, color='test',title=s_scene)
#  
# NOTE: this first assignment is immediately overwritten by the next line
ls_merge= tum_clust # str_clust + endo_clust + imm_clust #
ls_merge=['0']
adata.obs['test'] = (adata.obs.leiden.isin(ls_merge)).replace({True:1,False:0})
sc.pl.umap(adata, color='test',title='merged')

In [None]:
%matplotlib inline
#plot all groups spatially - leiden 
from mplex_image import analyze
# 40-color palette built from two 20-color matplotlib colormaps
colors = mpl.cm.tab20b.colors + mpl.cm.tab20c.colors
# NOTE(review): presumably returns one boolean column per leiden label — confirm in mplex_image.analyze
df_pos = analyze.celltype_to_bool(df_p,'leiden')
# make sure the column labels are strings
df_pos.columns = [str(item) for item in df_pos.columns]

In [None]:

# Scatter plot of cell positions in one tissue core, with the cells of one cluster
# highlighted. As written, the loops are short-circuited for manual inspection:
# s_slide is overridden with a fixed core, s_color with s_clust, and both loops
# break after the first pass.
for s_slide in sorted(set(df_p.core)):
    s_slide = '81_X11Y4'#'79_X8Y4'#'111_X8Y6'#'103_X5Y4'#'118_X13Y5'#'241_X7Y3'#'92_X12Y7'#'77_X3Y8'#'23_X15Y7'#'129_X7Y8'#'116_X4Y4'#'111_X8Y6'#'176_X3Y7'#'111_X8Y6'#'129_X7Y8'#'189_X5Y5'#'214_X15Y6'#'229_X1Y7'#'236_X11Y7'#'64_X13Y4'#'137_X9Y3'#'77_X3Y8'#'168_X10Y5'#'114_X13Y4'#'103_X5Y4'#'81_X11Y4' #'270_X6Y8'#'103_X5Y4'#'241_X7Y3'#'179_X13Y8'#'135_X8Y5' #'110_X14Y8'#'254_X11Y6'
    fig,ax = plt.subplots(figsize=(5,4.5),dpi=200)
    #plot negative cells
    # all cells of the core in silver as background
    df_scene = df_norm[df_norm.slide_scene==s_slide]
    ax.scatter(data=df_scene,x='DAPI_X',y='DAPI_Y',color='silver',s=0.2,label=f'')
    #for idxs, s_color_int in enumerate(range(len(df_pos.columns))):
    for idxs, s_color in enumerate(df_pos.columns):
        # override: always plot the cluster chosen in s_clust
        s_color = s_clust
        s_color = str(s_color)
        if len(df_p[(df_p.core==s_slide) & (df_pos.loc[:,s_color])])>=1:
            #plot positive cells
            ls_index = df_p[(df_p.core==s_slide) & (df_pos.loc[:,s_color])].index
            ax.scatter(data=df_norm.loc[ls_index],x='DAPI_X',y='DAPI_Y',label=f'{s_color}',s=0.3,color=colors[idxs])
        break
    ax.set_title(f"{s_slide}", fontsize=16) # \n {d_a[s_slide]}
    ax.axis('equal')
    # flip the y axis
    ax.set_ylim(ax.get_ylim()[::-1])
    #ax.set_xticklabels('')
    #ax.set_yticklabels('')
    #break
    plt.legend(markerscale=10,framealpha=.5) 
    #fig.savefig(f'{codedir}/{s_date}/{s_slide}_leiden{resolution}_scatterplot.png')
    break

In [None]:
#one by one
# Show each stromal cluster separately on the UMAP (1 = cell in the cluster).
# NOTE: the loop reuses the names s_clust and ls_merge, so their values from
# earlier cells are overwritten.
for s_clust in str_clust:
    ls_merge = [s_clust]
    adata.obs['test'] = (adata.obs.leiden.isin(ls_merge)).replace({True:1,False:0})
    sc.pl.umap(adata, color='test',title=s_clust)#'merged')

In [None]:
#threshold on umap
# Show on the UMAP which cells pass each manual gating threshold (1 = passes).
#df_p.loc[((df_p.Vim>1) | (df_p.FN>1)),'celltype'] = 'str.'
#df_p.loc[df_p.CD31>2,'celltype'] = 'str.'
#df_p.loc[df_p.Ecad > .4,'celltype'] = 'epithelial'
#df_p.loc[df_p.CD45>1.2,'celltype'] = 'imm.'
ls_gate = [
    ('CD31', df_p.CD31>2),
    ('CD45', df_p.CD45>1.2),
    ('Ecad', df_p.Ecad>.4),
    #('merged', df_p.panCK>1),
    ('Vim FN', (df_p.Vim>1) | (df_p.FN>1)),
]
for s_title, b_gate in ls_gate:
    # flag the cells of adata whose id is among the gated cells of df_p
    b_in_gate = adata.obs.index.isin(df_p[b_gate].index)
    adata.obs['test'] = pd.Series(b_in_gate).replace({True:1,False:0}).values
    sc.pl.umap(adata, color='test',title=s_title)

In [None]:
#artifacts
# display the current result file name; the commented lines below exported artifact
# clusters from earlier runs to exclusion csv files
results_file
#df_p[df_p.leiden.isin(['19'])].to_csv('exclude_20211221_BaselTMA_31neighbors_29markers_TripleNeg_leiden0.4.csv') #'18',
#df_p[df_p.leiden.isin(['18','19','20','21'])].to_csv('exclude_20211221_BaselTMA_31neighbors_29markers_HR+HER2-_leiden0.4.csv') #16, 17 not

In [None]:
%matplotlib inline
# Three-class cell type from the Leiden annotation: epithelial, stromal ('str.',
# which here includes endothelial clusters) and immune ('imm.').
# Later assignments overwrite earlier ones for a cluster listed in more than one group.
df_p.loc[df_p.leiden.isin(tum_clust),'leidencelltype3'] = 'epithelial'
df_p.loc[df_p.leiden.isin(str_clust),'leidencelltype3'] = 'str.'
df_p.loc[df_p.leiden.isin(endo_clust),'leidencelltype3'] = 'str.'
df_p.loc[df_p.leiden.isin(imm_clust),'leidencelltype3'] = 'imm.'
adata.obs['leidencelltype3'] = df_p.leidencelltype3
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_leidencelltype3.png'
# labels are drawn on the embedding instead of in a legend
sc.pl.umap(adata, color='leidencelltype3',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc='on data')

In [None]:
# Four-class cell type from the Leiden annotation. Groups are applied in this order,
# so a cluster listed in several groups ends up with the last matching label.
ls_group = [
    ('epithelial', tum_clust),
    ('stromal', str_clust),
    ('endothelial', endo_clust),
    ('immune', imm_clust),
]
for s_label, ls_clust in ls_group:
    df_p.loc[df_p.leiden.isin(ls_clust),'leidencelltype4'] = s_label
adata.obs['leidencelltype4'] = df_p.leidencelltype4
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_leidencelltype4.png'
s_title = figname.split('.png')[0].replace('_',' ')
sc.pl.umap(adata, color='leidencelltype4',ax=ax,save=figname,title=s_title,legend_loc='on data')

In [None]:
#manual gating
# Three-class gating on marker thresholds. The first matching rule wins:
#   CD45 > 1.2 -> 'imm.', else Ecad > .4 -> 'epithelial', everything else -> 'str.'
# (CD31 > 2 and Vim/FN > 1 are listed explicitly even though they equal the default).
# The labels are built in one step as strings, which avoids writing strings into a
# float NaN column.
df_p['gatedcelltype3'] = np.select(
    [df_p.CD45>1.2, df_p.Ecad > .4, df_p.CD31>2, ((df_p.Vim>1) | (df_p.FN>1))],
    ['imm.', 'epithelial', 'str.', 'str.'],
    default='str.')
adata.obs['gatedcelltype3'] = df_p.gatedcelltype3
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_gatingcelltype3.png'
sc.pl.umap(adata, color='gatedcelltype3',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc='on data')

In [None]:
#manual gating
# Five-class gating on marker thresholds. The first matching rule wins:
#   CD45 > 1.2 -> 'immune', else Ecad > .4 -> 'epithelial', else CD31 > 2 -> 'endothelial',
#   else Vim > 1 or FN > 1 -> 'fibroblast', everything else -> 'stromal'.
# The labels are built in one step as strings, which avoids writing strings into a
# float NaN column.
df_p['gatedcelltype5'] = np.select(
    [df_p.CD45>1.2, df_p.Ecad > .4, df_p.CD31>2, ((df_p.Vim>1) | (df_p.FN>1))],
    ['immune', 'epithelial', 'endothelial', 'fibroblast'],
    default='stromal')
adata.obs['gatedcelltype5'] = df_p.gatedcelltype5
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_gatingcelltype5.png'
sc.pl.umap(adata, color='gatedcelltype5',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc='on data')

In [None]:
#manual gating (4 cell types)
# The first matching rule wins:
#   CD45 > 1.2 -> 'immune', else Ecad > .4 -> 'epithelial', else CD31 > 2 -> 'endothelial',
#   everything else (including Vim > 1 or FN > 1) -> 'stromal'.
# The labels are built in one step as strings, which avoids writing strings into a
# float NaN column.
df_p['celltype'] = np.select(
    [df_p.CD45>1.2, df_p.Ecad > .4, df_p.CD31>2, ((df_p.Vim>1) | (df_p.FN>1))],
    ['immune', 'epithelial', 'endothelial', 'stromal'],
    default='stromal')
adata.obs['celltype'] = df_p.celltype
fig,ax = plt.subplots(figsize=(3,2),dpi=200)
figname=f'leiden_{n_neighbors}_{n_markers}_{resolution}_{s_type}_gatingcelltype.png'
sc.pl.umap(adata, color='celltype',ax=ax,save=figname,title=figname.split('.png')[0].replace('_',' '),legend_loc='on data')

In [None]:
#save
# Write the per-cell table (marker values, Leiden labels, gated cell types) once,
# replacing the short scene key in 'core' with the full core name from the metadata.
s_out = f'{s_sample}_LeidenClusteringGating_neighbors{n_neighbors}_resolution{resolution}_markers{n_markers}_{s_type}.csv'

if not os.path.exists(s_out):
    print('saving csv')
    # lookup: cell-id prefix "<field 2>_scene<field 3>" -> full core name
    d_prefix_core = {}
    for s_core in df_a.core.unique():
        ls_part = s_core.split('_')
        d_prefix_core[f"{ls_part[2]}_scene{ls_part[3]}"] = s_core
    # Match on the exact text before '_cell' in the cell id. A substring search would
    # also hit longer ids that merely contain the key (e.g. '1_sceneX1Y1' inside
    # '11_sceneX1Y1_cell5') and could attach the wrong core.
    se_prefix = pd.Series(df_p.index.str.split('_cell').str[0], index=df_p.index)
    se_core = se_prefix.map(d_prefix_core)
    if 'core' in df_p.columns:
        # cells matching no core in the metadata keep their current value
        se_core = se_core.fillna(df_p['core'])
    df_p['core'] = se_core
    df_p.to_csv(s_out)

In [None]:
# NOTE: the unique labels are not displayed; only the last expression (s_out) is shown
df_p.leiden.unique()
s_out

## Summarize Tissue Variables <a name="tissue"></a>

per patient means

make sure to only use cells included in leiden clustering


look at colocalization in leiden clusters

[contents](#contents)

In [None]:
# show the current working directory (relies on IPython automagic)
pwd

In [None]:
#20220114_BaselTMAan_LeidenClusteringGating_neighbors30_resolution0.6_markers21_GatedCellTypes.csv

In [None]:
#leiden cell types (fraction)
# Load the saved per-cell Leiden/gating table and add the grouping columns used for
# the per-patient summaries.
n_neighbors= 30
resolution= 0.6 #0.5 
n_markers= 21#18 #29
s_type='2sub'# 'TripleNeg'
s_sample = '20220114_BaselTMA_s' #'20220110_BaselTMA' #'20211221_BaselTMA'
s_subtype = s_type

s_lei = f'{codedir}/data/{s_sample}_LeidenClusteringGating_neighbors{n_neighbors}_resolution{resolution}_markers{n_markers}_{s_type}.csv'
df_lei = pd.read_csv(s_lei,index_col=0)
# files that use the column name 'celltype3' are mapped to the current name
df_lei.rename(columns={'celltype3':'leidencelltype3'},inplace=True)
# two-class version: epithelial vs. everything else (stromal)
d_two_class = {'tumor':'epithelial','endothelial':'stromal','immune':'stromal','str.':'stromal','imm.':'stromal'}
df_lei['leidencelltype2'] = df_lei.leidencelltype3.replace(d_two_class)
#df_lei['gated_celltype3'] = df_lei.celltype3.replace({'tumor':'epithelial','endothelial':'stromal','fibroblast':'stromal','str.':'stromal','imm.':'immune'})
# constant columns: a single 'all' group and a True flag for every cell
df_lei['celltype1'] = 'all'
df_lei['countme'] = True
# cluster labels are read back as numbers or text; force text
df_lei['leiden'] = df_lei['leiden'].astype('str')

In [None]:
df_lei.head()

In [None]:
# Resolve each core label in df_lei to the full core name used in the patient
# metadata, keep tumor cores only, and attach the patient id.
for s_core in df_lei.core.dropna().unique():
    # literal substring match (no regex); missing metadata core names never match
    b_match = df_a.core.str.contains(s_core, regex=False, na=False)
    if not b_match.any():
        raise ValueError(f'no core in the patient metadata matches {s_core!r}')
    s_core_name = df_a.loc[b_match,'core'].values[0]
    df_lei.loc[df_lei.core==s_core,'core'] = s_core_name
#drop 
df_lei['type'] = df_lei.core.map(dict(zip(df_a.core,df_a.diseasestatus)))
# .copy() so that adding columns below does not act on a slice of the unfiltered table
df_lei = df_lei[df_lei.type=='tumor'].copy()
df_lei['Patient'] = df_lei.core.map(dict(zip(df_a.core,df_a.PID)))
an_array = df_lei.Patient.unique()
# All patients are used for discovery, for every s_type. An earlier random half split
# (np.random.choice over an_array, size=int(n/2), replace=False) is kept here for
# reference only; for 'TripleNeg' it selected:
#   [191,  23, 179,  64,  52,  81, 229,  33, 110,  61, 115,  59, 168,
#      8, 116,  36, 189, 172, 241, 214,  15, 187, 114, 145]
random_rows = df_lei.Patient.unique().tolist()
# patients left for validation (empty while all patients are used above)
random_validation = set(an_array) - set(random_rows)

In [None]:
# result files written below use the sample name without the '_s' suffix
s_sample = '20220114_BaselTMA'

In [None]:
#gated celltypes (fraction)
# For every cell-type column, compute the per-patient table with viz.prop_positive
# and write it to a results csv in codedir.

#s_sample = s_sample.replace('_s','')# = '20220114_BaselTMA'
s_grouper='Patient'
ls_cell = ['leidencelltype3','leidencelltype4','gatedcelltype3','gatedcelltype5','celltype']#,'celltype'
# restrict to the selected patients
df_test = df_lei[df_lei.Patient.isin(random_rows)]
for s_cell in ls_cell:
    df_prop = viz.prop_positive(df_test,s_cell=s_cell,s_grouper=s_grouper)
    s_out = f'results_{s_sample}_GatedCellTypes_by{s_grouper}_by{s_cell}_{s_subtype}.csv'
    print(s_out)
    s_path = f'{codedir}/{s_out}'
    df_prop.to_csv(s_path)

In [None]:
#leiden
# Per-patient Leiden cluster tables, computed separately within each value of the
# two-class cell type and within all cells; missing values are written as 0.

df_test = df_lei[df_lei.Patient.isin(random_rows)]
for s_celltype in ['leidencelltype2','celltype1']: #'celltype3','celltype',
    se_group = df_test.loc[:,s_celltype]
    for s_cell in se_group.unique():
        # cells belonging to this group only
        df_cell = df_test.loc[se_group==s_cell]
        df_prop = viz.prop_positive(df_cell,s_cell='leiden',s_grouper='Patient')
        s_out = f'results_{s_sample}_LeidenClustering_{n_neighbors}_{n_markers}_{resolution}_byPatient_by{s_celltype}_in{s_cell}_{s_type}.csv'
        s_path = f'{codedir}/{s_out}'
        df_prop.fillna(0).to_csv(s_path)
        print(s_out)


## survival analysis <a name="surv"></a>


- single variable

- subtypes

[contents](#contents)


In [None]:
import lifelines
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test
from lifelines import CoxPHFitter
import warnings

# (Re)load the Basel patient metadata table into df_a.
df_a = pd.read_csv(f'{codedir}/BaselTMA/Basel_PatientMetadata.csv')
#os.mkdir('Survival_Plots')

In [None]:
#example file names 'results_20211207_JP-TMA1_BGSubtractedMeanIntensity_byPatient_bycelltype3_inimmune_TNBC.csv',
#'results_20211207_JP-TMA1_GatedCellTypes_M1M2_byPatient_bycelltype_TNBC.csv',
#'results_20211207_JP-TMA1_LeidenClustering_30_33_0.4_byPatient_bycelltype1_inall_ER+.csv']

#preprocess.dchange_fname({'_byTumorDiffPlus_M2':'_byTumorDiffPlusM2'},b_test=False)
#preprocess.dchange_fname({'_byTumorDiffPlus_M1':'_byTumorDiffPlusM1'},b_test=False)

s_sample = '20220114_BaselTMA'#'20220110_BaselTMA' #'20211221_BaselTMA'

# Build df_file: one row per results CSV found in codedir (index = file name),
# with the analysis metadata parsed from the underscore-separated file name:
#   subtype   - last token
#   type      - kind of result, chosen by which file-name prefix matched
#   partition - 'by<...>' token third from the end, or the constant 'gating'
#   cell      - 'in<...>' / 'by<...>' token second from the end
# Rows are collected in lists and the frame is built once, rather than being
# grown cell by cell with .loc.
ls_file = []
ls_record = []
for s_file in os.listdir(codedir):
    ls_token = s_file.split('.csv')[0].split('_')
    # split(..., 1) strips only the leading 'by'/'in' marker, so a name that
    # contains the marker again (e.g. 'in' inside a word) is not truncated
    if f'results_{s_sample}_BGSubtractedMeanIntensity_byPatient' in s_file:
        s_type = 'BGSubtractedMeanIntensity'
        s_partition = ls_token[-3].split('by', 1)[1]
        s_cell = ls_token[-2].split('in', 1)[1]
    elif f'results_{s_sample}_GatedCellTypes_Leiden_byPatient' in s_file:
        s_type = 'GatedCellTypes_M1M2'
        s_partition = 'gating'
        s_cell = ls_token[-2].split('by', 1)[1]
    elif f'results_{s_sample}_GatedCellTypes_byPatient' in s_file:
        s_type = 'GatedCellTypes'
        s_partition = 'gating'
        s_cell = ls_token[-2].split('by', 1)[1]
    elif f'results_{s_sample}_LeidenClustering_' in s_file:
        s_type = 'LeidenClustering'
        s_partition = ls_token[-3].split('by', 1)[1]
        s_cell = ls_token[-2].split('in', 1)[1]
    else:
        continue
    s_subtype = ls_token[-1]
    ls_file.append(s_file)
    ls_record.append({'subtype':s_subtype,'type':s_type,'partition':s_partition,'cell':s_cell})
    #break
# explicit columns keep the frame well-formed even when no file matched
df_file = pd.DataFrame(ls_record,index=ls_file,columns=['subtype','type','partition','cell'])

In [None]:
# Make codedir the working directory (relative paths in later cells resolve
# against it), then display the table of results files.
os.chdir(codedir)
df_file

In [None]:
#leiden clusters vs survival 30 ER+, 18 TNBC
# Single-variable survival screen. For each results file listed in df_file, and
# separately for TNBC and ER+ patients, every float64 column is tested two ways:
#   1. Kaplan-Meier: patients split at the column median ('low' <= median < 'high')
#      and compared with a log-rank test.
#   2. Cox proportional hazards on the continuous value (penalizer=0.1).
# A figure is written to {codedir}/data/Survival_Plots only when p < 0.05.
#for s_index in df_file[df_file.subtype=='GatedCellTypes'].index: #'2sub'
for s_index in df_file.index:
    #s_index = 'results_20220114_BaselTMA_GatedCellTypes_byPatient_bycelltype_2sub.csv'
    print(s_index)
    # absolute paths (here and in savefig) so the cell does not depend on the
    # current working directory
    df_all = pd.read_csv(f'{codedir}/{s_index}',index_col=0)
    df_all = df_all.loc[:,df_all.dtypes=='float64'].fillna(0)
    # annotate each patient (row index) with clinical subtype and survival data
    df_all['subtype'] = df_all.index.map(dict(zip(df_a.PID,df_a.clinical_type)))
    df_all['subtype'] = df_all.subtype.replace({'HR+HER2-':'ER+', 'TripleNeg':'TNBC'})
    s_type = df_file.loc[s_index,'type']
    s_partition = df_file.loc[s_index,'partition']
    s_cell = df_file.loc[s_index,'cell']
    df_all.loc[:,'durations'] = df_all.index.map(dict(zip(df_a.PID,df_a.OSmonth)))
    df_all.loc[:,'event_observed'] = df_all.index.map(dict(zip(df_a.PID,df_a.Patientstatus.str.contains('death'))))
    df_all.loc[:,'event_observed'] = df_all.event_observed.replace({True:1,False:0})
    for s_subtype in ['TNBC','ER+']:
        # patients of this subtype with complete annotation
        df = df_all[df_all.subtype==s_subtype].dropna()
        for s_col in df.columns.drop(['durations','event_observed','subtype']):
            n_median = df.loc[:,s_col].median()
            if n_median == 0:
                # a zero median gives no usable low/high split; skip the variable
                continue
            b_low = df.loc[:,s_col] <= n_median
            df.loc[b_low,'abundance'] = 'low'
            df.loc[~b_low,'abundance'] = 'high'
            s_title1 = f'{s_subtype} {s_type} {s_partition}'
            s_title2 = f'{s_cell} {s_col}'
            s_fname = f"{s_title1.replace(' ','_')}_{s_title2.replace(' ','_')}.png"
            # --- Kaplan-Meier, high vs low ---
            results = multivariate_logrank_test(event_durations=df.durations, groups=df.abundance, event_observed=df.event_observed)
            # .iloc[0]: explicit positional access to the single p-value
            p_km = results.summary.p.iloc[0]
            if p_km < 0.05:
                print(s_col)
                kmf = KaplanMeierFitter()
                fig, ax = plt.subplots(figsize=(3,3),dpi=300)
                for s_group in ['high','low']:
                    df_abun = df[df.abundance==s_group]
                    durations = df_abun.durations
                    event_observed = df_abun.event_observed
                    kmf.fit(durations, event_observed,label=s_group)
                    kmf.plot(ax=ax,ci_show=False)
                ax.set_title(f'{s_title1}\n{s_title2}\np={p_km:.2}',fontsize=10)
                ax.legend(loc='upper right')
                plt.tight_layout()
                fig.savefig(f"{codedir}/data/Survival_Plots/KM_{s_fname}",dpi=300)
            # --- Cox proportional hazards on the continuous variable ---
            cph = CoxPHFitter(penalizer=0.1)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                try:
                    cph.fit(df.loc[:,[s_col,'durations','event_observed']], duration_col='durations', event_col='event_observed')
                    # cph.summary is indexed by covariate name, so p[0] was a
                    # positional lookup on a label index; use .iloc instead
                    p_cph = cph.summary.p.iloc[0]
                    if p_cph < 0.05:
                        fig, ax = plt.subplots(figsize=(2.5,2),dpi=300)
                        cph.plot(ax=ax)
                        ax.set_title(f'{s_title1}\n{s_title2}\np={p_cph:.2}',fontsize=10)
                        plt.tight_layout()
                        fig.savefig(f"{codedir}/data/Survival_Plots/CPH_{s_fname}",dpi=300)
                except Exception as e:
                    # best effort: report the variable that failed, but let
                    # KeyboardInterrupt/SystemExit through
                    print(f'skipped {s_col} ({e})')

    # NOTE(review): stops after the first file in df_file; remove to screen every file.
    break

In [None]:
# TNBC patients: column '9' against column '6' of the last screened results
# table, coloured by survival duration.
sns.scatterplot(
    x='9', y='6', hue='durations', s=20,
    data=df_all.loc[df_all.subtype=='TNBC'],
)

In [None]:
# ER+ patients: column '2' against column '5' of the last screened results
# table, coloured by survival duration.
sns.scatterplot(
    x='2', y='5', hue='durations', s=20,
    data=df_all.loc[df_all.subtype=='ER+'],
)

In [None]:
#old
# Earlier version of the single-variable survival screen, pinned to one results
# file of the 20220110 run. The original looped over df_file.index but overwrote
# the loop variable with this fixed name, so the same file was analysed once per
# row of df_file; it is now analysed exactly once.
# Alternative files that were tried:
#   'results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype3_TripleNeg.csv'
#   'results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bygatedcelltype3_TripleNeg.csv'
for s_index in ['results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype_TripleNeg.csv']:
    df=pd.read_csv(f'{codedir}/{s_index}',index_col=0)
    df = df.loc[:,df.dtypes=='float64'].fillna(0)
    #s_subtype = df_file.loc[s_index,'subtype'] 
    #s_type = df_file.loc[s_index,'type'] 
    #s_partition = df_file.loc[s_index,'partition'] 
    #s_cell =df_file.loc[s_index,'cell'] 
    # metadata parsed directly from the underscore-separated file name
    ls_token = s_index.split('.csv')[0].split('_')
    s_type = ls_token[-5]
    s_subtype = ls_token[-1]
    s_partition = 'gating'
    s_cell = ls_token[-2].split('by')[1]
    df.loc[:,'durations'] = df.index.map(dict(zip(df_a.PID,df_a.OSmonth)))
    df.loc[:,'event_observed'] = df.index.map(dict(zip(df_a.PID,df_a.Patientstatus.str.contains('death'))))
    df.loc[:,'event_observed'] = df.event_observed.replace({True:1,False:0})
    for s_col in df.columns.drop(['durations','event_observed']):
        #print(s_col)
        n_median = df.loc[:,s_col].median()
        if n_median == 0:
            # a zero median gives no usable low/high split; skip the variable
            continue
        b_low = df.loc[:,s_col] <= n_median
        df.loc[b_low,'abundance'] = 'low'
        df.loc[~b_low,'abundance'] = 'high'
        s_title1 = f'{s_subtype} {s_type} {s_partition}'
        s_title2 = f'{s_cell} {s_col}'
        s_fname = f"{s_title1.replace(' ','_')}_{s_title2.replace(' ','_')}.png"
        # --- Kaplan-Meier, high vs low (plot when p < 0.05) ---
        results = multivariate_logrank_test(event_durations=df.durations, groups=df.abundance, event_observed=df.event_observed)
        p_km = results.summary.p.iloc[0]
        #break
        if p_km < 0.05:
            print(s_col)
            kmf = KaplanMeierFitter()
            fig, ax = plt.subplots(figsize=(3,3),dpi=300)
            for s_group in ['high','low']:
                df_abun = df[df.abundance==s_group]
                durations = df_abun.durations
                event_observed = df_abun.event_observed
                kmf.fit(durations, event_observed,label=s_group)
                kmf.plot(ax=ax,ci_show=False)
            ax.set_title(f'{s_title1}\n{s_title2}\np={p_km:.2}',fontsize=10)
            ax.legend(loc='upper right')
            plt.tight_layout()
            fig.savefig(f"{codedir}/data/Survival_Plots/KM_{s_fname}",dpi=300)
        # --- Cox proportional hazards (plot when p < 0.1 in this older version) ---
        cph = CoxPHFitter(penalizer=0.1)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                cph.fit(df.loc[:,[s_col,'durations','event_observed']], duration_col='durations', event_col='event_observed')
                # cph.summary is indexed by covariate name; take the p-value by position
                p_cph = cph.summary.p.iloc[0]
                if p_cph < 0.1:
                    fig, ax = plt.subplots(figsize=(2.5,2),dpi=300)
                    cph.plot(ax=ax)
                    ax.set_title(f'{s_title1}\n{s_title2}\np={p_cph:.2}',fontsize=10)
                    plt.tight_layout()
                    fig.savefig(f"{codedir}/data/Survival_Plots/CPH_{s_fname}",dpi=300)
            except Exception as e:
                # best effort: report the variable that failed, but let
                # KeyboardInterrupt/SystemExit through
                print(f'skipped {s_col} ({e})')
    #break


In [None]:
# List the results file names collected in df_file.
df_file.index

## Subtyping survival analysis <a name="subt"></a>

- subtypes all


[contents](#contents)

In [None]:
# Save the table of results files as IMC_results_files.csv in codedir.
df_file.to_csv(f'{codedir}/IMC_results_files.csv')

In [None]:
# Results files selected for patient subtyping.
# NOTE: this first list (20211221 / 20220110 runs) is overwritten by the second
# assignment below; it only documents earlier selections.
ls_subt = [#'results_20211221_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype_TripleNeg.csv',
    #'results_20211221_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype3_TripleNeg.csv',
    #'results_20211221_BaselTMA_LeidenClustering_30_29_0.4_byPatient_bycelltype1_inall_TripleNeg.csv',
    #   'results_20211221_BaselTMA_LeidenClustering_30_29_0.4_byPatient_bycelltype2_instromal_TripleNeg.csv', #this is significant
    #   'results_20211221_BaselTMA_LeidenClustering_30_29_0.4_byPatient_bycelltype2_inepithelial_TripleNeg.csv'
 #'results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype3_TripleNeg.csv',
    #'results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bycelltype_TripleNeg.csv' #not good
    'results_20220110_BaselTMA_GatedCellTypes_Leiden_byPatient_bygatedcelltype3_TripleNeg.csv' #immune 6 neigh, 0.2
]

# Active selection (20220114 run): only the uncommented entry is in the list.
ls_subt = [#'results_20220114_BaselTMA_LeidenClustering_30_21_0.6_byPatient_byleidencelltype2_instromal_2sub.csv',
       'results_20220114_BaselTMA_GatedCellTypes_byPatient_byleidencelltype3_2sub.csv',
       #'results_20220114_BaselTMA_GatedCellTypes_byPatient_bygatedcelltype3_2sub.csv',
       #'results_20220114_BaselTMA_LeidenClustering_30_21_0.6_byPatient_bycelltype1_inall_2sub.csv',
       #'results_20220114_BaselTMA_GatedCellTypes_byPatient_bycelltype_2sub.csv',
       #'results_20220114_BaselTMA_GatedCellTypes_byPatient_bygatedcelltype5_2sub.csv',
       #'results_20220114_BaselTMA_LeidenClustering_30_21_0.6_byPatient_byleidencelltype2_inepithelial_2sub.csv'
]

In [None]:
# Load the first selected results file (NaN -> 0) together with its metadata
# row from df_file; the break means only the first entry of ls_subt is used.
for s_index in ls_subt:
    df = pd.read_csv(s_index,index_col=0).fillna(0)
    s_subtype, s_type, s_partition, s_cell = df_file.loc[s_index,['subtype','type','partition','cell']]
    break

In [None]:
# For Leiden-clustering results, replace the numeric cluster column labels
# with descriptive names.
if s_type == 'LeidenClustering':
    d_celltypes = {
        '19': 'Luminal hi t.',
        '9': 'Myoepithelial',
        '9b': 'Basal t.',
        '15': 'PgR+ t.',
        '7': 'Luminal ER+ t.',
        '11': 'Ecad hi t.',
        '6': 'Luminal t.',
        '23': 'EGFR hi t.',
        '10': 'Proliferating t.',
        '21': 'Vim+ t.',
        '16': 'CD44+ t.',
        '2': 'CD3 T cell',
        '17': 'CD20 B cell',
        '3': 'FN++ str.',
        '5': 'Large t.',
        '0': 'NOS str.',
        '20': 'Ecad- t.',
        '8': 'Vim++ str.',
        '1': 'poorly diff t.',
        '12': 'CD44+ str.',
        '4': 'Sm. NOS str.',
        '14': 'Endothelial',
        '18': 'Macrophage',
    }
    df = df.rename(columns=d_celltypes)

In [None]:
# Patient-level UMAP on the columns of the selected results table.
# Candidate columns: float64 columns whose names contain neither 'DAPI' nor 'R5Q' ...
ls_col = df.columns[(~df.columns.str.contains('DAPI')) & (~df.columns.str.contains('R5Q')) & (df.dtypes=='float64')].tolist()
# ... reduced to the 11 columns with the largest column sums.
ls_col = df.loc[:,ls_col].sum().sort_values(ascending=False)[0:11].index.tolist()
# NOTE(review): both selections above are overridden here, so every column of df
# is used. ls_col stays a pandas Index; later cells call ls_col.tolist().
ls_col = df.columns
#umap
for n_neighbors in [6]: 
    # cached AnnData file for this configuration (path relative to codedir)
    results_file = f'{s_date}/results_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}neighbors_{len(ls_col)}markers.h5ad'
    if not os.path.exists(f'{codedir}/{results_file}'):
        print('making adata')
        adata = sc.AnnData(df.loc[:,ls_col].fillna(0))
        adata.raw = adata
        #reduce dimensionality
        # NOTE(review): PCA is computed before the scaling step below - confirm intended order
        sc.tl.pca(adata, svd_solver='auto')
        print('scaling')
        sc.pp.scale(adata, zero_center=False, max_value=20)
        print('calc umap')
        # calculate neighbors 
        sc.pp.neighbors(adata, n_neighbors=n_neighbors) 
        sc.tl.umap(adata)
        adata.write(f'{codedir}/{results_file}')
    else:
        print('loading umap')
        adata = sc.read_h5ad(f'{codedir}/{results_file}')
    #color by markers
    figname = f"Umap_markers_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}neigh.png"
    # title is computed but not passed to the plotting call
    title=figname.split('.png')[0].replace('_',' ')
    sc.pl.umap(adata, color=ls_col,vmin='p1.5',vmax='p99.5',ncols=4,save=figname,size=250)
    #subtype
    # obs index holds patient IDs as strings, hence PID.astype('str') for the lookup
    adata.obs['Subtype'] = adata.obs.index.map(dict(zip(df_a.PID.astype('str'),df_a.clinical_type)))
    figname = f"Umap_Subtype_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}neigh.png"
    title=figname.split('.png')[0].replace('_',' ')
    sc.pl.umap(adata, color='Subtype',vmin='p1.5',vmax='p99.5',ncols=4,save=figname,size=250)
    break

In [None]:
#leiden
# Leiden clustering of the patient-level AnnData at the given resolution,
# followed by a UMAP coloured by cluster (saved by scanpy under figname).
for resolution in [0.05]: #0.3,
    # results_file is assigned but not written to in this cell
    results_file = f'{s_date}/results_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}neighbors_{len(ls_col)}markers_leiden{resolution}.h5ad'
    figname=f'leiden_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}_{resolution}.png'
    sc.tl.leiden(adata,resolution=resolution)
    fig,ax = plt.subplots(figsize=(2.5,2),dpi=200)
    sc.pl.umap(
        adata,
        color='leiden',
        ax=ax,
        # file name without the extension, underscores shown as spaces
        title=figname.partition('.png')[0].replace('_',' '),
        wspace=.25,
        save=figname,
        size=40,
    )
    break

In [None]:
# Survival by patient Leiden cluster: Kaplan-Meier curves per cluster with a
# multivariate log-rank test; when p < 0.05 the KM figure, two clustermaps and a
# patient-count barplot are saved.
# NOTE: the loop variable replaces the s_subtype value read from df_file earlier.
for s_subtype in ['TNBC','ER+']:
    df_p = adata.to_df()
    df_p['Subtype'] = df_p.index.map(dict(zip(df_a.PID.astype('str'),df_a.clinical_type)))
    df_p['Subtype'] =df_p.Subtype.replace({'HR+HER2-':'ER+','TripleNeg':'TNBC'})
    df_p['leiden'] = adata.obs['leiden']
    # .copy(): the columns added below go into an independent frame rather than
    # a slice of the full table (avoids SettingWithCopyWarning)
    df_p = df_p.loc[df_p.Subtype==s_subtype].copy()
    df_p.loc[:,'Survival_time'] = df_p.index.map(dict(zip(df_a.PID.astype('str'),df_a.OSmonth)))
    df_p.loc[:,'Survival'] = df_p.index.map(dict(zip(df_a.PID.astype('str'),df_a.Patientstatus.str.contains('death'))))
    df_p.loc[:,'Survival'] = df_p.Survival.replace({True:1,False:0})
    df_st = df_p.drop('Subtype',axis=1)
    T = df_st['Survival_time']     ## time to event
    E = df_st['Survival']      ## event occurred or censored
    groups = df_st.loc[:,'leiden']
    kmf1 = KaplanMeierFitter() ## instantiate the class to create an object
    fig, ax = plt.subplots(figsize=(3,3),dpi=200)
    for s_group in sorted(groups.unique()):
        i1 = (groups == s_group)
        kmf1.fit(T[i1], E[i1], label=s_group)    ## fit the data
        kmf1.plot(ax=ax,ci_show=False)
        #print(kmf1.median_survival_time_)
    results = multivariate_logrank_test(event_durations=T, groups=groups, event_observed=E)
    # explicit positional access to the single p-value of the test summary
    ax.set_title(f'{s_subtype} {s_cell} res={resolution} \n neigh={n_neighbors} p={results.summary.p.iloc[0]:.2}')
    ax.legend(loc='upper right')
    plt.tight_layout()
    if results.summary.p.iloc[0] < 0.05:
        fig.savefig(f'{codedir}/data/Survival_Plots/KM_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}_{resolution}.png',dpi=200)
        #more plots
        # patient x feature clustermap with one row colour per Leiden cluster
        d_a = dict(zip(adata.obs.index,adata.obs.leiden))
        d_color = dict(zip(sorted(adata.obs.leiden.unique()),sns.color_palette()[0:len(adata.obs.leiden.unique())]))
        row_colors = df_p.index.map(d_a).map(d_color)
        g = sns.clustermap(df_p.loc[:,ls_col].dropna(),figsize=(5,5),cmap='viridis',
                row_colors=row_colors,method='ward')
        # zero-size bars exist only to create legend entries for the row colours
        for label in d_color.keys():
            g.ax_row_dendrogram.bar(0, 0, color=d_color[label],
                                label=label, linewidth=0)
        g.ax_row_dendrogram.legend(loc="right", ncol=1)
        g.savefig(f'{codedir}/{s_date}/clustermap_patients_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png',dpi=200)
        #subtypes
        # mean feature value per Leiden cluster, z-scored in the clustermap
        d_replace = {}
        df_plot = df_p.loc[:,ls_col.tolist()+['leiden']].dropna().groupby('leiden').mean()
        df_plot.index.name = f'leiden {resolution}'
        g = sns.clustermap(df_plot.dropna().T,z_score=1,figsize=(3,5),cmap='viridis',
                           vmin=-2,vmax=2,method='ward')
        g.fig.suptitle(f'leiden {resolution}',x=.9)
        g.savefig(f'{codedir}/{s_date}/clustermap_subtypes_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png',dpi=200)
        # cluster labels in the column order of the clustermap dendrogram
        marker_genes = df_plot.dropna().T.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()
        categories_order = df_plot.dropna().T.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()
        #barplot
        # patients per cluster, ordered like the clustermap columns
        fig,ax=plt.subplots(figsize=(2.5,3),dpi=200)
        df_p.groupby('leiden').count().loc[marker_genes[::-1]].iloc[:,1].plot(kind='barh',title='Patient Count',ax=ax)
        plt.tight_layout()
        # saved under codedir like the other figures (was relative to the working directory)
        fig.savefig(f'{codedir}/{s_date}/barplot_subtyping_{s_sample}_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png')
    # NOTE(review): this break ends the loop after 'TNBC' and makes the CoxPH
    # section below unreachable; remove it to run CoxPH (the final break still
    # limits the loop to the first subtype).
    break
    #CPH
    df_dummy = pd.get_dummies(df_st.loc[:,['Survival_time','Survival','leiden']])
    df_dummy = df_dummy.loc[:,df_dummy.sum() != 0]
    cph = CoxPHFitter(penalizer=0.1)  ## Instantiate the class to create a cph object
    cph.fit(df_dummy, 'Survival_time', event_col='Survival')
    fig, ax = plt.subplots(figsize=(3,3),dpi=300)
    cph.plot(ax=ax)
    ax.set_title(f'CPH: {s_subtype} {s_type} {s_cell}')
    plt.tight_layout()
    fig.savefig(f'{codedir}/data/Survival_Plots/CoxPH_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}_{resolution}.png',dpi=200)
    break

In [None]:
        # NOTE(review): this cell repeats, statement for statement, the body of the
        # `if results.summary.p[0] < 0.05:` branch of the loop cell above. Every line
        # is indented although the cell contains no enclosing block, so it looks like
        # it cannot run as a standalone cell - dedent it (to force these plots
        # regardless of the p-value) or delete it.
        fig.savefig(f'{codedir}/data/Survival_Plots/KM_{s_subtype}_{s_type}_{s_partition}_{s_cell}_{n_neighbors}_{resolution}.png',dpi=200)
        #more plots
        # patient x feature clustermap with one row colour per Leiden cluster
        d_a = dict(zip(adata.obs.index,adata.obs.leiden))
        d_color = dict(zip(sorted(adata.obs.leiden.unique()),sns.color_palette()[0:len(adata.obs.leiden.unique())]))
        row_colors = df_p.index.map(d_a).map(d_color)
        g = sns.clustermap(df_p.loc[:,ls_col].dropna(),figsize=(5,5),cmap='viridis',
                row_colors=row_colors,method='ward')
        # zero-size bars exist only to create legend entries for the row colours
        for label in d_color.keys():
            g.ax_row_dendrogram.bar(0, 0, color=d_color[label],
                                label=label, linewidth=0)
        g.ax_row_dendrogram.legend(loc="right", ncol=1)
        g.savefig(f'{codedir}/{s_date}/clustermap_patients_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png',dpi=200)
        #subtypes
        # mean feature value per Leiden cluster, z-scored in the clustermap
        d_replace = {}
        df_plot = df_p.loc[:,ls_col.tolist()+['leiden']].dropna().groupby('leiden').mean()
        df_plot.index.name = f'leiden {resolution}'
        g = sns.clustermap(df_plot.dropna().T,z_score=1,figsize=(3,5),cmap='viridis',
                           vmin=-2,vmax=2,method='ward')
        g.fig.suptitle(f'leiden {resolution}',x=.9) 
        g.savefig(f'{codedir}/{s_date}/clustermap_subtypes_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png',dpi=200)
        # cluster labels in the column order of the clustermap dendrogram
        marker_genes = df_plot.dropna().T.iloc[:,g.dendrogram_col.reordered_ind].columns.tolist()
        categories_order = df_plot.dropna().T.iloc[g.dendrogram_row.reordered_ind,:].index.tolist()
        #barplot
        # patients per cluster, ordered like the clustermap columns
        fig,ax=plt.subplots(figsize=(2.5,3),dpi=200)
        df_p.groupby('leiden').count().loc[marker_genes[::-1]].iloc[:,1].plot(kind='barh',title='Patient Count',ax=ax)
        plt.tight_layout()
        # path is relative to the current working directory
        fig.savefig(f'./{s_date}/barplot_subtyping_{s_sample}_{s_sample}_{s_type}_{s_partition}_{s_cell}_{s_type}_{n_neighbors}_{resolution}.png')
   

## compare patients

epithelial rich, immune rich, stromal rich

In [None]:
from scipy import stats

In [None]:
# Table whose index is matched against patient IDs and whose 'leiden' column
# becomes the patient 'Group' in a later cell. Path is relative to the working directory.
df_celltype = pd.read_csv('results_Patient_celltype_cluster_new.csv',index_col=0)

In [None]:
# Replace df_lei with the Leiden clustering table of the 20211221 TripleNeg run
# (per the file name: 30 neighbors, resolution 0.4, 19 markers).
df_lei = pd.read_csv('./data/20211221_BaselTMA_LeidenClustering_neighbors30_resolution0.4_markers19_TripleNeg.csv',index_col=0)

In [None]:
# Display the loaded table.
df_lei

In [None]:
# Keep only cells from cores annotated as tumor in the patient metadata.
df_lei['type'] = df_lei.core.map(dict(zip(df_a.core,df_a.diseasestatus)))
# .copy() returns an independent frame, so columns added to df_lei in later
# cells do not write into a slice of the old frame (SettingWithCopyWarning).
df_lei = df_lei[df_lei.type=='tumor'].copy()

In [None]:
# Attach the patient ID of each core, the patient's cluster label ('Group', as
# integer) and a two-class compartment label where 'immune' counts as 'stromal'.
df_lei['Patient'] = df_lei['core'].map(dict(zip(df_a['core'],df_a['PID'])))
df_lei['Group'] = df_lei['Patient'].map(dict(zip(df_celltype.index,df_celltype['leiden'])))
df_lei['Group'] = df_lei['Group'].astype(int)
df_lei['celltype2'] = df_lei['celltype3'].replace({'immune':'stromal'})

In [None]:
# Names of all float64 columns of df_lei; these are the columns compared between groups.
ls_marker = df_lei.select_dtypes(include='float64').columns.tolist()

In [None]:
#boxplots
# For each cell compartment (values of the s_cell column), average every marker
# per patient and compare the patient means between patient groups:
#   2 groups  -> Mann-Whitney U test
#   3+ groups -> Kruskal-Wallis test (any number of groups)
#   otherwise -> no test, p is set to 1
# Every figure is saved; only figures with p <= 0.05 stay open for inline display.
s_cell = 'celltype2'
for s_group in ['Group']:
    for s_celltype in sorted(set(df_lei.loc[:,s_cell])):
        df_cell = df_lei[df_lei.loc[:,s_cell] == s_celltype]
        # one row per (group, patient): mean marker value over that patient's cells
        df_mean = df_cell.loc[:,ls_marker + [s_group,'Patient']].groupby([s_group,'Patient']).mean()
        for s_marker in ls_marker:
            df_marker = df_mean.loc[:,s_marker].reset_index()
            #test
            # one array of patient means per group
            lls_result = [
                df_marker.loc[df_marker.loc[:,s_group] == s_test,s_marker].values
                for s_test in df_marker.loc[:,s_group].unique()
            ]
            try:
                if len(lls_result) == 2:
                    statistic, pvalue = stats.mannwhitneyu(lls_result[0],lls_result[1])
                elif len(lls_result) > 2:
                    statistic, pvalue = stats.kruskal(*lls_result)
                else:
                    pvalue = 1
            except Exception as e:
                # best effort: skip markers the test cannot handle, but report
                # them and let KeyboardInterrupt/SystemExit through
                pvalue = 1
                print(f'skipped {s_marker} in {s_celltype}: {e}')
                continue
            #plot
            fig,ax=plt.subplots(figsize=(2.8,2.2),dpi=200)
            sns.boxplot(data=df_marker,x=s_group,y=s_marker,ax=ax,fliersize=0)
            sns.stripplot(data=df_marker,x=s_group,y=s_marker,ax=ax,palette='dark',size=3)
            ax.set_ylabel(f'{s_marker} in {s_celltype}')
            ax.set_title(f'{s_marker}\n (p = {pvalue:.3f})',fontsize=12,horizontalalignment='center')
            labels = [item.get_text() for item in ax.get_xticklabels()]
            if len(lls_result)==3:
                # strip a '_rich' suffix from the tick labels and move it to the axis label
                ax.set_xticklabels([item.replace('_rich','') for item in labels])
                ax.set_xlabel('(-rich)')
            #ax.set_ylim(0,3)
            plt.tight_layout()
            # saved once for every marker (the original saved significant figures twice)
            fig.savefig(f'{codedir}/{s_date}/boxplot_IMC_{s_cell}_{s_celltype}_{s_group}_{s_marker}.png')
            #break
            # `<=` so that a NaN p-value counts as not significant
            if pvalue <= 0.05:
                print(f'{s_marker} in {s_celltype} by {s_group}')
                #plt.close(fig)
            else:
                plt.close(fig)