In [None]:
#load libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import copy
import seaborn as sns
import importlib
import scipy

import scanpy as sc
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale, minmax_scale
from sklearn.metrics import silhouette_score
import matplotlib as mpl
# Global matplotlib defaults: never warn about many open figures, and use Arial
# for regular text as well as for mathtext (roman and italic).
mpl.rcParams.update({
    'figure.max_open_warning': 0,
    'mathtext.fontset': 'custom',
    'mathtext.it': 'Arial:italic',
    'mathtext.rm': 'Arial',
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
    'font.serif': 'Arial',
})
#mpl.font_manager._rebuild()
# Remember the directory the notebook was started from; later cells build paths from it.
codedir = os.getcwd()
#load cmif libraries
#os.chdir('/home/groups/graylab_share/OMERO.rdsStore/engje/Data/cmIF')
from mplex_image import visualize as viz, process, preprocess, normalize

In [None]:
# Go back to the directory recorded in `codedir` when the notebook started.
os.chdir(codedir)

In [None]:
# Seed NumPy's global random generator so steps that draw from it are repeatable.
np.random.seed(222)

# Table of contents <a name="contents"></a>
1. [Load Data](#load)
2. [Normalize](#norm)
3. [Visualize Normalization](#normviz)
4. [Leiden for cell typing](#clusterlei)
5. [Cluster K means](#cluster)
6. [Leiden cluster](#clust1)


In [None]:
#load data
# Work inside <codedir>/paper_data: the CSV reads and the output folder in the
# following cells use paths relative to the current directory.
os.chdir(f'{codedir}/paper_data')

In [None]:
# Date-stamped output folder (relative to the current directory) that receives the figures.
s_date = '20210402'
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(s_date, exist_ok=True)

# Load Data <a name="load"></a>

2. As Ki67 is not a continuous antigen, can you count positive cells (Proliferative cluster) by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx?

3.	Could you map cells by distance (<25, 25-50, 50-75, >75) from collagen I in each Bx? If you can add a distance column (1-4) in the cluster csv, I can make it in Qi.

4.	Could you try to see the correlation between ER/PCNA and (VIM+aSMA+CD31)? – not necessary to show significance. (see attached image from Bx1 Scene-003)

[contents](#contents)

### not normalized

In [None]:
# Per-cell filtered mean intensities; the first CSV column becomes the row index.
df_mi = pd.read_csv('20210324_SMTBx1-4_JE-TMA-43_60_62_FilteredMeanIntensity.csv',index_col=0)
# Derive grouping labels from each row label:
#   slide       = text before the first '_'
#   slide_scene = text before the first '_cell'
ls_cell_id = df_mi.index.tolist()
df_mi['slide'] = [s_cell_id.partition('_')[0] for s_cell_id in ls_cell_id]
df_mi['slide_scene'] = [s_cell_id.partition('_cell')[0] for s_cell_id in ls_cell_id]

In [None]:
# Show which mask-distance feature tables are present in the current directory.
for s_file in os.listdir():
    if 'MaskDistances' in s_file:
        print(s_file)
# Read one table per sample and stack them in a single concat call.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
ls_df_mask = []
for s_sample in ['SMT101Bx1-16','SMTBx2-5','SMTBx3','SMTBx4-3','HTA-33']: #'SMT101Bx4-3',
    ls_df_mask.append(pd.read_csv(f'features_{s_sample}_MaskDistances.csv',index_col=0))
df_mask = pd.concat(ls_df_mask)

In [None]:
df_mask.columns
# Distance columns taken from df_mask when it is merged with the intensity table.
ls_target = [
    'Vim_dist',
    'CD31_dist',
    'PDPN_dist',
    'aSMA_dist',
    'CD68_dist',
    'ColI_dist',
    'ColIV_dist',
]
# Nuclear marker intensity columns that are batch-corrected and plotted against distance.
ls_marker = [
    'ER_nuclei',
    'Ki67_nuclei',
    'PCNA_nuclei',
]
# slide_scene values removed from the merged table.
ls_drop = [
    'HTA-33_scene001',
    'SMTBx1-16_scene001',
    #'SMT101Bx4-3_scene001','SMT101Bx4-3_scene002'
]

In [None]:
# Join intensities with the selected distance columns on the shared row index.
df = df_mi.merge(df_mask[ls_target], left_index=True, right_index=True)
# Keep rows that have a Vim distance and whose scene is not in the drop list.
b_keep = df['Vim_dist'].notna() & ~df['slide_scene'].isin(ls_drop)
df = df[b_keep]
# Rescale every distance column by 0.325 (presumably pixels -> micrometres; confirm the pixel size).
df.loc[:, ls_target] = df.loc[:, ls_target] * .325

In [None]:
# Batch-correct the marker intensities with normalize.combat, using slide as the batch.
# The input is transposed so rows are markers and columns are the rows of df;
# the result is transposed back so rows line up with df again.
data = df[ls_marker].transpose()
batch = df['slide']
bayesdata = normalize.combat(data, batch)
df_norm = bayesdata.transpose()

In [None]:
# Attach the slide label to the normalize.combat output and summarise each marker per slide.
# The mean used to be computed on its own line and discarded (only the last
# expression of a cell is displayed), so mean and std are now shown together.
df_norm['slide'] = df.slide
df_norm.groupby('slide').agg(['mean', 'std'])

In [None]:
# Combined distance: the smallest of the Vim, CD31 and aSMA distances for each row.
df['Vim-CD31-aSMA_dist'] = df.loc[:,['Vim_dist','CD31_dist','aSMA_dist']].min(axis=1)
# Register the new column once; the guard keeps re-running this cell from adding duplicates.
if 'Vim-CD31-aSMA_dist' not in ls_target:
    ls_target = ls_target + ['Vim-CD31-aSMA_dist']

In [None]:
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
%matplotlib inline
#by tissue no Bx1
sns.set(style='white')
import matplotlib.ticker as tic
import warnings
warnings.filterwarnings('ignore')
tot = 0
ls_dist = [25, 50, 75]
i_diff = 25
ls_slide = ['SMTBx2-5', 'SMTBx3','SMT1Bx4-3'] #'
d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}
for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:
    print(s_target)
    fig, ax = plt.subplots(3,2, figsize=(4.5,4),sharex=True,dpi=300)
    for idxc, s_slide in enumerate(ls_slide):
        print(s_slide)
        df_slide = df[df.slide==s_slide]
        for idx, s_marker in enumerate(['ER_nuclei', 'PCNA_nuclei']): #,'Ki67_nuclei']):
            print(s_marker)
            df_result = pd.DataFrame(index=df_slide.index)
            for s_dist in ls_dist:
                b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)
                df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]
            for  s_col in df_result.columns:
                sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx],
                            label=f"< {s_col.split('_')[2]}"#,fill=True, alpha=0.3
                           )
            if df_result.mean().fillna(0)[2] == 0:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())
                print(len(df_result.iloc[:,0].dropna()))
                print(len(df_result.iloc[:,1].dropna()))
            else:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())
                print(len(df_result.iloc[:,0].dropna()))
                print(len(df_result.iloc[:,1].dropna()))
                print('over75')
                print(len(df_result.iloc[:,2].dropna()))
            ax[idxc,idx].set_xlabel(f"{s_col.split('_')[0]} Intensity",fontname="Arial",fontsize=18)
            ax[idxc,idx].set_ylabel(f"")
            ax[idxc,idx].set_title(f"")
            temp = tic.MaxNLocator(3)
            ax[idxc,idx].set_yticklabels(())
            ax[idxc,idx].xaxis.set_major_locator(temp)
            tot+=1
            if pvalue < 0.001: # 0.05/30: #bonferoni correction
                ax[idxc,idx].text(0.42, 0.87, '*',
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax[idxc,idx].transAxes)
            ax[idxc,idx].set_xlim(-1000,5500)
            ax[idxc,idx].spines['right'].set_visible(False)
            ax[idxc,idx].spines['left'].set_visible(False)
            ax[idxc,idx].spines['top'].set_visible(False)
            #print(ax[idxc,idx].get_xticklabels())
            #ax[idxc,idx].set_xticklabels(ax[idxc,idx].get_xticklabels(),{'fontsize':16})
        ax[idxc,0].set_ylabel(f"{d_slide[s_slide]}",fontname="Arial",fontsize=18)
    ax[2,1].legend(title='$\mu$m',borderpad=.3,labelspacing=.3,loc=4,fontsize=14)
    plt.subplots_adjust(wspace=.001,hspace=.001)
    plt.suptitle(f"Distance to {s_target.split('_')[0]}",y=.93,fontname="Arial",fontsize=24)
    plt.tight_layout()
    fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.png',dpi=300)
    #fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1.pdf',dpi=200)
    #break

In [None]:
 # Scratch check: 0.05 divided by 30, the Bonferroni-style threshold mentioned next to the p-value cutoff in the plotting cells.
 0.05/30

In [None]:
# Display the panel counter, which the plotting loop increments once per panel.
tot

In [None]:
# Larger-layout version of the no-Bx1 ER / PCNA figure: GridSpec panels with the
# legend outside the axes. One figure per target, rows are slides, columns are markers.
from matplotlib import gridspec
import matplotlib.ticker as tic
# Defined here too, so the cell no longer relies on an earlier plotting cell having run.
tot = 0
ls_dist = [25, 50, 75]   # upper edge of each distance bin
i_diff = 25              # bin width
ax_objs = []
# Was 'SMT1Bx4-3': that key is missing from d_slide (KeyError on the row label)
# and does not match the loaded 'SMTBx4-3' mask table.
ls_slide = ['SMTBx2-5', 'SMTBx3','SMTBx4-3']
d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}
for s_target in ['ColI_dist', 'ColIV_dist','Vim-CD31-aSMA_dist']:
    fig = plt.figure(figsize=(5.5,3.5),dpi=300)
    gs = gridspec.GridSpec(nrows=len(ls_slide),  ncols=2,figure=fig,
                       wspace=0.1, hspace=0.05,left=0.1, right=.75
                      )
    for idxc, s_slide in enumerate(ls_slide):
        df_slide = df[df.slide==s_slide]
        se_dist = df_slide.loc[:,s_target]
        b_last_row = idxc == len(ls_slide) - 1
        for idx, s_marker in enumerate(['ER_nuclei', 'PCNA_nuclei']):
            ax_panel = fig.add_subplot(gs[idxc,idx])
            ax_objs.append(ax_panel)
            ls_group = []
            for i_dist in ls_dist:
                # marker intensities of the cells whose distance falls in [i_dist - i_diff, i_dist)
                b_bin = (se_dist < i_dist) & (se_dist >= i_dist - i_diff)
                se_group = df_slide.loc[b_bin,s_marker].dropna()
                sns.kdeplot(se_group, ax=ax_panel,
                            label=f"< {i_dist}"#,fill=True,alpha=0.5
                           )
                if len(se_group) > 0:
                    ls_group.append(se_group)
            # ANOVA over the non-empty bins only; with fewer than two there is nothing to compare.
            if len(ls_group) >= 2:
                statistic, pvalue = scipy.stats.f_oneway(*ls_group)
            else:
                statistic, pvalue = float('nan'), float('nan')
            ax_panel.set_ylabel("")
            ax_panel.set_title("")
            ax_panel.set_yticklabels(())
            ax_panel.xaxis.set_major_locator(tic.MaxNLocator(3))
            tot+=1
            if pvalue < 0.001: # 0.05/30: #bonferoni correction
                ax_panel.text(0.55, 0.65, '*',
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax_panel.transAxes)
            ax_panel.set_xlim(-1000,5500)
            for s_side in ('right', 'left', 'top'):
                ax_panel.spines[s_side].set_visible(False)
            #ax_panel.spines['bottom'].set_visible(False)
            ax_panel.set_xlabel('')
            # transparent panel background
            ax_panel.patch.set_alpha(0)
            if idx == 0:
                ax_panel.set_ylabel(f"{d_slide[s_slide]}",fontsize=18)
            if idx == 1 and b_last_row:
                # raw string: '\m' is not a valid Python escape sequence
                ax_panel.legend(title=r'$\mu$m',borderpad=.3,labelspacing=.3,fontsize=12,loc='upper left', bbox_to_anchor=(1.05, 1.5))
            # only the bottom row carries x tick labels and the axis title
            if b_last_row:
                ax_panel.set_xlabel(f"{s_marker.split('_')[0]} Intensity",fontsize=18)
            else:
                ax_panel.set_xticklabels([])
    plt.suptitle(f"Distance to {s_target.split('_')[0]}",x=.45,y=.95,fontsize=20)
    gs.update(bottom = 0.2)
    fig.savefig(f'./{s_date}/IntensityvsDistance_{i_diff}s_{s_target}_by_slide_noBx1_bigger.png',dpi=200)
    #break

In [None]:
#by tissue w bx1
%matplotlib inline
sns.set(style='white')
import matplotlib.ticker as tic
import warnings
warnings.filterwarnings('ignore')
tot = 0
ls_dist = [25, 50, 75]
i_diff = 25
ls_slide = ['SMTBx1-16','SMTBx2-5', 'SMTBx3','SMT1Bx4-3'] #'
d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}
for s_target in  ls_target + ['Vim-CD31-aSMA_dist']: #['CD68_dist','ColI_dist', 'ColIV_dist']:
    fig, ax = plt.subplots(4,3, figsize=(7,5),sharex=True,dpi=300)
    for idxc, s_slide in enumerate(ls_slide):
        df_slide = df[df.slide==s_slide]
        for idx, s_marker in enumerate(ls_marker):
            df_result = pd.DataFrame(index=df_slide.index)
            for s_dist in ls_dist:
                b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)
                df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]
            for  s_col in df_result.columns:
                sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f"< {s_col.split('_')[2]}")
            if df_result.mean().fillna(0)[2] == 0:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())
                #print(pvalue)
            else:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())
            ax[idxc,idx].set_xlabel(f"{s_col.split('_')[0]} Intensity",fontsize=18)
            ax[idxc,idx].set_ylabel(f"")
            ax[idxc,idx].set_title(f"")
            temp = tic.MaxNLocator(3)
            ax[idxc,idx].set_yticklabels(())
            ax[idxc,idx].xaxis.set_major_locator(temp)
            tot+=1
            if pvalue < 0.001: # 0.05/30: #bonferoni correction
                ax[idxc,idx].text(0.5, 0.8, '*',
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax[idxc,idx].transAxes)
            ax[idxc,idx].set_xlim(-1500,7000)
            ax[idxc,idx].spines['right'].set_visible(False)
            ax[idxc,idx].spines['left'].set_visible(False)
            ax[idxc,idx].spines['top'].set_visible(False)
        ax[idxc,0].set_ylabel(f"{d_slide[s_slide]}",fontsize=18)
    ax[0,2].legend(title='$\mu$m')
    plt.subplots_adjust(wspace=.001,hspace=.001)
    plt.suptitle(f"Distance to {s_target.split('_')[0]}",fontsize=20)
    plt.tight_layout()
    fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=300)
    #break

In [None]:
#by tissue w bx1
%matplotlib inline
sns.set(style='white')
import matplotlib.ticker as tic
import warnings
warnings.filterwarnings('ignore')
tot = 0
ls_dist = [25, 50, 75]
i_diff = 25
ls_slide = ['SMTBx2-5', 'SMTBx3','SMT1Bx4-3'] #'SMTBx1-16',
d_slide = {'SMTBx1-16':'Bx1', 'SMTBx2-5':'Bx2', 'SMTBx3':'Bx3','HTA-33':'Bx4-HTAN','SMTBx4-3':'Bx4'}
for s_target in ['ColI_dist', 'ColIV_dist']:
    fig, ax = plt.subplots(3,3, figsize=(7,4),sharex=True)
    for idxc, s_slide in enumerate(ls_slide):
        df_slide = df[df.slide==s_slide]
        for idx, s_marker in enumerate(ls_marker):
            df_result = pd.DataFrame(index=df_slide.index)
            for s_dist in ls_dist:
                b_bool = (df_slide.loc[:,s_target] < s_dist) & (df_slide.loc[:,s_target] >= s_dist - i_diff)
                df_result.loc[b_bool,f'{s_marker}_{s_dist}'] = df_slide.loc[b_bool,s_marker]
            for  s_col in df_result.columns:
                sns.kdeplot(df_result.loc[:,s_col].dropna(), ax=ax[idxc,idx], label=f"< {s_col.split('_')[2]}")
            if df_result.mean().fillna(0)[2] == 0:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna())
                #print(pvalue)
            else:
                statistic, pvalue = scipy.stats.f_oneway(df_result.iloc[:,0].dropna(),df_result.iloc[:,1].dropna(),df_result.iloc[:,2].dropna())
            ax[idxc,idx].set_xlabel(f"{s_col.split('_')[0]} Intensity")
            ax[idxc,idx].set_ylabel(f"")
            ax[idxc,idx].set_title(f"")
            temp = tic.MaxNLocator(3)
            ax[idxc,idx].set_yticklabels(())
            ax[idxc,idx].xaxis.set_major_locator(temp)
            tot+=1
            if pvalue < 0.001: # 0.05/30: #bonferoni correction
                ax[idxc,idx].text(0.5, 0.8, '*',
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax[idxc,idx].transAxes)
            ax[idxc,idx].set_xlim(-1500,7000)
            ax[idxc,idx].spines['right'].set_visible(False)
            ax[idxc,idx].spines['left'].set_visible(False)
            ax[idxc,idx].spines['top'].set_visible(False)
        ax[idxc,0].set_ylabel(f"{d_slide[s_slide]}")
    ax[0,2].legend(title='$\mu$m')
    plt.subplots_adjust(wspace=.001,hspace=.001)
    plt.suptitle(f"Distance to {s_target.split('_')[0]}")
    plt.tight_layout()
    fig.savefig(f'./{s_date}/IntensityvsDistance_25s_{s_target}_by_slide.png',dpi=200)
    #break