# Tumor Landscape Analysis

Post-processing Module
---

## Barrett's Esophagus Project

Post-processing of data

Collaboration with **Justin Law** (<justin.law@icr.ac.uk>) and **Joel Brown** (<Joel.Brown@moffitt.org>)

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os, sys
from IPython.display import clear_output

def update_progress(progress, msg):
    """Redraw a one-line text progress bar in the cell output.

    Args:
        progress: completed fraction. Integers are converted to float; any
            value that is neither int nor float is displayed as 0, and the
            fraction is clamped to the [0, 1] interval.
        msg: text printed after the bar.
    """
    bar_length = 25

    # normalize the input to a fraction between 0 and 1
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float) or progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    # number of filled cells in the bar
    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wipe the previous bar only once the new output is ready
    clear_output(wait = True)
    print("Progress : [{0}] {1:.1f}%".format(bar, progress * 100) + "  <<< " + msg)

# The study root is taken as the parent of the current working directory
main_pth = os.path.dirname(os.getcwd())
# Data folder of this project; the trailing slash is needed because later
# cells build file names by plain string concatenation
dat_pth = main_pth + '/data/BE/'

# label of the physical units, and pixel scale (units/pixel)
units, scale = '[um]', 0.45


print('working directory is: ' + main_pth)

working directory is: /Volumes/GoogleDrive/My Drive/EcoEvo of Cancer/studies/BarrettsEsophagus


In [2]:
# Load the class table and keep only the rows whose 'drop' flag is False.
# reset_index() (without drop=True) keeps the original row labels in an
# 'index' column.
classes = (
    pd.read_csv(main_pth + '/data/classes.csv')
      .loc[lambda tbl: tbl['drop'] == False]
      .copy()
      .reset_index()
)
classes

Unnamed: 0,index,class_code,class_word,class_name,class_val,class_color,drop
0,0,s,stromal,Stromal cells,42,#2b83ba,False
1,1,l,lymphocyte,Lymphocytes,84,#1a9641,False
2,2,t,tumor,BE cells,126,#d7191c,False
3,3,e,epithelial,Epithelial cells,168,#ff8400,False


In [3]:
# Name of the sample set to post-process. It selects the sample table
# '<dat_pth><setname>.csv' and the output folder '<dat_pth><setname>_results/'.
# Exactly one option must be active: with all of them commented out, this and
# every following cell fails with a NameError on a fresh kernel.
# NOTE(review): 'biopsies_all' is assumed to be the intended set -- confirm.
#setname = 'biopsies'
#setname = 'biopsies_21'
setname = 'biopsies_all'

# reads samples result data 
sample_info = pd.read_csv(dat_pth + setname + '.csv')

# output folders: results root, violins of all biopsies, violins of
# per-patient means (exist_ok avoids the check-then-create race)
outdir = dat_pth + setname + '_results/'
os.makedirs(outdir, exist_ok=True)

vildir = outdir + '/violins/all_biopsies/'
os.makedirs(vildir, exist_ok=True)

vildirpid = outdir + '/violins/pid_means/'
os.makedirs(vildirpid, exist_ok=True)

sample_info

Unnamed: 0,sample_ID,DCIS,patient_ID,set,max_diag,code,ndpi,biopsy,atyp,image_file,coord_file,tiff_file,results_dir,row_min,row_max,col_min,col_max,row_ref,col_ref,num_points
0,D0017_0,D0017,983,Case,High-Grade,D0017_983_47X_A,3,0,3.0,images/D0017_0.tiff,biopsies/D0017_0.csv,rasters/D0017_0.tiff,results/D0017_0,1799,19076,3104,6474,1579,2884,24587
1,D0017_1,D0017,983,Case,High-Grade,D0017_983_47X_A,3,1,3.0,images/D0017_1.tiff,biopsies/D0017_1.csv,rasters/D0017_1.tiff,results/D0017_1,9303,16528,7902,9997,9083,7682,9549
2,D0017_2,D0017,983,Case,High-Grade,D0017_983_47X_A,3,2,3.0,images/D0017_2.tiff,biopsies/D0017_2.csv,rasters/D0017_2.tiff,results/D0017_2,809,8236,11186,13075,589,10966,13330
3,D0017_3,D0017,983,Case,High-Grade,D0017_983_47X_A,3,3,3.0,images/D0017_3.tiff,biopsies/D0017_3.csv,rasters/D0017_3.tiff,results/D0017_3,1393,8711,7248,9450,1173,7028,9552
4,D0017_4,D0017,983,Case,High-Grade,D0017_983_47X_A,3,4,3.0,images/D0017_4.tiff,biopsies/D0017_4.csv,rasters/D0017_4.tiff,results/D0017_4,4261,17144,36,2860,4041,0,16191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,D0012_4,D0012,512,Case,High-Grade,D0012_512_19R_L,3,4,,,biopsies/D0012_4.csv,rasters/D0012_4.tiff,results/D0012_4,6670,17988,54001,57978,6450,53781,13872
130,D0014_1,D0014,664,Case,High-Grade,D0014_664_10S_A1,3,1,,,biopsies/D0014_1.csv,rasters/D0014_1.tiff,results/D0014_1,6074,17068,54001,55998,5854,53781,10714
131,D0014_2,D0014,664,Case,High-Grade,D0014_664_10S_A1,3,2,,,biopsies/D0014_2.csv,rasters/D0014_2.tiff,results/D0014_2,7001,17222,56001,59994,6781,55781,13413
132,D0014_3,D0014,664,Case,High-Grade,D0014_664_10S_A1,3,3,,,biopsies/D0014_3.csv,rasters/D0014_3.tiff,results/D0014_3,16003,19986,58001,61997,15783,57781,3928


In [None]:
# Collect the per-sample result tables into cohort-level tables.
#
# For every row of `sample_info`, the result files found under
# '<dat_pth>results/<sample_ID>/' are read, tagged with the sample, patient,
# set and diagnosis identifiers, and accumulated. Files that do not exist are
# skipped silently. Tables are gathered in lists and concatenated once at the
# end (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).

def _read_tsv(fil):
    """Read a tab-separated table, or return None when the file does not exist."""
    return pd.read_csv(fil, sep='\t') if os.path.exists(fil) else None


def _tag_rows(tbl, sid, pid, st, diag):
    """Return a copy of `tbl` with sample_ID, pid, set and max_diag as leading columns."""
    out = tbl.copy()
    out.insert(loc=0, column='sample_ID', value=sid)
    out.insert(loc=1, column='pid',       value=pid)
    out.insert(loc=2, column='set',       value=st)
    out.insert(loc=3, column='max_diag',  value=diag)
    return out


def _stack(frames):
    """Concatenate tables row-wise; an empty list yields an empty DataFrame."""
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# SSH result files (name prefix) and the factor pairs extracted from each
_SSH_TABLES = {
    'coloc': ('SSH_factor_coloc_',   ['l:t', 's:t', 's:l']),
    'nni':   ('SSH_factor_NNindex_', ['l:t', 't:l', 't:s', 's:t', 'l:s', 's:l']),
    'rhi':   ('SSH_factor_RHindex_', ['l:l', 't:t', 's:s',
                                      'l:t', 't:l', 't:s', 's:t', 'l:s', 's:l']),
}

stats_rows, mh_rows, lme_rows = [], [], []
ssh_rows = {(kind, fac): []
            for kind, (_, facs) in _SSH_TABLES.items()
            for fac in facs}

n_samples = len(sample_info.index)
msg = ''  # keeps the final progress update valid when there are no samples

for index, sample in sample_info.iterrows():

    # bracket access: attribute access would collide with Series attributes,
    # and the sample table has no 'case_ID' column
    sid = sample['sample_ID']
    diag = sample['max_diag']
    pid = sample['patient_ID']
    st = sample['set']
    msg = sid + " : [{0}/{1}]".format(index + 1, n_samples)
    resdir = dat_pth + 'results/' + sid

    update_progress(index/n_samples, msg)

    # per-sample statistics; identifiers are inserted after the file's own
    # first column
    aux = _read_tsv(resdir + '/' + sid + '_' + setname + '_result_stats.tsv')
    if aux is not None:
        aux.insert(loc=1, column='pid',      value=pid)
        aux.insert(loc=2, column='set',      value=st)
        aux.insert(loc=3, column='max_diag', value=diag)
        stats_rows.append(aux)

    # sample-level MH metrics
    aux = _read_tsv(resdir + '/MH_sample_metrics_' + setname + '.tsv')
    if aux is not None:
        mh_rows.append(_tag_rows(aux, sid, pid, st, diag))

    # patch-level MH metrics: total area fraction per LME
    aux = _read_tsv(resdir + '/patch_level/MH_patch_metrics_' + setname + '.tsv')
    if aux is not None:
        auy = aux[['LME', 'area_fraction']].groupby(['LME']).sum().reset_index()
        lme_rows.append(_tag_rows(auy, sid, pid, st, diag))

    # SSH tables: one (q_statistic, p_value) selection per factor pair,
    # restricted to the 'LME_MH' stratum
    for kind, (prefix, factors) in _SSH_TABLES.items():
        aux = _read_tsv(resdir + '/patch_level/' + prefix + setname + '.tsv')
        if aux is None:
            continue
        in_stratum = aux['stratum'] == 'LME_MH'
        for fac in factors:
            auy = aux.loc[(aux['factor'] == fac) & in_stratum,
                          ['q_statistic', 'p_value']]
            ssh_rows[(kind, fac)].append(_tag_rows(auy, sid, pid, st, diag))

# cohort-level tables used by the following cells
stats = _stack(stats_rows)
MH_metrics_tbl = _stack(mh_rows)
LME_fracs = _stack(lme_rows)

SSH_coloc_lt = _stack(ssh_rows[('coloc', 'l:t')])
SSH_coloc_st = _stack(ssh_rows[('coloc', 's:t')])
SSH_coloc_sl = _stack(ssh_rows[('coloc', 's:l')])

SSH_nni_lt = _stack(ssh_rows[('nni', 'l:t')])
SSH_nni_tl = _stack(ssh_rows[('nni', 't:l')])
SSH_nni_ts = _stack(ssh_rows[('nni', 't:s')])
SSH_nni_st = _stack(ssh_rows[('nni', 's:t')])
SSH_nni_ls = _stack(ssh_rows[('nni', 'l:s')])
SSH_nni_sl = _stack(ssh_rows[('nni', 's:l')])

SSH_rhi_ll = _stack(ssh_rows[('rhi', 'l:l')])
SSH_rhi_tt = _stack(ssh_rows[('rhi', 't:t')])
SSH_rhi_ss = _stack(ssh_rows[('rhi', 's:s')])
SSH_rhi_lt = _stack(ssh_rows[('rhi', 'l:t')])
SSH_rhi_tl = _stack(ssh_rows[('rhi', 't:l')])
SSH_rhi_ts = _stack(ssh_rows[('rhi', 't:s')])
SSH_rhi_st = _stack(ssh_rows[('rhi', 's:t')])
SSH_rhi_ls = _stack(ssh_rows[('rhi', 'l:s')])
SSH_rhi_sl = _stack(ssh_rows[('rhi', 's:l')])

update_progress(1, msg)

In [None]:
# Save the LME area fractions collected for all samples
LME_fracs.to_csv(outdir + setname + '_result_lme_fractions.tsv', sep='\t', index=False)

# 't2s' is the ratio t_density / s_density; it is left as NaN wherever
# s_density is not strictly positive
stats['t2s'] = (stats['t_density']/stats['s_density']).where(stats['s_density'] > 0)

# Save the combined per-sample statistics and display them
stats.to_csv(outdir + setname + '_result_stats.tsv', sep='\t', index=False)
stats

In [None]:
import seaborn as sns
from statannot import add_stat_annotation
from itertools import combinations
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

def violin_strat(dat, classes, signal, ttl, fname):
    """Plot one violin per class for `signal`, annotate pairwise tests and save.

    A violin plot with an overlaid swarm plot is drawn for the values of
    column `signal`, grouped by column `classes` (classes in reverse sorted
    order). Every pair of classes is compared with statannot's 't-test_ind'
    and annotated with stars. The figure is written to '<fname>.png' and
    everything printed while plotting (the verbose test report) is written to
    '<fname>.txt'.

    Args:
        dat: DataFrame holding the `classes` and `signal` columns.
        classes: name of the grouping column.
        signal: name of the column with the plotted values.
        ttl: axes title.
        fname: output path without extension.

    Returns:
        [fig, ax]; the figure has already been closed.
    """
    from contextlib import redirect_stdout

    aux = dat[classes].to_list()
    auy = dat[signal].to_list()
    cls = sorted(set(aux), reverse=True)

    fig, ax = plt.subplots(1, 1, figsize=(12, 12), facecolor='w', edgecolor='k')

    try:
        # The context managers restore sys.stdout and close the log file even
        # when plotting or the statistical annotation raises.
        with open(fname +'.txt', 'w') as f, redirect_stdout(f):

            # NOTE(review): outside a `with` statement this call only returns
            # a style dictionary and does not change the plot style; kept as
            # is so the figures do not change -- confirm the intent.
            sns.axes_style("whitegrid")
            sns.violinplot(x  = aux,
                           y  = auy,
                           ax = ax,
                           palette ="Set3",
                           scale = 'count',
                           inner = 'box',
                           order = cls)
            sns.swarmplot(x  = aux,
                          y  = auy,
                          ax = ax,
                          color = "black",
                          order = cls)

            # a list (not a one-shot iterator) so the pairs can be traversed
            # more than once by the callee
            add_stat_annotation(ax, x=aux, y=auy, order=cls,
                                box_pairs=list(combinations(cls, 2)),
                                test='t-test_ind',
                                #text_format='full',
                                text_format='star',
                                loc='inside', verbose=2);
            ax.set_title(ttl)

            fig.savefig(fname +'.png', bbox_inches='tight', dpi=90)
    finally:
        # always release the figure, many of them are created in loops
        plt.close(fig)

    return([fig, ax])


def violin_groups(dat, classes, signal, groups, ttl, fname):
    """Plot violins of `signal` per class, split by group, annotate and save.

    For every value of column `classes` (sorted), one violin per value of
    column `groups` is drawn, with a dodged swarm plot on top. Within each
    class, every pair of groups is compared with statannot's 't-test_ind' and
    annotated with stars. The figure is written to '<fname>.png' and
    everything printed while plotting (the verbose test report) is written to
    '<fname>.txt'.

    The axis labels are fixed to 'LME' and 'Area fraction'.

    Args:
        dat: DataFrame holding the `classes`, `signal` and `groups` columns.
        classes: name of the column shown on the x axis.
        signal: name of the column with the plotted values.
        groups: name of the column used as hue.
        ttl: axes title.
        fname: output path without extension.

    Returns:
        [fig, ax]; the figure has already been closed.
    """
    from contextlib import redirect_stdout

    grps = sorted(set(dat[groups].to_list()), reverse=False)
    comb = list(combinations(grps, 2))

    cls = sorted(set(dat[classes].to_list()), reverse=False)

    # within every class, compare each pair of groups
    box_pairs=[]
    for cas in comb:
        box_pairs += [((x, cas[0]),(x, cas[1])) for x in cls]

    fig, ax = plt.subplots(1, 1, figsize=(12, 12), facecolor='w', edgecolor='k')

    try:
        # The context managers restore sys.stdout and close the log file even
        # when plotting or the statistical annotation raises.
        with open(fname +'.txt', 'w') as f, redirect_stdout(f):

            # NOTE(review): outside a `with` statement this call only returns
            # a style dictionary and does not change the plot style; kept as
            # is so the figures do not change -- confirm the intent.
            sns.axes_style("whitegrid")
            sns.violinplot(data= dat,
                           x  = classes,
                           y  = signal,
                           hue = groups,
                           ax = ax,
                           palette ="Set3",
                           #scale = 'count',
                           scale = 'width',
                           inner = 'box',
                           order = cls)

            # Keep the legend entries of the violins only: the swarm plot
            # below would add its own entries. `legendHandles` was renamed to
            # `legend_handles` in newer matplotlib releases; support both.
            legend = ax.legend_
            handles = getattr(legend, 'legend_handles', None)
            if handles is None:
                handles = legend.legendHandles
            labels = [text.get_text() for text in legend.texts]

            sns.swarmplot(data= dat,
                          x  = classes,
                          y  = signal,
                          hue = groups,
                          dodge=True,
                          ax = ax,
                          color = "black",
                          order = cls,
                          size = 3)

            add_stat_annotation(ax, data= dat,
                                x=classes, y=signal, hue = groups,
                                order=cls,
                                box_pairs=box_pairs,
                                test='t-test_ind',
                                #text_format='full',
                                text_format='star',
                                loc='inside', verbose=2);
            ax.set_title(ttl)
            ax.set_xlabel('LME')
            ax.set_ylabel('Area fraction')
            ax.legend(handles, labels, title='', bbox_to_anchor=(1.01, 1), loc='upper left')

            fig.savefig(fname +'.png', bbox_inches='tight', dpi=90)
    finally:
        # always release the figure
        plt.close(fig)

    return([fig, ax])

# Columns that are never plotted as a signal: identifiers, sample
# annotations, p-values and size totals
noncols = [
    'sample_ID',
    'pid',
    'set',
    'max_diag',
    'code',
    'ndpi',
    'biopsy',
    'atyp',
    'p_value',
    'num_points',
    'total_area',
    'ROI_area',
    'num_cells',
]


In [None]:
# Violin distributions of the LME area fractions over all biopsies,
# one violin per LME and per set (control vs case)
fig, ax = violin_groups(dat=LME_fracs,
                        classes='LME',
                        signal='area_fraction',
                        groups='set',
                        ttl='LME fractions',
                        fname=vildir + 'lme_fracs')

In [None]:
# Violin distributions of the simple statistics, grouped by set
# (control vs case): first per biopsy, then per patient
cols = [name for name in stats.columns if name not in noncols]

# one figure per statistic, every biopsy is one point
for col in cols:
    fig, ax = violin_strat(stats, 'set', col, col, vildir + 'stat_' + col)

# biopsies are averaged within each (pid, set) pair, so every patient is one point
aux = (
    stats[cols + ['set', 'pid']]
    .groupby(['pid', 'set'])
    .mean()
    .reset_index()
    .drop(['pid'], axis=1)
)
for col in cols:
    fig, ax = violin_strat(aux, 'set', col, col, vildirpid + 'stat_pid_' + col)
    

In [None]:
# Violin distributions of the MH metrics, grouped by set (control vs case):
# first per biopsy, then per patient

# per biopsy: every metric except the identifiers and the ones listed here
cols = [name for name in MH_metrics_tbl.columns
        if name not in noncols
        and name not in ['number_of_patches', 'total_edge', 'effective_mesh_size',
                         'fractal_dimension_mn', 'fractal_dimension_am',
                         'fractal_dimension_md', 'fractal_dimension_ra',
                         'fractal_dimension_sd', 'fractal_dimension_cv']]
for col in cols:
    fig, ax = violin_strat(MH_metrics_tbl, 'set', col, col, vildir + 'MH_metrics_' + col)

# per patient: a longer exclusion list is applied before averaging the
# biopsies within each (pid, set) pair
cols = [name for name in MH_metrics_tbl.columns
        if name not in noncols
        and name not in ['number_of_patches', 'total_edge', 'effective_mesh_size',
                         'area_am', 'area_md', 'area_ra',
                         'area_sd', 'area_cv',
                         'perimeter_am', 'perimeter_md', 'perimeter_ra',
                         'perimeter_sd', 'perimeter_cv',
                         'perimeter_area_ratio_am', 'perimeter_area_ratio_md',
                         'perimeter_area_ratio_ra', 'perimeter_area_ratio_sd',
                         'perimeter_area_ratio_cv',
                         'shape_index_am', 'shape_index_md', 'shape_index_ra',
                         'shape_index_sd', 'shape_index_cv',
                         'fractal_dimension_mn', 'fractal_dimension_am',
                         'fractal_dimension_md', 'fractal_dimension_ra',
                         'fractal_dimension_sd', 'fractal_dimension_cv',
                         'euclidean_nearest_neighbor_am', 'euclidean_nearest_neighbor_md',
                         'euclidean_nearest_neighbor_ra', 'euclidean_nearest_neighbor_sd',
                         'euclidean_nearest_neighbor_cv']]
aux = (
    MH_metrics_tbl[cols + ['set', 'pid']]
    .groupby(['pid', 'set'])
    .mean()
    .reset_index()
    .drop(['pid'], axis=1)
)
for col in cols:
    fig, ax = violin_strat(aux, 'set', col, col, vildirpid + 'MH_metrics_pid_' + col)

In [None]:
# Plot violin distributions for the SSH statistics (coloc, nni and rhi),
# grouped by set (control vs case): for every factor-pair table, one figure
# per biopsy-level signal in `vildir` and one for the per-patient means in
# `vildirpid`. Output files are named '<family>_<pair>_<col>' and
# '<family>_<pair>_pid_<col>'.
#
# Driving all tables from one mapping keeps each file name tied to its table;
# in the copy-pasted version the 'sl' colocalization plot was saved as
# 'SSH_coloc_st_' and overwrote the 'st' figure and log.

def _pid_means(tbl, cols):
    """Average `cols` over the biopsies of each (pid, set) pair and drop 'pid'."""
    return tbl[cols + ['set', 'pid']].groupby(['pid', 'set']).mean().reset_index().drop(['pid'], axis=1)


_ssh_families = [
    ('SSH_coloc', {'lt': SSH_coloc_lt, 'st': SSH_coloc_st, 'sl': SSH_coloc_sl}),
    ('SSH_nni',   {'lt': SSH_nni_lt, 'tl': SSH_nni_tl, 'ts': SSH_nni_ts,
                   'st': SSH_nni_st, 'ls': SSH_nni_ls, 'sl': SSH_nni_sl}),
    ('SSH_rhi',   {'ll': SSH_rhi_ll, 'tt': SSH_rhi_tt, 'ss': SSH_rhi_ss,
                   'lt': SSH_rhi_lt, 'tl': SSH_rhi_tl, 'ts': SSH_rhi_ts,
                   'st': SSH_rhi_st, 'ls': SSH_rhi_ls, 'sl': SSH_rhi_sl}),
]

for family, tables in _ssh_families:

    # the signal columns are taken from the 'lt' table of each family
    cols = [x for x in tables['lt'].columns if x not in (noncols)]

    for pair, tbl in tables.items():
        means = _pid_means(tbl, cols)
        for col in cols:
            fig, ax = violin_strat(tbl, 'set', col, col,
                                   vildir + family + '_' + pair + '_' + col)
            fig, ax = violin_strat(means, 'set', col, col,
                                   vildirpid + family + '_' + pair + '_pid_' + col)
    


# SPECIFIC PLOTS

In [None]:
import seaborn as sns
from statannot import add_stat_annotation
from itertools import combinations
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)


def violins(dat, classes, signal, lab):
    """Draw one violin per class for `signal`, with pairwise test annotations.

    A violin plot with an overlaid swarm plot is drawn for the values of
    column `signal`, grouped by column `classes` (classes in reverse sorted
    order). Every pair of classes is compared with statannot's 't-test_ind'
    and annotated with stars; the verbose test report is printed.

    The seaborn theme is set with font_scale=4 before the figure is created,
    so that every figure, including the first one produced after a kernel
    restart, is drawn with the same settings. The setting is global and stays
    in effect after the call.

    Args:
        dat: DataFrame holding the `classes` and `signal` columns.
        classes: name of the grouping column.
        signal: name of the column with the plotted values.
        lab: y-axis label.

    Returns:
        [fig, ax]; the figure is left open, the caller saves and closes it.
    """
    # must run before the figure exists to affect the figure being drawn
    sns.set(font_scale = 4)

    aux = dat[classes].to_list()
    auy = dat[signal].to_list()
    cls = sorted(set(aux), reverse=True)

    fig, ax = plt.subplots(1, 1, figsize=(12, 12), facecolor='w', edgecolor='k')

    # NOTE(review): outside a `with` statement this call only returns a style
    # dictionary and does not change the plot style -- confirm the intent.
    sns.axes_style("whitegrid")
    sns.violinplot(x  = aux,
                   y  = auy,
                   ax = ax,
                   palette ="Set3",
                   scale = 'count',
                   inner = 'box',
                   order = cls)
    sns.swarmplot(x  = aux,
                  y  = auy,
                  ax = ax,
                  color = "black",
                  order = cls)

    # a list (not a one-shot iterator) so the pairs can be traversed more
    # than once by the callee
    add_stat_annotation(ax, x=aux, y=auy, order=cls,
                        box_pairs=list(combinations(cls, 2)),
                        test='t-test_ind',
                        line_offset_to_box=0.2,
                        #text_format='full',
                        text_format='star',
                        loc='inside', verbose=2);
    ax.set_ylabel(lab)

    return([fig, ax])

In [None]:
# Signals selected for the specific plots, and their axis labels
# (same order in both lists)
cols = ['cell_density', 't_density', 't_fraction', 'l_fraction', 'MH_patch_density']
labs = [r'Cell Density $[\mu m^{-2}]$', 
        r'BE-cell Density $[\mu m^{-2}]$',
        'BE-cell Fraction',
        'Lymphocyte Fraction',  
        'Patch Density']

# Working copy with the two density columns rescaled with the pixel scale;
# `stats` itself is not modified
aux = stats[['sample_ID', 'set'] + cols].assign(
    cell_density=lambda tbl: tbl['cell_density']/(scale*scale),
    t_density=lambda tbl: tbl['t_density']/(100*scale*scale),
)

In [None]:
# One figure per selected signal, saved as 'aaa_stat_<col>.png' in `vildir`
for i in range(len(cols)):

    col, lab = cols[i], labs[i]

    fig, ax = violins(aux, 'set', col, lab)
    fig.savefig(vildir + 'aaa_stat_' + col +'.png', bbox_inches='tight', dpi=90)
    plt.close(fig)
    


In [None]:
# MH metrics selected for the specific plots, and their axis labels
# (same order in both lists)
cols = ['shape_index_mn', 'area_mn', 'perimeter_mn']
labs = ['Mean Shape Index', 
        r'Mean Area $[\mu m^{2}]$',
        r'Mean Perimeter $[\mu m]$']

# Working copy: the shape index is inverted, area and perimeter are
# multiplied by the pixel scale (squared for the area);
# `MH_metrics_tbl` itself is not modified
aux = MH_metrics_tbl[['sample_ID', 'set'] + cols].assign(
    shape_index_mn=lambda tbl: 1/tbl['shape_index_mn'],
    area_mn=lambda tbl: tbl['area_mn']*(scale*scale),
    perimeter_mn=lambda tbl: tbl['perimeter_mn']*(scale),
)

In [None]:
# One figure per selected MH metric, saved as 'aaa_stat_<col>.png' in `vildir`
for i in range(len(cols)):

    col = cols[i]
    lab = labs[i]

    fig, ax = violins(aux, 'set', col, lab)
    fig.savefig(vildir + 'aaa_stat_' + col +'.png', bbox_inches='tight', dpi=90)
    plt.close(fig)
  