In [None]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import re
from skimage import io
import tifffile
from scipy.ndimage import median_filter
from skimage.util import img_as_ubyte,  img_as_float
import skimage
from skimage.feature import blob_dog, blob_log, blob_doh
from math import sqrt
import scipy
import seaborn as sns
from scipy import stats

import lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import multivariate_logrank_test

from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

# Set Paths
codedir = os.getcwd()

%matplotlib inline

In [None]:
#images: download from synapse.org syn51068458
# (free account required)
# NOTE(review): rootdir and the mplex_image location below are absolute, site-specific paths;
# edit them to match your own machine before running Section 1.
rootdir = '/home/groups/BCC_Chin_Lab/ChinData/Cyclic_Workflow/cmIF_2021-05-03_PDAC'
regdir = f'{rootdir}/RegisteredImages'   # registered tifs, one sub-folder per slide_scene
segdir = f'{codedir}/Segmentation'
segdiro = f'{rootdir}/Segmentation'      # segmentation masks read by the blob-detection cell

# clone mplex_image at https://gitlab.com/engje/mplex_image
# The working directory is switched to the folder holding the clone for the import
# (presumably so the package is found via the current directory), then restored.
os.chdir('/home/groups/graylab_share/data/engje/Data/')
from mplex_image import preprocess, mpimage, cmif
os.chdir(codedir)

# Table of contents <a name="contents"></a>
0. [functions](#func)
1. [skimage blobs](#sk)
2. [Foci analysis](#focifoci)
3. [mIHC analysis](#mihc)
4. [CPH modeling](#clin)
5. [NEW CPH modeling](#newsurv)
6. [gene expression analysis](#geneexp)
7. [TCR analysis](#tcell)
8. [High v. low pORG](#split)

In [None]:
import scipy
from skimage.filters import unsharp_mask
import skimage
from skimage.restoration import (denoise_tv_chambolle, denoise_bilateral,
                                 denoise_wavelet, estimate_sigma)
from skimage import color, morphology
from skimage.transform import rescale

## functions <a name="func"></a> 

[contents](#contents)

In [None]:
def get_blobs2(image_gray,min_sigma,max_sigma,threshold,exclude_border):
    """Detect blobs with Difference of Gaussian and draw them next to the input image.

    Parameters
    ----------
    image_gray : 2D array
        Image crop to search; it is also the image shown in both panels.
    min_sigma, max_sigma, threshold, exclude_border
        Passed unchanged to ``skimage.feature.blob_dog``.

    Returns
    -------
    (blobs_dog, fig)
        ``blobs_dog`` has one row per blob ``(y, x, r)``; the third column is the
        sigma returned by ``blob_dog`` multiplied by sqrt(2).
        ``fig`` is the two-panel matplotlib figure. It is closed in pyplot before
        returning so it is not displayed automatically, but it can still be
        titled and saved by the caller.
    """
    blobs_dog = blob_dog(image_gray,  min_sigma=min_sigma, max_sigma=max_sigma,
                         threshold=threshold,exclude_border=exclude_border)
    # convert sigma (third column) to an approximate blob radius
    blobs_dog[:, 2] = blobs_dog[:, 2] * sqrt(2)

    fig, axes = plt.subplots(1, 2, figsize=(6, 3), sharex=True, sharey=True)
    ax_orig, ax_dog = axes.ravel()

    # Show the function's own input. The previous version displayed a notebook
    # global named `image`, which only worked when the caller happened to define it.
    ax_orig.set_title('Original')
    ax_orig.imshow(image_gray)

    ax_dog.set_title(f'Difference of Gaussian\nmin={min_sigma} max={max_sigma} thresh={threshold}')
    ax_dog.imshow(image_gray)
    for y, x, r in blobs_dog:
        ax_dog.add_patch(plt.Circle((x, y), r, color='red', linewidth=2, fill=False))

    fig.tight_layout()
    plt.close(fig)
    return(blobs_dog,fig)

def km_plot(df,s_col,s_time,s_censor):
    """Plot one Kaplan-Meier curve per group in ``s_col`` and report the log-rank p-value.

    df       : DataFrame holding the grouping, duration and event columns
    s_col    : name of the grouping column
    s_time   : name of the duration column
    s_censor : name of the event-observed column

    Returns ``(fig, ls_order)`` where ``ls_order`` is the sorted list of non-null groups.
    """
    se_group = df.loc[:,s_col]
    # multivariate log-rank test across all groups
    results = multivariate_logrank_test(event_durations=df.loc[:,s_time],
                                        groups=se_group,
                                        event_observed=df.loc[:,s_censor])
    kmf = KaplanMeierFitter()
    fig, ax = plt.subplots(figsize=(4,4),dpi=300)
    ls_order = sorted(se_group.dropna().unique())
    for s_group in ls_order:
        print(s_group)
        df_abun = df[se_group==s_group]
        kmf.fit(df_abun.loc[:,s_time],df_abun.loc[:,s_censor],label=s_group)
        kmf.plot(ax=ax,ci_show=True,show_censors=True)
    # group sizes, listed in the same order as the curves
    se_n = se_group.value_counts()
    ls_n = [se_n[item] for item in ls_order]
    ax.set_title(f'{s_col}\np={results.summary.p[0]:.2} n={ls_n}')
    ax.set_ylim(-0.05,1.05)
    return(fig,ls_order)

def cph_plot(df,s_multi,s_time,s_censor,figsize=(3,3)):
    """Fit a Cox proportional hazards model on ``df`` and plot its coefficients.

    df       : DataFrame with the covariate(s), the duration column and the event column
    s_multi  : covariate whose p-value is shown in the title
    s_time   : name of the duration column
    s_censor : name of the event column
    figsize  : size of the coefficient plot

    Rows containing any missing value are dropped before fitting.
    Returns ``(fig, cph)`` with the figure and the fitted ``CoxPHFitter``.
    """
    cph = CoxPHFitter()  #penalizer=0.1
    # Work on a copy: the previous version overwrote the caller's Stage column in place.
    df_fit = df.copy()
    if 'Stage' in df_fit.columns:
        # roman-numeral stage -> ordinal number
        df_fit['Stage'] = df_fit['Stage'].replace({'I':1,'II':2,'III':3,'IV':4})
    df_fit = df_fit.dropna()
    cph.fit(df_fit, s_time, event_col=s_censor)
    fig, ax = plt.subplots(figsize=figsize,dpi=300)
    cph.plot(ax=ax)
    pvalue = cph.summary.p[s_multi]
    ax.set_title(f'{s_multi}\np={pvalue:.2} n={len(df_fit)}')
    fig.tight_layout()
    return(fig,cph)
        

##  Section 1: skimage blob detection

Foci are detected here from images plus segmentation masks

**You can skip and use pre-computed foci counts**

 <a name="sk"></a> 

[contents](#contents)

In [None]:
#intensity data: download from synapse.org syn51068458
# df_mi supplies the slide_scene names iterated over by the blob-detection cell
df_mi = pd.read_csv(f'{codedir}/data/20220720_U54-TMA_FilteredMeanIntensity_Link.csv',index_col=0)

In [None]:
#skimage blob detection
#requires images: download from synapse.org syn51068458
threshold=0.002
# minimum mean nuclear intensity a cell needs, per marker, before foci are searched for
d_thresh = {'pRPA':1100, 'gH2AX':1100, 'RAD51':1300}
# (min_sigma, max_sigma) for blob_dog per marker; markers not listed use (0.1, 2)
d_sigma = {'pRPA':(0.1,2), 'gH2AX':(1,2)}
os.chdir(regdir)
ls_slide = sorted(set(df_mi.slide_scene))
ls_marker = ['pRPA','gH2AX','RAD51',]
# collect one frame per slide and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and grew the frame quadratically)
ls_result = []
for s_slide in ls_slide:
    print(s_slide)
    os.chdir(f'{segdiro}/U54-TMA-9_CellposeSegmentation') #change this to path for your downloaded segmentation masks
    # first file (sorted) whose name matches this slide's nuclei segmentation basins
    s_seg = [item for item in sorted(os.listdir())
             if re.search(f'{s_slide}_nuc30_NucleiSegmentationBasins',item)][0]
    label_image = io.imread(s_seg)
    os.chdir(f'{regdir}/{s_slide}') # change this to path to your downloaded registered tifs
    df_img=mpimage.parse_org()
    for idxs, s_marker in enumerate(ls_marker):
        intensity_image = io.imread(df_img[df_img.marker==s_marker].index[0])
        props = skimage.measure.regionprops_table(label_image, intensity_image=intensity_image, properties=('label','bbox','mean_intensity')) # 'image','intensity_image',
        df_props = pd.DataFrame(props,dtype='float').set_index('label')
        df_props.columns = [item.replace('-','') for item in df_props.columns]
        # only cells brighter than the marker threshold are searched for foci
        ls_index = df_props[df_props.mean_intensity>d_thresh[s_marker]].index
        print(f'{s_marker} {len(ls_index)}')
        min_sigma, max_sigma = d_sigma.get(s_marker,(0.1,2))
        for i_cell in ls_index:
            se_cell = df_props.loc[i_cell].dropna().astype('int')
            # crop the cell's bounding box out of the marker image
            image = intensity_image[se_cell.bbox0:se_cell.bbox2,se_cell.bbox1:se_cell.bbox3]
            blobs, fig = get_blobs2(image,min_sigma=min_sigma,max_sigma=max_sigma,threshold=threshold,exclude_border=1)
            df_props.loc[i_cell,'blobs'] = len(blobs)
            if len(blobs) > 1:
                fig.suptitle(s_marker)
                # get_blobs2 returns a figure that is already closed in pyplot, so lay out
                # through the figure itself; plt.tight_layout() would act on (and leak)
                # a brand-new empty current figure
                fig.tight_layout()
                s_outdir = f'{codedir}/blobs/{s_marker}'
                os.makedirs(s_outdir,exist_ok=True)
                fig.savefig(f'{s_outdir}/{s_slide}_{s_marker}_{i_cell}.png',dpi=200)
                plt.close(fig)
        df_props.index = [s_slide + '_cell' + str(item) for item in df_props.index]
        if len(ls_index)==0:
            # no cell passed the threshold: make sure the column exists
            df_props['blobs'] = np.nan
        if idxs == 0:
            df_result = df_props.rename({'blobs':f'{s_marker}_foci'},axis=1)
        else:
            df_result.loc[df_props.index,f'{s_marker}_foci'] = df_props.blobs
    ls_result.append(df_result)
df_result_all = pd.concat(ls_result) if ls_result else pd.DataFrame()

#uncomment to save (this data has been pre-computed and saved in the repo)
#df_result_all.loc[:,df_result_all.columns.str.contains('foci')].dropna(how='all').to_csv(f'{codedir}/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv')


### Section 2: FOCI Analysis <a name="focifoci"></a>

Load saved foci for plotting and downstream analysis

**You don't need to run section 1 to run this**


[contents](#contents)

In [None]:
# sample annotation table; later cells read its Tissue, Public_Specimen_ID,
# Public_Patient_ID, Survival and Survival_time columns
df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link.csv')
len(df_surv)


In [None]:
# GSVA scores; merged onto the annotation table by Public_Specimen_ID in the next cell
df_pORG = pd.read_csv(f'{codedir}/data/GSVA_Scores_Link.csv')#=,index_col='Public_Specimen_ID'

In [None]:
# keep rows whose Tissue is PDAC or Intestinal, then attach the GSVA scores per specimen
b_tissue = df_surv.Tissue.isin(['PDAC','Intestinal'])
df_primary = df_surv[b_tissue].copy()
print(len(df_primary))
df_primary = (df_primary
              .merge(df_pORG,on='Public_Specimen_ID',how='left')
              .set_index("Public_Specimen_ID"))
len(df_primary)

In [None]:
# one row per specimen ID (first occurrence kept), used as a lookup table
df_mapper = df_primary[~df_primary.index.duplicated()]
ls_add = ['txi_pORG_Up_42_Genes','Original_pORG_Up_78_Genes',
       'Original_pSUB_Up_100_Genes', 'txi_pSUB_Up_100_Genes']
for s_add in ls_add:
    # specimen ID -> score, copied onto every df_surv row with that specimen ID
    d_map = df_mapper.loc[:,s_add].to_dict()
    print(len(d_map))
    df_surv[s_add] = df_surv.Public_Specimen_ID.map(d_map)

In [None]:
# util is imported while the working directory is the parent of codedir
# (presumably util.py lives there and is found via the current directory - confirm);
# the working directory is restored afterwards
os.chdir('..')
import util
os.chdir(codedir)

### TMA survival

In [None]:
%matplotlib inline

In [None]:
## Kaplan-Meier analysis of a score column at several cut points

# arguments passed straight through to util.single_km
# NOTE(review): the meaning of each argument is defined in util.single_km, which is not shown here
alpha = 0.05
s_propo = ''
savedir = f'{codedir}'
s_time=  'Survival_time'#
s_censor='Survival'#
s_subtype = ''
s_cell = ''
s_type_title = ''
 
for s_col in ls_add[1:4]:
    print(s_col)
    for cutp in [0.33,0.5,0.66]: #np.round(np.arange(0.25,1,0.25),3):#
            print(cutp)
            # one row per patient: repeated Public_Patient_ID rows are dropped (first kept)
            df_km, pvalue = util.single_km(df_surv[~df_surv.Public_Patient_ID.duplicated()],s_cell,s_subtype,s_type_title,s_col,savedir,alpha,cutp, #
                                       s_time,s_censor,s_propo)
            print(pvalue)
    # NOTE(review): this break stops after the first column (ls_add[1]); the other
    # two columns in ls_add[1:4] are never analysed - confirm this is intended
    break

### define high/low pORG

based on survival differences

In [None]:
# thresholds are the 0.66 quantile of each score over PDAC rows, one row per patient
b_first_patient = ~df_surv.Public_Patient_ID.duplicated(keep='first')
b_pdac = df_surv.Tissue=='PDAC'
df_pri = df_surv[b_first_patient & b_pdac]
i_pORG_ori = np.quantile(df_pri.Original_pORG_Up_78_Genes.dropna(),0.66)
print(i_pORG_ori)
i_pORG_txi = np.quantile(df_pri.txi_pORG_Up_42_Genes.dropna(),0.66)
print(i_pORG_txi)

# label every row high (>= threshold) or low (< threshold); missing scores stay unlabelled
for s_score, s_binary, i_cut in [('Original_pORG_Up_78_Genes','pORG_binary_orig',i_pORG_ori),
                                 ('txi_pORG_Up_42_Genes','pORG_binary_txi',i_pORG_txi)]:
    df_surv.loc[df_surv[s_score] >= i_cut,s_binary] = 'high'
    df_surv.loc[df_surv[s_score] < i_cut,s_binary] = 'low'

In [None]:
#df_surv.to_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv')

In [None]:
print(df_surv.pORG_binary_orig.value_counts())
print(df_surv.pORG_binary_txi.value_counts())

In [None]:
#load combined
# the three strings below select which cell-type table is read
s_sample = '20220721_U54-TMA'#'20220711_U54-TMA' #'20220409_JP-TMAs_IMC-TMAs'
s_names = 'Combined' #'unnamed' #
s_type = 'PDAC'
df_lei = pd.read_csv(f'{codedir}/data/{s_sample}_{s_names}Celltypes_{s_type}_Link.csv',index_col=0)

# coor_mplexable -> Public_Patient_ID lookup (applied to the foci table's scene column in a later cell)
d_patient = dict(zip(df_surv.coor_mplexable,df_surv.Public_Patient_ID))

# annotate each row of df_lei with the Tissue of its patient
df_lei['Tissue'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.Tissue)))
# per-patient cell-type counts; its `epithelial` column is the denominator for mean foci later
df_epi = pd.read_csv(f'{codedir}/data/results_20220721_U54-TMA_CellTypeCounts_byPatient_byleidencelltype5_PDAC_Link.csv',index_col=0)


In [None]:

# load the pre-computed foci counts; the file name is built from the marker list and threshold
ls_marker = ['gH2AX','pRPA','RAD51']
threshold=0.002
df_foci2 = pd.read_csv(f'{codedir}/data/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv',index_col=0)
# index entries were written from float labels (e.g. '..._cell12.0'); strip the '.0'
# NOTE(review): replace() removes every '.0' in the string, not only a trailing one
df_foci2.index = [item.replace('.0','') for item in df_foci2.index]
# missing counts are treated as zero foci
df_foci2 = df_foci2.fillna(0)
df_foci2['scene'] = [item.split('_cell')[0] for item in df_foci2.index]
df_foci2['Patient'] = df_foci2.scene.map(d_patient)

In [None]:
# plot Ki67 versus pRPA foci
# annotate each cell with its patient's cohort and high/low pORG call
s_define = 'pORG_binary_txi'#'pORG_binary_orig'#'pORG_binary'
df_lei['Cohort'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.Cohort)))
df_lei['pORG_binary'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.loc[:,s_define])))
# Ki67 status: positive when intensity exceeds 3*256, otherwise (including missing) negative.
# Built in one assignment; the previous `df_lei.Ki67pos.fillna(..., inplace=True)` is a
# chained in-place update that does not write back under pandas copy-on-write.
df_lei['Ki67pos'] = np.where(df_lei.Ki67>3*256,'Ki67+','Ki67-')
ls_foci =['pRPA_foci','gH2AX_foci','RAD51_foci']
# attach foci counts by cell index; cells without a foci record count as zero
df_lei_foci = df_lei.merge(df_foci2.loc[:,ls_foci],left_index=True,right_index=True,how='left')
df_lei_foci.loc[:,ls_foci] = df_lei_foci.loc[:,ls_foci].fillna(0)

In [None]:
#use df foci sum 2 (lower threshold)
# restrict to cells labelled epithelial, then sum foci per patient
ls_index = df_lei[df_lei.leidencelltype5=='epithelial'].index
df_foci_sum2 = df_foci2.loc[df_foci2.index.isin(ls_index)].groupby('Patient').sum()
for s_marker in ls_marker:
    # log(1 + total foci) per patient
    df_foci_sum2[f'log_{s_marker}_foci'] = np.log(df_foci_sum2.loc[:,f'{s_marker}_foci'] + 1)
    # total foci divided by df_epi.epithelial (presumably the patient's epithelial cell count - confirm);
    # patients absent from either table get 0
    df_foci_sum2[f'mean_{s_marker}_foci'] = (df_foci_sum2.loc[df_foci_sum2.index.isin(df_epi.index),f'{s_marker}_foci']/df_epi.epithelial).fillna(0)

    
df_foci_sum2['Public_Patient_ID'] = df_foci_sum2.index
# NOTE(review): this overwrites df_surv, so re-running the cell merges the foci columns a
# second time (earlier copies receive the '_1' suffix)
df_surv = df_surv.merge(df_foci_sum2,on='Public_Patient_ID',how='left',suffixes=('_1',''))

In [None]:
#compare Ki67+ versus negative, double violinplots
#compare all the combinations, double violinplots
# For each grouping column (pORG_binary, Cohort) and each foci type, draws a violin plot
# with an overlaid strip plot of foci counts in epithelial PDAC cells, split by Ki67 status,
# and annotates Bonferroni-corrected t-test p-values.
%matplotlib inline
import seaborn as sns
from scipy import stats
from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# 'Ki67': compare Ki67+ vs Ki67- within each group; 'all': compare every pair of (group, Ki67) cells
s_compare = 'Ki67'#'all' #
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
for s_cat in ['pORG_binary','Cohort']:
    ls_order = ['Ki67-','Ki67+']
    for s_foci in ls_foci:
        if s_compare == 'all':
            figsize=(3.5,3)
        else:
            figsize=(2.8,2.8)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        order = []
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial')
                                      & (df_lei_foci.Tissue=='PDAC') & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            # one-sided test: are foci counts greater in Ki67+ cells than in Ki67- cells?
            statistic, pvalue = stats.ttest_ind(df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67+',s_foci],
                                                df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67-',s_foci],
                                               alternative='greater')
            for s_order in ls_order:
                order.append((s_cohort,s_order))
            df_both = pd.concat([df_both,df_plot_foci])
            d_pval.update({s_cohort:pvalue})
        sns.violinplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,ax=ax,alpha=0.5,linewidth=0.5)
        sns.stripplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,s=1,dodge=True,ax=ax,palette='dark',jitter=0.2)
        #annotate
        if s_compare == 'all':
            pairs = list(combinations(order, r=2))
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.configure(test="t-test_ind",line_width=1)#,alternative='greater
            pvalues = annotator.apply_test().annotations #annotator.apply_and_annotate() # 
            pvalues = [item.data.pvalue for item in pvalues]     
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        else:
            # NOTE(review): indexing order[0..3] assumes s_cat has exactly two categories
            pairs = [(order[0],order[1]),(order[2],order[3])]
            pvalues = [d_pval[pairs[0][0][0]],d_pval[pairs[1][0][0]]]
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        
        ax.set_title(f"{s_foci.replace('_',' ')} vs. {s_cat.split('_')[0]}", fontsize='x-large')
        ax.set_xlabel(s_cat)
        ax.set_ylabel(f"No. {s_foci.replace('_',' ')}")
        # the violin and strip plots each add legend entries; a leading underscore hides the first two
        h, l = ax.get_legend_handles_labels()
        labels =  [f'_{item}' if ind < 2 else item for ind,item in enumerate(l)]
        ax.legend(h,labels,title='',fontsize='small',markerscale=.5,bbox_to_anchor=(1.01,0.9))
        plt.tight_layout()
        # NOTE(review): s_cohort here is whatever value the inner loop ended on, so the file
        # name carries the last category of s_cat rather than s_cat itself - confirm intended
        fig.savefig(f'{codedir}/violinplot_both_{s_foci}_{s_cohort}_{s_compare}.png')
        #df_plot_foci.loc[:,ls_col].to_csv(f'results_foci_Ki67_{s_cohort}.csv')
        #break
    #break

### Section 3: mIHC Analysis <a name="mihc"></a>

Stacked barplot


[contents](#contents)

In [None]:
#survival
# NOTE(review): this reads u54_tma_sampleannot_Link_new.csv; the only statement in this
# notebook that writes that file is the commented-out to_csv in Section 2, so the file
# must already exist in data/ for a fresh run to succeed

df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv',index_col=0)
# Sample_ID = last six characters of the patient ID text before '-T'; missing IDs become 'none'
df_surv['Sample_ID'] = [item.split('-T')[0][-6::] for item in df_surv.loc[:,'Public_Patient_ID'].fillna('none')]

In [None]:
df_surv

In [None]:
# download mIHC data from https://www.synapse.org/#!Synapse:syn51078766
# combine ROIs into large dataframe and save
# This cell only runs when the combined CSV does not exist yet.
if not os.path.exists(f'data/20221123_mIHC_LiverLung_Celltypes.csv'):
    ls_col = ['Sample_ID','class','Location_Center_X', 'Location_Center_Y', 
           'GRZB_func', 'KI67_func', 'PD1_func', 'PDL1_func', 'CD163_func',
           'CCR2_func', 'HLAII_func', 'EOMES_func','Area']
    df_ll =pd.DataFrame()
    for s_file in sorted(os.listdir('mIHC_Data')):
        df = pd.read_csv(f'mIHC_Data/{s_file}',index_col=0)
        # sample name = text between 'LiverLungBCC' and the first '.' of the file name
        s_sample = s_file.split('LiverLungBCC')[1].split('.')[0]
        df['Sample_ID'] = s_sample
        # index becomes '<text after Nuclei_, with ROI -> _scene>_cell<original index>'
        df.index = [f'{s_sample.split("Nuclei_")[1].replace("ROI","_scene")}_cell{item}' for item in df.index]
        df_ll = pd.concat([df_ll,df.loc[:,ls_col]])
        # NOTE(review): this break stops after the first file, so the saved CSV would hold a
        # single ROI - looks like a leftover from debugging; confirm before rebuilding the cache
        break
    # NOTE(review): df_ll_annot is not defined anywhere in the visible notebook (the later
    # cells load df_ll_ann and df_ll_roi), so this branch raises NameError on a fresh kernel
    df_ll['Organ'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Met Site'])))
    df_ll['Location'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Location'])))
    df_ll['Desc'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Desc'])))
    df_ll.to_csv(f'data/20221123_mIHC_LiverLung_Celltypes.csv')

In [None]:
#load data
# paths are relative to the current working directory
df_ll = pd.read_csv(f'data/20221123_mIHC_LiverLung_Celltypes.csv',index_col=0,low_memory=False)
print(len(df_ll))

# per-sample annotation (Cohort and Patient columns are read in the next cell)
df_ll_ann = pd.read_csv('data/LiverLung_annotations.csv',index_col=0)
# per-ROI annotation (ROI, Location, Cohort and Patient columns are read in later cells)
df_ll_roi = pd.read_csv('data/annotated_LiverLung_perROI.csv',index_col=0)

In [None]:
#add annotation
# split Sample_ID ('...Nuclei_<sample>ROI<roi>') into its sample number and ROI number
ls_sample_id = list(df_ll.Sample_ID)
df_ll['Sample_ID_short'] = [s_id.split('Nuclei_')[1].split('ROI')[0] for s_id in ls_sample_id]
df_ll['Sample_ID_int'] = [int(s_short) for s_short in df_ll.Sample_ID_short]
df_ll['ROI'] = [int(s_id.split('ROI')[1]) for s_id in ls_sample_id]

# per-sample annotation, looked up by integer sample number
d_ann_cohort = dict(zip(df_ll_ann.index, df_ll_ann.Cohort))
d_ann_patient = dict(zip(df_ll_ann.index, df_ll_ann.Patient))
df_ll['Cohort'] = df_ll.Sample_ID_int.map(d_ann_cohort)
df_ll['Patient'] = df_ll.Sample_ID_int.map(d_ann_patient)

# per-ROI annotation, looked up by '<sample>_<roi>'
df_ll['Sample_ROI'] = df_ll.Sample_ID_int.astype('str') + '_' + df_ll.ROI.astype('str')
df_ll_roi['Sample_ROI'] = df_ll_roi.index.astype('str') + '_' + df_ll_roi.ROI.astype('str')
d_roi_location = dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Location']))
df_ll['Location'] = df_ll.Sample_ROI.map(d_roi_location)

In [None]:
# classII is a coarser copy of class in which the three T-cell subsets share one label
d_tcell = dict.fromkeys(['T-regulatory CD4 cells','CD4 T helper cells','CD8 T cells'],'T cells')
df_ll['classII'] = df_ll.loc[:,'class'].replace(d_tcell)
# the original, fine-grained labels are kept under the name classI
df_ll.rename({'class':'classI'},axis=1,inplace=True)

In [None]:
# one colour per patient ID: nine IDs take shades of purple, three take shades of blue
ls_id_purple = ['ST-00016289', 'ST-00017078', 'ST-00017310', 'ST-00017381',
       'ST-00018269', 'ST-00018955', 'ST-00019367', 'ST-00019368',
       'ST-00020181']
ls_id_blue = ['ST-00015839', 'ST-00017440', 'ST-00017804']
d_color = {**dict(zip(ls_id_purple,sns.color_palette('Purples',9))),
           **dict(zip(ls_id_blue,sns.color_palette('Blues',3)))}

# cell classes, in the order they are tested and stacked in the plots below
ls_mihc = [ 'PanCK+','aSMA+','CD4 T helper cells','CD8 T cells', 'T-regulatory CD4 cells','B cells', 'Granulocytes', 'Monocyte',
 'Macrophage', 'Mature DC','Immature DC',]


In [None]:
#by location
# For each tissue location, compute the per-patient fraction of every cell class, compare
# the two cohorts with a t-test, and draw a per-ROI boxplot for each significant class.
s_define = 'pORG_Score' #'txi_pORG_Up_42_Genes'#''#
s_group = 'Cohort'
alpha = 0.05
s_column ='classI'
s_patient = 'Public_Patient_ID'

for s_loc in ['all','T','B','D']: 
    if s_loc == 'all':
        df_loc = df_ll
    else:
        df_loc = df_ll[df_ll.Location==s_loc]
    print(len(df_loc.Sample_ID_short.unique()))
    # fraction of each class per patient = class count / total cell count
    df_group = (df_loc.groupby(['Patient',s_column]).count().Sample_ID/(df_loc.groupby(['Patient']).count().Sample_ID)).unstack()
    df_group[s_patient] = df_group.index
    df_group['Cohort'] = df_group.index.map(dict(zip(df_ll_roi.Patient,df_ll_roi.loc[:,'Cohort'])))
    df_group = df_group.merge(df_surv.loc[:,['Survival','Survival_time',s_define,s_patient]],on=s_patient)
    df_group = df_group[~df_group.loc[:,s_patient].duplicated()]
    df_group = df_group[df_group.Cohort!='Liver met'].fillna(0)
    ls_order = sorted(df_group.loc[:,s_group].unique())
    # The two groups are the same for every marker, so resolve them once. An explicit
    # length check replaces the former bare `except:`, which hid every kind of error.
    ls_groups = list(df_group.loc[:,s_group].unique())
    if len(ls_groups) >= 2:
        s_high, s_low = ls_groups[0], ls_groups[1]
        n_high = sum(df_group.loc[:,s_group]==s_high)
        n_low = sum(df_group.loc[:,s_group]==s_low)
        for s_marker in ls_mihc:
            statistic,pvalue = stats.ttest_ind(df_group.loc[df_group.loc[:,s_group]==s_high,s_marker],
                                               df_group.loc[df_group.loc[:,s_group]==s_low,s_marker])
            if pvalue <= alpha:
                # same fractions, but per ROI, for plotting
                df_group_roi = (df_loc.groupby(['Sample_ROI',s_column]).count().Sample_ID/(df_loc.groupby(['Sample_ROI']).count().Sample_ID)).unstack()
                df_group_roi['Cohort'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Cohort'])))
                df_group_roi['Patient'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Patient'])))
                df_group_roi = df_group_roi[df_group_roi.Cohort.isin(['liver_cohort','lung_cohort'])]
                fig, ax = plt.subplots(figsize=(3,3),dpi=300)
                sns.boxplot(data=df_group_roi,x=s_group,y=s_marker,showfliers=False,ax=ax,order=[str(item) for item in ls_order],palette=['mediumpurple','deepskyblue'])
                sns.stripplot(data=df_group_roi,x=s_group,y=s_marker,ax=ax,hue='Patient',s=3,palette=d_color)
                # pin the y-limits chosen by the boxplot so later artists cannot rescale the axis
                ax.set_ylim(ax.get_ylim()[0],ax.get_ylim()[1])
                ax.set_title(f'{s_group} versus\n {s_marker}\n p={pvalue:.4f} (n={n_low}, {n_high})')
                # alternative label: f'Fraction {s_marker} in {s_loc}' (it was set and then
                # immediately overwritten by the line below, so it never appeared)
                ax.set_ylabel(f'{s_marker}')
                ax.get_legend().remove()
                plt.tight_layout()
                #fig.savefig(f'{s_date}/boxplot_mIHC_{s_marker}_versus_{s_group}_in_{s_loc}.png')
            #break
    # only the 'all' location is processed; the cells below read df_group from this pass
    break

In [None]:

# mean of the s_define score per cohort. The column is selected before aggregating because
# df_group also holds a string ID column, and pandas >= 2.0 raises when asked for its mean.
print(df_group.groupby('Cohort')[s_define].mean())
#print(df_group.groupby('Cohort')[s_define].sem())

In [None]:
#df_ll.classI.unique()

In [None]:
# stacked bar of the mean cell-class fractions per cohort
fig,ax=plt.subplots(dpi=300)
# select the class columns before aggregating: df_group also holds a string ID column,
# and pandas >= 2.0 raises when asked for its mean
df_group.groupby('Cohort')[ls_mihc].mean().plot(kind='bar',width=.9,stacked=True,ax=ax,colormap='Paired')
ax.legend(bbox_to_anchor=(1,.9))
ax.set_ylabel('Fraction in Tissue')
plt.tight_layout()

# Section 4 <a name="clin"></a>

pORG multivariable survival analysis with clinicopathological variables


[contents](#contents)

### clinicopathological variables

In [None]:
# load patient vital status
# also has stage / grade / lymph nodes etc.
df_vital = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/FMP_Patients_Nov17_2021.xlsx',sheet_name='Sheet1')

# Staging reference used for the collapse below:
# Stage 1: the cancer is not more than 4cm in size and hasn't spread outside the pancreas. It is split into 1A and 1B.
#   1A in TNM staging is T1, N0, M0; 1B in TNM staging is T2, N0, M0.
# Stage 2A: the cancer is bigger than 4cm but is still within the pancreas. It has not spread to the lymph nodes or other areas of the body.
#   2A in TNM staging is T3, N0, M0; 2B in TNM staging is T1, 2 or 3, N1, M0.
# Stage 3: the cancer is any size within the pancreas and has spread to 4 or more nearby lymph nodes
#   (TNM: T1, 2 or 3, N2, M0),
#   or it has started to grow outside the pancreas into the major blood vessels nearby; it may or may not have
#   spread into the lymph nodes and hasn't spread to any other areas of the body (TNM: T4, Any N, M0).
# Stage 4: advanced (metastatic) cancer (TNM: Any T, Any N, M1).

#collapse stage
# np.nan replaces np.NaN throughout this cell: the NaN alias was removed in NumPy 2.0
d_stage = {'2B - IIB':'II', 'p2A':'II', 'p2B':'II', '4 - IV':'IV', '2A - IIA':'II', 'p3':'III', '1B - IB':'I',
    # 'c2B' now maps to 'II', consistent with every other 2A/2B code here (it was 'I')
    'c1B':'I', 'c2B':'II', 'nan':np.nan, 'p4':'IV','2B - T1-3, N1, M0':'II','p0':'0','p1B':'I','c1':'I',
    'c4':'IV','c3':'III','p1A':'I','pNA':np.nan,'3 - T4, Any N, M0':'III','c2A':'II','p4B':'IV','c4B':'IV',
     '2B - T1, N1, M0 / T2, N1, M0 / T3, N1, M0':'II','pUNK':np.nan,'p2':'II','p3B':'III','c2':'II','p3A':'III',
    '3 - III':'III','99 - Unknown':np.nan,'1A - IA':'I','c3A':'III','c4A':'IV','p1':'I','c1A':'I','p4A':'IV',
    '88 - Not applicable to 7th Edition staging':np.nan,
           '88 - No classification is recommended in 6th Edition':np.nan,
    '2A - T3, N0, M0':'II','4 - Any T, Any N, M1':'IV'}
df_vital['Stage'] = df_vital.loc[:,'Stage Grouping _ Dominant'].replace(d_stage) 

#collapse grade

d_grade = {'Grade II  Moderately Diff / Mod Well Diff':'2',
       'Grade I   Well Differentiated/Differentiated':'1',
       'Cell type not determined; not stated;N/A;Unk; high grade dysplas':np.nan,
       'Grade III Poorly Differentiated':'3', 'nan':np.nan,'Grade IV Undifferentiated, Anaplastic':'4',
         'B-CELL    LYMPHOMA OR LEUKEMIA ONLY':np.nan}
df_vital['Grade'] = df_vital.loc[:,'Grade_Differentiation'].replace(d_grade) 

#collapse LV invasion

d_replace = {'nan':np.nan, 'LYMPHOVASCULAR INVASION STATED AS NOT PRESENT':'NO',
       'LYMPHOVASCULAR INVASION PRESENT/IDENTIFIED':'YES',
       'Unknown/Indeterminate':np.nan, 'NOT APPLICABLE':np.nan,
       'Lymph-vascular Invasion Present/Identified':'YES',
       'LYMPHATIC AND SMALL VESSEL INVASION ONLY (L)':'YES',
       'BOTH LYMPHATIC AND SMALL VESSEL AND VENOUS (LARGE VESSEL) INVASION':'YES'}
df_vital['LV_Invasion'] = df_vital.loc[:,'Lymph_vascular Invasion'].replace(d_replace)

# LN positivity: 'YES' when at least one regional node is positive, 'NO' when none,
# missing when the count is missing. Built in one expression so that NaN is never
# written into a boolean column (deprecated in recent pandas).
se_ln_count = df_vital.loc[:,'Regional Lymph Nodes Positive']
df_vital['LN_Pos'] = (se_ln_count >= 1).map({True:'YES',False:'NO'}).where(se_ln_count.notna())

# event indicator: 1 = dead, 0 = alive
df_vital['Survival'] = df_vital.cVitalStatus.replace({'Alive':0,'Dead':1})
print(df_vital.Grade.unique())
print(df_vital.Stage.unique())



In [None]:
# #save
# #'Stage Grouping _ Dominant'
# pd.Series(d_stage, name='Stage').to_csv('Stage_Mapping.csv')
# # 'Grade_Differentiation'
# pd.Series(d_grade, name='Grade').to_csv('Grade_Mapping.csv')
# #'Lymph_vascular Invasion'
# pd.Series(d_replace, name='LV_Invasion').to_csv('LV_Invasion_Mapping.csv')

In [None]:
# %matplotlib inline
# # check prognostic value of clinicopathologiocal variables

# ls_vital = ['Stage', 'Grade','LV_Invasion','LN_Pos'] 
# s_time = 'cDays from Diagnosis to FU'
# s_censor = 'Survival'
# for s_vital in ls_vital:
#     print(s_vital)
#     df = df_vital.loc[df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas',[s_vital,s_time,s_censor]].dropna(how='any')
#     fig, __ = km_plot(df,s_vital,s_time,s_censor)
#     fig.savefig(f'figures/KM_clinicopath_{s_vital}.png')
#     #break

In [None]:
#CPH
# single-variable Cox model for each clinicopathological variable, pancreas cases only
s_time = 'cDays from Diagnosis to FU'
s_censor = 'Survival'
df_vital['Survival_time'] = df_vital.loc[:,s_time]
ls_vital = ['LV_Invasion','Stage', 'Grade','LN_Pos'] #'Age at Diagnosis','Sex',
d_stage_num = {'I':1,'II':2,'III':3,'IV':4}
for s_vital in ls_vital:
    print(s_vital)
    b_pancreas = df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas'
    df = df_vital.loc[b_pancreas,[s_vital,s_time,s_censor]].dropna(how='any')

    # ordinal variables become integers
    if 'Stage' in df.columns:
        df['Stage'] = df['Stage'].replace(d_stage_num).astype('int')
    if 'Grade' in df.columns:
        df['Grade'] = df['Grade'].astype('int')
    # categorical (object) variables become a single dummy column; s_vital is renamed to match
    if df[s_vital].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df = df.drop(s_vital,axis=1)
        s_vital = df_dummy.columns[0]
        df[s_vital] = df_dummy
    fig, cph = cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_single_{s_vital}_all.png')
    # only the first variable in ls_vital is modelled
    break

### omics data

In [None]:
#load full patient data, neoadjuvant
# folder holding the supplemental workbooks (same path prefix for every read below)
s_suppdir = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData'

df_patient = pd.read_excel(f'{s_suppdir}/SupplementalDataset1_age.xlsx',
                           sheet_name='Patients - Tab 1')
df_patient = df_patient.rename({'Patient ID':'Public_Patient_ID'},axis=1)
# Lung is assigned first, so a patient with both flags set ends up labelled Liver
df_patient.loc[df_patient.loc[:,'Lung Met Present']=='YES','Cohort'] = 'Lung'
df_patient.loc[df_patient.loc[:,'Liver Met Present']=='YES','Cohort'] = 'Liver'

#df_patient['Public.Specimen.ID'] = [item + '-T' for item in df_patient.loc[:,'Patient ID']]

#neo
# 'Yes' for rows marked 'Yes Neoadjuvant', 'No' for everything else
df_patient.loc[df_patient.loc[:,'Neoadjuvant Treatment'] == 'Yes Neoadjuvant','Neoadjuvant'] = 'Yes'
df_patient['Neoadjuvant'] = df_patient.Neoadjuvant.fillna('No')


#load T cell data
# both sheets are read with a single parse of the workbook
d_tcr = pd.read_excel(f'{s_suppdir}/SupplementalDataset6.xlsx',
                      sheet_name=['Tumor Samples','Blood Samples'])
df_tcell = d_tcr['Tumor Samples']
df_tcell_blood = d_tcr['Blood Samples']

#both
df_tcell = df_tcell.merge(df_tcell_blood,on='Patient ID',suffixes=(' tumor',' blood'),how='outer')

df_tcell = df_tcell.rename({'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
                'Templates per ng':'Templates_per_ng','Patient ID':'Public_Patient_ID'},axis=1)

#merge
df_patient = df_patient.merge(df_tcell,on='Public_Patient_ID',how='outer')

#load pSUB
# the workbook is parsed once (all sheets); df_gsva is a renamed copy of one sheet, so
# the frames stored in d_gsva keep their original column names
d_gsva = pd.read_excel(f'{s_suppdir}/SupplementalDataset1.xlsx', sheet_name=None)
df_gsva = d_gsva['GSVA Scores - Tab 3'].rename({'pSUB\nPrimaries':'pSUB_Primary','pORG\nPrimaries':'pORG_Primary',
                'Patient ID':'Public_Patient_ID'},axis=1)

df_gsva = df_gsva.merge(d_gsva['Specimen Subtype - Tab 2'].loc[:,['PurIST Score','Patient Specimen ID','PurIST Subtype']],on='Patient Specimen ID')

# columns whose header still contains a line break are left out of the merge
df_patient = df_patient.merge(df_gsva.loc[:,~df_gsva.columns.str.contains('\n')],on='Public_Patient_ID',how='outer')


In [None]:
df_gene = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset2.xlsx',
                           sheet_name='Mutation Data')

# add one boolean `<gene>_altered` column per gene that has more than 10 mutation rows
ls_genes = []
for s_gene in df_gene.Gene.unique():  
    ls_patient = df_gene.loc[df_gene.Gene==s_gene,'Patient Specimen ID']
    if len(ls_patient) > 10:
        print(s_gene)
        # True when the specimen appears in this gene's mutation rows, False otherwise.
        # A single assignment replaces the former `.loc[...].fillna(False, inplace=True)`,
        # which acted on a temporary object and is not guaranteed to modify df_patient.
        df_patient[f'{s_gene}_altered'] = df_patient.loc[:,'Patient Specimen ID'].isin(ls_patient)
        ls_genes.append(f'{s_gene}_altered')
    #break


### overlap in omics data samples

In [None]:
#the samples overlap/don't overlap in assays
# all samples 

from matplotlib_venn import venn3, venn3_circles
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from pyvenn import venn
# every sheet of the ID key workbook, as a dict of sheet name -> DataFrame
d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

print(d_ids.keys())
# ID column -> sheets whose IDs are compared
d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
          'OPTR':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
          #'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
       }

for s_col, ls_keys in d_keys.items():
    d_sets = {}
    for s_key in ls_keys:
        print(s_key)
        print(len(d_ids[s_key]))
        d_sets[s_key] = d_ids[s_key]
    #plot
    # only three- and four-set diagrams are drawn
    if len(ls_keys) not in (3,4):
        continue
    ls_sets = [set(df_key.loc[:,s_col]) for df_key in d_sets.values()]
    ls_names = [s_name.split('Key')[0] for s_name in d_sets]
    if len(ls_keys) == 4:
        labels = venn.get_labels(ls_sets)
        fig,ax = venn.venn4(labels, names=ls_names)#,ax=ax
    else:
        fig,ax = plt.subplots(figsize=(3,3),dpi=300)
        venn3(ls_sets, ls_names,ax=ax)


In [None]:


# #primaries

# from matplotlib_venn import venn3, venn3_circles
# from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
# from pyvenn import venn
# d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

# print(d_ids.keys())
# d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
#           #'OPTR':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
#           'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
#        }

# for s_col ,ls_keys in d_keys.items():
#     d_sets = {}
#     for s_key in ls_keys:
#             print(s_key)
#             print(len(d_ids[s_key]))
#             df_set = d_ids[s_key][~d_ids[s_key].loc[:,'OPTR.Specimen.ID'].str.contains('-M')]
#             d_sets.update({s_key:df_set})
#     #plot
#     if len(ls_keys) == 4:
#         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
#         fig,ax = venn.venn4(labels, names=[key.split('Key')[0] for key, item in d_sets.items()],ax=ax)
#     elif len(ls_keys) == 3:
#         fig,ax = plt.subplots(figsize=(3,3),dpi=300)
#         venn3([set(item.loc[:,s_col]) for key, item in d_sets.items()], [key.split('Key')[0] for key, item in d_sets.items()],ax=ax)


In [None]:
# Build one specimen ID table: start from the RNA-seq key and append TCR
# tumor/blood specimens whose public specimen ID is not in the RNA-seq key.
df_id = d_ids['RnaSeqKey']
ls_ids = df_id.loc[:,'Public.Specimen.ID']

# add patients w/o RNA seq
for s_key in ['TcrTumorKey','TcrBloodKey']:
    df_add = d_ids[s_key].loc[~d_ids[s_key].loc[:,'Public.Specimen.ID'].isin(ls_ids)]
    df_id = pd.concat([df_id,df_add])

#check
print(df_id.loc[:,'OPTR.Specimen.ID'].duplicated().any())

# One row per patient: keep '-T' specimens, excluding the '-T2' ones.
# pd.concat above keeps each sheet's own row labels, so df_id can contain
# duplicate labels; select with boolean masks rather than dropping by label,
# which would also remove unrelated rows that share a label with a '-T2' row.
b_t2 = df_id.loc[:,'OPTR.Specimen.ID'].str.contains('-T2')
b_tumor = df_id.loc[:,'OPTR.Specimen.ID'].str.contains('-T')
# ls_drop is kept because a later cell reads it.
ls_drop = df_id.loc[b_t2].index
df_unique = df_id.loc[b_tumor & ~b_t2]

#check
print(df_unique.OPTR.duplicated().any())

#add id
df_vital['Public_Patient_ID'] = df_vital.OPTR.map(dict(zip(df_unique.OPTR,df_unique.loc[:,'Biolibrary.Subject.ID'])))

#omics data plus clinical data
df_patient = df_patient.merge(df_vital,on='Public_Patient_ID',how='left')

# NOTE(review): a later cell reads 'Patient_IDs.csv', but this export is commented out.
#df_id.to_csv('Patient_IDs.csv')

In [None]:
#add age category
# Split 'Age at Diagnosis' at 70 into the labels '>70' and '<=70'. Rows with a
# missing age match neither comparison and are not assigned by these two lines.
df_patient.loc[df_patient.loc[:,'Age at Diagnosis'] > 70,'Age'] = '>70'
df_patient.loc[df_patient.loc[:,'Age at Diagnosis'] <= 70,'Age'] = '<=70'
# NOTE(review): ls_drop is not defined in this cell. In top-to-bottom order it
# holds row labels of '-T2' rows taken from df_id, a different table — confirm
# that dropping those labels from df_patient is intended.
df_patient = df_patient.drop(ls_drop)

In [None]:
# Exclude patients recorded as dead with fewer than 30 days between resection
# and follow-up (treated as deaths from surgery).
b_short_followup = df_patient.loc[:,'Days from Resection to FU'] < 30
b_dead = df_patient.loc[:,'cVitalStatus'] == 'Dead'
ls_drop_surgery = df_patient[b_short_followup & b_dead].index
df_patient = df_patient.drop(ls_drop_surgery)

## Survival Analysis - patients in paper

KM, CPH

In [None]:
#CPH
# Single-covariate Cox proportional-hazards fits for the clinical variables.
# The 'exp(coef)' and 'p' columns of each fit summary are appended to df_result.
# NOTE(review): s_time, s_censor and cph_plot are not defined in this cell and
# must already exist in the kernel (s_time/s_censor are assigned in the next
# code cell) — confirm the intended run order.
df_result=pd.DataFrame()
ls_vital = ['Neoadjuvant','Age','Sex','Stage', 'Grade','LV_Invasion','LN_Pos', #'Age at Diagnosis',
       'PurIST Score']
#ls_vital = ls_genes
for s_vital in ls_vital:
    print(s_vital)
    # complete cases only: covariate, time and event columns all present
    df = df_patient.loc[:,[s_vital,s_time,s_censor]].dropna(how='any')
    # Stage: roman numerals -> integers; Grade: cast to integer
    if df.columns.isin(['Stage']).any():
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    # Object-dtype covariate: one-hot encode with the first level dropped and
    # continue with the first dummy column as the covariate.
    # (assumes a two-level variable so df_dummy has a single column — TODO confirm)
    if df.loc[:,s_vital].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df.drop(s_vital,axis=1,inplace=True)
        s_vital = df_dummy.columns[0]
        df[s_vital] = df_dummy
    # shorten signature names such as '<name>_Up_78_Genes' to the part before '_Up'
    if s_vital.find('_Up_')>-1:
        d_rename = {s_vital:s_vital.split('_Up')[0]}
        df=df.rename(d_rename,axis=1)
        s_vital = s_vital.split('_Up')[0]
    fig, cph = cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_single_{s_vital}_subset.png')
    df_result=pd.concat([df_result,cph.summary.loc[:,['exp(coef)','p']]])
    # NOTE(review): this break ends the loop after the first variable, so only
    # 'Neoadjuvant' is fitted — looks like a debugging leftover; confirm.
    break

In [None]:
%matplotlib inline
# NOTE(review): despite its name, ls_drop here is the index of rows whose
# specimen ID does NOT contain '-T2' (rows with a missing ID included). It is
# not used in this cell.
ls_drop = df_patient[~df_patient.loc[:,'Patient Specimen ID'].str.contains('-T2').fillna(False)].index
# check prognostic value of clinicopathological variables
import lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import multivariate_logrank_test
ls_vital = ['Neoadjuvant','Age','Sex','Stage', 'Grade','LV_Invasion','LN_Pos'] 
#ls_vital  = ls_genes
# follow-up time column and event column; these module-level names are also
# read by the other survival cells
s_time = 'cDays from Diagnosis to FU'
s_censor = 'Survival'
for s_vital in ls_vital:
    print(s_vital)
    # exclude rows whose Stage is the string '0', then keep complete cases
    df = df_patient.loc[df_patient.Stage!='0',[s_vital,s_time,s_censor]].dropna(how='any')
    # relabel True/False values (e.g. the '<gene>_altered' columns) as text
    df.loc[:,s_vital] = df.loc[:,s_vital].replace({True:'altered',False:'WT'})
    # km_plot is defined elsewhere in the notebook; only the figure it returns is used here
    fig, __ = km_plot(df,s_vital,s_time,s_censor)
    fig.savefig(f'figures/KM_clinicopath_{s_vital}_subset.png')
    # NOTE(review): this break ends the loop after the first variable — confirm intent.
    break
    


## gene signature scores

pORG, pSUB GSVA

In [None]:
# Load the GSVA signature scores and join them to the patient table on the
# specimen ID (inner join; clashing score columns get an '_x' suffix).
s_gsva_csv = f'{codedir}/data/GSVA_Scores_Link.csv'
df_pORG = pd.read_csv(s_gsva_csv)
df_pORG['Patient Specimen ID'] = df_pORG['Public_Specimen_ID']
df_patient = pd.merge(df_patient, df_pORG, on='Patient Specimen ID', suffixes=('', "_x"))

In [None]:
# Remove, in place, the rows whose public specimen ID contains '-T2'.
b_t2 = df_patient.Public_Specimen_ID.str.contains('-T2')
ls_drop_T2 = df_patient.loc[b_t2].index
df_patient.drop(index=ls_drop_T2, inplace=True)

In [None]:
#CPH
# Single-covariate Cox fits for the gene-signature scores. Same steps as the
# clinical-variable cell above; results are appended to the existing df_result.
# NOTE(review): df_result, s_time, s_censor and cph_plot must already exist in
# the kernel (the df_result initialisation below is commented out).
#df_result=pd.DataFrame()
ls_vital = [ #'Age at Diagnosis',
      'Original_pORG_Up_78_Genes', 'txi_pORG_Up_42_Genes',
       'Original_pSUB_Up_100_Genes', 'txi_pSUB_Up_100_Genes', ]
for s_vital in ls_vital:
    print(s_vital)
    # complete cases only
    df = df_patient.loc[:,[s_vital,s_time,s_censor]].dropna(how='any')
    # The Stage/Grade/object-dtype branches are carried over from the clinical
    # cell; df only has the score, time and event columns here.
    if df.columns.isin(['Stage']).any():
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    if df.loc[:,s_vital].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df.drop(s_vital,axis=1,inplace=True)
        s_vital = df_dummy.columns[0]
        df[s_vital] = df_dummy
    # shorten e.g. 'Original_pORG_Up_78_Genes' to 'Original_pORG'
    if s_vital.find('_Up_')>-1:
        d_rename = {s_vital:s_vital.split('_Up')[0]}
        df=df.rename(d_rename,axis=1)
        s_vital = s_vital.split('_Up')[0]
    fig, cph = cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_single_{s_vital}_subset.png')
    df_result=pd.concat([df_result,cph.summary.loc[:,['exp(coef)','p']]])
    # NOTE(review): this break ends the loop after the first score — confirm intent.
    break

In [None]:
#CPH
# Multivariable Cox models: one model per score in ls_multi, adjusted for
# Grade, Stage and the dummy-encoded covariates in ls_cats. The 'exp(coef)'
# and 'p' columns of every fit are collected in df_result_multi.
# s_time, s_censor and cph_plot come from earlier cells.
df_result_multi = pd.DataFrame()
# NOTE: despite its name, ls_drop is the index of the rows that are KEPT:
# specimen ID without '-T2' (missing IDs included) and pSUB_Primary present.
ls_drop = df_patient[~df_patient.loc[:,'Patient Specimen ID'].str.contains('-T2').fillna(False) & ~df_patient.pSUB_Primary.isna()].index
ls_multi = ['txi_pORG_Up_42_Genes','pORG_Primary','pSUB_Primary','PurIST Score','txi_pSUB_Up_100_Genes',]
ls_cats = ['TP53_altered','CDKN2A_altered','LV_Invasion','LN_Pos','Age']
for s_multi in ls_multi:
    df = df_patient.loc[ls_drop,[s_multi,s_time,s_censor,'Grade','Stage']] ##'Neoadjuvant','Age at Diagnosis'
    # Build the dummies from the same rows as df. Encoding the full table and
    # concatenating column-wise (an outer join on the index) re-added every
    # filtered-out patient as a row with missing score/time/event values.
    df_dummy = pd.get_dummies(df_patient.loc[ls_drop,ls_cats],drop_first=True) #'TP53_altered','CDKN2A_altered',,'Sex'
    df = pd.concat([df,df_dummy],axis=1)
    fig, cph = cph_plot(df,s_multi,s_time,s_censor)
    fig.savefig(f'figures/CPH_{s_multi}.png')
    df_result_multi=pd.concat([df_result_multi,cph.summary.loc[:,['exp(coef)','p']]])
    #break

In [None]:
# Write the cleaned patient table to disk once; an existing file is left untouched.
ls_drop = [
    'Liver Met Present in Patient tumor',
    'Lung Met Present in Patient tumor',
    'Lung Met Present in Patient blood',
    'Liver Met Present in Patient blood',
    'Unnamed: 0',
    #'OPTR',
]
b_already_saved = os.path.exists('Patient_metadata_clean.csv')
if not b_already_saved:
    print('saving')
    df_clean = df_patient.drop(columns=ls_drop)
    df_clean.to_csv('Patient_metadata_clean.csv')

# Section 5 <a name="newsurv"></a>

new patient survival


[contents](#contents)

In [None]:
# Reload the specimen ID key table from disk.
# NOTE(review): the only visible line that writes 'Patient_IDs.csv' is
# commented out in an earlier cell — the file must already exist.
df_id = pd.read_csv('Patient_IDs.csv')

In [None]:
# # load new survival and de-identify (done)
# df_new_surv = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/cancer_participant_overview_optr_2023-05-04_16-55-18.xlsx')#,dtype=object
# df_new_surv.loc[:,'Participant_ID'] = df_new_surv.loc[:,'Participant ID'].astype('str')
# df_new_surv['Vital_Status'] = df_new_surv.loc[:,'Vital Status at Follow Up'].replace({'Alive':0,'Dead':1})
# #load new pORG
# df_pORG = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}data/PartialReport_pORG_pSUB_Update.xlsx',sheet_name='GSVA_Scores')
# df_pORG['Participant_ID'] = [item.split('-')[0] for item in df_pORG.SpecimenID]
# df_pORG['Type_TM'] = [item.split('-')[1] for item in df_pORG.SpecimenID]
# ls_drop = df_pORG[df_pORG.Type_TM=='T2'].index
# df_pORG.drop(ls_drop,inplace=True)
# #merge scores
# df_new = df_pORG.merge(df_new_surv,on='Participant_ID',how='left',suffixes=('','_x'))
# d_zip = dict(zip(df_id.loc[:,'OPTR.Specimen.ID'],df_id.loc[:,'Public.Specimen.ID']))
# df_new['Public_Specimen_ID'] = df_new.SpecimenID.map(d_zip)
# # save
# ls_drop_col = ['SpecimenID','Participant_ID'] #[]#
# df_new.drop(ls_drop_col,axis=1).to_csv('Updated_survival_scores_Link.csv')


In [None]:
# Reload the updated survival/score table and the cleaned patient metadata,
# translate public specimen IDs back to OPTR specimen IDs, and join the two
# tables on that ID (inner join; clashing right-hand columns get '_x').
df_new = pd.read_csv('Updated_survival_scores_OPTR.csv',index_col=0) #Updated_survival_scores_Link
df_patient = pd.read_csv('Patient_metadata_clean.csv',index_col=0)

d_public_to_optr = dict(zip(df_id.loc[:,'Public.Specimen.ID'],df_id.loc[:,'OPTR.Specimen.ID']))
df_patient['SpecimenID'] = df_patient.Public_Specimen_ID.map(d_public_to_optr)

#merge
df_new_pt = pd.merge(df_patient, df_new, on='SpecimenID', suffixes=('','_x'))



In [None]:
# List of patients still alive: lung-cohort patients with Survival == 0.
df_alive = df_patient[(df_patient.Survival==0)& (df_patient.Cohort=='Lung')] #  & (df_patient.Cohort=='Lung').Public_Specimen_ID
# The three rows of df_new for those patients with the largest
# 'Date of Last Follow Up' values (sorted ascending, last three taken by position).
ls_index = df_new[df_new.Public_Specimen_ID.isin(df_alive.Public_Specimen_ID)].loc[:,'Date of Last Follow Up'].sort_values().iloc[-3:]#.index
# Print their specimen IDs with a plain loop; the former list comprehension
# existed only for its side effect and echoed a list of None as cell output.
for item in df_new.loc[ls_index.index,'SpecimenID']:
    print(item)



In [None]:
#CPH
# Multivariable Cox models on the updated survival data: one model per score
# in ls_multi, adjusted for Grade, Stage and the dummy-encoded covariates in
# ls_cats. cph_plot is defined elsewhere in the notebook.
s_censor = 'Vital_Status'
s_time = 'Overall Survival (Days)'
df_result_multi = pd.DataFrame()
# NOTE: despite its name, ls_drop is the index of the rows that are KEPT:
# SpecimenID without '-T2' (missing IDs included) and txi_pORG_Up_42_Genes present.
ls_drop = df_new_pt[~df_new_pt.loc[:,'SpecimenID'].str.contains('-T2').fillna(False) & ~df_new_pt.txi_pORG_Up_42_Genes.isna()].index
ls_multi = ['txi_pORG_Up_42_Genes','pORG_Primary','pSUB_Primary','PurIST Score','txi_pSUB_Up_100_Genes',]
ls_cats = ['TP53_altered','CDKN2A_altered','LV_Invasion','LN_Pos','Age']
for s_multi in ls_multi:
    df = df_new_pt.loc[ls_drop,[s_multi,s_time,s_censor,'Grade','Stage']] ##'Neoadjuvant','Age at Diagnosis'
    # Take the categorical covariates from df_new_pt, on the same rows as df.
    # df_new_pt received a fresh index from the merge, so its row labels do
    # not match df_patient's; concatenating dummies built from df_patient
    # paired covariates with the wrong patients.
    df_dummy = pd.get_dummies(df_new_pt.loc[ls_drop,ls_cats],drop_first=True) #'TP53_altered','CDKN2A_altered',,'Sex'
    df = pd.concat([df,df_dummy],axis=1)
    fig, cph = cph_plot(df,s_multi,s_time,s_censor)
    # NOTE(review): same file names as the earlier multivariable CPH cell, so
    # those figures are overwritten.
    fig.savefig(f'figures/CPH_{s_multi}.png')
    df_result_multi=pd.concat([df_result_multi,cph.summary.loc[:,['exp(coef)','p']]])
    #break

In [None]:
# Check whether participants 4329 and 4104 appear in the new survival table.
# NOTE(review): df_new_surv is only assigned in the commented-out
# de-identification cell above, so this cell fails on a fresh kernel. Only the
# result of the last line is displayed.
df_new_surv.loc[:,'Participant ID'].isin([4329]).any()
df_new_surv.loc[:,'Participant ID'].isin([4104]).any()

# Section 6 <a name="geneexp"></a>

gene expression correlation


[contents](#contents)

In [None]:
# Load the VST expression table and transpose it into df_rna, so its rows
# become columns (later cells select marker genes as columns of df_rna).
df_vst = pd.read_csv('data/VST_Genes_Link.csv', index_col=0)
df_rna = df_vst.transpose().copy()
df_rna.head()

In [None]:
# Reload the GSVA signature scores and expose the public specimen ID under
# the 'Patient Specimen ID' name as well.
s_gsva_csv = f'{codedir}/data/GSVA_Scores_Link.csv'
df_pORG = pd.read_csv(s_gsva_csv)
df_pORG['Patient Specimen ID'] = df_pORG['Public_Specimen_ID']
df_pORG

In [None]:

# Signature score columns to copy onto the expression table.
ls_add = [
    'Original_pORG_Up_78_Genes',
    'txi_pORG_Up_42_Genes',
    'Original_pSUB_Up_100_Genes',
    'txi_pSUB_Up_100_Genes',
]

# Look each df_rna row label up in a specimen-ID -> score dictionary;
# labels without a score become NaN.
se_specimen = df_pORG.loc[:,'Patient Specimen ID']
for s_add in ls_add:
    d_map = dict(zip(se_specimen, df_pORG[s_add]))
    df_rna[s_add] = df_rna.index.map(d_map)

In [None]:
#df_rna.isna().sum().sum()

In [None]:

# One Pearson-correlation heatmap per signature score: the score against the
# marker genes CD3E, CD4, CD8A and MS4A1, with significance stars in the cells.
d_rename={#'pORG_Up_78':'pORG',
          'MS4A1':'MS4A1 (CD20)'}
from scipy.stats import pearsonr
dim = (4,3)
    
for s_add in ls_add:
    ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1',s_add]
    df_all = df_rna.loc[:,ls_marker].corr().rename(d_rename,axis=1).rename(d_rename,axis=0)
    print(len(df_rna))
    # clustermap is used only to obtain a clustered row/column order; its figure is closed
    g = sns.clustermap(df_all)
    plt.close()
    categories_order = df_all.iloc[g.dendrogram_col.reordered_ind,:].index.tolist()
    df_all = df_all.loc[categories_order,categories_order]
    rho = df_rna.loc[:,ls_marker].corr() #df_all.corr()
    # p-value matrix: corr() with a callable puts 1 on the diagonal, so
    # subtracting the identity sets the diagonal to 0
    pval = df_rna.loc[:,ls_marker].corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
    # one '*' per threshold the p-value passes: <=0.05 '*', <=0.005 '**', <=0.001 '***'
    p_vals = pval.applymap(lambda x: ''.join(['*' for t in [0.001,0.005,0.05] if x<=t]))
    p_vals = p_vals.rename(d_rename,axis=1).rename(d_rename,axis=0)
    p_vals = p_vals.loc[categories_order,categories_order]
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',ax=ax,
               cbar_kws={'shrink':0.85,'label':s_add})

    #ax.set_title(f'Cell Type versus Scores', fontdict={'fontsize':16}, pad=12);
    #fig.savefig(f'{codedir}/{s_date}/heatmap_Celltype_vs_pORG_purIST.png', dpi=300, bbox_inches='tight')
    # The triple-quoted block below is a bare string expression (disabled
    # lower-triangle variant of the heatmap); it has no effect when run.
    '''
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    matrix = np.triu(np.ones_like(rho))
    np.fill_diagonal(matrix, val=0)
    np.fill_diagonal(p_vals.values,'')
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',
                ax=ax,mask=matrix,cbar_kws={'shrink':0.85,'label':'Pearson Correlation'}) #'anchor':(-1.4,0.0)
    #'''
    #pval
    #break

## Section 6 <a name="tcell"></a>

Re-analyze the T cell receptor (TCR) data

289 blood samples with matching 175 primary
tumors (141 overlapping with the RNA-seq dataset) and 43 metastatic tumors (33 overlapping with
the RNA-seq dataset). (218)

**missing 2 tumor, all 289 blood there**

TOTAL = 174

(total pORG = 171)


we analyzed blood samples from 77 patients in the liver cohort and 16
patients in the lung cohort, of which 60 and 16 were matched with tumor samples from the same
patient, respectively

TOTAL = 94 blood
TOTAL = 76 tumor

tumor distinct clones
used data from 214 matched pairs of tumor and blood samples

**213 are there, missing 1**

(TCR tumor: 59 and 16)


[contents](#contents)

In [None]:
#load full patient data, neoadjuvant
# Rebuild df_patient and df_tcell from the supplemental workbooks and annotate
# every TCR row with cohort, GSVA scores and PurIST subtype. The result is
# copied to df_merge at the end of the cell.
from scipy.stats import f_oneway
df_patient = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1_age.xlsx',
                           sheet_name='Patients - Tab 1')
df_patient.rename({'Patient ID':'Public_Patient_ID'},axis=1,inplace=True)
# Cohort label; 'Liver' is assigned last, so a patient flagged for both ends up as 'Liver'.
df_patient.loc[df_patient.loc[:,'Lung Met Present']=='YES','Cohort'] = 'Lung'
df_patient.loc[df_patient.loc[:,'Liver Met Present']=='YES','Cohort'] = 'Liver'

#df_patient['Public.Specimen.ID'] = [item + '-T' for item in df_patient.loc[:,'Patient ID']]

#neo
# (the next line only evaluates the unique values; its result is discarded)
df_patient.loc[:,'Neoadjuvant Treatment'].unique()
# 'Yes' where the treatment column reads 'Yes Neoadjuvant', 'No' everywhere else
df_patient.loc[df_patient.loc[:,'Neoadjuvant Treatment'] == 'Yes Neoadjuvant','Neoadjuvant'] = 'Yes'
df_patient['Neoadjuvant'] = df_patient.Neoadjuvant.fillna('No')


#load T cell data
df_tcell = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset6.xlsx',
                         sheet_name='Tumor Samples')
print(len(df_tcell))

df_tcell_blood = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset6.xlsx',
                         sheet_name='Blood Samples')
print(len(df_tcell_blood))
#both
# outer join on patient; columns present in both sheets get ' tumor' / ' blood' suffixes
df_tcell = df_tcell.merge(df_tcell_blood,on='Patient ID',suffixes=(' tumor',' blood'),how='outer')

df_tcell.rename({'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
                'Templates per ng':'Templates_per_ng','Patient ID':'Public_Patient_ID'},axis=1,inplace=True)


#load pSUB, pORG
df_gsva = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1.xlsx', sheet_name='GSVA Scores - Tab 3')
df_gsva.rename({'pSUB\nPrimaries':'pSUB_Primary','pORG\nPrimaries':'pORG_Primary','pSUB\nAll':'pSUB_All',
                'pORG\nAll':'pORG_All',
                'Patient Specimen ID':'Patient_Specimen_ID'},axis=1,inplace=True)

# # #FNA #changes to 34 high
# ls_fna = df_gsva.Patient_Specimen_ID[df_gsva.Patient_Specimen_ID.str.contains('-F')]
# ls_fna_pt = [item.split('-F')[0] for item in ls_fna]
# ls_fna_index = df_tcell.loc[(df_tcell.Public_Patient_ID.isin(ls_fna_pt)) & (df_tcell.loc[:,'Tumor Type'].isna()),:].index
# df_tcell.loc[ls_fna_index,'Tumor Type'] = '-F'

#met or primary
# Build the specimen ID as <patient ID> + '-T' (Primary) or '-M' (Met); rows
# with a missing 'Tumor Type' get a missing specimen ID.
df_tcell['Type_TM'] = df_tcell.loc[:,'Tumor Type'].replace({'Primary':'-T', 'Met':'-M'})
df_tcell['Patient_Specimen_ID'] = df_tcell.Public_Patient_ID + df_tcell.Type_TM

# look up cohort by patient and GSVA scores by specimen
df_tcell['Cohort'] = df_tcell.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
for s_cat in ['pORG_Primary','pSUB_Primary','pSUB_All','pORG_All']:
    df_tcell[s_cat] = df_tcell.Patient_Specimen_ID.map(dict(zip(df_gsva.Patient_Specimen_ID,df_gsva.loc[:,s_cat])))

# load purIST
# NOTE: df_gsva is reused here for the subtype sheet and no longer holds the GSVA scores.
df_gsva = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1.xlsx', sheet_name='Specimen Subtype - Tab 2')
df_gsva.rename({'Patient Specimen ID':'Patient_Specimen_ID'},axis=1,inplace=True)
for s_cat in ['PurIST Subtype']:
    df_tcell[s_cat] = df_tcell.Patient_Specimen_ID.map(dict(zip(df_gsva.Patient_Specimen_ID,df_gsva.loc[:,s_cat])))
    
# independent copy used by the binning and plotting cells that follow
df_merge = df_tcell.copy()


In [None]:
#add pORG quartiles
# Bin the chosen pORG score into quantile groups stored as new df_merge columns:
#   'quartiles' - only the outer quartiles keep a label ('low'/'high'); the
#                 two middle quartiles become missing
#   'medians'   - 'low'/'high' halves
s_porg = 'pORG_Primary'# 'pORG_All'#
x = df_merge.loc[:,s_porg]

d_cut = {'quartiles':(4,['low','med-low','med-high','high']),
        'medians' : (2,['low','high'])}

for s_col, tu_cut in d_cut.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]
    q = pd.qcut(x, q=i_cut,labels=labels)
    if s_col == 'quartiles':
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        df_merge[s_col] = q.replace({'med-low':np.nan,'med-high':np.nan})
    else:
        df_merge[s_col] = q
    print(df_merge[s_col].value_counts())

# complement (100 - x) of the tumor-distinct clone percentage
df_merge['Tumor Distinct Clones 100-x'] = 100 - df_merge.loc[:,'Percentage Tumor-Distinct Clones in Paired Tumor Sample']

In [None]:
# the greater the absolute value of the slope, the greater the diversity.
# redefine liver patient?

# For each TCR metric in ls_foci, draw one split violin + strip plot comparing
# the two groups of every grouping in d_order, annotate each pair with a
# Benjamini-Hochberg-adjusted one-way ANOVA p-value, and save the figure.

#b_primary = False #  True #
# when scoring on primaries only, restrict the rows to primary tumors
if s_porg == 'pORG_Primary': 
    df_pri = df_merge[df_merge.loc[:,'Tumor Type']=='Primary']  # 
else:
    df_pri = df_merge


# if b_primary:
#     df_pri.loc[df_pri.loc[:,'Tumor Type'] != 'Primary',['quartiles','median']] = np.nan
#     print(df_pri.Cohort.isna().sum())


ls_foci = ['Templates_per_ng',
           "Simpson's Evenness tumor",
           "Simpson's Evenness blood",
          'Tumor Distinct Clones 100-x',
          ]

# grouping column -> [first group, second group]; the first is relabelled
# 'bad' and the second 'good' so all groupings share one hue order
d_order =  {
    'PurIST Subtype':['basal-like','classical'],
    'Cohort':['Liver','Lung'],
    'quartiles':['high','low'],
    #'medians':['high','low']
     }#tertiles_h # quartiles

for s_foci in ls_foci:
    print(s_foci)
    figsize=(3.5,2.8)
    fig,ax=plt.subplots(dpi=300,figsize=figsize)
    order = []
    ls_ticks = []
    d_pval = {}
    df_both = pd.DataFrame()
    for idx, s_cat in enumerate(d_order.keys()):
        #print(s_cat)
        ls_order = d_order[s_cat]
        s_bad = ls_order[0]
        s_good = ls_order[1]
        d_replace = {s_bad:'bad',s_good:'good'}
        # metric values of the two groups, missing values removed
        a = df_pri.loc[df_pri.loc[:,s_cat]==ls_order[0],s_foci].dropna()
        b = df_pri.loc[df_pri.loc[:,s_cat]==ls_order[1],s_foci].dropna()
        ##statistic, pvalue = stats.ttest_ind(a,b,alternative=s_alt)
        #print(len(a) + len(b))
        statistic, pvalue = f_oneway(b,a)
        # NOTE(review): df_pri may be a filtered selection of df_merge (or
        # df_merge itself); adding columns here can raise SettingWithCopyWarning
        # or modify df_merge.
        df_pri['hue'] = df_pri.loc[:,s_cat].replace(d_replace)
        df_pri['x'] = s_cat
        # long-format table: one stacked copy of df_pri per grouping
        df_both=pd.concat([df_both,df_pri.loc[:,['x','hue',s_foci]]])
        for s_test in ls_order:
            order.append((s_cat,d_replace[s_test]))
            ls_ticks.append(s_test)
        d_pval.update({s_cat:pvalue})
    sns.violinplot(data=df_both,y=s_foci,x='x',hue='hue',ax=ax,alpha=0.7,linewidth=1,cut=0,inner='quartile',hue_order=['bad','good'])#hue='Ki67pos'
    sns.stripplot(data=df_both,y=s_foci,x='x',hue='hue',s=2,dodge=True,ax=ax,palette='dark',jitter=0.2,alpha=0.3,hue_order=['bad','good']) #hue='Ki67pos'
    #annotate
    # pairs and p-values are spelled out for exactly 3 or 4 groupings in d_order
    if len(order) == 6:
        pairs = [(order[0],order[1]),(order[2],order[3]),(order[4],order[5])]
        pvalues = [d_pval[list(d_order.keys())[0]],d_pval[list(d_order.keys())[1]],d_pval[list(d_order.keys())[2]]]
    else:
        pairs = [(order[0],order[1]),(order[2],order[3]),(order[4],order[5]),(order[6],order[7])]
        pvalues = [d_pval[list(d_order.keys())[0]],d_pval[list(d_order.keys())[1]],d_pval[list(d_order.keys())[2]],d_pval[list(d_order.keys())[3]]]
    # Benjamini-Hochberg correction across the groupings of this metric; the
    # corrected values are the ones shown on the plot
    reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
    formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]#corrected #pvalues
    annotator = Annotator(ax, pairs=pairs, data=df_both,y=s_foci,x='x',hue='hue',verbose=False)
    annotator.set_custom_annotations(formatted_pvalues)
    #annotator.configure(line_height=0.9,line_offset=10)
    annotator.annotate()
    ax.legend().remove()
    # one tick per half-violin, labelled with the original group name
    if len(order) == 6:
        ax.set_xticks([-0.2,0.2, 0.8,1.2,1.8,2.2])
    else:
        ax.set_xticks([-0.2,0.2, 0.8,1.2,1.8,2.2,2.8,3.2])
    ax.set_xticklabels(ls_ticks,rotation=45)
    ax.set_xlabel('')
    ax.set_title(f"{s_foci}", fontsize='x-large') #
    plt.tight_layout()
    fig.savefig(f'figures/violinplot_both_{s_foci}_{list(d_order.keys())[-1]}_all.png')
    #break

In [None]:
# Non-missing observations per grouping and hue for the last metric plotted
# above (df_both is rebuilt for every metric, so only the last one remains).
df_both.dropna().groupby(['x','hue']).count()
#35, 35 low pORG primary!!
#16 lung, 59 liver 
# 45 basal like

### additional TCR data

In [None]:
# df_patient = pd.read_csv('data/Patient_metadata_clean.csv',index_col=0)


# Load the tumor and blood TCR diversity metrics and add, to each table,
# quantile bins of the pORG score:
#   'quartiles' - outer quartiles only ('low'/'high'); middle ones missing
#   'median'    - 'low'/'high' halves
#   'tertiles'  - outer tertiles only ('low'/'high'); middle one missing
df_tum = pd.read_excel('Diversity Metrics/Diversity Metrics - All Tumors 210809.xlsx',sheet_name='diversity')
#add cohort
# concatenate the Liver/Lung flags as text; '11' (both) is assigned to 'Liver'
df_tum['Cohort'] = (df_tum.Liver.astype('str') + df_tum.Lung.astype('str')).replace({'00':np.nan,'10':'Liver','11':'Liver','01':'Lung'})
print(df_tum['Cohort'].value_counts())
#add pORG quartiles
s_porg = 'pORG-78UP-All'#'pORG-78UP-Primaries Halves' #
x = df_tum.loc[:,s_porg]
d_cut = {'quartiles':(4,['low','med-low','med-high','high']),
        'median' : (2,['low','high']),
         'tertiles' : (3,['low','med','high']),
        }

for s_col, tu_cut in d_cut.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]

    q = pd.qcut(x, q=i_cut,labels=labels)
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    if s_col == 'quartiles':
        df_tum[s_col] = q.replace({'med-low':np.nan,'med-high':np.nan})
    elif s_col == 'tertiles':
        df_tum[s_col] = q.replace({'med':np.nan})
    else:
        df_tum[s_col] = q
    print(df_tum[s_col].value_counts())

## blood
df_bld = pd.read_excel('Diversity Metrics/Diversity Metrics - All Blood 210804.xlsx',sheet_name='Sheet1')
# 'sample' converted to a nullable integer ID; with errors='ignore' a failed
# conversion leaves the values unchanged
df_bld['OPTR ID'] = df_bld.loc[:,'sample'].astype('Int64',errors='ignore')
# keep blood rows that have an ID and take cohort, score and subtype from the tumor table
df_bld = df_bld[~df_bld['OPTR ID'].isna()].merge(df_tum.loc[:,['Cohort','OPTR ID',s_porg,'Subtype']],on='OPTR ID')
#add cohort
#df_bld['Cohort'] = (df_bld.Liver.astype('str') + df_bld.Lung.astype('str')).replace({'00':np.nan,'10':'Liver','11':'Liver','01':'Lung'})

#add pORG quartiles
# bins are recomputed on the blood rows, so cut points can differ from the tumor table
x = df_bld.loc[:,s_porg]

d_cut = {'quartiles':(4,['low','med-low','med-high','high']),
        'median' : (2,['low','high']),
        'tertiles' : (3,['low','med','high']),
        }

for s_col, tu_cut in d_cut.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]
    q = pd.qcut(x, q=i_cut,labels=labels)
    if s_col == 'quartiles':
        df_bld[s_col] = q.replace({'med-low':np.nan,'med-high':np.nan})
    elif s_col == 'tertiles':
        df_bld[s_col] = q.replace({'med':np.nan})
    else:
        df_bld[s_col] = q
    print(df_bld[s_col].value_counts())

In [None]:
# Violin plots of every diversity metric from the original tables, split by
# the groupings in d_order. A figure is saved only when at least one
# uncorrected p-value is below 0.05; otherwise it is closed.
d_order =  {
    'Subtype':['basal-like','classical'],
    'Cohort':['Liver','Lung'],
    #'quartiles':['high','low'],
    'tertiles':['high','low']
     }#tertiles_h # quartiles
# sample type -> metric columns to plot
d_type = {
      'Tumor' :["Simpson's Evenness", 'Pielou Evenness',
       'Clone Distribution Slope', 
       'Total Productive Templates', 'Templates per ng',
       'Productive Rearrangements (Observed Richness)', 'Prod Rearr per ng',
       'iChao1', 'Efron Thisted Estimator', 'Richness Index',
        "Simpson's D", 'Simpson Clonality',
       'Max Prod Freq',  'Z-score Simpson Clonality',
       "Z-score Simpson's D", '% Tumor', 'Estimator Multiplier'],
        'Blood':[
            "Simpson's Evenness", 'Pielou Evenness',
       'Clone Distribution Slope', 'Total Productive Templates',
       'Productive Rearrangements (Observed Richness)', 'iChao1',
       'Efron Thisted Estimator', 
       'Max Prod Freq', 'Simpson Clonality', "Simpson's D",
        "Simpson's D Z-score", 'Clone Distribution Slope Z-score',
       "Simpson's Evenness Z-score", "Simpson Clonality Z-score",
       'Estimator Multiplier', 'Estimator Extrapolation Value']
         }

# True: tumor rows limited to 'Tumor Site' == 'Pancreas'; blood rows limited
# to 'Blood Collected During Metastatic Disease' == 'NO'
b_primary = True #True

for s_type, ls_foci in d_type.items():
    if s_type == 'Tumor':
        df_pri = df_tum  #
        if b_primary:
            df_pri = df_pri.loc[df_pri.loc[:,'Tumor Site']=='Pancreas']
    elif s_type == 'Blood':
        df_pri = df_bld  # df_bld #
        if b_primary:
            df_pri = df_pri.loc[df_pri.loc[:,'Blood Collected During Metastatic Disease'] == 'NO']
    for s_foci in ls_foci:
        print(s_foci)
        figsize=(3.5,2.8)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        order = []
        ls_ticks = []
        d_pval = {}
        df_both = pd.DataFrame()
        for idx, s_cat in enumerate(d_order.keys()):
            #print(s_cat)
            ls_order = d_order[s_cat]
            s_bad = ls_order[0]
            s_good = ls_order[1]
            d_replace = {s_bad:'bad',s_good:'good'}
            # metric values of the two groups, missing values removed
            a = df_pri.loc[df_pri.loc[:,s_cat]==ls_order[0],s_foci].dropna()
            b = df_pri.loc[df_pri.loc[:,s_cat]==ls_order[1],s_foci].dropna()
            ##statistic, pvalue = stats.ttest_ind(a,b,alternative=s_alt)
            #print(len(a) + len(b))
            statistic, pvalue = f_oneway(b,a)
            # NOTE(review): df_pri may be a filtered selection (or the source
            # table itself when b_primary is False); adding columns here can
            # raise SettingWithCopyWarning or modify df_tum/df_bld.
            df_pri['hue'] = df_pri.loc[:,s_cat].replace(d_replace)
            df_pri['x'] = s_cat
            df_both=pd.concat([df_both,df_pri.loc[:,['x','hue',s_foci]]])
            for s_test in ls_order:
                order.append((s_cat,d_replace[s_test]))
                ls_ticks.append(s_test)
            d_pval.update({s_cat:pvalue})
        sns.violinplot(data=df_both,y=s_foci,x='x',hue='hue',ax=ax,alpha=0.7,linewidth=1,cut=0,inner='quartile',hue_order=['bad','good'])#hue='Ki67pos'
        sns.stripplot(data=df_both,y=s_foci,x='x',hue='hue',s=2,dodge=True,ax=ax,palette='dark',jitter=0.2,alpha=0.3,hue_order=['bad','good']) #hue='Ki67pos'
        #annotate
        # pairs and p-values are spelled out for exactly 3 or 4 groupings in d_order
        if len(order) == 6:
            pairs = [(order[0],order[1]),(order[2],order[3]),(order[4],order[5])]
            pvalues = [d_pval[list(d_order.keys())[0]],d_pval[list(d_order.keys())[1]],d_pval[list(d_order.keys())[2]]]
        else:
            pairs = [(order[0],order[1]),(order[2],order[3]),(order[4],order[5]),(order[6],order[7])]
            pvalues = [d_pval[list(d_order.keys())[0]],d_pval[list(d_order.keys())[1]],d_pval[list(d_order.keys())[2]],d_pval[list(d_order.keys())[3]]]
        # NOTE(review): the BH-corrected values are computed but the plot shows
        # the raw p-values, unlike the violin cell earlier in this section —
        # confirm which is intended.
        reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
        formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(pvalues)]#corrected #pvalues
        annotator = Annotator(ax, pairs=pairs, data=df_both,y=s_foci,x='x',hue='hue',verbose=False)
        annotator.set_custom_annotations(formatted_pvalues)
        #annotator.configure(line_height=0.9,line_offset=10)
        annotator.annotate()
        ax.legend().remove()
        # one tick per half-violin, labelled with the original group name
        if len(order) == 6:
            ax.set_xticks([-0.2,0.2, 0.8,1.2,1.8,2.2])
        else:
            ax.set_xticks([-0.2,0.2, 0.8,1.2,1.8,2.2,2.8,3.2])
        ax.set_xticklabels(ls_ticks,rotation=45)
        ax.set_xlabel('')
        ax.set_title(f"{s_foci} {s_type}", fontsize='x-large') #
        plt.tight_layout()
        # keep only figures with at least one raw p-value below 0.05
        if np.array([item < 0.05 for item in pvalues]).any():
            fig.savefig(f'figures/violinplot_original_data_{s_type}_{s_foci}_{list(d_order.keys())[-1]}_all.png')
        else:
            plt.close()
        #break
    # NOTE(review): this break ends the outer loop after the first sample type,
    # so the 'Blood' metrics are never plotted — confirm intent.
    break
          

In [None]:
# Scatter plot with regression line of each blood diversity metric against the
# pORG score; the title shows the Pearson p-value and n. Figures whose
# p-value exceeds alpha are closed, so only the others remain displayed.
alpha = 0.05
# pearsons
d_type = {
#       'Tumor' :["Simpson's Evenness", 'Pielou Evenness',
#        'Clone Distribution Slope', 
#        'Total Productive Templates', 'Templates per ng',
#        'Productive Rearrangements (Observed Richness)', 'Prod Rearr per ng',
#        'iChao1', 'Efron Thisted Estimator', 'Richness Index',
#         "Simpson's D", 'Simpson Clonality',
#        'Max Prod Freq',  'Z-score Simpson Clonality',
#        "Z-score Simpson's D", '% Tumor', 'Estimator Multiplier'],
        'Blood':[
            "Simpson's Evenness", 'Pielou Evenness',
       'Clone Distribution Slope', 'Total Productive Templates',
       'Productive Rearrangements (Observed Richness)', 'iChao1',
       'Efron Thisted Estimator', 
       'Max Prod Freq', 'Simpson Clonality', "Simpson's D",
        "Simpson's D Z-score", 'Clone Distribution Slope Z-score',
       "Simpson's Evenness Z-score", "Simpson Clonality Z-score",
       'Estimator Multiplier', 'Estimator Extrapolation Value']
         }

b_primary = True #False # 
x_value = 'pORG-78UP-All'
for s_type, ls_foci in d_type.items():
    # pick the table for this sample type; with b_primary the rows are filtered
    # and ' Primaries' is appended to the label used in the plot title
    if s_type == 'Tumor':
        df_pri = df_tum  #
        if b_primary:
            s_type = s_type + " Primaries" #'Mets'
            df_pri = df_pri.loc[df_pri.loc[:,'Tumor Site']=='Pancreas']
    elif s_type == 'Blood':
        df_pri = df_bld  # df_bld #
        if b_primary:
            s_type = s_type + " Primaries"  #' Mets'#
            df_pri = df_pri.loc[df_pri.loc[:,'Blood Collected During Metastatic Disease'] == 'NO']# 
    for y_value in ls_foci:
        fig,ax=plt.subplots(figsize=(3,2.8),dpi=200)
        # complete pairs only
        df_pearson = df_pri.loc[:,[x_value,y_value]].dropna()
        sns.regplot(data=df_pearson, x=x_value, y=y_value,ax=ax,line_kws={'color':'k'})
        stat, pvalue = stats.pearsonr(x=df_pearson.loc[:,x_value], y=df_pearson.loc[:,y_value])
        ax.set_title(f'{y_value} {s_type}\np={pvalue:.3} n={len(df_pearson)}')
        # discard non-significant figures (no multiple-testing correction is applied here)
        if pvalue > alpha:
            plt.close(fig)
        #break
    #break

## Section 7 <a name="split"></a>
reproduce figures

evaluate high/low

[contents](#contents)

In [None]:
#1/3 splits
# Reload the cleaned patient metadata.
# NOTE(review): this reads 'data/Patient_metadata_clean.csv', while the save
# cell writes 'Patient_metadata_clean.csv' to the working directory — confirm
# both refer to the same file.
df_patient = pd.read_csv('data/Patient_metadata_clean.csv',index_col=0)
#df_patient.loc[:,'OPTR ID'] = df_pri.OPTR.astype('Int64')
#df_patient = df_patient[~df_patient.loc[:,'OPTR ID'].isna()].merge(df_tum.loc[:,['OPTR ID','Cohort']],on='OPTR ID',suffixes=('_x',''))
#primaries
# copy of the rows whose specimen ID does not contain '-T2'
df_pri = df_patient[(~df_patient.loc[:,'Patient Specimen ID'].str.contains('-T2')) ].copy() #(~df_patient.loc[:,'txi_pORG_Up_42_Genes'].isna())
#& (df_pri.loc[:,'Primary Site _ Major Groups For Staging'])
##df_pri.to_csv('Patient_List.csv')
# NOTE(review): this rebinding discards the filtered copy above; df_pri is now
# the same object as df_patient, so columns added to df_pri later also appear
# on df_patient — confirm intent.
df_pri = df_patient

In [None]:
#CPH
# Bin a score column of df_pri into quantile groups (pd.qcut), store the labels
# in df_pri[s_col] and pass them to km_plot together with s_time / s_censor.
# s_time, s_censor and km_plot are not defined in this cell.
ls_multi = [#'txi_pORG_Up_42_Genes',
            'Original_pORG_Up_78_Genes', 
       #'Original_pSUB_Up_100_Genes', 
    #'txi_pSUB_Up_100_Genes',
    'PurIST Score']
# split name -> (number of quantiles, labels from lowest to highest bin)
d_col = {'quartiles':(4,['low','med-low','med-high','high']), #
         #'median':(2,['low','high']),
         #'tertiles':(3,['low','med','high']),
        #'tertiles_h':([0, .33, 1.],['low','high']),
        # 'tertiles_l':([0, .66, 1.],['low','high'])
        }
b_plot = True
# b_two: blank out the middle groups so only the extreme groups are compared
b_two = True
for s_col, tu_cut in d_col.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]
    for s_multi in ls_multi:
        x = df_pri.loc[:,s_multi]
        q = pd.qcut(x, q=i_cut,labels=labels) 
        if b_two:
            # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
            if s_col == 'quartiles':
                df_pri[s_col] = q.replace({'med-low':np.nan,'med-high':np.nan})
            elif s_col == 'tertiles':
                df_pri[s_col] = q.replace({'med':np.nan})
                #print('')
            else:
                df_pri[s_col] = q 
        else:
            df_pri[s_col] = q 
        if b_plot:
            df = df_pri.loc[:,[s_col,s_time,s_censor]].dropna()
            fig, ls_order = km_plot(df,s_col,s_time,s_censor)
            fig.suptitle(s_multi,y=.93)
            plt.tight_layout()
            fig.savefig(f'figures/KM_{s_col}_{s_multi}.png')
#         break
        # Only the first entry of ls_multi is processed. Without this break the
        # next score would overwrite df_pri[s_col], and the figure 5 cell reads
        # df_pri['quartiles'] as a grouping column.
        break

In [None]:
# Disabled cell, kept for reference: the whole body is commented out, so nothing runs here.
# NOTE(review): cph_plot's result is bound to a single name (`fig`) below, while the active
# CPH cell further down unpacks `fig, cph = cph_plot(...)`; reconcile before re-enabling.
# ls_multi = ['quartiles', 'median', 'tertiles', 'tertiles_h', 'tertiles_l']
# s_time = 'cDays from Diagnosis to FU'
# s_censor = 'Survival'
# for s_multi in ls_multi:
#     #why does PurIST have extra patients?
#     df = df_pri.loc[:,[s_time,s_censor,'Grade','Stage']]
#     df_dummy = pd.get_dummies(df_pri.loc[:,['LV_Invasion','LN_Pos','Neoadjuvant',s_multi]],drop_first=True)
#     df = pd.concat([df,df_dummy],axis=1)
#     fig = cph_plot(df,f'{s_multi}_high',s_time,s_censor)
#     fig.savefig(f'figures/CPH_{s_multi}.png')
#     #break

## figure 5

In [None]:
# the greater the absolute value of the slope, the greater the diversity.
# redefine liver patient?
from scipy.stats import f_oneway
# Derived columns: complements of the percentage / evenness columns.
df_pri['Tumor Distinct Clones'] = 100 - df_pri.loc[:,'Percentage Tumor-Distinct Clones in Paired Tumor Sample']
df_pri["Simpson's Diversity tumor"] = 1 - df_pri.loc[:,"Simpson's Evenness tumor"]
df_pri["Simpson's Diversity blood"] = 1 - df_pri.loc[:,"Simpson's Evenness blood"]

# (metric column, alternative) pairs; only uncommented entries are plotted
ls_foci = [#'pORG_Primary', sanity
#     ('Templates_per_ng','less'),
#     ('Productive_Rearrangements','less'),
#     ('Clone Distribution Slope blood','less'),
#     ('Pielou Evenness blood','less'),
#     ("Simpson's Evenness blood",'less'),
#     ('Clone Distribution Slope tumor','less'),
#     ('Pielou Evenness tumor','less'),
#     ("Simpson's Evenness tumor",'greater'),
#     ("Simpson's Diversity tumor",'greater'),
#     ("Simpson's Diversity blood",'greater'),
    ('Tumor Distinct Clones','less'),
    ]
# grouping column -> [label mapped to 'bad', label mapped to 'good']
d_order =  {
    'PurIST Subtype':['basal-like','classical'],
    'Cohort':['Liver','Lung'],
    'quartiles':['high','low']}#tertiles_h # quartiles
ls_cat = list(d_order.keys())
# Pin the hue order: the tick labels below assume 'bad' is drawn left of 'good',
# which seaborn only guarantees when hue_order is given explicitly.
ls_hue = ['bad','good']
# False annotates the raw p-values (previous behaviour); True annotates
# Benjamini-Hochberg corrected p-values instead.
b_fdr = False
for s_foci_alt in ls_foci:
    s_foci = s_foci_alt[0]
    print(s_foci)
    # NOTE(review): s_alt is currently unused. The one-sided t-test that consumed
    # it was overwritten by the two-group ANOVA p-value, which is what is shown.
    s_alt = s_foci_alt[1]
    figsize=(3.5,2.8)
    fig,ax=plt.subplots(dpi=300,figsize=figsize)
    pairs = []
    ls_ticks = []
    d_pval = {}
    ls_both = []
    for s_cat in ls_cat:
        print(s_cat)
        ls_order = d_order[s_cat]
        s_bad = ls_order[0]
        s_good = ls_order[1]
        d_replace = {s_bad:'bad',s_good:'good'}
        a = df_pri.loc[df_pri.loc[:,s_cat]==s_bad,s_foci].dropna()
        b = df_pri.loc[df_pri.loc[:,s_cat]==s_good,s_foci].dropna()
        print(len(a) + len(b))
        statistic, pvalue = f_oneway(a, b)
        d_pval[s_cat] = pvalue
        # long format: 'x' = grouping column name, 'hue' = bad/good label
        df_pri['hue'] = df_pri.loc[:,s_cat].replace(d_replace)
        df_pri['x'] = s_cat
        ls_both.append(df_pri.loc[:,['x','hue',s_foci]].copy())
        pairs.append(((s_cat,'bad'),(s_cat,'good')))
        ls_ticks.extend(ls_order)
    df_both = pd.concat(ls_both)
    sns.violinplot(data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue,ax=ax,alpha=0.5,linewidth=1,cut=0,inner='quartile')#hue='Ki67pos'
    sns.stripplot(data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue,s=2,dodge=True,ax=ax,palette='dark',jitter=0.2) #hue='Ki67pos'
    #annotate
    pvalues = [d_pval[s_cat] for s_cat in ls_cat]
    if b_fdr:
        # import the submodule explicitly; `import statsmodels` alone does not load it
        from statsmodels.stats.multitest import multipletests
        pvalues = multipletests(pvalues,method='fdr_bh')[1]
    formatted_pvalues = [f'p={pvalue:.3}' for pvalue in pvalues]
    annotator = Annotator(ax, pairs=pairs, data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue)
    annotator.set_custom_annotations(formatted_pvalues)
    annotator.annotate()
    ax.legend().remove()
    # one tick under each dodged violin: category position -/+ 0.2
    ax.set_xticks([i_pos + offset for i_pos in range(len(ls_cat)) for offset in (-0.2, 0.2)])
    ax.set_xticklabels(ls_ticks,rotation=45)
    ax.set_xlabel('')
    ax.set_title(f"{s_foci}", fontsize='x-large') #
    plt.tight_layout()
    fig.savefig(f'figures/violinplot_both_{s_foci}_{list(d_order.keys())[-1]}_primary.png')
    break

In [None]:
# Non-null value counts per group label in column 'x' of df_both (built by the plotting cell above).
df_both.groupby('x').count()

## figure 2


In [None]:
#need to get mets 
# import the submodule explicitly; `import statsmodels` alone does not load it
from statsmodels.stats.multitest import multipletests

# score columns of df_pri to compare between groups
ls_foci = ['Original_pORG_Up_78_Genes', 'txi_pORG_Up_42_Genes',
       'Original_pSUB_Up_100_Genes', 'txi_pSUB_Up_100_Genes',]
# grouping column -> [label mapped to 'bad', label mapped to 'good']
d_order =  {'Cohort':['Liver','Lung'],
            'PurIST Subtype':['basal-like','classical']}
ls_cat = list(d_order.keys())
# Pin the hue order: the tick labels below assume 'bad' is drawn left of 'good',
# which seaborn only guarantees when hue_order is given explicitly.
ls_hue = ['bad','good']
for s_foci in ls_foci:
    figsize=(4,2.8)
    fig,ax=plt.subplots(dpi=300,figsize=figsize)
    pairs = []
    ls_ticks = []
    d_pval = {}
    ls_both = []
    for s_cat in ls_cat:
        print(s_cat)
        ls_order = d_order[s_cat]
        s_bad = ls_order[0]
        s_good = ls_order[1]
        d_replace = {s_bad:'bad',s_good:'good'}
        # drop missing scores: ttest_ind propagates NaN into the p-value by default
        a = df_pri.loc[df_pri.loc[:,s_cat]==s_bad,s_foci].dropna()
        b = df_pri.loc[df_pri.loc[:,s_cat]==s_good,s_foci].dropna()
        statistic, pvalue = stats.ttest_ind(a,b, )#    alternative='greater' )
        print(len(a) + len(b))
        d_pval[s_cat] = pvalue
        # long format: 'x' = grouping column name, 'hue' = bad/good label
        df_pri['hue'] = df_pri.loc[:,s_cat].replace(d_replace)
        df_pri['x'] = s_cat
        ls_both.append(df_pri.loc[:,['x','hue',s_foci]].copy())
        pairs.append(((s_cat,'bad'),(s_cat,'good')))
        ls_ticks.extend(ls_order)
    df_both = pd.concat(ls_both)
    sns.violinplot(data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue,ax=ax,alpha=0.5,linewidth=1,inner='quartile')#hue='Ki67pos'
    sns.stripplot(data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue,s=2,dodge=True,ax=ax,palette='dark',jitter=0.2) #hue='Ki67pos'
    #annotate
    pvalues = [d_pval[s_cat] for s_cat in ls_cat]
    # Benjamini-Hochberg correction across the comparisons shown in this figure
    reject, corrected, __, __ = multipletests(pvalues,method='fdr_bh')
    formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
    annotator = Annotator(ax, pairs=pairs, data=df_both,y=s_foci,x='x',hue='hue',order=ls_cat,hue_order=ls_hue)
    annotator.set_custom_annotations(formatted_pvalues)
    annotator.annotate()
    ax.legend().remove()
    # one tick under each dodged violin: category position -/+ 0.2
    ax.set_xticks([i_pos + offset for i_pos in range(len(ls_cat)) for offset in (-0.2, 0.2)])
    ax.set_xticklabels(ls_ticks)
    ax.set_xlabel('')
    # NOTE(review): split('_')[0] yields 'Original' / 'txi', so the pORG and pSUB
    # figures share a title - confirm this is the intended label.
    ax.set_title(f"{s_foci.split('_')[0]} GSVA Scores", fontsize='x-large') #{s_foci.replace('_',' ')} vs. 
    plt.tight_layout()
    fig.savefig(f'figures/violinplot_both_{s_foci}_primary.png')
    #break

In [None]:
#CPH
# For each entry of ls_foci, assemble the score, follow-up/censor columns, Grade,
# Stage and dummy-coded LV_Invasion / LN_Pos / Neoadjuvant, then hand the frame to
# cph_plot and save the figure. s_time, s_censor and cph_plot are not defined here.
for s_vital_ in ls_foci:
    # ls_foci holds plain column names in the figure 2 cell and (column, alternative)
    # tuples in the figure 5 cell. Indexing a str with [0] would return its first
    # character and fail the column lookup, so accept both forms.
    s_multi = s_vital_ if isinstance(s_vital_, str) else s_vital_[0]
    df = df_pri.loc[:,[s_multi,s_time,s_censor,'Grade','Stage']]
    df_dummy = pd.get_dummies(df_pri.loc[:,['LV_Invasion','LN_Pos','Neoadjuvant']],drop_first=True)
    df = pd.concat([df,df_dummy],axis=1)
    fig, cph = cph_plot(df,s_multi,s_time,s_censor,figsize=(5,3.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_{s_multi}.png')
    #break

In [None]:
# split name -> (number of quantiles, labels from lowest to highest bin)
d_col = {#'quartiles':(4,['low','med-low','med-high','high']), #
         #'median':(2,['low','high']),
         'tertiles':(3,['low','med','high']),
         #'#tertiles_h':([0, .33, 1.],['low','high']),
         #'tertiles_l':([0, .66, 1.],['low','high'])
        }

# take the first active entry of d_col as the split definition
for s_col, tu_cut in d_col.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]
    break

# For each entry of ls_foci, bin the column into quantile groups, keep only the
# extreme groups and pass them to km_plot. s_time, s_censor and km_plot are not
# defined in this cell.
for s_vital_ in ls_foci:
    # ls_foci may hold plain column names or (column, alternative) tuples.
    # Indexing a str with [0] would return its first character, so accept both.
    s_vital = s_vital_ if isinstance(s_vital_, str) else s_vital_[0]
    print(s_vital)
    # work on a copy so the binned labels do not overwrite the score in df_pri
    df = df_pri.copy()
    x = df.loc[:,s_vital]
    q = pd.qcut(x, q=i_cut,labels=labels) 
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
    if s_col == 'quartiles':
        df[s_vital] = q.replace({'med-low':np.nan,'med-high':np.nan})
    elif s_col == 'tertiles':
        df[s_vital] = q.replace({'med':np.nan})
    else:
        df[s_vital] = q 
    df = df.loc[:,[s_vital,s_time,s_censor]].dropna(how='any')
    fig, __ = km_plot(df,s_vital,s_time,s_censor)
    fig.savefig(f'figures/KM_clinicopath_{s_vital}_subset.png')
    #break