In [None]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import re
from skimage import io
import tifffile
from scipy.ndimage import median_filter
from skimage.util import img_as_ubyte,  img_as_float
import skimage
from skimage.feature import blob_dog, blob_log, blob_doh
from math import sqrt
import scipy
import seaborn as sns
from scipy import stats
from skimage.filters import unsharp_mask
from skimage.restoration import (denoise_tv_chambolle, denoise_bilateral,
                                 denoise_wavelet, estimate_sigma)
from skimage import color, morphology
from skimage.transform import rescale
import matplotlib as mpl
import lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import multivariate_logrank_test

from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

# Set Paths
codedir = os.getcwd()
import util

%matplotlib inline

In [None]:
# Images: download from synapse.org syn51068458
# (free account required)
rootdir = codedir  # alternative: '/home/groups/BCC_Chin_Lab/ChinData/Cyclic_Workflow/cmIF_2021-05-03_PDAC'
regdir = f'{rootdir}/RegisteredImages'    # registered tifs, one sub-folder per slide/scene
segdir = f'{codedir}/Segmentation'
segdiro = f'{rootdir}/Segmentation'       # downloaded segmentation masks

# clone mplex_image at https://gitlab.com/engje/mplex_image
# The package is imported from two directory levels above this notebook.
# Anchor the directory change on codedir (not on whatever the working directory
# currently is) and always change back, even when the import fails, so that
# later relative paths still resolve against the notebook directory.
os.chdir(os.path.join(codedir, '..', '..'))
try:
    from mplex_image import preprocess, mpimage #, cmif
finally:
    os.chdir(codedir)

# Table of contents <a name="contents"></a>
0. [functions](#func)
1. [skimage blobs](#sk)
2. [Foci analysis](#focifoci)
3. [mIHC analysis](#mihc)
4. [Patient metadata](#meta)
5. [CPH modeling](#clin)
6. [gene expression analysis](#geneexp)
7. [TCR analysis](#tcell)
8. [GSEA plots](#split)

In [None]:
# import importlib
# importlib.reload(util)

## functions <a name="func"></a> 

Helper functions are imported from `util.py`.

[contents](#contents)

##  Section 1: skimage blob detection

Foci are detected here from images plus segmentation masks

**You can skip and use pre-computed foci counts**

 <a name="sk"></a> 

[contents](#contents)

In [None]:
# Intensity data: download from synapse.org syn51068458
# Table is indexed by its first column; its slide_scene column drives the blob-detection loop.
df_mi = pd.read_csv(f'{codedir}/data/20220720_U54-TMA_FilteredMeanIntensity_Link.csv',index_col=0)

In [None]:
# skimage blob detection
# Requires the registered images: download from synapse.org syn51068458
threshold=0.002
# per-marker mean-intensity cutoff: only nuclei brighter than this are searched for foci
d_thresh = {'pRPA':1100, 'gH2AX':1100, 'RAD51':1300}
# per-marker min_sigma handed to util.get_blobs2 (any other marker falls back to 0.1)
d_min_sigma = {'pRPA':0.1, 'gH2AX':1}
ls_slide = sorted(set(df_mi.slide_scene))
ls_marker = ['pRPA','gH2AX','RAD51',]
s_maskdir = f'{segdiro}/U54-TMA-9_CellposeSegmentation' #change this to path for your downloaded segmentation masks
ls_result = []  # one foci table per slide, concatenated once at the end
try:
    for s_slide in ls_slide:
        print(s_slide)
        # first mask file (alphabetical order) whose name matches this slide/scene
        s_pattern = f'{s_slide}_nuc30_NucleiSegmentationBasins'
        ls_seg = [s_file for s_file in sorted(os.listdir(s_maskdir)) if re.search(s_pattern, s_file)]
        if not ls_seg:
            raise FileNotFoundError(f'no segmentation mask matching {s_pattern} in {s_maskdir}')
        label_image = io.imread(os.path.join(s_maskdir, ls_seg[0]))
        # mpimage.parse_org() is called from inside the slide folder, as in the original workflow
        os.chdir(f'{regdir}/{s_slide}') # change this to path to your downloaded registered tifs
        df_img=mpimage.parse_org()
        for idxs, s_marker in enumerate(ls_marker):
            intensity_image = io.imread(df_img[df_img.marker==s_marker].index[0])
            props = skimage.measure.regionprops_table(label_image, intensity_image=intensity_image, properties=('label','bbox','mean_intensity')) # 'image','intensity_image',
            df_props = pd.DataFrame(props,dtype='float').set_index('label')
            # 'bbox-0' -> 'bbox0' so the bounding box can be read by attribute access
            df_props.columns = [item.replace('-','') for item in df_props.columns]
            ls_index = df_props[df_props.mean_intensity>d_thresh[s_marker]].index
            print(f'{s_marker} {len(ls_index)}')
            # the column must exist even when no nucleus passes the cutoff;
            # nuclei that are never searched keep NaN
            df_props['blobs'] = np.nan
            s_figdir = f'{codedir}/blobs/{s_marker}'
            os.makedirs(s_figdir, exist_ok=True)
            for i_cell in ls_index:
                # dropna removes the still-empty blobs entry before casting the bbox to int
                se_cell = df_props.loc[i_cell].dropna().astype('int')
                image = intensity_image[se_cell.bbox0:se_cell.bbox2,se_cell.bbox1:se_cell.bbox3]
                blobs, fig = util.get_blobs2(image,min_sigma=d_min_sigma.get(s_marker,0.1),max_sigma=2,
                                             threshold=threshold,exclude_border=1)
                df_props.loc[i_cell,'blobs'] = len(blobs)
                if len(blobs) > 1:
                    fig.suptitle(s_marker)
                    plt.tight_layout()
                    fig.savefig(f'{s_figdir}/{s_slide}_{s_marker}_{i_cell}.png',dpi=200)
                # close every returned figure, not only the saved ones, so figures
                # do not pile up in memory across thousands of nuclei
                if isinstance(fig, plt.Figure):
                    plt.close(fig)
            df_props.index = [s_slide + '_cell' + str(item) for item in df_props.index]
            if idxs == 0:
                df_result = df_props.rename({'blobs':f'{s_marker}_foci'},axis=1)
            else:
                df_result.loc[df_props.index,f'{s_marker}_foci'] = df_props.blobs
        ls_result.append(df_result)
finally:
    # later cells use paths relative to the notebook directory
    os.chdir(codedir)
# DataFrame.append was removed in pandas 2.0; concatenate once instead
df_result_all = pd.concat(ls_result) if ls_result else pd.DataFrame()

#uncomment to save (this data has been pre-computed and saved in the repo)
#df_result_all.loc[:,df_result_all.columns.str.contains('foci')].dropna(how='all').to_csv(f'{codedir}/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv')


### Section 2: FOCI Analysis <a name="focifoci"></a>

Load saved foci for plotting and downstream analysis

liver/lung

**You don't need to run section 1 to run this**


[contents](#contents)

In [None]:
# Sample annotation (one row per TMA specimen/core)
df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link.csv')
print(len(df_surv))
# GSVA scores; earlier versions of this table:
#   f'{codedir}/data/GSVA_Scores_Link.csv', f'{codedir}/20230526_GSVA_Scores.csv'
# Anchored on codedir like every other load so it does not depend on the working directory.
df_pORG = pd.read_csv(f'{codedir}/data/20230608_GSVA_Scores.csv')
# primary-tumour rows: PDAC or Intestinal tissue
df_primary = df_surv[df_surv.Tissue.isin(['PDAC','Intestinal'])].copy()
print(len(df_primary))
# attach the 'GSVA_All' scores; a left merge keeps specimens without a score
df_primary = df_primary.merge(df_pORG[df_pORG.Group=='GSVA_All'],on='Public_Specimen_ID',how='left')
df_primary = df_primary.set_index("Public_Specimen_ID")
len(df_primary)

In [None]:
# keep the first row per specimen so every specimen ID maps to a single score
df_mapper = df_primary.loc[~df_primary.index.duplicated(), :]
ls_add = [
    'txi_pORG_Up_42_Genes',
    'trim_padj_0.2_pORG_Up_55_Genes',
    'trim_padj_0.2_pSUB_Up_100_Genes',
    'txi_pSUB_Up_100_Genes',
]
# copy each score column onto df_surv, matched by specimen ID
for s_add in ls_add:
    d_map = {s_specimen: f_score for s_specimen, f_score in zip(df_mapper.index, df_mapper[s_add])}
    print(len(d_map))
    df_surv[s_add] = df_surv['Public_Specimen_ID'].map(d_map)

### TMA survival

In [None]:
##
%matplotlib inline
alpha = 0.06
s_propo = ''
savedir = f'{codedir}'
s_time=  'Survival_time'#
s_censor='Survival'#
s_subtype = ''
s_cell = ''
s_type_title = ''
 
for s_col in ls_add:
    print(s_col)
    for cutp in [0.33,0.5,0.66]: #np.round(np.arange(0.25,1,0.25),3):#
            print(cutp)
            df_km, pvalue = util.single_km(df_surv[~df_surv.Public_Patient_ID.duplicated()],s_cell,s_subtype,s_type_title,s_col,savedir,alpha,cutp, #
                                       s_time,s_censor,s_propo)
            print(pvalue)
    #break

### define high/low pORG

based on survival differences on TMA samples

In [None]:
# first row per patient, PDAC tissue only
df_pri = df_surv.loc[
    (df_surv['Tissue'] == 'PDAC') & ~df_surv['Public_Patient_ID'].duplicated(keep='first')
]
s_select_porg = 'trim_padj_0.2_pORG_Up_55_Genes'  # alternative: 'txi_pORG_Up_42_Genes'
# cut point: 0.66 quantile of the selected score among those patients
i_pORG_txi = np.quantile(df_pri[s_select_porg].dropna(), 0.66)
print(i_pORG_txi)

# rows without a score match neither condition and stay unlabelled (NaN)
df_surv.loc[df_surv[s_select_porg].ge(i_pORG_txi), 'pORG_binary'] = 'high'
df_surv.loc[df_surv[s_select_porg].lt(i_pORG_txi), 'pORG_binary'] = 'low'

In [None]:
#save the 0608
#df_surv.drop(['Unnamed: 0','pORG_binary','pORG_Score'],axis=1).to_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv')


In [None]:
# high/low counts over unique PDAC specimens
print(
    df_surv.loc[
        (df_surv['Tissue'] == 'PDAC') & ~df_surv['Public_Specimen_ID'].duplicated(),
        'pORG_binary',
    ].value_counts()
)

In [None]:
# load the combined single-cell cell-type table
s_sample = '20220721_U54-TMA'  # earlier runs: '20220711_U54-TMA', '20220409_JP-TMAs_IMC-TMAs'
s_names = 'Combined'           # alternative: 'unnamed'
s_type = 'PDAC'
df_lei = pd.read_csv(
    f'{codedir}/data/{s_sample}_{s_names}Celltypes_{s_type}_Link.csv',
    index_col=0,
)

# coor_mplexable -> patient ID lookup (applied to the foci 'scene' column in a later cell)
d_patient = dict(zip(df_surv['coor_mplexable'], df_surv['Public_Patient_ID']))

# tissue of each cell, looked up through its patient
df_lei['Tissue'] = df_lei['Patient'].map(dict(zip(df_surv['Public_Patient_ID'], df_surv['Tissue'])))
df_epi = pd.read_csv(
    f'{codedir}/data/results_20220721_U54-TMA_CellTypeCounts_byPatient_byleidencelltype5_PDAC_Link.csv',
    index_col=0,
)


In [None]:

# load the pre-computed foci counts (written by the Section 1 save line)
ls_marker = ['gH2AX','pRPA','RAD51']
threshold=0.002
df_foci2 = pd.read_csv(f'{codedir}/data/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv',index_col=0)
# Section 1 builds the cell index from float labels ('..._cell12.0').
# Strip only that trailing '.0'; str.replace would also remove a '.0' found
# anywhere else in the name.
df_foci2.index = [re.sub(r'\.0$', '', item) for item in df_foci2.index]
# cells that were never searched for a marker count as zero foci
df_foci2 = df_foci2.fillna(0)
df_foci2['scene'] = [item.split('_cell')[0] for item in df_foci2.index]
df_foci2['Patient'] = df_foci2.scene.map(d_patient)

In [None]:
# plot Ki67 versus pRPA foci
s_define = 'pORG_binary'  # alternatives: 'pORG_binary_txi', 'pORG_binary_orig'
df_lei['Cohort'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.Cohort)))
df_lei['pORG_binary'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.loc[:,s_define])))
# Ki67 above 3*256 counts as proliferating; everything else (including missing
# values) is Ki67-. The whole column is assigned at once because the chained
# `df_lei.Ki67pos.fillna(..., inplace=True)` form does not update the frame
# under pandas Copy-on-Write.
df_lei['Ki67pos'] = np.where(df_lei.Ki67 > 3*256, 'Ki67+', 'Ki67-')
ls_foci =['pRPA_foci','gH2AX_foci','RAD51_foci']
# attach foci counts per cell; cells absent from the foci table get zero foci
df_lei_foci = df_lei.merge(df_foci2.loc[:,ls_foci],left_index=True,right_index=True,how='left')
df_lei_foci.loc[:,ls_foci] = df_lei_foci.loc[:,ls_foci].fillna(0)

In [None]:
#use df foci sum 2 (lower threshold)
# per-patient foci totals over epithelial cells only
ls_index = df_lei[df_lei.leidencelltype5=='epithelial'].index
# numeric_only=True keeps the string 'scene' column out of the sum
# (pandas >= 2.0 would otherwise concatenate the strings)
df_foci_sum2 = df_foci2.loc[df_foci2.index.isin(ls_index)].groupby('Patient').sum(numeric_only=True)
for s_marker in ls_marker:
    df_foci_sum2[f'log_{s_marker}_foci'] = np.log(df_foci_sum2.loc[:,f'{s_marker}_foci'] + 1)
    # foci per epithelial cell, using the per-patient epithelial counts in df_epi
    df_foci_sum2[f'mean_{s_marker}_foci'] = (df_foci_sum2.loc[df_foci_sum2.index.isin(df_epi.index),f'{s_marker}_foci']/df_epi.epithelial).fillna(0)


df_foci_sum2['Public_Patient_ID'] = df_foci_sum2.index
# NOTE: this rebinds df_surv, so the cell is not idempotent; on a re-run the
# foci columns already present are renamed with the '_1' suffix.
df_surv = df_surv.merge(df_foci_sum2,on='Public_Patient_ID',how='left',suffixes=('_1',''))

In [None]:
# boxplots of mean foci per epithelial cell, high versus low pORG, one row per patient
s_group = 'pORG_binary'
s_type = ''
s_cell = 'epithelial'
for s_marker in [f'mean_{s_name}_foci' for s_name in ('pRPA', 'RAD51', 'gH2AX')]:
    util.categorical_correlation_boxplot(
        df_surv.loc[~df_surv['Public_Patient_ID'].duplicated()],
        s_group, s_marker, s_type, s_cell,
        alpha=1.05, s_propo='in', b_ttest=True,
    )


In [None]:
#compare Ki67+ versus negative, double violinplots
#compare all the combinations, double violinplots
%matplotlib inline
import seaborn as sns
from scipy import stats
from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd

s_compare = 'Ki67'#'all' #
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
for s_cat in ['pORG_binary','Cohort']:
    ls_order = ['Ki67-','Ki67+']
    for s_foci in ls_foci:
        if s_compare == 'all':
            figsize=(3.5,3)
        else:
            figsize=(2.8,2.8)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        order = []
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial')
                                      & (df_lei_foci.Tissue=='PDAC') & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            statistic, pvalue = stats.ttest_ind(df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67+',s_foci],
                                                df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67-',s_foci],
                                               alternative='greater')
            for s_order in ls_order:
                order.append((s_cohort,s_order))
            df_both = pd.concat([df_both,df_plot_foci])
            d_pval.update({s_cohort:pvalue})
        sns.violinplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,ax=ax,alpha=0.5,linewidth=0.5)
        sns.stripplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,s=1,dodge=True,ax=ax,palette='dark',jitter=0.2)
        #annotate
        if s_compare == 'all':
            pairs = list(combinations(order, r=2))
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.configure(test="t-test_ind",line_width=1)#,alternative='greater
            pvalues = annotator.apply_test().annotations #annotator.apply_and_annotate() # 
            pvalues = [item.data.pvalue for item in pvalues]     
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        else:
            pairs = [(order[0],order[1]),(order[2],order[3])]
            pvalues = [d_pval[pairs[0][0][0]],d_pval[pairs[1][0][0]]]
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        
        ax.set_title(f"{s_foci.replace('_',' ')} vs. {s_cat.split('_')[0]}", fontsize='x-large')
        ax.set_xlabel(s_cat)
        ax.set_ylabel(f"No. {s_foci.replace('_',' ')}")
        h, l = ax.get_legend_handles_labels()
        labels =  [f'_{item}' if ind < 2 else item for ind,item in enumerate(l)]
        ax.legend(h,labels,title='',fontsize='small',markerscale=.5,bbox_to_anchor=(1.01,0.9))
        plt.tight_layout()
        fig.savefig(f'{codedir}/violinplot_both_{s_foci}_{s_cohort}_{s_compare}.png')
    if s_cat == 'pORG_binary':
        df_both.to_csv(f'results_foci_Ki67.csv')
        #break
    break

In [None]:
## bar graphs: foci-positive epithelial cells per category
# (the trailing breaks stop after the first category and the first foci marker)

d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in ls_foci:
        figsize=(2.8,2.5)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        d_pval = {}
        # stack the PDAC epithelial cells of every category value
        for idx, s_cohort in enumerate(df_lei_foci[s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci = df_lei_foci.loc[
                (df_lei_foci['leidencelltype5'] == 'epithelial')
                & (df_lei_foci['Tissue'] == 'PDAC')
                & (df_lei_foci[s_cat] == s_cohort)
            ]
            df_both = pd.concat([df_both,df_plot_foci])
        # category label of every foci-positive cell / of every cell
        df_obs = df_both.loc[df_both[s_foci] > 0, s_cat]
        df_exp = df_both[s_cat]
        # chi-squared: observed foci-positive counts against the counts expected
        # if they followed the overall category proportions
        f_obs = df_obs.value_counts().loc[ls_order]
        f_exp = df_exp.value_counts(normalize=True).loc[ls_order] * f_obs.sum()
        statistic, pvalue =  stats.chisquare(f_obs, f_exp)
        (df_obs.value_counts(normalize=True)
               .loc[ls_order]
               .plot(kind='bar',ax=ax,color=['mediumpurple','deepskyblue']))
        ax.set_title(f'{s_foci} vs. {s_cat}\nChi-squared p={pvalue:.3}')
        ax.set_ylabel(f'Percent of {s_foci.split("_")[0]}+ Epithelial')
        plt.tight_layout()
        xtickslocs = ax.get_xticks()
        # write "positive of total" onto each bar
        for idx, s_order in enumerate(ls_order):
            s = f'{df_obs.value_counts().loc[s_order]}\nof\n{df_exp.value_counts().loc[s_order]}'
            plt.text(idx - 0.2, 0.3, s)
        break
    break

In [None]:
## bar graphs: of the foci-positive epithelial cells, the fraction that is Ki67+
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in ls_foci:
        figsize=(2.8,2.5)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        d_pval = {}
        # stack the foci-positive PDAC epithelial cells of every category value
        for idx, s_cohort in enumerate(df_lei_foci[s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci = df_lei_foci.loc[
                (df_lei_foci['leidencelltype5'] == 'epithelial')
                & (df_lei_foci[s_foci] > 0)
                & (df_lei_foci['Tissue'] == 'PDAC')
                & (df_lei_foci[s_cat] == s_cohort)
            ]
            df_both = pd.concat([df_both,df_plot_foci])
        # category label of every Ki67+ cell / of every foci-positive cell
        df_obs = df_both.loc[df_both['Ki67pos'] == 'Ki67+', s_cat]
        df_exp = df_both[s_cat]
        # chi-squared: observed Ki67+ counts against the counts expected from
        # the category proportions of all foci-positive cells
        f_obs = df_obs.value_counts().loc[ls_order]
        f_exp = df_exp.value_counts(normalize=True).loc[ls_order] * f_obs.sum()
        statistic, pvalue =  stats.chisquare(f_obs, f_exp)
        # Ki67+ fraction within each category
        df_plot = df_obs.value_counts().div(df_exp.value_counts())
        df_plot.loc[ls_order].plot(kind='bar',ax=ax,color=['mediumpurple','deepskyblue'])
        ax.set_title(f'{s_foci} vs. {s_cat}\nChi-squared p={pvalue:.3}')
        ax.set_ylabel(f'Percent of {s_foci.split("_")[0]}+ Epithelial\nthat are Proliferating')
        plt.tight_layout()
        xtickslocs = ax.get_xticks()
        # write "Ki67+ of total" onto each bar
        for idx, s_order in enumerate(ls_order):
            s = f'{df_obs.value_counts().loc[s_order]}\nof\n{df_exp.value_counts().loc[s_order]}'
            plt.text(idx - 0.2, 0.15, s)

In [None]:
## boxplots: per patient, the Ki67+ fraction of foci-positive epithelial cells
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in ls_foci:
        figsize=(2.8,2.5)
        df_both = pd.DataFrame()
        d_pval = {}
        # stack the foci-positive PDAC epithelial cells of every category value
        for idx, s_cohort in enumerate(df_lei_foci[s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci = df_lei_foci.loc[
                (df_lei_foci['leidencelltype5'] == 'epithelial')
                & (df_lei_foci[s_foci] > 0)
                & (df_lei_foci['Tissue'] == 'PDAC')
                & (df_lei_foci[s_cat] == s_cohort)
            ]
            df_both = pd.concat([df_both,df_plot_foci])
        # per patient and category: counts of Ki67- and Ki67+ cells as two columns
        df_sum = (
            df_both[[s_cat,'Patient','Ki67pos']]
            .groupby('Patient')
            .value_counts()
            .unstack()
            .reset_index()
            .fillna(0)
        )
        df_sum['Frac. Prolif.'] = df_sum['Ki67+'] / df_sum[['Ki67+','Ki67-']].sum(axis=1)
        util.categorical_correlation_boxplot(
            df_sum, s_cat, 'Frac. Prolif.',
            s_type='', s_cell=f"{s_foci.split('_')[0]}+",
            alpha=1.05, s_propo='in', b_ttest=True,
        )

### Section 3: mIHC Analysis <a name="mihc"></a>

Stacked barplot


[contents](#contents)

In [None]:
# survival / annotation table, reloaded from the saved "_new" version

df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv', index_col=0)
# Sample_ID: last six characters of the patient ID text before '-T'
# (rows without a patient ID become 'none')
df_surv['Sample_ID'] = [
    s_patient_id.split('-T')[0][-6:]
    for s_patient_id in df_surv['Public_Patient_ID'].fillna('none')
]

In [None]:
# download mIHC data from https://www.synapse.org/#!Synapse:syn51078766
# combine ROIs into large dataframe and save
# (skipped when the combined table already exists; paths are anchored on codedir
#  so the cell does not depend on the current working directory)
if not os.path.exists(f'{codedir}/data/20221123_mIHC_LiverLung_Celltypes.csv'):
    # NOTE(review): df_ll_annot is not created anywhere in the visible notebook.
    # Fail before reading any file rather than after the whole loop.
    if 'df_ll_annot' not in globals():
        raise NameError("df_ll_annot (annotation table indexed by Sample_ID with 'Met Site', "
                        "'Location' and 'Desc' columns) must be loaded before combining the mIHC ROIs")
    ls_col = ['Sample_ID','class','Location_Center_X', 'Location_Center_Y',
           'GRZB_func', 'KI67_func', 'PD1_func', 'PDL1_func', 'CD163_func',
           'CCR2_func', 'HLAII_func', 'EOMES_func','Area']
    ls_roi = []  # one frame per ROI file, concatenated once below
    for s_file in sorted(os.listdir(f'{codedir}/mIHC_Data')):
        df = pd.read_csv(f'{codedir}/mIHC_Data/{s_file}',index_col=0)
        s_sample = s_file.split('LiverLungBCC')[1].split('.')[0]
        df['Sample_ID'] = s_sample
        # cell IDs of the form '<sample>_scene<roi>_cell<n>'
        df.index = [f'{s_sample.split("Nuclei_")[1].replace("ROI","_scene")}_cell{item}' for item in df.index]
        ls_roi.append(df.loc[:,ls_col])
        # a leftover `break` here stopped after the first file, so the saved
        # table held a single ROI; every file is combined now
    df_ll = pd.concat(ls_roi) if ls_roi else pd.DataFrame(columns=ls_col)
    df_ll['Organ'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Met Site'])))
    df_ll['Location'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Location'])))
    df_ll['Desc'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Desc'])))
    df_ll.to_csv(f'{codedir}/data/20221123_mIHC_LiverLung_Celltypes.csv')

In [None]:
# load the combined mIHC cell table and its annotation tables
# (paths anchored on codedir, like the other loads, so they do not depend on
#  the current working directory)
df_ll = pd.read_csv(f'{codedir}/data/20221123_mIHC_LiverLung_Celltypes.csv',index_col=0,low_memory=False)
print(len(df_ll))

df_ll_ann = pd.read_csv(f'{codedir}/data/LiverLung_annotations.csv',index_col=0)    # per-sample annotation
df_ll_roi = pd.read_csv(f'{codedir}/data/annotated_LiverLung_perROI.csv',index_col=0)  # per-ROI annotation

In [None]:
# add annotation
# Sample_ID is split as '...Nuclei_<sample>ROI<roi>'
df_ll['Sample_ID_short'] = [s_id.split('Nuclei_')[1].split('ROI')[0] for s_id in df_ll['Sample_ID']]
df_ll['Sample_ID_int'] = [int(s_short) for s_short in df_ll['Sample_ID_short']]
df_ll['ROI'] = [int(s_id.split('ROI')[1]) for s_id in df_ll['Sample_ID']]
# per-sample annotation, looked up by the integer sample ID
for s_annot in ('Cohort', 'Patient'):
    df_ll[s_annot] = df_ll['Sample_ID_int'].map(dict(zip(df_ll_ann.index, df_ll_ann[s_annot])))
# '<sample>_<roi>' key shared by the cell table and the per-ROI annotation
df_ll['Sample_ROI'] = df_ll['Sample_ID_int'].astype('str') + '_' + df_ll['ROI'].astype('str')
df_ll_roi['Sample_ROI'] = df_ll_roi.index.astype('str') + '_' + df_ll_roi['ROI'].astype('str')
df_ll['Location'] = df_ll['Sample_ROI'].map(dict(zip(df_ll_roi['Sample_ROI'], df_ll_roi['Location'])))

In [None]:
# classII pools the three T-cell subsets into 'T cells'; the original column is kept as classI
df_ll['classII'] = df_ll['class'].replace(
    dict.fromkeys(['T-regulatory CD4 cells', 'CD4 T helper cells', 'CD8 T cells'], 'T cells')
)
df_ll.rename(columns={'class': 'classI'}, inplace=True)

In [None]:
# patient ID -> colour: nine IDs in shades of purple, three in shades of blue
d_color = {}
for ls_id, s_palette in (
    (['ST-00016289', 'ST-00017078', 'ST-00017310', 'ST-00017381',
      'ST-00018269', 'ST-00018955', 'ST-00019367', 'ST-00019368',
      'ST-00020181'], 'Purples'),
    (['ST-00015839', 'ST-00017440', 'ST-00017804'], 'Blues'),
):
    d_color.update(zip(ls_id, sns.color_palette(s_palette, len(ls_id))))




In [None]:
#by location
s_define = 'trim_padj_0.2_pORG_Up_55_Genes'#'txi_pORG_Up_42_Genes'#''#
s_group = 'Cohort'
alpha = 1.05   # above 1, so every finite p-value passes and gets plotted
s_column = 'classI'#'classII'#
if s_column == 'classI':
    ls_mihc = [ 'PanCK+','aSMA+','CD4 T helper cells','CD8 T cells', 'T-regulatory CD4 cells','B cells', 'Granulocytes', 'Monocyte',
 'Macrophage', 'Mature DC','Immature DC']
elif s_column == 'classII':
    ls_mihc = ['T cells', #'B cells', 'Granulocytes', 'Monocyte', 'Macrophage',
       #'Mature DC', 'Immature DC', 'Immune Other', 'PanCK+', 'aSMA+','Other Cells'
       ]
s_patient = 'Public_Patient_ID'

for s_loc in ['all','T','B','D']:
    if s_loc == 'all':
        df_loc = df_ll
    else:
        df_loc = df_ll[df_ll.Location==s_loc]
    print(len(df_loc.Sample_ID_short.unique()))
    # per patient: fraction of cells in each class
    df_group = (df_loc.groupby(['Patient',s_column]).count().Sample_ID/(df_loc.groupby(['Patient']).count().Sample_ID)).unstack()
    df_group[s_patient] = df_group.index
    df_group['Cohort'] = df_group.index.map(dict(zip(df_ll_roi.Patient,df_ll_roi.loc[:,'Cohort'])))
    df_group = df_group.merge(df_surv.loc[:,['Survival','Survival_time',s_define,s_patient]],on=s_patient)
    df_group = df_group[~df_group.loc[:,s_patient].duplicated()]
    df_group = df_group[df_group.Cohort!='Liver met'].fillna(0)
    ls_order = sorted(df_group.loc[:,s_group].unique())
    for s_marker in ls_mihc:
        # two groups are needed for the comparison; skip explicitly instead of
        # hiding every possible error behind a bare except
        ar_groups = df_group.loc[:,s_group].unique()
        if len(ar_groups) < 2:
            continue
        s_high = ar_groups[0]
        s_low = ar_groups[1]
        n_high = sum(df_group.loc[:,s_group]==s_high)
        n_low = sum(df_group.loc[:,s_group]==s_low)
        # patient-level two-sample t-test between the two groups
        statistic,pvalue = stats.ttest_ind(df_group.loc[df_group.loc[:,s_group]==s_high,s_marker],
                                           df_group.loc[df_group.loc[:,s_group]==s_low,s_marker])
        if pvalue <= alpha:
            # the plot shows ROI-level fractions (one point per ROI, coloured by patient)
            df_group_roi = (df_loc.groupby(['Sample_ROI',s_column]).count().Sample_ID/(df_loc.groupby(['Sample_ROI']).count().Sample_ID)).unstack()
            df_group_roi['Cohort'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Cohort'])))
            df_group_roi['Patient'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Patient'])))
            df_group_roi = df_group_roi[df_group_roi.Cohort.isin(['liver_cohort','lung_cohort'])]
            fig, ax = plt.subplots(figsize=(3,3),dpi=300)
            sns.boxplot(data=df_group_roi,x=s_group,y=s_marker,showfliers=False,ax=ax,order=[str(item) for item in ls_order],palette=['mediumpurple','deepskyblue'])
            sns.stripplot(data=df_group_roi,x=s_group,y=s_marker,ax=ax,hue='Patient',s=3,palette=d_color)
            ax.set_ylim(ax.get_ylim()[0],ax.get_ylim()[1])
            ax.set_title(f'{s_group} versus\n {s_marker}\n p={pvalue:.4f} (n={n_low}, {n_high})')
            ax.set_ylabel(f'{s_marker} in {s_loc}')
            #ax.set_ylabel(f'{s_marker}')
            ax.get_legend().remove()
            plt.tight_layout()
            #fig.savefig(f'{s_date}/boxplot_mIHC_{s_marker}_versus_{s_group}_in_{s_loc}.png')
        # NOTE(review): the two breaks limit the run to the first marker of the
        # first location ('all'); the following cells read the df_group left behind.
        break
    break

In [None]:

# mean and standard error of the selected score per cohort
# (the column is selected before aggregating: df_group also holds the string
#  patient-ID column, which groupby.mean()/sem() reject in pandas >= 2.0)
print(df_group.groupby('Cohort')[s_define].mean())
print(df_group.groupby('Cohort')[s_define].sem())

In [None]:
# stacked bar: mean cell-type fraction per cohort
fig,ax=plt.subplots(dpi=300)
# select the cell-type columns before averaging so the string patient-ID
# column is never aggregated (it raises in pandas >= 2.0)
df_group.groupby('Cohort')[ls_mihc].mean().plot(kind='bar',width=.9,stacked=True,ax=ax,colormap='Paired')
ax.legend(bbox_to_anchor=(1,.9))
ax.set_ylabel('Fraction in Tissue')
plt.tight_layout()

# Section 4 <a name="meta"></a>

patient metadata


[contents](#contents)

### Patients in LabKey

In [None]:
# load patient vital status
# (the sheet also holds stage / grade / lymph node information)
df_vital = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/FMP_Patients_Nov17_2021.xlsx',sheet_name='Sheet1')

# Staging notes for the stage collapse below:
# Stage 1: the cancer is not more than 4cm in size and has not spread outside the pancreas.
#   1A is the same as T1, N0, M0 in TNM staging; 1B is the same as T2, N0, M0.
# Stage 2A: the cancer is bigger than 4cm but still within the pancreas, with no spread to
#   lymph nodes or other areas of the body. 2A is the same as T3, N0, M0.
#   2B is the same as T1, 2 or 3, N1, M0.
# Stage 3: the cancer is any size within the pancreas and has spread to 4 or more nearby
#   lymph nodes (T1, 2 or 3, N2, M0), or it has started to grow outside the pancreas into
#   the major blood vessels nearby, with or without lymph node spread and without spread
#   to other areas of the body (T4, Any N, M0).
# Stage 4: advanced (metastatic) cancer; Any T, Any N, M1.

#collapse stage
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0
# NOTE(review): 'c2B' was mapped to 'I' in the original table, inconsistent with
# 'c2A', 'c2', 'p2B' and '2B - IIB' (all 'II'); it is mapped to 'II' here.
d_stage = {'2B - IIB':'II', 'p2A':'II', 'p2B':'II', '4 - IV':'IV', '2A - IIA':'II', 'p3':'III', '1B - IB':'I',
    'c1B':'I', 'c2B':'II', 'nan':np.nan, 'p4':'IV','2B - T1-3, N1, M0':'II','p0':'0','p1B':'I','c1':'I',
    'c4':'IV','c3':'III','p1A':'I','pNA':np.nan,'3 - T4, Any N, M0':'III','c2A':'II','p4B':'IV','c4B':'IV',
     '2B - T1, N1, M0 / T2, N1, M0 / T3, N1, M0':'II','pUNK':np.nan,'p2':'II','p3B':'III','c2':'II','p3A':'III',
    '3 - III':'III','99 - Unknown':np.nan,'1A - IA':'I','c3A':'III','c4A':'IV','p1':'I','c1A':'I','p4A':'IV',
    '88 - Not applicable to 7th Edition staging':np.nan,
           '88 - No classification is recommended in 6th Edition':np.nan,
    '2A - T3, N0, M0':'II','4 - Any T, Any N, M1':'IV'}
df_vital['Stage'] = df_vital.loc[:,'Stage Grouping _ Dominant'].replace(d_stage)

#collapse grade

d_grade = {'Grade II  Moderately Diff / Mod Well Diff':'2',
       'Grade I   Well Differentiated/Differentiated':'1',
       'Cell type not determined; not stated;N/A;Unk; high grade dysplas':np.nan,
       'Grade III Poorly Differentiated':'3', 'nan':np.nan,'Grade IV Undifferentiated, Anaplastic':'4',
         'B-CELL    LYMPHOMA OR LEUKEMIA ONLY':np.nan}
df_vital['Grade'] = df_vital.loc[:,'Grade_Differentiation'].replace(d_grade)

#collapse LV invasion
d_replace = {'nan':np.nan, 'LYMPHOVASCULAR INVASION STATED AS NOT PRESENT':'NO',
       'LYMPHOVASCULAR INVASION PRESENT/IDENTIFIED':'YES',
       'Unknown/Indeterminate':np.nan, 'NOT APPLICABLE':np.nan,
       'Lymph-vascular Invasion Present/Identified':'YES',
       'LYMPHATIC AND SMALL VESSEL INVASION ONLY (L)':'YES',
       'BOTH LYMPHATIC AND SMALL VESSEL AND VENOUS (LARGE VESSEL) INVASION':'YES'}
df_vital['LV_Invasion'] = df_vital.loc[:,'Lymph_vascular Invasion'].replace(d_replace)

# LN positivity: 'YES' when at least one regional node is positive, 'NO' otherwise,
# missing when the count is missing. Built in one step so NaN is never written
# into a boolean column (a dtype upcast that newer pandas deprecates).
se_ln_count = df_vital.loc[:,'Regional Lymph Nodes Positive']
df_vital['LN_Pos'] = (se_ln_count >= 1).map({True:'YES',False:'NO'}).where(se_ln_count.notna())

# event indicator for survival analysis: 1 = dead, 0 = alive
df_vital['Survival'] = df_vital.cVitalStatus.replace({'Alive':0,'Dead':1})
print(df_vital.Grade.unique())
print(df_vital.Stage.unique())



In [None]:
# kaplan meier (disabled; kept for reference)
# %matplotlib inline
# # check prognostic value of clinicopathological variables

# ls_vital = ['Stage', 'Grade','LV_Invasion','LN_Pos'] 
# s_time = 'cDays from Diagnosis to FU'
# s_censor = 'Survival'
# for s_vital in ls_vital:
#     print(s_vital)
#     df = df_vital.loc[df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas',[s_vital,s_time,s_censor]].dropna(how='any')
#     fig, __ = util.km_plot(df,s_vital,s_time,s_censor)
#     fig.savefig(f'figures/KM_clinicopath_{s_vital}.png')
#     #break

#CPH
# Single-variable Cox proportional hazards plot for each clinicopathological variable.
s_time = 'cDays from Diagnosis to FU'
df_vital['Survival_time'] = df_vital.loc[:,s_time]
s_censor = 'Survival'
ls_vital = ['LV_Invasion','Stage', 'Grade','LN_Pos'] #'Age at Diagnosis','Sex',
for s_vital in ls_vital:
    print(s_vital)
    # pancreas rows only; keep just the variable, time and event columns with no missing values
    df = df_vital.loc[df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas',[s_vital,s_time,s_censor]].dropna(how='any')
    # df only ever holds one of the clinical variables, so these branches fire
    # only when s_vital itself is 'Stage' or 'Grade'
    if df.columns.isin(['Stage']).any():
        # roman-numeral stages become integers
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    if df.loc[:,s_vital].dtype=='O':
        # string-valued variable: replace it with its first dummy column
        # (drop_first=True); s_vital is rebound to that dummy column's name
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df.drop(s_vital,axis=1,inplace=True)
        s_vital = df_dummy.columns[0]
        df[s_vital] = df_dummy
    fig, cph = util.cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    # NOTE(review): relative path, so the file lands in 'figures/' under the current working directory
    fig.savefig(f'figures/CPH_single_{s_vital}_all.png')
    plt.close(fig)
    # NOTE(review): this break stops after the first entry of ls_vital ('LV_Invasion')
    break

In [None]:
# number of rows (patients) in the full BEMS vital-status table
print(len(df_vital))

### Patients in Paper 

n=434 specimens

n=422 patients

In [None]:
# load full patient data, neoadjuvant
df_patient = pd.read_excel(
    f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1_age.xlsx',
    sheet_name='Patients - Tab 1',
)
df_patient = df_patient.rename(columns={'Patient ID':'Public_Patient_ID'})
# cohort by metastatic site; Liver is assigned last, so it wins when both are 'YES'
df_patient.loc[df_patient['Lung Met Present'] == 'YES', 'Cohort'] = 'Lung'
df_patient.loc[df_patient['Liver Met Present'] == 'YES', 'Cohort'] = 'Liver'
# how many patients per cohort
for s_site in df_patient['Cohort'].dropna().unique():
    n_patients = df_patient.loc[df_patient['Cohort'] == s_site, 'Public_Patient_ID'].nunique()
    print(f'{s_site} {n_patients}')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

# neoadjuvant as a binary Yes/No column
df_patient.loc[df_patient['Neoadjuvant Treatment'].eq('Yes Neoadjuvant'), 'Neoadjuvant'] = 'Yes'
df_patient['Neoadjuvant'] = df_patient['Neoadjuvant'].fillna('No')

# load T cell data (tumor and blood sheets of the same workbook)
df_tcell_tumor = pd.read_excel(
    f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset6.xlsx',
    sheet_name='Tumor Samples',
)
df_tcell_blood = pd.read_excel(
    f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset6.xlsx',
    sheet_name='Blood Samples',
)

# merge T cell blood and tumor; shared column names get ' tumor' / ' blood' suffixes
df_tcell = df_tcell_tumor.merge(df_tcell_blood, on='Patient ID', suffixes=(' tumor',' blood'), how='outer')
df_tcell = df_tcell.rename(columns={
    'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
    'Templates per ng':'Templates_per_ng',
    'Patient ID':'Public_Patient_ID',
})
print(f'TCR patients {len(df_tcell)}')
# attach the TCR data to the patient table
df_patient = df_patient.merge(df_tcell, on='Public_Patient_ID', how='left', suffixes=('','_x'))
df_patient['INDEX'] = df_patient.index + 1
# #load pSUB/pORG (old)
# df_gsva = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1.xlsx', sheet_name='GSVA Scores - Tab 3')
# df_gsva.rename({'pSUB\nPrimaries':'pSUB_Primary','pORG\nPrimaries':'pORG_Primary',
#                 'Patient ID':'Public_Patient_ID'},axis=1,inplace=True)
# load PurIST calls: every sheet is read, the specimen-subtype sheet is used here
d_gsva = pd.read_excel(
    f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1.xlsx',
    sheet_name=None,
)
df_gsva = d_gsva['Specimen Subtype - Tab 2'][['PurIST Score','Patient Specimen ID','PurIST Subtype','Patient ID']]
df_gsva = df_gsva.rename(columns={'Patient ID':'Public_Patient_ID'})
df_patient = df_patient.merge(df_gsva, on='Public_Patient_ID', how='outer')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

In [None]:
# A patient who has a period of resection to FU but not to recurrence had no recurrence, and
# a patient who had a period of resection to recurrence but no liver or lung met had another site of recurrence.

#has a recurrence, not in liver or lung
b_recur = ~df_patient.loc[:,'Days from Resection to Recurrence'].isna() & df_patient.Cohort.isna()

#had a resection, no recurrence, not liver or lung
b_no_recur = (~df_patient.loc[:,'Days from Resection to FU'].isna()
              & df_patient.loc[:,'Days from Resection to Recurrence'].isna()
              & df_patient.Cohort.isna())

#add recurrence other
df_patient['Other_Recurrence'] = False
df_patient.loc[b_recur,'Other_Recurrence'] = True
#add recurrence none
df_patient['No_Recurrence'] = False
df_patient.loc[b_no_recur,'No_Recurrence'] = True
# should be 73 and 103, but I get 75 and 112 ...
# NOTE(review): these sums are over rows, not unique patients; the per-patient
# check later in the notebook de-duplicates on Public_Patient_ID first.
print(df_patient.Other_Recurrence.sum())
print(df_patient.No_Recurrence.sum())
#df_patient.to_csv('temp_supp1.csv')

#add mutation data, if there are more than 10 rows (specimens) with a mutation in the gene
df_gene = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset2.xlsx',
                           sheet_name='Mutation Data')

ls_genes = []
for s_gene in df_gene.Gene.unique():
    ls_patient = df_gene.loc[df_gene.Gene==s_gene,'Patient Specimen ID']
    if len(ls_patient) > 10:
        s_altered = f'{s_gene}_altered'
        # Direct boolean assignment: True where the specimen carries the mutation,
        # False otherwise. This replaces "set True, then fillna(False, inplace=True)
        # on a .loc slice", which does not write back under pandas Copy-on-Write.
        df_patient[s_altered] = df_patient.loc[:,'Patient Specimen ID'].isin(ls_patient)
        ls_genes.append(s_altered)


In [None]:
#add age, filter patients who died after surgery
# Read every sheet of the ID key workbook into a dict of DataFrames (sheet_name=None).
d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/OLD Versions/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

df_id = d_ids['RnaSeqKey']
ls_ids = df_id.loc[:,'Public.Specimen.ID']

# add patients w/o RNA seq
# ls_ids stays fixed to the RNA-seq specimen IDs; it is not refreshed after the
# TcrTumorKey rows are appended, and the concatenated index is not reset.
for s_key in ['TcrTumorKey','TcrBloodKey']:
    df_add = d_ids[s_key].loc[~d_ids[s_key].loc[:,'Public.Specimen.ID'].isin(ls_ids)]
    df_id = pd.concat([df_id,df_add])

#check
# print(df_id.loc[:,'OPTR.Specimen.ID'].duplicated().any())
# ls_drop = df_id.loc[df_id.loc[:,'OPTR.Specimen.ID'].str.contains('-T2')].index
# df_unique = df_id.loc[df_id.loc[:,'OPTR.Specimen.ID'].str.contains('-T')].drop(ls_drop)
# #check
# print(df_unique.OPTR.duplicated().any())

#add id
# NOTE(review): df_unique is not assigned in this cell (its construction is commented
# out above); presumably it is left over from an earlier cell or kernel session.
# Confirm this line survives Restart Kernel -> Run All.
df_vital['Public_Patient_ID'] = df_vital.OPTR.map(dict(zip(df_unique.OPTR,df_unique.loc[:,'Biolibrary.Subject.ID'])))

#omics data plus clinical data
# Left merge keeps every df_patient row; clashing df_vital columns get the '_x' suffix.
df_patient = df_patient.merge(df_vital,on='Public_Patient_ID',how='left',suffixes=('','_x'))
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')
#df_id.to_csv('Patient_IDs.csv')

# Age category: '>70' vs '<=70'; rows without an age at diagnosis are left unset
se_age = df_patient.loc[:,'Age at Diagnosis']
df_patient.loc[se_age > 70,'Age'] = '>70'
df_patient.loc[se_age <= 70,'Age'] = '<=70'

# Drop second primary specimens (specimen ID contains '-T2'); a missing specimen ID counts as "not T2"
#ls_drop = df_id.loc[df_id.loc[:,'OPTR.Specimen.ID'].str.contains('-T2')].loc[:,'Public.Specimen.ID']
b_t2 = df_patient.loc[:,'Patient Specimen ID'].str.contains('-T2').fillna(False)
ls_drop = df_patient[b_t2].index
df_patient = df_patient.drop(ls_drop)
print('dropping T2s')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

# Drop patients who died of surgery: dead with less than 30 days from resection to follow-up
b_short_fu = df_patient.loc[:,'Days from Resection to FU'] < 30
b_dead = df_patient.loc[:,'cVitalStatus'] == 'Dead'
ls_drop_surgery = df_patient[b_short_fu & b_dead].index
print(df_patient.loc[df_patient.index.isin(ls_drop_surgery),'Public_Patient_ID'])
df_patient = df_patient.drop(ls_drop_surgery)
print('dropping died of surgery')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

# Lung / Liver versus everyone else (not just lung, liver and NA): plain boolean flags
df_patient['Lung_Cohort'] = df_patient.Cohort=='Lung'
df_patient['Liver_Cohort'] = df_patient.Cohort=='Liver'

# Specimen type = last dash-separated token of the specimen ID, with 'M2' folded into 'M'
# (a missing specimen ID becomes the empty string)
df_patient['Specimen_Type'] = (df_patient.loc[:,'Patient Specimen ID']
                               .fillna('-')
                               .map(lambda s_id: s_id.split('-')[-1].replace('M2','M')))

# Fix merge errors: for patients that have duplicated rows, make 'Tumor Type'
# agree with the specimen type encoded in the specimen ID
df_matched = df_patient[df_patient.INDEX.duplicated(keep=False)].copy()
b_in_matched = df_patient.Public_Patient_ID.isin(df_matched.Public_Patient_ID)
b_type_primary = df_patient.loc[:,'Tumor Type']=='Primary'
b_type_met = df_patient.loc[:,'Tumor Type']=='Met'
b_pri_to_met = (df_patient.Specimen_Type=='M') & b_type_primary & b_in_matched
b_met_to_pri = (df_patient.Specimen_Type=='T') & b_type_met & b_in_matched
print('fix merging')
print(b_pri_to_met.sum())
print(b_met_to_pri.sum())
df_patient.loc[b_pri_to_met,'Tumor Type'] = 'Met'
df_patient.loc[b_met_to_pri,'Tumor Type'] = 'Primary'
df_matched = df_patient[df_patient.INDEX.duplicated(keep=False)]

# Per-patient recurrence counts (first row of each patient): should be 73 and 103 (got 73 and 103)
df_first_per_pt = df_patient[~df_patient.Public_Patient_ID.duplicated()]
print(df_first_per_pt.Other_Recurrence.sum())
print(df_first_per_pt.No_Recurrence.sum())

print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

In [None]:
# All recurrence groups in one column: Liver / Lung (from Cohort), Other_site,
# No_Recurrence, and No_Resection for every row that matched none of those.
df_patient['Recurrence_Sites_4'] = df_patient.Cohort
df_patient.loc[df_patient.Other_Recurrence,'Recurrence_Sites_4'] = 'Other_site'
df_patient.loc[df_patient.No_Recurrence,'Recurrence_Sites_4'] = 'No_Recurrence'
# Assign the result back: `df.col.fillna(..., inplace=True)` is a chained in-place
# write that does not reach df_patient under pandas Copy-on-Write.
df_patient['Recurrence_Sites_4'] = df_patient.Recurrence_Sites_4.fillna('No_Resection')

# unique patients per recurrence group
for s_site in df_patient.Recurrence_Sites_4.unique():
    n_patients = df_patient[df_patient.Recurrence_Sites_4==s_site].Public_Patient_ID.nunique()
    print(f'{s_site} {n_patients}')

print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

# #additional to liver/lung
# #compare to?
# s_compare_to = 'Other'#'Liver' #
# df_patient['Recurrence_'] = np.nan
# if s_compare_to == 'Other':
#     df_patient.loc[df_patient.Other_Recurrence,'Recurrence_'] = 'Other_Site'
# elif s_compare_to == 'Liver':
#     df_patient.loc[df_patient.Cohort=='Liver','Recurrence_'] = 'Other_Site'
# df_patient.loc[df_patient.No_Recurrence,'Recurrence_'] = 'None'
# print(df_patient[~df_patient.Public_Patient_ID.duplicated()].Recurrence_.value_counts())

#looks good: hand-entered group sizes, summed to compare with the unique-patient count printed above
130 + 34 + 103 + 73 + 74

In [None]:
#add pORG 
# Column names in the GSVA score files that hold the pORG and pSUB scores used below.
s_pORG = 'trim_padj_0.2_pORG_Up_55_Genes'
s_pSUB = 'Kallisto55_pSUB1e-04'
df_pORG = pd.read_csv(f'../20230608_GSVA_Scores.csv',index_col=0)
df_pORG['Patient Specimen ID'] = df_pORG.Public_Specimen_ID
# One left merge per GSVA group. The first merge adds an unsuffixed 'pORG_0.2'
# column; the later merges collide with it, so their columns pick up the
# '_Met' / '_All' suffix. The order of the groups therefore determines the names,
# and re-running this cell on an already-merged df_patient yields different columns.
for s_group in ['GSVA_Primary', 'GSVA_Met', 'GSVA_All']:
    df_group = df_pORG.loc[df_pORG.Group==s_group,[s_pORG,'Patient Specimen ID']]
    df_group.rename({s_pORG:'pORG_0.2'},axis=1,inplace=True)
    s_suffix = s_group.replace('GSVA','')
    df_patient = df_patient.merge(df_group,on='Patient Specimen ID',suffixes=('',s_suffix),how='left')
# The unsuffixed column came from the first (Primary) merge; name it explicitly.
df_patient.rename({'pORG_0.2':'pORG_0.2_Primary'},axis=1,inplace=True)


# add pSUB
# Same suffix-on-collision scheme as pORG, one file per group.
d_rename = {'Kallisto55_pSUB1e-05':'pSUB1e-05', 'Kallisto55_pSUB1e-04':'pSUB1e-04'}
for s_group in ['Primaries','Mets','All']:
    df_pSUB = pd.read_csv(f'../annotation/GSVA_{s_group}_Kallisto55_pSUB1e-4and1e-5.tsv',sep='\t')
    #temp: use OPTR
    # translate the file's SampleID (matched against OPTR.Specimen.ID) to the public specimen ID
    df_pSUB['Patient Specimen ID'] = df_pSUB.SampleID.map(dict(zip(df_id.loc[:,'OPTR.Specimen.ID'],df_id.loc[:,'Public.Specimen.ID'])))
    df_pSUB.drop('SampleID',axis=1,inplace=True)
    # only the s_pSUB (1e-04) score is kept, so the 1e-05 entries in the rename maps match nothing
    df_pSUB = df_pSUB.loc[:,[s_pSUB,'Patient Specimen ID']]
    df_pSUB.rename(d_rename,axis=1,inplace=True)
    s_suffix = s_group.replace('Mets','Met').replace('Primaries','Primary')
    df_patient = df_patient.merge(df_pSUB,on='Patient Specimen ID',suffixes=('',f'_{s_suffix}'),how='left')
    #break
df_patient.rename({'pSUB1e-05':'pSUB1e-05_Primary', 'pSUB1e-04':'pSUB1e-04_Primary'},axis=1,inplace=True)

print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

In [None]:
# size of the patient table right before it is written to disk
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

In [None]:
# Write the assembled patient table once. An existing file is never overwritten,
# so delete the CSV by hand to pick up changes made in the cells above.
s_out = '20230815_Patient_Metadata.csv'
ls_drop_columns = ['OPTR',
                   'Lung Met Present in Patient blood',
                   'Liver Met Present in Patient blood',
                   'Liver Met Present in Patient tumor',
                   'Lung Met Present in Patient tumor']
if not os.path.exists(s_out):
    print('saving')
    b_keep_column = ~df_patient.columns.isin(ls_drop_columns)
    df_export = df_patient.loc[:,b_keep_column]
    df_export.to_csv(s_out)
    

In [None]:
### overlap in omics data samples
# #the samples overlap/don't overlap in assays
# # all samples 

# from matplotlib_venn import venn3, venn3_circles
# from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
# from pyvenn import venn
# d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

# print(d_ids.keys())
# d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
#           'OPTR':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
#           #'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
#        }

# for s_col ,ls_keys in d_keys.items():
#     d_sets = {}
#     for s_key in ls_keys:
#             print(s_key)
#             print(len(d_ids[s_key]))
#             d_sets.update({s_key:d_ids[s_key]})
#     #plot
#     if len(ls_keys) == 4:
#         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
#         fig,ax = venn.venn4(labels, names=[key.split('Key')[0] for key, item in d_sets.items()])#,ax=ax
#     elif len(ls_keys) == 3:
#         fig,ax = plt.subplots(figsize=(3,3),dpi=300)
#         venn3([set(item.loc[:,s_col]) for key, item in d_sets.items()], [key.split('Key')[0] for key, item in d_sets.items()],ax=ax)

# # #primaries

# # from matplotlib_venn import venn3, venn3_circles
# # from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
# # from pyvenn import venn
# # d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

# # print(d_ids.keys())
# # d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
# #           #'OPTR':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
# #           'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
# #        }

# # for s_col ,ls_keys in d_keys.items():
# #     d_sets = {}
# #     for s_key in ls_keys:
# #             print(s_key)
# #             print(len(d_ids[s_key]))
# #             df_set = d_ids[s_key][~d_ids[s_key].loc[:,'OPTR.Specimen.ID'].str.contains('-M')]
# #             d_sets.update({s_key:df_set})
# #     #plot
# #     if len(ls_keys) == 4:
# #         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
# #         fig,ax = venn.venn4(labels, names=[key.split('Key')[0] for key, item in d_sets.items()],ax=ax)
# #     elif len(ls_keys) == 3:
# #         fig,ax = plt.subplots(figsize=(3,3),dpi=300)
# #         venn3([set(item.loc[:,s_col]) for key, item in d_sets.items()], [key.split('Key')[0] for key, item in d_sets.items()],ax=ax)
# #'''

# Survival Analysis - patients in paper <a name="clin"></a>

KM, CPH


[contents](#contents)



In [None]:
# Reload the saved patient metadata; the first CSV column is used as the index.
s_out = '20230815_Patient_Metadata.csv'
df_patient = pd.read_csv(s_out, index_col=0)
print(df_patient.shape[0])

df_patient.columns

In [None]:
# 10 patients primary to met: compare scores between the matched primary (T) and
# metastasis (M) specimens of the same patient with a paired t-test.
df_matched = df_patient[df_patient.INDEX.duplicated(keep=False)]
for idx, y in enumerate(['PurIST Score','pORG_0.2_All', 'pSUB1e-04_All']):
    fig, ax = plt.subplots(figsize=(2,3),dpi=200)
    sns.pointplot(data=df_matched, x="Specimen_Type", y=y, hue="Public_Patient_ID",
                 ax=ax)
    ax.legend().remove()
    # Pair explicitly by patient: one row per patient with a 'T' and an 'M' column.
    # Dropping NaNs from each group independently and relying on row order can
    # mis-pair patients or give unequal lengths when one specimen lacks a score.
    df_paired = (df_matched
                 .pivot_table(index='Public_Patient_ID', columns='Specimen_Type', values=y, aggfunc='first')
                 .reindex(columns=['T','M'])
                 .dropna())
    a = df_paired['T']
    b = df_paired['M']
    result, pvalue = stats.ttest_rel(a, b)
    ax.set_title(f'{y}\np={pvalue:.2}')
    ax.set_ylabel('')
    #break
#make a legend (same plot as the last loop iteration, with the patient legend shown)
fig, ax = plt.subplots(figsize=(3,3),dpi=200)
sns.pointplot(data=df_matched, x="Specimen_Type", y=y, hue="Public_Patient_ID",ax=ax)
ax.legend(bbox_to_anchor=(1,1))

In [None]:
#CPH single variable
%matplotlib inline
df_result=pd.DataFrame()
ls_vital = ['Neoadjuvant','Age','Sex','Stage', 'Grade','LV_Invasion','LN_Pos', #'Age at Diagnosis',
       'PurIST Score','Other_Recurrence', 'No_Recurrence','Lung_Cohort','Liver_Cohort',
           'pORG_0.2_Primary',
       'pORG_0.2_Met', 'pORG_0.2_All', 'pSUB1e-04_Primary', 'pSUB1e-04_Met',
       'pSUB1e-04_All']
ls_vital = ls_vital + ls_genes
for s_vital in ls_vital:
    print(s_vital)
    df = df_patient.loc[(~df_patient.Public_Patient_ID.duplicated(keep='first')),[s_vital,s_time,s_censor]]#.dropna(how='any')
    print(len(df))
    df = df.dropna()
    print(len(df))
    if df.columns.isin(['Stage']).any():
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    if df.loc[:,s_vital].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df.drop(s_vital,axis=1,inplace=True)
        df[s_vital] = df_dummy
    fig, cph = util.cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_single_{s_vital}_subset.png')
    plt.close(fig)
    df_result=pd.concat([df_result,cph.summary.loc[:,['exp(coef)','p']]])
    #break

In [None]:
#all the recurrence combos: single-variable Cox model for every pair of recurrence groups
s_vital = 'Recurrence_Sites_4'

for tu_combo in combinations(df_patient.Recurrence_Sites_4.unique(),r=2):
    print(tu_combo)
    # one row per patient (first occurrence)
    df = df_patient.loc[(~df_patient.Public_Patient_ID.duplicated(keep='first')),[s_vital,s_time,s_censor]]#.dropna(how='any')
    print(len(df))
    # keep only the two groups being compared, complete cases
    df = df[df.loc[:,s_vital].isin(tu_combo)].dropna()
    #order them
    df.Recurrence_Sites_4 = df.Recurrence_Sites_4.astype('category')
    df.Recurrence_Sites_4 = df.Recurrence_Sites_4.cat.set_categories(sorted(tu_combo))
    df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=False)
    df.drop(s_vital,axis=1,inplace=True)
    # the covariate is the indicator of the alphabetically first group of the pair
    s_compare = " to ".join(sorted(tu_combo))
    df[s_compare] = df_dummy.iloc[:,0]
    print(len(df))
    try:
        fig, cph = util.cph_plot(df,s_compare,s_time,s_censor,figsize=(3,1.5))
        plt.tight_layout()
        fig.savefig(f'figures/CPH_single_{s_vital}_{s_compare}_subset.png')
        plt.close(fig)
        df_result=pd.concat([df_result,cph.summary.loc[:,['exp(coef)','p']]])
    except Exception as e:
        # Best effort: skip pairs whose model cannot be fit, but report why.
        # `except Exception` (not a bare except) lets KeyboardInterrupt through.
        print(f'cph error: {e}')
    #break

In [None]:
#save results
# df_result stacks the 'exp(coef)' and 'p' columns of each single-variable model's summary
df_result.to_csv('results_single_CPH.csv')

## multivariable

In [None]:
# Time-to-event column for the multivariable models below; exactly one is active.
# NOTE(review): s_censor is not changed here; confirm that the event column still
# matches the selected time column.
#s_time = 'cDays from Diagnosis to FU'
s_time = 'cDays from Resection to Recurrence'

In [None]:
#CPH multivariable: each variable of interest adjusted for Grade, Stage and the
# categorical covariates in ls_cats (or for gene alterations when b_genes is True)
b_genes = False
df_result_multi = pd.DataFrame()
ls_multi = [#'Sex','Neoadjuvant',#'Age','Stage', 'Grade','LV_Invasion','LN_Pos', #'Age at Diagnosis',
       'PurIST Score','Other_Recurrence', 'No_Recurrence','Lung_Cohort','Liver_Cohort',
           'pORG_0.2_Primary',#'pORG_0.2_Met', 
       'pORG_0.2_All', 'pSUB1e-04_Primary', #'pSUB1e-04_Met',
       'pSUB1e-04_All']
ls_cats = ['LV_Invasion','LN_Pos','Age'] #categorical
if b_genes:
    ls_cats = ['RNF43_altered', 'CDKN2A_altered', 'ARID1B_altered','KMT2D_altered']
for s_multi in ls_multi:
    print(s_multi)
    # NOTE(review): unlike the single-variable cells and the recurrence loop below,
    # this loop uses every row of df_patient, not one row per patient - confirm intended.
    df = df_patient.loc[:,[s_multi,s_time,s_censor,'Grade','Stage']].dropna() ##'Neoadjuvant','Age at Diagnosis'
    if ls_cats[0].find('altered') > -1:
        # gene-alteration models are not adjusted for Grade / Stage
        df = df_patient.loc[:,[s_multi,s_time,s_censor]].dropna()
    if df.loc[:,s_multi].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_multi]],drop_first=True)
        df.drop(s_multi,axis=1,inplace=True)
        s_multi = df_dummy.columns[0]
        df[s_multi] = df_dummy
    # NOTE(review): get_dummies (dummy_na=False) encodes a missing category as all
    # zeros, so rows with a missing covariate in ls_cats are kept at the reference
    # level instead of being dropped - confirm intended.
    df_dummy = pd.get_dummies(df_patient.loc[:,ls_cats],drop_first=True) #'TP53_altered','CDKN2A_altered',,'Sex'
    df = pd.concat([df,df_dummy],axis=1)
    # rows that were absent from df before the concat are NaN in its columns and go away here
    df.dropna(inplace=True)
    if df.columns.isin(['Stage']).any():
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    fig, cph = util.cph_plot(df,s_multi,s_time,s_censor,figsize=(4,3))
    fig.savefig(f'figures/CPH_multi_{s_multi}.png')
    df_result_model = cph.summary.loc[:,['exp(coef)','p']].reset_index()
    df_result_model['model'] = s_multi
    df_result_multi=pd.concat([df_result_multi,df_result_model])

# all the recurrence combos - multi
s_vital = 'Recurrence_Sites_4'
for tu_combo in combinations(df_patient.Recurrence_Sites_4.unique(),r=2):
    print(tu_combo)
    df = df_patient.loc[(~df_patient.Public_Patient_ID.duplicated(keep='first')),[s_vital,s_time,s_censor,'Stage','Grade']]#.dropna(how='any')
    print(len(df))
    df = df[df.loc[:,s_vital].isin(tu_combo)].dropna()
    #order them
    df.Recurrence_Sites_4 = df.Recurrence_Sites_4.astype('category')
    df.Recurrence_Sites_4 = df.Recurrence_Sites_4.cat.set_categories(sorted(tu_combo))
    df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=False)
    df.drop(s_vital,axis=1,inplace=True)
    # covariate = indicator of the alphabetically first group of the pair
    s_compare = " to ".join(sorted(tu_combo))
    df[s_compare] = df_dummy.iloc[:,0]
    df_dummy = pd.get_dummies(df_patient.loc[:,ls_cats],drop_first=True) #'TP53_altered','CDKN2A_altered',,'Sex'
    df = pd.concat([df,df_dummy],axis=1)
    df= df.dropna()
    print(len(df))
    try:
        fig, cph = util.cph_plot(df,s_compare,s_time,s_censor,figsize=(4,3))
        plt.tight_layout()
        # file name prefix fixed ('CPH_mutli_' -> 'CPH_multi_') to match the loop above
        fig.savefig(f'figures/CPH_multi_{s_vital}_{s_compare}_subset.png')
        df_result_model = cph.summary.loc[:,['exp(coef)','p']].reset_index()
        df_result_model['model'] = s_compare
        df_result_multi=pd.concat([df_result_multi,df_result_model])
    except Exception as e:
        # Best effort: skip pairs whose model cannot be fit, but report why.
        # `except Exception` (not a bare except) lets KeyboardInterrupt through.
        print(f'cph error: {e}')
    

In [None]:
# Save the multivariable results; the file name records which adjustment set was used.
s_result_csv = ('results_multi_CPH_gene_alterations.csv' if b_genes
                else 'results_multi_CPH_clinical_covariates.csv')
df_result_multi.reset_index(drop=True).to_csv(s_result_csv)

In [None]:
# #save out the metadata
# ls_drop = ['Liver Met Present in Patient tumor',
#        'Lung Met Present in Patient tumor','Lung Met Present in Patient blood',
#        'Liver Met Present in Patient blood',#'Unnamed: 0',
#            'pORG_Primary',
#  'pSUB_Primary','OPTR','SampleID'#'OPTR',
#           ]
# #save out
# s_out = 'Patient_metadata_clean_20.csv'#'Patient_metadata_clean.csv'
# if not os.path.exists(s_out):
#     print('saving')
#     df_patient.loc[:,~df_patient.columns.isin(ls_drop)].to_csv(s_out)

# Section 6 <a name="geneexp"></a>

Gene expression correlation

The 7 genes used for the IRDS signature were: STAT1, IFI44, IFIT3, OAS1,
IFIT1, G1P2, and MX1


[contents](#contents)

In [None]:
# Reload patient metadata and the VST expression table.
s_out = '20230815_Patient_Metadata.csv'
df_patient = pd.read_csv(s_out, index_col=0)

# Transposed so that genes are columns (selected by gene symbol in later cells)
# and the index holds the specimen IDs.
df_vst = pd.read_csv('data/VST_Genes_Link.csv', index_col=0)
df_rna = df_vst.transpose().copy()
df_rna.head()

In [None]:
# Attach per-specimen scores to the expression table, matched on the specimen ID
# held in df_rna's index (specimens without a match get NaN).
ls_add = ['pORG_0.2_Primary','pSUB1e-04_Primary','PurIST Score']
se_specimen_id = df_patient.loc[:,'Patient Specimen ID']
for s_add in ls_add:
    d_map = dict(zip(se_specimen_id, df_patient.loc[:,s_add]))
    df_rna[s_add] = df_rna.index.map(d_map)


In [None]:
# Pearson correlation of immune marker genes with each score in ls_add, drawn as
# (1) a full clustered heatmap and (2) a lower-triangle heatmap, both annotated
# with significance stars.
d_rename={'txi_pORG_Up_42_Genes':'pORG_42',
          'trim_padj_0.2_pORG_Up_55_Genes':'pORG_55',
          'pORG.14':'pORG_14', 'pORG.15':'pORG_15',
          'pORG_0.2_Primary':'pORG_0.2',
          'pSUB1e-04_Primary':'pSUB1e-04',
          'MRC1':'MRC1 (CD206)',
          'OLR1':'OLR1 (LOX1)',
          'MS4A1':'MS4A1 (CD20)'}
from scipy.stats import pearsonr
dim = (4,3)


def _significance_stars(p):
    """Return one '*' for each threshold (0.001, 0.005, 0.05) that p is at or below; '' for NaN."""
    return ''.join(['*' for t in [0.001,0.005,0.05] if p<=t])


for s_add in ls_add:
    ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1',#'LAMP3','ITGAX','CD209',
                 'CD68','OLR1','MRC1','MX1','STAT1',
                 s_add] #
    df_all = df_rna.loc[:,ls_marker].corr().rename(d_rename,axis=1).rename(d_rename,axis=0)
    print(len(df_rna))
    # clustermap is used only to obtain the dendrogram ordering; its figure is discarded
    g = sns.clustermap(df_all)
    plt.close()
    categories_order = df_all.iloc[g.dendrogram_col.reordered_ind,:].index.tolist()
    df_all = df_all.loc[categories_order,categories_order]
    rho = df_rna.loc[:,ls_marker].corr() #df_all.corr()
    # corr() with a callable puts 1 on the diagonal; subtracting the identity turns
    # that into 0, so the diagonal is starred in the first heatmap
    pval = df_rna.loc[:,ls_marker].corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
    # column-wise Series.map instead of the deprecated DataFrame.applymap
    p_vals = pval.apply(lambda se_col: se_col.map(_significance_stars))
    p_vals = p_vals.rename(d_rename,axis=1).rename(d_rename,axis=0)
    p_vals = p_vals.loc[categories_order,categories_order]
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',ax=ax,
               cbar_kws={'shrink':0.85,'label':s_add})
    #temp
    # NOTE(review): n is the number of rows in df_rna; corr() uses pairwise-complete
    # observations, so the effective n for the score column may be smaller.
    plt.title(f'{s_add} n={len(df_rna)}')
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    # mask the strict upper triangle (the diagonal stays visible)
    matrix = np.triu(np.ones_like(rho))
    np.fill_diagonal(matrix, val=0)
    # Blank the diagonal on an explicit copy: writing through `p_vals.values` fails
    # under pandas Copy-on-Write, where .values is a read-only view.
    a_annot = p_vals.to_numpy(dtype=object, copy=True)
    np.fill_diagonal(a_annot, '')
    p_vals = pd.DataFrame(a_annot, index=p_vals.index, columns=p_vals.columns)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',
                ax=ax,mask=matrix,cbar_kws={'shrink':0.85,'label':'Pearson Correlation'},
               ) #'anchor':(-1.4,0.0)

In [None]:
# Add cohort: the patient ID is the specimen ID (df_rna index) minus its last two
# characters; the cohort is then looked up by patient ID.
# NOTE(review): presumably the stripped characters are a '-T' / '-M' suffix - verify
# that no specimen ID ends in a longer suffix.
df_rna['Public_Patient_ID'] = df_rna.index.map(lambda s_specimen: s_specimen[0:-2])
d_cohort = dict(zip(df_patient.Public_Patient_ID, df_patient.Cohort))
df_rna['Cohort'] = df_rna.Public_Patient_ID.map(d_cohort)

## gene expression versus liver lung

In [None]:
#liver vs lung: marker gene expression by cohort, all samples in df_rna
ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1','CD68','OLR1','MRC1','MX1','STAT1',]
# Long format: one row per (gene, specimen). The former first assignment, which
# filtered on df_tcell, was immediately overwritten by this one and has been removed,
# so this cell no longer needs the T cell data.
df_plot = df_rna.loc[:,ls_marker].unstack().reset_index()
df_plot['Public_Patient_ID'] = [item[0:-2] for item in df_plot.level_1]
df_plot['Cohort'] = df_plot.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
df_plot.rename({0:'Expression','level_0':'Gene'},axis=1,inplace=True)
fig,ax = plt.subplots(dpi=200,figsize=(5,3))
sns.stripplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',dodge=True,ax=ax,s=2)
# boxplot with everything hidden except the mean line
sns.boxplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',ax=ax,showmeans=True,medianprops={'visible': False},
                       whiskerprops={'visible': False},meanline=True,showcaps=False,
                       meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
# keep only the first two legend entries (strip and box plots both add handles)
h, l = ax.get_legend_handles_labels()
ax.legend(h[0:2],l[0:2],loc='lower left')
pairs = [((item,'Lung'),(item,'Liver')) for item in ls_marker]
annot = Annotator(ax, pairs, data=df_plot,x='Gene',y='Expression',hue='Cohort',
                  order=ls_marker,hue_order=('Lung','Liver'))
annot.configure(test='t-test_ind')
annot.apply_and_annotate()
ax.set_title('Liver vs Lung')
plt.tight_layout()

In [None]:
#liver vs lung (t cell): same plot, restricted to patients present in the TCR table
ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1','CD68','OLR1','MRC1','MX1','STAT1',]
b_has_tcr = df_rna.Public_Patient_ID.isin(df_tcell.Public_Patient_ID)
# long format: one row per (gene, specimen)
df_plot = df_rna.loc[b_has_tcr,ls_marker].unstack().reset_index()
df_plot['Public_Patient_ID'] = [s_specimen[0:-2] for s_specimen in df_plot.level_1]
d_cohort = dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort))
df_plot['Cohort'] = df_plot.Public_Patient_ID.map(d_cohort)
df_plot = df_plot.rename(columns={0:'Expression','level_0':'Gene'})

fig,ax = plt.subplots(dpi=200,figsize=(5,3))
sns.stripplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',dodge=True,ax=ax,s=2)
# boxplot drawn with only its mean line visible
sns.boxplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',ax=ax,
            showmeans=True,meanline=True,
            meanprops={'color': 'k', 'ls': '-', 'lw': 2},
            medianprops={'visible': False},
            whiskerprops={'visible': False},
            showcaps=False,showfliers=False,showbox=False)
# keep only the first two legend entries
h, l = ax.get_legend_handles_labels()
ax.legend(h[0:2],l[0:2],loc='lower left')

pairs = [((s_gene,'Lung'),(s_gene,'Liver')) for s_gene in ls_marker]
annot = Annotator(ax, pairs, data=df_plot,x='Gene',y='Expression',hue='Cohort',
                  order=ls_marker,hue_order=('Lung','Liver'))
annot.configure(test='t-test_ind')
annot.apply_and_annotate()
ax.set_title('Liver vs Lung: T cell samples')
plt.tight_layout()

## Section 6 <a name="tcell"></a>

re-analyze t cell data

289 blood samples with matching 175 primary
tumors (141 overlapping with the RNA-seq dataset) and 43 metastatic tumors (33 overlapping with
the RNA-seq dataset). (218)

**missing 2 tumor (have 216), all 289 blood there**

290 unique patients (one primary tumor w/o blood: ST-00018360)

215 blood and tumor are matched

TOTAL = 174 primary, 42 met (216 total)

Of the 290 patients, 284 are analyzed (some dropped surgery)

we analyzed blood samples from 77 patients in the liver cohort and 16
patients in the lung cohort, of which 60 and 16 were matched with tumor samples from the same
patient, respectively

TOTAL = 94 blood
TOTAL = 76 tumor

tumor distinct clones
used data from 214 matched pairs of tumor and blood samples

**213 are there, missing 1**

(TCR tumor: 59 and 16)

#### Exclude those who died of surgery

yes      ST-00018963

yes      ST-00020077

yes     ST-00016968

yes     ST-00006625

yes     ST-00007146

yes    ST-00018260

#### Actually analyzed 
Liver    76
Lung     16

| Cohort  | Tumor Type  |  number pts.  |
|---------|-------------|---------------|
| Liver   | Met         | 17            |
|         | Primary     | 42            |
| Lung    | Met         | 3             |
|         | Primary     | 13            |

[contents](#contents)

In [None]:
# Load patient metadata and TCR sample sheets, then select the patient rows that
# belong to TCR patients (one specimen row per patient where possible).
s_out = '20230815_Patient_Metadata.csv'
df_patient = pd.read_csv(s_out, index_col=0)

s_supp_dir = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData'
# old patient data
df_patient_old = pd.read_excel(f'{s_supp_dir}/SupplementalDataset1.xlsx',
                               sheet_name='Patients - Tab 1')

# T cell data - tumor
s_tcr_xlsx = f'{s_supp_dir}/SupplementalDataset6.xlsx'
df_tcell = pd.read_excel(s_tcr_xlsx, sheet_name='Tumor Samples')
print(len(df_tcell))
# T cell data - blood
df_tcell_blood = pd.read_excel(s_tcr_xlsx, sheet_name='Blood Samples')
df_tcell_blood['Tumor Type'] = 'Blood'
print(len(df_tcell_blood))
# both: one row per patient, shared columns suffixed ' tumor' / ' blood'
df_tcell = df_tcell.merge(df_tcell_blood,on='Patient ID',suffixes=(' tumor',' blood'),how='outer')
df_tcell = df_tcell.rename(columns={'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
                                    'Templates per ng':'Templates_per_ng',
                                    'Patient ID':'Public_Patient_ID'})
print(len(df_tcell))

# six TCR patients have primary AND met in full patient data:
# keep the patient row whose type matches the TCR tumor sample, drop the other
df_matched = df_patient[df_patient.Public_Patient_ID.duplicated()]
df_tcell_matched = df_tcell[df_tcell.Public_Patient_ID.isin(df_matched.Public_Patient_ID)]
se_tcr_type = df_tcell_matched.loc[:,'Tumor Type tumor']
ls_primary = df_tcell_matched.loc[se_tcr_type == 'Primary','Public_Patient_ID']
ls_met = df_tcell_matched.loc[se_tcr_type == 'Met','Public_Patient_ID']
# a row without a Primary (resp. Met) pSUB score is treated as the met (resp. primary) row
b_drop_mets = df_patient.Public_Patient_ID.isin(ls_primary) & df_patient.loc[:,'pSUB1e-04_Primary'].isna()
b_drop_pri = df_patient.Public_Patient_ID.isin(ls_met) & df_patient.loc[:,'pSUB1e-04_Met'].isna()
ls_drop_mets = df_patient.index[b_drop_mets].tolist()
ls_drop_pri = df_patient.index[b_drop_pri].tolist()
df_merge = df_patient[df_patient.Public_Patient_ID.isin(df_tcell.Public_Patient_ID)]
df_merge = df_merge[~df_merge.index.isin(ls_drop_mets + ls_drop_pri)]
print(len(df_merge))

#mysteries of pandas
# missing_ids: some were dropped surgery?
# TCR patients that lost every row above are added back with all their patient rows
b_missing = ~df_tcell.Public_Patient_ID.isin(df_merge.Public_Patient_ID)
missing_ids = df_tcell.loc[b_missing,'Public_Patient_ID']
df_not_matched = df_patient[df_patient.Public_Patient_ID.isin(missing_ids)]
df_merge = pd.concat([df_merge,df_not_matched])

print((df_merge.Public_Patient_ID.nunique()))
print(df_merge.Cohort.value_counts())
print(df_merge.groupby(['Cohort','Tumor Type']).count().Public_Patient_ID)

# specimen counts by type: patient table first, TCR table second
for s_tumor_type in ['Primary','Met']:
    print((df_merge.loc[:,'Tumor Type'] == s_tumor_type).sum())
    print((df_tcell.loc[:,'Tumor Type tumor'] == s_tumor_type).sum())

## add calculated Entropy, simpsons 

In [None]:
#load data
# Maps each metrics CSV in data/ to the columns taken from it (patient ID + metrics).

d_load = {'TCR_Tumor_Distinct_Clones_no_rare.csv':['Public_Patient_ID',
                'Number Tumor Distinct Clones', 'Fraction Tumor Distinct Clones','Prod. Freq. Tumor Distinct Clones'],
          'TCR_Tumor_Distinct_Rearrangements_no_rare.csv':['Public_Patient_ID',
        'Number Tumor Distinct Rearrangements','Fraction Tumor Distinct Rearrangements','Prod. Freq. Tumor Distinct Rearrangements'],
          'TCR_Tumor_Distinct_Clones.csv':['Public_Patient_ID','Number Tumor Distinct Clones (rare)','Fraction Tumor Distinct Clones (rare)'],
          'TCR_Simpsons_Evenness_templates.csv':['Public_Patient_ID','Simpsons_Evenness','Simpsons_Evenness_no_CMV'],
          'TCR_Simpsons_D_templates.csv':['Public_Patient_ID','Simpsons_D'],
          'TCR_Shannon_entropy_productive_frequency.csv':['Public_Patient_ID','Shannon_Entropy','Normalized_Shannon_Entropy'],
         }
for s_en, ls_markers in d_load.items():
    print(s_en)
    df_en = pd.read_csv(f'data/{s_en}')
    # the unnamed first CSV column holds the specimen ID
    df_en.rename({'Unnamed: 0':'Patient_Specimen_ID','Shannon Entropy':'Shannon_Entropy',#'Percent Tumor Distinct Rearrangements in Tumor':'Percent Tumor Distinct Clones (JL)',   
             'Normalized Shannon Entropy':'Normalized_Shannon_Entropy'},axis=1,inplace=True)
    if s_en == 'TCR_Tumor_Distinct_Clones.csv':
        # this file includes rare clones; tag its columns so they do not clash with the no_rare file
        df_en.rename({'Unnamed: 0':'Patient_Specimen_ID','Fraction Tumor Distinct Clones in Tumor':'Fraction Tumor Distinct Clones (rare)',
                     'Number Tumor Distinct Clones':'Number Tumor Distinct Clones (rare)'},axis=1,inplace=True)
    # last dash-separated token of the specimen ID; 'B' is treated as blood below
    df_en['dash_end'] = [item.split('-')[-1] for item in df_en.Patient_Specimen_ID]
    # patient ID = first two dash-separated tokens of the specimen ID
    df_en['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_en.Patient_Specimen_ID]
    #patient rows with columns for blood and tumor
    df_en_pat = pd.DataFrame(index=df_en.Public_Patient_ID.unique())
    df_tum = df_en[df_en.dash_end!='B'].loc[:,ls_markers].set_index('Public_Patient_ID')
    df_en_pat = df_en_pat.merge(df_tum,left_index=True,right_index=True,how='left')
    if s_en.find('Tumor_Distinct') == -1:
        # files other than the Tumor_Distinct ones also get the blood values; the
        # identical column names are separated by the '_Tumor' / '_Blood' suffixes
        df_bld = df_en[df_en.dash_end=='B'].loc[:,ls_markers].set_index('Public_Patient_ID')
        df_en_pat = df_en_pat.merge(df_bld,left_index=True,right_index=True,how='left',suffixes=('_Tumor','_Blood'))
    # NOTE(review): a patient with more than one non-blood specimen in a file would
    # add duplicate rows to df_merge here - confirm each patient has one.
    df_merge = df_merge.merge(df_en_pat.reset_index().rename({'index':'Public_Patient_ID'},axis=1),on='Public_Patient_ID',how='left')

    
# derived metrics: diversity is the reciprocal of Simpson's D; clonality is 1 - normalized Shannon entropy
df_merge['Simpsons_Diversity_Tumor'] = 1/df_merge.Simpsons_D_Tumor#1-df_merge.Simpsons_D_Tumor#
df_merge['Simpsons_Diversity_Blood'] = 1/df_merge.Simpsons_D_Blood#1-df_merge.Simpsons_D_Blood#
df_merge['Clonality_Tumor'] = 1 - df_merge.Normalized_Shannon_Entropy_Tumor
df_merge['Clonality_Blood'] = 1 - df_merge.Normalized_Shannon_Entropy_Blood

#met or primary
df_merge['Percent Tumor Distinct Clones'] = 100 - df_merge.loc[:,'Percentage Tumor-Distinct Clones in Paired Tumor Sample']
df_merge.rename({'Patient Specimen ID':'Patient_Specimen_ID','Tumor Type':'Tumor_Type'},axis=1,inplace=True)
print((df_merge.Public_Patient_ID.nunique()))

In [None]:
#add pORG quantile groups (quartiles, tertiles, medians) as columns of df_merge
s_porg = 'pORG_0.2_All' # 'pORG_0.2_Primary' #  'pORG_0.2_Met'#   #select one

# Scores of the samples that have a value for the selected pORG column,
# and the row labels of those samples.
x = df_merge.loc[:,s_porg].dropna()
b_cut = x.index
print(len(x))

# new column name -> (number of equal-frequency bins, bin labels)
d_cut = {'quartiles':(4,['low','med-low','med-high','high']),
         'tertiles' : (3,['low','med','high']),
        'medians' : (2,['low','high'])}
# Middle groups that are blanked out so only the extreme groups remain;
# columns not listed here (medians) keep every group.
d_drop = {'quartiles':['med-low','med-high'],
          'tertiles':['med']}

for s_col, tu_cut in d_cut.items():
    i_cut = tu_cut[0]
    labels = tu_cut[1]
    q = pd.qcut(x, q=i_cut,labels=labels)
    # Work on plain strings: np.nan replaces the np.NaN alias removed in NumPy 2.0,
    # and masking an object Series avoids replace() on a categorical.
    se_group = q.astype(object)
    se_group = se_group.where(~se_group.isin(d_drop.get(s_col,[])),np.nan)
    # Samples without a score stay NaN; object dtype lets the labels be
    # assigned without an implicit float -> object upcast.
    df_merge[s_col] = pd.Series(np.nan,index=df_merge.index,dtype=object)
    df_merge.loc[b_cut,s_col] = se_group
    print(df_merge[s_col].value_counts())
    

In [None]:
# TCR repertoire metrics versus cohort and pORG quartile (violin plots) and
# versus the selected pORG score (Pearson scatter plots).
# the greater the absolute value of the slope, the greater the diversity.
df_pri = df_merge.copy()#.drop(236) #pr outlier 
if s_porg.find('Primar') > -1:
    print('Primaries')
    df_pri = df_merge.loc[(df_merge.Tumor_Type=='Primary')].copy()
elif s_porg.find('All') > -1: 
    print('Primaries and Mets')
elif s_porg.find('Met') > -1:
    df_pri = df_merge.loc[df_merge.Tumor_Type=='Met'].copy()
print(df_pri.Specimen_Type.unique())

alpha = 0.05
b_correct= False #True #
ls_foci = ['Productive_Rearrangements','Templates_per_ng',#'Normalized_Shannon_Entropy_Tumor', #'Normalized_Shannon_Entropy_Blood',#'Number Tumor Distinct Clones (rare)',  #'Number Tumor Distinct Clones', 
    'Shannon_Entropy_Tumor', 'Shannon_Entropy_Blood',
    'Clonality_Tumor','Clonality_Blood',
    'Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood',
    "Simpson's Evenness tumor", "Simpson's Evenness blood",
    'Fraction Tumor Distinct Clones','Prod. Freq. Tumor Distinct Clones',
    'Fraction Tumor Distinct Clones (rare)','Percent Tumor Distinct Clones',
    ]
# Metrics passed to util with s_stats='non-parametric'; all others use 'mean'.
se_non_para = pd.Series(['Simpsons_Diversity_Blood', #non- parametric:
    'Templates_per_ng','Productive_Rearrangements',
     "Simpson's Evenness tumor", "Simpson's Evenness blood",
   'Number Tumor Distinct Clones'])
d_order =  {'Cohort':['Liver','Lung'],#'PurIST Subtype':['basal-like','classical'],
    'quartiles':['high','low']}

# Pearson panels to draw for the selected score; None means no scatter plot
# (previously an unmatched s_porg hit undefined fig2/pvalues2 in the loop).
if s_porg.find('All') > -1:
    ls_plots = ['Primaries','Mets','Both']
elif s_porg.find('Primar') > -1:
    ls_plots = ['Primaries']
elif s_porg.find('Met') > -1:
    ls_plots = ['Mets']
else:
    ls_plots = None

for s_foci in ls_foci:
    print(s_foci)
    s_stats = 'non-parametric' if se_non_para.isin([s_foci]).any() else 'mean'
    df_both,d_pval,order,ls_ticks = util.violin_stats(df_pri,d_order,s_foci,s_stats)
    #util.qq_plot_hist(df_pri,s_cat,s_foci)  #anova eval
    fig,pvalues,corrected = util.plot_violins(df_both,d_pval,d_order,s_stats,s_foci,order,ls_ticks,b_correct=b_correct)#True#False
    fig.savefig(f'figures/violinplot_both_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
    # The figure is always saved; it is closed when no p-value reaches alpha.
    if np.array(pvalues).min() > alpha:
        plt.close(fig)
    #and pearson
    if ls_plots is not None:
        fig2, pvalues2 = util.plot_pearson(df_pri,s_porg,s_foci,s_stats,ls_plots=ls_plots)
        if s_porg.find('All') > -1:
            # Save the scatter figure itself (fig2); this used to write the
            # violin figure (fig) under the scatterplot file name.
            fig2.savefig(f'figures/scatterplot_pearson_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
        if np.array(pvalues2).min() > alpha:
            plt.close(fig2)
    # NOTE(review): this break stops after the first metric in ls_foci; it looks
    # like a debugging leftover -- remove it to process every metric.
    break
    

## TCR survival



In [None]:
# Provenance of Mets_In_TCR.csv (kept for reference, not executed):
# df_mets = pd.read_csv(f'Mets_In_TCR_OPTR.csv',index_col=0)
# df_mets.rename({'Participant_ID':'OPTR'},axis=1,inplace=True)
# df_id = df_id.rename({'Biolibrary.Subject.ID':'Public_Patient_ID',
#                                                           'Public.Specimen.ID':'Public_Specimen_ID'},axis=1)
# b_met = df_id.Public_Specimen_ID.str.contains('-M')
# b_pri = df_id.Public_Specimen_ID.str.contains('-T')
# d_id_met = dict(zip(df_id.loc[b_met,'OPTR'],df_id.loc[b_met,'Public_Patient_ID']))
# d_id_pri = dict(zip(df_id.loc[b_pri,'OPTR'],df_id.loc[b_pri,'Public_Patient_ID']))
# df_mets['Public_Patient_ID'] = df_mets.OPTR.map(d_id_met)
# df_mets.drop('OPTR',axis=1).to_csv('Mets_In_TCR.csv')

# Left-join the mets table onto df_merge; columns of df_mets whose names
# already exist in df_merge receive the '_x' suffix.
df_mets = pd.read_csv('Mets_In_TCR.csv', index_col=0)
df_merge = df_merge.merge(df_mets, how='left', on='Public_Patient_ID', suffixes=('', '_x'))

# Fill gaps in the survival columns from the corresponding fallback columns.
for s_target, s_fallback in [('Survival', 'Vital Status'),
                             ('Survival_time', 'Overall Survival (Days)')]:
    df_merge[s_target] = df_merge[s_target].fillna(df_merge[s_fallback])

In [None]:
# Kaplan-Meier curves (quantile groups) and single-variable Cox models for each
# TCR metric; the Cox hazard ratios and p-values are collected in df_result.
import importlib  # needed by reload(); the notebook's import cell only has it commented out
importlib.reload(util)
ls_foci = ['Templates_per_ng','Productive_Rearrangements',
    'Shannon_Entropy_Tumor', 'Shannon_Entropy_Blood',
           'Clonality_Tumor', 'Clonality_Blood',
           'Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood',
            "Simpson's Evenness tumor",   "Simpson's Evenness blood",
    'Fraction Tumor Distinct Clones (rare)', 'Percent Tumor Distinct Clones', 
          ]
s_time = 'Survival_time'#'Days from Diagnosis to FU'#
s_censor = 'Survival'
alpha = 0.05
s_title_str = 'Met' # 'All'#'Primary'#
savedir = 'figures'
# Default grouping is quartiles; the 'Met' subset uses tertiles instead.
i_cut=4
labels= ['low','med-low','med-high','high'] # ['low','highX','high']#
if s_title_str == 'All':
    df_km_samples = df_merge
elif s_title_str == 'Met':
    i_cut=3
    labels=['low','med','high']
    df_km_samples = df_merge.loc[df_merge.loc[:,'Tumor_Type'] == s_title_str,[s_time,s_censor]+ls_foci]
else:
    df_km_samples = df_merge.loc[df_merge.loc[:,'Tumor_Type'] == s_title_str,[s_time,s_censor]+ls_foci]

# Collect per-metric Cox summaries in a list and concatenate once at the end.
ls_result = []
for s_col in ls_foci:
    print(s_col)
    df_km, pvalue = util.quartile_km(df_km_samples,s_col,s_title_str,savedir,alpha,i_cut,
                                        labels,s_time=s_time,s_censor=s_censor)
    print(pvalue)
    try: #cox
        df = df_km_samples.loc[:,[s_col,s_time,s_censor]].dropna()
        fig, cph = util.cph_plot(df,s_col,s_time,s_censor,figsize=(4,1.5))
        plt.tight_layout()
        fig.savefig(f'figures/CPH_TCR_{s_col}_{s_title_str}.png')
        df_result_model = cph.summary.loc[:,['exp(coef)','p']].reset_index()
        ls_result.append(df_result_model)
    except Exception as e:
        # Keep going with the remaining metrics, but say which model failed and why
        # (a bare except used to print an empty line and also swallowed Ctrl-C).
        print(f'CPH model failed for {s_col}: {type(e).__name__}: {e}')
    #break
df_result = pd.concat(ls_result) if ls_result else pd.DataFrame()

In [None]:
# Cox results for the selected population, indexed by covariate and ordered by p-value.
print(s_title_str)
df_result.set_index('covariate').sort_values(by='p')

### TCR heatmap

In [None]:
#heatmap: clustered, column-wise z-scored TCR metrics with one color strip per
#annotation (tumor type, cohort, pORG median group, survival interval)
df_merge['Survival interval'] = pd.qcut(df_merge.Survival_time,6)
df_merge.rename({'medians':f'{s_porg} (median)'},axis=1,inplace=True)
ls_annot = ['Tumor_Type', 'Cohort',f'{s_porg} (median)' ,'Survival interval']
# plt.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
cmap=plt.get_cmap('Blues', 6)
# One palette per entry of ls_annot, in the same order.
ls_color=[mpl.cm.tab10.colors,mpl.cm.Set1.colors,mpl.cm.Set2.colors,
          #mpl.cm.Set3.colors,
          #mpl.cm.Paired.colors,mpl.cm.Pastel1.colors
         [cmap(item) for item in np.arange(0,1,.17)]]
df_annot = pd.DataFrame()
dd_color = {}
for idx, s_annot in enumerate(ls_annot):
    color_palette = ls_color[idx]
    # Sorted observed levels -> first len(levels) colors of the palette.
    ls_level = sorted(df_merge.loc[:,s_annot].dropna().unique())
    d_color = dict(zip(ls_level,color_palette[0:len(ls_level)]))
    d_color.update({'NA':'lightgray'})
    network_colors = df_merge.loc[:,s_annot].astype('object').fillna('NA').map(d_color)
    df_annot[s_annot] = network_colors
    dd_color.update({s_annot:d_color})
g = sns.clustermap(df_merge.loc[:,ls_foci].dropna(),z_score=1,dendrogram_ratio=0.1,cmap='RdBu_r',vmin=-5,vmax=5,
      cbar_pos=(0.01, 0.94, 0.04, 0.08),figsize=(8, 10),row_colors=df_annot,cbar_kws={'label': 'z-score'})
# Build the legend from zero-size bars: a blank entry separates the annotations.
for s_annot, d_color in dd_color.items():
    g.ax_col_dendrogram.bar(0, 0, color='w',label=' ', linewidth=0)
    # c_label instead of `color`: the old loop variable overwrote the
    # skimage `color` module imported at the top of the notebook.
    for label,c_label in d_color.items():
        g.ax_col_dendrogram.bar(0, 0, color=c_label,label=label, linewidth=0)
g.ax_heatmap.set_yticks([])
l1 = g.ax_col_dendrogram.legend(loc="right", ncol=1,bbox_to_anchor=(0, 0.7),bbox_transform=plt.gcf().transFigure)
g.ax_heatmap.set_title(s_porg)
g.savefig(f'figures/heatmap_TCR_{s_porg}.png')
plt.close(g.fig)

In [None]:
#high vs low pORG quartile versus gene expression (only run for the primary-tumor score)
if s_porg == 'pORG_0.2_Primary':
    # Patient ID = RNA sample name up to the '-T' marker.
    df_rna['Public_Patient_ID'] = [s_sample.split('-T')[0] for s_sample in df_rna.index]
    #liver vs lung (run t cell section to load t cell data!)
    ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1','CD68','OLR1','MRC1','MX1','STAT1',]

    # Long table (gene, sample, expression) for patients that are also in df_merge.
    b_in_merge = df_rna.Public_Patient_ID.isin(df_merge.Public_Patient_ID)
    df_plot = df_rna.loc[b_in_merge,ls_marker].unstack().reset_index()
    df_plot['Public_Patient_ID'] = [s_sample[0:-2] for s_sample in df_plot.level_1]
    d_quartile = dict(zip(df_pri.Public_Patient_ID,df_pri.quartiles))
    df_plot['pORG'] = df_plot.Public_Patient_ID.map(d_quartile)
    df_plot.rename({0:'Expression','level_0':'Gene'},axis=1,inplace=True)

    # Points per sample plus a box plot reduced to its mean line.
    fig,ax = plt.subplots(dpi=200,figsize=(5,3))
    sns.stripplot(data=df_plot,x='Gene',y='Expression',hue='pORG',dodge=True,ax=ax,s=2)
    sns.boxplot(data=df_plot,x='Gene',y='Expression',hue='pORG',ax=ax,showmeans=True,
                meanline=True,meanprops={'color': 'k', 'ls': '-', 'lw': 2},
                medianprops={'visible': False},whiskerprops={'visible': False},
                showcaps=False,showfliers=False,showbox=False)
    h, l = ax.get_legend_handles_labels()
    ax.legend(h[0:2],l[0:2],loc='lower left')

    # t-test low vs high within each gene, then label with FDR (BH) corrected p-values.
    pairs = [((s_gene,'low'),(s_gene,'high')) for s_gene in ls_marker]
    annot = Annotator(ax, pairs, data=df_plot,x='Gene',y='Expression',hue='pORG',
                      order=ls_marker,hue_order=('low','high'))
    annot.configure(test='t-test_ind', text_format='simple',fontsize=7,comparisons_correction='fdr_bh') #
    annot.apply_test()
    d_pval = {res.data.group1[0]:res.data.pvalue for res in annot.annotations}
    pvalues = [d_pval[s_gene] for s_gene in ls_marker]
    reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
    formatted_pvalues = [f'p={p_adj:.2}' for p_adj in corrected]
    annot.set_custom_annotations(formatted_pvalues)
    annot.annotate()

    ax.set_title(f'{s_porg} Quartiles') 
    plt.tight_layout()
    fig.savefig(f'figures/gene_expression_binary_{s_porg}_Quartiles.png')
    plt.close(fig)

In [None]:
# s_time = 'Survival_time'
# s_censor = 'Survival'
# s_porg = 'trim_padj_0.2_pORG_Up_55_Genes'

# for s_type in ["Simpson's Evenness tumor","Simpson's Evenness blood"]:
#     s_bld = s_type.split(' ')[-1]
#     for idx, s_cat in enumerate(d_order.keys()):
#         df = df_merge.loc[~df_merge.loc[:,s_type].isna(),[s_cat,s_time,s_censor,s_porg]].dropna(how='any')
#         s_rename = f'{s_cat} TCR {s_bld}'
#         df.rename({s_cat:s_rename},axis=1,inplace=True)
#         fig, __ = util.km_plot(df,s_rename,s_time,s_censor)
#         fig.savefig(f'figures/KM_clinicopath_{s_foci}_{s_cat}.png')
#         #PLOT pORG
#         ls_order = df.loc[:,s_rename].unique()
#         #define samples
#         group1 = df[df[s_rename]==ls_order[0]]
#         group2 = df[df[s_rename]==ls_order[1]]
#         #perform independent two sample t-test
#         statistic, pvalue = stats.ttest_ind(group1[s_porg], group2[s_porg])
#         fig,ax=plt.subplots(figsize=(3,3),dpi=200)
#         sns.boxplot(data=df,x=s_rename,y=s_porg,showfliers=False)
#         sns.stripplot(data=df,x=s_rename,y=s_porg,palette='dark')
#         ax.set_title(f'{s_rename} p={pvalue:.03}')
#         #break
#     #break


In [None]:
# df_both.dropna().groupby(['x','hue']).count()
# #35, 35 low pORG primary!!
# #16 lung, 59 liver 
# # 45 basal like

## Section 7 <a name="split"></a>


GSEA plots. I think we should say we are showing any pathway with FDR q < 0.15 and NES > 1.5 for all four datasets, so if a pathway doesn’t meet these cutoffs, it is not shown as a bar in the two graphs: liver/lung and high/low pORG, and high/low PurIST and high/low pSUB. 

[contents](#contents)

In [None]:
#load GSEA results: one workbook per comparison, read from its GSEA_UP and GSEA_DN sheets
ls_columns = ['NAME', 'SIZE', 'ES', 'NES', 'NOM.p.val', 'FDR.q.val', 'FWER.p.val',#'RANK.AT.MAX'
       ]
ls_gsea = [
    'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score_h.xlsx',
    'basal-like_vs_classical_h.xlsx',
    'LiverCohort_vs_LungNotLiverCohort_h.xlsx',
    'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4_h.xlsx',
    'Top4th_pSUB.1eNeg5_vs_Bottom4th_pSUB.1eNeg5_h.xlsx',
    'Top4th_pORG.20_vs_Bottom4th_pORG.20_h.xlsx',
]
# d_en maps '<comparison>_UP' / '<comparison>_DN' to the selected columns of the matching sheet.
d_en = {}
for s_gsea in ls_gsea:
    d_load = pd.read_excel(f'data/{s_gsea}', sheet_name=None)  # dict: sheet name -> DataFrame
    s_prefix = s_gsea.split('_h.xlsx')[0]
    df_up = d_load['GSEA_UP'].loc[:, ls_columns]
    df_down = d_load['GSEA_DN'].loc[:, ls_columns]
    d_en[f"{s_prefix}_UP"] = df_up
    d_en[f"{s_prefix}_DN"] = df_down

In [None]:
# d_en_old = pd.read_excel(f'../SupplementalDataset4.xlsx',sheet_name=None)
# d_en.update(d_en_old)

In [None]:
# Legend text for each GSEA comparison key (several keys share one label).
d_labels = {
    'TopVsBot4th_PurIST': 'Top vs Bottom Quartile by PurIST',
    'TopVsBot4th_pSUB': 'Top vs Bottom Quartile by pSUB',
    'basal-like_vs_classical': 'Basal-Like vs. Classical',
    'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score': 'Top vs Bottom Quartile by PurIST',
    'TopVsBot4th_pORG': 'Top vs Bottom Quartile by pORG 78 gene',
    'LungVsLiver': 'Liver Cohort vs. Lung Cohort',
    'LiverCohort_vs_LungNotLiverCohort': 'Liver Cohort vs. Lung Cohort',
    'Top4th_pORG.14_vs_Bottom4th_pORG.14': 'Top vs Bottom Quartile by pORG 0.14',
    'Top4th_pORG.20_vs_Bottom4th_pORG.20': 'Top vs Bottom Quartile by pORG 0.2',
    'Top4th_pSUB.1eNeg5_vs_Bottom4th_pSUB.1eNeg5': 'Top vs Bottom Quartile by pSUB 1e-5',
    'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4': 'Top vs Bottom Quartile by pSUB 1e-4',
}

In [None]:
# Curated hallmark pathways, in the order they are drawn in the GSEA bar plots.
# An earlier six-pathway selection was assigned here and immediately overwritten
# (dead store); it is kept as a comment for reference only:
#   ['HALLMARK_PEROXISOME','HALLMARK_XENOBIOTIC_METABOLISM', #0.05
#    'HALLMARK_PANCREAS_BETA_CELLS', 'HALLMARK_BILE_ACID_METABOLISM',
#    'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION','HALLMARK_GLYCOLYSIS']
sorter_combined = ['HALLMARK_XENOBIOTIC_METABOLISM',
  'HALLMARK_PEROXISOME',
  'HALLMARK_FATTY_ACID_METABOLISM',
  'HALLMARK_BILE_ACID_METABOLISM',
  'HALLMARK_PANCREAS_BETA_CELLS','HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
  'HALLMARK_APICAL_JUNCTION',
  'HALLMARK_HYPOXIA',
  'HALLMARK_GLYCOLYSIS']

#sorter_all

In [None]:
#subtype: horizontal NES bars for the curated hallmark pathways (sorter_combined),
#one bar per pathway and comparison, colored by FDR q-value
b_manual = True #True #False  (True: take the curated pathways; False: take pathways passing alpha)
sorter_all = {}
#Rosie ask: rank by NES in pORG
d_p = {}
es_add = set()#set(['HALLMARK_MYOGENESIS'])
s_pval = 'FDR.q.val'#'NES'#'NOM.p.val'#
alpha = 0.15#0.2#
df_all = pd.DataFrame()
es_marker = set()
ls_plot_items = [#'basal-like_vs_classical',
    #'Top4th_pSUB.1eNeg5_vs_Bottom4th_pSUB.1eNeg5',
    'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4',
    'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score'
                ]

for s_comp in ls_plot_items:
    df_plot_long = pd.DataFrame()
    for s_direction in ['UP','DN']:
        s_compare =f'{s_comp}_{s_direction}'
        print(s_compare)
        df_sheet = d_en[s_compare]
        #find the gene sets up or down that pass the significance cutoff
        df_plot_o = df_sheet[(df_sheet.loc[:,s_pval]<alpha)].sort_values(by='NES',ascending=False)
        if s_direction == 'DN':
            print('down')
            sorter = df_plot_o.sort_values(by='NES',ascending=False).NAME.tolist()
        else:
            print('up')
            sorter = df_plot_o.sort_values(by='NES').NAME.tolist()
        #override: plot the curated pathways regardless of the cutoff
        if b_manual:
            df_plot = df_sheet.loc[df_sheet.NAME.isin(sorter_combined),ls_columns]
        else:
            df_plot = df_plot_o
        print(len(df_plot))
        df_plot_long = pd.concat([df_plot_long,df_plot])
        es_marker = es_marker.union(set(df_plot.NAME)).union(es_add)
        sorter_all.update({s_compare:sorter})
    df_plot_long = df_plot_long.sort_values(by='NES')
    df_plot_long['comparison'] = s_comp
    df_all = pd.concat([df_all,df_plot_long])
    d_p.update({s_comp:df_plot_long.NAME.unique()})
    #break

# Order pathways as in sorter_combined; names outside that list become NaN and are dropped.
df_all.NAME = df_all.NAME.astype('category')
df_all.NAME = df_all.NAME.cat.set_categories(sorter_combined)
# .copy() so the column assignment below does not write into a slice of df_all.
df_plot_bar = df_all[df_all.NAME.isin(sorter_combined)].sort_values(by=['comparison','NAME']).copy()
# Bars and colorbar share one normalization (q-value 0 -> 0.2). Previously the
# bars used a min-max rescale of the data while the colorbar showed 0-0.2, so the
# colorbar did not describe the bar colors. The helper lambda named `rescale`
# is gone as well; it overwrote skimage.transform.rescale imported at the top.
norm = mpl.colors.Normalize(vmin=0,vmax=0.2)
df_plot_bar['FDR_color'] = np.ma.filled(norm(df_plot_bar.loc[:,'FDR.q.val'].to_numpy(dtype=float)),np.nan)
#erase low FDR.q
#df_plot_bar.loc[(abs(df_plot_bar.loc[:,'NES']) < 1.5) | (df_plot_bar.loc[:,'FDR.q.val'] > 0.15),'NES'] = 0
fig, ax = plt.subplots(dpi=200)
my_cmap = plt.get_cmap("Reds_r")
height=0.4
indices = np.arange(len(sorter_combined))
for idx, s_comp in enumerate(ls_plot_items):
    df_comp = df_plot_bar[df_plot_bar.comparison==s_comp].copy()
    df_comp['NAME'] = df_comp['NAME'].astype(str)
    # reindex keeps widths AND colors in the same pathway order; a pathway
    # missing from this comparison yields an empty bar instead of a KeyError.
    df_comp = df_comp.set_index('NAME').reindex(sorter_combined)
    # First comparison: plain bars above the tick; later ones: hatched bars below it.
    ax.barh(y=indices+height/2 if idx == 0 else indices-height/2,
            width=df_comp.loc[:,'NES'],height=height,
            hatch=None if idx == 0 else '//',
            color=[my_cmap(item) for item in df_comp.FDR_color],label=s_comp,
           )
ax.set_yticks(indices)
ax.set_yticklabels(sorter_combined)
mappable = mpl.cm.ScalarMappable(norm=norm, cmap=my_cmap)
fig.colorbar(mappable=mappable,ax=ax,label='FDR.q.val')
ax.set_title('Subtype')
handles = [mpl.patches.Patch(facecolor='lightgray', edgecolor='black',
                         label=d_labels[ls_plot_items[0]]),
                  mpl.patches.Patch(facecolor='lightgray', edgecolor='black',
                         label=d_labels[ls_plot_items[1]],#label='Top vs Bottom Quartile by PurIST',
                                    hatch='//')]
ax.legend(handles=handles,bbox_to_anchor = (1.25,1),markerscale=1,title='Comparison')
ax.set_xlabel('NES')



In [None]:
#organotropism: horizontal NES bars for the pathways enriched (UP, FDR q < alpha) in the
#pORG comparison, drawn for the pORG and liver-vs-lung comparisons and colored by FDR q-value
#Rosie ask: rank by NES in pORG
d_p = {}
es_add = set()#set(['HALLMARK_MYOGENESIS'])
s_pval = 'FDR.q.val'#'NOM.p.val'#
alpha = 0.05#0.2#
df_all = pd.DataFrame()
es_marker = set()
ls_plot_items = ['Top4th_pORG.20_vs_Bottom4th_pORG.20',
 #'Top4th_pORG.14_vs_Bottom4th_pORG.14',
 #'TopVsBot4th_pORG',
   # 'LungVsLiver'
    'LiverCohort_vs_LungNotLiverCohort',
                ]
# Comparisons whose significant UP pathways define the bar order (`sorter`);
# every other comparison/direction only updates `other_sorter`.
es_porg_up = {'Top4th_pORG.14_vs_Bottom4th_pORG.14_UP',
              'Top4th_pORG.20_vs_Bottom4th_pORG.20_UP',
              'TopVsBot4th_pORG_UP'}
for s_comp in ls_plot_items:
    df_plot_long = pd.DataFrame()   # pathways passing alpha
    df_plot_long2 = pd.DataFrame()  # all pathways of this comparison (these get plotted)
    for s_direction in ['UP','DN']:
        s_compare =f'{s_comp}_{s_direction}'
        print(s_compare)
        df_plot = d_en[s_compare][(d_en[s_compare].loc[:,s_pval]<alpha)].sort_values(by='NES',ascending=False)
        if s_compare in es_porg_up:
            sorter = df_plot.sort_values(by='NES').NAME.tolist()
        else:
            other_sorter = df_plot.sort_values(by='NES').NAME.tolist()
        if s_direction == 'DN':
            print(len(df_plot))
        df_plot_long = pd.concat([df_plot_long,df_plot])
        df_plot_long2 = pd.concat([df_plot_long2,d_en[s_compare]])
        es_marker = es_marker.union(set(df_plot.NAME)).union(es_add)
        #break
    df_plot_long = df_plot_long.sort_values(by='NES')
    df_plot_long2['comparison'] = s_comp
    df_plot_long2 = df_plot_long2.sort_values(by='NES',ascending=False)
    df_all = pd.concat([df_all,df_plot_long2])
    d_p.update({s_comp:df_plot_long.NAME.unique()})
    #break
# Order pathways as in sorter; names outside that list become NaN and are dropped by isin().
df_all.NAME = df_all.NAME.astype('category')
df_all.NAME = df_all.NAME.cat.set_categories(sorter)
# .copy() so the column assignments below do not write into a slice of df_all.
df_plot_bar = df_all[df_all.NAME.isin(es_marker)].sort_values(by=['comparison','NAME']).copy()
# Bars and colorbar share one normalization (q-value 0 -> 0.2). Previously the
# bars used a min-max rescale of the data while the colorbar showed 0-0.2, so the
# colorbar did not describe the bar colors. The helper lambda named `rescale`
# is gone as well; it overwrote skimage.transform.rescale imported at the top.
norm = mpl.colors.Normalize(vmin=0,vmax=0.2)
df_plot_bar['FDR_color'] = np.ma.filled(norm(df_plot_bar.loc[:,'FDR.q.val'].to_numpy(dtype=float)),np.nan)
#erase low FDR.q: bars failing |NES| >= 1.5 and FDR q <= 0.15 are drawn with zero width
df_plot_bar.loc[(abs(df_plot_bar.loc[:,'NES']) < 1.5) | (df_plot_bar.loc[:,'FDR.q.val'] > 0.15),'NES'] = 0
fig, ax = plt.subplots(dpi=200)
my_cmap = plt.get_cmap("Reds_r")
height=0.4
indices = np.arange(len(sorter))
for idx, s_comp in enumerate(ls_plot_items):
    df_comp = df_plot_bar[df_plot_bar.comparison==s_comp].copy()
    df_comp['NAME'] = df_comp['NAME'].astype(str)
    # reindex keeps widths AND colors in the same pathway order; a pathway
    # missing from this comparison yields an empty bar instead of a KeyError.
    df_comp = df_comp.set_index('NAME').reindex(sorter)
    # First comparison: plain bars above the tick; later ones: hatched bars below it.
    ax.barh(y=indices+height/2 if idx == 0 else indices-height/2,
            width=df_comp.loc[:,'NES'],height=height,
            hatch=None if idx == 0 else '//',
            color=[my_cmap(item) for item in df_comp.FDR_color],label=s_comp,
           )
ax.set_yticks(indices)
ax.set_yticklabels(sorter)
mappable = mpl.cm.ScalarMappable(norm=norm, cmap=my_cmap)
fig.colorbar(mappable=mappable,ax=ax,label='FDR.q.val')
ax.set_title('Organotropism')
handles = [mpl.patches.Patch(facecolor='lightgray', edgecolor='black',
                         label=d_labels[ls_plot_items[0]]),
                  mpl.patches.Patch(facecolor='lightgray', edgecolor='black',
                         label='Liver vs Lung Cohort',hatch='//')]
ax.legend(handles=handles,bbox_to_anchor = (1.25,1),markerscale=1,title='Comparison')
ax.set_xlabel('NES')



In [None]:
#any liver versus lung?
# Display other_sorter as set by the organotropism cell: the NES-sorted names of
# the pathways passing alpha for the last comparison/direction that is not one
# of the pORG "UP" lists.
other_sorter

## figure 2


In [None]:
#load full patient data
s_out = '20230815_Patient_Metadata.csv'
df_patient= pd.read_csv(s_out,index_col=0)

# pORG score columns; each is used both as the plotted score (s_foci) and as the
# patient subset (s_population = patients with a non-missing value in that column).
ls_foci = ['pORG_0.2_Primary','pORG_0.2_Met',
           'pORG_0.2_All']


# category column -> [first level, second level]; the levels are recoded to
# 'bad'/'good' so that both categories share one hue scheme.
d_order =  {'Cohort':['Liver','Lung'],
            'PurIST Subtype':['basal-like','classical']}
for s_foci in ls_foci:
    for s_population in ls_foci:
        # The 'All' population is only plotted with the 'All' score.
        if s_population == 'pORG_0.2_All' and s_foci != 'pORG_0.2_All':
            continue
        df_pri = df_patient[~df_patient.loc[:,s_population].isna()].copy()
        figsize=(4.7,2.8)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        order = []
        ls_ticks = []
        d_pval = {}
        df_both = pd.DataFrame()
        for idx, s_cat in enumerate(d_order.keys()):
            print(s_cat)
            ls_order = d_order[s_cat]
            s_bad = ls_order[0]
            s_good = ls_order[1]
            d_replace = {s_bad:'bad',s_good:'good'}
            # Two-sample t-test of the score between the two levels of this category.
            a = df_pri.loc[df_pri.loc[:,s_cat]==ls_order[0],s_foci].dropna()
            b =df_pri.loc[df_pri.loc[:,s_cat]==ls_order[1],s_foci].dropna()
            statistic, pvalue = stats.ttest_ind(a,b)#    alternative='greater' )
            print(len(a) + len(b))
            df_pri['hue'] = df_pri.loc[:,s_cat].replace(d_replace)
            df_pri['x'] = s_cat
            df_both=pd.concat([df_both,df_pri.loc[:,['x','hue',s_foci]]])
            for s_test in ls_order:
                order.append((s_cat,d_replace[s_test]))
                ls_ticks.append(s_test)
            d_pval.update({s_cat:pvalue})
        # Violin outline, mean line (box plot with everything else hidden), and points.
        sns.violinplot(data=df_both,y=s_foci,x='x',hue='hue',ax=ax,alpha=0.5,linewidth=1,inner=None,color='white')
        sns.boxplot(data=df_both,y=s_foci,x='x',hue='hue',ax=ax,showmeans=True,medianprops={'visible': False},
                           whiskerprops={'visible': False},meanline=True,showcaps=False,
                           meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
        sns.stripplot(data=df_both,y=s_foci,x='x',hue='hue',s=3,dodge=True,ax=ax,jitter=0.2,
                     palette="Set1",alpha=0.8) #hue='Ki67pos'
        #annotate with FDR (BH) corrected p-values, one per category
        pairs = [(order[0],order[1]),(order[2],order[3])]
        pvalues = [d_pval[list(d_order.keys())[0]],d_pval[list(d_order.keys())[1]]]
        reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
        formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
        try:
            annotator = Annotator(ax, pairs=pairs, data=df_both,y=s_foci,x='x',hue='hue')
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
            ax.legend().remove()
            # Tick positions match the dodged violins: two per category.
            ax.set_xticks([-0.2,0.2, 0.8,1.2])
            ax.set_xticklabels(ls_ticks)
            ax.set_xlabel('')
            ax.set_title(f"pORG in {s_population.split('_')[-1]}", fontsize='x-large') #{s_foci.replace('_',' ')} vs. 
            plt.tight_layout()
            fig.savefig(f'figures/violinplot_both_{s_foci}_{s_population.split("_")[-1]}.png')
        except Exception as e:
            # Still discard the unfinished figure, but report which combination
            # failed and why (a bare except used to hide this and swallow Ctrl-C).
            print(f'violin plot failed for {s_foci} in {s_population}: {type(e).__name__}: {e}')
            plt.close(fig)
        #break

In [None]:
# Number of non-missing values of each pORG score per cohort.
df_patient[ls_foci + ['Cohort']].groupby('Cohort').count()