In [None]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import shutil
import importlib
import matplotlib.pyplot as plt
import re
from skimage import io
import tifffile
from scipy.ndimage import median_filter
from skimage.util import img_as_ubyte,  img_as_float
import skimage
from skimage.feature import blob_dog, blob_log, blob_doh
from math import sqrt
import scipy
import seaborn as sns
from scipy import stats
from skimage.filters import unsharp_mask
from skimage.restoration import (denoise_tv_chambolle, denoise_bilateral,
                                 denoise_wavelet, estimate_sigma)
from skimage import color, morphology
from skimage.transform import rescale
import matplotlib as mpl
import lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import multivariate_logrank_test

from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
from bokeh.palettes import Colorblind
d_colorblind = {'Liver':'#d55e00','Lung':'#0072b2',
               'high':'#e69f00','low': '#56b4e9',
               'basal-like':'#000000','classical':'#cc79a7',
               'high pSUB': '#f0e442','low pSUB':'#009E73'}


# Set Paths
codedir = os.getcwd()
import util

%matplotlib inline

In [None]:
#images: download from synapse.org syn51068458
# (free account required)
# rootdir is where the downloaded images/segmentation live; it defaults to the notebook directory.
rootdir = codedir#'/home/groups/BCC_Chin_Lab/ChinData/Cyclic_Workflow/cmIF_2021-05-03_PDAC'
regdir = f'{rootdir}/RegisteredImages'
segdir = f'{codedir}/Segmentation'
segdiro = f'{rootdir}/Segmentation'

# clone mplex_image at https://gitlab.com/engje/mplex_image
# The clone is looked up two directories above the notebook directory. That directory is
# put on sys.path (instead of chdir-ing into it) so that a failed import cannot leave the
# kernel in the wrong working directory.
s_mplex_parent = os.path.abspath(os.path.join(codedir, '..', '..'))
if s_mplex_parent not in sys.path:
    sys.path.insert(0, s_mplex_parent)
from mplex_image import preprocess, mpimage #, cmif
os.chdir(codedir)

# Table of contents <a name="contents"></a>
0. [functions](#func)
1. [skimage blobs](#sk)
2. [Foci analysis](#focifoci)
3. [mIHC analysis](#mihc)
4. [Patient metadata](#meta)  [Primary vs met](#primet)
4. [CPH modeling](#clin)  [CPH forest plots](#cphplot)
5. [gene expression analysis](#geneexp)
6. [TCR analysis](#tcell) [TCR survival](#tcells)
7. [GSVA violins](#violin2)  [GSEA bar plots](#bars2)
8. [OLD](#split) (not used)

## functions <a name="func"></a> 

import from util.py

[contents](#contents)

##  Section 1: skimage blob detection

Foci are detected here from images plus segmentation masks

**You can skip and use pre-computed foci counts**

 <a name="sk"></a> 

[contents](#contents)

In [None]:
#intensity data: download from synapse.org syn51068458
# Filtered mean-intensity table, read with its first column as the index.
df_mi = pd.read_csv(f'{codedir}/data/20220720_U54-TMA_FilteredMeanIntensity_Link.csv',index_col=0)

In [None]:
#skimage blob detection
#requires images: download from synapse.org syn51068458
# For every slide/scene and every foci marker: measure the mean intensity of each
# segmented nucleus, and for nuclei above the marker-specific cutoff count blobs
# (util.get_blobs2) inside the nucleus bounding box. Counts are stored per cell in
# <marker>_foci columns and stacked over all slides into df_result_all.
threshold=0.002
d_thresh = {'pRPA':1100, 'gH2AX':1100, 'RAD51':1300}  # mean-intensity cutoff per marker
d_sigma = {'gH2AX':(1,2)}  # (min_sigma, max_sigma) overrides; every other marker uses (0.1, 2)
ls_slide = sorted(set(df_mi.slide_scene))
ls_marker = ['pRPA','gH2AX','RAD51',]
s_maskdir = f'{segdiro}/U54-TMA-9_CellposeSegmentation' #change this to path for your downloaded segmentation masks
ls_result = []  # one frame per slide; concatenated once at the end
try:
    for s_slide in ls_slide:
        print(s_slide)
        # first (sorted) mask file whose name contains the slide's nuclei-basins tag
        s_tag = f'{s_slide}_nuc30_NucleiSegmentationBasins'
        ls_seg = [s_file for s_file in sorted(os.listdir(s_maskdir)) if s_tag in s_file]
        if len(ls_seg) == 0:
            raise FileNotFoundError(f'no segmentation mask matching {s_tag} in {s_maskdir}')
        label_image = io.imread(os.path.join(s_maskdir, ls_seg[0]))
        os.chdir(f'{regdir}/{s_slide}') # change this to path to your downloaded registered tifs
        df_img=mpimage.parse_org()
        for idxs, s_marker in enumerate(ls_marker):
            intensity_image = io.imread(df_img[df_img.marker==s_marker].index[0])
            props = skimage.measure.regionprops_table(label_image, intensity_image=intensity_image, properties=('label','bbox','mean_intensity')) # 'image','intensity_image',
            df_props = pd.DataFrame(props,dtype='float').set_index('label')
            # 'bbox-0' -> 'bbox0' so the columns can be read as attributes
            df_props.columns = [item.replace('-','') for item in df_props.columns]
            ls_index = df_props[df_props.mean_intensity>d_thresh[s_marker]].index
            print(f'{s_marker} {len(ls_index)}')
            min_sigma, max_sigma = d_sigma.get(s_marker,(0.1,2))
            for i_cell in ls_index:
                se_cell = df_props.loc[i_cell].dropna().astype('int')
                image = intensity_image[se_cell.bbox0:se_cell.bbox2,se_cell.bbox1:se_cell.bbox3]
                blobs, fig = util.get_blobs2(image,min_sigma=min_sigma,max_sigma=max_sigma,threshold=threshold,exclude_border=1)
                df_props.loc[i_cell,'blobs'] = len(blobs)
                if len(blobs) > 1:
                    # only nuclei with more than one blob get a QC image
                    s_figdir = f'{codedir}/blobs/{s_marker}'
                    os.makedirs(s_figdir,exist_ok=True)
                    fig.suptitle(s_marker)
                    plt.tight_layout()
                    fig.savefig(f'{s_figdir}/{s_slide}_{s_marker}_{i_cell}.png',dpi=200)
                # close every figure, saved or not, so they do not accumulate in memory
                if fig is not None:
                    plt.close(fig)
            df_props.index = [s_slide + '_cell' + str(item) for item in df_props.index]
            if len(ls_index)==0:
                # no nucleus passed the cutoff: the column must still exist
                df_props['blobs'] = np.nan
            if idxs == 0:
                df_result = df_props.rename({'blobs':f'{s_marker}_foci'},axis=1)
            else:
                df_result.loc[df_props.index,f'{s_marker}_foci'] = df_props.blobs
        ls_result.append(df_result)
finally:
    # later cells read files relative to the notebook directory
    os.chdir(codedir)
df_result_all = pd.concat(ls_result) if ls_result else pd.DataFrame()

#uncomment to save (this data has been pre-computed and saved in the repo)
#df_result_all.loc[:,df_result_all.columns.str.contains('foci')].dropna(how='all').to_csv(f'{codedir}/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv')


### Section 2: FOCI Analysis <a name="focifoci"></a>

Load saved foci for plotting and downstream analysis

liver/lung

**You don't need to run section 1 to run this**


[contents](#contents)

In [None]:
# Sample annotation table.
df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link.csv')
len(df_surv)
#df_pORG = pd.read_csv(f'{codedir}/data/GSVA_Scores_Link.csv')#=,index_col='Public_Specimen_ID'
#df_pORG = pd.read_csv(f'{codedir}/20230526_GSVA_Scores.csv')
# NOTE(review): relative path, resolved against the current working directory (not codedir).
df_pORG = pd.read_csv(f'data/20230608_GSVA_Scores.csv')
#df_pORG
# Keep rows whose Tissue is PDAC or Intestinal.
df_primary = df_surv[((df_surv.Tissue=='PDAC') | (df_surv.Tissue=='Intestinal'))].copy()
print(len(df_primary))
# Attach the GSVA scores of group 'GSVA_All' by specimen ID (left join keeps every annotation row).
df_primary = df_primary.merge(df_pORG[df_pORG.Group=='GSVA_All'],on='Public_Specimen_ID',how='left')
df_primary.set_index("Public_Specimen_ID", inplace = True)
len(df_primary)

In [None]:
# Copy the selected GSVA score columns from df_primary onto df_surv, matched by
# specimen ID; duplicated specimen IDs use their first row.
df_mapper = df_primary.loc[~df_primary.index.duplicated()]
ls_add = [
    'txi_pORG_Up_42_Genes',
    'trim_padj_0.2_pORG_Up_55_Genes',
    'trim_padj_0.2_pSUB_Up_100_Genes',
    'txi_pSUB_Up_100_Genes',
]
for s_add in ls_add:
    # specimen ID -> score lookup for this column
    d_map = {s_specimen: score for s_specimen, score in zip(df_mapper.index, df_mapper.loc[:, s_add])}
    print(len(d_map))
    df_surv[s_add] = df_surv['Public_Specimen_ID'].map(d_map)

### TMA survival

In [None]:
# ##
# %matplotlib inline
# alpha = 0.06
# s_propo = ''
# savedir = f'{codedir}'
# s_time=  'Survival_time'#
# s_censor='Survival'#
# s_subtype = ''
# s_cell = ''
# s_type_title = ''
 
# for s_col in ls_add:
#     print(s_col)
#     for cutp in [0.33,0.5,0.66]: #np.round(np.arange(0.25,1,0.25),3):#
#             print(cutp)
#             df_km, pvalue = util.single_km(df_surv[~df_surv.Public_Patient_ID.duplicated()],s_cell,s_subtype,s_type_title,s_col,savedir,alpha,cutp, #
#                                        s_time,s_censor,s_propo)
#             print(pvalue)
#     #break

### define high/low pORG

based on survival differences on TMA samples

In [None]:
# One row per patient (first occurrence) restricted to PDAC tissue; only needed for the
# quantile-based cutoff alternative mentioned below.
b_first_per_patient = ~df_surv.Public_Patient_ID.duplicated(keep='first')
df_pri = df_surv[b_first_per_patient & (df_surv.Tissue=='PDAC')]
s_select_porg = 'trim_padj_0.2_pORG_Up_55_Genes'  # alternative: 'txi_pORG_Up_42_Genes'
# fixed cutoff; alternative: np.quantile(df_pri.loc[:,s_select_porg].dropna(),0.66)
i_pORG_txi = 0.0249
print(i_pORG_txi)
# df_surv.loc[df_surv['Original_pORG_Up_78_Genes'] >= i_pORG_ori,'pORG_binary_orig'] = 'high'
# df_surv.loc[df_surv['Original_pORG_Up_78_Genes'] < i_pORG_ori,'pORG_binary_orig'] = 'low'

# strictly above the cutoff -> 'high', otherwise 'low'; missing scores stay unlabelled
se_porg_score = df_surv[s_select_porg]
df_surv.loc[se_porg_score > i_pORG_txi,'pORG_binary'] = 'high'
df_surv.loc[se_porg_score <= i_pORG_txi,'pORG_binary'] = 'low'

In [None]:
#save the 0608
#df_surv.drop(['Unnamed: 0','pORG_binary','pORG_Score'],axis=1).to_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv')
#df_surv.Tissue

In [None]:
#print(df_surv.pORG_binary_orig.value_counts())
print(df_surv[(df_surv.Tissue=='PDAC') & (~df_surv.Public_Specimen_ID.duplicated())].pORG_binary.value_counts())

In [None]:
#load combined
# The three strings below are only used to build the cell-type CSV file name.
s_sample = '20220721_U54-TMA'#'20220711_U54-TMA' #'20220409_JP-TMAs_IMC-TMAs'
s_names = 'Combined' #'unnamed' #
s_type = 'PDAC'
df_lei = pd.read_csv(f'{codedir}/data/{s_sample}_{s_names}Celltypes_{s_type}_Link.csv',index_col=0)

# coor_mplexable -> patient ID lookup (applied to foci scene names in a later cell)
d_patient = dict(zip(df_surv.coor_mplexable,df_surv.Public_Patient_ID))

# Tissue of each row, looked up through its Patient value
df_lei['Tissue'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.Tissue)))
# number of epithelial
df_epi = pd.read_csv(f'{codedir}/data/results_20220721_U54-TMA_CellTypeCounts_byPatient_byleidencelltype5_PDAC_Link.csv',index_col=0)


In [None]:

# Load the pre-computed foci counts; marker list and threshold select the file name.
ls_marker = ['gH2AX','pRPA','RAD51']
threshold=0.002
df_foci2 = pd.read_csv(f'{codedir}/data/foci_U54-TMA-9_{".".join(ls_marker)}_{threshold}.csv',index_col=0)
# NOTE(review): removes every '.0' substring from the index labels, not only a trailing one;
# presumably meant to strip a float suffix from the cell number - confirm no label contains '.0' elsewhere.
df_foci2.index = [item.replace('.0','') for item in df_foci2.index]
# missing counts are treated as zero foci
df_foci2 = df_foci2.fillna(0)
# the part of the label before '_cell' is the scene name
df_foci2['scene'] = [item.split('_cell')[0] for item in df_foci2.index]
df_foci2['Patient'] = df_foci2.scene.map(d_patient)


In [None]:
# plot Ki67 versus pRPA foci
# Annotate every cell with cohort, pORG group and Ki67 status, then attach its foci counts.
s_define = 'pORG_binary'#'pORG_binary_txi'#'pORG_binary_orig'#
df_lei['Cohort'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.Cohort)))
df_lei['pORG_binary'] = df_lei.Patient.map(dict(zip(df_surv.Public_Patient_ID,df_surv.loc[:,s_define])))
# Ki67 above 3*256 is called positive; everything else (including missing Ki67) is negative.
# Assigned in one step: the former column.fillna(..., inplace=True) acts on a temporary
# under pandas Copy-on-Write and would leave the column unfilled.
df_lei['Ki67pos'] = np.where(df_lei.Ki67>3*256,'Ki67+','Ki67-')
ls_foci =['pRPA_foci','gH2AX_foci','RAD51_foci']
# left join on the cell index; cells without a foci record get 0 foci
df_lei_foci = df_lei.merge(df_foci2.loc[:,ls_foci],left_index=True,right_index=True,how='left')
df_lei_foci.loc[:,ls_foci] = df_lei_foci.loc[:,ls_foci].fillna(0)
for s_col in ls_foci:
    print(s_col)
    # boolean flag: cell has at least one focus of this marker
    df_lei_foci[f'{s_col}_pos'] = df_lei_foci.loc[:,s_col] > 0


In [None]:
#use df foci sum 2 (lower threshold)

# Sum the foci columns over epithelial cells per patient, then normalise by the
# per-patient epithelial count from df_epi.
ls_index = df_lei[(df_lei.leidencelltype5=='epithelial')].index
print(len(ls_index))
df_foci_sum2 = df_lei_foci.loc[df_lei_foci.index.isin(ls_index)].groupby('Patient').sum()
# drop columns that are still object dtype after the sum
df_foci_sum2.drop(df_foci_sum2.columns[df_foci_sum2.dtypes=='object'],axis=1,inplace=True)
for s_marker in ls_marker:
    print(s_marker)
    # total foci / epithelial cells; the division aligns on the patient index, unmatched patients become 0
    df_foci_sum2[f'mean_{s_marker}_foci'] = (df_foci_sum2.loc[df_foci_sum2.index.isin(df_epi.index),f'{s_marker}_foci']/df_epi.epithelial).fillna(0)
    df_foci_sum2[f'log_mean_{s_marker}_foci'] = np.log(df_foci_sum2.loc[:,f'mean_{s_marker}_foci'] + 1)
    # number of foci-positive cells / epithelial cells
    df_foci_sum2[f'mean_{s_marker}_foci_pos'] = (df_foci_sum2.loc[df_foci_sum2.index.isin(df_epi.index),f'{s_marker}_foci_pos']/df_epi.epithelial).fillna(0)

df_foci_sum2['Public_Patient_ID'] = df_foci_sum2.index
# on a column-name clash the columns already in df_surv get the '_1' suffix
df_surv = df_surv.merge(df_foci_sum2,on='Public_Patient_ID',how='left',suffixes=('_1',''))


In [None]:
#use df foci sum 2 (Ki67 lower threshold)
# Per-patient foci statistics computed separately within each Ki67 status of the
# epithelial cells. The derived "..._in_<status>" columns of every status are collected
# and merged into df_surv once; previously df_foci_sum2 was rebuilt on each pass and
# only the columns of the last status were merged.
ls_ki67_frames = []
for s_ki67 in df_lei.Ki67pos.dropna().unique():
    ls_index = df_lei[(df_lei.Ki67pos==s_ki67) & (df_lei.leidencelltype5=='epithelial')].index
    # non-null Ki67 values per patient within this status: the denominator below
    df_ki67 = df_lei.loc[ls_index].groupby('Patient').count()
    df_foci_sum2 = df_lei_foci.loc[df_lei_foci.index.isin(ls_index)].groupby('Patient').sum()
    df_foci_sum2.drop(df_foci_sum2.columns[df_foci_sum2.dtypes=='object'],axis=1,inplace=True)
    b_in_epi = df_foci_sum2.index.isin(df_epi.index)
    ls_new_col = []
    for s_marker in ls_marker:
        s_mean = f'mean_{s_marker}_foci_in_{s_ki67}'
        s_log = f'log_mean_{s_marker}_foci_in_{s_ki67}'
        s_pos = f'mean_{s_marker}_foci_pos_in_{s_ki67}'
        df_foci_sum2[s_mean] = (df_foci_sum2.loc[b_in_epi,f'{s_marker}_foci']/df_ki67.Ki67).fillna(0)
        df_foci_sum2[s_log] = np.log(df_foci_sum2.loc[:,s_mean] + 1)
        df_foci_sum2[s_pos] = (df_foci_sum2.loc[b_in_epi,f'{s_marker}_foci_pos']/df_ki67.Ki67).fillna(0)
        ls_new_col.extend([s_mean,s_log,s_pos])
    ls_ki67_frames.append(df_foci_sum2.loc[:,ls_new_col])

# side-by-side on the patient index; a patient absent from one status gets NaN there
df_foci_sum2 = pd.concat(ls_ki67_frames,axis=1)
df_foci_sum2['Public_Patient_ID'] = df_foci_sum2.index
df_surv = df_surv.merge(df_foci_sum2,on='Public_Patient_ID',how='left',suffixes=('_1',''))


In [None]:
#df_plot.groupby('Public_Patient_ID')

In [None]:
# Boxplots of per-patient mean pRPA foci, grouped by pORG group and by cohort.
importlib.reload(util)
pal_porg = ('#E69F00','#56B4E9',)
pal_liv = ('#0072B2','#D55E00')
sns.set_palette(pal_porg)

for s_group in ['pORG_binary','Cohort']:
    s_type = ''
    s_cell = 'epithelial'
    ls_tissue = ['Intestinal', #'Liver_Met','Lung_Met',
                 'PDAC',]
    s_propo='in'
    # selected tissues, one row per coor_mplexable value
    df_plot = df_surv.loc[(df_surv.Tissue.isin(ls_tissue))& (~df_surv.coor_mplexable.duplicated())]#& (~df_surv.Public_Patient_ID.duplicated())
    ls_foci = ['mean_pRPA_foci',#'log_mean_pRPA_foci','mean_pRPA_foci_pos',
                     #'log_mean_pRPA_foci_in_Ki67+',  'mean_pRPA_foci_pos_in_Ki67+'
               'mean_pRPA_foci_in_Ki67+',
                  ]
    
    for s_marker in ls_foci:
        #util.qq_plot_hist(df_plot,s_group,s_marker)
        # average the rows of each patient, then re-attach the patient's group label
        df_mean = df_plot.loc[:,[s_marker,'Public_Patient_ID']].groupby('Public_Patient_ID').mean()
        df_mean[s_group] = df_mean.index.map(dict(zip(df_plot.Public_Patient_ID,df_plot.loc[:,s_group])))
        # NOTE(review): alpha=1.05 is presumably a significance cutoff above 1 so the plot is always drawn - confirm in util
        fig, __,__,__ = util.categorical_correlation_boxplot(df_mean,s_group,s_marker,s_type,s_cell,
                                             alpha=1.05,s_propo=s_propo,b_ttest=True) #
        # build a readable y label from the column name
        if s_marker == 'mean_pRPA_foci_in_Ki67+':
            fig.get_axes()[0].set_ylabel(f"{s_marker.replace('_',' ').replace('mean ','').replace('in','per')}\n {s_cell} cell")
        else:
            fig.get_axes()[0].set_ylabel(f"{s_marker.replace('_',' ').replace('mean ','')}\n{s_propo.replace('in','per')} {s_cell} cell")
        fig.get_axes()[0].set_xlabel(f"{s_group.split('_')[0]}")
        # keep only the part of the generated title after the last '\n '
        fig.get_axes()[0].set_title(fig.get_axes()[0].title.get_text().split('\n ')[-1])
        #break

In [None]:
# additional foci figures, not used

In [None]:
#linear correlation
# Pearson correlation between the pORG score and each per-patient foci measure.
sns.set_palette(pal_liv)
s_score = 'pORG_Score'
# primary tumors, one row per patient. The filter does not depend on the loop variable,
# so it is applied once, and the tissue mask is taken from df_plot itself so that it
# shares df_plot's index (the df_surv mask covered a different set of rows).
df_plot = df_plot[(~df_plot.Public_Patient_ID.duplicated()) & (df_plot.Tissue.isin(ls_tissue))]
# only rows that have a score take part
b_score = df_plot.loc[:,s_score].notna()
for s_foci in ls_foci:
    # missing foci values count as 0 in the test
    r, pvalue = stats.pearsonr(x=df_plot.loc[b_score,s_foci].fillna(0),y=df_plot.loc[b_score,s_score])
    #if s_foci.find('Ki67') > -1:
    #r, pvalue =stats.spearmanr(df_plot.loc[b_score,s_foci].fillna(0),df_plot.loc[b_score,s_score])
    print(pvalue)
    # cutoff above 1: every finite p-value is plotted
    if pvalue < 1.1:
        fig, ax = plt.subplots(figsize=(3,2),dpi=200)
        sns.regplot(x=df_plot.loc[b_score,s_foci],y=df_plot.loc[b_score,s_score],
                    ax=ax,scatter_kws={'s':3})
        ax.set_title(f'{s_foci} p = {pvalue:.3}')

In [None]:
#compare Ki67+ versus negative, double violinplots
#compare all the combinations, double violinplots
# For each grouping column, plot per-cell foci counts of PDAC epithelial cells split by
# Ki67 status and annotate Bonferroni-corrected t-test p-values.
%matplotlib inline
import seaborn as sns
from scipy import stats
from statannotations.Annotator import Annotator
from itertools import combinations
import statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd
ls_foci =['pRPA_foci','gH2AX_foci','RAD51_foci']
# 'Ki67': test Ki67+ vs Ki67- within each group; 'all': test every pair of (group, Ki67) combinations
s_compare = 'Ki67'#'all' #
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
for s_cat in ['pORG_binary','Cohort']:
    ls_order = ['Ki67-','Ki67+']
    # only the first foci column is plotted
    for s_foci in [ls_foci[0]]:
        if s_compare == 'all':
            figsize=(3.5,3)
        else:
            figsize=(2.8,2.8)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        order = []
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            # PDAC epithelial cells of this group
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial')
                                      & (df_lei_foci.Tissue=='PDAC') & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            # one-sided test: Ki67+ cells have more foci than Ki67- cells
            statistic, pvalue = stats.ttest_ind(df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67+',s_foci],
                                                df_plot_foci.loc[df_plot_foci.Ki67pos=='Ki67-',s_foci],
                                               alternative='greater')
            for s_order in ls_order:
                order.append((s_cohort,s_order))
            df_both = pd.concat([df_both,df_plot_foci])
            d_pval.update({s_cohort:pvalue})
        sns.violinplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,ax=ax,alpha=0.5,linewidth=0.5)
        sns.stripplot(data=df_both,hue='Ki67pos',y=s_foci,x=s_cat,s=1,dodge=True,ax=ax,palette='dark',jitter=0.2)
        #annotate
        if s_compare == 'all':
            pairs = list(combinations(order, r=2))
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.configure(test="t-test_ind",line_width=1)#,alternative='greater
            pvalues = annotator.apply_test().annotations #annotator.apply_and_annotate() # 
            pvalues = [item.data.pvalue for item in pvalues]     
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        else:
            # indexes order[0..3], i.e. assumes exactly two groups with two Ki67 levels each
            pairs = [(order[0],order[1]),(order[2],order[3])]
            # pairs[i][0][0] is the group name, the key of d_pval
            pvalues = [d_pval[pairs[0][0][0]],d_pval[pairs[1][0][0]]]
            reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='bonferroni')
            formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
            annotator = Annotator(ax, pairs=pairs, data=df_both, y=s_foci,x=s_cat,hue='Ki67pos')
            annotator.set_custom_annotations(formatted_pvalues)
            annotator.annotate()
        
        ax.set_title(f"{s_foci.replace('_',' ')} vs. {s_cat.split('_')[0]}", fontsize='x-large')
        ax.set_xlabel(s_cat)
        ax.set_ylabel(f"No. {s_foci.replace('_',' ')}")
        h, l = ax.get_legend_handles_labels()
        # first two legend labels get a leading underscore - presumably to hide the duplicate violin entries
        labels =  [f'_{item}' if ind < 2 else item for ind,item in enumerate(l)]
        ax.legend(h,labels,title='',fontsize='small',markerscale=.5,bbox_to_anchor=(1.01,0.9))
        plt.tight_layout()
        # NOTE(review): s_cohort is whatever the group loop above ended on, so the file name
        # carries the last group value rather than s_cat - confirm this is intended.
        fig.savefig(f'{codedir}/violinplot_both_{s_foci}_{s_cohort}_{s_compare}.png')
    if s_cat == 'pORG_binary':
        df_both.to_csv(f'results_foci_Ki67.csv')
        #break
    # NOTE(review): unconditional break - the loop stops after the first s_cat, so 'Cohort' is never plotted.
    break

In [None]:
## make the bargraphs
# percent pRPA epithelial cells
# How foci-positive PDAC epithelial cells are split between the groups of each
# category, with a chi-squared test against the split of all such cells.
ls_foci =['pRPA_foci','gH2AX_foci','RAD51_foci']
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in [ls_foci[0]]:
        figsize=(2.8,2.5)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial')
                                      & (df_lei_foci.Tissue=='PDAC') & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            df_both = pd.concat([df_both,df_plot_foci])
        # group label of every foci-positive cell (observed) and of every cell (expected)
        df_obs = df_both[df_both.loc[:,s_foci] > 0].loc[:,s_cat]
        df_exp = df_both.loc[:,s_cat]
        #chi
        f_obs = df_obs.value_counts().loc[ls_order]
        # expected counts: observed total distributed like all cells
        f_exp = f_obs.sum() * df_exp.value_counts(normalize=True).loc[ls_order]
        statistic, pvalue =  stats.chisquare(f_obs, f_exp)
        df_obs.value_counts(normalize=True).loc[ls_order].plot(kind='bar',ax=ax,color=['mediumpurple','deepskyblue'])
        ax.set_title(f'{s_foci} vs. {s_cat}\nChi-squared p={pvalue:.3}')
        ax.set_ylabel(f'Fraction of {s_foci.split("_")[0]}+ Epithelial')
        ax.set_xticklabels([item.replace('_cohort','') for item in ls_order])
        plt.tight_layout()
        xtickslocs = ax.get_xticks()
        # write "positive of total" cell counts at each bar position
        for idx, s_order in enumerate(ls_order):
            s = f'{df_obs.value_counts().loc[s_order]}\nof\n{df_exp.value_counts().loc[s_order]}'
            plt.text(-0.2+idx,0.3,s)
        break
    #break

In [None]:
## make the boxplots
# percent pRPA epithelial cells
# Per-patient fraction of foci-positive epithelial cells, compared between groups.
pal_porg = ('#E69F00','#56B4E9',)
pal_liv = ('#0072B2','#D55E00')
sns.set_palette(pal_porg)
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in [ls_foci[0]]:
        figsize=(2.8,2.5)
        #fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            # epithelial cells of the tissues in ls_tissue for this group
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial') #& (df_lei_foci.loc[:,s_foci] > 0)
                                      & (df_lei_foci.Tissue.isin(ls_tissue)) & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            print(df_plot_foci.Patient.nunique())
            df_both = pd.concat([df_both,df_plot_foci])
        # mean of the boolean <foci>_pos flag per patient = fraction of positive cells
        df_mean = pd.DataFrame(df_both.loc[:,[f'{s_foci}_pos','Patient']].groupby('Patient').mean())
        df_mean[s_cat] = df_mean.index.map(dict(zip(df_both.Patient,df_both.loc[:,s_cat])))
        fig, __, __, __ = util.categorical_correlation_boxplot(df_mean,s_cat,f'{s_foci}_pos',s_type='',s_cell=f"epithelial",
                                         alpha=1.05,s_propo='in',b_ttest=True)
        #fig.get_axes()[0].set_ylabel(f"{s_marker.replace('_',' ').replace('mean ','')} per\n{s_cell} cell")
        #fig.get_axes()[0].set_xlabel(f"{s_group.split('_')[0]}")
        #fig.get_axes()[0].set_title(fig.get_axes()[0].title.get_text().split('\n ')[-1])
        #break
    #break

In [None]:
## make the bargraphs
# percent pRPA epithelial cells
# percent of those that are Ki67+
# Among foci-positive PDAC epithelial cells: fraction that is Ki67+ in each group, with a
# chi-squared test of the Ki67+ counts against the group split of all foci-positive cells.
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in [ls_foci[0]]:
        figsize=(2.8,2.5)
        fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial') & (df_lei_foci.loc[:,s_foci] > 0)
                                      & (df_lei_foci.Tissue=='PDAC') & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            df_both = pd.concat([df_both,df_plot_foci])
        df_obs = df_both[df_both.loc[:,'Ki67pos']=='Ki67+'].loc[:,s_cat]
        df_exp = df_both.loc[:,s_cat]
        # per-group cell counts, computed once and reused for the test, the bars and the labels
        se_obs_count = df_obs.value_counts()
        se_exp_count = df_exp.value_counts()
        #chi
        f_obs = se_obs_count.loc[ls_order]
        # expected counts: observed total distributed like all foci-positive cells
        f_exp = f_obs.sum() * df_exp.value_counts(normalize=True).loc[ls_order]
        statistic, pvalue =  stats.chisquare(f_obs, f_exp)
        df_plot = se_obs_count/se_exp_count
        df_plot.loc[ls_order].plot(kind='bar',ax=ax,color=['mediumpurple','deepskyblue'])
        ax.set_title(f'{s_foci} vs. {s_cat}\nChi-squared p={pvalue:.3}')
        # label typo fixed ("Fration" -> "Fraction")
        ax.set_ylabel(f'Fraction of {s_foci.split("_")[0]}+ Epithelial\nthat are Proliferating')
        ax.set_xticklabels([item.replace('_cohort','') for item in ls_order])
        plt.tight_layout()
        # write "Ki67+ of total" cell counts at each bar position
        for idx, s_order in enumerate(ls_order):
            s = f'{se_obs_count.loc[s_order]}\nof\n{se_exp_count.loc[s_order]}'
            plt.text(-0.2+idx,0.15,s)
        #break
    #break

In [None]:
## make the boxplots
# percent pRPA epithelial cells
# percent of those that are Ki67+
# Per-patient fraction of foci-positive epithelial cells that are Ki67+, compared between groups.
pal_porg = ('#E69F00','#56B4E9',)
pal_liv = ('#0072B2','#D55E00')
sns.set_palette(pal_porg)
d_cats = {'pORG_binary':['high','low'],'Cohort':['liver_cohort','lung_cohort']}
for s_cat, ls_order in d_cats.items():
    for s_foci in [ls_foci[0]]:
        figsize=(2.8,2.5)
        #fig,ax=plt.subplots(dpi=300,figsize=figsize)
        df_both = pd.DataFrame()
        d_pval = {}
        for idx, s_cohort in enumerate(df_lei_foci.loc[:,s_cat].dropna().unique()):
            print(s_cohort)
            # foci-positive epithelial cells of the tissues in ls_tissue for this group
            df_plot_foci =df_lei_foci[(df_lei_foci.leidencelltype5=='epithelial') & (df_lei_foci.loc[:,s_foci] > 0)
                                      & (df_lei_foci.Tissue.isin(ls_tissue)) & (df_lei_foci.loc[:,s_cat]==s_cohort)]
            df_both = pd.concat([df_both,df_plot_foci])
        # count cells per patient for each (group, Ki67 status); unstack puts the Ki67 levels into columns
        df_sum = df_both.loc[:,[s_cat,'Patient','Ki67pos']].groupby('Patient').value_counts().unstack().reset_index().fillna(0)
        df_sum['Frac. Prolif.'] = df_sum.loc[:,'Ki67+']/(df_sum.loc[:,['Ki67+','Ki67-']].sum(axis=1))
        fig, __, __, __ = util.categorical_correlation_boxplot(df_sum,s_cat,'Frac. Prolif.',s_type='',s_cell=f"{s_foci.split('_')[0]}+",
                                         alpha=1.05,s_propo='in',b_ttest=True)
        #fig.get_axes()[0].set_ylabel(f"{s_marker.replace('_',' ').replace('mean ','')} per\n{s_cell} cell")
        #fig.get_axes()[0].set_xlabel(f"{s_group.split('_')[0]}")
        #fig.get_axes()[0].set_title(fig.get_axes()[0].title.get_text().split('\n ')[-1])
        #break
    #break

### Section 3: mIHC Analysis <a name="mihc"></a>

Stacked barplot


[contents](#contents)

In [None]:
# load Sam's data
df_patient = pd.read_csv(f'{codedir}/data/BCCLiverLung_V4_sampleavg_Density.csv')

# look at Carl's Youden cutoff for pORG
cutoff = 0.0249
# label each row high/low in a column named 'pORG>0.0249'; rows with a missing score stay unlabelled
df_patient.loc[df_patient.loc[:,'pORG_0.2_Primary'] > cutoff,f'pORG>{cutoff}'] = 'high' # 'pORG_0.2_Primary','pORG_0.2_Met','pORG_0.2_allPrimary','pORG_0.2_allMet',
df_patient.loc[df_patient.loc[:,'pORG_0.2_Primary'] <= cutoff,f'pORG>{cutoff}'] = 'low'
# NOTE(review): this counts a column called 'pORG', not the 'pORG>0.0249' column created
# above; presumably 'pORG' already exists in the CSV - confirm.
df_patient.pORG.value_counts()

In [None]:
# Standard deviation of the score per pORG group (not the last expression of the cell, so not displayed).
df_patient.loc[:,['pORG>0.0249','pORG_0.2_Primary']].groupby('pORG>0.0249').std()
from scipy.stats import sem
# Print the standard error of the mean of the score within every level of each grouping column.
for s_col in ['pORG>0.0249','Cohort']:
    for s_col_col in df_patient.loc[:,s_col].dropna().unique():
        print(s_col_col)
        print(sem(df_patient.loc[df_patient.loc[:,s_col]==s_col_col,'pORG_0.2_Primary']))

In [None]:
# #survival
# df_surv = pd.read_csv(f'{codedir}/data/u54_tma_sampleannot_Link_new.csv',index_col=0)
# df_surv['Sample_ID'] = [item.split('-T')[0][-6::] for item in df_surv.loc[:,'Public_Patient_ID'].fillna('none')]
# df_surv is replaced here by the patient metadata table (path relative to the working directory).
s_out = 'annotation/20231205_Patient_Metadata.csv'#'annotation/20230921_Patient_Metadata.csv'
df_surv = pd.read_csv(s_out,index_col=0)
# keep only patients that also appear in df_patient
df_surv = df_surv[df_surv.Public_Patient_ID.isin(df_patient.Public_Patient_ID)]

In [None]:
# ROI density table; each row receives its sample's pORG group and cohort from df_patient.
df_density = pd.read_csv(f'{codedir}/pORGCohort_DBT_ROI_Density.csv')
for s_col in ['pORG>0.0249','Cohort']:
    df_density[s_col] = df_density.Sample_ID.map(dict(zip(df_patient.Sample_ID,df_patient.loc[:,s_col])))

In [None]:
# stacked bar
# cell-type columns included in the plots
ls_mihc = ['CD8 T cells', 'T-regulatory CD4 cells',
       'CD4 T helper cells', 'B cells', 'Granulocytes', 'Monocyte',
       'Macrophage', 'Mature DC', 'Immature DC', #'Immune Other', 'PanCK+',
       #'aSMA+', 'Other Cells', 
          ]

# one stacked bar per group level: mean of each cell-type column over the rows of that level
for s_group in ['Cohort','pORG>0.0249']:
    fig,ax=plt.subplots(dpi=300)
    df_mean = df_density.loc[:,ls_mihc+[s_group]].groupby(s_group).mean()
    df_mean.plot(kind='bar',width=.9,stacked=True,ax=ax,colormap='Paired')
    ax.legend(bbox_to_anchor=(1,.9))
    ax.set_ylabel('Cell density (cells/mm$^2$)')
    plt.tight_layout()

In [None]:
df_density.loc[:,s_group].value_counts()

In [None]:
df_density.Patient.unique()

In [None]:
df_density.loc[:,'Cohort'].value_counts()

In [None]:
from statannotations.Annotator import Annotator

# Per-sample mean density of each cell type, compared between the two levels of each
# grouping (Cohort, pORG group) with annotated t-tests; one figure per grouping and region.
d_hue = {'Cohort':['Lung','Liver',],'pORG>0.0249':['low','high',]}
x = "cell_type"
y = "density"
order = ['CD4 T helper cells', 'CD8 T cells', 'Granulocytes', 'Macrophage',
       'Immature DC', 'B cells', 'Monocyte', 'T-regulatory CD4 cells',
       'Mature DC']
pal_porg_r = ('#56B4E9','#E69F00',)
pal_liv_r = ('#0072B2','#D55E00',)

with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    for hue in ['Cohort','pORG>0.0249']:
        if hue == 'Cohort':
            sns.set_palette(pal_liv_r)
        else:
            sns.set_palette(pal_porg_r)
        for s_region in ['All']:#,'Tumor','Border','Distal'
            if s_region == 'All':
                df_mean = df_density.loc[:,ls_mihc+['Sample_ID']].groupby(['Sample_ID']).mean()
            else:
                # Location is matched against the first letter of the region name
                df_mean = df_density.loc[df_density.Location==s_region[0],ls_mihc+['Sample_ID']].groupby(['Sample_ID']).mean()
            df_mean = df_mean.reset_index().set_index('Sample_ID')
            # wide -> long: one row per (sample, cell type)
            df_long = df_mean.stack().reset_index().rename({'level_1':'cell_type',0:'density'},axis=1)
            df_long[hue] = df_long.Sample_ID.map(dict(zip(df_patient.Sample_ID,df_patient.loc[:,hue])))
            #plot
            hue_order = d_hue[hue]
            # one comparison per cell type: first hue level vs second hue level
            pairs=[((item,hue_order[0]),(item,hue_order[1])) for item in order]
            fig, ax = plt.subplots(dpi=300)
            sns.violinplot(data=df_long, x=x, y=y, order=order, hue=hue, hue_order=hue_order,ax=ax,
                       cut=0,saturation=0.5,inner='quartile')
            sns.stripplot(data=df_long, x=x, y=y, order=order, hue=hue, hue_order=hue_order,ax=ax,
                       dodge=True,linewidth=1)
            ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
            ax.set_ylabel('Cell density (cells/mm$^2$)')
            ax.set_xlabel('')
            # ax.legend(bbox_to_anchor=(1,.9))
            annot = Annotator(ax=ax,pairs=pairs,data=df_long, x=x, y=y, order=order,
                          hue=hue, hue_order=hue_order)
            annot.configure(test='t-test_ind', verbose=2)
            annot.apply_test()
            annot.annotate()
            ax.set_title(s_region)
            plt.tight_layout()
            # written to a 'figures' directory relative to the working directory
            plt.savefig(f'figures/mIHC_per_patient_by_{hue}_density_in_{s_region}.png', dpi=300, bbox_inches='tight')

In [None]:
# # boxplot starthere

# df = df_patient.loc[:,ls_mihc].unstack().reset_index().rename({'level_0':'cell type',0:'Density'},axis=1)
# df['pORG'] = df.level_1.map(dict(zip(df_patient.index,df_patient.pORG)))
# df['Cohort'] = df.level_1.map(dict(zip(df_patient.index,df_patient.Cohort)))
# for s_group in ['Cohort','pORG']:
#     if s_group == 'Cohort':
#         pal_liv = ('#D55E00','#0072B2',)
#         sns.set_palette(pal_liv)
#     else:
#         pal_porg = ('#E69F00','#56B4E9',)
#         sns.set_palette(pal_porg)
#     fig,ax=plt.subplots(figsize=(8,4),dpi=200)
#     #sns.violinplot(data=df,x='cell type',y='Density',hue=s_group,cut=0,ax=ax)
#     sns.stripplot(data=df,x='cell type',y='Density',hue=s_group,dodge=True,ax=ax)
#     sns.boxplot(data=df,x='cell type',y='Density',hue=s_group,ax=ax,showmeans=True,
#                 medianprops={'visible': False},whiskerprops={'visible': False},
#                 meanline=True,showcaps=False,
#                 meanprops={'color': 'k', 'ls': '-', 'lw': 2},
#                 showfliers=False,showbox=False)#
#     # Rotating X-axis labels
#     ax.set_xticklabels(ax.get_xticklabels(), rotation = 50)
#     ax.set_xlabel('')
#     h, l = ax.get_legend_handles_labels()
#     ax.legend(h[0:2],l[0:2],loc='upper right',title=s_group)
#     ls_group = sorted(df.loc[:,s_group].unique())
#     pairs = [((item,ls_group[1]),(item,ls_group[0])) for item in ls_mihc]
#     annot = Annotator(ax, pairs, data=df,x='cell type',y='Density',hue=s_group,
#                       order=ls_mihc,hue_order=ls_group)
#     annot.configure(test='t-test_ind', text_format='star',fontsize=7) #,comparisons_correction='fdr_bh'
#     res = annot.apply_test()
#     d_pval = dict([(res.data.group1[0],res.data.pvalue) for res in annot.annotations])
#     pvalues = [d_pval[item] for item in ls_mihc]
#     reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
#     formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(pvalues)]
#     annot.set_custom_annotations(formatted_pvalues)
#     annot.annotate()


In [None]:
# # dowload mIHC data from https://www.synapse.org/#!Synapse:syn51078766
# # combine ROIs into large dataframe and save
# if not os.path.exists(f'data/20221123_mIHC_LiverLung_Celltypes.csv'):
#     ls_col = ['Sample_ID','class','Location_Center_X', 'Location_Center_Y', 
#            'GRZB_func', 'KI67_func', 'PD1_func', 'PDL1_func', 'CD163_func',
#            'CCR2_func', 'HLAII_func', 'EOMES_func','Area']
#     df_ll =pd.DataFrame()
#     for s_file in sorted(os.listdir('mIHC_Data')):
#         df = pd.read_csv(f'mIHC_Data/{s_file}',index_col=0)
#         s_sample = s_file.split('LiverLungBCC')[1].split('.')[0]
#         df['Sample_ID'] = s_sample
#         df.index = [f'{s_sample.split("Nuclei_")[1].replace("ROI","_scene")}_cell{item}' for item in df.index]
#         df_ll = pd.concat([df_ll,df.loc[:,ls_col]])
#         break
#     df_ll['Organ'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Met Site'])))
#     df_ll['Location'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Location'])))
#     df_ll['Desc'] = df_ll.Sample_ID.map(dict(zip(df_ll_annot.index,df_ll_annot.loc[:,'Desc'])))
#     df_ll.to_csv(f'data/20221123_mIHC_LiverLung_Celltypes.csv')

In [None]:
# Read the annotation tables and the combined mIHC cell table.
# Every CSV uses its first column as the index.
df_ll_roi = pd.read_csv('data/annotated_LiverLung_perROI.csv', index_col=0)
df_ll_ann = pd.read_csv('data/LiverLung_annotations.csv', index_col=0)
df_ll = pd.read_csv(
    'data/20221123_mIHC_LiverLung_Celltypes.csv',
    index_col=0,
    low_memory=False,
)
# number of rows in the mIHC table
print(df_ll.shape[0])

In [None]:
# Parse sample / ROI identifiers out of Sample_ID and attach annotations.
# Sample_ID is split on 'Nuclei_' and 'ROI': the text between them is the
# sample number, the text after 'ROI' is the ROI number.
df_ll['Sample_ID_short'] = [s_id.split('Nuclei_')[1].split('ROI')[0] for s_id in df_ll.Sample_ID]
df_ll['Sample_ID_int'] = list(map(int, df_ll.Sample_ID_short))
df_ll['ROI'] = [int(s_id.split('ROI')[1]) for s_id in df_ll.Sample_ID]

# sample-level annotation, looked up by the integer sample id
d_sample_cohort = dict(zip(df_ll_ann.index, df_ll_ann.Cohort))
d_sample_patient = dict(zip(df_ll_ann.index, df_ll_ann.Patient))
df_ll['Cohort'] = df_ll.Sample_ID_int.map(d_sample_cohort)
df_ll['Patient'] = df_ll.Sample_ID_int.map(d_sample_patient)

# ROI-level annotation, looked up by a '<sample>_<roi>' key built in both tables
df_ll['Sample_ROI'] = df_ll.Sample_ID_int.astype('str') + '_' + df_ll.ROI.astype('str')
df_ll_roi['Sample_ROI'] = df_ll_roi.index.astype('str') + '_' + df_ll_roi.ROI.astype('str')
d_roi_location = dict(zip(df_ll_roi.Sample_ROI, df_ll_roi.loc[:,'Location']))
df_ll['Location'] = df_ll.Sample_ROI.map(d_roi_location)

In [None]:
# 'classII' is a coarser copy of the cell class in which the three T cell
# subsets are merged into 'T cells'; the original column is renamed 'classI'.
d_tcell_collapse = dict.fromkeys(
    ['T-regulatory CD4 cells', 'CD4 T helper cells', 'CD8 T cells'], 'T cells')
df_ll['classII'] = df_ll['class'].replace(d_tcell_collapse)
df_ll = df_ll.rename(columns={'class': 'classI'})

In [None]:
# Per-patient colours for the strip plots: the first nine patient IDs get
# shades of purple, the remaining three get shades of blue.
ls_pts_purple = ['ST-00016289', 'ST-00017078', 'ST-00017310', 'ST-00017381',
                 'ST-00018269', 'ST-00018955', 'ST-00019367', 'ST-00019368',
                 'ST-00020181']
ls_pts_blue = ['ST-00015839', 'ST-00017440', 'ST-00017804']
d_color = {
    **dict(zip(ls_pts_purple, sns.color_palette('Purples', 9))),
    **dict(zip(ls_pts_blue, sns.color_palette('Blues', 3))),
}


In [None]:
#by location
# For every tissue location ('all' = no filter) compute, per patient, the
# fraction of cells of each type, compare the two groups in `s_group` with an
# independent two-sample t-test, and draw a per-ROI boxplot for each cell type
# whose p-value is <= alpha.
# Relies on df_ll, df_ll_roi and d_color from the cells above and on df_surv,
# which is defined elsewhere in the notebook.
s_define = 'pORG_Primary'#'trim_padj_0.2_pORG_Up_55_Genes'#'txi_pORG_Up_42_Genes'#''#
s_group = 'Cohort'
alpha = 0.05
s_column = 'classI'#'classII'#
if s_column == 'classI':
    ls_mihc = [ 'T-regulatory CD4 cells', 'CD4 T helper cells','PanCK+','aSMA+','CD8 T cells','B cells', 'Granulocytes', 'Monocyte',
 'Macrophage', 'Mature DC','Immature DC']
elif s_column == 'classII':
    ls_mihc = ['T cells', #'B cells', 'Granulocytes', 'Monocyte', 'Macrophage',
       #'Mature DC', 'Immature DC', 'Immune Other', 'PanCK+', 'aSMA+','Other Cells'
       ]
s_patient = 'Public_Patient_ID'

for s_loc in ['all','T','B','D']:
    if s_loc == 'all':
        df_loc = df_ll
    else:
        df_loc = df_ll[df_ll.Location==s_loc]
    print(len(df_loc.Sample_ID_short.unique()))
    # per-patient fraction of each cell type: cells of that type / all cells of the patient
    df_group = (df_loc.groupby(['Patient',s_column]).count().Sample_ID/(df_loc.groupby(['Patient']).count().Sample_ID)).unstack()
    df_group[s_patient] = df_group.index
    df_group['Cohort'] = df_group.index.map(dict(zip(df_ll_roi.Patient,df_ll_roi.loc[:,'Cohort'])))
    df_group = df_group.merge(df_surv.loc[:,['Survival','Days from Diagnosis to FU',s_define,s_patient]],on=s_patient)
    df_group = df_group[~df_group.loc[:,s_patient].duplicated()]
    # cell types absent from a patient become a fraction of 0
    df_group = df_group[df_group.Cohort!='Liver met'].fillna(0)
    ls_order = sorted(df_group.loc[:,s_group].unique())

    # The t-test needs two groups. This was previously detected by a bare
    # `except:` around the group lookup inside the marker loop; an explicit
    # check skips the location in the same situation without hiding other errors.
    ls_group_values = df_group.loc[:,s_group].unique()
    if len(ls_group_values) < 2:
        continue
    # "high"/"low" are only labels for the first / second group in order of appearance
    s_high = ls_group_values[0]
    s_low = ls_group_values[1]
    b_high = df_group.loc[:,s_group]==s_high
    b_low = df_group.loc[:,s_group]==s_low
    n_high = int(b_high.sum())
    n_low = int(b_low.sum())

    # the per-ROI table does not depend on the marker: build it once, on first use
    b_roi_ready = False
    for s_marker in ls_mihc:
        statistic,pvalue = stats.ttest_ind(df_group.loc[b_high,s_marker],
                                           df_group.loc[b_low,s_marker])
        if pvalue <= alpha:
            if not b_roi_ready:
                # per-ROI fraction of each cell type, restricted to the two cohorts plotted
                df_group_roi = (df_loc.groupby(['Sample_ROI',s_column]).count().Sample_ID/(df_loc.groupby(['Sample_ROI']).count().Sample_ID)).unstack()
                df_group_roi['Cohort'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Cohort'])))
                df_group_roi['Patient'] = df_group_roi.index.map(dict(zip(df_ll_roi.Sample_ROI,df_ll_roi.loc[:,'Patient'])))
                df_group_roi = df_group_roi[df_group_roi.Cohort.isin(['liver_cohort','lung_cohort'])]
                b_roi_ready = True
            fig, ax = plt.subplots(figsize=(3,3),dpi=300)
            sns.boxplot(data=df_group_roi,x=s_group,y=s_marker,showfliers=False,ax=ax,order=[str(item) for item in ls_order],palette=['mediumpurple','deepskyblue'])
            sns.stripplot(data=df_group_roi,x=s_group,y=s_marker,ax=ax,hue='Patient',s=3,palette=d_color)
            # pin the current y-limits
            ax.set_ylim(ax.get_ylim()[0],ax.get_ylim()[1])
            # the p-value in the title is the per-patient test; the points shown are per ROI
            ax.set_title(f'{s_group} versus\n {s_marker}\n p={pvalue:.4f} (n={n_low}, {n_high})')
            ax.set_ylabel(f'{s_marker} in {s_loc}')
            #ax.set_ylabel(f'{s_marker}')
            ax.get_legend().remove()
            plt.tight_layout()
            #fig.savefig(f'{s_date}/boxplot_mIHC_{s_marker}_versus_{s_group}_in_{s_loc}.png')
    #     break
    # break

In [None]:
# boxplot figure friendly
# Strip plot with a mean line per cell type, split by Cohort and by pORG
# high/low, with a t-test annotation for each cell type.
# The pORG cutoff below is described by the author as Carl's Youden cutoff.
# NOTE(review): df_group is whatever the by-location loop in the previous cell
# left behind, i.e. its last iteration (s_loc == 'D') - confirm that this is
# the intended table rather than the 'all' one.
if s_column == 'classI':
    cutoff = 0.0249
    # binarise the pORG score at the cutoff
    df_group.loc[df_group.loc[:,'pORG_Primary'] > cutoff,f'pORG'] = 'high' # 'pORG_0.2_Primary','pORG_0.2_Met','pORG_0.2_allPrimary','pORG_0.2_allMet',
    df_group.loc[df_group.loc[:,'pORG_Primary'] <= cutoff,f'pORG'] = 'low'
    print(df_group.pORG.value_counts())
    # cell types to plot (overwrites the ls_mihc of the previous cell)
    ls_mihc = [ #'PanCK+','aSMA+',
               'CD4 T helper cells','CD8 T cells','B cells', 'T-regulatory CD4 cells',
        'Granulocytes', 'Monocyte',
     'Macrophage', 'Mature DC','Immature DC']
    # long format: one row per (cell type, df_group row) with its fraction;
    # level_1 holds the df_group index and is used to look up the group labels
    df = df_group.loc[:,ls_mihc].unstack().reset_index().rename({'level_0':'cell type',0:'Fraction'},axis=1)
    df['pORG'] = df.level_1.map(dict(zip(df_group.index,df_group.pORG)))
    df['Cohort'] = df.level_1.map(dict(zip(df_group.index,df_group.Cohort)))
    # NOTE: this loop variable overwrites the module-level s_group set in the previous cell
    for s_group in ['Cohort','pORG']:
        # two-colour palette per grouping; set globally through seaborn
        if s_group == 'Cohort':
            pal_liv = ('#0072B2','#D55E00')
            sns.set_palette(pal_liv)
        else:
            pal_porg = ('#56B4E9','#E69F00',)
            sns.set_palette(pal_porg)
        fig,ax=plt.subplots(figsize=(8,4),dpi=200)
        #sns.violinplot(data=df,x='cell type',y='Density',hue=s_group,cut=0,ax=ax)
        sns.stripplot(data=df,x='cell type',y='Fraction',hue=s_group,dodge=True,ax=ax)
        # boxplot with box, whiskers, caps, median and fliers hidden: only the mean line is drawn
        sns.boxplot(data=df,x='cell type',y='Fraction',hue=s_group,ax=ax,showmeans=True,
                    medianprops={'visible': False},whiskerprops={'visible': False},
                    meanline=True,showcaps=False,
                    meanprops={'color': 'k', 'ls': '-', 'lw': 2},
                    showfliers=False,showbox=False)#
        # Rotating X-axis labels
        ax.set_xticklabels(ax.get_xticklabels(), rotation = 50)
        ax.set_xlabel('')
        # keep only the first two legend entries (both plot calls add handles)
        h, l = ax.get_legend_handles_labels()
        ax.legend(h[0:2],l[0:2],loc='upper right',title=s_group)
        ls_group = sorted(df.loc[:,s_group].unique())
        # one comparison per cell type: second sorted group versus first
        pairs = [((item,ls_group[1]),(item,ls_group[0])) for item in ls_mihc]
        annot = Annotator(ax, pairs, data=df,x='cell type',y='Fraction',hue=s_group,
                          order=ls_mihc,hue_order=ls_group)
        annot.configure(test='t-test_ind', text_format='star',fontsize=7) #,comparisons_correction='fdr_bh'
        res = annot.apply_test()
        # map each cell type to the p-value of its comparison
        d_pval = dict([(res.data.group1[0],res.data.pvalue) for res in annot.annotations])
        pvalues = [d_pval[item] for item in ls_mihc]
        # NOTE(review): the FDR-corrected values (`corrected`) are computed but
        # never used - the annotations below show the uncorrected p-values.
        # Confirm which is intended.
        reject, corrected, __, __ = statsmodels.stats.multitest.multipletests(pvalues,method='fdr_bh')
        formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(pvalues)]
        annot.set_custom_annotations(formatted_pvalues)
        annot.annotate()


In [None]:

# Per-cohort mean and standard error of the selected score column (s_define).
df_cohort_mean = df_group.groupby('Cohort').mean(numeric_only=True)
df_cohort_sem = df_group.groupby('Cohort').sem(numeric_only=True)
print(df_cohort_mean[s_define])
print(df_cohort_sem[s_define])

In [None]:
# Stacked bar chart of the mean cell-type fraction per cohort.
# The cell-type columns are selected *before* averaging: df_group also holds
# string columns (patient id, pORG label), and a bare groupby().mean() over
# those raises TypeError on pandas >= 2.0.
fig,ax=plt.subplots(dpi=300)
df_group.groupby('Cohort')[ls_mihc].mean().plot(kind='bar',width=.9,stacked=True,ax=ax,colormap='Paired')
ax.legend(bbox_to_anchor=(1,.9))
ax.set_ylabel('Fraction in Tissue')
plt.tight_layout()

# Section 4 <a name="meta"></a>

patient metadata


[contents](#contents)

### Patients in LabKey

In [None]:
# load patient vital status
# also has stage / grade / lymph nodes etc.
# Each clinical field is collapsed into a small set of categories:
#   Stage ('0','I'..'IV'), Grade ('1'..'4'), LV_Invasion / LN_Pos ('YES'/'NO'),
#   Survival (0 = alive, 1 = dead). Unknown values become pd.NA.
df_vital = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/FMP_Patients_Nov17_2021.xlsx',sheet_name='Sheet1')

# Staging reference used for the collapse below:
# Stage 1: cancer is no more than 4cm in size and has not spread outside the pancreas.
#   1A = T1, N0, M0; 1B = T2, N0, M0
# Stage 2A: cancer is bigger than 4cm but still within the pancreas; no spread
#   to lymph nodes or other areas of the body. 2A = T3, N0, M0.
#   2B = T1, 2 or 3, N1, M0.
# Stage 3: cancer of any size within the pancreas that has spread to 4 or more
#   nearby lymph nodes (T1, 2 or 3, N2, M0), or cancer that has started to grow
#   outside the pancreas into the major blood vessels nearby, with or without
#   lymph node spread and no spread to other areas of the body (T4, Any N, M0).
# Stage 4: advanced (metastatic) cancer = Any T, Any N, M1.

#collapse stage
# 'c2B' was previously mapped to 'I', unlike every other 2A/2B code
# ('p2B', '2B - IIB', 'c2A', 'c2' are all 'II'); it is mapped to 'II' here.
d_stage = {'2B - IIB':'II', 'p2A':'II', 'p2B':'II', '4 - IV':'IV', '2A - IIA':'II', 'p3':'III', '1B - IB':'I',
    'c1B':'I', 'c2B':'II', 'nan':pd.NA, 'p4':'IV','2B - T1-3, N1, M0':'II','p0':'0','p1B':'I','c1':'I',
    'c4':'IV','c3':'III','p1A':'I','pNA':pd.NA,'3 - T4, Any N, M0':'III','c2A':'II','p4B':'IV','c4B':'IV',
     '2B - T1, N1, M0 / T2, N1, M0 / T3, N1, M0':'II','pUNK':pd.NA,'p2':'II','p3B':'III','c2':'II','p3A':'III',
    '3 - III':'III','99 - Unknown':pd.NA,'1A - IA':'I','c3A':'III','c4A':'IV','p1':'I','c1A':'I','p4A':'IV',
    '88 - Not applicable to 7th Edition staging':pd.NA,
           '88 - No classification is recommended in 6th Edition':pd.NA,
    '2A - T3, N0, M0':'II','4 - Any T, Any N, M1':'IV'}
df_vital['Stage'] = df_vital.loc[:,'Stage Grouping _ Dominant'].replace(d_stage)

#collapse grade
d_grade = {'Grade II  Moderately Diff / Mod Well Diff':'2',
       'Grade I   Well Differentiated/Differentiated':'1',
       'Cell type not determined; not stated;N/A;Unk; high grade dysplas':pd.NA,
       'Grade III Poorly Differentiated':'3', 'nan':pd.NA,'Grade IV Undifferentiated, Anaplastic':'4',
         'B-CELL    LYMPHOMA OR LEUKEMIA ONLY':pd.NA}
df_vital['Grade'] = df_vital.loc[:,'Grade_Differentiation'].replace(d_grade)

#collapse LV invasion
d_replace = {'nan':pd.NA,np.nan:pd.NA, 'LYMPHOVASCULAR INVASION STATED AS NOT PRESENT':'NO',
       'LYMPHOVASCULAR INVASION PRESENT/IDENTIFIED':'YES',
       'Unknown/Indeterminate':pd.NA, 'NOT APPLICABLE':pd.NA,
       'Lymph-vascular Invasion Present/Identified':'YES',
       'LYMPHATIC AND SMALL VESSEL INVASION ONLY (L)':'YES',
       'BOTH LYMPHATIC AND SMALL VESSEL AND VENOUS (LARGE VESSEL) INVASION':'YES'}
df_vital['LV_Invasion'] = df_vital.loc[:,'Lymph_vascular Invasion'].replace(d_replace)

# LN positivity: at least one positive regional lymph node -> 'YES',
# fewer than one -> 'NO', count missing -> NA
df_vital['LN_Pos'] = pd.NA
df_vital.loc[df_vital.loc[:,'Regional Lymph Nodes Positive'] >= 1,'LN_Pos'] = True
df_vital.loc[df_vital.loc[:,'Regional Lymph Nodes Positive'] < 1,'LN_Pos'] = False
df_vital.loc[df_vital.loc[:,'Regional Lymph Nodes Positive'].isna(),'LN_Pos'] = pd.NA
df_vital['LN_Pos']  = df_vital.LN_Pos.replace({True:'YES',False:'NO'})

# event indicator for survival analysis
df_vital['Survival'] = df_vital.cVitalStatus.replace({'Alive':0,'Dead':1})
# sanity check: categories that remain after collapsing
print(df_vital.Grade.unique())
print(df_vital.Stage.unique())
print(df_vital.LN_Pos.unique())
print(df_vital.LV_Invasion.unique())
print(len(df_vital))

In [None]:
# kaplan meier
# %matplotlib inline
# # check prognostic value of clinicopathologiocal variables

# ls_vital = ['Stage', 'Grade','LV_Invasion','LN_Pos'] 
# s_time = 'cDays from Diagnosis to FU'
# s_censor = 'Survival'
# for s_vital in ls_vital:
#     print(s_vital)
#     df = df_vital.loc[df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas',[s_vital,s_time,s_censor]].dropna(how='any')
#     fig, __ = util.km_plot(df,s_vital,s_time,s_censor)
#     fig.savefig(f'figures/KM_clinicopath_{s_vital}.png')
#     #break

#CPH
# Single-variable model per clinicopathological variable, fitted and plotted
# by the project helper util.cph_plot (presumably Cox proportional hazards,
# going by the name - confirm in util). Only pancreas-primary rows with all
# three columns present are used.
s_time = 'cDays from Diagnosis to FU'
#df_vital['Survival_time'] = df_vital.loc[:,s_time]
s_censor = 'Survival'
ls_vital = ['LV_Invasion','Stage', 'Grade','LN_Pos'] #,'Age'
for s_vital in ls_vital:
    print(s_vital)
    df = df_vital.loc[df_vital.loc[:,'Primary Site _ Major Groups For Staging']=='Pancreas',[s_vital,s_time,s_censor]].dropna(how='any')
    # df only holds [s_vital, s_time, s_censor], so the two checks below fire
    # only when s_vital itself is 'Stage' / 'Grade'; both are cast to integers
    if df.columns.isin(['Stage']).any():
        df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
    if df.columns.isin(['Grade']).any():
        df.Grade = df.Grade.astype('int')
    # remaining object-dtype variables are dummy-coded (first level dropped);
    # s_vital is renamed to the first dummy column so the plot and file name use it
    if df.loc[:,s_vital].dtype=='O':
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
        df.drop(s_vital,axis=1,inplace=True)
        s_vital = df_dummy.columns[0]
        df[s_vital] = df_dummy
    fig, cph = util.cph_plot(df,s_vital,s_time,s_censor,figsize=(3,1.5))
    plt.tight_layout()
    fig.savefig(f'figures/CPH_single_{s_vital}_all.png')
    plt.close(fig)
    # NOTE(review): this break stops the loop after the first variable
    # ('LV_Invasion'), so Stage, Grade and LN_Pos are never fitted - looks like
    # leftover debugging; confirm before relying on the saved figures.
    break

### Patients in Paper 

n=434 specimens

n=422 patients

In [None]:
#load full patient data, neoadjuvant
# Reads the per-patient sheet, labels each patient's cohort, derives a binary
# neoadjuvant flag and loads the TCR tables (kept separate from df_patient).
df_patient = pd.read_excel('data/SupplementalDataset1.xlsx', sheet_name='Patients - Tab 1')
df_patient = df_patient.rename(columns={'Patient ID':'Public_Patient_ID'})

# 'Lung' is written first and 'Liver' second, so a patient flagged 'YES' for
# both sites ends up labelled 'Liver'
df_patient['Cohort'] = pd.NA
df_patient.loc[df_patient['Lung Met Present'].eq('YES'),'Cohort'] = 'Lung'
df_patient.loc[df_patient['Liver Met Present'].eq('YES'),'Cohort'] = 'Liver'

#how many in cohorts
for s_site in df_patient.Cohort.dropna().unique():
    n_patients = df_patient.loc[df_patient.Cohort==s_site,'Public_Patient_ID'].nunique()
    print(f'{s_site} {n_patients}')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

#define neoadjuvant binary: 'Yes' for 'Yes Neoadjuvant', 'No' for everything else
b_neoadjuvant = df_patient['Neoadjuvant Treatment'] == 'Yes Neoadjuvant'
df_patient.loc[b_neoadjuvant,'Neoadjuvant'] = 'Yes'
df_patient['Neoadjuvant'] = df_patient['Neoadjuvant'].fillna('No').replace('nan','No')

#load T cell data (tumor and blood sheets of the same workbook)
s_tcr_xlsx = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset6.xlsx'
df_tcell_tumor = pd.read_excel(s_tcr_xlsx, sheet_name='Tumor Samples')
df_tcell_blood = pd.read_excel(s_tcr_xlsx, sheet_name='Blood Samples')

#merge T cell blood and tumor; shared column names get ' tumor' / ' blood' suffixes
df_tcell = (
    df_tcell_tumor
    .merge(df_tcell_blood, on='Patient ID', suffixes=(' tumor',' blood'), how='outer')
    .rename(columns={'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
                     'Templates per ng':'Templates_per_ng',
                     'Patient ID':'Public_Patient_ID'})
)
print(f'Add TCR patients {len(df_tcell)}')
# #merge - no, add TCR later
# df_patient = df_patient.merge(df_tcell,on='Public_Patient_ID',how='left',suffixes=('','_x'))
# df_patient['INDEX'] = df_patient.index + 1
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')
#all 422 have OS and vital status
n_missing_os = df_patient['Days from Diagnosis to FU'].isna().sum()
n_missing_vital = df_patient['Vital Status at FU'].isna().sum()
print(f"Missing OS: {n_missing_os}")
print(f"Missing Vital Status: {n_missing_vital}")

In [None]:
# #61?
# set([item.split('-')[2] for item in df_gsva.loc[:,'Patient Specimen ID']])
# df_purist.loc['ST-00014524']
# df_gsva[df_gsva.Public_Patient_ID=='ST-00014524']

In [None]:
#load Purist
# Reads every sheet of the workbook, pivots the PurIST score to one row per
# patient (one column per specimen suffix) and merges the primary / met
# scores into df_patient.
s_purist_xlsx = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset1.xlsx'
d_gsva = pd.read_excel(s_purist_xlsx, sheet_name=None)
df_gsva = (
    d_gsva['Specimen Subtype - Tab 2']
    .loc[:,['PurIST Score','Patient Specimen ID','PurIST Subtype','Patient ID']]
    .rename(columns={'Patient ID':'Public_Patient_ID'})
)
# specimen suffix = text after the last '-' of the specimen id
df_gsva['Specimen_ID'] = [s_specimen.split('-')[-1] for s_specimen in df_gsva['Patient Specimen ID']]
df_gsva = df_gsva.drop(columns='Patient Specimen ID')
df_purist = df_gsva.pivot(index='Public_Patient_ID',columns=['Specimen_ID'],values='PurIST Score')
# primary score: 'T', falling back to 'F' and then 'T2'
df_purist['PurIST_Primary'] = df_purist['T'].fillna(df_purist['F']).fillna(df_purist['T2'])
df_purist['PurIST_Primary_T2'] = df_purist['T2']
#odd that there are only 71 mets with purist calls
# met score: 'M', falling back to 'M2'
df_purist['PurIST_Met'] = df_purist['M'].fillna(df_purist['M2'])
df_purist_flat = df_purist.reset_index()
print(f'Pts with RNAseq: {df_purist_flat.Public_Patient_ID.nunique()}')
ls_purist_cols = ['PurIST_Primary','PurIST_Met','PurIST_Primary_T2','Public_Patient_ID']
df_patient = df_patient.merge(df_purist_flat.loc[:,ls_purist_cols],
                              on='Public_Patient_ID',how='left')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

In [None]:
# Variant table (tab separated); dots in its column names become spaces.
df_gene2 = pd.read_csv('data/HighMedImpactVariantDataForStatsFromMH.tsv', sep='\t')
df_gene2.columns = df_gene2.columns.str.replace('.', ' ', regex=False)

#use tempus
s_mutation_xlsx = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset2.xlsx'
df_gene = pd.read_excel(s_mutation_xlsx, sheet_name='Mutation Data')

def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call ``func(*args, **kwargs)``; return its result, or ``np.nan`` if it raises.

    ``handle`` is part of the signature but is never called: any ``Exception``
    raised by ``func`` is swallowed and replaced by ``np.nan``.
    """
    try:
        result = func(*args, **kwargs)
    except Exception:
        result = np.nan
    return result
# Function label = the text after ' - ' in 'Alteration Type'; NaN when the
# split fails (catch swallows the exception).
df_gene['Alteration_Function'] = [
    catch(lambda: s_type.split(' - ')[1])
    for s_type in df_gene['Alteration Type']
]
# '<gene>_<function>'. Rows without a function label are filled with '__',
# giving '<gene>___', and the '___' is then removed to leave the gene name.
se_gene_function = df_gene['Gene'] + '_' + df_gene['Alteration_Function'].fillna('__')
df_gene['Gene_Function'] = [s_label.replace('___','') for s_label in se_gene_function]
df_gene.head()

In [None]:
# Share of rows per alteration-function label, then the share of rows with no label.
# Only the last expression of a cell is displayed, so the first table is
# printed explicitly; it was previously computed and discarded.
print(df_gene.Alteration_Function.value_counts()/len(df_gene))
df_gene.Alteration_Function.isna().sum()/len(df_gene)

In [None]:
#add mutation data, if there are more than 9 patient with a mutation
# For every gene altered in more than 9 patients (or whose name contains
# 'BRCA'), add <gene>_Altered_Primary, <gene>_Altered_Met and <gene>_Altered
# columns to df_patient: True = altered, False = sequenced but not altered,
# NA = no sequenced specimen of that kind.
# df_gene = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/SupplementalDataset2.xlsx',
#                            sheet_name='Mutation Data')
#which patients were sequenced
se_patients = pd.Series(df_gene.loc[:,'Patient Specimen ID'].unique())
print(f"number of genomics specimens: {len(se_patients)}")
# specimen ids containing '-M' are treated as mets, all others as primaries;
# the patient id is the first two '-'-separated fields of the specimen id
se_mets = se_patients[se_patients.str.contains('-M')]
ls_met = [item.split('-')[0] + '-' + item.split('-')[1] for item in se_mets]
se_pri = se_patients[~se_patients.str.contains('-M')]
ls_pri = [item.split('-')[0] + '-' + item.split('-')[1] for item in se_pri]
print(f'no. primary specimens {len(ls_pri)}')
print(f'no. met specimens {len(ls_met)}')
print(f'no. unique pts {len(set(ls_pri+ls_met))}')
print(f'no. matched pts {len(set(ls_pri).intersection(set(ls_met)))}')
print(f'no. duplicated pri specimens {pd.Series(ls_pri).duplicated().sum()}')
print(f'no. duplicated met specimens {pd.Series(ls_met).duplicated().sum()}')

# patients with a sequenced specimen, per column suffix; used to turn NA into False
d_fill = {'_Primary':ls_pri,'_Met':ls_met,'':ls_pri+ls_met}
ls_genes = []
for s_gene in df_gene.Gene.unique():  
    df_gene_alt = df_gene.loc[df_gene.Gene==s_gene,['Patient ID','Patient Specimen ID','Gene']].copy()
    if df_gene_alt.loc[:,'Patient ID'].nunique() > 9 or s_gene.find('BRCA') > -1:
        print(s_gene)
        # one row per patient, one column per specimen suffix; the cell holds the gene name when altered
        df_gene_alt['Specimen_ID'] = [item.split('-')[-1] for item in df_gene_alt.loc[:,'Patient Specimen ID']]
        df_pivot = df_gene_alt.drop_duplicates().pivot(index='Patient ID',columns=['Specimen_ID'],values='Gene')
        # primary: 'T', falling back to 'F' when that column exists
        # NOTE(review): the bare excepts in this cell catch every exception,
        # not only the missing 'F' / 'M2' column - consider narrowing them
        try:
            df_pivot[f'{s_gene}_Altered_Primary'] = df_pivot.loc[:,'T'].fillna(df_pivot.F).replace({s_gene:True})#.fillna(False)
        except:
            df_pivot[f'{s_gene}_Altered_Primary'] = df_pivot.loc[:,'T'].replace({s_gene:True})#.fillna(False)
        # met: 'M', falling back to 'M2' when that column exists; no met column without an 'M' specimen
        if df_pivot.columns.isin(['M']).any():
            ls_col = [f'{s_gene}_Altered_Primary',f'{s_gene}_Altered_Met']
            try:
                df_pivot[f'{s_gene}_Altered_Met'] = df_pivot.loc[:,'M'].fillna(df_pivot.M2).replace({s_gene:True})#.fillna(False)
            except:
                df_pivot[f'{s_gene}_Altered_Met'] = df_pivot.loc[:,'M'].replace({s_gene:True})#.fillna(False)
        else:
            ls_col = [f'{s_gene}_Altered_Primary']
        df_pivot = df_pivot.loc[:,ls_col]
        df_pivot[f'{s_gene}_Altered'] = pd.NA 
        df_pivot = df_pivot.reset_index().rename({'Patient ID':'Public_Patient_ID'},axis=1)#,inplace=True
        # overall flag: True when any of the columns after the patient id is truthy
        df_pivot.loc[(df_pivot.iloc[:,1::]).any(axis=1),f'{s_gene}_Altered'] = True
        df_patient = df_patient.merge(df_pivot,on='Public_Patient_ID',how='left',suffixes=('','__'))
        # add the false values (different from na)
        for s_loc in ['_Primary','_Met','']:
            # the except branch is reached e.g. when the gene has no '_Met' column
            try:
                b_na = df_patient.loc[:,f'{s_gene}_Altered{s_loc}'].isna()
                b_loc = df_patient.Public_Patient_ID.isin(d_fill[s_loc])
                df_patient.loc[(b_na) & (b_loc),f'{s_gene}_Altered{s_loc}'] = False 
                #print(df_patient.loc[:,f'{s_gene}_Altered{s_loc}'].value_counts())
                ls_genes.append(f'{s_gene}_Altered{s_loc}')
            except:
                print(f'no {s_loc}')
    #break
#gain versus loss
#'''
# hard-coded patients labelled LOF by the author; they are moved out of
# ARID1A_Altered into a separate ARID1aLOF_Altered flag
ls_arid_pts =['ST-00007313','ST-00007175','ST-00015114','ST-00020482',
 'ST-00019171','ST-00019171','ST-00007120','ST-00007120'] #LOF
print(df_patient.ARID1A_Altered.value_counts())
df_patient['ARID1aLOF_Altered'] = pd.NA
df_patient.loc[df_patient.ARID1A_Altered.notna(),'ARID1aLOF_Altered'] = False
df_patient.loc[(df_patient.Public_Patient_ID.isin(ls_arid_pts)) & ((df_patient.ARID1A_Altered)),'ARID1aLOF_Altered'] = True
df_patient.loc[df_patient.Public_Patient_ID.isin(ls_arid_pts),'ARID1A_Altered'] = False
ls_genes.append('ARID1aLOF_Altered')
print(df_patient.ARID1A_Altered.value_counts())
print(df_patient.ARID1aLOF_Altered.value_counts())

# gene prefixes that received columns above, and how many reach 10 altered patients
ls_more_than_10 = sorted(set([item.split('_')[0] for item in ls_genes]))
se_test = df_patient.loc[:,[f'{item}_Altered' for item in ls_more_than_10]].sum()
print(f"genes with 10 or more: {len(ls_more_than_10) - len(se_test[se_test < 10])}")
#'''

In [None]:
# Number of non-NA RNF43 entries: overall, primary and met.
for s_rnf43_col in ['RNF43_Altered', 'RNF43_Altered_Primary', 'RNF43_Altered_Met']:
    print(df_patient[s_rnf43_col].notna().sum())

In [None]:
#add age, filter patients who died after surgery
# Maps the clinical table (df_vital) onto public patient ids, merges it into
# df_patient and adds age category, 30-day post-surgery survival and
# per-cohort boolean columns.
d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/OLD Versions/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

df_id = d_ids['RnaSeqKey']
ls_ids = df_id.loc[:,'Public.Specimen.ID']

# add patients w/o RNA seq (rows whose public specimen id is not in the RNA-seq key)
for s_key in ['TcrTumorKey','TcrBloodKey','SuppTable2Key','DnaPanelKey']:
    df_add = d_ids[s_key].loc[~d_ids[s_key].loc[:,'Public.Specimen.ID'].isin(ls_ids)]
    df_id = pd.concat([df_id,df_add])

#check
print(df_id.loc[:,'OPTR.Specimen.ID'].duplicated().any())
# Keep the '-T' specimens that are not '-T2'.
# df_id keeps each sheet's own row labels after the concat, so labels repeat;
# dropping the '-T2' rows by label (as before) also removed unrelated rows
# that happened to share a label. A boolean mask selects by position instead.
se_optr_specimen = df_id.loc[:,'OPTR.Specimen.ID']
b_t2 = se_optr_specimen.str.contains('-T2')
b_t = se_optr_specimen.str.contains('-T')
ls_drop = df_id.loc[b_t2].index  # labels of the '-T2' rows; no longer used for dropping
df_unique = df_id.loc[b_t & ~b_t2]
#check
print(df_unique.OPTR.duplicated().any())

#add id
df_vital['Public_Patient_ID'] = df_vital.OPTR.map(dict(zip(df_unique.OPTR,df_unique.loc[:,'Biolibrary.Subject.ID'])))

#omics data plus clinical data; clashing df_vital columns get an '_x' suffix
df_patient = df_patient.merge(df_vital,on='Public_Patient_ID',how='left',suffixes=('','_x'))
print('adding cinical data')
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')
#df_id.to_csv('Patient_IDs.csv')

#add age category (rows with missing age stay NA)
df_patient.loc[df_patient.loc[:,'Age at Diagnosis'] > 70,'Age'] = '>70'
df_patient.loc[df_patient.loc[:,'Age at Diagnosis'] <= 70,'Age'] = '<=70'

# flag patients who were dead at follow-up less than 30 days after resection
# (treated as died of surgery); they are flagged, not removed
ls_drop_surgery = df_patient[(df_patient.loc[:,'Days from Resection to FU'] < 30) & (df_patient.loc[:,'cVitalStatus'] == 'Dead')].index
print('patients who died 1 month after surgery')
print(df_patient[df_patient.index.isin(ls_drop_surgery)].Public_Patient_ID)
df_patient['Alive_30_days_post_surgery'] = True
df_patient.loc[ls_drop_surgery,'Alive_30_days_post_surgery'] = False
#df_patient.Alive_30_days_post_surgery.fillna(True,inplace=True)
print('identifying died of surgery')
print(f'{df_patient.Alive_30_days_post_surgery.value_counts()}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

#add lung and liver versus all (not just lung, liver and NA)
df_patient['Lung_Cohort'] = False
df_patient.loc[df_patient.Cohort=='Lung','Lung_Cohort'] = True

df_patient['Liver_Cohort'] = False
df_patient.loc[df_patient.Cohort=='Liver','Liver_Cohort'] = True


print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

## Find survival diff in GOF, LOF, altered

In [None]:
# Kaplan-Meier plot per gene, stratified by the gene's '<gene>_<function>'
# labels. Only patients who carry an alteration of the gene contribute rows
# (unaltered patients have no Gene_Function and are dropped by dropna).
sns.set_palette('tab10')
s_time = 'Days from Diagnosis to FU'
s_censor = 'Survival'
# event indicator: 0 = alive, 1 = dead at follow-up
df_patient['Survival'] = df_patient.loc[:,'Vital Status at FU'].replace({'Alive':0,'Dead':1})
for s_gene in sorted(set([item.split('_')[0] for item in ls_genes])):

    print(s_gene)
    # rows of this gene: label is either '<gene>_<function>' or the bare gene name
    df_altered = df_gene[(df_gene.loc[:,'Gene_Function'].str.contains(f'{s_gene}_')) | (df_gene.loc[:,'Gene_Function'] == (f'{s_gene}'))].copy()
    df_altered.rename({'Patient ID':'Public_Patient_ID'},axis=1,inplace=True)
    df_merge = df_patient.merge(df_altered,on='Public_Patient_ID',how='left')
    #df_merge.loc[(df_merge.loc[:,'Gene_Function'].isna()) & (df_merge.loc[:,'MTAP_Altered'].notna()),'Gene_Function'] = 'WT'
    df_km = df_merge.loc[df_merge.Alive_30_days_post_surgery,['Public_Patient_ID',s_time,s_censor,'Gene_Function']].dropna()
    try:
        fig,ax,ls_order = util.km_plot(df_km,'Gene_Function',s_time,s_censor)
        ax.set_ylabel('Fraction Alive')
        ax.set_xlabel('Overall Survival (Days)')
        fig.savefig(f'figures/KM_GOF_LOF_{s_gene}.png')
        plt.close(fig)
    except Exception as e:
        # Plotting is best-effort: a failing gene is skipped, but the failure is
        # now reported and not silently swallowed (KeyboardInterrupt now propagates).
        print(f'KM plot skipped for {s_gene}: {type(e).__name__}: {e}')
        if s_gene == 'ARID1A':
            # fall back to recording the ARID1A loss-of-function patients
            ls_arid_pts = df_km.loc[df_km.Gene_Function=='ARID1A_LOF','Public_Patient_ID']
        

In [None]:
%matplotlib inline
for s_gene in ['ARID1A_Altered','NOTCH1_Altered','ARID1aLOF_Altered']:
    df_km = df_patient.loc[df_patient.Alive_30_days_post_surgery,['Public_Patient_ID',s_time,s_censor,s_gene]].dropna()
    df_km.loc[:,s_gene].replace({True:'Yes',False:'No'},inplace=True)
    fig,ax,ls_order = util.km_plot(df_km,s_gene,s_time,s_censor)
    ax.set_ylabel('Fraction Alive')
    ax.set_xlabel('Overall Survival (Days)')
    fig.savefig(f'figures/KM_GOF_LOF_{s_gene}.png')

In [None]:
# A patient with a resection-to-FU interval but no resection-to-recurrence
# interval had no recurrence; a patient with a resection-to-recurrence interval
# but no liver or lung met had a recurrence at another site.
se_days_to_recur = df_patient['Days from Resection to Recurrence']
se_days_to_fu = df_patient['Days from Resection to FU']

# has a documented recurrence (any site; the cohort filter is not applied here)
b_recur = se_days_to_recur.notna() #& df_patient.Cohort.isna()

# had a resection but no documented recurrence
b_no_recur = se_days_to_fu.notna() & se_days_to_recur.isna()  #& df_patient.Cohort.isna()

# store both flags as boolean columns
df_patient['Recurrence'] = b_recur
df_patient['No_Recurrence'] = b_no_recur
# should be 73 and 103, but I get 184 with a recurrence and 113 without a recurrence
print(df_patient.Recurrence.sum())
print(df_patient.No_Recurrence.sum())

# overlap of the two flags
print((df_patient.Recurrence & df_patient.No_Recurrence).sum())


In [None]:
# #what is up with these patients?
# df_patient.loc[(df_patient.Cohort.notna()) & (df_patient.No_Recurrence)].loc[:,['Public_Patient_ID','Cohort','Days from Resection to FU','Days from Resection to Recurrence']]

In [None]:
#all recurrence in one
# Single categorical column: 'Liver' / 'Lung' (cohort of patients with a
# recurrence), 'Other_site', 'No_Doc_Recur' or 'No_Resection'.
# The replace / fillna results are assigned back: the previous chained
# `inplace=True` calls act on a temporary Series and leave df_patient
# unchanged under pandas Copy-on-Write.
df_patient['Cohort'] = df_patient.Cohort.replace('nan',np.nan)
df_patient['Recurrence_Sites_4'] = np.nan
df_patient['Recurrence_Sites_4'] = df_patient['Recurrence_Sites_4'].astype('object')
df_patient.loc[df_patient.Recurrence,'Recurrence_Sites_4'] = 'Other_site'
df_patient.loc[df_patient.No_Recurrence,'Recurrence_Sites_4'] = 'No_Doc_Recur'
# a recurrence in a patient with a cohort label takes that label (aligned on the index)
df_patient.loc[(df_patient.Cohort.notna()) & (df_patient.Recurrence),'Recurrence_Sites_4'] = df_patient.Cohort
print(df_patient.Recurrence_Sites_4.isna().sum())
# anything still unlabelled had neither recurrence nor resection-to-FU data
df_patient['Recurrence_Sites_4'] = df_patient.Recurrence_Sites_4.fillna('No_Resection')
# manual overrides for two patients
#lung ST-00020218
df_patient.loc[df_patient.Public_Patient_ID=='ST-00020218','Recurrence_Sites_4'] = 'Lung'
#liver ST-00024980
df_patient.loc[df_patient.Public_Patient_ID=='ST-00024980','Recurrence_Sites_4'] = 'Liver'

for s_site in df_patient.Recurrence_Sites_4.unique():
    n_patients = df_patient[df_patient.Recurrence_Sites_4==s_site].Public_Patient_ID.nunique()
    print(f'{s_site} {n_patients}')
print(df_patient.Cohort.dropna().value_counts())
print(f'number rows {len(df_patient)}')
print(f'number unique pts. {df_patient.Public_Patient_ID.nunique()}')

#looks good
print(82 + 125 + 113 + 29 + 73) # all patients
print(82 + 29 + 113 + 73)

In [None]:
# Number of patients per recurrence category who were alive 30 days after
# surgery and are therefore included in the survival analysis.
b_alive_30 = df_patient.Alive_30_days_post_surgery
for s_site in df_patient.Recurrence_Sites_4.unique():
    b_site = df_patient.Recurrence_Sites_4==s_site
    n_patients = df_patient.loc[b_site & b_alive_30,'Public_Patient_ID'].nunique()
    print(f'{s_site} {n_patients}')
print(df_patient.loc[b_alive_30,'Cohort'].dropna().value_counts())

# 82 + 29 + 73 + 105
# 82 + 29 + 73 + 105 + 125
# 82 + 29 + 113 + 73
# 'Resected' is True when a resection-to-FU interval is recorded
df_patient['Resected'] = df_patient['Days from Resection to FU'].notna()
print(df_patient.Resected.value_counts())

In [None]:
# Merge GSVA scores (pORG / pSUB) from three TSV files into df_patient.
# The file type ('All', 'Met' or 'Primary') is the second '_'-separated token
# of the file path and decides how the scores are attached:
#   * 'All'           -> one row per specimen is pivoted to one row per patient
#   * 'Met'/'Primary' -> columns are suffixed with the type and merged directly
ls_file = ['data/GSVA_All_Kallisto55_pORG_Up_55_pSUB_Up_51.tsv',
           'data/GSVA_Met_Kallisto55_pORG_Up_55_pSUB_Up_51.tsv',
           'data/GSVA_Primary_Kallisto55_pORG_Up_55_pSUB_Up_51.tsv']
# score column name in the files -> short name used in df_patient
d_rename = {'pORG_Up_55':'pORG', 'pSUB_Up_51':'pSUB'}
# OPTR specimen ID (the index of the TSVs) -> public specimen ID
d_public_id = dict(zip(d_ids['RnaSeqKey'].loc[:,'OPTR.Specimen.ID'],d_ids['RnaSeqKey'].loc[:,'Public.Specimen.ID']))

for s_file in ls_file:
    df_pORG = pd.read_csv(s_file,sep='\t',index_col=0)
    s_type = s_file.split('_')[1]

    if s_type == 'All':
        #add 'GSVA_All': must pivot
        # add public IDs
        df_pORG['Public_Specimen_ID'] = df_pORG.index.map(d_public_id)
        # specimen suffix = text after the last '-'; patient ID = first two '-' separated parts
        df_pORG['Specimen_ID'] = [item.split('-')[-1] for item in df_pORG.Public_Specimen_ID]
        df_pORG['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_pORG.Public_Specimen_ID]
        for key, item in d_rename.items():
            # one row per patient, one column per specimen suffix
            df_group = df_pORG.pivot(index='Public_Patient_ID',columns='Specimen_ID',values=key)
            # primary score: 'T' column, falling back to 'F'; met score: 'M', falling back to 'M2'
            df_group[f'{item}_allPrimary'] = df_group.loc[:,'T'].fillna(df_group.F)
            df_group[f'{item}_allMet'] = df_group.loc[:,'M'].fillna(df_group.M2)
            df_patient = df_patient.merge(df_group.reset_index().loc[:,['Public_Patient_ID',f'{item}_allPrimary',f'{item}_allMet']],on='Public_Patient_ID',how='left',suffixes=('','_x'))
            # 'T2' specimens are kept in their own column instead of the primary/met columns
            df_patient[f'{item}_{s_type}_T2'] = df_patient.Public_Patient_ID.map(df_group.loc[:,'T2'])
            # row count after the left merge, printed for manual inspection
            print(len(df_patient))

    else:
        # every column must be a key of d_rename; scores become e.g. 'pORG_Met'
        df_pORG.columns = [f'{d_rename[item]}_{s_type}' for item in df_pORG.columns]
        # add public IDs
        df_pORG['Public_Specimen_ID'] = df_pORG.index.map(d_public_id)
        df_pORG['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_pORG.Public_Specimen_ID]
        #drop 'T2'
        # NOTE(review): df_t2 is not used again in this cell — confirm whether a later cell needs it
        df_t2 = df_pORG[df_pORG.Public_Specimen_ID.str.contains('-T2')]
        ls_drop = df_pORG[df_pORG.Public_Specimen_ID.str.contains('-T2')].index
        #merge
        # colliding column names coming from df_pORG receive the '_x' suffix
        df_patient = df_patient.merge(df_pORG.drop(ls_drop),on='Public_Patient_ID',suffixes=('','_x'),how='left')
        if s_type == 'Primary':
            # store the scores of the dropped 'T2' specimens in separate '<score>_Primary_T2' columns
            for key, item in d_rename.items():
                se_t2 = df_pORG.loc[ls_drop,['Public_Patient_ID',f'{item}_{s_type}']].set_index('Public_Patient_ID')
                df_patient[f'{item}_{s_type}_T2'] = df_patient.Public_Patient_ID.map(se_t2.to_dict()[f'{item}_{s_type}'])
            print(len(df_patient))
    #break

In [None]:
# Specimen-level annotations: keep four columns, normalise their names,
# drop specimens whose public ID contains 'T2', and split the public specimen
# ID into a patient ID (first two '-' separated parts) and a specimen suffix.
df_carl = pd.read_csv('../annotation/JasonsPaperMetaDataWithPublicIDs.tsv',sep='\t')
print(len(df_carl))
ls_col = ['Tumor.Cellularity.by.DNA','cHas.DDR.Alteration','Specimen.Site','Public.Specimen.ID']
df_carl = df_carl[ls_col].rename(columns=lambda s_name: s_name.replace('.','_').replace('cHas_',''))
ls_drop = df_carl.loc[df_carl['Public_Specimen_ID'].str.contains('T2')].index
df_carl = df_carl.drop(ls_drop)
ls_patient_id = []
ls_specimen_id = []
for s_specimen in df_carl.Public_Specimen_ID:
    ls_part = s_specimen.split('-')
    ls_patient_id.append(ls_part[0] + '-' + ls_part[1])
    ls_specimen_id.append(ls_part[-1])
df_carl['Public_Patient_ID'] = ls_patient_id
df_carl['Specimen_ID'] = ls_specimen_id
print(len(df_carl))
df_carl.head()

In [None]:
# One row per patient: for each annotation take the 'T' specimen (falling back
# to 'F') as the primary value and the 'M' specimen (falling back to 'M2') as
# the met value, then left-merge the result into df_patient.
df_optr = pd.DataFrame(index=df_carl.Public_Patient_ID.unique(),dtype='O')
df_optr.index.name = 'Public_Patient_ID'
ls_col = ['Specimen_Site','DDR_Alteration','Tumor_Cellularity_by_DNA']
df_group = df_carl.pivot(index='Public_Patient_ID',columns='Specimen_ID',values=ls_col)
for s_col in ls_col:
    df_col = df_group[s_col]
    # only the specimen-site columns carry the 'RNA_DNA_' prefix
    s_prefix = f'RNA_DNA_{s_col}' if s_col == 'Specimen_Site' else s_col
    df_optr[f'{s_prefix}_Primary'] = df_col['T'].fillna(df_col['F'])
    df_optr[f'{s_prefix}_Met'] = df_col['M'].fillna(df_col['M2'])

df_patient = df_patient.merge(df_optr.reset_index(),on='Public_Patient_ID',how='left')

In [None]:
# Row count next to the unique-patient count, for manual comparison after the merges above
n_rows_total = len(df_patient)
n_pts_unique = df_patient.Public_Patient_ID.nunique()
print(f'number rows {n_rows_total}')
print(f'number unique pts. {n_pts_unique}')

In [None]:
# df_gene[df_gene.loc[:,'Patient ID'].isin(se_optr)]
# df_gene.loc[:,'Patient Specimen ID'].nunique()

In [None]:
# Panel, but no mutation data: 4044 4139 4404 4701 4731 4793
# These samples were sequenced but have no calls. Their '*Altered_Primary'
# columns are set to False so they are included in the statistics and shown
# on the graphs.
OPTRsWithPanelButNoMutations = [4044, 4139, 4404, 4701, 4731, 4793]
b_panel_only = df_id.OPTR.isin(OPTRsWithPanelButNoMutations)
se_optr = df_id.loc[b_panel_only,'Biolibrary.Subject.ID']
b_placeholder_rows = df_patient.Public_Patient_ID.isin(se_optr)
b_primary_alt_cols = df_patient.columns.str.contains('Altered_Primary')
df_patient.loc[b_placeholder_rows, b_primary_alt_cols] = False
#Primary: 5 or six already had a False in the data (should have been NA)
#df_patient.loc[df_patient.Public_Patient_ID.isin(se_optr),df_patient.columns.str.endswith('Altered')]
#Met: 6 of 6 had False in data
#now we have 202 pri - but one has no RNA
print(df_patient.TP53_Altered_Primary.notna().sum())
# number of patients that have both a primary TP53 call and a primary pORG score
se_pts_dna = df_patient.loc[df_patient.TP53_Altered_Primary.notna(),'Public_Patient_ID']
se_pts_rna = df_patient.loc[df_patient.pORG_Primary.notna(),'Public_Patient_ID']
len(set(se_pts_dna).intersection(se_pts_rna))

In [None]:
# Attach the tumor tissue site from the TCR sample sheet to each patient as
# 'TCR_Met_Site', with a few site spellings harmonised.
df_dove = pd.read_excel('../annotation/Adaptive-total samples-metadata-mets.xlsx',index_col=0)
print(df_dove.index.duplicated().any())
# sheet index (matched against df_id.OPTR) -> public patient ID
d_optr_to_public = dict(zip(df_id.OPTR, df_id['Biolibrary.Subject.ID']))
df_dove['Public_Patient_ID'] = df_dove.index.map(d_optr_to_public)
print(df_dove.Public_Patient_ID.duplicated().any())
d_patient_to_site = dict(zip(df_dove.Public_Patient_ID, df_dove['Tumor tissue site']))
d_replace = {'Supraclavicular lymph node':'Lymph Node','peritoneum':'Peritoneum',
             'lung':'Lung','omentum':'Omentum'}
df_patient['TCR_Met_Site'] = df_patient.Public_Patient_ID.map(d_patient_to_site).replace(d_replace)
df_patient.groupby('Cohort').TCR_Met_Site.value_counts()#.sum()

In [None]:
#histology: add Desmoplasia +/- !!
# First histology sheet: derive the numeric OPTR ID from the specimen ID,
# normalise column names and collapse the tertiary-lymphoid-structure
# annotation into a 'TLS' column.
df_ter = pd.read_excel('annotation/Histology Analyses - Terry and Brian.xlsx',sheet_name='final for TM batch4 list2')
df_ter['OPTR'] = [int(item.split('-')[0]) for item in df_ter.loc[:,'OPTR Specimen ID']]
df_ter = df_ter.rename({'OPTR Specimen ID':'SpecimenID','PNI (Y/N)':'PNI'},axis=1)
s_tls_col = 'Tertiary Lymph Strucures (Peritumoral, intratumoral)'
# Write the harmonised values back explicitly: replace(..., inplace=True) on a
# `.loc[:, col]` selection modifies a temporary and is lost under pandas
# Copy-on-Write, which would leave 'Rare peritumoral' unmapped in 'TLS'.
df_ter[s_tls_col] = df_ter[s_tls_col].replace({'Rare peritumoral':'Peritumoral'})
df_ter['TLS'] = df_ter[s_tls_col].replace({'Peritumoral':'Y', 'Intratumoral and peritumoral':'Y', 'Intratumoral':'Y'})

# Lookup tables from the free-text inflammation column to Y/N flags:
# any inflammation, acute, chronic and plasmacytoid respectively.
s_replace = 'Acute, Chronic, and/or plasmacytoid inflammation'
d_replace_imm = {'Acute, chronic':'Y', 'Acute':'Y', 'Chronic, plasmacytoid':'Y',
 'Acute, chronic, plasmacytoid':'Y', 'Chronic':'Y', 'None':'N', ' Acute, chronic':'Y'}

d_replace_acute = {'Acute, chronic':'Y', 'Acute':'Y', 'Chronic, plasmacytoid':'N',
 'Acute, chronic, plasmacytoid':'Y', 'Chronic':'N', 'None':'N', ' Acute, chronic':'Y'}

d_replace_chr = {'Acute, chronic':'Y', 'Acute':'N', 'Chronic, plasmacytoid':'Y',
 'Acute, chronic, plasmacytoid':'Y', 'Chronic':'Y', 'None':'N', ' Acute, chronic':'Y'}

d_replace_pl = {'Acute, chronic':'N', 'Acute':'N', 'Chronic, plasmacytoid':'Y',
 'Acute, chronic, plasmacytoid':'Y', 'Chronic':'N', 'None':'N', ' Acute, chronic':'N'}
d_pni = {'                           ':np.nan,'Y+':'Y',
        'in  slide 2, but not circled ROI':'Y', 'n':'N', 'yes':'Y', 'N/a':np.nan}

# Derive the Y/N inflammation flags, tumor type and PNI for the first
# histology sheet from the lookup tables defined above.
df_ter['Inflammation'] = df_ter.loc[:,s_replace].replace(d_replace_imm)
df_ter['Acute Inflammation'] =df_ter.loc[:,s_replace].replace(d_replace_acute)
df_ter['Chronic Inflammation'] =df_ter.loc[:,s_replace].replace(d_replace_chr)
df_ter['Plasmacytoid Inflammation'] =df_ter.loc[:,s_replace].replace(d_replace_pl)
# tumor type = specimen-ID suffix, with 'F' replaced by 'T'
df_ter['Tumor_Type'] = [item.split('-')[-1].replace('F','T') for item in df_ter.SpecimenID]
df_ter['PNI'] =df_ter.loc[:,'PNI'].replace(d_pni)
# Assign the result back: replace(..., inplace=True) on `df_ter[col]` is a
# chained in-place update that does not reach df_ter under pandas Copy-on-Write.
df_ter['Desmoplasia (Y/N)'] = df_ter['Desmoplasia (Y/N)'].replace({'Y+':'Y'})

# second
# Second histology sheet. The lookup tables below translate its free-text
# 'Acute or Chronic Inflammation' and 'Diagnosis TM_Ben' entries into Y/N flags
# and tumor-type codes. They reuse the names d_replace_imm / d_replace_acute /
# d_replace_chr / d_replace_pl and therefore overwrite the tables that were
# defined for the first sheet.
df_ter2 = pd.read_excel('annotation/Digital Slide Tempus Final Dx tkm_bs_2021.xlsx',sheet_name='final for TM batch4 list2')
# nullable integer dtype, so missing subject IDs stay <NA>
df_ter2['OPTR'] = df_ter2.loc[:,'Subject ID (BC ID#)'].astype(pd.Int64Dtype())
df_ter2.rename({'PN +/-':'PNI'},axis=1,inplace=True)
#two
#Acute or Chronic Inflammation
# Diagnosis TM_Ben
# (disabled helper: prints the unique values of every low-cardinality column)
# for s_col in df_ter2.columns:
#     print(s_col)
#     ls_un = df_ter2.loc[:,s_col].unique()
#     if len(ls_un) < 25:
#         print(ls_un)
        


# free text -> tertiary lymphoid structure flag
d_replace_tls = { 'Negative':'N', 'LN aggregates with plasma cells':'Y',
 'Severe acute inflammation':'N', 'mild chronic with plasma cells':'N',
 'Acute inflammation':'N', 'Acute &  chronic with plasma cells':'N',
 'LN aggregates &  plasma cells (acute in infarcted zone= false pos)':'Y',
 'Areas of elastosis are negative for PDAC (seems FP were targeted for LCM)':np.nan,
 'LN aggregates with numerous plasma cells':'Y', 'N/a':np.nan,
 'mild chronic with plasma cells (at edges of tumor there are LN aggregates, not in tumor)':'Y',
 'Marked LN aggregates with plasma cells throughout tumor':'Y',
 'Amazing sheets of lymphocytes in papillary stroma of IPMN':'Y',
 'Marked acute &  chronic with plasma cells':'N',
 'Negative (inflammation only around PanIN 2)':'N',
 'Negative (acute only in area of tumor infarction ~biopsy site)':'N'}

# free text -> any inflammation
d_replace_imm = { 'Negative':'N', 'LN aggregates with plasma cells':'Y',
 'Severe acute inflammation':'Y', 'mild chronic with plasma cells':'Y',
 'Acute inflammation':'Y', 'Acute &  chronic with plasma cells':'Y',
 'LN aggregates &  plasma cells (acute in infarcted zone= false pos)':'Y',
 'Areas of elastosis are negative for PDAC (seems FP were targeted for LCM)':np.nan,
 'LN aggregates with numerous plasma cells':'Y', 'N/a':np.nan,
 'mild chronic with plasma cells (at edges of tumor there are LN aggregates, not in tumor)':'Y',
 'Marked LN aggregates with plasma cells throughout tumor':'Y',
 'Amazing sheets of lymphocytes in papillary stroma of IPMN':'Y',
 'Marked acute &  chronic with plasma cells':'Y',
 'Negative (inflammation only around PanIN 2)':'N',
 'Negative (acute only in area of tumor infarction ~biopsy site)':'N'}

# free text -> acute inflammation
d_replace_acute = { 'Negative':'N', 'LN aggregates with plasma cells':'N',
 'Severe acute inflammation':'Y', 'mild chronic with plasma cells':'N',
 'Acute inflammation':'Y', 'Acute &  chronic with plasma cells':'Y',
 'LN aggregates &  plasma cells (acute in infarcted zone= false pos)':'N',
 'Areas of elastosis are negative for PDAC (seems FP were targeted for LCM)':np.nan,
 'LN aggregates with numerous plasma cells':'N', 'N/a':np.nan,
 'mild chronic with plasma cells (at edges of tumor there are LN aggregates, not in tumor)':'N',
 'Marked LN aggregates with plasma cells throughout tumor':'N',
 'Amazing sheets of lymphocytes in papillary stroma of IPMN':'N',
 'Marked acute &  chronic with plasma cells':'Y',
 'Negative (inflammation only around PanIN 2)':'N',
 'Negative (acute only in area of tumor infarction ~biopsy site)':'N'}

# free text -> chronic inflammation
d_replace_chr = { 'Negative':'N', 'LN aggregates with plasma cells':'Y',
 'Severe acute inflammation':'N', 'mild chronic with plasma cells':'Y',
 'Acute inflammation':'N', 'Acute &  chronic with plasma cells':'Y',
 'LN aggregates &  plasma cells (acute in infarcted zone= false pos)':'Y',
 'Areas of elastosis are negative for PDAC (seems FP were targeted for LCM)':np.nan,
 'LN aggregates with numerous plasma cells':'Y', 'N/a':np.nan,
 'mild chronic with plasma cells (at edges of tumor there are LN aggregates, not in tumor)':'Y',
 'Marked LN aggregates with plasma cells throughout tumor':'Y',
 'Amazing sheets of lymphocytes in papillary stroma of IPMN':'Y',
 'Marked acute &  chronic with plasma cells':'Y',
 'Negative (inflammation only around PanIN 2)':'N',
 'Negative (acute only in area of tumor infarction ~biopsy site)':'N'}


# free text -> plasmacytoid inflammation
d_replace_pl = { 'Negative':'N', 'LN aggregates with plasma cells':'Y',
 'Severe acute inflammation':'N', 'mild chronic with plasma cells':'Y',
 'Acute inflammation':'N', 'Acute &  chronic with plasma cells':'Y',
 'LN aggregates &  plasma cells (acute in infarcted zone= false pos)':'Y',
 'Areas of elastosis are negative for PDAC (seems FP were targeted for LCM)':np.nan,
 'LN aggregates with numerous plasma cells':'Y', 'N/a':np.nan,
 'mild chronic with plasma cells (at edges of tumor there are LN aggregates, not in tumor)':'Y',
 'Marked LN aggregates with plasma cells throughout tumor':'Y',
 'Amazing sheets of lymphocytes in papillary stroma of IPMN':'N',
 'Marked acute &  chronic with plasma cells':'Y',
 'Negative (inflammation only around PanIN 2)':'N',
 'Negative (acute only in area of tumor infarction ~biopsy site)':'N'}

# diagnosis text -> tumor type code ('T', 'M', or NaN)
d_replace_type = {'PDAC':'T', 'Adenosquamous':'T', 'PDAC + Pan1':'T', 'IPMN 3/PDAC':'T',
       'IPMN 2':np.nan, 'Met PDAC (lung)':'M', 'PDAC (post Tx?) +Pan3':'T', 'PDAC ':'T',
       'Negative (post Tx?)':np.nan, 'Colloid PDAC':'T', 'Met vs primary lung':'M',
       'Met to liver (r/o Angiosarcoma)':'M', 'Met (PDAC) to lymph node':'M',
       'PDAC (post Tx?)':'T', 'ACA r/o colon primary':np.nan, 'Cytology PDAC ':'T',
       'IPMN ':np.nan, 'PDAC + Pan2':'T', 'Ampulllary ACA':'T'}
# 'ALI +/-' spellings -> Y/N
d_ali = { 'n':'N', 'YES':'Y', 'N/a':np.nan, 'no':'N'}
# Apply the lookup tables to the second histology sheet; all five inflammation
# flags are derived from the same free-text column.
s_replace = 'Acute or Chronic Inflammation'
d_flag_tables = {'TLS': d_replace_tls,
                 'Inflammation': d_replace_imm,
                 'Acute Inflammation': d_replace_acute,
                 'Chronic Inflammation': d_replace_chr,
                 'Plasmacytoid Inflammation': d_replace_pl}
for s_flag, d_table in d_flag_tables.items():
    df_ter2[s_flag] = df_ter2[s_replace].replace(d_table)
df_ter2['Tumor_Type'] = df_ter2['Diagnosis TM_Ben'].replace(d_replace_type)
df_ter2['PNI'] = df_ter2['PNI'].replace(d_pni)
d_desmoplasia = {np.nan:pd.NA, 'y':'Y',
                 'NO':'N', 'n':'N',
                 'N/a':pd.NA, 'yes':'Y',}
df_ter2['Desmoplasia (Y/N)'] = df_ter2['Desmoplasia +/-'].replace(d_desmoplasia)
df_ter2['ALI (Y/N)'] = df_ter2['ALI +/-'].replace(d_ali)

# Combine both histology sheets and merge the primary ('T') and met ('M')
# annotations into df_patient.
ls_col = [ 'TLS', 'Inflammation','PNI','ALI (Y/N)','Desmoplasia (Y/N)',
       'Acute Inflammation', 'Chronic Inflammation',
       'Plasmacytoid Inflammation','Tumor_Type','OPTR',]
#'''
# keep only fully annotated rows of each sheet
df_ter = df_ter[ls_col].dropna()
df_ter2 = df_ter2[ls_col].dropna()
# primaries of the second sheet are dropped when the first sheet already covers that OPTR
b_covered_primary = df_ter2.OPTR.isin(df_ter.OPTR) & (df_ter2.Tumor_Type=='T')
ls_drop = df_ter2[b_covered_primary].index

#concat
df_hist = pd.concat([df_ter,df_ter2.drop(ls_drop)])
d_optr_to_public = dict(zip(df_id.OPTR, df_id['Biolibrary.Subject.ID']))
df_hist['Public_Patient_ID'] = df_hist.OPTR.map(d_optr_to_public)
print(f'missing public ID: {df_hist.Public_Patient_ID.isna().sum()}')
b_is_primary = df_hist.Tumor_Type.str.contains('T').fillna(False)
df_hist_tum = df_hist[b_is_primary].copy()
print(f'duplicated primary {df_hist_tum.OPTR.duplicated().sum()}')
b_is_met = df_hist.Tumor_Type.str.contains('M').fillna(False)
df_hist_met = df_hist[b_is_met].copy()
print(f'duplicated mets {df_hist_met.OPTR.duplicated().sum()}')
#merge
# only the first five annotation columns are merged; met columns get the '_Met' suffix on collision
ls_merge_col = ls_col[0:5] + ['Public_Patient_ID']
df_patient = df_patient.merge(df_hist_tum[ls_merge_col],on='Public_Patient_ID',how='left',suffixes=('','_x'))
df_patient = df_patient.merge(df_hist_met[ls_merge_col],on='Public_Patient_ID',how='left',suffixes=('','_Met'))
#'''

In [None]:
#add recurrence censor
df_patient['Recurrence'] = np.nan
df_patient.loc[~df_patient.loc[:,'Days from Resection to Recurrence'].isna(),'Recurrence'] = 1
df_patient.loc[df_patient.No_Recurrence,'Recurrence'] = 0
print(len(df_patient))
%matplotlib inline
# add purist subtype
# primary after met
df_patient['PurIST_Subtype'] = pd.NA
df_patient.loc[(df_patient.PurIST_Met <= 0.5),'PurIST_Subtype_Met'] = 'classical'
df_patient.loc[(df_patient.PurIST_Primary <= 0.5),'PurIST_Subtype'] = 'classical'
df_patient.loc[(df_patient.PurIST_Met > 0.5),'PurIST_Subtype_Met'] = 'basal-like'
df_patient.loc[(df_patient.PurIST_Primary > 0.5),'PurIST_Subtype'] = 'basal-like'
df_patient['PurIST_Subtype'] = df_patient.PurIST_Subtype.fillna(df_patient.PurIST_Subtype_Met)
df_patient.loc[df_patient.PurIST_Subtype=='classical','Classical_Cohort'] = df_patient.loc[df_patient.PurIST_Subtype=='classical','Cohort']
print(df_patient.Classical_Cohort.value_counts())

#fix no recur and other recur - out of resected patients
df_patient.loc[:,'No_Doc_Recur'] = pd.NA #np.nan #False
df_patient.loc[df_patient.Recurrence_Sites_4!='No_Resection','No_Doc_Recur'] = False#0 #False
df_patient.loc[df_patient.Recurrence_Sites_4=='No_Doc_Recur','No_Doc_Recur'] = True#1 #True
print(df_patient.No_Recurrence.value_counts())

df_patient.loc[:,'Other_Recurrence'] = pd.NA #np.nan #False
df_patient.loc[df_patient.Recurrence_Sites_4!='No_Resection','Other_Recurrence'] = False #0#
df_patient.loc[df_patient.Recurrence_Sites_4=='Other_site','Other_Recurrence'] = True #1#
print(df_patient.Other_Recurrence.value_counts())

d_replace = {'Yes Neoadjuvant':'Yes', 'No Neoadjuvant (but Yes Chemo)':'No',
       'No Chemotherapy':'No', 'No Resection':np.nan, 'No Chemo and No Resection':np.nan}
df_patient['Neoadjuvant'] = df_patient.loc[:,'Neoadjuvant Treatment'].replace(d_replace)
print(df_patient.Neoadjuvant.value_counts())

In [None]:
#lung wedge resection, liver met resection versus survival
# Flag patients whose lung or liver metastasis was resected.
df_lung = pd.read_excel('lung cohort-wedge resections_HPI_complete-v2.xlsx',index_col=0)
b_wedge = df_lung['lung wedge resection?']=='yes'
se_lung = df_lung[b_wedge].index

df_liver =  pd.read_csv('Liverlesions_supplemental_table122123.csv',index_col=0)
b_liver_res = df_liver['Liver Met Resection']=='Yes'
se_liver = df_liver.loc[b_liver_res,'Public_Patient_ID']

# the lung sheet index is matched against df_id.OPTR and translated to public patient IDs
d_optr_to_public = dict(zip(df_id.OPTR, df_id['Biolibrary.Subject.ID']))
se_lung_pub = se_lung.map(d_optr_to_public)
b_lung_resected = df_patient.Public_Patient_ID.isin(se_lung_pub)
b_liver_resected = df_patient.Public_Patient_ID.isin(se_liver)

# 'No' for every Lung-cohort patient, then 'Yes' for the resected ones; <NA> elsewhere
df_patient['Lung_Met_Resection'] = pd.NA
df_patient.loc[df_patient.Cohort=='Lung','Lung_Met_Resection'] = 'No'
df_patient.loc[b_lung_resected,'Lung_Met_Resection'] = 'Yes'

# cohort label with resected patients split out; the liver label is written last
df_patient['Cohort_Met_Resection'] = df_patient.Cohort.copy()
df_patient.loc[b_lung_resected,'Cohort_Met_Resection'] = 'Lung_Met_Res'
df_patient.loc[b_liver_resected,'Cohort_Met_Resection'] = 'Liver_Met_Res'


In [None]:
# Final type clean-up, then write the patient table once (an existing file is never overwritten).
# NOTE(review): the file is written to the working directory, while a later cell
# reads it from 'annotation/' — confirm the file is moved there by hand.
s_out = '20231222_Patient_Metadata.csv'#'20230921_Patient_Metadata.csv'

d_stage = {'I':1,'II':2,'III':3,'IV':4,'0':0}
df_patient['Stage'] = df_patient['Stage'].replace(d_stage).astype('Int64')
df_patient['Grade'] = df_patient['Grade'].astype('Int64')
df_patient['Survival'] = df_patient['Vital Status at FU'].replace({'Alive':0,'Dead':1})
# columns excluded from the export: a fixed list, plus every column whose name
# contains '_x' or 'OPTR'
ls_drop_columns = [#'OPTR',
    'Lung Met Present in Patient blood','Liver Met Present in Patient blood',
    'Liver Met Present in Patient tumor','Lung Met Present in Patient tumor',
    'cDays from Diagnosis to FU','cDays from Earliest Recur to FU','cDays from Resection to FU',
    'cDays from Resection to Recurrence','cLiverMet','cLongTermRecurrer','cLongTermSurvivor',
    'cLungMet','cNeoadjuvant Treatment','cRapidRecurrer','cVitalStatus']
ls_drop_columns = ls_drop_columns + df_patient.columns[df_patient.columns.str.contains('_x')].tolist()
ls_drop_columns = ls_drop_columns + df_patient.columns[df_patient.columns.str.contains('OPTR')].tolist()
se_col = df_patient.columns[~df_patient.columns.isin(ls_drop_columns)]
# export order: all non-'Altered' columns first, the 'Altered' columns last
b_altered_col = se_col.str.contains('Altered')
ls_col = se_col[~b_altered_col].tolist() + se_col[b_altered_col].tolist()
if not os.path.exists(s_out):
    print('saving')
    df_patient[ls_col].to_csv(s_out)
    

In [None]:
# recurrence-site labels of the patients who were not alive 30 days after surgery
df_patient[~df_patient.Alive_30_days_post_surgery].Recurrence_Sites_4.value_counts()

In [None]:
# full rows of the patients who were not alive 30 days after surgery
df_patient[~df_patient.Alive_30_days_post_surgery]

## Primary versus Met  <a name="primet"></a>

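Plots comparing primary and metastatic specimens: expression scores (pORG, pSUB, PurIST) and gene alterations.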


[contents](#contents)


In [None]:
#61?
# all columns whose name contains 'Pur' for one patient
b_one_patient = df_patient.Public_Patient_ID == 'ST-00014524'
b_purist_col = df_patient.columns.str.contains('Pur')
df_patient.loc[b_one_patient, b_purist_col]

In [None]:
%matplotlib inline
s_out = '20231222_Patient_Metadata.csv'#'20230921_Patient_Metadata.csv'
df_patient = pd.read_csv(f'annotation/{s_out}',index_col=0)
sns.set_palette("tab10")
ls_site = [#['pORG_0.2_Primary', 'pORG_0.2_Met'],
           ['pORG_allPrimary', 'pORG_allMet'],
['pSUB_Primary', 'pSUB_allMet'],
 ['PurIST_Primary', 'PurIST_Met'] ]
for tu_site in ls_site:
    df_test = df_patient.loc[:,['Public_Patient_ID']+tu_site].dropna()
    s_value = tu_site[0].replace(f"_{tu_site[0].split('_')[-1]}",'')
    df_test.set_index('Public_Patient_ID',inplace=True)#.stack()
    df_test.columns = [item.split('_')[-1].replace('all','') for item in df_test.columns]
    df_long = df_test.stack().reset_index().rename({'level_1':'Specimen_Type',0:s_value},axis=1)
    df_long['Met_Site'] = df_long.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.RNA_DNA_Specimen_Site_Met)))
    #by site
    fig,ax=plt.subplots(figsize=(4,2.5),dpi=200)
    sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,hue='Met_Site',
                  dodge=0.1,ax=ax,alpha=0.7,linestyles='-.')
    ax.legend(bbox_to_anchor=(1,1.1),title='Met_Site')
    ax.set_title(s_value)
    ax.set_ylabel('')
    plt.tight_layout()
    # average
    '''
    fig,ax=plt.subplots(figsize=(2,2),dpi=300)
    sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,ax=ax)
    ax.set_xlabel('')
    ax.set_title(ax.get_ylabel())
    ax.set_ylabel('')
    plt.tight_layout()
    #by patient
    fig,ax=plt.subplots(figsize=(3.5,2.5),dpi=300)
    sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,hue='Public_Patient_ID',ax=ax)
    ax.set_xlabel('')
    statistic, pvalue = stats.wilcoxon(x=df_test.Primary, y=df_test.Met)
    ax.set_title(f'{ax.get_ylabel()} p={pvalue:.2}')
    ax.set_ylabel('')
    ax.legend(bbox_to_anchor=(1,1.1),title='Patient',fontsize='small')
    plt.tight_layout()
    #by cohort
    df_long['Cohort'] = df_long.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
    fig,ax=plt.subplots(figsize=(3,2),dpi=300)
    sns.pointplot(data=df_long[df_long.Cohort!='nan'],x='Specimen_Type',y=s_value,hue='Cohort',
                  ax=ax,palette=d_colorblind)
    ax.legend(bbox_to_anchor=(1,1),title='Cohort')
    ax.set_title(s_value)
    ax.set_ylabel('')
    ax.set_xlabel('')
    plt.tight_layout()
    #break'''


In [None]:
# Lung wedge resection and liver met resection versus survival:
# Kaplan-Meier plots restricted to patients alive 30 days after surgery.

s_time, s_censor = 'Days from Diagnosis to FU', 'Survival'
util.km_plot(df_patient.loc[df_patient.Alive_30_days_post_surgery], 'Lung_Met_Resection', s_time, s_censor)
util.km_plot(df_patient.loc[df_patient.Alive_30_days_post_surgery], 'Cohort_Met_Resection', s_time, s_censor)


In [None]:
#df_patient.loc[df_patient.Public_Patient_ID.isin(se_lung_pub),ls_col]

In [None]:
#df_patient.loc[df_patient.Public_Patient_ID.isin(se_liver),ls_col]

In [None]:
#plot pORG scores
# Combined score: the primary value, falling back to the met value (both 'all*'
# columns). The original assigned 'pORG_All' twice in a row; the first
# assignment (fallback to pORG_Met) was immediately overwritten and is removed,
# as is a bare column-listing expression that produced no output.
df_patient['pORG_All'] = df_patient.pORG_allPrimary.fillna(df_patient.pORG_allMet)
ls_col = ['Public_Patient_ID','pORG_allPrimary','pORG_allMet','Days from Resection to Recurrence',
          'Days from Resection to FU','Resected','Vital Status at FU']
# lookups for the met-resection patients; they render nothing here because they
# are not the last expression of the cell (append .to_csv(...) to export them)
df_patient.loc[df_patient.Public_Patient_ID.isin(se_lung_pub),ls_col]#.to_csv('Lung_meta.csv')
df_patient.loc[df_patient.Public_Patient_ID.isin(se_liver),ls_col]#.to_csv('Liver_meta.csv')

# pick up edits to util.py without restarting the kernel
importlib.reload(util)
ls_groups = ['Liver','Liver_Met_Res', 'Lung', 'Lung_Met_Res']
# one annotated strip plot per score, grouped by cohort / met-resection status
for y in ['pORG_All','pORG_Primary','pORG_Met']:
    plotting = {"data": df_patient,"x":"Cohort_Met_Resection","y":y,"order":ls_groups}
    util.annotated_stripplot(plotting,ls_groups,y)

In [None]:
# Alteration flags (primary and met columns) of nine selected patients, as nullable integers.
se_match = {
    'ST-00004898', 'ST-00006291', 'ST-00007307',
    'ST-00010984', 'ST-00014524', 'ST-00015839',
    'ST-00017804', 'ST-00017838', 'ST-00019601',
}
b_matched_pts = df_patient.Public_Patient_ID.isin(se_match)
b_alteration_cols = (df_patient.columns.str.contains('Altered_Primary')) | (df_patient.columns.str.contains('Altered_Met'))
df_match = df_patient.loc[b_matched_pts, b_alteration_cols].copy()
df_match = df_match.astype('Int64')#
df_match = df_match.replace({True:1,False:0})
# gene symbol = text before the first '_' of each column name
ls_gene = sorted({s_col.split('_')[0] for s_col in df_match.columns})
df_match['Public_Patient_ID'] = df_patient.loc[b_matched_pts,'Public_Patient_ID']

# df_match = df_match.replace(np.nan,pd.NA)

# df_match = df_match.replace(np.nan,pd.NA)

In [None]:
# Names of the per-gene alteration columns ('<GENE>_Altered_Primary',
# '<GENE>_Altered_Met', '<GENE>_Altered'). A later cell derives the gene symbols
# from the text before the first '_' of each entry.
# Note the irregular entries: BRCA1 has no '_Met' column and 'ARID1aLOF_Altered'
# has neither a '_Primary' nor a '_Met' variant.
ls_genes = ['RNF43_Altered_Primary', 'RNF43_Altered_Met', 'RNF43_Altered', 'TP53_Altered_Primary', 'TP53_Altered_Met', 
            'TP53_Altered', 'GNAS_Altered_Primary', 'GNAS_Altered_Met', 'GNAS_Altered', 'KRAS_Altered_Primary', 
            'KRAS_Altered_Met', 'KRAS_Altered', 'APOB_Altered_Primary', 'APOB_Altered_Met', 'APOB_Altered', 
            'SMAD4_Altered_Primary', 'SMAD4_Altered_Met', 'SMAD4_Altered', 'BRCA1_Altered_Primary', 'BRCA1_Altered', 
            'CDKN2A_Altered_Primary', 'CDKN2A_Altered_Met', 'CDKN2A_Altered', 'NOTCH1_Altered_Primary', 'NOTCH1_Altered_Met',
            'NOTCH1_Altered', 'BCOR_Altered_Primary', 'BCOR_Altered_Met', 'BCOR_Altered', 'STK11_Altered_Primary', 
            'STK11_Altered_Met', 'STK11_Altered', 'TGFBR2_Altered_Primary', 'TGFBR2_Altered_Met', 'TGFBR2_Altered', 
            'ARID1B_Altered_Primary', 'ARID1B_Altered_Met', 'ARID1B_Altered', 'TSC2_Altered_Primary', 'TSC2_Altered_Met', 
            'TSC2_Altered', 'ATRX_Altered_Primary', 'ATRX_Altered_Met', 'ATRX_Altered', 'KDM6A_Altered_Primary', 
            'KDM6A_Altered_Met', 'KDM6A_Altered', 'ATM_Altered_Primary', 'ATM_Altered_Met', 'ATM_Altered',
            'LRP1B_Altered_Primary', 'LRP1B_Altered_Met', 'LRP1B_Altered', 'ARID1A_Altered_Primary', 'ARID1A_Altered_Met',
            'ARID1A_Altered', 'CUX1_Altered_Primary', 'CUX1_Altered_Met', 'CUX1_Altered', 'KMT2A_Altered_Primary',
            'KMT2A_Altered_Met', 'KMT2A_Altered', 'KMT2D_Altered_Primary', 'KMT2D_Altered_Met', 'KMT2D_Altered', 
            'PBRM1_Altered_Primary', 'PBRM1_Altered_Met', 'PBRM1_Altered', 'BRCA2_Altered_Primary', 'BRCA2_Altered_Met',
            'BRCA2_Altered', 'GATA1_Altered_Primary', 'GATA1_Altered_Met', 'GATA1_Altered', 'KDM5C_Altered_Primary', 
            'KDM5C_Altered_Met', 'KDM5C_Altered', 'RBM10_Altered_Primary', 'RBM10_Altered_Met', 'RBM10_Altered',
            'SMARCB1_Altered_Primary', 'SMARCB1_Altered_Met', 'SMARCB1_Altered', 'ELF3_Altered_Primary', 'ELF3_Altered_Met', 
            'ELF3_Altered', 'MTAP_Altered_Primary', 'MTAP_Altered_Met', 'MTAP_Altered', 'BRAF_Altered_Primary',
            'BRAF_Altered_Met', 'BRAF_Altered', 'CREBBP_Altered_Primary', 'CREBBP_Altered_Met', 'CREBBP_Altered',
            'CDKN2B_Altered_Primary', 'CDKN2B_Altered_Met', 'CDKN2B_Altered', 'ARID1aLOF_Altered']

In [None]:
# genes
%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
ls_all = []
for s_gene in ls_gene:    
    tu_site = [f'{s_gene}_Altered_Primary',f'{s_gene}_Altered_Met']
    try:
        df_test = df_match.loc[:,['Public_Patient_ID']+tu_site].dropna(how='any')
        df_test.set_index('Public_Patient_ID',inplace=True)
        df_test = df_test.astype('int')
        if (df_test>0).any().any(): #df_test.mean(numeric_only=True).mean(numeric_only=True) > 0:
            ls_all = ls_all + tu_site
            s_value = tu_site[0].replace(f"_{tu_site[0].split('_')[-1]}",'')
            #.stack()
            df_test.columns = [item.split('_')[-1].replace('all','') for item in df_test.columns]
            df_long = df_test.stack().reset_index().rename({'level_1':'Specimen_Type',0:s_value},axis=1)
            df_long['Met_Site'] = df_long.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.RNA_DNA_Specimen_Site_Met)))
            # average (rosie says dont need)
            # stat, pvalue = scipy.stats.fisher_exact(pd.crosstab(df_long.Specimen_Type, df_long.loc[:,s_value]))
            # fig,ax=plt.subplots(figsize=(2,2),dpi=200)
            # sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,ax=ax,errorbar=None)
            # ax.set_xlabel('')
            # ax.set_title(f"{s_value.replace('_',' ')}\nFisher's p={pvalue:.2}",fontsize=10)
            # ax.set_ylabel(f'Fraction Pts n={df_long.Public_Patient_ID.nunique()}')
            # ax.set_ylim(-0.1,1.1)
            # plt.tight_layout()
            # fig.savefig(f'figures/Fraction_{s_value}_Primary_met_matched.png')
            # if pvalue < 1.1:
            stat, pvalue = statsmodels.sandbox.stats.runs.mcnemar(x=df_test.iloc[:,0], y=df_test.iloc[:,1], exact=True)
            #stat, pvalue = scipy.stats.wilcoxon(x=df_test.iloc[:,0], y=df_test.iloc[:,1],)
            # result = scipy.stats.binomtest(k=sum(df_test.iloc[:,0]!=df_test.iloc[:,1]),
            #                                      n=len(df_test), p=0, alternative='greater')
            # pvalue = result.pvalue
            #by patient
            fig,ax=plt.subplots(figsize=(3.4,2.2),dpi=200)
            sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,hue='Public_Patient_ID',ax=ax,alpha=0.7,
                          linestyles='-.',dodge=.4)
            ax.set_xlabel('')
            ax.set_title(f"{s_value.replace('_',' ')}\nMcNemar'sp={pvalue:.2}",fontsize=10)
            ax.set_ylabel('')
            ax.set_yticklabels('')
            ax.legend(bbox_to_anchor=(1,1.3),title='Patient',fontsize='small')
            plt.tight_layout()
            fig.savefig(f'figures/Mcnemar_primary_met_{s_gene}.png')    
        else:
            print(f'{s_gene} not altered')
    except:
        print(f'{s_gene} failed')
    #break
# alterations primary to met
df_pri = df_match.loc[:,ls_all].loc[:,::2]
df_pri.columns = [item.split('_')[0] for item in df_pri.columns]
df_met = df_match.loc[:,ls_all].iloc[:,1::2]
df_met.columns = [item.split('_')[0] for item in df_met.columns]
fig,ax=plt.subplots(dpi=300,figsize=(3,2))
se_diff = (df_pri != df_met).sum(axis=1)
se_diff.plot(kind='hist',ax=ax)
ax.set_title(f'Matched Primary vs. Met\nMean changes={se_diff.mean():.2}')
ax.set_xlabel('No. Alterations Different')

In [None]:
# genes - not matched
# Unmatched comparison: for every gene, Fisher's exact test on specimen type
# (Primary / Met) versus alteration status over all patients, one saved figure
# per gene, then Benjamini-Hochberg correction of the collected p-values.
# A bare `import statsmodels` does not load statsmodels.stats.multitest.
import statsmodels.stats.multitest
# savefig does not create missing directories
os.makedirs('figures', exist_ok=True)
ls_pvalue = []
ls_gene_correct = []
# FDR values shown in the figure titles, hard-coded per data source.
# The second assignment (tempus) overrides the first (michael).
d_fdr = {'AMER1': 0.8853937569128851,'APOB': 0.89561556844377,'ARID1A': 0.8853937569128851,'ARID1B': 0.5299424678780686,
 'ATRX': 0.89561556844377,'BCOR': 0.89561556844377,'BRCA2': 1.0,'CDKN2A': 0.5299424678780686,'CDKN2B': 0.349877752253849,
 'CUX1': 0.89561556844377,'GATA1': 0.349877752253849,'GNAS': 0.561191957075014,'IRS2': 0.561191957075014,'KDM5C': 0.5299424678780686,
 'KDM6A': 1.0,'KMT2D': 0.8183021772923266, 'KRAS': 0.89561556844377,'LRP1B': 1.0,
 'MTAP': 0.06251890221846075,'NOTCH1': 0.89561556844377,'NOTCH3': 0.8853937569128851,'RBM10': 0.561191957075014,
 'RNF43': 1.0,'SMAD4': 0.8853937569128851,'SMARCB1': 0.06392308434351372,'TGFBR2': 0.89561556844377,'TP53': 0.89561556844377,
 'TSC2': 0.8674093953492792}# michael
d_fdr= {'KDM6A': 0.8366401256596958,'RNF43': 0.8366401256596958,'TGFBR2': 0.8667385017597867,'ARID1B': 0.9101123704474682,
 'KMT2A': 0.7387547787208116,'KRAS': 1.0,'ATM': 0.8667385017597867,'GATA1': 0.11746700480125209,
 'BRAF': 0.7387547787208116,'CREBBP': 0.7387547787208116,'ATRX': 0.9101123704474682,'CUX1': 0.9246912476384586,
 'GNAS': 0.8366401256596958,'BCOR': 1.0,'SMARCB1': 0.11746700480125209,'ELF3': 0.9101123704474682,
 'MTAP': 0.07677475369566632,'APOB': 0.7387547787208116,
 'ARID1A': 0.8366401256596958, 'CDKN2A': 0.8078087242382573,'NOTCH1': 1.0,
 'KMT2D': 0.7387547787208116,'STK11': 0.7397789776431964,
 'CDKN2B': 0.5248250565713328,'SMAD4': 0.8366401256596958,'PBRM1': 0.6918495424837988,
 'TP53': 0.8366401256596958,'TSC2': 0.8366401256596958,'BRCA2': 1.0,
 'LRP1B': 0.8366401256596958,'KDM5C': 0.11746700480125209,'RBM10': 0.43577065365578377} #tempus
# sorted so that the processing order is the same on every run (set order is not)
ls_gene = sorted(set([item.split('_')[0] for item in ls_genes]) - {'BRCA1','ARID1aLOF'})
for s_gene in ls_gene:
    try:
        tu_site = [f'{s_gene}_Altered_Primary',f'{s_gene}_Altered_Met']
        # keep patients with a call in at least one specimen type; stack() drops the missing side
        df_test = df_patient.loc[:,['Public_Patient_ID']+tu_site].dropna(how='all')
        s_value = tu_site[0].replace(f"_{tu_site[0].split('_')[-1]}",'')
        df_test.set_index('Public_Patient_ID',inplace=True)#.stack()
        df_test.columns = [item.split('_')[-1].replace('all','') for item in df_test.columns]
        df_long = df_test.stack().reset_index().rename({'level_1':'Specimen_Type',0:s_value},axis=1)
        #df_long['Met_Site'] = df_long.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Specimen_Site_Met)))
        # average
        stat, pvalue = scipy.stats.fisher_exact(pd.crosstab(df_long.Specimen_Type, df_long.loc[:,s_value]))
        fig,ax=plt.subplots(figsize=(2,2),dpi=200)
        sns.pointplot(data=df_long,x='Specimen_Type',y=s_value,ax=ax,errorbar=None)
        ax.set_xlabel('')
        ax.set_title(f"{s_value.replace('_',' ')}\nFisher's p={pvalue:.2}\nFDR={d_fdr[s_gene]:.2}",fontsize=10)
        ax.set_ylabel(f'Fraction Pts n={df_long.Public_Patient_ID.nunique()}')
        ax.set_ylim(-0.1,1.1)
        plt.tight_layout()
        fig.savefig(f'figures/Fraction_{s_value}_Primary_met_all.png')
        # every figure is saved, but only those with FDR <= 0.2 stay open for inline display
        if d_fdr[s_gene] > 0.2:
            plt.close(fig)
        ls_gene_correct.append(s_gene)
        ls_pvalue.append(pvalue)
    except Exception as err:
        # best effort: skipped genes are left out of the FDR correction; show why
        print(f'skipped {s_gene}: {err}')
    #break
# #correct pvalues
# gene -> FDR-corrected p-value; empty when every gene was skipped
if ls_pvalue:
    ar_rejected, corrected = statsmodels.stats.multitest.fdrcorrection(np.array(ls_pvalue))
    d_corrected = dict(zip(ls_gene_correct,corrected))
else:
    d_corrected = {}
d_corrected

In [None]:
#my gene level analysis
# For each score (pORG, pSUB, PurIST: lowest vs. highest quarter of patients; Cohort: its
# own categories) test every gene's altered status with Fisher's exact test, save a point
# plot for genes with p < cutoff, and collect raw p-values (d_all_p) and FDR-corrected
# values (d_all) keyed by score column.

# FDR values used only to annotate figure titles. The entries are formatted like the
# output of the print loop in the following cell -- presumably pasted from an earlier run.
d_fdr = {'pORG_Primary':{'KRAS' : 0.38287593347963944,
'GATA1' : 0.1855045586524062,
'CDKN2A' : 0.06609676055955013,
'TP53' : 1.038674992908113e-09,
'KDM5C' : 0.4251464077159879,},
        'pSUB_Primary':{'TSC2' : 1.0,},'Cohort':{'ATM' : 0.8990812819973347,
'GATA1' : 0.8990812819973347,},
        'PurIST_Primary':{'TGFBR2' : 0.284384813956165,
'ATM' : 0.284384813956165,
'GNAS' : 0.284384813956165,}}

df_patient['PurIST_All'] = df_patient.loc[:,'PurIST_Primary'].fillna(df_patient.loc[:,'PurIST_Met'])
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    d_all = {}
    d_all_p = {}
    cutoff = 0.1
    for s_gsva_name in ['pORG','Cohort','pSUB','PurIST']:
        print(s_gsva_name)
        for s_type in ['Primary',]: #'All',
            # resolve the df_patient column holding this score
            if s_gsva_name == 'Cohort':
                s_gsva = 'Cohort'
            elif s_type == 'All' and s_gsva_name !='PurIST':
                ls = df_patient.columns[(df_patient.columns.str.contains(s_gsva_name)) & (df_patient.columns.str.contains(f'_all'))]
                s_gsva = f"{ls[0].split('_all')[0]}_All"
                df_patient[s_gsva] = df_patient.loc[:,ls[0]].fillna(df_patient.loc[:,ls[1]])
            else:
                s_gsva = df_patient.columns[(df_patient.columns.str.contains(s_gsva_name)) & (df_patient.columns.str.contains(f'_{s_type}'))][0]
            b_rna = df_patient.loc[:,s_gsva].notna()
            ls_pvalue = []
            ls_gene_correct = []
            for s_col in ls_gene:
                s_gene = f'{s_col}_Altered'
                # genes without an '<gene>_Altered' column are not tested
                if s_gene not in df_patient.columns:
                    continue
                b_dna = df_patient.loc[:,s_gene].notna()
                if not s_gsva == 'Cohort':
                    df_plot = df_patient.loc[b_rna & b_dna,['Public_Patient_ID',s_gene,s_gsva]].copy()
                    # lowest and highest n//4 patients by score; everyone in between is dropped
                    # NOTE(review): with fewer than 4 patients i_quart is 0 and the [-0::] slice
                    # below selects every row -- confirm this case cannot occur
                    i_quart = df_plot.loc[:,s_gsva].notna().sum()//4
                    se_low = df_plot.loc[:,['Public_Patient_ID',s_gsva]].sort_values(by=s_gsva)[0:i_quart].Public_Patient_ID
                    se_high = df_plot.loc[:,['Public_Patient_ID',s_gsva]].sort_values(by=s_gsva)[-i_quart::].Public_Patient_ID
                    #df_plot['quartiles'] = pd.qcut(df_plot.loc[:,s_gsva],q=4,labels=['low','medX','med','high'])
                    #df_long = df_plot.loc[~df_plot.quartiles.str.contains('med'),[s_gene,'quartiles']]
                    df_plot['quartiles'] = pd.NA
                    df_plot.loc[df_plot.Public_Patient_ID.isin(se_low),'quartiles'] = 'low'
                    df_plot.loc[df_plot.Public_Patient_ID.isin(se_high),'quartiles'] = 'high'
                    #print(df_plot.quartiles.value_counts())
                    df_long = df_plot.dropna()
                    try:
                        stat, pvalue = scipy.stats.fisher_exact(pd.crosstab(df_long.quartiles, df_long.loc[:,s_gene]))
                    except Exception as e:
                        # the test failed on this table; the gene is kept with p=1
                        print(f'error {s_gene} vs {s_gsva}: {e!r}')
                        pvalue = 1
                else:
                    df_long = df_patient.loc[b_rna & b_dna,[s_gene,s_gsva]].copy()
                    try:
                        stat, pvalue = scipy.stats.fisher_exact(pd.crosstab(df_long.Cohort, df_long.loc[:,s_gene]))
                    except Exception:
                        pvalue = 1
                ls_gene_correct.append(s_col)
                ls_pvalue.append(pvalue)
                #plot
                fig,ax=plt.subplots(figsize=(2,2),dpi=200)
                if not s_gsva == 'Cohort':
                    sns.pointplot(data=df_long,x='quartiles',y=s_gene,ax=ax,errorbar=None,order=['low','high'])
                else:
                    sns.pointplot(data=df_long,x='Cohort',y=s_gene,ax=ax,errorbar=None,order=['Lung','Liver'])
                ax.set_xlabel(s_gsva)
                ax.set_title(f"{s_col} {s_type}\nFisher's exact\np={pvalue:.2}",fontsize=10)
                ax.set_ylabel(f'Fraction Pts n={len(df_long)}')
                ax.set_ylim(-0.1,1.1)
                plt.tight_layout()
                if pvalue < cutoff:
                    # d_fdr is a hard-coded snapshot: fall back to NaN (shown as 'nan') so a
                    # gene that is not in it does not abort the whole run with a KeyError
                    fdr_gene = d_fdr.get(s_gsva, {}).get(s_col, np.nan)
                    ax.set_title(f"{s_col} {s_type}\nFisher's p={pvalue:.2}\nFDR={fdr_gene:.2}",fontsize=10)
                    fig.savefig(f'figures/Fraction_{s_col}_{s_type}_{s_gsva}.png')
                else:
                    plt.close(fig)
            #correct pvalues (Benjamini-Hochberg) across all genes tested for this score
            __, corrected = statsmodels.stats.multitest.fdrcorrection(np.array(ls_pvalue))
            d_pval = dict(zip(ls_gene_correct,ls_pvalue))
            d_all_p.update({f'{s_gsva}':d_pval})
            d_qval = dict(zip(ls_gene_correct,corrected))
            d_all.update({f'{s_gsva}':d_qval})

In [None]:
# Hard-coded FDR values per score column and gene (same values as in the analysis cell above).
d_fdr = {
    'pORG_Primary': {
        'KRAS': 0.38287593347963944,
        'GATA1': 0.1855045586524062,
        'CDKN2A': 0.06609676055955013,
        'TP53': 1.038674992908113e-09,
        'KDM5C': 0.4251464077159879,
    },
    'pSUB_Primary': {'TSC2': 1.0},
    'Cohort': {
        'ATM': 0.8990812819973347,
        'GATA1': 0.8990812819973347,
    },
    'PurIST_Primary': {
        'TGFBR2': 0.284384813956165,
        'ATM': 0.284384813956165,
        'GNAS': 0.284384813956165,
    },
}
# For every score, print the FDR-corrected value (from d_all) of each gene whose raw
# p-value (from d_all_p) is below 0.1, formatted as dict-literal entries.
for big_key, big_item in d_all_p.items():
    print(big_key)
    for key, item in big_item.items():
        if not item < 0.1:
            continue
        print(f"'{key}' : {d_all[big_key][key]},")
            

In [None]:
#load carl's gene level analysis
# Reads externally computed per-gene q-value tables (one Excel file per sample set) and,
# for genes with q < cutoff, re-tests altered status in the lowest vs. highest score
# quartile with Fisher's exact test and saves a point plot.
# NOTE(review): the four `break` statements at the end of the loops are active, so only
# the first qVal column of the first file is processed, and within it the gene loop stops
# after the first gene whose column exists -- looks like leftover debugging; confirm.
ls_pvalue = []
ls_gene_correct = []
cutoff = 0.1
ls_file = [ 'SigGenesBySelectCohorts_AllSamples_MH_MinAltered10.xlsx',
 'SigGenesBySelectCohorts_MetSamples_MH_MinAltered10.xlsx',
 'SigGenesBySelectCohorts_PrimarySamples_MH_MinAltered10.xlsx',]
for s_file in ls_file:
    # sample set taken from the file name: 'All', 'Met' or 'Primary'
    s_type = s_file.split('_')[1].split('Samples')[0]
    df = pd.read_excel(f'data/{s_file}',index_col=0)
    ls_col = df.columns[df.columns.str.contains('qVal')]
    for s_col in ls_col:
        # rows (index values are used as gene names below) with q-value under the cutoff
        df_sig = df.loc[df.loc[:,s_col] < cutoff,s_col]
        if len(df_sig) > 0:
            print(s_col)
            # score name is the part of the column name before the first underscore
            s_score = s_col.split('_')[0]
            for (index, value) in df_sig.items():
                print(index)
                s_gene = f'{index}_Altered_{s_type}'
                if s_type == 'All':
                    # build '<score>_All' from the first two columns matching the score and '_all',
                    # the first filled with the second
                    ls = df_patient.columns[(df_patient.columns.str.contains(s_score)) & (df_patient.columns.str.contains(f'_all'))]
                    s_gsva = f"{ls[0].split('_all')[0]}_All"
                    df_patient[s_gsva] = df_patient.loc[:,ls[0]].fillna(df_patient.loc[:,ls[1]])
                    s_gene = f'{index}_Altered'
                else:
                    s_gsva = df_patient.columns[(df_patient.columns.str.contains(s_score)) & (df_patient.columns.str.contains(f'_{s_type}'))][0]
                b_rna = df_patient.loc[:,s_gsva].notna()
                # genes without a matching column in df_patient are skipped
                try:
                    b_dna = df_patient.loc[:,s_gene].notna()
                except:
                    continue
                df_plot = df_patient.loc[b_rna & b_dna,[s_gene,s_gsva]].copy()
                # keep only the lowest and highest score quartile
                df_plot['quartiles'] = pd.qcut(df_plot.loc[:,s_gsva],q=4,labels=['low','medX','med','high'])
                df_long = df_plot.loc[~df_plot.quartiles.str.contains('med'),[s_gene,'quartiles']]
                # average
                stat, pvalue = scipy.stats.fisher_exact(pd.crosstab(df_long.quartiles, df_long.loc[:,s_gene]))
                ls_gene_correct.append(s_gene)
                ls_pvalue.append(pvalue)
                #plot
                fig,ax=plt.subplots(figsize=(2,2),dpi=200)
                sns.pointplot(data=df_long,x='quartiles',y=s_gene,ax=ax,errorbar=None,order=['low','high'])
                ax.set_xlabel(s_score)
                #ax.set_title(f"{index} {s_type}\nFisher's exact\nFDR={value:.2}",fontsize=10)
                ax.set_ylabel(f'Fraction Pts n={len(df_long)}')
                ax.set_ylim(-0.1,1.1)
                plt.tight_layout()
                fig.savefig(f'figures/Fraction_{index}_{s_type}_{s_score}.png')
                break
            break
        break
    break
#correct pvalues
# FDR correction over the p-values collected above; the dict is the cell's displayed output
__, corrected = statsmodels.stats.multitest.fdrcorrection(np.array(ls_pvalue))
dict(zip(ls_gene_correct,corrected))

In [None]:
# TCR metrics table, left-joined onto the patient metadata by public patient ID.
df_merge = pd.read_csv('annotation/20231215_Patient_Metadata_TCR_Metrics.csv', index_col=0)
# rows of df_merge with a blood or tumor clonality value
b_tcr_pts = df_merge.loc[:, ['Clonality_Blood', 'Clonality_Tumor']].notna().any(axis=1)
df = pd.merge(df_patient, df_merge, how='left', on='Public_Patient_ID')

In [None]:
# Venn diagram of which patients have each data type (CT imaging, RNA, DNA, TCR tumor, TCR blood).
# NOTE(review): no active `import venn` is visible in this part of the notebook (only in a
# commented-out cell below) -- confirm it is imported earlier.
# NOTE(review): the two TCR masks are built from the merged frame `df` but used to index
# df_patient -- this assumes both frames share row labels; verify.
df_patient['PurIST_All'] = df_patient['PurIST_Primary'].fillna(df_patient['PurIST_Met'])
d_pat = {
    'CT': (df_patient.Cohort.notna()) | (df_patient.Recurrence_Sites_4 == 'Other_site'),
    'RNA': df_patient.PurIST_All.notna(),
    'DNA': df_patient.BRAF_Altered.notna(),
    'TCRTumor': df.Productive_Rearrangements.notna(),
    'TCRBlood': df.loc[:, "Simpson's Evenness blood"].notna(),
}
s_col = 'Public_Patient_ID'
d_sets = {}
for s_key, b_key in d_pat.items():
    print(s_key)
    print(b_key.sum())
    d_sets[s_key] = df_patient.loc[b_key, ['Public_Patient_ID']]
# venn2 ... venn5 are chosen by the number of sets; any other count draws nothing
if len(d_pat) in (2, 3, 4, 5):
    labels = venn.get_labels([set(item.loc[:, s_col]) for item in d_sets.values()])
    fig, ax = getattr(venn, f'venn{len(d_pat)}')(labels, names=list(d_sets))#,ax=ax


In [None]:
# df_patient.loc[:,df_patient.columns.str.contains('pORG')].notna().sum()
# 215 + 73

In [None]:
# ## overlap in omics data samples OLD - wrong
# #the samples overlap/don't overlap in assays
# # all samples 

# from matplotlib_venn import venn3, venn3_circles
# from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
# import venn
# d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

# print(d_ids.keys())
# d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
#           'Biolibrary.Subject.ID':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
#           #'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
#        }

# for s_col ,ls_keys in d_keys.items():
#     d_sets = {}
#     for s_key in ls_keys:
#             print(s_key)
#             print(len(d_ids[s_key]))
#             d_sets.update({s_key:d_ids[s_key]})
#             print(d_ids[s_key].loc[:,s_col].nunique())
#     #add all CT imaging patients
#     b_ct = ((df_patient.Cohort.notna()) | (df_patient.Recurrence_Sites_4 == 'Other_site'))
#     df_all = df_patient.loc[b_ct,['Public_Patient_ID']].rename({'Public_Patient_ID':'Biolibrary.Subject.ID'},axis=1)
#     d_sets.update({'CT Imaging':df_all})
#     ls_keys.append('CT Imaging')
#     #plot
#     if len(ls_keys) == 4:
#         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
#         fig,ax = venn.venn4(labels, names=[key.split('Key')[0] for key, item in d_sets.items()])#,ax=ax
#     elif len(ls_keys) == 5:
#         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
#         fig,ax = venn.venn5(labels, names=[key.split('Key')[0] for key, item in d_sets.items()])#,ax=ax
#     elif len(ls_keys) == 3:
#         fig,ax = plt.subplots(figsize=(3,3),dpi=300)
#         venn3([set(item.loc[:,s_col]) for key, item in d_sets.items()], [key.split('Key')[0] for key, item in d_sets.items()],ax=ax)

# # #primaries

# # from matplotlib_venn import venn3, venn3_circles
# # from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
# # from pyvenn import venn
# # d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}annotation/Simplified_Public_IDs_Key.xlsx',sheet_name=None)

# # print(d_ids.keys())
# # d_keys = {#'OPTR.Specimen.ID':['RnaSeqKey','TcrTumorKey','DnaPanelKey'], #'',
# #           #'OPTR':['RnaSeqKey','TcrTumorKey','TcrBloodKey','DnaPanelKey'],
# #           'OPTR':['RnaSeqKey','TcrTumorKey','DnaPanelKey'],
# #        }

# # for s_col ,ls_keys in d_keys.items():
# #     d_sets = {}
# #     for s_key in ls_keys:
# #             print(s_key)
# #             print(len(d_ids[s_key]))
# #             df_set = d_ids[s_key][~d_ids[s_key].loc[:,'OPTR.Specimen.ID'].str.contains('-M')]
# #             d_sets.update({s_key:df_set})
# #     #plot
# #     if len(ls_keys) == 4:
# #         labels = venn.get_labels([set(item.loc[:,s_col]) for key, item in d_sets.items()])
# #         fig,ax = venn.venn4(labels, names=[key.split('Key')[0] for key, item in d_sets.items()],ax=ax)
# #     elif len(ls_keys) == 3:
# #         fig,ax = plt.subplots(figsize=(3,3),dpi=300)
# #         venn3([set(item.loc[:,s_col]) for key, item in d_sets.items()], [key.split('Key')[0] for key, item in d_sets.items()],ax=ax)
# #'''

# Survival Analysis - patients in paper <a name="clin"></a>

Kaplan-Meier (KM) survival curves and Cox proportional hazards (CPH) models.


[contents](#contents)



In [None]:
# Reload the patient metadata table from disk (first CSV column becomes the index).
# This rebinds df_patient, so in-memory edits made by earlier cells are discarded.
s_out = 'annotation/20231221_Patient_Metadata.csv'#'annotation/20230921_Patient_Metadata.csv'
df_patient = pd.read_csv(s_out,index_col=0)
# #add recurrence censor
# df_patient['Recurrence'] = np.nan
# df_patient.loc[~df_patient.loc[:,'Days from Resection to Recurrence'].isna(),'Recurrence'] = 1
# df_patient.loc[df_patient.No_Recurrence,'Recurrence'] = 0
# print(len(df_patient))
# %matplotlib inline
# # add purist subtype
# df_patient.loc[(df_patient.PurIST_Primary <= 0.5),'PurIST_Subtype'] = 'classical'
# df_patient.loc[(df_patient.PurIST_Met <= 0.5),'PurIST_Subtype_Met'] = 'classical'
# df_patient.loc[(df_patient.PurIST_Met <= 0.5),'PurIST_Subtype'] = 'classical'
# df_patient.loc[(df_patient.PurIST_Primary > 0.5),'PurIST_Subtype'] = 'basal-like'
# df_patient.loc[(df_patient.PurIST_Met > 0.5),'PurIST_Subtype_Met'] = 'basal-like'
# df_patient.loc[(df_patient.PurIST_Met > 0.5),'PurIST_Subtype'] = 'basal-like'
# #df_patient['PurIST_Subtype'] = df_patient.PurIST_Subtype.fillna(df_patient.PurIST_Subtype_Met)
# df_patient.loc[df_patient.PurIST_Subtype=='classical','Classical_Cohort'] = df_patient.loc[df_patient.PurIST_Subtype=='classical','Cohort']
# print(df_patient.Classical_Cohort.value_counts())

# #fix no recur and other recur - out of resected patients
# df_patient.loc[:,'No_Doc_Recur'] = pd.NA #np.nan #False
# df_patient.loc[df_patient.Recurrence_Sites_4!='No_Resection','No_Doc_Recur'] = False#0 #False
# df_patient.loc[df_patient.Recurrence_Sites_4=='No_Doc_Recur','No_Doc_Recur'] = True#1 #True
# print(df_patient.No_Recurrence.value_counts())

# df_patient.loc[:,'Other_Recurrence'] = pd.NA #np.nan #False
# df_patient.loc[df_patient.Recurrence_Sites_4!='No_Resection','Other_Recurrence'] = False #0#
# df_patient.loc[df_patient.Recurrence_Sites_4=='Other_site','Other_Recurrence'] = True #1#
# print(df_patient.Other_Recurrence.value_counts())

In [None]:
# number of non-missing values in each of the first 50 columns
df_patient.notna().sum().iloc[:50]

In [None]:
# high and low pORG
# Adds '<score>_quartiles' columns (lowest quartile 'low', highest 'high', middle two NaN),
# '<score> Primary' high/low calls from fixed cutoffs, and 'Cohort2'.
#'pORG_0.2_Met','pORG_0.2_allPrimary', 'pORG_0.2_allMet',
for s_col in ['pORG_Primary','pSUB_Primary','PurIST_Primary','pORG_Met','pSUB_Met','pORG_allPrimary', 'pORG_allMet']:
    test = pd.qcut(df_patient.loc[:,s_col],q=4,labels=['low','med','medx','high'])
    df_patient[f'{s_col}_quartiles'] = test.replace({'med':np.nan,'medx':np.nan})
d_cutoff = {'pORG_Primary':0.0249,'pSUB_Primary':-0.318,'PurIST_Primary':0.013805}
for s_col, i_cut in d_cutoff.items():
    s_label = f'{s_col.split("_")[0]} Primary'
    # mutually exclusive masks: a score exactly at the cutoff is called 'low', and the
    # result no longer depends on the order of the two assignments
    df_patient.loc[df_patient.loc[:,s_col] > i_cut,s_label] = 'high'
    df_patient.loc[df_patient.loc[:,s_col] <= i_cut,s_label] = 'low'
    # assign the result back: an inplace replace on a .loc selection does not reliably
    # reach df_patient (it never does under pandas copy-on-write)
    df_patient[s_label] = df_patient[s_label].replace('nan',np.nan)
    print(df_patient[s_label].value_counts())
#df_patient.rename({'pSUB Primary':'pSUB Primary'},axis=1,inplace=True)
# turn any literal 'nan' strings in the whole table into real missing values
df_patient.replace('nan',np.nan,inplace=True)
#liver and not lung
# Cohort2 = Cohort, with Liver-cohort patients who also have a lung met set to missing
b_liv_and_lung = (df_patient.Cohort=='Liver') & (df_patient.loc[:,'Lung Met Present']=='YES')
df_patient['Cohort2'] = df_patient['Cohort']
df_patient.loc[b_liv_and_lung,'Cohort2'] = np.nan

# #lung ST-00020218
# # df_patient.loc[df_patient.Public_Patient_ID=='ST-00020218','Recurrence_Sites_4'] = 'Lung'
# # #liver ST-00024980
# # df_patient.loc[df_patient.Public_Patient_ID=='ST-00024980','Recurrence_Sites_4'] = 'Liver'
# # 4991 = ST-00021102 was accidentally duplicate of 4101 (ST-00007303)
# print(df_patient[(df_patient.PurIST_Primary <= 0.5) & (df_patient.Cohort.notna())].Cohort.value_counts()) #56
# print(df_patient[(df_patient.PurIST_Met <= 0.5) & (df_patient.Cohort.notna())].Cohort.value_counts()) #31
# print(df_patient[(df_patient.PurIST_Subtype == 'classical') & (df_patient.Cohort.notna()) & (df_patient.Alive_30_days_post_surgery)].Cohort.value_counts())

In [None]:
#low = good
# Kaplan-Meier curves (drawn by util.km_plot) for each time/censor pair, restricted to
# patients flagged Alive_30_days_post_surgery with complete data for the grouping column.
%matplotlib inline
importlib.reload(util)
# (expression result is discarded: it is not the last statement of the cell)
df_patient.columns[df_patient.columns.str.contains('_Youden')]
ls_km = ['Recurrence_Sites_4','Cohort2',
         'Classical_Cohort',
         'PurIST_Subtype',#'pORG Primary', 'pSUB Primary','PurIST Primary',
       'Cohort',
               'Other_Recurrence', 'No_Doc_Recur',
            'Lung_Cohort','Liver_Cohort',
        ]

pal_porg_r = ('#E69F00','#56B4E9')
tu_time_censor = (('Days from Diagnosis to FU','Survival'),
                  ('Days from Resection to Recurrence', 'Recurrence')
                 )
for (s_time, s_censor) in tu_time_censor:
    for s_km in ls_km:
        # two-colour palette for every grouping except Recurrence_Sites_4, which uses tab10
        if not s_km == 'Recurrence_Sites_4':
            sns.set_palette(pal_porg_r)
        else:
            sns.set_palette('tab10')
        df_km = df_patient.loc[df_patient.Alive_30_days_post_surgery,['Public_Patient_ID',s_km,s_time,s_censor]].dropna().copy()
        df_km[s_km] = df_km[s_km].astype('str')
        fig,ax,ls_order = util.km_plot(df_km,s_km,s_time,s_censor)
        ax.set_ylabel(f'Fraction Pts.')
        ax.set_xlabel(f'{s_time}')
        # NOTE(review): this break is active, so only the first entry of ls_km
        # ('Recurrence_Sites_4') is plotted per time/censor pair -- confirm this is intended
        break
    #break
    

In [None]:
# import pandas as pd
# import sys
# import os


# with pd.ExcelWriter('Supplemental_Table_2.xlsx') as writer:
#     for csvfilename in sorted(os.listdir('results/CPH')):
#         df = pd.read_csv(f'results/CPH/{csvfilename}',index_col=0)
#         sheet_name = csvfilename.split("results_")[1][:31]
#         df.to_excel(writer, sheet_name=sheet_name)

In [None]:
#clinical covariates
# Chi-squared test of independence between each clinical variable and each cohort
# column, shown as a heatmap of the fraction of patients within each cohort category.

#Lung Cohort TCR versus overall vs. Clinical Variables (chi squared)
ls_clin = ['Desmoplasia (Y/N)','Desmoplasia (Y/N)_Met',#'ALI (Y/N)',#'ALI (Y/N)_Met','PNI','PNI_Met',
    #'Resected','Grade',
           # 'Age','Sex',
           'Neoadjuvant',#
           # 'Stage','LV_Invasion','LN_Pos',
         #  'TLS', 'Inflammation', 'Acute Inflammation', 'Chronic Inflammation','Plasmacytoid Inflammation',
     #'TLS_Met', 'Inflammation_Met', 'Acute Inflammation_Met', 'Chronic Inflammation_Met','Plasmacytoid Inflammation_Met',
          ]
ls_cohort = ['Cohort',
             #'Lung_Cohort',#'No_Recurrence',
             #'Liver_Cohort',#'Other_Recurrence',
             ]

for s_clin in ls_clin:
    for s_cohort in ls_cohort:
        crosstab = pd.crosstab(df_patient.loc[:,s_clin],df_patient.loc[:,s_cohort])
        statistic,pvalue, dof, expected_freq = stats.chi2_contingency(crosstab)
        # 1.1 lets every valid p-value through; lower it to plot only significant tables
        if pvalue < 1.1:
            fig, ax = plt.subplots(figsize=(2,2),dpi=300)
            #sns.heatmap(crosstab - expected_freq,ax=ax,annot=True,cbar_kws={'label':'Obs - Exp'})
            # each column is divided by its own total, so the cells are fractions (0-1),
            # not percentages -- the colour bar is labelled accordingly
            sns.heatmap(crosstab/crosstab.sum(),ax=ax,annot=True,cbar_kws={'label':'Fraction Pts.'})
            ax.set_title(f'{s_clin}\nvs. {s_cohort} p={pvalue:.3}')

In [None]:
#effect of tumor cellularity
# Strip plot plus mean line of DNA-derived tumor cellularity in the 'low' vs. 'high' group
# of each score, with a two-sample t-test p-value in the title.
# other groupings tried: 'PurIST_Subtype':['classical','basal-like'], 'Cohort':['Lung','Liver'],
# 'pORG_Primary_quartiles', 'pSUB_Primary_quartiles' (each ['low','high'])
d_group = {
    'pSUB_Met_quartiles': ['low', 'high'],
    'pORG_Met_quartiles': ['low', 'high'],
}
ls_col = ['Tumor_Cellularity_by_DNA_Met', 'Tumor_Cellularity_by_DNA_Primary']
sns.set_palette('tab10')
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    for s_col in ls_col:
        for s_group in d_group:
            order = d_group[s_group]
            df = df_patient[[s_col, s_group]].dropna()
            fig, ax = plt.subplots(figsize=(3,2), dpi=300)
            sns.stripplot(data=df, x=s_group, y=s_col, ax=ax, s=3, alpha=0.8, order=order)
            # box plot with everything hidden except the mean line
            sns.boxplot(data=df, x=s_group, y=s_col, ax=ax, order=order,
                        showmeans=True, meanline=True,
                        meanprops={'color': 'k', 'ls': '-', 'lw': 2},
                        showbox=False, showcaps=False, showfliers=False,
                        medianprops={'visible': False},
                        whiskerprops={'visible': False})
            # values of the first and second group named in `order`
            a, b = [df.loc[df[s_group] == s_level, s_col].values for s_level in order[:2]]
            stat, pvalue = stats.ttest_ind(a, b)
            ax.set_title(f'{s_col.replace("_"," ")}\nvs {s_group.replace("_"," ")}\np={pvalue:.3}')

In [None]:
# # matches kevin, 58 liver metastasis 18 lung metastasis 83 no documented recurrence 
# #53 other recurrence site 6 no resection
# print(df_patient[~df_patient.PurIST_Primary.isna()].Recurrence_Sites_4.value_counts())
# print(df_patient[~df_patient.PurIST_Primary_T2.isna()].Recurrence_Sites_4.value_counts())
# 58 + 18 + 83 + 53 + 6
# print(df_patient[~df_patient.loc[:,'pORG_0.2_Met'].isna()].Recurrence_Sites_4.value_counts().sum())

In [None]:
# record the interpreter version used for this run (sys is imported in the first cell)
print(sys.version)

In [None]:
#CPH single variable
# Univariable Cox proportional hazards models (fit and plotted by util.cph_plot) for each
# time/censor pair: first one model per covariate in ls_vital, then one model per pair of
# recurrence sites. Hazard ratios, 95% CI, p and n are written to one CSV per censor.

tu_time_censor = (('Days from Diagnosis to FU','Survival'),
                  ('Days from Resection to Recurrence', 'Recurrence'))
for (s_time, s_censor) in tu_time_censor:
    print(s_censor)
    ls_result = []  # one summary frame per fitted model, concatenated once at the end
    se_recur = {'Other_Recurrence', 'No_Doc_Recur','Lung_Cohort','Liver_Cohort'} #don't compare to the 74 with no resection
    # each covariate listed once: a repeated entry would be fitted twice and show up as
    # duplicate rows in the results CSV
    ls_vital = ['Cohort','pORG_Primary','pORG_Met','Neoadjuvant','Age','Sex','Stage', 'Grade','LV_Invasion','LN_Pos', #'Age at Diagnosis',
                'Other_Recurrence', 'No_Doc_Recur',
                'PurIST_Subtype',
                'Classical_Cohort',
                'Lung_Cohort','Liver_Cohort',
                'pSUB_Primary', 'pSUB_Met',
                'PurIST_Primary', 'PurIST_Met']
    for s_vital in ls_vital:
        print(s_vital)
        if s_vital == 'Neoadjuvant':
            # Neoadjuvant is additionally restricted to rows flagged Resected
            df = df_patient.loc[(df_patient.Alive_30_days_post_surgery) &(df_patient.Resected),[s_vital,s_time,s_censor]].copy()
        else:
            df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_vital,s_time,s_censor]].copy()#.dropna(how='any')
        if s_vital in se_recur:
            # blank out unresected patients so the dropna below removes them
            df.loc[df_patient.Recurrence_Sites_4=='No_Resection',s_vital] = np.nan
        df = df.dropna()
        print(len(df))
        if 'Stage' in df.columns:
            df.Stage = df.Stage.replace({'I':1,'II':2,'III':3,'IV':4}).astype('int')
        if 'Grade' in df.columns:
            df.Grade = df.Grade.astype('int')
        if df.loc[:,s_vital].dtype=='O':
            # string covariate -> indicator column(s), first level dropped as reference
            df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
            if df_dummy.shape[1]>0:
                df.drop(s_vital,axis=1,inplace=True)
                df[s_vital] = df_dummy
            else:
                # only one level left after filtering: nothing to compare
                continue
        try:
            fig, cph = util.cph_plot(df,s_vital,s_time,s_censor,figsize=(3.8,1.5))
            pvalue = cph.summary.p[s_vital]
            plt.tight_layout()
            fig.savefig(f'figures/CPH_single_{s_vital}_{pvalue:.2}_{s_censor}.png')
            #if pvalue > 0.09:
            plt.close(fig)
            df_cph = cph.summary.loc[:,['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']]
            df_cph['n'] = len(df)
            ls_result.append(df_cph)
        except Exception as e:
            # keep going with the next covariate, but say which model failed and why
            print(f'cph error {s_vital}: {e!r}')
    #all the recurrence combos
    s_vital = 'Recurrence_Sites_4'
    for tu_combo in combinations(["Lung","Liver",'Other_site','No_Doc_Recur','No_Resection'],r=2):#combinations(df_patient.Recurrence_Sites_4.unique(),r=2):
        print(tu_combo)
        df = df_patient.loc[(df_patient.Alive_30_days_post_surgery) &(df_patient.Resected),[s_vital,s_time,s_censor]]#.dropna(how='any')
        # df['Recurrence_Sites_4'] = df['Recurrence_Sites_4'].replace({'Liver':'liver'})
        print(len(df))
        df = df[df.loc[:,s_vital].isin(tu_combo)].dropna()
        #order them
        df.Recurrence_Sites_4 = df.Recurrence_Sites_4.astype('category')
        df.Recurrence_Sites_4 = df.Recurrence_Sites_4.cat.set_categories(list(tu_combo))
        df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=False)
        df.drop(s_vital,axis=1,inplace=True)
        s_compare = " to ".join((tu_combo))
        # indicator of the first site of the pair; the second site is the reference
        df[s_compare] = df_dummy.iloc[:,0]
        print(len(df))
        try:
            fig, cph = util.cph_plot(df,s_compare,s_time,s_censor,figsize=(4,1.5))
            plt.tight_layout()
            fig.savefig(f'figures/CPH_single_{s_vital}_{s_compare}_{s_censor}.png')
            plt.close(fig)
            df_cph = cph.summary.loc[:,['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']]
            df_cph['n'] = len(df)
            ls_result.append(df_cph)
        except Exception as e:
            print(f'cph error {s_compare}: {e!r}')
    #save results
    df_result = pd.concat(ls_result) if ls_result else pd.DataFrame()
    df_result.to_csv(f'results/results_single_CPH_{s_censor}.csv')
    # median of the time column per recurrence site
    df = df_patient.loc[(df_patient.Alive_30_days_post_surgery) &(df_patient.Resected),[s_vital,s_time,s_censor]]
    for s_recur in df.Recurrence_Sites_4.unique():
        print(f'{s_recur}: Median {s_censor} = {df.loc[df.Recurrence_Sites_4==s_recur,s_time].median()}')


In [None]:
#GENES
# Univariable Cox proportional hazards model (fit and plotted by util.cph_plot) for every
# column listed in ls_genes, for each time/censor pair; results go to one CSV per censor.
ls_vital = ls_genes

#CPH single variable

tu_time_censor = (('Days from Diagnosis to FU','Survival'),
                  ('Days from Resection to Recurrence', 'Recurrence'))
for (s_time, s_censor) in tu_time_censor:
    print(s_censor)
    ls_result = []  # one summary frame per fitted model, concatenated once at the end
    # not read in this cell; the multivariable cell below reads se_recur without defining it
    se_recur = {'Other_Recurrence', 'No_Recurrence','Lung_Cohort','Liver_Cohort','PurIST_All','pORG_All','pSUB_All',}
    for s_vital in ls_vital:
        print(s_vital)
        df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_vital,s_time,s_censor]].copy()#.dropna(how='any')
        df = df.dropna()
        print(len(df))
        if df.loc[:,s_vital].dtype=='O':
            # string covariate -> indicator column(s), first level dropped as reference
            df_dummy = pd.get_dummies(df.loc[:,[s_vital]],drop_first=True)
            if df_dummy.shape[1]>0:
                df.drop(s_vital,axis=1,inplace=True)
                df[s_vital] = df_dummy
            else:
                # only one level left after filtering: nothing to compare
                continue
        try:
            fig, cph = util.cph_plot(df,s_vital,s_time,s_censor,figsize=(3.8,1.5))
            pvalue = cph.summary.p[s_vital]
            plt.tight_layout()
            fig.savefig(f'figures/CPH_single_{s_vital}_{pvalue:.2}_{s_censor}.png')
            #if pvalue > 0.09:
            plt.close(fig)
            df_cph = cph.summary.loc[:,['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']]
            df_cph['n'] = len(df)
            ls_result.append(df_cph)
        except Exception as e:
            # keep going with the next gene, but say which model failed and why
            print(f'cph error {s_vital}: {e!r}')
    #save results
    df_result = pd.concat(ls_result) if ls_result else pd.DataFrame()
    df_result.to_csv(f'results/results_single_CPH_{s_censor}_gene_alterations.csv')

## Multivariable Cox proportional hazards models

In [None]:
#CPH multivariable
# Multivariable Cox proportional hazards models (fit and plotted by util.cph_plot): each
# variable of interest is modelled together with either gene-alteration covariates
# (b_genes True) or clinical covariates (b_genes False), for each time/censor pair.
# This cell does not define tu_time_censor or se_recur; it uses whatever earlier cells left
# in the kernel.

# '<score>_All' = primary value, filled with the met value where the primary is missing
for s_porg in ['pORG','pSUB','PurIST']:
    if s_porg == 'PurIST':
        df_patient[f'{s_porg}_All'] = df_patient.loc[:,f'{s_porg}_Primary'].fillna(df_patient.loc[:,f'{s_porg}_Met'])
    else:
        df_patient[f'{s_porg}_All'] = df_patient.loc[:,f'{s_porg}_allPrimary'].fillna(df_patient.loc[:,f'{s_porg}_allMet'])
    #break
# CDKN2A_Altered# ARID1B_Altered # KMT2D_Altered# MTAP_Altered TP53_Altered
# CDKN2A_Altered_Primary  MTAP_Altered_Primary # ARID1B_Altered_Primary # KMT2D_Altered_Primary
# KDM6A_Altered_Met # CDKN2A_Altered_Met
b_primary = False
for (s_time, s_censor) in tu_time_censor:
    print(s_censor)
    for b_genes in [True,False,]:
        if b_genes:
            s_covar = 'gene_alterations'
            figsize = (5,3)
        else:
            s_covar = 'clinical_covariates'
            figsize = (4,3)
        df_result_multi = pd.DataFrame()
        ls_multi = ['PurIST_Primary','pORG_Primary','pSUB_Primary', #'pORG_0.2_Met', 'pORG_0.2_All',,'PurIST_Met',
                'PurIST_All','pORG_All','pSUB_All',
                    'Liver_Cohort','Lung_Cohort',#'pSUB1e-04_Met',#'pSUB1e-04_All'
            'Other_Recurrence', 'No_Doc_Recur',
                     'Classical_Cohort',
               ]
        ls_cats = ['LV_Invasion','LN_Pos'] #categorical,'Neoadjuvant'
        if b_genes:
            ls_cats =['ARID1B_Altered_Primary','KMT2D_Altered_Primary','MTAP_Altered_Primary','CDKN2A_Altered_Primary']# ['CDKN2A_Altered_Primary','ARID1B_Altered_Primary','KMT2D_Altered_Primary','MTAP_Altered_Primary','TP53_Altered_Primary']#
        for s_multi in ls_multi:
            # NOTE(review): once ls_cats is switched here it is not reset, so every later
            # s_multi in this b_genes pass also uses the non-primary gene list, whether or
            # not it is in se_recur -- confirm this carry-over is intended
            if not b_primary:
                if len(se_recur.intersection(set([s_multi]))) and b_genes:
                    ls_cats = ['CDKN2A_Altered','ARID1B_Altered','KMT2D_Altered','MTAP_Altered','TP53_Altered']
            print(s_multi)
            # Grade and Stage are included only for the clinical-covariate models
            df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_multi,s_time,s_censor,'Grade','Stage','Public_Patient_ID']].copy()#.dropna() #
            if ls_cats[0].find('Altered') > -1:
                df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_multi,s_time,s_censor,'Public_Patient_ID']].copy()#.dropna()
            #if len(se_recur.intersection(set([s_multi]))):
            #    df.loc[df_patient.Recurrence_Sites_4=='No_Resection',s_multi] = np.nan
            df = df.dropna()    
            if df.loc[:,s_multi].dtype=='O':
                # string variable -> indicator column; s_multi is renamed to the first dummy column
                df_dummy = pd.get_dummies(df.loc[df_patient.Alive_30_days_post_surgery,[s_multi]],drop_first=True)
                if df_dummy.shape[1]>0:
                    df.drop(s_multi,axis=1,inplace=True)
                    s_multi = df_dummy.columns[0]
                    df[s_multi] = df_dummy
                else:
                    continue
            # covariates: dummy-encoded and joined on the patient ID; patients missing any covariate drop out
            df_dummy = pd.get_dummies(df_patient.loc[df_patient.Alive_30_days_post_surgery,ls_cats+['Public_Patient_ID']].dropna().set_index('Public_Patient_ID'),drop_first=True) #
            df = df.merge(df_dummy.reset_index(),on='Public_Patient_ID',how='left').set_index('Public_Patient_ID')
            df = df.dropna()
            print(len(df))
            try:
                # the figure is saved but left open here (plt.close is commented out)
                fig, cph = util.cph_plot(df,s_multi,s_time,s_censor,figsize=figsize)
                fig.savefig(f'figures/CPH_multi_{s_multi}_{s_covar}_{s_censor}.png')
                #plt.close(fig)
                df_result_model = cph.summary.loc[:,['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']].reset_index()
                df_result_model['model'] = s_multi
                df_result_model['n'] = len(df)
                df_result_multi=pd.concat([df_result_multi,df_result_model])
            except:
                print('cph error')
        #     break
    #     break
    # break
    #all the recurrence combos - multi
    #'''
        # pairwise recurrence-site comparisons, using the ls_cats left by the loop above
        s_vital = 'Recurrence_Sites_4'
        figsize=(5,3)
        # NOTE(review): .unique() includes NaN if the column has missing values, and the
        # " to ".join below would then raise outside the try block -- verify
        for tu_combo in combinations(df_patient.Recurrence_Sites_4.unique(),r=2):
            print(tu_combo)
            if b_genes:
                df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_vital,s_time,s_censor]]
            else:
                df = df_patient.loc[df_patient.Alive_30_days_post_surgery,[s_vital,s_time,s_censor,'Stage','Grade']]#.dropna(how='any')
            df = df[df.loc[:,s_vital].isin(tu_combo)].dropna()
            #order them
            df.Recurrence_Sites_4 = df.Recurrence_Sites_4.astype('category')
            df.Recurrence_Sites_4 = df.Recurrence_Sites_4.cat.set_categories((tu_combo))
            df_dummy = pd.get_dummies(df.loc[df_patient.Alive_30_days_post_surgery,[s_vital]],drop_first=False)
            df.drop(s_vital,axis=1,inplace=True)
            s_compare = " to ".join((tu_combo))
            # indicator of the first site of the pair; the second site is the reference
            df[s_compare] = df_dummy.iloc[:,0]
            # covariates are aligned on the row index here (concat), not merged on the patient ID
            df_dummy = pd.get_dummies(df_patient.loc[:,ls_cats].dropna(),drop_first=True) 
            df = pd.concat([df,df_dummy],axis=1)
            df = df.dropna()
            print(len(df))
            try:
                fig, cph = util.cph_plot(df,s_compare,s_time,s_censor,figsize=figsize)
                plt.tight_layout()
                # NOTE(review): file name says 'CPH_mutli_' while the models above save as 'CPH_multi_'
                fig.savefig(f'figures/CPH_mutli_{s_vital}_{s_compare}_{s_covar}_{s_censor}.png')
                plt.close(fig)
                df_result_model = cph.summary.loc[:,['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']].reset_index()
                df_result_model['model'] = s_compare
                df_result_model['n'] = len(df)
                df_result_multi=pd.concat([df_result_multi,df_result_model])
            except:
                print('cph error')
        
        # one CSV per covariate set and censor, holding both groups of models above
        df_result_multi.reset_index(drop=True).to_csv(f'results/results_multi_CPH_{s_covar}_{s_censor}.csv')
        #'''
        
    #     break
    # break
# save with edits
#df_patient.to_csv('annotation/20231206_Patient_Metadata.csv')

In [None]:
# df_result_multi.reset_index(drop=True).to_csv(f'results/results_multi_CPH_{s_covar}_{s_censor}_All.csv')

## Cox PH plots <a name="cphplot"></a>


[contents](#contents)

In [None]:
def forrest_plot(df_plot,s_title,figsize=(4,2.5),ymargin=0.1):
    '''
    Plot hazard ratios as a forest plot: one square marker with horizontal
    error bars per row of df_plot, plus a dashed reference line at HR = 1.

    df_plot: dataframe whose index labels are used as the y positions, with
        columns 'exp(coef)' (hazard ratio), 'ci' (pair of lower/upper error-bar
        lengths) and 'p' (p-value). When a 'model' column is present
        (multivariable results) the columns 'covariate' and 'n' are read too.
    s_title: axis title string; for multivariable results it also selects the
        covariate row whose HR, p-value and n are appended to the title
    figsize: figure size handed to plt.subplots
    ymargin: vertical margin handed to ax.margins
    returns: (fig, ax, df_plot). A 'color' column is added to df_plot in place.
        An empty df_plot yields an empty figure.
    '''
    fig,ax = plt.subplots(figsize=figsize,dpi=300)
    if len(df_plot) > 0:
        # marker colors: black by default, green for HR < 1 and red for HR > 1 when p < 0.05
        df_plot['color'] = 'k'
        df_plot.loc[((df_plot.loc[:,'exp(coef)']<1)&(df_plot.p<0.05)),'color'] = 'limegreen'
        df_plot.loc[((df_plot.loc[:,'exp(coef)']>1)&(df_plot.p<0.05)),'color'] = 'r'
        for idx, row in df_plot.iterrows():
            # xerr must be shaped (2,1): [[lower length],[upper length]] for this single point
            ax.errorbar(x=row['exp(coef)'], y=row.name,
                        xerr=np.array([row.ci[0],row.ci[1]]).reshape(2,1),
                        ecolor="k", capsize=3, linestyle='None', linewidth=1, marker="s",
                        markersize=5, mfc=row.color, mec=row.color)
        ax.axvline(x=1, linewidth=0.8, linestyle='--', color='black')
        ax.set_xlabel('Hazard Ratio')
        ax.set_title(s_title)
        # for multivariable results: report the stats of the covariate named in the title
        if 'model' in df_plot.columns:
            df_title = df_plot.loc[df_plot.covariate==s_title]
            # .iloc[0] selects by position; the index holds string labels, so [0] would
            # depend on pandas' deprecated positional fallback. Keep the plain title
            # when no covariate matches instead of raising.
            if len(df_title) > 0:
                s_n = f"{df_title['n'].iloc[0]}"
                s_p = f"{df_title['p'].iloc[0]:.2}"
                s_hr = f"{df_title['exp(coef)'].iloc[0]:.2}"
                ax.set_title(f'{s_title}\nHR={s_hr} p={s_p} n={s_n}')
        ax.margins(y=ymargin)
        plt.tight_layout()
    return(fig,ax,df_plot)

def make_ci(df):
    '''
    Add error-bar lengths around the hazard ratio to a CPH results table.

    df: dataframe with columns 'exp(coef)', 'exp(coef) lower 95%' and
        'exp(coef) upper 95%'
    returns: the same dataframe (modified in place) with new columns
        'upper' (upper bound minus HR), 'lower' (HR minus lower bound) and
        'ci' (tuple of (lower, upper) per row)
    '''
    s_hr = 'exp(coef)'
    d_length = {'upper': df[f'{s_hr} upper 95%'] - df[s_hr],
                'lower': df[s_hr] - df[f'{s_hr} lower 95%']}
    for s_col, se_length in d_length.items():
        df[s_col] = se_length
    df['ci'] = [(lower, upper) for lower, upper in zip(df['lower'], df['upper'])]
    return df

def bold_title(ax,s_title):
    '''
    Set in bold every y tick label of ax whose text equals s_title.

    ax: axes object providing get_yticklabels()
    s_title: label text to match exactly; non-matching labels are left untouched
    '''
    for label in (item for item in ax.get_yticklabels() if item.get_text() == s_title):
        label.set_fontweight('bold')
def title_replace(s_model): #
    '''
    Turn a model/covariate name into a plot label by applying a fixed
    sequence of substring substitutions, in order.

    s_model: raw name string
    returns: cleaned string
    '''
    ls_swap = [('Classical_Cohort_Lung','Classical_Lung_vs_Classical_Liver'),
               ('_',' '),
               ('True',''),
               (' YES',''),
               (' Classical ',' ')]
    s_new = s_model
    for s_old, s_sub in ls_swap:
        s_new = s_new.replace(s_old,s_sub)
    return s_new

In [None]:
#load patient metadata
# Re-reads the patient table from csv (first column as index) and rebinds df_patient,
# so the plotting cells below work from the saved file.
%matplotlib inline
s_out = 'annotation/20231219_Patient_Metadata.csv'#'annotation/20230921_Patient_Metadata.csv'
df_patient = pd.read_csv(s_out,index_col=0)
# #add recurrence censor
# df_patient.loc[df_patient.loc[:,'Days from Resection to Recurrence'].notna(),'Recurrence'] = 1 #event observed
# df_patient.loc[df_patient.No_Recurrence.fillna(False),'Recurrence'] = 0

In [None]:
# show which multivariable CPH result files exist in results/
for s_file in os.listdir('results'):
    if 'results_multi' in s_file:
        print(s_file)


In [None]:
# Forest plots of the single-variable CPH survival results
df = pd.read_csv('results/results_single_CPH_Survival.csv',index_col=0)
df = make_ci(df)
df.rename({'Classical_Cohort':'Classical_Lung_vs_Liver'},axis=0,inplace=True)
# The last six rows of the table are plotted in the first figure.
# NOTE(review): presumably these are the pairwise recurrence-site comparisons
# (e.g. 'Lung to Liver') — confirm against the csv
ls_row = df.index[-6::]
# copy before editing: the index is relabelled here and forrest_plot adds a
# 'color' column, neither of which should touch (or warn about) df
df_plot = df.loc[ls_row].copy()
df_plot.index = [item.replace('_',' ') for item in df_plot.index]
# plot
s_title = 'Single Variable CPH\nSurvival'
fig,ax, __ = forrest_plot(df_plot,s_title,figsize=(3.5,2.5))
fig.savefig('figures/cph_final_cohorts_compare.png')
#########################################################
#compare to all patients

lls_row = [['PurIST_Subtype','Classical_Lung_vs_Liver'], #,'Classical_Cohort'
           [ 'Liver_Cohort','Other_Recurrence','No_Doc_Recur','Lung_Cohort',
         ]]
for ls_row in lls_row:
    df_plot = df.loc[ls_row].copy()
    df_plot.index = [item.replace('_',' ') for item in df_plot.index]
    # reverse so the first listed row is drawn at the top of the y axis
    df_plot = df_plot.iloc[::-1]
    # plot: the two-row figure gets a smaller canvas and a larger y margin
    s_title = 'Single Variable CPH: Survival'
    if len(ls_row)==2:
        fig,ax,__ = forrest_plot(df_plot,s_title,figsize=(1,0.8),ymargin=0.2)
    else:
        fig,ax,__ = forrest_plot(df_plot,s_title,figsize=(2.5,2),ymargin=0.1)
    fig.savefig(f'figures/cph_final_cohorts_{len(ls_row)}.png')
    # only the first row list is plotted; the loop stops here
    break

In [None]:
# Forest plot of the single-variable CPH recurrence results
df = make_ci(pd.read_csv('results/results_single_CPH_Recurrence.csv',index_col=0))

# plot the last three rows of the table, listed top-to-bottom in file order
ls_row = df.index[-3::]
df_plot = df.loc[ls_row].copy()
df_plot.index = [item.replace('_',' ') for item in df_plot.index]
df_plot = df_plot.iloc[::-1]
s_title = 'Single Variable CPH: Recurrence'
# a two-row plot gets a smaller canvas and a larger y margin
if len(ls_row)==2:
    d_layout = {'figsize':(1,0.8),'ymargin':0.2}
else:
    d_layout = {'figsize':(3,2),'ymargin':0.1}
fig,ax,__ = forrest_plot(df_plot,s_title,**d_layout)
fig.savefig('figures/cph_final_cohorts_recur.png')

In [None]:
# Forest plots of the multivariable CPH results: one figure per model in the results file
s_variable = 'Survival' # 'Recurrence'#
d_rename = {'pORG_0.2_Primary':'pORG_Primary', 'pSUB1e-04_Primary':'pSUB_Primary',}
for s_covar in ['clinical_covariates','gene_alterations',]:#'gene_alterations_all',
    # the '_all' variant reads the '..._All.csv' file; every other covariate set uses the plain name
    if s_covar == 'gene_alterations_all':
        s_file = f'results_multi_CPH_{s_covar.split("_all")[0]}_{s_variable}_All.csv'
    else:
        s_file = f'results_multi_CPH_{s_covar}_{s_variable}.csv'
    df = make_ci(pd.read_csv(f'results/{s_file}',index_col=0))
    for s_col in ['covariate','model']:
        df[s_col] = df[s_col].replace(d_rename)

    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        for s_model in df.model.unique():
            print(s_model)
            df_plot = df.loc[df.model==s_model].copy()
            df_plot = df_plot.rename(d_rename)
            df_plot['covariate'] = df_plot['covariate'].map(title_replace)
            # label the y axis by covariate and order the rows by hazard ratio
            df_plot.index = df_plot['covariate']
            s_title = title_replace(s_model)
            df_plot = df_plot.sort_values(by='exp(coef)')
            print(s_title)
            # wider canvas for the longer 'X to Y' and 'X vs Y' model names
            if 'to' in s_model:
                figsize = (3.5,2.5)
            elif 'vs' in s_model:
                figsize = (3,2.5)
            else:
                figsize = (2.8,2.5)
            fig,ax,__ = forrest_plot(df_plot,s_title,figsize=figsize)
            # highlight the covariate the model is named after
            bold_title(ax,s_title)
            fig.savefig(f'figures/cph_final_{s_title}_{s_variable}_{s_covar}.png')
    # only the first covariate set is plotted; the loop stops here
    break

# Section 6 <a name="geneexp"></a>

Gene expression correlation

The 7 genes used for the IRDS signature were: STAT1, IFI44, IFIT3, OAS1,
IFIT1, G1P2, and MX1


[contents](#contents)

In [None]:
# Load patient metadata and the VST gene-expression table, then attach the
# per-patient scores in ls_add to the expression frame
s_out = '20231215_Patient_Metadata.csv'
df_patient = pd.read_csv(f'annotation/{s_out}',index_col=0)

df_vst = pd.read_csv('data/VST_Genes_Link.csv',index_col=0)
# transpose so rows are samples; dropping the last two characters of each sample
# label yields the key that is matched against Public_Patient_ID below
df_rna = df_vst.transpose().copy()
df_rna.index = [item[:-2] for item in df_rna.index]
print(len(df_rna))

ls_add = ['pORG_Primary','pORG_Met','pSUB_Primary','PurIST_Primary']
for s_add in ls_add:
    d_map = dict(zip(df_patient['Public_Patient_ID'],df_patient[s_add]))
    df_rna[s_add] = df_rna.index.map(d_map)


In [None]:
# Pearson correlation heatmaps between marker genes and each score in ls_add.
# Two figures per score: the full clustered matrix, and a lower-triangle version.
d_rename={'txi_pORG_Up_42_Genes':'pORG_42',
          'trim_padj_0.2_pORG_Up_55_Genes':'pORG_55',
          'pORG.14':'pORG_14', 'pORG.15':'pORG_15',
          'pORG_0.2_Primary':'pORG_0.2',
          'pSUB1e-04_Primary':'pSUB1e-04',
          'MRC1':'MRC1 (CD206)',
          'OLR1':'OLR1 (LOX1)',
          'MS4A1':'MS4A1 (CD20)'}
from scipy.stats import pearsonr
dim = (4,3)
for s_add in ls_add:
    ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1',#'LAMP3','ITGAX','CD209',
                 'CD68','OLR1','MRC1','MX1','STAT1',
                 s_add] #
    df_all = df_rna.loc[:,ls_marker].corr().rename(d_rename,axis=1).rename(d_rename,axis=0)
    print(len(df_rna))
    # clustermap is used only for its dendrogram ordering; the figure it draws is closed
    g = sns.clustermap(df_all)
    plt.close()
    categories_order = df_all.iloc[g.dendrogram_col.reordered_ind,:].index.tolist()
    df_all = df_all.loc[categories_order,categories_order]
    rho = df_rna.loc[:,ls_marker].corr() #df_all.corr()
    # pairwise Pearson p-values; subtracting the identity moves the diagonal of the
    # corr() result from 1 to 0 (pandas documents a diagonal of 1 for callable methods)
    pval = df_rna.loc[:,ls_marker].corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
    # significance stars: one '*' per threshold the p-value is at or below.
    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    p_vals = pval.apply(lambda se: se.map(lambda x: ''.join(['*' for t in [0.001,0.005,0.05] if x<=t])))
    p_vals = p_vals.rename(d_rename,axis=1).rename(d_rename,axis=0)
    p_vals = p_vals.loc[categories_order,categories_order]
    # figure 1: full matrix, colorbar labelled with the score name
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',ax=ax,
               cbar_kws={'shrink':0.85,'label':s_add})
    #temp
    plt.title(f'{s_add} n={len(df_rna)}')
    # figure 2: mask the strict upper triangle and blank the diagonal annotations
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    matrix = np.triu(np.ones_like(rho))
    np.fill_diagonal(matrix, val=0)
    # blank the diagonal on an explicit copy: writing through p_vals.values fails
    # when pandas returns a read-only array (Copy-on-Write)
    a_annot = p_vals.to_numpy(dtype=object,copy=True)
    np.fill_diagonal(a_annot,'')
    p_vals = pd.DataFrame(a_annot,index=p_vals.index,columns=p_vals.columns)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',
                ax=ax,mask=matrix,cbar_kws={'shrink':0.85,'label':'Pearson Correlation'},
               ) #'anchor':(-1.4,0.0)

In [None]:
# Load the VIPER table and attach the per-patient scores in ls_add.
# The sheet is read with samples as columns; the last two characters of each
# column label are dropped before transposing so rows are keyed like Public_Patient_ID.
df_viper = pd.read_excel('data/SupplementalDataset5.xlsx',index_col=0)
df_viper.columns = [item[:-2] for item in df_viper.columns]
df_viper = df_viper.transpose()
for s_add in ls_add:
    d_map = dict(zip(df_patient['Public_Patient_ID'],df_patient[s_add]))
    df_viper[s_add] = df_viper.index.map(d_map)


In [None]:
# Pearson correlation heatmaps between VIPER columns and each score in ls_add.
# Two figures per score: the full clustered matrix, and a lower-triangle version.
d_rename={'txi_pORG_Up_42_Genes':'pORG_42',
          'trim_padj_0.2_pORG_Up_55_Genes':'pORG_55',
          'pORG.14':'pORG_14', 'pORG.15':'pORG_15',
          'pORG_0.2_Primary':'pORG_0.2',
          'pSUB1e-04_Primary':'pSUB1e-04',
          'MRC1':'MRC1 (CD206)',
          'OLR1':'OLR1 (LOX1)',
          'MS4A1':'MS4A1 (CD20)','CD274':'CD274 (PD-L1)'}
from scipy.stats import pearsonr
dim = (4,3)

for s_add in ls_add:
    ls_marker = ['CD3G', 'MS4A1','CD274',#'ITGAX',#'CD209','CD4', 'CD8A','LAMP3',
                'IFNAR1','IFNAR2', #'CD164',#'FOXP3',#'OLR1','MRC1',
                 'MX1','STAT1',#'CD68',
                 s_add] #
    df_all = df_viper.loc[:,ls_marker].corr().rename(d_rename,axis=1).rename(d_rename,axis=0)
    print(len(df_viper))
    # clustermap is used only for its dendrogram ordering; the figure it draws is closed
    g = sns.clustermap(df_all)
    plt.close()
    categories_order = df_all.iloc[g.dendrogram_col.reordered_ind,:].index.tolist()
    df_all = df_all.loc[categories_order,categories_order]
    rho = df_viper.loc[:,ls_marker].corr() #df_all.corr()
    # pairwise Pearson p-values; subtracting the identity moves the diagonal of the
    # corr() result from 1 to 0 (pandas documents a diagonal of 1 for callable methods)
    pval = df_viper.loc[:,ls_marker].corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
    # significance stars: one '*' per threshold the p-value is at or below.
    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    p_vals = pval.apply(lambda se: se.map(lambda x: ''.join(['*' for t in [0.001,0.005,0.05] if x<=t])))
    p_vals = p_vals.rename(d_rename,axis=1).rename(d_rename,axis=0)
    p_vals = p_vals.loc[categories_order,categories_order]
    # figure 1: full matrix, colorbar labelled with the score name
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',ax=ax,
               cbar_kws={'shrink':0.85,'label':s_add})
    #temp
    plt.title(f'{s_add} n={len(df_viper)}')
    # figure 2: mask the strict upper triangle and blank the diagonal annotations
    fig, ax = plt.subplots(figsize=dim,dpi=300)
    matrix = np.triu(np.ones_like(rho))
    np.fill_diagonal(matrix, val=0)
    # blank the diagonal on an explicit copy: writing through p_vals.values fails
    # when pandas returns a read-only array (Copy-on-Write)
    a_annot = p_vals.to_numpy(dtype=object,copy=True)
    np.fill_diagonal(a_annot,'')
    p_vals = pd.DataFrame(a_annot,index=p_vals.index,columns=p_vals.columns)
    sns.heatmap(df_all, vmin=-1, vmax=1, annot=p_vals, fmt = '', cmap='RdBu_r',
                ax=ax,mask=matrix,cbar_kws={'shrink':0.85,'label':'Pearson Correlation'},
               ) #'anchor':(-1.4,0.0)

## gene expression versus liver lung

In [None]:
# Colorblind[8] is evaluated here but neither assigned nor displayed
# (it is not the last expression of the cell).
Colorblind[8]
# Two-color palettes. The hex codes match, case aside, the d_colorblind entries
# defined at the top of the notebook: pal_porg = (low, high), pal_liv = (Lung, Liver).
pal_porg = ('#56B4E9','#E69F00')
pal_liv = ('#0072B2','#D55E00')
# make pal_liv the default seaborn palette for subsequent plots
sns.set_palette(pal_liv)
#sns.palplot(Colorblind[8])

In [None]:
# Marker-gene expression in the Liver vs Lung cohorts: strip plot of all samples,
# a mean line per group, and t-test annotations per gene.
# NOTE: the 'T cell samples' pass reads df_tcell, which is created by the T cell
# loading cell further down this notebook; that cell must have been run first.
df_rna['Public_Patient_ID'] = df_rna.index
d_cohort = dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort))
df_rna['Cohort'] = df_rna.index.map(d_cohort)
for s_samples in ['T cell samples','All Samples']:
    ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1','CD68','OLR1','MRC1','MX1','STAT1']
    # restrict to patients with T cell data, or keep every row
    if s_samples == 'T cell samples':
        b_rows = df_rna.Public_Patient_ID.isin(df_tcell.Public_Patient_ID)
    else:
        b_rows = slice(None)
    # long format: one row per (gene, sample) pair
    df_plot = df_rna.loc[b_rows,ls_marker].unstack().reset_index()
    df_plot['Public_Patient_ID'] = df_plot['level_1']
    df_plot['Cohort'] = df_plot['Public_Patient_ID'].map(d_cohort)
    df_plot = df_plot.rename(columns={0:'Expression','level_0':'Gene'})
    fig,ax = plt.subplots(dpi=200,figsize=(5,3))
    sns.stripplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',dodge=True,ax=ax,s=2)
    # boxplot with everything hidden except the mean line
    d_mean_only = dict(showmeans=True,meanline=True,
                       meanprops={'color': 'k', 'ls': '-', 'lw': 2},
                       medianprops={'visible': False},whiskerprops={'visible': False},
                       showcaps=False,showfliers=False,showbox=False)
    sns.boxplot(data=df_plot,x='Gene',y='Expression',hue='Cohort',ax=ax,**d_mean_only)
    # keep the first two legend entries only (the two plot layers each add handles)
    h, l = ax.get_legend_handles_labels()
    ax.legend(h[0:2],l[0:2],loc='lower left')
    pairs = [((s_gene,'Lung'),(s_gene,'Liver')) for s_gene in ls_marker]
    annot = Annotator(ax, pairs, data=df_plot,x='Gene',y='Expression',hue='Cohort',
                      order=ls_marker,hue_order=('Lung','Liver'))
    annot.configure(test='t-test_ind')
    annot.apply_and_annotate()
    ax.set_title(f'Liver vs Lung: {s_samples}')
    plt.tight_layout()

In [None]:
# VIPER scores in the Liver vs Lung cohorts: strip plot of all samples,
# a mean line per group, and t-test annotations per regulon.
# NOTE: the 'T cell samples' pass reads df_tcell, which is created by the T cell
# loading cell further down this notebook; that cell must have been run first.
pal_liv_r = ('#D55E00','#0072B2',)
sns.set_palette(pal_liv_r)
df_viper['Public_Patient_ID'] = df_viper.index
df_viper['Cohort'] = df_viper.index.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
#liver vs lung (t cell)
for s_samples in ['All Samples','T cell samples',]:#
    ls_marker = ['CD3G', 'MS4A1','CD274','MX1','STAT1','IFNAR1','IFNAR2']
    # long format: one row per (regulon, sample) pair
    if s_samples == 'T cell samples':
        df_plot = df_viper.loc[df_viper.Public_Patient_ID.isin(df_tcell.Public_Patient_ID),ls_marker].unstack().reset_index()
    else:
        df_plot = df_viper.loc[:,ls_marker].unstack().reset_index()
    df_plot['Public_Patient_ID'] = df_plot.level_1#[item[0:-2] for item in df_plot.level_1]
    df_plot['Cohort'] = df_plot.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
    # the marker column must be named 'Regulon': every plotting call below uses x='Regulon'
    df_plot.rename({0:'VIPER Score','level_0':'Regulon'},axis=1,inplace=True)
    fig,ax = plt.subplots(dpi=200,figsize=(5,3))
    sns.stripplot(data=df_plot,x='Regulon',y='VIPER Score',hue='Cohort',dodge=True,ax=ax,s=2)
    # boxplot with everything hidden except the mean line
    sns.boxplot(data=df_plot,x='Regulon',y='VIPER Score',hue='Cohort',ax=ax,showmeans=True,medianprops={'visible': False},
                           whiskerprops={'visible': False},meanline=True,showcaps=False,
                           meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
    # keep the first two legend entries only (the two plot layers each add handles)
    h, l = ax.get_legend_handles_labels()
    ax.legend(h[0:2],l[0:2],loc='lower left')
    pairs = [((item,'Lung'),(item,'Liver')) for item in ls_marker]
    annot = Annotator(ax, pairs, data=df_plot,x='Regulon',y='VIPER Score',hue='Cohort',
                      order=ls_marker,hue_order=('Lung','Liver'))
    annot.configure(test='t-test_ind')
    annot.apply_and_annotate()
    ax.set_title(f'Liver vs Lung: {s_samples}')
    plt.tight_layout()

## Section 6 <a name="tcell"></a>

re-analyze t cell data

289 blood samples with matching 175 primary
tumors (141 overlapping with the RNA-seq dataset) and 43 metastatic tumors (33 overlapping with
the RNA-seq dataset). (218)

**missing 2 tumor (have 216), all 289 blood there**

290 unique patients (one primary tumor w/o blood: ST-00018360)

215 blood and tumor are matched

TOTAL = 174 primary, 42 met (216 total)

Of the 290 patients, 284 are analyzed (some were dropped due to death from surgery)

we analyzed blood samples from 77 patients in the liver cohort and 16
patients in the lung cohort, of which 60 and 16 were matched with tumor samples from the same
patient, respectively

TOTAL = 94 blood
TOTAL = 76 tumor

tumor distinct clones
used data from 214 matched pairs of tumor and blood samples

**213 are there, missing 1**

(TCR tumor: 59 and 16)

#### Exclude those who died of surgery

yes      ST-00018963

yes      ST-00020077

yes     ST-00016968

yes     ST-00006625

yes     ST-00007146

yes    ST-00018260

#### Actually analyzed 
Liver    76
Lung     16

| Cohort  | Tumor Type  |  number pts.  |
|---------|-------------|---------------|
| Liver   | Met         | 17            |
|         | Primary     | 42            |
| Lung    | Met         | 3             |
|         | Primary     | 13            |

[contents](#contents)

In [None]:
# Load the patient metadata and the TCR supplemental tables, then build df_merge:
# one row per metadata/T cell match on Public_Patient_ID.
s_out = '20231222_Patient_Metadata.csv'#'20230921_Patient_Metadata.csv'
df_patient = pd.read_csv(f'annotation/{s_out}',index_col=0)
# harmonize column names across the saved metadata versions
if s_out == '20230921_Patient_Metadata.csv':
    d_rename = {'pORG_0.2_Primary':'pORG_Primary', 'pORG_0.2_Met':'pORG_Met',
           'pORG_0.2_allPrimary':'pORG_allPrimary', 'pORG_0.2_allMet':'pORG_allMet' }
    df_patient.rename(d_rename,axis=1,inplace=True)
elif s_out in ('20231206_Patient_Metadata.csv','20231222_Patient_Metadata.csv'):
    df_patient.drop('pORG_All',axis=1,inplace=True)
else:
    print(s_out)

# the supplemental spreadsheets sit next to the Liver_Lung_PDAC code directory
s_suppdir = f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData'
# old patient data
df_patient_old = pd.read_excel(f'{s_suppdir}/SupplementalDataset1.xlsx',
                           sheet_name='Patients - Tab 1')
# T cell data - tumor
df_tcell = pd.read_excel(f'{s_suppdir}/SupplementalDataset6.xlsx',
                         sheet_name='Tumor Samples')
print(f'T cell tumor {len(df_tcell)}')
df_tcell.rename({'Tumor Type':'Tumor_Type'},axis=1,inplace=True)
# T cell data - blood
df_tcell_blood = pd.read_excel(f'{s_suppdir}/SupplementalDataset6.xlsx',
                         sheet_name='Blood Samples')
print(f'T cell blood {len(df_tcell_blood)}')

# tumor and blood side by side; the outer join keeps patients that have only one of the two
df_tcell = df_tcell.merge(df_tcell_blood,on='Patient ID',suffixes=(' tumor',' blood'),how='outer')
d_tcell_cols = {'Productive Rearrangements (Observed Richness)':'Productive_Rearrangements',
                'Templates per ng':'Templates_per_ng',
                'Patient ID':'Public_Patient_ID'}
df_tcell.rename(d_tcell_cols,axis=1,inplace=True)
print(f'T cell both {len(df_tcell)}')
df_merge = df_patient.merge(df_tcell,on='Public_Patient_ID',how='inner')
# missing_ids: some were dropped due to death from surgery
b_missing = ~df_tcell.Public_Patient_ID.isin(df_merge.Public_Patient_ID)
missing_ids = df_tcell[b_missing].Public_Patient_ID
df_not_matched = df_patient_old[df_patient_old.loc[:,'Patient ID'].isin(missing_ids)]
print(f'dropped {len(df_not_matched)} patients; days surv post surgery: {df_not_matched.loc[:,"Days from Resection to FU"].max()}')

# summary counts
print(f'Unique patients t cell {(df_merge.Public_Patient_ID.nunique())}')
print(df_merge.Cohort.value_counts())
print(df_merge.groupby(['Cohort','Tumor_Type']).count().Public_Patient_ID)
print(len(df_merge))

In [None]:
## add calculated Entropy, simpsons 
# This cell widens df_merge with per-patient TCR metrics read from csv files in data/:
#   1. diversity tables listed in d_load (tumor and blood values as separate columns)
#   2. clone-fraction tables listed in ls_clones (non-blood rows only)
#   3. derived diversity/clonality columns, a median survival split per cohort, and
#      blood productive rearrangements from results/
# NOTE: df_merge is re-merged onto itself here, so running the cell twice without
# rebuilding df_merge merges the same columns again.
#load data
ls_clones = ['TCR_Blood_Shared_Clones_rare0to2_all.csv', 'TCR_Blood_Shared_Clones_rare2to5_all.csv',
              'TCR_Blood_Shared_Clones_rare5to10_all.csv','TCR_Blood_Shared_Clones_rare10to665800_all.csv',#'TCR_Blood_Shared_Clones_rare1_all.csv',
              'TCR_Tumor_Distinct_Clones_rare0to2_tumbld.csv','TCR_Tumor_Distinct_Clones_rare2to5_tumbld.csv',
              'TCR_Tumor_Distinct_Clones_rare5to10_tumbld.csv', 'TCR_Tumor_Distinct_Clones_rare10to665800_tumbld.csv', #  'TCR_Blood_Shared_Clones_rare2_all.csv',
#  'TCR_Blood_Shared_Clones_rare5_all.csv',
#  'TCR_Blood_Shared_Clones_rare10_all.csv',
#  'TCR_Tumor_Distinct_Clones_rare10_tumor.csv',
#  'TCR_Tumor_Distinct_Clones_rare1_tumor.csv',
#  'TCR_Tumor_Distinct_Clones_rare2_tumor.csv',
#  'TCR_Tumor_Distinct_Clones_rare5_tumor.csv'
             ]
# file name -> columns to keep from that file (always including Public_Patient_ID)
d_load = {#'TCR_Tumor_Distinct_Clones_no_rare_denom.csv':['Public_Patient_ID',
           #     'Number Tumor Distinct Clones', 'Fraction Tumor Distinct Clones','Prod. Freq. Tumor Distinct Clones'],
          #'TCR_Tumor_Distinct_Rearrangements_no_rare.csv':['Public_Patient_ID',
       #'Number Tumor Distinct Rearrangements','Fraction Tumor Distinct Rearrangements','Prod. Freq. Tumor Distinct Rearrangements'],
        #  'TCR_Tumor_Distinct_Clones.csv':['Public_Patient_ID','Fraction_Tumor_Distinct_Clones'],#'Number Tumor Distinct Clones (rare)',
    'TCR_Simpsons_Evenness_templates.csv':['Public_Patient_ID','Simpsons_Evenness','Simpsons_Evenness_no_CMV'],
          'TCR_Simpsons_D_templates.csv':['Public_Patient_ID','Simpsons_D'],
          'TCR_Shannon_entropy_productive_frequency.csv':['Public_Patient_ID','Shannon_Entropy','Normalized_Shannon_Entropy'],
         }
for s_en, ls_markers in d_load.items():
    print(s_en)
    df_en = pd.read_csv(f'data/{s_en}')
    df_en.rename({'Unnamed: 0':'Patient_Specimen_ID','Shannon Entropy':'Shannon_Entropy',#'Percent Tumor Distinct Rearrangements in Tumor':'Percent Tumor Distinct Clones (JL)',   
             'Normalized Shannon Entropy':'Normalized_Shannon_Entropy'},axis=1,inplace=True)
    # this file name is commented out of d_load above, so the branch does not currently run
    if s_en == 'TCR_Tumor_Distinct_Clones.csv':
        df_en.rename({'Unnamed: 0':'Patient_Specimen_ID','Fraction Tumor Distinct Clones in Tumor':'Fraction_Tumor_Distinct_Clones'},axis=1,inplace=True)
    # specimen ids are split on '-': the last piece marks the sample ('B' is treated as
    # blood below) and the first two pieces joined by '-' form the patient id
    df_en['dash_end'] = [item.split('-')[-1] for item in df_en.Patient_Specimen_ID]
    df_en['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_en.Patient_Specimen_ID]
    #patient rows with columns for blood and tumor
    df_en_pat = pd.DataFrame(index=df_en.Public_Patient_ID.unique())
    df_tum = df_en[df_en.dash_end!='B'].loc[:,ls_markers].set_index('Public_Patient_ID')
    # prints True if a patient has more than one non-blood row (the left merge would then duplicate rows)
    print(df_tum.index.duplicated().any())
    df_en_pat = df_en_pat.merge(df_tum,left_index=True,right_index=True,how='left')
    if s_en.find('Tumor_Distinct') == -1:
        df_bld = df_en[df_en.dash_end=='B'].loc[:,ls_markers].set_index('Public_Patient_ID')
        print(df_bld.index.duplicated().any())
        # the blood columns share names with the tumor columns merged above, so the
        # suffixes split them into <name>_Tumor and <name>_Blood
        df_en_pat = df_en_pat.merge(df_bld,left_index=True,right_index=True,how='left',suffixes=('_Tumor','_Blood'))
    df_merge = df_merge.merge(df_en_pat.reset_index().rename({'index':'Public_Patient_ID'},axis=1),on='Public_Patient_ID',how='left')
    #break
for s_en in ls_clones:
    print(s_en)
    df_en = pd.read_csv(f'data/{s_en}')
    # first column whose name contains 'Fraction' is the one carried over
    s_col_name = df_en.columns[df_en.columns.str.contains('Fraction')][0]
    s_type = s_en.split('TCR_')[1].split('_Clones')[0]
    # rarity bin parsed from the file name, e.g. '0to2' or '10to665800'
    s_rare = s_en.split('rare')[1].split('_')[0]
    # build the output column name from the source column name and the rarity bin;
    # '10to665800' is shown as '>=10'
    s_new = f'{s_col_name.replace("Blood","").replace(" in Tumor","")} {s_rare}'
    s_new = s_new.replace("Clones 1","TCRs").replace("TCRs0","Clones 10").replace("  "," ").replace('10to665800','>=10')
    print(s_new)
    df_en.rename({'Unnamed: 0':'Patient_Specimen_ID',s_col_name:s_new},axis=1,inplace=True)
    df_en['dash_end'] = [item.split('-')[-1] for item in df_en.Patient_Specimen_ID]
    df_en['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_en.Patient_Specimen_ID]
    #patient rows with columns for blood and tumor
    df_en_pat = pd.DataFrame(index=df_en.Public_Patient_ID.unique())
    # only rows not ending in 'B' are kept, for the shared-clone files as well
    df_tum = df_en[df_en.dash_end!='B'].loc[:,[s_new,'Public_Patient_ID']].set_index('Public_Patient_ID')
    print(df_tum.index.duplicated().any())
    df_en_pat = df_en_pat.merge(df_tum,left_index=True,right_index=True,how='left')
    df_merge = df_merge.merge(df_en_pat.reset_index().rename({'index':'Public_Patient_ID'},axis=1),on='Public_Patient_ID',how='left')

# derived metrics: diversity is the reciprocal of Simpsons_D, clonality is
# one minus the normalized Shannon entropy
df_merge['Simpsons_Diversity_Tumor'] = 1/df_merge.Simpsons_D_Tumor#1-df_merge.Simpsons_D_Tumor#
df_merge['Simpsons_Diversity_Blood'] = 1/df_merge.Simpsons_D_Blood#1-df_merge.Simpsons_D_Blood#
df_merge['Clonality_Tumor'] = 1 - df_merge.Normalized_Shannon_Entropy_Tumor
df_merge['Clonality_Blood'] = 1 - df_merge.Normalized_Shannon_Entropy_Blood

#met or primary
# NOTE(review): the new column is 100 minus a column already named
# 'Percentage Tumor-Distinct Clones ...' — confirm the intended direction
df_merge['Percent Tumor Distinct Clones'] = 100 - df_merge.loc[:,'Percentage Tumor-Distinct Clones in Paired Tumor Sample']
df_merge.rename({'Patient Specimen ID':'Patient_Specimen_ID','Tumor Type':'Tumor_Type'},axis=1,inplace=True)
print((df_merge.Public_Patient_ID.nunique()))

# df_merge.loc[df_merge.loc[:,'PurIST_Met'] > 0.5,'PurIST_Subtype'] = 'basal-like'
# df_merge.loc[df_merge.loc[:,'PurIST_Met'] <= 0.5,'PurIST_Subtype'] = 'classical'

# define long and short lung
# The median is taken over Lung rows with a Tumor_Type. Rows exactly at the median
# match neither comparison and are left unlabelled; the third line blanks every row
# outside that Lung/Tumor_Type group.
s_time = 'Days from Diagnosis to FU'
short_long = df_merge.loc[(df_merge.Tumor_Type.notna()) & (df_merge.Cohort=='Lung'),s_time].median()
df_merge.loc[df_merge.loc[:,s_time] > short_long,'TCR_Lung_Median_Surv'] = 'lung_long'
df_merge.loc[df_merge.loc[:,s_time] < short_long,'TCR_Lung_Median_Surv'] = 'lung_short'
df_merge.loc[(df_merge.Tumor_Type.isna()) | (df_merge.Cohort!='Lung'),'TCR_Lung_Median_Surv'] = np.nan
# define long and short liver
# same scheme as for lung, using the Liver median
short_long = df_merge.loc[(df_merge.Tumor_Type.notna()) & (df_merge.Cohort=='Liver'),s_time].median()
df_merge.loc[df_merge.loc[:,s_time] > short_long,'TCR_Liver_Median_Surv'] = 'liver_long'
df_merge.loc[df_merge.loc[:,s_time] < short_long,'TCR_Liver_Median_Surv'] = 'liver_short'
df_merge.loc[(df_merge.Tumor_Type.isna()) | (df_merge.Cohort!='Liver'),'TCR_Liver_Median_Surv'] = np.nan

#add blood productive rearrangements
# the 'amino_acid' column of the Blood rows is mapped onto patients by Public_Patient_ID
df = pd.read_csv('results/results_Productive_rearrangements.csv',index_col=0)
df_blood_productive = df[df.Site=='Blood']
#df_blood_productive.Public_Patient_ID.duplicated().any()
df_merge['Productive_Rearrangements_Blood'] = df_merge.Public_Patient_ID.map(dict(zip(df_blood_productive.Public_Patient_ID,df_blood_productive.amino_acid)))

In [None]:
# Assign a Blood_Type (Primary / Met / missing) to each patient from the manually
# reviewed blood collection dates, and blank the blood metrics of excluded samples.
# Open question kept from the author: was it a mistake to comment out the
# 'Y - but include in all' handling?
# blood-derived columns that are set to NaN for excluded blood samples
ls_blood = ['Shannon_Entropy_Blood','Simpsons_Diversity_Blood',
            'Clonality_Blood', 'Simpsons_Evenness_Blood',
            "Simpson's Evenness blood",'Productive_Rearrangements_Blood',
            'Fraction Tumor Distinct TCRs',
            'Percent Tumor Distinct Clones',
            'Fraction Shared Clones 10',
            'Fraction Shared Clones 0to2',
            'Fraction Shared Clones 2to5', 'Fraction Shared Clones 5to10',
            'Fraction Shared Clones >=10', 'Fraction Tumor Distinct Clones 0to2',
            'Fraction Tumor Distinct Clones 2to5',
            'Fraction Tumor Distinct Clones 5to10',
            'Fraction Tumor Distinct Clones >=10',
            ]
# Blood_Type starts as a copy of Tumor_Type and is overwritten per review category below
df_merge['Blood_Type'] = df_merge.Tumor_Type.copy()
# id key workbook: start from the RNA-seq key sheet, then append the TCR tumor/blood
# entries whose specimen id is not in the RNA-seq key
d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/OLD Versions/Simplified_Public_IDs_Key.xlsx',sheet_name=None)
df_id = d_ids['RnaSeqKey']
ls_ids = df_id.loc[:,'Public.Specimen.ID']
for s_key in ['TcrTumorKey','TcrBloodKey']:
    df_key = d_ids[s_key]
    df_add = df_key.loc[~df_key.loc[:,'Public.Specimen.ID'].isin(ls_ids)]
    df_id = pd.concat([df_id,df_add])
# reviewed blood dates, keyed by OPTR; translate OPTR to the public patient id
df_blood_dates = pd.read_excel('FINAL_Blood_dates_to_review_JE.xlsx',index_col=0)
d_optr = dict(zip(df_id.OPTR,df_id.loc[:,'Biolibrary.Subject.ID']))
df_blood_dates['Public_Patient_ID'] = df_blood_dates.OPTR.map(d_optr)
print(len(df_blood_dates))
# review categories, processed in this order
ls_types = ['Y - but include in all', 'unknown', # no categorization of met blood
            'can be considered a primary blood',
            'can be considered a met blood',
            'exclude from analysis']
# review category -> Blood_Type written for the matching patients
d_blood_type = {'exclude from analysis':np.nan,
                'can be considered a primary blood':'Primary',
                'can be considered a met blood':'Met', #likely met
                'Y - but include in all':np.nan,
                'unknown':np.nan}
s_review = 'exclude blood primary or met?'
for s_type in ls_types:
    print(s_type)
    df_type = df_blood_dates[df_blood_dates.loc[:,s_review]==s_type]
    b_type = df_merge.Public_Patient_ID.isin(df_type.Public_Patient_ID)
    print(b_type.sum())
    if s_type in d_blood_type:
        df_merge.loc[b_type,'Blood_Type'] = d_blood_type[s_type]
    # excluded samples additionally lose all blood metrics
    if s_type == 'exclude from analysis':
        df_merge.loc[b_type,ls_blood] = np.nan
# note: blood of ST-00021096 was removed from analysis because it was collected
# 417 days before the PDAC diagnosis

In [None]:
#add immunarch metrics
# Reads every 'results_TCR*' csv in the R output directory and left-merges its
# columns onto df_merge twice: once from the tumor rows and once from the blood rows.
# NOTE: files are processed in os.listdir order, and df_merge is re-merged onto
# itself, so re-running the cell without rebuilding df_merge merges the columns again.
importlib.reload(util)
s_dir =f'{codedir.split("Liver_Lung_PDAC")[0]}R'
df_meta = pd.read_csv(f'{s_dir}/raw_TCR_data_test/metadata.txt',sep='\t')
# collapse Primary/Met into a single 'Tumor' sample type
df_meta['Sample_Type'] = df_meta.Site.replace({'Primary':'Tumor','Met':'Tumor'})
for s_file in os.listdir(s_dir):
    if s_file.find('results_TCR') > -1:
        print(s_file)
        df = pd.read_csv(f'{s_dir}/{s_file}',index_col=0)
        if s_file =='results_TCR_hill_diversity.csv':
            # long -> wide: one column per value of Q, named Hill_<Q>
            df.set_index('Sample',inplace=True)
            df = df.pivot(columns='Q')
            df.columns = [f'Hill_{item[1]}' for item in df.columns]
            df.index.name = 'INDEX'
        elif s_file.find('overlap.csv') > -1:
            # strip the raw-data prefix from both axes of the overlap matrix
            df.columns = [item.split('TCR_raw_data_')[1] for item in df.columns]
            df.index = [item.split('TCR_raw_data_')[1] for item in df.index]
            #break
            # NOTE(review): util.process_overlap is defined outside this notebook;
            # its output layout is not visible here
            df = util.process_overlap(df,df_meta,ls_site=['Sample_Type'])
            # prefix metric columns with the second-to-last '_' token of the file name
            prefix = f"{s_file.split('_')[-2]}_"
            df.rename({item:prefix+item for item in df.columns[df.columns!='Sample']},axis=1,inplace=True)
            print(df.columns)
            #break
        else:
            prefix = f"{s_file.split('_')[-2]}_"
            df.rename({item:prefix+item for item in df.columns[df.columns!='Sample']},axis=1,inplace=True)
        # Build a clean 'Sample' column. Try, in order: strip the prefix from an existing
        # Sample column; strip it from the index; fall back to the index as is.
        # Only the errors a missing column, non-string label or missing prefix can raise
        # are caught, so unrelated failures (and interrupts) are not swallowed.
        try:
            df['Sample'] = [item.split('TCR_raw_data_')[1] for item in df.Sample]
        except (AttributeError, IndexError, KeyError, TypeError):
            try:
                df['Sample'] = [item.split('TCR_raw_data_')[1] for item in df.index]
            except (AttributeError, IndexError, KeyError, TypeError):
                df['Sample'] = df.index
        # patient id = 'ST-' plus the second '-'-separated piece of the sample name
        df['Public_Patient_ID'] = ['ST-' + item.split('-')[1] for item in df.Sample]
        # samples whose name contains '-B' are treated as blood, all others as tumor;
        # columns with 'Sample' in their name are dropped before merging
        df_blood = df.loc[df.Sample.str.contains('-B'),~df.columns.str.contains('Sample')]
        df_tum = df.loc[~df.Sample.str.contains('-B'),~df.columns.str.contains('Sample')]
        df_merge = df_merge.merge(df_tum,on='Public_Patient_ID',suffixes=('','_tumor'),how='left')
        df_merge = df_merge.merge(df_blood,on='Public_Patient_ID',suffixes=('','_blood'),how='left')
        #break


In [None]:
#add pORG quartiles, plot violins 
# good/ old way with pORG Primary, Mets
# Rosie chose this way
#
# For each pORG score column (`s_porg`) and each TCR metric in `ls_foci`:
#   1. subset `df_merge` to the matching tumor/blood sample type,
#   2. call util.add_quartiles on the subset (presumably adds the `quartiles`
#      high/low column that `d_order` refers to -- confirm in util),
#   3. run util.violin_stats2 / util.plot_violins2 and save the figure under figures/.
# NOTE(review): requires `df_merge` from an earlier cell. The names `s_foci`,
# `fig`, `ax`, `ls_foci` and `se_non_para` left behind by this cell are read by
# later cells.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    %matplotlib inline
    sns.set_palette('tab10')
    importlib.reload(util)
    alpha = 0.05
    b_correct= False #False #True #
    # Metrics to plot. Only the uncommented entries (blood metrics) are active;
    # the commented entries are alternative selections.
    ls_foci = [#'Fraction Shared Clones 0to2',
    #        'Fraction Shared Clones 2to5', 'Fraction Shared Clones 5to10',
    #        'Fraction Shared Clones >=10', 'Fraction Tumor Distinct Clones 0to2',
    #        'Fraction Tumor Distinct Clones 2to5',
    #        'Fraction Tumor Distinct Clones 5to10',
    #        'Fraction Tumor Distinct Clones >=10', 
    #     'Hill_1', 'Hill_2', 'Hill_3',
    #        'Hill_4', 'Hill_5', 'Hill_6',
    #            'true_Value','ginisimp_Value',
    #             'd50_Clones','chao_Estimator', 
        ####### tumor ##########
    #            'Productive_Rearrangements',
    #    'Templates_per_ng','Simpsons_Diversity_Tumor',
    # 'Shannon_Entropy_Tumor', 'Clonality_Tumor',
    #    "Simpson's Evenness tumor",#'Fraction Tumor Distinct TCRs',# 'propshared_proportion_shared_templates','propshared_proportion_shared_clones',
    #          'Percent Tumor Distinct Clones',
    #            'morisita_Liver_Primary', 'morisita_Liver_Met', 'morisita_Lung_Primary',
    #        'morisita_Lung_Met', 'morisita_low_Primary',
    #     'morisita_high_Primary','morisita_low_Met','morisita_high_Met',
    #     'morisita_Liver_Primary_Bld', 'morisita_Lung_Primary_Bld', 
    #        'morisita_Liver_Met_Bld','morisita_Lung_Met_Bld',
    #        'morisita_high_Primary_Bld', 'morisita_low_Met_Bld',
    #        'morisita_low_Primary_Bld',  'morisita_high_Met_Bld',  
        ####### blood ##########
    #            'Hill_1_blood','Hill_2_blood', 'Hill_3_blood','Hill_4_blood', 
    #        # 'Hill_5_blood','Hill_6_blood', 'propshared_proportion_shared_clones_blood',  'propshared_proportion_shared_templates_blood',
    #            'chao_Estimator_blood','d50_Clones_blood',
    #            'ginisimp_Value_blood','true_Value_blood',
               'Productive_Rearrangements_Blood',
                "Simpson's Evenness blood",
              'Shannon_Entropy_Blood',
       'Simpsons_Diversity_Blood',
                'Clonality_Blood',
    #  'morisita_Liver_Primary_Bld_blood','morisita_Liver_Primary_blood', 'morisita_Lung_Primary_Bld_blood',
    #  'morisita_Lung_Primary_blood', 'morisita_high_Primary_Bld_blood',
    #  'morisita_high_Primary_blood', 'morisita_low_Primary_Bld_blood','morisita_low_Primary_blood' ,
    #  'morisita_Liver_Met_Bld_blood','morisita_Liver_Met_blood', 'morisita_Lung_Met_Bld_blood',
    #  'morisita_Lung_Met_blood','morisita_high_Met_Bld_blood',
    #  'morisita_high_Met_blood', 'morisita_low_Met_Bld_blood','morisita_low_Met_blood',
    ]
    # sorted() over a DataFrame iterates its column labels, so these are sorted
    # lists of the df_merge column names containing each keyword
    ls_morisita = sorted(df_merge.loc[:,(df_merge.columns.str.contains('morisita'))])
    ls_jaccard = sorted(df_merge.loc[:,(df_merge.columns.str.contains('jaccard'))])
    ls_public = sorted(df_merge.loc[:,(df_merge.columns.str.contains('public'))])
    # no-op as written; un-comment the tail to append the overlap metrics
    ls_foci =  ls_foci #+ ls_jaccard + ls_public +ls_morisita
    # rebinds the d_colorblind from the imports cell, adding 'Met' and 'Primary' entries
    d_colorblind = {'Liver':'#d55e00','Lung':'#0072b2',
                   'high':'#e69f00','low': '#56b4e9',
                   'basal-like':'#000000','classical':'#cc79a7',
                   'high pSUB': '#f0e442','low pSUB':'#009E73',
                   'Met':'black','Primary':'lightgray'}
    #non- parametric
    # Metrics listed here are analysed twice in the s_trans loop below: untransformed
    # with s_stats='non-parametric', then log/arcsinh-transformed with s_stats='mean'.
    # Metrics not listed are analysed once, untransformed, with s_stats='mean'.
    se_non_para = pd.Series( ['Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood', 'Simpsons_Evenness_Blood',
        'Templates_per_ng','Productive_Rearrangements','Simpsons_Evenness_Tumor',
         "Simpson's Evenness tumor",'Productive_Rearrangements_Blood',
        "Simpson's Evenness blood",    'Hill_1', 'Hill_2', 'Hill_3',
           'Hill_4', 'Hill_5', 'Hill_6', 'true_Value',#'ginisimp_Value',
                'd50_Clones','chao_Estimator',     'Hill_1_blood','Hill_2_blood', 'Hill_3_blood',
       'Number Tumor Distinct Clones','Hill_4_blood', 'chao_Estimator_blood','d50_Clones_blood',
          'true_Value_blood', 'morisita_Liver_Primary_Bld', 'morisita_Liver_Met_Bld',
           'morisita_Lung_Met_Bld', 'morisita_Lung_Primary_Bld',
           'morisita_Liver_Primary', 'morisita_Liver_Met', 'morisita_Lung_Primary',
           'morisita_Lung_Met', 'morisita_low_Primary',
           'morisita_high_Primary_Bld', 'morisita_high_Primary',
           'morisita_low_Primary_Bld', 'morisita_low_Met', 'morisita_high_Met_Bld',
           'morisita_high_Met', 'morisita_low_Met_Bld', 'morisita_Liver_Primary_Bld_blood','morisita_Liver_Primary_blood', 'morisita_Lung_Primary_Bld_blood',
     'morisita_Lung_Primary_blood', 'morisita_high_Primary_Bld_blood',
     'morisita_high_Primary_blood', 'morisita_low_Primary_Bld_blood','morisita_low_Primary_blood' ,
     'morisita_Liver_Met_Bld_blood','morisita_Liver_Met_blood', 'morisita_Lung_Met_Bld_blood',
     'morisita_Lung_Met_blood','morisita_high_Met_Bld_blood','morisita_high_Met_blood',
      'morisita_low_Met_Bld_blood','morisita_low_Met_blood',]+ls_public+ls_jaccard+ls_morisita
                           )
    # grouping column -> category order, passed to the util plotting helpers;
    # the LAST key also appears in the saved file name
    d_order =  {#'Blood_Type':['Met','Primary'],
                'Cohort':['Liver','Lung'],#'PurIST_Subtype':['basal-like','classical'],
        #'Tumor_Type':['Met','Primary'],
        'quartiles':['high','low'],
               }
    # # met vs primary
    # d_order =  {'Blood_Type':['Met','Primary'],
    #     #'Tumor_Type':['Met','Primary'],
    #            }
    # NOTE: the unconditional `break` at the end of this loop body means only the
    # first entry ('pORG_Met') is actually processed.
    for s_porg in ['pORG_Met','pORG_All','pORG_Primary']:#
        for s_foci in ls_foci:#ls_public+ls_jaccard+ls_morisita:#
            #if  s_foci.find('blood') > -1:
            #    continue
            # Pick the rows for this score: tumor metrics are filtered on Tumor_Type,
            # metrics whose name contains 'lood' (Blood/blood) on Blood_Type.
            if s_porg.find('Primar') > -1:
                print('Primaries')
                df_pri = df_merge.loc[(df_merge.Tumor_Type=='Primary')].copy()
                if s_foci.find('lood') > -1:
                    df_pri = df_merge.loc[(df_merge.Blood_Type=='Primary')].copy()
            elif s_porg.find('All') > -1: 
                print('Primaries and Mets')
                # Build one pORG_All value per patient: rows with a pORG_allPrimary value
                # are taken first, then rows with a pORG_allMet value for patients not
                # already included.
                df_whole = pd.DataFrame(columns=df_merge.columns)#
                for s_all in ['pORG_allPrimary', 'pORG_allMet']:
                    df_half = df_merge[(df_merge.loc[:,s_all].notna())]
                    df_half = df_half[~df_half.Public_Patient_ID.isin(df_whole.Public_Patient_ID)].copy()
                    df_half.rename({s_all:'pORG_All'},axis=1,inplace=True)
                    #df_half = df_half.loc[:,~df_half.columns.duplicated()]
                    #print(len(df_half))
                    with warnings.catch_warnings():
                        warnings.filterwarnings("ignore",category=FutureWarning)
                        df_whole = pd.concat([df_whole,df_half],axis=0,ignore_index=True)
                        df_whole.reset_index(inplace=True, drop=True)
                #s_foci not defined
                df_pri = df_merge.merge(df_whole.loc[:,['Public_Patient_ID','pORG_All']],on='Public_Patient_ID',how='left')#.loc[:,s_foci].notna()
                # blank out Cohort on repeated rows of the same patient
                df_pri.loc[df_pri.Public_Patient_ID.duplicated(),'Cohort'] = np.nan
            elif s_porg.find('Met') > -1:
                print('Mets')
                df_pri = df_merge.loc[df_merge.Tumor_Type=='Met'].copy()
                if s_foci.find('lood') > -1:
                    df_pri = df_merge.loc[(df_merge.Blood_Type=='Met')].copy()
            df_pri = util.add_quartiles(df_pri,s_porg)#.drop(213) #p. rearrangements outlier 
            
            # 'none' runs before 'log', so the in-place transform of df_pri[s_foci]
            # below does not affect the untransformed plot.
            for s_trans in ['none','log']:#
                if se_non_para.isin([s_foci]).any():
                    s_stats = 'non-parametric'
                    if s_trans == 'log':#log transform
                        # arcsinh instead of log when the column has values <= 0
                        if df_pri.loc[:,s_foci].min() <= 0:
                            df_pri.loc[:,s_foci] = np.arcsinh((df_pri.loc[:,s_foci]))
                        else:
                            df_pri.loc[:,s_foci] = np.log(df_pri.loc[:,s_foci])
                        s_stats = 'mean'
                    else:
                        s_stats = 'non-parametric'#continue
                else:
                    s_stats = 'mean' 
                    if s_trans == 'log':
                        continue
                #df_pri = df_pri[df_pri.quartiles == 'high'] #df_pri = df_pri[df_pri.Cohort == 'Lung']
                df_both,d_pval,order = util.violin_stats2(df_pri,d_order,s_foci,s_stats)
                #util.qq_plot_hist(df_pri,s_cat,s_foci)  #anova eval
                fig,pvalues,corrected = util.plot_violins2(df_both,d_pval,d_order,s_stats,s_foci,order,d_colorblind,s_porg,b_correct=b_correct)#True#False
                if s_trans == 'log':
                    s_label = f'log {fig.get_axes()[0].get_ylabel()}'
                    fig.get_axes()[0].set_ylabel(s_label)
                fig.savefig(f'figures/violinplot_both_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
                #plt.close(fig)
                # kept so a later cell can adjust the axes of the last figure
                ax = fig.get_axes()
            #break
            # The triple-quoted block below is a bare string statement, i.e. disabled
            # code (Spearman/Pearson scatter plots); it is never executed.
            '''
            # SPEARMAN corr to Templates_per_ng
            #for s_corr in ['morisita_Primary','morisita_Met','morisita_Blood']:
            #'jaccard_Blood','jaccard_Tumor','public_Blood','public_Tumor'
            for s_corr in ['Clonality_Tumor']:
                if s_porg.find('All') > -1:
                    continue
                    fig2, pvalues2 = util.plot_pearson(df_pri,s_corr,s_foci,s_stats='non-parametric',ls_plots=['Primaries','Mets','Both']) #='non-parametric'
                    fig2.savefig(f'figures/scatterplot_pearson_all_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
                elif s_porg.find('Primar') > -1:
                    fig2, pvalues2 = util.plot_pearson(df_pri,s_corr,s_foci,s_stats,ls_plots=['Primaries'])
                    fig2.savefig(f'figures/scatterplot_pearson_primary_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
                elif s_porg.find('Met') > -1:
                    fig2, pvalues2 = util.plot_pearson(df_pri,s_corr,s_foci,s_stats,ls_plots=['Mets'])
                    fig2.savefig(f'figures/scatterplot_pearson_met_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
                #plt.close(fig2)
            #break
            #'''
        
        #plot Cohort versus pORG
        # (disabled: bare string statement, never executed)
        '''
        pal_liv_r = ('#D55E00','#0072B2',)
        sns.set_palette(pal_liv_r)
        s_type = s_porg.split('_')[-1]
        if not s_type == 'All':
            fig3, ax3 = plt.subplots(figsize=(3,3),dpi=200)
            sns.stripplot(data=df_pri,x='Cohort',y=s_porg,ax=ax3)
            sns.boxplot(data=df_pri,x='Cohort',y=s_porg,showmeans=True,medianprops={'visible': False},
                               whiskerprops={'visible': False},meanline=True,showcaps=False,ax=ax3,
                               meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
            a = df_pri.loc[(df_pri.Tumor_Type==s_type) & (df_pri.Cohort=='Liver'),s_porg].dropna()
            b = df_pri.loc[(df_pri.Tumor_Type==s_type) & (df_pri.Cohort=='Lung'),s_porg].dropna()
            stat, pvalue = stats.ttest_ind(a,b)
            ax3.set_title(f'{s_type}\np={pvalue:.3}')
            plt.close()
            #'''
        # The next string holds the disabled step that would write the
        # `<s_porg>_quartiles` columns back onto df_merge. While it stays a string
        # this cell does not create those columns.
        '''break
        #add pORG quartiles back to data (all, primary and met)
        df_merge[f'{s_porg}_quartiles'] = df_merge.Public_Patient_ID.map(dict(zip(df_pri.Public_Patient_ID,df_pri.quartiles)))
        print(df_merge[f'{s_porg}_quartiles'].value_counts())
        try:
            df_merge['pORG_0.2_All'] = df_merge.Public_Patient_ID.map(dict(zip(df_pri.Public_Patient_ID,df_pri.loc[:,'pORG_0.2_All'])))
        except:
            continue
        #break'''
        # stops after the first s_porg (see note at the top of the loop)
        break

In [None]:
#old plots
# ls_col = [#'ginisimp_Value','d50_Clones',
#           #'true_Value','d50_Percentage',
#     'chao_Estimator',]
# %matplotlib inline
# #plot numbers
# import itertools
# from statannotations.Annotator import Annotator

# ls_x = ['Cohort_Met_Site',
#         'Cohort_Primary_Site',
#     'pORG_0.2_Met_quartiles_Site',
#     'pORG_0.2_Primary_quartiles_Site',
#         ]
# for y in ls_col:
#     for x in ls_x:
#         order = sorted(df_meta.loc[:,x].dropna().unique())
#         pairs = [item for item in itertools.combinations(order, 2)]
#         fig,ax=plt.subplots(dpi=200,figsize=(4 + len(pairs)*.3,4))
#         sns.boxplot(data=df_meta,x=x,y=y,showfliers=False,ax=ax)
#         sns.stripplot(data=df_meta,x=x,y=y,palette='dark',ax=ax,alpha=0.8,order=order)#s=2,
#         annotator = Annotator(ax=ax, pairs=pairs,data=df_meta,x=x,y=y)
#         annotator.configure(test='Mann-Whitney', verbose=0).apply_test().annotate()
#         ax.set_title(f'{y} vs {x}')
#     break
    

In [None]:
# for productive rearrangements
# Clip the y axis of the last violin figure so the bulk of the data is readable.
# NOTE(review): relies on `s_foci`, `fig` and `ax` left over from the violin-plot
# cell; run that cell first.
if s_foci == 'Productive_Rearrangements': #and s_porg=='pORG_0.2_Primary':
    print("Both conditions are True")
    # lower bound is a hard-coded value (presumably copied from the auto-scaled
    # axis); the upper bound of 150000 cuts off the extreme values
    ax[0].set_ylim((-22727.04999999999, 150000))
    # plt.show() does not take a figure argument; display() (provided by IPython
    # in notebook kernels) renders this specific figure with the new limits
    display(fig)

# # #for carl
# df = df_merge.loc[:,['Public_Patient_ID',f'pORG_0.2_Met_quartiles','Cohort','Tumor_Type']].set_index(
#     'Public_Patient_ID').sort_values(by='pORG_0.2_Met_quartiles')
# df[df.Tumor_Type=='Met'].to_csv('TCR_met_patients.csv')

In [None]:
# ####### get diagnosis dates ############
# df_new_surv = pd.read_excel('../annotation/cancer_participant_overview_optr_2023-05-04_16-55-18.xlsx')
# df_date = df_new_surv.loc[:,(df_new_surv.columns.str.contains('Date')) | (df_new_surv.columns=='Participant ID')].copy()
# df_date.rename({'Participant ID':'OPTR ID'},axis=1,inplace=True)
# df_date.set_index('OPTR ID',inplace=True)
# df_date = df_date.astype('datetime64[ns]')
# df_date['OPTR'] = df_date.index
# ############ get adaptive TCR dates ############
# df_adap = pd.read_excel('../annotation/Adaptive-collection dates_JE.xlsx') #297 rows
# df_adap.rename({'OPTR ID':'OPTR'},axis=1,inplace=True)
# df_adap['date tumor collection'] = df_adap['date tumor collection'].astype('datetime64[ns]')
# b_blood = df_adap.loc[:,'tumor and blood same collection date?']!='yes'
# ls_check = df_adap.loc[b_blood,'OPTR']
# #number of unmatched bloods/ diff blood and tumor collection
# print(len(ls_check))
# #get IDs
# d_ids = pd.read_excel(f'{codedir.split("Liver_Lung_PDAC")[0]}MethodsAndReferencesSupplementalData/OLD Versions/Simplified_Public_IDs_Key.xlsx',sheet_name=None)
# df_id = d_ids['RnaSeqKey']
# # add patients w/o RNA seq
# for s_key in ['TcrTumorKey','TcrBloodKey']:
#     df_add = d_ids[s_key].loc[~d_ids[s_key].loc[:,'Public.Specimen.ID'].isin(ls_ids)]
#     df_id = pd.concat([df_id,df_add])
# ## get times from diagnosis, resection, recurrence
# df_patient = pd.read_csv('annotation/20230921_Patient_Metadata_OPTR.csv',index_col=0) #not all OPTR
# df_patient['OPTR'] = df_patient.Public_Patient_ID.map(dict(zip(df_id.loc[:,'Biolibrary.Subject.ID'],df_id.OPTR)))
# df_patient['OPTR'] = np.int64(df_patient.OPTR)
# df_patient['Diagnosis_Date'] = df_patient.OPTR.map(dict(zip(df_date.OPTR,df_date.loc[:,'Diagnosis Date'])))
# #df_check = df_patient[df_patient.OPTR.isin(ls_check)].copy()
# df_check = df_adap[df_adap.OPTR.isin(ls_check)].copy()
# df_check['OPTR'] = np.int64(df_check.OPTR)
# #number of not same day patients with diagnosis date
# print(len(df_check))
# df_patient.rename({'Vital Status at FU':'Vital_Status_at_FU'},axis=1, inplace=True)
# #convert to date time
# ls_col = ['OPTR', 'Days from Diagnosis to FU', 'Days from Resection to Recurrence',
#        'Days from Resection to FU', 'Days from Earliest Recur to FU','Diagnosis_Date','Vital_Status_at_FU']
# df_pt_dates = df_patient.loc[:,ls_col].copy()

# for s_col in ls_col:
#     if s_col.find(' ') > -1:
#         df_pt_dates[s_col.replace(' ',"_")] = pd.to_timedelta(df_pt_dates[s_col], unit='D')
# #calculate dates
# import datetime as dt
# df_pt_dates['FU_date'] = df_pt_dates.Diagnosis_Date + df_pt_dates.Days_from_Diagnosis_to_FU
# df_pt_dates['Recur_date'] = df_pt_dates.FU_date - df_pt_dates.Days_from_Earliest_Recur_to_FU
# df_pt_dates['Resect_date'] = df_pt_dates.FU_date - df_pt_dates.Days_from_Resection_to_FU
# #merge
# df_check = df_check.merge(df_pt_dates,on='OPTR',how='left')
# #classify bloods
# se_blood = df_check.loc[:,'date blood collection']
# df_check.loc[se_blood > df_check.Recur_date,'Blood_Status'] = 'Blood_after_recurrence'
# df_check.loc[se_blood <= df_check.Recur_date,'Blood_Status'] = 'Blood_after_resection_before_recurrence'
# df_check.loc[se_blood <= df_check.Resect_date,'Blood_Status'] = 'Blood_before_resection'
# df_check.loc[se_blood < df_check.Diagnosis_Date,'Blood_Status'] = 'Blood_before_diagnosis'
# #save out blood type
# ls_drop = ['Days from Diagnosis to FU', 'Days from Resection to Recurrence',
#        'Days from Resection to FU', 'Days from Earliest Recur to FU',]
# df_check.drop(ls_drop,axis=1).to_csv('Blood_dates_to_review.csv')

In [None]:
##save out metadata with TCR
# Write the selected TCR metrics and sample-type labels, joined with the patient
# metadata, to annotation/20231215_Patient_Metadata_TCR_Metrics.csv.
# NOTE(review): needs `df_merge` (with the *_quartiles columns listed below),
# `df_patient` and `ls_foci` from other cells -- confirm they are defined before
# running from a fresh kernel.
#ls_col = df_merge.columns[~df_merge.columns.isin(df_patient.columns)].tolist() + ['Public_Patient_ID']
ls_merge = ['pORG_All_quartiles','pORG_Met_quartiles',
            'pORG_Primary_quartiles','Blood_Type','Tumor_Type','TCR_Met_Site']
ls_col = ls_foci + ls_merge 
#print(len(ls_col))
# df_merge is the left side, so every df_merge row is kept; patient-metadata
# columns that would collide with ls_col are dropped from the right side.
# (The earlier merge in the opposite direction was overwritten immediately and
# has been removed.)
df_out = df_merge.loc[:,ls_col+['Public_Patient_ID']].merge(df_patient.loc[:,~df_patient.columns.isin(ls_col)],on='Public_Patient_ID',how='left')
df_out.to_csv('annotation/20231215_Patient_Metadata_TCR_Metrics.csv') #20231108 reclassify bloods
# sanity check shown as the cell output: True means some patient appears more than once
df_out.Public_Patient_ID.duplicated().any()

In [None]:
# #add the all Cohort vs pORG
# if s_porg == 'pORG_0.2_All':
#     for s_type in ['Primary','Met']:
#         fig3, ax3 = plt.subplots(figsize=(3,3),dpi=300)
#         sns.stripplot(data=df_pri[df_pri.Tumor_Type==s_type],x='Cohort',y=s_porg,ax=ax3)
#         sns.boxplot(data=df_pri[df_pri.Tumor_Type==s_type],x='Cohort',y=s_porg,showmeans=True,medianprops={'visible': False},
#                            whiskerprops={'visible': False},meanline=True,showcaps=False,ax=ax3,
#                            meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
#         a = df_pri.loc[(df_pri.Tumor_Type==s_type) & (df_pri.Cohort=='Liver'),s_porg].dropna()
#         b = df_pri.loc[(df_pri.Tumor_Type==s_type) & (df_pri.Cohort=='Lung'),s_porg].dropna()
#         stat, pvalue = stats.ttest_ind(a,b)
#         ax3.set_title(f'{s_type}\np={pvalue:.3}')

In [None]:
# #Lung cohort: low pORG vs high pORG
# for tu_plot in [('Lung','medians'),('Liver','medians'),('Lung','quartiles'),('Liver','quartiles')]:
#     fig,ax = plt.subplots(figsize=(3,3),dpi=200)
#     df_pri.loc[df_pri.Cohort==tu_plot[0],tu_plot[1]].value_counts(dropna=False).plot(kind='bar',ax=ax)
#     ax.set_title(f'{tu_plot[0]}: {s_porg} {tu_plot[1]}')

## TCR survival <a name="tcells"></a>

- The goal is to find a rational method to set a pORG threshold selecting High/Low cohorts for use in Kaplan-Meier and pathway analysis.
- Since we have survival data and it’s probably the best clinical endpoint we can use, we will start by selecting a survival threshold that seems relevant (545 days).
- Based on the survival threshold, we will divide the patients into short- and long-term survivor cohorts.
- We can then generate ROC curves testing the ability of our pORG score to predict these cohorts.
- Using Youden’s index (sensitivity + specificity − 1), we can select the pORG threshold that maximizes the index for this prediction.
- Finally, we can use this pORG threshold to generate Kaplan-Meier curves and test the significance of these using log-rank p-value.
- We will try this with a few reasonable selections for our original survival threshold.


[contents](#contents)

In [None]:
# ls_foci =[#'jaccard_Met',#'public_Met','morisita_Met','
#           #'jaccard_Blood','public_Blood',
#     'morisita_Blood',
#         # 'public_Primary',morisita_Primary''jaccard_Primary',
#      #'public_Tumor',
#     'morisita_Tumor',#'jaccard_Tumor',
#     'morisita_Tumor_blood','public_Tumor_blood','jaccard_Tumor_blood',
#     'morisita_Blood_blood','public_Blood_blood','jaccard_Blood_blood',
# ]

In [None]:
##CPH
# Single-variable Cox proportional-hazards fits (util.cph_plot) for each column in
# `ls_foci`, run separately on all samples, primaries only and mets only. The
# first five summary columns are collected into results/results_single_CPH_TCR_<censor>.csv.
# NOTE(review): `se_non_para` comes from the violin-plot cell; run it first.
df_merge = pd.read_csv('annotation/20231215_Patient_Metadata_TCR_Metrics.csv',index_col=0)
# patients with at least one TCR clonality measurement (blood or tumor)
b_tcr_pts = (df_merge.Clonality_Blood.notna() | df_merge.Clonality_Tumor.notna())
print(b_tcr_pts.sum())
df_km_samples = df_merge.loc[(df_merge.Alive_30_days_post_surgery) & b_tcr_pts]
df_result = pd.DataFrame()
# covariates that are blanked out for 'No_Resection' patients (see loop below)
se_recur = {'Other_Recurrence', 'No_Recurrence','Lung_Cohort','Liver_Cohort'} 
ls_cats = ['LV_Invasion','LN_Pos'] #categorical,'Neoadjuvant'
# Covariates to test. Every entry is currently commented out, so the per-covariate
# loop body does not run until entries are re-enabled.
ls_foci = [#'pORG_0.2_Primary','pORG_0.2_Met','pORG_0.2_All',
           #'pSUB1e-04_Primary','PurIST_Primary',
   # 'No_Recurrence','Lung_Cohort','Liver_Cohort','Cohort',#'Other_Recurrence' not in index,
# "Simpson's Evenness blood",
#  'Shannon_Entropy_Tumor','Templates_per_ng','Shannon_Entropy_Blood',
#  'Productive_Rearrangements','Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood',
#  'Clonality_Tumor','Clonality_Blood',"Simpson's Evenness tumor",
#  'Fraction Tumor Distinct TCRs','Percent Tumor Distinct Clones','Productive_Rearrangements_Blood'
# 'Fraction Shared Clones 0to2',
#  'Fraction Shared Clones 2to5',
#  'Fraction Shared Clones 5to10',
#  'Fraction Shared Clones >=10',
#  'Fraction Tumor Distinct Clones 0to2',
#  'Fraction Tumor Distinct Clones 2to5',
#  'Fraction Tumor Distinct Clones 5to10',
#  'Fraction Tumor Distinct Clones >=10',
#         'Hill_1', 'Hill_2', 'Hill_3',
#        'Hill_4', 'Hill_5', 'Hill_6',
#     'propshared_proportion_shared_clones',
#       'propshared_proportion_shared_templates',
#     'propshared_proportion_shared_clones_blood',
#        'propshared_proportion_shared_templates_blood',
#     'Fraction Shared Clones 10',
#       'Fraction Shared Clones 5',
#       'Fraction Shared Clones 2',
#            'Hill_1_blood','Hill_2_blood', 'Hill_3_blood',
#         'Hill_4_blood', 'Hill_5_blood','Hill_6_blood',
#     'd50_Clones', 'd50_Clones_blood',
#            'chao_Estimator','chao_Estimator_blood',
#            'true_Value','true_Value_blood',
#            'ginisimp_Value','ginisimp_Value_blood',
         ]

# (duration column, event column) pairs; `idx` selects which one is analysed
tu_time_censor = (('Days from Diagnosis to FU','Survival'),
    ('Days from Resection to Recurrence', 'Recurrence'),
               )
idx = 0
s_time = tu_time_censor[idx][0]
s_censor = tu_time_censor[idx][1]
df_result_all = pd.DataFrame()
#for (s_time, s_censor) in tu_time_censor:
for s_title_str in ['All','Primary','Met',]:
    df_result = pd.DataFrame(columns=['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p','n'])
    for s_col in ls_foci:
        ls_surv_col =[s_col,s_time,s_censor,'Public_Patient_ID']
        try:
            # Blood metrics (name contains 'lood') are subset on Blood_Type, tumor
            # metrics on Tumor_Type.
            # NOTE(review): only the 'All' branch drops rows with missing values here.
            if s_title_str == 'Primary':
                if 'lood' in s_col:
                    print(f'Primaries only blood. n={sum(df_km_samples.Blood_Type=="Primary")}')
                    df = df_km_samples.loc[(df_km_samples.Blood_Type=='Primary'),ls_surv_col].copy()
                else:
                    print(f'Primaries only tumor. n={sum(df_km_samples.Tumor_Type=="Primary")}')
                    df = df_km_samples.loc[(df_km_samples.Tumor_Type=='Primary'),ls_surv_col].copy()
            elif s_title_str == 'Met':
                if 'lood' in s_col:
                    print(f'Met only blood. n={sum(df_km_samples.Blood_Type=="Met")}')
                    df = df_km_samples.loc[(df_km_samples.Blood_Type=='Met'),ls_surv_col].copy()
                else:
                    print(f'Met only tumor. n={sum(df_km_samples.Tumor_Type=="Met")}')
                    df = df_km_samples.loc[(df_km_samples.Tumor_Type=='Met'),ls_surv_col].copy()
            else:
                df = df_km_samples.loc[:,ls_surv_col].dropna().copy()
            df.set_index('Public_Patient_ID',inplace=True)
            # metrics flagged as non-parametric are log-transformed before fitting
            # NOTE(review): unlike the violin-plot cell there is no arcsinh fallback
            # for values <= 0 here.
            if se_non_para.isin([s_col]).any():
                print(f'log {s_col}')
                df[s_col] = np.log(df.loc[:,s_col])
            else:
                print('')
            # non-float covariates are dummy-encoded; the first dummy column
            # replaces the original column and `s_col` is re-pointed to it
            if df.loc[:,s_col].dtype != 'float64':
                df_dummy = pd.get_dummies(df.loc[:,[s_col]],drop_first=True)
                df.drop(s_col,axis=1,inplace=True)
                s_col = df_dummy.columns[0]
                df[s_col] = df_dummy
            # recurrence-site covariates are set to missing for non-resected patients
            if s_col in se_recur:
                df.loc[df_km_samples.set_index('Public_Patient_ID').Recurrence_Sites_4=='No_Resection',s_col] = np.nan
            fig2, cph = util.cph_plot(df,s_col,s_time,s_censor,figsize=(4,1.5))
            plt.tight_layout()
            #plt.close(fig2)
            fig2.savefig(f'figures/CPH_single_TCR_{s_col}_{s_title_str}.png')
            df_result = pd.concat([df_result, cph.summary])
            df_result.loc[s_col,'n'] = cph._n_examples
            # multi (disabled): multivariable model adjusting for Grade, Stage and ls_cats
            # df_multi = df_km_samples.loc[:,['Grade','Stage','Public_Patient_ID']].set_index('Public_Patient_ID')
            # df = df.merge(df_multi,left_index=True,right_index=True)
            # df_dummy = pd.get_dummies(df_km_samples.loc[:,ls_cats+['Public_Patient_ID']].dropna().set_index('Public_Patient_ID'),drop_first=True)
            # df = df.merge(df_dummy,left_index=True,right_index=True)
            # df = df.dropna()
            # fig, cph = util.cph_plot(df,s_col,s_time,s_censor) #figsize
            # #plt.close(fig)
        except Exception as err:
            # best-effort: skip covariates that cannot be fitted, but report why
            print(f'error {s_col}: {err!r}')
    # keep the five summary columns declared when df_result was created
    df_result = df_result.iloc[:,0:5]
    df_result['Samples_Included'] = s_title_str
    df_result_all = pd.concat([df_result_all,df_result])
    #break
df_result_all.to_csv(f'results/results_single_CPH_TCR_{s_censor}.csv')

# youden + kaplan meier in TCR patients

In [None]:
#define high and low survivors
%matplotlib inline
importlib.reload(util)
ls_foci = [#'public_Blood','jaccard_Blood',
          # 'jaccard_Tumor','public_Tumor',#'Clonality_Blood',"Simpson's Evenness blood",
#             'Simpsons_Diversity_Blood','Shannon_Entropy_Blood','Productive_Rearrangements_Blood',
         #'Productive_Rearrangements',#'Simpsons_Diversity_Tumor',
        #     'Shannon_Entropy_Tumor',#'Templates_per_ng',
        # "Simpson's Evenness tumor",
    'Percent Tumor Distinct Clones',
#        # 'Fraction Tumor Distinct TCRs',
             ]
s_time = 'Days from Diagnosis to FU'
s_censor = 'Survival'
df_patient = pd.read_csv('annotation/20231215_Patient_Metadata_TCR_Metrics.csv',index_col=0)
print(len(df_patient))
ls_primary = ['All','Primary','Met',] #'Lung',
for b_primary in ls_primary:
    print(b_primary)
    for s_tcr in ls_foci:
        print(s_tcr)
        if b_primary == 'Primary':
            if s_tcr.find('lood') > -1:
                print(f'Primaries only blood. n={sum(df_patient.Blood_Type=="Primary")}')
                df_patient2 = df_patient[(df_patient.Blood_Type=='Primary')& (df_patient.loc[:,s_tcr].notna()) & (df_patient.Alive_30_days_post_surgery)].copy()
            else:
                print(f'Primaries only tumor. n={sum(df_patient.Tumor_Type=="Primary")}')
                df_patient2 = df_patient[(df_patient.Tumor_Type=='Primary') & (df_patient.loc[:,s_tcr].notna()) & (df_patient.Alive_30_days_post_surgery)].copy()
        elif b_primary == 'Lung':
            df_patient2 = df_patient[(df_patient.Cohort=='Lung')& (df_patient.Alive_30_days_post_surgery)].copy()
        elif b_primary == 'Liver':
            df_patient2 = df_patient[(df_patient.Cohort=='Liver')& (df_patient.Alive_30_days_post_surgery)].copy()
        elif b_primary == 'Met':
            if s_tcr.find('lood') > -1:
                print(f'Met only blood. n={sum(df_patient.Blood_Type=="Met")}')
                df_patient2 = df_patient[(df_patient.Blood_Type=='Met')& (df_patient.loc[:,s_tcr].notna()) & (df_patient.Alive_30_days_post_surgery)].copy()
            else:
                print(f'Met only tumor. n={sum(df_patient.Tumor_Type=="Met")}')
                df_patient2 = df_patient[(df_patient.Tumor_Type=='Met')& (df_patient.loc[:,s_tcr].notna()) & (df_patient.Alive_30_days_post_surgery)].copy()
            
        else:
            #df_patient2 = df_patient.copy()
            df_patient2 = df_patient[(df_patient.loc[:,s_tcr].notna()) & (df_patient.Alive_30_days_post_surgery)].copy()
            print(len(df_patient2))
            print('using all')
#             if s_tcr.find('lood') > -1:
#                 df_patient2 = df_patient[df_patient.loc[:,s_foci].notna()].copy()
        for SurvivalThreshold in [545]:# 365, #90, #, 730 #180,
            GoodIdx = (df_patient2.loc[:,s_time] > SurvivalThreshold) & (df_patient2.loc[:,s_tcr].notna()) & (df_patient2.Alive_30_days_post_surgery)
            BadIdx = (df_patient2.loc[:,s_time] <= SurvivalThreshold) & (df_patient2.loc[:,s_tcr].notna()) & (df_patient2.loc[:,s_censor] == 1) & (df_patient2.Alive_30_days_post_surgery)
            df_patient2.loc[GoodIdx,f'{s_tcr}_{SurvivalThreshold}_day_survival'] = 'long'
            df_patient2.loc[BadIdx,f'{s_tcr}_{SurvivalThreshold}_day_survival'] = 'short'
        #plot
        d_fig_good = util.youden_high_good(df_patient2,b_primary,s_time,s_censor,s_tcr)
        d_fig_bad = util.youden_low_good(df_patient2,b_primary,s_time,s_censor,s_tcr)
        #break
    break

In [None]:
# # km - old - using predetermined cutoffs
# alpha = 0.1
# savedir = 'figures'
# d_cut = {4:['low','med-low','med-high','high'],
#          3: ['low','highX','high'], #['low','med','high'],#
#          2:['low','high']
#         }
# for i_cut, labels in d_cut.items():
#     for s_title_str in  ['All','Primary','Met',]:
#         df_result = pd.DataFrame()
#         if s_title_str == 'All':
#             df_km_samples = df_merge.loc[df_merge.Alive_30_days_post_surgery]
#         else:
#             df_km_samples = df_merge.loc[(df_merge.loc[:,'Tumor_Type']==s_title_str) & (df_merge.Alive_30_days_post_surgery),[s_time,s_censor,'Public_Patient_ID']+ls_foci]
#         for s_col in ls_foci: #[2::]
#             try:
#                 df_km, pvalue = util.quartile_km(df_km_samples,s_col,s_title_str,savedir,alpha,i_cut,
#                                                 labels,s_time=s_time,s_censor='Survival')
#                 print(f'{s_col} {s_title_str} {i_cut}: {pvalue}')
#             except:
#                 print(f'no KM {s_col} {s_title_str} {i_cut}')

# save out results

#df_km_pts
#         df_result_model = cph.summary.loc[:,['exp(coef)','p']].reset_index()
#         df_result=pd.concat([df_result,df_result_model])
#         #plt.close(fig2)
    #print(s_title_str)
#break
#df_result.sort_values(by='p').set_index('covariate').to_csv(f'results_CPH_TCR_{s_title_str}.csv')


In [None]:
#Lung Cohort vs. Clinical Variables (chi squared)
# Chi-squared test of independence between each clinical variable in `ls_clin`
# and each grouping in `ls_cohort`, with a heatmap of the contingency counts.
# NOTE(review): uses `df_merge` and `b_tcr_pts` from the CPH cell.
ls_clin = [#'Grade',
           #'Age','Sex','Neoadjuvant',
           #'Stage',#'LV_Invasion','LN_Pos'
    'Tumor_Type'#'Blood_Type'
          ]
ls_cohort = [#'Recurrence_Sites_4',#'Lung_Cohort',
#              'No_Recurrence',
#              'Liver_Cohort',
    'Cohort',
             'pORG_All_quartiles'
             ]
# resected patients with a known recurrence site; only used by the commented-out
# row filters below
b_resected = (df_merge.Recurrence_Sites_4!='No_Resection') & (df_merge.Recurrence_Sites_4.notna())
for s_group in ['TCR','All', 'TCR Met']:
    if s_group == 'TCR':
        #df = df_merge.loc[b_tcr_pts & b_resected,ls_clin + ls_cohort]
        df = df_merge.loc[b_tcr_pts ,ls_clin + ls_cohort]
    elif s_group == 'All':
        #df = df_merge.loc[b_resected,ls_clin + ls_cohort]
        df = df_merge.loc[:,ls_clin + ls_cohort]
    elif s_group == 'TCR Met':
        df = df_merge.loc[df_merge.Tumor_Type=='Met',ls_clin + ls_cohort]
    df = df.astype('object')
    for s_clin in ls_clin:
        for s_cohort in ls_cohort:
            df2 = df.loc[:,[s_clin,s_cohort]].dropna()
            print(len(df2))
            crosstab= pd.crosstab(df2[s_clin], df2[s_cohort])
            try:
                statistic,pvalue, dof, expected_freq = stats.chi2_contingency(crosstab)
            except ValueError:
                # chi2_contingency rejects empty tables and tables with a zero
                # expected frequency; skip those combinations
                continue
            if pvalue < 0.99:
                fig, ax = plt.subplots()
                # the heatmap shows raw counts, so the colour bar is labelled as counts
                sns.heatmap(crosstab,ax=ax,annot=True,cbar_kws={'label':'Number of Pts.'}) #/crosstab.sum()
                ax.set_title(f'{s_group} p={pvalue:.3} n={len(df2)}')
    # stop after the first group ('TCR')
    break
#Lung Cohort TCR versus overall vs. Clinical Variables (chi squared)
# Within a cohort, compare the distribution of each clinical variable between
# the TCR patients and all patients of that cohort.
ls_clin = [#'Grade',
           'Age','Sex','Neoadjuvant',#
           'Stage','LV_Invasion','LN_Pos'
          ]
ls_cohort = ['Lung_Cohort','No_Recurrence',
             'Liver_Cohort',
             ]

for s_clin in ls_clin:
    for s_cohort in ls_cohort:
        df1 = df_merge.loc[(b_tcr_pts) & (df_merge[s_cohort]),s_clin].dropna()
        df2 = df_merge.loc[(df_merge[s_cohort]),s_clin].dropna()
        # Outer-join the two count vectors. Assigning them column by column aligns
        # 'All' onto the 'TCR' index and drops every category that has no TCR patient.
        crosstab = pd.concat([df1.value_counts(),df2.value_counts()],axis=1,keys=['TCR','All']).fillna(0)
        statistic,pvalue, dof, expected_freq = stats.chi2_contingency(crosstab)
        if pvalue < 1.1:
            fig, ax = plt.subplots()
            #sns.heatmap(crosstab - expected_freq,ax=ax,annot=True,cbar_kws={'label':'Obs - Exp'})
            sns.heatmap(crosstab/crosstab.sum(),ax=ax,annot=True,cbar_kws={'label':'Percent Pts.'})
            ax.set_title(f'{s_clin} {s_cohort} p={pvalue:.3}')
        # only the first cohort in ls_cohort is analysed for each clinical variable
        break
#categorical correlation
# One box plot (t-test enabled) per marker / grouping-variable pair, using the
# TCR patients only.
sns.set_palette('tab10')
#df_all,s_group,s_marker,s_type,s_cell,alpha=0.05,s_propo='in',b_ttest=False):
# TCR metrics that can be used as ls_marker instead:
#  'Shannon_Entropy_Tumor','Templates_per_ng','Shannon_Entropy_Blood',
#  'Productive_Rearrangements','Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood',
#  'Clonality_Tumor','Clonality_Blood',"Simpson's Evenness tumor","Simpson's Evenness blood",
#  'Fraction Tumor Distinct TCRs','Percent Tumor Distinct Clones','Productive_Rearrangements_Blood'
ls_marker = ['Tumor_Cellularity_by_DNA_Primary','Tumor_Cellularity_by_DNA_Met']
# other grouping variables: 'Grade','Stage','LV_Invasion','LN_Pos','Age','Sex'
ls_group = ['pORG_Primary_quartiles','pORG_Met_quartiles','Cohort','Neoadjuvant']
df_all = df_merge.loc[b_tcr_pts]
# markers vary slowest, groups fastest
ls_pair = [(s_m,s_g) for s_m in ls_marker for s_g in ls_group]
for s_marker, s_group in ls_pair:
    fig, pvalue,__,__ = util.categorical_correlation_boxplot(
        df_all,s_group,s_marker,
        s_type='',s_cell='',alpha=0.05,s_propo='',b_ttest=True)

In [None]:
# look as carl's youden cutoff for pORG
cutoff = 0.02491
df_patient.loc[df_patient.loc[:,'pORG_0.2_Primary'] > cutoff,'TCR_pORG_Primary'] = 'high' # 'pORG_0.2_Primary','pORG_0.2_Met','pORG_0.2_allPrimary','pORG_0.2_allMet',
df_patient.loc[df_patient.loc[:,'pORG_0.2_Primary'] <= cutoff,'TCR_pORG_Primary'] = 'low'

cutoff = 0.02491
df_patient.loc[df_patient.loc[:,'pORG_0.2_allPrimary'].fillna(df_patient.loc[:,'pORG_0.2_allMet']) > cutoff,'TCR_pORG_All'] = 'high' # 'pORG_0.2_Primary','pORG_0.2_Met','pORG_0.2_allPrimary','pORG_0.2_allMet',
df_patient.loc[df_patient.loc[:,'pORG_0.2_allPrimary'].fillna(df_patient.loc[:,'pORG_0.2_allMet']) <= cutoff,'TCR_pORG_All'] = 'low'
%matplotlib inline
b_tcr = (df_patient.Clonality_Blood.notna() | df_patient.Clonality_Tumor.notna())
#b_tcr = (df_patient.Shannon_Entropy_Tumor.notna())
#b_tcr = (df_patient.Tumor_Type.notna())
d_alive = df_patient.Alive_30_days_post_surgery
df_patient['pORG_0.2_All'] = df_patient.loc[:,'pORG_0.2_allPrimary'].fillna(df_patient.loc[:,'pORG_0.2_allMet'])
#tcr
#util.km_plot(df_patient[df_patient.Tumor_Type.notna()],'Cohort',s_time,s_censor)
util.km_plot(df_patient[b_tcr & d_alive],'TCR_pORG_Primary',s_time,s_censor)
util.km_plot(df_patient[b_tcr & d_alive],'TCR_pORG_All',s_time,s_censor)
util.km_plot(df_patient[b_tcr & d_alive],'Cohort',s_time,s_censor)

# all 
# util.km_plot(df_patient[d_alive],'TCR_pORG_Primary',s_time,s_censor)
# util.km_plot(df_patient[d_alive],'TCR_pORG_All',s_time,s_censor)
# util.km_plot(df_patient[d_alive],'Cohort',s_time,s_censor)
# #lung and liver
# util.km_plot(df_patient[(df_patient.Tumor_Type.notna()) & (df_patient.Cohort=='Lung')],'TCR_pORG',s_time,s_censor)
# util.km_plot(df_patient[(df_patient.Tumor_Type.notna()) & (df_patient.Cohort=='Liver')],'TCR_pORG',s_time,s_censor)

#cox PH
for s_col in ['pORG_0.2_allPrimary','pORG_0.2_Primary','pORG_0.2_Met',
              'pORG_0.2_allMet','pORG_0.2_All']:
    df = df_patient.loc[b_tcr & d_alive,[s_col,s_time,s_censor,'Public_Patient_ID']].dropna() #
    print(df.Public_Patient_ID.duplicated().any())
    print(len(df))
    try: 
        fig2, cph = util.cph_plot(df.drop('Public_Patient_ID',axis=1),s_col,s_time,s_censor,figsize=(4,1.5))
        plt.tight_layout()
    except:
        print('error')

In [None]:
#df_patient.loc[b_tcr,df_patient.columns.str.contains('pORG')].dropna(how='all').count()

In [None]:
# #pORG vs survival within liver and lung TCR
# df_patient['pORG_0.2_All'] = df_patient.loc[:,'pORG_0.2_allPrimary'].fillna(df_patient.loc[:,'pORG_0.2_allMet'])
# for s_col in ['pORG_0.2_All']:
#     for s_cohort in ['Liver','Lung']:
#         print(s_cohort)
#         try: #cox
#             df = df_patient.loc[(df_patient.Tumor_Type.notna()) & (df_patient.Cohort==s_cohort),[s_col,s_time,s_censor]].dropna()
#             fig2, cph = util.cph_plot(df,s_col,s_time,s_censor,figsize=(4,1.5))
#             plt.tight_layout()
#         except:
#             print('error')

In [None]:
# #pORG vs survival within liver and lung (all patients)
# df_patient['pORG_0.2_All'] = df_patient.loc[:,'pORG_0.2_allPrimary'].fillna(df_patient.loc[:,'pORG_0.2_allMet']).fillna(df_patient.loc[:,'pORG_0.2_All_T2'])
# df_patient['pSUB_1e-04_All'] = df_patient.loc[:,'pSUB1e-04_allPrimary'].fillna(df_patient.loc[:,'pSUB1e-04_allMet'])
# df_patient['PurIST_All'] = df_patient.loc[:,'PurIST_Primary'].fillna(df_patient.loc[:,'PurIST_Met']).fillna(df_patient.loc[:,'PurIST_Primary_T2'])

# for s_col in ['PurIST_All','pORG_0.2_All','pSUB_1e-04_All']:
#     for s_cohort in ['Liver','Lung','both']:
#         print(s_cohort)
#         try: #cox
#             df = df_patient.loc[(df_patient.Cohort==s_cohort),[s_col,s_time,s_censor]].dropna()
#             fig2, cph = util.cph_plot(df,s_col,s_time,s_censor,figsize=(4,1.5))
#             plt.tight_layout()
#         except:
#             df = df_patient.loc[:,[s_col,s_time,s_censor]].dropna()
#             fig2, cph = util.cph_plot(df,s_col,s_time,s_censor,figsize=(4,1.5))
#             plt.tight_layout()
#     break

In [None]:
# define long and short survivors within the lung and the liver cohort
# The split point is the median of s_time among cohort patients that have a
# Tumor_Type value; everybody outside that population stays NaN.
for s_site in ['Lung','Liver']:
    s_surv_col = f'TCR_{s_site}_Median_Surv'
    s_prefix = s_site.lower()
    b_site = (df_patient.Tumor_Type.notna()) & (df_patient.Cohort==s_site)
    short_long = df_patient.loc[b_site,s_time].median()
    # start from an all-NaN object column so re-running the cell gives the same result
    df_patient[s_surv_col] = pd.Series(np.nan,index=df_patient.index,dtype='object')
    df_patient.loc[b_site & (df_patient.loc[:,s_time] > short_long),s_surv_col] = f'{s_prefix}_long'
    # '<=' so that patients exactly at the median are labelled too (matches the
    # '>' / '<=' split used for the pORG cutoff); strict '<' left them unlabelled
    df_patient.loc[b_site & (df_patient.loc[:,s_time] <= short_long),s_surv_col] = f'{s_prefix}_short'



In [None]:
# Kaplan-Meier curves for all patients, grouped by tumor type and then by blood type
for s_km_group in ('Tumor_Type','Blood_Type'):
    util.km_plot(df_patient,s_km_group,s_time,s_censor)

In [None]:
#util.km_plot(df_patient,'TCR_Lung_Median_Surv',s_time,s_censor)

In [None]:
#util.km_plot(df_patient,'TCR_Liver_Median_Surv',s_time,s_censor)

In [None]:
#df_patient['TCR_Lung_Liver'] = df_patient.TCR_Lung_Median_Surv.fillna(df_patient.TCR_Liver_Median_Surv)
#util.km_plot(df_patient,'TCR_Lung_Liver',s_time,s_censor)

### TCR heatmap

In [None]:
 # Column names selected from df_pri for the clustered TCR heatmap in the next cell.
 # Rows missing any of these columns are dropped there before clustering.
 ls_foci_heatmap = ['Fraction Shared Clones 0to2','Fraction Shared Clones 2to5',
 'Fraction Shared Clones 5to10','Fraction Shared Clones >=10',
 'Fraction Tumor Distinct Clones 0to2','Fraction Tumor Distinct Clones 2to5',
 'Fraction Tumor Distinct Clones 5to10','Fraction Tumor Distinct Clones >=10',
 'Hill_1','Hill_2','Hill_3','Hill_4','Hill_5','Hill_6',
 'Hill_1_blood','Hill_2_blood','Hill_3_blood','Hill_4_blood','Hill_5_blood','Hill_6_blood',
 'd50_Clones','d50_Clones_blood',
 'chao_Estimator','chao_Estimator_blood',
 'true_Value','true_Value_blood',
 'ginisimp_Value','ginisimp_Value_blood',
 'Simpsons_Diversity_Tumor','Productive_Rearrangements',
 'Templates_per_ng','Shannon_Entropy_Tumor',
 'Clonality_Tumor',"Simpson's Evenness tumor",
 'Productive_Rearrangements_Blood',"Simpson's Evenness blood",'Shannon_Entropy_Blood',
 'Simpsons_Diversity_Blood','Clonality_Blood',
 'jaccard_Primary','jaccard_Blood','jaccard_Met',
 'public_Blood','public_Met','public_Primary',
 'morisita_Blood','morisita_Met','morisita_Primary',
 'public_Met_Bld_blood','public_Primary_Bld_blood',
 'jaccard_Primary_Bld_blood','jaccard_Met_Bld_blood',
 'morisita_Met_Bld_blood','morisita_Primary_Bld_blood']

In [None]:

#heatmap
# make sure to end on pORG all
# Clustered z-score heatmap of the TCR metrics in ls_foci_heatmap, with one
# row-colour strip per annotation in ls_annot and a combined legend.
if s_porg == 'pORG_0.2_All':
    # a later cell calls gcf() by its bare name, so keep it in the notebook namespace
    from matplotlib.pyplot import gcf
    # six equal-sized survival bins; values are aligned to df_pri by index
    df_pri['Survival interval'] = pd.qcut(df_merge.loc[:,s_time],6)
    df_pri.rename({'medians':f'{s_porg} (median)'},axis=1,inplace=True)
    ls_annot = ['Tumor_Type', 'Cohort',f'{s_porg} (median)' ,'Survival interval']
    # plt.get_cmap (used elsewhere in this notebook) replaces plt.cm.get_cmap,
    # which no longer exists in current Matplotlib releases
    cmap=plt.get_cmap('Blues', 6)
    # one palette per entry of ls_annot, in the same order
    ls_color=[mpl.cm.Set1.colors,('#D55E00', '#0072B2'),('#E69F00', '#56B4E9'),
             [cmap(item) for item in np.arange(0,1,.17)]]
    df_annot = pd.DataFrame()
    dd_color = {}
    for idx, s_annot in enumerate(ls_annot):
        color_palette = ls_color[idx]
        ls_level = sorted(df_pri.loc[:,s_annot].dropna().unique())
        d_color = dict(zip(ls_level,color_palette[0:len(ls_level)]))
        d_color.update({'NA':'lightgray'})
        network_colors = df_pri.loc[:,s_annot].astype('object').fillna('NA').map(d_color)
        df_annot[s_annot] = pd.DataFrame(network_colors)
        dd_color.update({s_annot:d_color})
    g = sns.clustermap(df_pri.loc[:,ls_foci_heatmap].dropna(),z_score=1,dendrogram_ratio=0.1,cmap='RdBu_r',vmin=-5,vmax=5,
          cbar_pos=(0.01, 0.94, 0.04, 0.08),figsize=(15, 10),row_colors=df_annot,cbar_kws={'label': 'z-score'},xticklabels=1,
                      method='complete')
    # zero-size bars only exist to create legend entries; a blank white entry
    # separates the annotations
    for s_annot, d_color in dd_color.items():
        g.ax_col_dendrogram.bar(0, 0, color='w',label=' ', linewidth=0)
        # the loop variable must not be called `color`: at notebook top level that
        # would overwrite the skimage `color` module imported in the first cell
        for label,s_color in d_color.items():
            g.ax_col_dendrogram.bar(0, 0, color=s_color,label=label, linewidth=0)
    l1 = g.ax_col_dendrogram.legend(loc="right", ncol=1,bbox_to_anchor=(0, 0.7),bbox_transform=gcf().transFigure)
    g.ax_heatmap.set_yticks([])
    g.ax_heatmap.set_title(s_porg)
    g.savefig(f'figures/heatmap_TCR_{s_porg}.png',dpi=300)
    #plt.close(g.fig)

In [None]:

def heatmap_row_colors(df_meta,d_corr,ls_annot = ['Type','Cohort','pORG_0.2_All_quartiles']):
    '''
    Build row-colour annotations for a clustermap of d_corr.

    df_meta: dataframe with a Sample column and one column per entry of ls_annot
    d_corr: dataframe whose index holds sample names found in df_meta.Sample;
        it is not modified
    ls_annot: annotation columns of df_meta to colour by; at most three, because
        three palettes are defined (more raises IndexError)

    Returns (df_annot, dd_color):
        df_annot: one column of colours per annotation, indexed like d_corr
        dd_color: {annotation: {level: colour}}, each with an extra 'NA' entry
            (lightgray) used for samples without a value
    '''
    # one palette per annotation, in the order of ls_annot
    ls_color=[mpl.cm.Set1.colors,('#D55E00', '#0072B2'),('#E69F00', '#56B4E9')]
    df_annot = pd.DataFrame(index=d_corr.index)
    dd_color = {}
    for idx, s_annot in enumerate(ls_annot):
        color_palette = ls_color[idx]
        # levels beyond the palette length get no colour (zip stops at the shorter input)
        ls_level = sorted(df_meta.loc[:,s_annot].dropna().unique())
        d_color = dict(zip(ls_level,color_palette))
        d_color.update({'NA':'lightgray'})
        d_sample = dict(zip(df_meta.Sample,df_meta.loc[:,s_annot]))
        # look the annotation up in a separate Series rather than writing it into
        # d_corr, so the caller's matrix does not gain annotation columns
        se_annot = pd.Series(d_corr.index.map(d_sample),index=d_corr.index,dtype='object')
        df_annot[s_annot] = se_annot.fillna('NA').map(d_color)
        dd_color.update({s_annot:d_color})
    return(df_annot,dd_color)

def row_colors_legend(g,dd_color):
    '''
    Register legend entries for the row colours on g.ax_col_dendrogram.

    g: object with an ax_col_dendrogram axis (e.g. the result of sns.clustermap)
    dd_color: {annotation: {label: colour}}

    Zero-size bars are drawn only for their legend labels; each annotation is
    preceded by a blank white entry. The legend itself is not created here.
    '''
    for d_level_color in dd_color.values():
        # blank separator first, then the annotation's own levels
        ls_entry = [(' ','w')] + list(d_level_color.items())
        for s_label, s_color in ls_entry:
            g.ax_col_dendrogram.bar(0, 0, color=s_color,label=s_label, linewidth=0)
    

In [None]:
#overlap heatmaps
# For each overlap metric, cluster the log-transformed sample-by-sample matrix
# separately for tumor and blood samples and blank out the diagonal.
s_dir =f'{codedir.split("Liver_Lung_PDAC")[0]}R'
df_meta = pd.read_csv(f'{s_dir}/raw_TCR_data_test/metadata.txt',sep='\t')
# Type = specimen site, except that blood samples take their Blood_Type value
df_meta['Type'] = df_meta.Site.copy()
df_meta['Type'] = df_meta.Type.replace('Blood',np.nan)
df_meta['Type'] = df_meta.Type.fillna(df_meta.Blood_Type)
ls_file=['results_TCR_public_overlap.csv','results_TCR_morisita_overlap.csv',
         'results_TCR_jaccard_overlap.csv']
for s_file in ls_file:
    df = pd.read_csv(f'{s_dir}/{s_file}',index_col=0)
    df.columns = [item.split('TCR_raw_data_')[1] for item in df.columns]
    df.index = [item.split('TCR_raw_data_')[1] for item in df.index]
    # sample names containing '-B' are treated as blood samples
    b_blood_row = df.index.str.contains('-B')
    b_blood_col = df.columns.str.contains('-B')
    # log(0) is -inf; turn it into NaN (filled with 0 below) for both matrices so
    # that clustering never sees a non-finite value. np.inf is the spelling that
    # exists in every NumPy version (np.Inf was removed in NumPy 2).
    df_blood = np.log(df.loc[b_blood_row,b_blood_col]).replace(-np.inf, np.nan)
    df_tum = np.log(df.loc[~b_blood_row,~b_blood_col]).replace(-np.inf, np.nan)
    d_dfs = {'Tumor':df_tum,'Blood':df_blood}
    for s_type, d_corr in d_dfs.items():
        # take the plotting matrix before heatmap_row_colors touches d_corr
        corr = d_corr.fillna(0)
        df_annot,dd_color = heatmap_row_colors(df_meta,d_corr)
        g = sns.clustermap(corr, cmap="coolwarm", figsize=(7, 6),row_colors=df_annot)
        row_colors_legend(g,dd_color)
        #mask = np.triu(np.ones_like(corr)) #triu
        # hide the diagonal cells of the drawn heatmap
        mask = np.diag(np.diag(np.ones_like(corr)))
        values = g.ax_heatmap.collections[0].get_array().reshape(corr.shape)
        new_values = np.ma.array(values, mask=mask)
        g.ax_heatmap.collections[0].set_array(new_values)
#         g.ax_col_dendrogram.set_visible(False)
        g.ax_heatmap.set_yticks([])
        g.ax_heatmap.set_xticks([])
        s_title = f"{s_type} {s_file.split('results_TCR_')[1].split('.csv')[0].replace('_',' ').title()}"   
        g.ax_heatmap.set_title(s_title,pad=30)
        # plt.gcf() instead of a bare gcf(), which is only defined if an earlier
        # conditional cell happened to import it
        l1 = g.ax_col_dendrogram.legend(loc="right", ncol=1,bbox_to_anchor=(0, 0.7),bbox_transform=plt.gcf().transFigure)
        #break
    #break

In [None]:
#high low pORG quartile versus gene expression
# Strip plot plus mean bars of selected genes, split by the pORG label of each
# patient, annotated with FDR-corrected t-test p-values.
# Import the function explicitly: a bare `import statsmodels` does not guarantee
# that the statsmodels.stats.multitest submodule is loaded.
from statsmodels.stats.multitest import multipletests
pal_porg = ('#56B4E9','#E69F00')
pal_liv = ('#0072B2','#D55E00')
sns.set_palette(pal_porg)
#if s_porg == 'pORG_0.2_Primary':
#if s_porg == 'pORG_0.2_Met':
#df_rna['Public_Patient_ID'] = [item.split('-T')[0] for item in df_rna.index]
#liver vs lung (run t cell section to load t cell data!)
ls_marker = ['CD3E','CD4', 'CD8A', 'MS4A1','CD68','OLR1','MRC1','MX1','STAT1',]
for s_porg in ['pORG_0.2_Primary','pORG_0.2_Met']:
    # long format: one row per (gene, patient)
    df_plot = df_rna.loc[df_rna.index.isin(df_merge.Public_Patient_ID),ls_marker].unstack().reset_index()
    df_plot['Public_Patient_ID'] = df_plot.level_1 #[item[0:-2] for item in df_plot.level_1]
    df_plot['pORG'] = df_plot.Public_Patient_ID.map(dict(zip(df_merge.Public_Patient_ID,df_merge.loc[:,s_porg])))
    df_plot.rename({0:'Expression','level_0':'Gene'},axis=1,inplace=True)
    fig,ax = plt.subplots(dpi=200,figsize=(5,3))
    sns.stripplot(data=df_plot,x='Gene',y='Expression',hue='pORG',dodge=True,ax=ax,s=2)
    # boxplot with everything hidden except the mean line
    sns.boxplot(data=df_plot,x='Gene',y='Expression',hue='pORG',ax=ax,showmeans=True,medianprops={'visible': False},
                           whiskerprops={'visible': False},meanline=True,showcaps=False,
                           meanprops={'color': 'k', 'ls': '-', 'lw': 2},showfliers=False,showbox=False)
    # both plots add legend entries; keep the first pair only
    h, l = ax.get_legend_handles_labels()
    ax.legend(h[0:2],l[0:2],loc='lower left')
    pairs = [((item,'low'),(item,'high')) for item in ls_marker]
    annot = Annotator(ax, pairs, data=df_plot,x='Gene',y='Expression',hue='pORG',
                      order=ls_marker,hue_order=('low','high'))
    annot.configure(test='t-test_ind', text_format='simple',fontsize=7,comparisons_correction='fdr_bh') #
    annot.apply_test()
    # collect the per-gene p-values in ls_marker order, correct them once and
    # write the corrected values onto the plot
    d_pval = dict([(res.data.group1[0],res.data.pvalue) for res in annot.annotations])
    pvalues = [d_pval[item] for item in ls_marker]
    reject, corrected, __, __ = multipletests(pvalues,method='fdr_bh')
    formatted_pvalues = [f'p={pvalue:.2}' for pvalue in list(corrected)]
    annot.set_custom_annotations(formatted_pvalues)
    annot.annotate()
    ax.set_title(f'{s_porg} Quartiles') 
    plt.tight_layout()
    fig.savefig(f'figures/gene_expression_binary_{s_porg}_Quartiles.png')
    #plt.close(fig)

In [None]:
# Lists the df_merge columns whose name contains 'umor'. This is not the last
# expression of the cell, so the notebook does not display the result.
sorted(df_merge.columns[df_merge.columns.str.contains('umor')])
# Columns plotted per metastatic site in the "group by met" loop of the next cell.
ls_met_site = ['Templates_per_ng','Productive_Rearrangements', 'Clonality_Tumor',
              'Shannon_Entropy_Tumor', "Simpson's Evenness tumor",'Simpsons_Diversity_Tumor',
               'Percent Tumor Distinct Clones',
              ]


In [None]:
##TCR met sites
#color by met
#add pORG quartiles, plot violins 
#update the lung error
# NOTE(review): s_stats and ls_foci are not defined in this cell; it relies on
# cells that set them having been run first.
# Manual correction of one patient's met site.
df_merge.loc[df_merge.Public_Patient_ID=='ST-00015839','TCR_Met_Site'] = 'Lung'
%matplotlib inline
sns.set_palette('tab10')
importlib.reload(util)

s_porg = 'pORG_0.2_Met'
# Part 1: one violin plot per comparison, points coloured by met site.
for s_comp in ['Cohort','quartiles']:#, 'pORG_78_Primary',
    df_pri = df_merge.loc[df_merge.Tumor_Type=='Met'].copy()
    df_pri = util.add_quartiles(df_pri,s_porg)#.drop(219) #p. rearrangements outlier 
    for s_foci in ls_foci:
        # the loop variable is overridden and the loop exits after one pass, so
        # only this one metric is plotted (and nothing if ls_foci is empty)
        s_foci =  "Productive_Rearrangements"#"Simpson's Evenness tumor"#'Percent Tumor Distinct Clones'
        print(s_foci)
        fig = util.plot_violins3(df_pri,s_stats,s_foci,s_comp,s_porg,hue='TCR_Met_Site',figsize=(5,3))#True#False
        fig.savefig(f'figures/violinplot_met_colors_{s_foci}_{s_porg}_{s_comp}.png')
        break
    #break

#group by met
#'''
# Part 2: violins grouped by met site; the sites listed here are pooled as 'Other'.
# df_pri is the frame left over from the last pass of the loop above.
d_replace = {'Gastric serosal':'Other', 'Lower quadrant mass':'Other','Mesocolon':'Other',
             'Tissue near left hepatic artery':'Other',
            'Pelvic nodule':'Other', 'Gallbladder':'Other', 'Omentum':'Other',
             'Retroperitoneum':'Other'}
df_pri['TCR_Met_Site_S'] = df_pri.TCR_Met_Site.replace(d_replace)
for s_foci in ls_met_site:#ls_foci:
    # these figures are not saved to disk
    fig = util.plot_violins3(df_pri,s_stats,s_foci,s_comp='TCR_Met_Site_S',s_porg=s_porg,hue='TCR_Met_Site',figsize=(5.5,3))#True#False
    #break
# writes the cohort / met-site table for patients with a Cohort value to test.csv
df_pri.loc[~df_pri.Cohort.isna(),['Public_Patient_ID','Cohort','TCR_Met_Site']].sort_values(by=['Cohort','TCR_Met_Site']).to_csv('test.csv')
#'''

## Section 7 <a name="violin2"></a>


GSEA plots. I think we should say that we show any pathway with FDR q < 0.15 and NES > 1.5 in all four datasets; a pathway that doesn't meet these cutoffs is not shown as a bar in the graphs: liver/lung, high/low pORG, high/low PurIST, and high/low pSUB.

[contents](#contents)

## figure 2  



In [None]:
# Group label -> colour; passed to util.plot_violins2 for the figure 2 violin plots.
# NOTE(review): 'low pSUB' has the same colour as 'Liver' here, whereas the other
# d_colorblind dictionaries in this notebook use '#009E73' - confirm this is intended.
d_colorblind_g ={'Liver': '#d55e00',
 'Lung': '#0072b2',
 'high': '#e69f00',
 'low': '#56b4e9',
 'basal-like': '#383838',# 'dimgray',# '#000000',#
 'classical': '#cc79a7',
 'high pSUB': '#f0e442',
 'low pSUB': '#d55e00'}

In [None]:
#lper patient
%matplotlib inline
#Can you redo this removing the non lung and non liver mets in their respective cohorts:
importlib.reload(util)
import warnings
s_out = '20230828_Patient_Metadata.csv'
s_out = '20230921_Patient_Metadata.csv'
s_out = '20231206_Patient_Metadata.csv'
df_patient= pd.read_csv(f'annotation/{s_out}',index_col=0)
s_stats = 'mean'
d_order =  {'Cohort':['Liver','Lung'],
            'PurIST Subtype':['basal-like','classical']}
lls_score = [['pORG_Primary','pSUB_Primary','PurIST_Primary'],
           #['pORG_0.2_All', 'pSUB1e-04_All','PurIST_All'],
          ['pORG_Met','pSUB_Met','PurIST_Met'],
          ]
for ls_score in lls_score:
    if ls_score[0].find('All') > -1:
        ls_population = ['match','pORG_Met','pORG_Primary']
    elif ls_score[0].find('Met') > -1:
        ls_population = ['pORG_Met']
    else:
        ls_population = ['pORG_Primary']
    for s_population in ls_population:
        print(f'population {s_population}')
        if s_population.find('_Met')>-1:
            lb_met = [True,False]
        else:
            lb_met = [False]
        for b_met in lb_met:
            for s_score in ls_score:
                print(f'score {s_score}')
                if s_population == 'match':
                    s_population = s_score
                    print(f'population {s_population}')
                if s_score.find('_All')> -1: #need to calculate and add "all" scores
                    print('Primaries and Mets')
                    df_pri = df_patient.copy()
                    df_pri['PurIST_allPrimary'] = df_pri.PurIST_Primary.copy()
                    df_pri['PurIST_allMet'] = df_pri.PurIST_Met.copy()
                    for s_score_all in ['PurIST','pORG','pSUB']:
                        df_whole = pd.DataFrame(columns=df_pri.columns)
                        for s_all in [f'{s_score_all}_allPrimary',f'{s_score_all}_allMet']:
                            #select the mets or primaries (this will prioritze primary over met for matched)
                            df_half = df_pri[df_pri.loc[:,s_all.replace(s_score_all,'pORG')].notna()].copy()
                            #only fill in the non duplicate 
                            df_half = df_half[~df_half.Public_Patient_ID.isin(df_whole.Public_Patient_ID)].copy()
                            df_half.rename({s_all:f'{s_score_all}_All'},axis=1,inplace=True)
                            #df_whole.index = [str(item) + 'a' for item in df_whole.index]
                            with warnings.catch_warnings():
                                warnings.simplefilter(action='ignore', category=FutureWarning)
                                df_whole = pd.concat([df_whole,df_half],axis=0,ignore_index=True)
                        #print(len(df_whole))
                        df_pri = df_pri.merge(df_whole.loc[:,['Public_Patient_ID',f'{s_score_all}_All']],on='Public_Patient_ID',how='left',suffixes=('','_'))
                else:
                    df_pri = df_patient.copy()
                if s_population.find('_Met')> -1:
                    print('mets')
                    if b_met: #select mets specific to cohort
                        df_pri = df_pri[(df_pri.loc[:,s_population].notna()) & (df_patient.Specimen_Site_Met.isin(['Liver','Lung']))].copy()
                    else: #select mets
                        df_pri = df_pri[(df_pri.loc[:,s_population].notna())].copy()
                elif s_population.find('_Primary')> -1: #select primaries
                    print('primaries')
                    df_pri = df_pri[df_pri.loc[:,s_population].notna()].copy()
                df_pri['PurIST_All'] = df_pri.PurIST_Primary.fillna(df_pri.PurIST_Met)
                df_pri.loc[df_pri.loc[:,f'PurIST_{s_score.split("_")[-1]}'] > 0.5,'PurIST Subtype'] = 'basal-like'
                df_pri.loc[df_pri.loc[:,f'PurIST_{s_score.split("_")[-1]}'] <= 0.5,'PurIST Subtype'] = 'classical'
                df_pri.loc[df_pri.Public_Patient_ID.duplicated(),'PurIST Subtype'] = np.nan
                df_both,d_pval,order = util.violin_stats2(df_pri,d_order,s_score,s_stats)
                fig,pvalues,corrected = util.plot_violins2(df_both,d_pval,d_order,s_stats,s_score,order,
                                                           d_colorblind_g,s_population,b_correct=True,figsize=(3.5,2.5))
                plt.ylim(-0.7, 1.15)
                if s_score.find('PurIST') > -1:
                    plt.ylim(-0.1, 1.3)
                fig.savefig(f'figures/violinplot_fig2_{s_score}_{s_population.split("_")[-1]}_mets_only{b_met}.png')
            #break
        #break
    #break

In [None]:
# How many patients have each score: primaries first, then mets
for s_count_col in ['PurIST_Primary','pORG_Primary','pSUB_Primary']:
    print(df_patient.loc[:,s_count_col].notna().sum())

for s_count_col in ['PurIST_Met','pSUB_Met','pORG_Met']: #why only 71 mets have purist?
    print(df_patient.loc[:,s_count_col].notna().sum())
print(216 + 72 - 10)

# patients that have a score in both the primary and the met
for s_prefix in ['PurIST','pSUB','pORG']:
    ls_both = [f'{s_prefix}_Primary',f'{s_prefix}_Met']
    print(df_patient.loc[:,ls_both].notna().all(axis=1).sum())

# patients with a Cohort label: overall, then within each scored subset
print(df_patient.loc[:,'Cohort'].notna().sum())
ls_scored = [df_patient.loc[:,'pORG_Primary'].notna(),
             df_patient.loc[:,'pORG_Met'].notna(),
             df_patient.loc[:,'PurIST_Met'].notna(),
             df_patient.loc[:,['pORG_Primary','pORG_Met']].notna().all(axis=1)]
for b_scored in ls_scored:
    print(df_patient.loc[b_scored,'Cohort'].notna().sum())
75 + 38 - 7

## per specimen

In [None]:
#per specimen
#Can you redo this removing the non lung and non liver mets in their respective cohorts:
# Same violin plots as the per-patient version, but with one row per specimen:
# a patient with both a primary and a met score contributes two rows.
importlib.reload(util)
import warnings
s_out = '20231206_Patient_Metadata.csv'
df_patient= pd.read_csv(f'annotation/{s_out}',index_col=0)
df_patient['PurIST_allPrimary'] = df_patient.PurIST_Primary.copy()
df_patient['PurIST_allMet'] = df_patient.PurIST_Met.copy()
s_stats = 'mean'
d_order =  {'Cohort':['Liver','Lung'],
            'PurIST Subtype':['basal-like','classical']}
#primary, met
d_pm ={'Met': 'black','Primary': 'gray'}
d_order_pm = {'Specimen Type':['Primary','Met']}
lls_score = [['pORG_All', 'pSUB_All','PurIST_All']]
for ls_score in lls_score:
    ls_population = ['match']
    for s_population in ls_population:
        print(f'population {s_population}')
        if s_population.find('_Met')>-1:
            lb_met = [True,False]
        else:
            lb_met = [False]
        for b_met in lb_met:
            for s_score in ls_score:
                print(f'score {s_score}')
                # 'match' = the population is the score itself. Resolve it into a
                # separate name: overwriting s_population made pSUB_All and
                # PurIST_All be plotted with pORG_All as their population.
                s_pop = s_score if s_population == 'match' else s_population
                print(f'population {s_pop}')
                #met specimens: patients with both a pORG and a pSUB met score
                met_index = df_patient.loc[:,['pORG_allMet','pSUB_allMet']].dropna(how='any').index #72
                df_met = df_patient.loc[met_index,['Public_Patient_ID','pORG_allMet','pSUB_allMet','PurIST_allMet']] #72
                df_met['Public_Specimen_ID'] = [item + '-Met' for item in df_met.Public_Patient_ID]
                print(len(df_met)) 
                df_met['Specimen Type'] = 'Met'
                #primary specimens: patients with both a pORG and a pSUB primary score
                pri_index = df_patient.loc[:,['pORG_allPrimary','pSUB_allPrimary']].dropna(how='any').index
                df_primary = df_patient.loc[pri_index,['Public_Patient_ID','pORG_allPrimary','pSUB_allPrimary','PurIST_allPrimary']] #216
                df_primary['Public_Specimen_ID'] = [item + '-Primary' for item in df_primary.Public_Patient_ID]
                print(len(df_primary))   
                df_primary['Specimen Type'] = 'Primary'
                df_pri = pd.concat([df_met,df_primary],ignore_index=True)
                # expected False: every specimen id should occur once
                print(df_pri.Public_Specimen_ID.duplicated().any())
                print(f'specimen data {len(df_pri)}')
                # each row carries either the primary or the met columns, so fillna
                # merges them into a single *_All column
                df_pri['pORG_All'] = df_pri.loc[:,'pORG_allPrimary'].fillna(df_pri.loc[:,'pORG_allMet'])
                df_pri['pSUB_All'] = df_pri.loc[:,'pSUB_allPrimary'].fillna(df_pri.loc[:,'pSUB_allMet'])
                df_pri['PurIST_All'] = df_pri.PurIST_allPrimary.fillna(df_pri.PurIST_allMet)
                s_purist = f'PurIST_{s_score.split("_")[-1]}'
                df_pri.loc[df_pri.loc[:,s_purist] > 0.5,'PurIST Subtype'] = 'basal-like'
                df_pri.loc[df_pri.loc[:,s_purist] <= 0.5,'PurIST Subtype'] = 'classical'
                df_pri['Cohort'] = df_pri.Public_Patient_ID.map(dict(zip(df_patient.Public_Patient_ID,df_patient.Cohort)))
                # cohort and subtype comparison (saved)
                df_both,d_pval,order = util.violin_stats2(df_pri,d_order,s_score,s_stats)
                fig,pvalues,corrected = util.plot_violins2(df_both,d_pval,d_order,s_stats,s_score,order,
                                                           d_colorblind_g,s_pop,b_correct=True,figsize=(3.5,2.5))
                plt.ylim(-0.7, 1.13)
                if s_score.find('PurIST') > -1:
                    plt.ylim(-0.1, 1.2)
                fig.savefig(f'figures/violinplot_fig2_spec_{s_score}_{s_pop.split("_")[-1]}_mets_only{b_met}.png')
                # primary vs met comparison (displayed only, not saved)
                df_both,d_pval,order = util.violin_stats2(df_pri,d_order_pm,s_score,s_stats)
                fig,pvalues,corrected = util.plot_violins2(df_both,d_pval,d_order_pm,s_stats,s_score,order,
                                            d_pm,s_pop,b_correct=True,figsize=(3,3))
        break

In [None]:
# frequency of each combination of values across the columns whose name contains 'DDR'
b_ddr_col = df_patient.columns.str.contains('DDR')
df_patient.loc[:,b_ddr_col].value_counts()

## Figure 3: GSEA barplots <a name="bars2"></a> 

stacked, hatched

[contents](#contents)


In [None]:
#os.listdir('data')


In [None]:
#load GSEA data
# Every workbook holds a 'GSEA_UP' and a 'GSEA_DN' sheet; both are stored in d_en
# under '<file name without _h.xlsx>_UP' and '..._DN'.
ls_columns = ['NAME', 'SIZE', 'ES', 'NES', 'NOM.p.val', 'FDR.q.val', 'FWER.p.val' ]
ls_gsea = [
    #primaries
    'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score_h.xlsx',
    'basal-like_vs_classical_h.xlsx',
    'LiverCohort_vs_LungNotLiverCohort_h.xlsx',
    'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4_h.xlsx',
    'Top4th_pSUB.1eNeg5_vs_Bottom4th_pSUB.1eNeg5_h.xlsx',
    'Top4th_pORG.20_vs_Bottom4th_pORG.20_h.xlsx',
    #mets
    'Top4thMet_pORG_Up_55_vs_Bottom4thMet_JM_pORG_Up_55_h.xlsx',
    'Top4thMet_pSUB_Up_51_vs_Bottom4thMet_JM_pSUB_Up_51_h.xlsx',
    'LiverCohortMet_vs_LungNotLiverCohortMet_h.xlsx',
    'Top4thMet_PurIST_vs_Bottom4thMet_PurIST_h.xlsx',
    #tcr mets
    'LiverTcr_pORG_Up_55_JM_vs_LungTcr_JM_pORG_Up_55_JM_h.xlsx',
    'Top4thTcr_pORG_Up_55_JM_vs_Bot4thTcr_JM_pORG_Up_55_JM_h.xlsx',
]
d_en = {}
for s_gsea in ls_gsea:
    s_comparison = s_gsea.split('_h.xlsx')[0]
    # sheet_name=None loads every sheet into a dict keyed by sheet name
    d_load =  pd.read_excel(f'data/{s_gsea}',sheet_name=None)#,index_col=0
    df_up, df_down = [d_load[s_sheet].loc[:,ls_columns] for s_sheet in ('GSEA_UP','GSEA_DN')]
    d_en[f"{s_comparison}_UP"] = df_up
    d_en[f"{s_comparison}_DN"] = df_down
    #break

# group label -> colour for the GSEA figures
d_colorblind = {
    'Liver': '#d55e00',
    'Lung': '#0072b2',
    'high pORG': '#e69f00',
    'low pORG': '#56b4e9',
    'basal-like': '#000000',
    'high PurIST': '#000000',
    'classical': '#cc79a7',
    'low PurIST': '#cc79a7',
    'high pSUB': '#f0e442',
    'low pSUB': '#009E73',
}

In [None]:
## USED:  bar plots of GSEA, hashmarks  
%matplotlib inline
importlib.reload(util)
hatch='||'#'///'#
my_cmap = plt.get_cmap("viridis_r")
my_cmap = plt.get_cmap("bwr")
my_cmap = plt.get_cmap("RdYlBu_r")
height=0.35
#rescale = lambda y: (y - np.min(y)) / (np.max(y) - np.min(y))
#rescale = lambda y: (y - np.min(y)) / (np.max(y) - np.min(y))
vmax=0.4
norm = mpl.colors.Normalize(vmin=0,vmax=vmax)
mappable = mpl.cm.ScalarMappable(norm=norm, cmap=my_cmap)
d_labels = {'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score':'Top vs Bottom\nQuartile by PurIST',
            'LiverCohort_vs_LungNotLiverCohort':'Liver Cohort vs.\nLung Cohort',
           'Top4th_pORG.20_vs_Bottom4th_pORG.20':'Top vs Bottom\nQuartile by pORG',# 0.2
           'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4':'Top vs Bottom\nQuartile by pSUB',# 1e-4
           #mets
            'Top4thMet_pORG_Up_55_vs_Bottom4thMet_JM_pORG_Up_55':'Top vs Bottom\nQuartile by pORG Mets',
           'Top4thMet_pSUB_Up_51_vs_Bottom4thMet_JM_pSUB_Up_51':'Top vs Bottom\nQuartile by pSUB Mets',
            'LiverCohortMet_vs_LungNotLiverCohortMet':'Liver Cohort vs.\nLung Cohort Mets',
            'Top4thMet_PurIST_vs_Bottom4thMet_PurIST':'Top vs Bottom\nQuartile by PurIST Mets'
           } 
#subtype
sorter_combined = ['HALLMARK_XENOBIOTIC_METABOLISM','HALLMARK_PEROXISOME',
  'HALLMARK_FATTY_ACID_METABOLISM','HALLMARK_BILE_ACID_METABOLISM',
  'HALLMARK_PANCREAS_BETA_CELLS','HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
  'HALLMARK_APICAL_JUNCTION','HALLMARK_HYPOXIA','HALLMARK_GLYCOLYSIS']
ls_plot_items = ['Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4',
    'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score']
#generate dataframe with comparisons
df_plot_bar,es_marker =util.compare_dataframe(ls_plot_items,d_en,sorter_combined,ls_columns)
# add color
#df_plot_bar['FDR_color'] = rescale(df_plot_bar.loc[:,'FDR.q.val'])
df_plot_bar['FDR_color'] = df_plot_bar['FDR_color'] = np.clip(df_plot_bar.loc[:,'FDR.q.val']/vmax,a_min=0,a_max=1)
#plot figure
fig, ax = util.plot_double_bars_heat(ls_plot_items,df_plot_bar,d_labels,sorter_combined,mappable,
                                hatch=hatch,height=height,figsize=(4,4),anchor=(.5,-.13))
ax.set_title('Subtype')
ax.set_yticklabels([item.replace('HALLMARK_','').replace('_',' ').title() for item in sorter_combined])
################################################################################
# Organotropism panel: pORG top/bottom quartile next to Liver vs. Lung cohort.
# Relies on d_en, ls_columns, d_labels, vmax, mappable, hatch and height
# having been defined before this point.
ls_plot_items = [
    'Top4th_pORG.20_vs_Bottom4th_pORG.20',
    'LiverCohort_vs_LungNotLiverCohort',
]

# Hallmark gene sets to show; the y tick labels below follow this order.
# ('HALLMARK_KRAS_SIGNALING_UP' was deliberately left out of the list.)
sorter_combined = [
    'HALLMARK_MYOGENESIS',
    'HALLMARK_CHOLESTEROL_HOMEOSTASIS',
    'HALLMARK_ANDROGEN_RESPONSE',
    'HALLMARK_OXIDATIVE_PHOSPHORYLATION',
    'HALLMARK_DNA_REPAIR',
    'HALLMARK_INTERFERON_ALPHA_RESPONSE',
    'HALLMARK_E2F_TARGETS',
    'HALLMARK_MYC_TARGETS_V1',
    'HALLMARK_G2M_CHECKPOINT',
    'HALLMARK_MITOTIC_SPINDLE',
    'HALLMARK_GLYCOLYSIS',
    'HALLMARK_MTORC1_SIGNALING',
    'HALLMARK_PROTEIN_SECRETION',
]

# table of the selected gene sets for both comparisons
df_plot_bar, es_marker = util.compare_dataframe(
    ls_plot_items, d_en, sorter_combined, ls_columns)

# colour value: FDR q-value relative to vmax, limited to [0, 1]
df_plot_bar['FDR_color'] = (df_plot_bar['FDR.q.val'] / vmax).clip(lower=0, upper=1)

fig, ax = util.plot_double_bars_heat(
    ls_plot_items, df_plot_bar, d_labels, sorter_combined, mappable,
    hatch=hatch, height=height, figsize=(4,3.7), anchor=(.5,-.13),
)
ax.set_title('Organotropism')

# Title-case the gene-set names, then restore the conventional capitalisation
# of the acronyms that .title() lower-cases (mTORC1, MYC, DNA).
labels = [
    item.replace('HALLMARK_','')
        .replace('_',' ')
        .title()
        .replace('Mtorc1','mTORC1')
        .replace('Myc','MYC')
        .replace('Dna','DNA')
    for item in sorter_combined
]
__ = ax.set_yticklabels(labels)

In [None]:
# pORG mets: single comparison, top vs. bottom pORG quartile among metastases.
# Relies on d_en, ls_columns, d_labels, vmax, mappable and hatch from earlier cells.

ls_plot_items = ['Top4thMet_pORG_Up_55_vs_Bottom4thMet_JM_pORG_Up_55']
# Comparison keys that were left disabled in this list:
#   'LiverCohortMet_vs_LungNotLiverCohortMet'
#   'Top4thTcr_pORG_Up_55_JM_vs_Bot4thTcr_JM_pORG_Up_55_JM'
#   'LiverTcr_pORG_Up_55_JM_vs_LungTcr_JM_pORG_Up_55_JM'

# Hallmark gene sets to show; the y tick labels below follow this order.
sorter_combined = [
    'HALLMARK_MYOGENESIS',
    'HALLMARK_WNT_BETA_CATENIN_SIGNALING',
    'HALLMARK_OXIDATIVE_PHOSPHORYLATION',
    'HALLMARK_G2M_CHECKPOINT',
    'HALLMARK_MYC_TARGETS_V1',
    'HALLMARK_PROTEIN_SECRETION',
    'HALLMARK_E2F_TARGETS',
    'HALLMARK_GLYCOLYSIS',
    'HALLMARK_MTORC1_SIGNALING',
]
# Alternative gene-set list (disabled):
# sorter_combined = ['HALLMARK_UV_RESPONSE_DN','HALLMARK_ESTROGEN_RESPONSE_LATE',
#  'HALLMARK_HYPOXIA','HALLMARK_BILE_ACID_METABOLISM','HALLMARK_MITOTIC_SPINDLE',
#  'HALLMARK_MTORC1_SIGNALING','HALLMARK_UNFOLDED_PROTEIN_RESPONSE','HALLMARK_DNA_REPAIR',
#  'HALLMARK_MYC_TARGETS_V1','HALLMARK_ANGIOGENESIS','HALLMARK_GLYCOLYSIS',
#  'HALLMARK_COAGULATION','HALLMARK_XENOBIOTIC_METABOLISM','HALLMARK_G2M_CHECKPOINT',
#  'HALLMARK_E2F_TARGETS','HALLMARK_MYC_TARGETS_V2',]

# table of the selected gene sets for the comparison
df_plot_bar, es_marker = util.compare_dataframe(
    ls_plot_items, d_en, sorter_combined, ls_columns)

# colour value: FDR q-value relative to vmax, limited to [0, 1]
df_plot_bar['FDR_color'] = (df_plot_bar['FDR.q.val'] / vmax).clip(lower=0, upper=1)

# This cell uses its own bar height (0.9) rather than the shared `height` variable.
# Earlier layout that was tried: height=0.4, figsize=(4,4), anchor=(.5,-.16)
fig, ax = util.plot_double_bars_heat(
    ls_plot_items, df_plot_bar, d_labels, sorter_combined, mappable,
    hatch=hatch, height=0.9, figsize=(4,3), anchor=(.5,-.16),
)
ax.set_title('Organotropism Mets')

# Title-case the gene-set names, then restore the capitalisation of mTORC1, MYC and DNA.
labels = [
    item.replace('HALLMARK_','')
        .replace('_',' ')
        .title()
        .replace('Mtorc1','mTORC1')
        .replace('Myc','MYC')
        .replace('Dna','DNA')
    for item in sorter_combined
]
__ = ax.set_yticklabels(labels)

## VIPER Barplots

In [None]:
#load viper
def _viper_direction_table(df_viper, s_gsea, s_comp, ls_columns, b_up):
    """Build the UP or DN table of one VIPER comparison.

    Parameters
    ----------
    df_viper : pandas.DataFrame
        Table read from the VIPER ANOVA spreadsheet (first column as index).
    s_gsea : str
        Name of the mean-difference column used to decide the direction.
    s_comp : str
        Comparison name; it is stripped from the selected column names.
    ls_columns : index/list of str
        Columns of ``df_viper`` that belong to this comparison.
    b_up : bool
        True keeps rows whose ``s_gsea`` value is > 0, False keeps rows < 0.
        Rows that are exactly 0 or NaN end up in neither table.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the index moved into a
        column and 'MeanDiff'/'qVal'/'regulons' renamed to 'NES'/'FDR.q.val'/'NAME'.
    """
    se_diff = df_viper.loc[:, s_gsea]
    b_keep = (se_diff > 0) if b_up else (se_diff < 0)
    # explicit copy: the renaming below must never act on a slice of df_viper
    df_dir = df_viper.loc[b_keep, ls_columns].copy()
    # strip the comparison name, the 'TopBottomQuarter_' tag and any underscores
    df_dir.columns = [item.replace(s_comp,'').replace('TopBottomQuarter_','').replace('_','')
                      for item in df_dir.columns]
    df_dir = df_dir.reset_index()
    return df_dir.rename({'MeanDiff':'NES', 'qVal':'FDR.q.val','regulons':'NAME'},axis=1)


df = pd.read_excel('data/ViperANOVA.xlsx',index_col=0)
ls_gsea = ['MeanDiff_pORG_Up_55', 'MeanDiff_pSUB_Up_51',
       'MeanDiff_LiverVsLungNotLiver', 'MeanDiff_PurIST.Score']
# NOTE: this rebinds d_en (and d_labels/d_colorblind below), replacing whatever
# earlier cells stored under these names.
d_en = {}
for s_gsea in ls_gsea:
    s_comp = s_gsea.split('MeanDiff_')[1]
    # regex=False: names such as 'PurIST.Score' contain '.', which str.contains would
    # otherwise treat as a wildcard; the literal match agrees with .replace(s_comp, '').
    ls_columns = df.columns[df.columns.str.contains(s_comp, regex=False)]
    #up
    df_up = _viper_direction_table(df, s_gsea, s_comp, ls_columns, b_up=True)
    #down
    df_down = _viper_direction_table(df, s_gsea, s_comp, ls_columns, b_up=False)
    #add to data dict
    d_en[f"{s_comp}_UP"] = df_up
    d_en[f"{s_comp}_DN"] = df_down

d_colorblind = {'Liver': '#d55e00','Lung': '#0072b2',
 'high pORG': '#e69f00','low pORG': '#56b4e9',
 'basal-like': '#000000','high PurIST': '#000000',
 'classical': '#cc79a7','low PurIST': '#cc79a7',
 'high pSUB': '#f0e442','low pSUB': '#009E73'}
d_labels = {'pORG_Up_55':'pORG','LiverVsLungNotLiver':'Cohort',
          'pSUB_Up_51':'pSUB','PurIST.Score':'PurIST'}

# regulon names to plot, grouped by figure title
d_sorter_comb =  { 'DNA Repair':['ATM', 'ARID1A', 'ATR', 'BRCA1', 'BARD1', 
                                 'BRCA2', 'RAD51', 'CHEK1', 'PALB2'],
'Replication': ['MCM3','MCM8', 'MCM6', 'MCM7', 'MCM2', 'ORC2'],#'MCM5',
'Cell Cycle':['CDKN1B', 'CDK4', 'E2F2', 'CCND1', 'CDK2', 'CDK6', 'CCNA2','E2F1'],
 "Immune":['MS4A1', 'CD3G', 'IFNAR2', 'IFNAR1', 'MX1', 'STAT1'],
                 }

In [None]:
#viper
# Draws one double-bar figure per regulon group in d_sorter_comb.
# vmax, mappable and height are not assigned in this cell; they must already
# exist in the kernel (d_en, d_labels and d_sorter_comb likewise).
importlib.reload(util)  # re-import util so edits to the module take effect
hatch = '||'
ls_columns = ['NAME', 'NES', 'pVal', 'FDR.q.val']
ls_plot_items = ['pORG_Up_55', 'LiverVsLungNotLiver']
# alternative pair (disabled): ['pSUB_Up_51', 'PurIST.Score']

for s_title in d_sorter_comb:
    sorter_combined = d_sorter_comb[s_title]
    # table of the selected regulons for both comparisons
    df_plot_bar, es_marker = util.compare_dataframe(
        ls_plot_items, d_en, sorter_combined, ls_columns)
    # colour value: FDR q-value relative to vmax, limited to [0, 1]
    df_plot_bar['FDR_color'] = (df_plot_bar['FDR.q.val'] / vmax).clip(lower=0, upper=1)
    # draw the figure and title it with the group name
    fig, ax = util.plot_double_bars_heat(
        ls_plot_items, df_plot_bar, d_labels, sorter_combined, mappable,
        hatch=hatch, height=height, figsize=(3,3.2), anchor=(0.5,-.15),
        x_label='Mean Difference',
    )
    ax.set_title(s_title)

## old code  <a name="split"></a>


[contents](#contents)

In [None]:
#dot plots - size of dot = FDR.Q
# #generate more GSEA
# ls_columns = ['NAME', 'SIZE', 'ES', 'NES', 'NOM.p.val', 'FDR.q.val', 'FWER.p.val',#'RANK.AT.MAX'
#        ]
# ls_gsea = ['Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score_h.xlsx',
#      'basal-like_vs_classical_h.xlsx','LiverCohort_vs_LungNotLiverCohort_h.xlsx',
#  'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4_h.xlsx',
#  'Top4th_pSUB.1eNeg5_vs_Bottom4th_pSUB.1eNeg5_h.xlsx',
#  'Top4th_pORG.20_vs_Bottom4th_pORG.20_h.xlsx'
#     ]
# d_en = {}
# for s_gsea in ls_gsea:
#     d_load =  pd.read_excel(f'data/{s_gsea}',sheet_name=None)#,index_col=0
#     df_up = d_load['GSEA_UP'].loc[:,ls_columns] #df[df.ES>0]
#     df_down = d_load['GSEA_DN'].loc[:,ls_columns]#df[df.ES<0]
#     d_en.update({f"{s_gsea.split('_h.xlsx')[0]}_UP":df_up})
#     d_en.update({f"{s_gsea.split('_h.xlsx')[0]}_DN":df_down})
#     #break

# #subtype
# #bubble plots
# %matplotlib inline
# sorter_combined = ['HALLMARK_XENOBIOTIC_METABOLISM','HALLMARK_PEROXISOME',
#   'HALLMARK_FATTY_ACID_METABOLISM','HALLMARK_BILE_ACID_METABOLISM',
#   'HALLMARK_PANCREAS_BETA_CELLS', 'HALLMARK_GLYCOLYSIS',
#   'HALLMARK_HYPOXIA','HALLMARK_APICAL_JUNCTION',
#     'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',]
# sns.set_palette(('#000000','#CC79A7','#F0E442','#009E73'))
# palette ={'PurIST_UP': '#000000',
#  'PurIST_DN': '#CC79A7',
#  'pSUB_UP': '#F0E442',
#  'pSUB_DN': '#009E73'}
# ls_plot_items = [ 'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score',
#     'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4']
# marker = 'o'
# df_plot_long = pd.DataFrame()
# for s_comp in ls_plot_items:
#     for s_direction in ['UP','DN']:
#         s_compare =f'{s_comp}_{s_direction}'
#         df_plot_o = d_en[s_compare]
#         df_plot_o['comparison'] = s_compare
#         df_plot_long = pd.concat([df_plot_long,df_plot_o],ignore_index=True)
# #plot
# df_plot = df_plot_long[df_plot_long.NAME.isin(sorter_combined)]
# d_rename = {'Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4_UP':'pSUB_UP','Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4_DN':'pSUB_DN',
#        'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score_UP':'PurIST_UP','Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score_DN':'PurIST_DN'}
# df_plot['comparison'] = df_plot.comparison.replace(d_rename)
# size_order =  ['<0.05','<0.1','<0.25','0.25+']
# sizes = [100,50,25,10]
# bins=[0,0.05,0.1,0.25,1]
# d_size = dict(zip(size_order,sizes))
# df_plot['FDR.Q'] = pd.cut(df_plot.loc[:,'FDR.q.val'],bins=bins,
#                           right=True,labels=size_order)
# fig, ax = plt.subplots(dpi=300,figsize=(3.2,3))
# sns.scatterplot(data=df_plot, y='NAME',x='NES',hue='comparison',size='FDR.Q',ax=ax,linewidth=0,
#                 marker=marker,alpha=0.8,size_order=size_order)
# ax.set_title('Subtype vs. Hallmarks',fontweight='bold')
# handles, labels = zip(*[(plt.scatter([], [],s=color,ec='k',linewidth=0,fc='k',marker=marker), key) for key, color in d_size.items()]) #,
# legend1 = plt.legend(handles, labels, title="FDR.Q",loc=2,bbox_to_anchor=(1,0.54)) #,markerscale=0.9
# handles, labels = zip(*[(plt.scatter([], [], fc=color, marker=marker), key) for key, color in palette.items()])
# legend2 = plt.legend(handles, labels, title="comparison",bbox_to_anchor=(1,1.01),fontsize='small',loc=2)
# ax.add_artist(legend1)
# ax.add_artist(legend2)
# ax.set_title('Subtype vs. Hallmarks',fontweight='bold')
# ax.set_ylabel('')#HALLMARK
# labels_long = ax.get_yticklabels().copy()
# ticks = ax.get_yticks()
# labels_ = [item.replace('HALLMARK_','').replace('_',' ').title().replace('Epithelial Mesenchymal Transition','EMT') for item in reversed(sorter_combined)]
# ax.set_yticks(ticks=ticks,labels=labels_)
# ax.set_xlim(-2.3,2.3)
# plt.tight_layout()
# fig.savefig('figures/GSEA_Subtype.png')


# # organo
# sorter_combined =  ['HALLMARK_MYOGENESIS' ,'HALLMARK_CHOLESTEROL_HOMEOSTASIS','HALLMARK_ANDROGEN_RESPONSE',
#   'HALLMARK_OXIDATIVE_PHOSPHORYLATION', 'HALLMARK_DNA_REPAIR','HALLMARK_INTERFERON_ALPHA_RESPONSE',
#   'HALLMARK_E2F_TARGETS','HALLMARK_MYC_TARGETS_V1','HALLMARK_G2M_CHECKPOINT','HALLMARK_MITOTIC_SPINDLE',
#   'HALLMARK_GLYCOLYSIS','HALLMARK_MTORC1_SIGNALING','HALLMARK_PROTEIN_SECRETION',
#                   ]
# sns.set_palette(('#E69F00','#56B4E9','#D55E00','#0072B2'))
# d_palette = {'Liver_UP': '#D55E00','Liver_DN': '#0072B2','pORG_UP': '#E69F00',
#     'pORG_DN': '#56B4E9'}
# ls_plot_items = ['Top4th_pORG.20_vs_Bottom4th_pORG.20',
#     'LiverCohort_vs_LungNotLiverCohort']
# df_plot_long = pd.DataFrame()
# for s_comp in ls_plot_items:
#     for s_direction in ['UP','DN']:
#         s_compare =f'{s_comp}_{s_direction}'
#         df_plot_o = d_en[s_compare]
#         df_plot_o['comparison'] = s_compare
#         df_plot_long = pd.concat([df_plot_long,df_plot_o])
# #plot
# df_plot = df_plot_long[df_plot_long.NAME.isin(sorter_combined)].copy()
# d_rename = {'Top4th_pORG.20_vs_Bottom4th_pORG.20_UP':'pORG_UP','Top4th_pORG.20_vs_Bottom4th_pORG.20_DN':'pORG_DOWN',
#        'LiverCohort_vs_LungNotLiverCohort_UP':'Liver_UP','LiverCohort_vs_LungNotLiverCohort_DN':'Liver_DN'}
# df_plot['comparison'] = df_plot.comparison.replace(d_rename)
# df_plot['FDR.Q'] = pd.cut(df_plot.loc[:,'FDR.q.val'],bins=bins,right=True,labels=size_order)
# fig, ax = plt.subplots(dpi=300,figsize=(3.3,3))
# sns.scatterplot(data=df_plot, y='NAME',x='NES',hue='comparison',size='FDR.Q',
#                 ax=ax,size_order=size_order,sizes=sizes,marker=marker,linewidth=0)
# handles1, labels1 = zip(*[(plt.scatter([], [],s=color,ec='k', fc='k',marker=marker,linewidth=0), key) for key, color in d_size.items()])
# legend1 = plt.legend(handles1, labels1, title="FDR.Q",bbox_to_anchor=(1,0.56))#,markerscale=0.95
# handles2, labels2 = zip(*[(plt.scatter([], [], fc=color, marker=marker), key) for key, color in d_palette.items()])
# legend2 = plt.legend(handles2, labels2, title="comparison",bbox_to_anchor=(1,1.02),fontsize='small')
# ax.add_artist(legend1)
# ax.add_artist(legend2)
# ax.set_title('Organotropism vs. Hallmarks',fontweight='bold')
# ax.set_ylabel('')
# ticks = ax.get_yticks()
# ax.set_yticks(ticks=ticks,labels=[item.replace('HALLMARK_','').replace('_',' ').title().replace('Dna','DNA').replace('Myc','MYC') for item in reversed(sorter_combined)])
# ax.set_xlim(-2.2,2.35)
# plt.tight_layout()
# fig.savefig('figures/GSEA_Organotropism.png')

In [None]:
# #organo
# sorter_combined = ['ATM', 'ARID1A', 'ATR', 'BRCA1', 'BARD1', 'BRCA2', 'RAD51', 'CHEK1', 'PALB2']
# sorter_combined = ['MCM3', 'MCM6', 'MCM7', 'MCM2', 'ORC2']
# sorter_combined = ['CDKN1B', 'CDK4', 'E2F2', 'CCND1', 'CDK2', 'CDK6', 'E2F1']
# sorter_combined = ['CDKN1B', 'CDK4', 'E2F2', 'CCND1', 'CDK2', 'CDK6', 'E2F1','MCM3', 'MCM6', 'MCM7', 'MCM2', 'ORC2']
# ls_plot_items = ['pORG_Up_55','LiverVsLungNotLiver'] #['pSUB_Up_51', 'PurIST.Score']
# ls_columns = ['NAME', 'NES', 'pVal', 'FDR.q.val']
# #generate dataframe with comparisons
# df_plot_bar,es_marker = util.compare_dataframe(ls_plot_items,d_en,sorter_combined,ls_columns)
# #add colors
# b_purist = df_plot_bar.comparison==ls_plot_items[1]#'PurIST.Score'
# b_psub = df_plot_bar.comparison==ls_plot_items[0]#'pSUB_Up_51'
# df_plot_bar.loc[b_purist & (df_plot_bar.direction=='UP'),'color'] = f'high {d_label[ls_plot_items[1]]}'
# df_plot_bar.loc[b_purist & (df_plot_bar.direction=='DN'),'color'] = f'low {d_label[ls_plot_items[1]]}'
# df_plot_bar.loc[b_psub & (df_plot_bar.direction=='UP'),'color'] = f'high {d_label[ls_plot_items[0]]}'
# df_plot_bar.loc[b_psub & (df_plot_bar.direction=='DN'),'color'] = f'low {d_label[ls_plot_items[0]]}'
# df_plot_bar['color'] = df_plot_bar.color.replace({'high Cohort':'Liver','low Cohort':'Lung'})
# # plot figure
# fig, ax = util.plot_double_bars(df_plot_bar,ls_plot_items,d_colorblind,
#                                 sorter_combined,hatchbar,figsize=(4,3))
# #custom legend
# handles = []
# for s_color in df_plot_bar[df_plot_bar.comparison==ls_plot_items[0]].color.unique():
#     facecolor = d_colorblind[s_color]
#     handles.append(mpl.patches.Patch(facecolor=facecolor, edgecolor='black',label=s_color,alpha=0.8,))
# #hatch
# for s_color in df_plot_bar[df_plot_bar.comparison==ls_plot_items[1]].color.unique():
#     facecolor = d_colorblind[s_color]
#     handles.append(mpl.patches.Patch(facecolor=facecolor, edgecolor='black',label=s_color,
#                                      alpha=0.8,hatch=hatchbar))
# ax.legend(handles=handles,bbox_to_anchor = (1.01,1.01),markerscale=1,title='NES',loc='upper left')
# ax.set_yticklabels([item.replace('HALLMARK_','').replace('EPITHELIAL_MESENCHYMAL_TRANSITION','EMT') for item in sorter_combined])
# #add pvalue axis
# fig, ax2 = util.twin_pvalue_axis(df_plot_bar,ls_plot_items,sorter_combined,fig,ax,
#                                  height=height,hatch=hatch)
# ax2.set_xlim((-0.0092, 0.21))
# plt.tight_layout()
# #add FDRQ
# util.add_fdrq_legend(texts= ['pORG','Cohort'],hatch=hatch,anchor=(1.01,0.4))

In [None]:
# #pSUB and purist (side by side barplot and dot plot - motoyuki suggestion - looks okay, not used)
# importlib.reload(util)
# hatch = '//'#''#
# hatchbar = '//'#''
# height=0.4
# sorter_combined = ['HALLMARK_XENOBIOTIC_METABOLISM','HALLMARK_PEROXISOME', 'HALLMARK_FATTY_ACID_METABOLISM',
#   'HALLMARK_BILE_ACID_METABOLISM', 'HALLMARK_PANCREAS_BETA_CELLS','HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
#   'HALLMARK_APICAL_JUNCTION', 'HALLMARK_HYPOXIA', 'HALLMARK_GLYCOLYSIS']
# ls_plot_items = ['Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4', 'Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score']
# #generate dataframe with comparisons
# df_plot_bar,es_marker = util.compare_dataframe(ls_plot_items,d_en,sorter_combined,ls_columns)
# #add colors
# b_purist = df_plot_bar.comparison=='Top4th_PurIST.Score_vs_Bottom4th_PurIST.Score'
# b_psub = df_plot_bar.comparison=='Top4th_pSUB.1eNeg4_vs_Bottom4th_pSUB.1eNeg4'
# df_plot_bar.loc[b_purist & (df_plot_bar.direction=='UP'),'color'] = 'high PurIST'
# df_plot_bar.loc[b_purist & (df_plot_bar.direction=='DN'),'color'] = 'low PurIST'
# df_plot_bar.loc[b_psub & (df_plot_bar.direction=='UP'),'color'] = 'high pSUB'
# df_plot_bar.loc[b_psub & (df_plot_bar.direction=='DN'),'color'] = 'low pSUB'
# # plot figure
# fig, ax1 = util.plot_double_bars_grid(df_plot_bar,ls_plot_items,d_colorblind,sorter_combined,
#                                  hatchbar,figsize=(5.5,4))
# #custom legend
# handles = []
# for s_color in ['high pSUB', 'low pSUB']:
#     facecolor = d_colorblind[s_color]
#     handles.append(mpl.patches.Patch(facecolor=facecolor, edgecolor='black',label=s_color,alpha=0.8,))
# #hatch
# for s_color in [ 'high PurIST', 'low PurIST']:
#     facecolor = d_colorblind[s_color]
#     handles.append(mpl.patches.Patch(facecolor=facecolor, edgecolor='black',label=s_color,
#                                      alpha=0.8,hatch=hatchbar))

# ax1.set_yticklabels([item.replace('HALLMARK_','').replace('EPITHELIAL_MESENCHYMAL_TRANSITION','EMT') for item in sorter_combined])
# #add pvalue axis
# fig, ax2 = util.add_pvalue_axis(df_plot_bar,ls_plot_items,sorter_combined,fig,ax1,
#                                  height=height,hatch=hatch)
# ax2.set_xlim((-0.1, 0.5))
# legend1 = ax2.legend(handles=handles,bbox_to_anchor = (1.1,1),markerscale=1,title='NES',frameon=False)
# ax2.axvline(0.2,linestyle='--',color='gray')
# plt.tight_layout()
# legend2 = util.add_fdrq_legend_ax2(texts= ['pSUB', 'PurIST'],hatch=hatch,ax2=ax2)
# ax2.add_artist(legend1)
# ax2.add_artist(legend2)


In [None]:
# #per rosie GSVA question
# what is the difference between pORG All and pORG Primary or Met
# for s_met in ['Met','Primary']:
#     df1 = df_patient.sort_values(by=f'pORG_0.2_{s_met}').loc[:,['Public_Patient_ID',f'pORG_0.2_{s_met}']].dropna()
#     df2 = df_patient.sort_values(by=f'pORG_0.2_all{s_met}').loc[:,['Public_Patient_ID',f'pORG_0.2_all{s_met}']].dropna()
#     df1.reset_index(inplace=True)
#     df2.reset_index(inplace=True)
#     df1.reset_index(inplace=True)
#     df2.reset_index(inplace=True)
#     df1.rename({'index':'INDEX','level_0':s_met},axis=1,inplace=True)
#     df2.rename({'index':'INDEX'},axis=1,inplace=True)
#     df1['All'] = df1.INDEX.map(dict(zip(df2.INDEX,df2.level_0)))
#     df1[f'{s_met} quartiles'] = pd.qcut(df1.loc[:,s_met],4,labels = ['low','medx','med','high'])
#     df1[f'{s_met} quartiles'] = df1.loc[:,f'{s_met} quartiles'].replace({'medx':'med',})#'med':np.nan
#     #rank
# #     fig, ax = plt.subplots()
# #     sns.scatterplot(data=df1, x=s_met, y='All', hue=f'{s_met} quartiles',ax=ax)
# #     ax.set_title(f'{s_met} pORG Rank')
# #     ax.set_ylabel('Rank Order with GSVA All')
# #     ax.set_xlabel(f'Rank Order with GSVA {s_met}')
# #     for i_quantile in np.quantile(df1.All, q=[0.25,0.75]):
# #         ax.axhline(i_quantile)
#     #scatter
#     fig, ax =plt.subplots()
#     df_test = df1.merge(df2,on='Public_Patient_ID')
#     sns.scatterplot(data=df_test,x=f'pORG_0.2_{s_met}',y=f'pORG_0.2_all{s_met}',hue=f'{s_met} quartiles',ax=ax)    
#     for i_quantile in np.quantile(df_test.loc[:,f'pORG_0.2_all{s_met}'], q=[0.25,0.75]):
#             ax.axhline(i_quantile)
#     #kde plot
#     fig,ax=plt.subplots()
#     sns.kdeplot(df1.loc[:,f'pORG_0.2_{s_met}'],ax=ax,label=s_met,cut=0)
#     sns.kdeplot(df2.loc[:,f'pORG_0.2_all{s_met}'],ax=ax,label='All',cut=0)
#     plt.legend()
#     ax.set_xlabel('pORG')
#     #break
# #all
# fig, ax = plt.subplots()
# sns.kdeplot(np.concatenate([df_patient.loc[:,f'pORG_0.2_allMet'].dropna().values,
#            df_patient.loc[:,f'pORG_0.2_allPrimary'].dropna().values]),cut=0,ax=ax,
#            label='pORG All')
# plt.legend()
# #add pORG quartiles, plot violins 
# # use pORG All
# %matplotlib inline
# sns.set_palette('tab10')
# importlib.reload(util)
# alpha = 0.05
# b_correct= False#False #True #
# ls_foci = ['Shannon_Entropy_Tumor','Templates_per_ng', 'Shannon_Entropy_Blood',
#            'Productive_Rearrangements',
#         'Simpsons_Diversity_Tumor','Simpsons_Diversity_Blood',
#            'Clonality_Tumor', 'Clonality_Blood',
#         "Simpson's Evenness tumor", "Simpson's Evenness blood",
#             'Fraction Tumor Distinct TCRs',
#            'Percent Tumor Distinct Clones',
#             'Fraction Shared Clones 10'
#         ]
# d_colorblind = {'Liver':'#d55e00','Lung':'#0072b2',
#                 'lung_short':'#d55e00','lung_long':'#0072b2',
#                'high':'#e69f00','low': '#56b4e9',
#                'basal-like':'#000000','classical':'#cc79a7',
#                'high pSUB': '#f0e442','low pSUB':'#009E73'} #
# #non- parametric
# se_non_para = pd.Series(['Simpsons_Diversity_Blood', 
#     'Templates_per_ng','Productive_Rearrangements',
#      "Simpson's Evenness tumor",
#     "Simpson's Evenness blood",
#    'Number Tumor Distinct Clones'])
# d_order =  {#'Cohort':['Liver','Lung'],    
#         'TCR_Lung_Median_Surv':['lung_short','lung_long'],
#     'quartiles':['high','low']}
# for s_porg in ['pORG_0.2_All', 'pORG_0.2_Met','pORG_0.2_Primary', ]:#, 'pORG_78_Primary',
#     print('Primaries and Mets')
#     df_whole = pd.DataFrame(columns=df_merge.columns)
#     for s_all in ['pORG_0.2_allPrimary', 'pORG_0.2_allMet']:
#         df_half = df_merge[(df_merge.loc[:,s_all].notna())]
#         df_half = df_half[~df_half.Public_Patient_ID.isin(df_whole.Public_Patient_ID)].copy()
#         df_half.rename({s_all:'pORG_0.2_All'},axis=1,inplace=True)
#         print(len(df_half))
#         df_whole = pd.concat([df_whole,df_half],axis=0,ignore_index=True)
#     #s_foci not defined
#     df_pri = df_merge.merge(df_whole.loc[df_whole.loc[:,['Simpsons_Diversity_Blood','Simpsons_Diversity_Tumor']].notna().any(axis=1),['Public_Patient_ID','pORG_0.2_All']],on='Public_Patient_ID',how='left')
#     df_pri.loc[df_pri.Public_Patient_ID.duplicated(),'Cohort'] = np.nan
#     if s_porg.find('Primar') > -1:
#         print('Primaries')
#         df_pri = df_pri.loc[(df_pri.Tumor_Type=='Primary')].copy()
#     elif s_porg.find('Met') > -1:
#         df_pri = df_pri.loc[df_pri.Tumor_Type=='Met'].copy()
#     df_pri = util.add_quartiles(df_pri,'pORG_0.2_All')#s_porg#.drop(219) #p. rearrangements outlier 
#     for s_foci in ls_foci:
#         print(s_foci)
#         if se_non_para.isin([s_foci]).any():
#             s_stats = 'non-parametric'
#         else:
#             s_stats = 'mean' 
#         df_both,d_pval,order = util.violin_stats2(df_pri,d_order,s_foci,s_stats)
#         #util.qq_plot_hist(df_pri,s_cat,s_foci)  #anova eval
#         fig,pvalues,corrected = util.plot_violins2(df_both,d_pval,d_order,s_stats,s_foci,order,d_colorblind,s_porg,b_correct=b_correct)#True#False
#         #fig.savefig(f'figures/violinplot_both_{s_foci}_{list(d_order.keys())[-1]}_{s_porg}_{s_stats}.png')
#         break
#     break

In [None]:
# OLD #add pORG 
# ls_pORG = ['trim_padj_0.2_pORG_Up_55_Genes','Original_pORG_Up_78_Genes']
# s_pSUB = 'Kallisto55_pSUB1e-04'
# df_pORG = pd.read_csv(f'../data/20230608_GSVA_Scores.csv',index_col=0)
# df_pORG.reset_index(drop=True,inplace=True)
# df_pORG['Specimen_ID'] = [item.split('-')[-1] for item in df_pORG.Public_Specimen_ID]
# df_pORG['Public_Patient_ID'] = [item.split('-')[0] + '-' + item.split('-')[1] for item in df_pORG.Public_Specimen_ID]
# #drop 'T2'
# df_t2 = df_pORG[df_pORG.Public_Specimen_ID.str.contains('-T2')]
# ls_drop = df_pORG[df_pORG.Public_Specimen_ID.str.contains('-T2')].index
# df_pORG = df_pORG.drop(ls_drop)
# #add pri and met
# for s_pORG in ls_pORG:
#     for s_group in ['GSVA_Primary', 'GSVA_Met']: 
#         df_group = df_pORG.loc[df_pORG.Group==s_group,[s_pORG,'Public_Patient_ID']]
#         df_group.rename({'trim_padj_0.2_pORG_Up_55_Genes':'pORG_0.2',
#                         'Original_pORG_Up_78_Genes':'pORG_78'},axis=1,inplace=True)
#         s_suffix = s_group.replace('GSVA','')
#         df_patient = df_patient.merge(df_group,on='Public_Patient_ID',suffixes=('',s_suffix),how='left')
# df_patient.rename({'pORG_0.2':'pORG_0.2_Primary',
#                       'pORG_78':'pORG_78_Primary'},axis=1,inplace=True)

# #add 'GSVA_All': must pivot
# df_group = df_pORG.loc[df_pORG.Group=='GSVA_All',[s_pORG,'Public_Patient_ID','Specimen_ID']]
# df_group = df_group.pivot(index='Public_Patient_ID',columns='Specimen_ID',values=s_pORG)
# df_group['pORG_0.2_allPrimary'] = df_group.loc[:,'T'].fillna(df_group.F)
# df_group['pORG_0.2_allMet'] = df_group.loc[:,'M'].fillna(df_group.M2)
# df_patient = df_patient.merge(df_group.reset_index().loc[:,['Public_Patient_ID','pORG_0.2_allPrimary','pORG_0.2_allMet']],on='Public_Patient_ID',how='left')
# print(len(df_patient))
# #add T2 specimens, primary, all
# d_t2 = dict(zip(df_t2[df_t2.Group=='GSVA_Primary'].Public_Patient_ID,df_t2.loc[df_t2.Group=='GSVA_Primary',s_pORG]))
# df_patient['pORG_0.2_Primary_T2'] = df_patient.Public_Patient_ID.map(d_t2)
# d_t2_all = dict(zip(df_t2[df_t2.Group=='GSVA_All'].Public_Patient_ID,df_t2.loc[df_t2.Group=='GSVA_All',s_pORG]))
# df_patient['pORG_0.2_All_T2'] = df_patient.Public_Patient_ID.map(d_t2)

# #label 10 matched samples
# df_patient.loc[(~df_patient['pORG_0.2_allPrimary'].isna()) | (~df_patient['pORG_0.2_allMet'].isna()),'Matched_Primary_Met_RNA'] = False 
# df_patient.loc[(~df_patient['pORG_0.2_allPrimary'].isna()) & (~df_patient['pORG_0.2_allMet'].isna()),'Matched_Primary_Met_RNA'] = True
# df_patient.Matched_Primary_Met_RNA.value_counts()
