In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

## system
import os
import pickle
import warnings
warnings.filterwarnings("ignore")

## ML
import PyComplexHeatmap as pyc
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


## Extra
from anndata import read_h5ad

## Load Tissue Data

In [45]:
# Location of the pilot AnnData file read by this cell.
ANNDATA_PATH = '/screening/analysis/2024_05_03_tissue_enhanced_mrd_AS/intermediate/anndata/2024_06_28_pilot_tpb_anndata.h5ad'

ad = read_h5ad(ANNDATA_PATH)

# Default-matrix values for the tumor-tissue samples only; its index supplies
# the tissue sample ids that drive the pairing loop below.
tissue_data_snr = ad[ad.obs['specimen_type'] == 'tumor_tissue'].to_df()

## Obtain region scores of each plasma-tissue paired sample
# Per-sample frames are collected in lists and concatenated once at the end.
# Calling pd.concat inside the loop re-copies all previously accumulated rows
# on every iteration and starts from an empty DataFrame seed.
tissue_frames = []
plasma_frames = []

# Looked up once; the per-patient mask is built from it on each iteration.
patient_ids = ad.obs['patient_id']

for tissue_id in tissue_data_snr.index:

    # Sample ids are split on "__"; the first part is matched against
    # obs['patient_id'] and reused to build the plasma sample id.
    patient_id = tissue_id.split("__")[0]
    plasma_id = patient_id + "__plasma_patient"

    # Every sample of this patient, taken from the 'region_scores' layer.
    region_scores = ad[patient_ids == patient_id].to_df(layer='region_scores')

    tissue_sample = region_scores.query(' index == @tissue_id ')
    plasma_sample = region_scores.query(' index == @plasma_id ')

    # NOTE(review): a patient without a "__plasma_patient" row contributes an
    # empty frame here, so tissue/plasma rows are only position-aligned when
    # both printed shapes agree -- confirm before pairing by position.
    tissue_frames.append(tissue_sample)
    plasma_frames.append(plasma_sample)

# pd.concat raises ValueError on an empty list, so keep the original
# "empty DataFrame" result when there are no tumor-tissue samples.
tissue_data = pd.concat(tissue_frames, axis=0) if tissue_frames else pd.DataFrame()
plasma_data = pd.concat(plasma_frames, axis=0) if plasma_frames else pd.DataFrame()

print(f"The shape of tissue data is {tissue_data.shape}")
print(f"The shape of plasma data is {plasma_data.shape}")

The shape of tissue data is (25, 3410)
The shape of plasma data is (25, 3410)


In [52]:
# Display the per-sample metadata table of the AnnData object (indexed by
# patient_id_specimen; includes patient_id, specimen_type, sample_group, ...).
ad.obs

Unnamed: 0_level_0,unique_sample_id,patient_id,specimen_type,sample_group,epi_tf,stage_group,breast_s3_logitMAF,CRCv2_logitMAF,LungV4_logitMAF
patient_id_specimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
130806729__buffy_coat,DEI42_20240508_26_Mym7sAvVaol_240508_A01774_00...,130806729,buffy_coat,crc,0.008082,stage_ii,-22.569944,-21.256445,-22.309183
130806729__plasma_patient,DEI42_20240508_18_ha9d4Uhn7OK_240508_A01774_00...,130806729,plasma_patient,crc,0.003140,stage_ii,-19.537673,-15.306478,-20.235548
130806729__tumor_tissue,DEI42_20240508_10_YHbLtRqBsn2_240508_A01774_00...,130806729,tumor_tissue,crc,0.316540,stage_ii,-10.200184,29.313922,-22.599333
200007801__buffy_coat,DEI42_20240508_29_8LqbCKYGTc1_240508_A01774_00...,200007801,buffy_coat,breast,0.000482,stage_i,-22.326165,-21.767457,-22.562240
200007801__plasma_patient,DEI42_20240508_21_5tRVdh5JA5N_240508_A01774_00...,200007801,plasma_patient,breast,0.001918,stage_i,-21.534230,-19.358855,-21.968628
...,...,...,...,...,...,...,...,...,...
MT9923-0252__plasma_patient,DEI0074_059_GElST8hEg4C_240613_lh00141_0093_A2...,MT9923-0252,plasma_patient,lung,0.006337,stage_iii,-21.677168,-20.989362,-20.351356
MT9923-0252__tumor_tissue,DEI0074_056_oWgLgIk3VIg_240613_lh00141_0093_A2...,MT9923-0252,tumor_tissue,lung,0.722857,stage_iii,10.830528,5.862875,4.625623
MT9923-0272__buffy_coat,DEI0074_045_QzWAX4T8G5A_240613_lh00141_0093_A2...,MT9923-0272,buffy_coat,lung,0.001745,stage_ii,-22.883653,-20.749712,-22.737288
MT9923-0272__plasma_patient,DEI0074_060_C0CabD3S0G5_240613_lh00141_0093_A2...,MT9923-0272,plasma_patient,lung,0.003439,stage_ii,-19.055966,-18.403180,-17.867295


In [41]:
# Region scores for every sample belonging to the patient of the first
# tumor-tissue entry (patient id = text before "__" in the sample id).
first_tissue_id = tissue_data_snr.index[0]
first_patient_id = first_tissue_id.split("__")[0]
first_patient_mask = ad.obs['patient_id'] == first_patient_id
region_scores = ad[first_patient_mask].to_df(layer='region_scores')

In [42]:
# (rows, columns) of the region-score table currently bound to `region_scores`.
region_scores.shape

(3, 3410)

In [47]:
# Read the tab-separated region design table into `df`.
region_design_path = '/screening/analysis/2024_03_29_MCD_intermediate_files/intermediate/s3_region_design/s3_v1.2b_V6_noCntrl_and_tcga_overlap_EP1_full.tsv'
df = pd.read_csv(
    region_design_path,
    sep='\t',
)

In [48]:
# Preview the first rows of the region design table
# (columns: chrom, start, end, region_id).
df.head()

Unnamed: 0,chrom,start,end,region_id
0,1,975869,976223,s3_region_20
1,1,1072543,1072784,s3_region_30
2,1,1181851,1182185,s3_region_35
3,1,1475653,1475887,s3_region_58
4,1,1935190,1935433,s3_region_80


In [49]:
# (rows, columns) of the region design table.
# NOTE(review): the recorded output shows 3642 rows here, while the region-score
# frames have 3410 columns -- presumably not every designed region has a score;
# confirm before joining on region_id.
df.shape

(3642, 4)