In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

## system
import os
import pickle
import warnings
warnings.filterwarnings("ignore")

## statistics
from scipy.stats import zscore
import statsmodels.formula.api as sm

## ML
import PyComplexHeatmap as pyc
from PyComplexHeatmap.utils import define_cmap
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


## Extra
from anndata import read_h5ad

## Load Data

In [None]:
## Load plasma-tissue paired samples (AnnData object; per-sample annotations are read from ad.obs below)
ad = read_h5ad(
    '/screening/analysis/2024_05_03_tissue_enhanced_mrd_AS/intermediate/anndata/2024_06_28_pilot_tpb_anndata.h5ad'
)

## Load plasma-tissue paired samples' meta-data, keeping only the columns listed here
columns = [
    "unique_sample_id",
    "sample_id",
    "age",
    "qc_status",
    "histological_subtype",
    "flowcell",
    "molecule_file_path",
    "g360_max_maf_pct",
]
meta_info = pd.read_csv(
    "/screening/analysis/2024_05_03_tissue_enhanced_mrd_AS/resources/metadata/2024-06-23_updated_pilot_TPB_and_cf_Ep1_Epiv2p0_with_epiTFgamma.tsv",
    sep="\t",
).loc[:, columns]

## combine meta-data
# Move the obs index into a regular column once; both specimen subsets are cut from this table.
obs_with_index = ad.obs.reset_index()

# Tumor-tissue rows (patient_id_specimen ends with "__tumor_tissue"), left-joined with the metadata.
tissue_meta = pd.merge(
    obs_with_index.query(' patient_id_specimen.str.endswith("__tumor_tissue") '),
    meta_info,
    how="left",
    on="unique_sample_id",
)

# Plasma rows (patient_id_specimen ends with "__plasma_patient"), joined the same way.
plasma_meta = pd.merge(
    obs_with_index.query(' patient_id_specimen.str.endswith("__plasma_patient") '),
    meta_info,
    how="left",
    on="unique_sample_id",
)

# Stack plasma rows on top of tissue rows; the row index is not reset here.
plasma_tissue_meta = pd.concat([plasma_meta, tissue_meta], axis=0)
print(f"The shape of plasma-tissue meta-data before filtering is {plasma_tissue_meta.shape}")

## load plasma-tissue paired samples' region scores
plasma_tissue_lung_region_scores = pd.read_csv("/home/eforouzmand/repos/gh-eforouzmand/2024-Jul-09-Tissue-Plasma-Comparisons/tissue_plasma_lung_region_scores.tsv", sep="\t")

## Missing sample filtering
# A patient is removed entirely (every row sharing its patient id) as soon as any
# of its samples has no entry in the region-score table.
# Masks are converted to NumPy arrays because plasma_tissue_meta carries a
# non-unique row index after the concat above.
lacks_region_scores = ~plasma_tissue_meta["sample_id"].isin(
    plasma_tissue_lung_region_scores["run_sample_id"]
).to_numpy()

# Patient id = the part of patient_id_specimen before the first "__".
patient_ids = plasma_tissue_meta["patient_id_specimen"].astype(str).str.split("__").str[0]

# Every patient with at least one missing sample (may be empty; may hold several ids).
missing_patient_ids = patient_ids[lacks_region_scores].unique()

# Kept for backward compatibility: the first missing patient id, or None when nothing is missing.
missing_patient_id_specimen = missing_patient_ids[0] if len(missing_patient_ids) else None

# Exact match on the patient id (not a prefix match), so e.g. "P1" never also selects "P10".
missing_run_sample_id = plasma_tissue_meta.loc[
    patient_ids.isin(missing_patient_ids).to_numpy(), "sample_id"
]

plasma_tissue_meta = (
    plasma_tissue_meta
    .query(' ~sample_id.isin(@missing_run_sample_id) ')
    .set_index("sample_id")
)

print(f"Meta-data shape after filtering: {plasma_tissue_meta.shape}")

plasma_tissue_lung_region_scores = (
    plasma_tissue_lung_region_scores
    .query(' ~run_sample_id.isin(@missing_run_sample_id) ')
    .set_index("run_sample_id")
)

print(f"Region score data shape after filtering: {plasma_tissue_lung_region_scores.shape}")

## Load clustering information of plasma-tissue pairs
# Tab-separated table, re-indexed by run_sample_id.
plasma_tissue_cluster = (
    pd.read_csv(
        "/screening/notebooks/zhuang/summer_2024/data/plasma_tissue_cluster.tsv",
        sep="\t",
    )
    .set_index("run_sample_id")
)

In [None]:
## lung model
s3_v4_model = pd.read_table("/home/eforouzmand/repos/gh-eforouzmand/2024-Jun-12-Lung-S3-in-region-discovery/intermediate_data/CANDIDATE_S3_Lung_-_V4.model_file.98_spec_set_on_s3_data.tsv")

## region meta-info
tissue_regions_meta = pd.read_csv('/screening/analysis/2024_03_29_MCD_intermediate_files/intermediate/s3_region_design/s3_v1.2b_V6_noCntrl_and_tcga_overlap_EP1_full.tsv', sep='\t')

# Preview placed as the cell's final expression so the notebook renders it;
# a bare expression in the middle of a cell is evaluated and then discarded.
tissue_regions_meta.head()

## Load cancer-free & lung samples for the discovery set

In [None]:
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only point these paths at pickles from a trusted location.

# Region identifiers (later concatenated to a list of column names).
with open('/screening/notebooks/zhuang/summer_2024/data/region_ids.pkl', 'rb') as region_ids_file:
    region_ids = pickle.load(region_ids_file)

# Discovery-set table; indexed below by column labels, so presumably a DataFrame - confirm.
with open('/screening/notebooks/zhuang/summer_2024/data/df_ldt.pkl', 'rb') as df_ldt_file:
    df_ldt = pickle.load(df_ldt_file)

# Cancer-free subset of the discovery table:
#   1. keep the sample id, the sample group and the region columns,
#   2. keep only rows whose sample_group is "cancer_free",
#   3. drop the group column and index by unique_sample_id,
#   4. restrict/reorder the columns to the regions listed in the lung model.
df_cf = (
    df_ldt
    .loc[:, ["unique_sample_id", "sample_group"] + region_ids]
    .query('sample_group == "cancer_free"')
    .drop(columns="sample_group")
    .set_index("unique_sample_id")
    .loc[:, s3_v4_model["region_id"]]
)

print(f"The shape of the cancer-free sample data from the discovery set is {df_cf.shape}")

## Mean and Std of each region in cancer-free samples
# Column-wise reductions: one value per region column.
cf_means, cf_stds = df_cf.mean(axis="index"), df_cf.std(axis="index")

# Annotation columns copied from the cluster table onto the region-score table.
meta_columns = ["specimen_type","patient_id","histological_subtype"]

# Left-join the annotations by row index. Any annotation columns already present
# (e.g. from a previous run of this cell) are dropped first, so re-running the
# cell does not produce suffixed duplicates (specimen_type_x / specimen_type_y)
# that would break later lookups of these column names.
plasma_tissue_lung_region_scores = (
    plasma_tissue_lung_region_scores
    .drop(columns=meta_columns, errors="ignore")
    .merge(
        plasma_tissue_cluster[meta_columns],
        how="left",
        left_index=True,
        right_index=True,
    )
)

## Region-specific Tissue Multiplier

In [None]:
# Split the annotated region-score table by specimen type, then remove the
# annotation columns listed in meta_columns from each half.

# Rows whose specimen_type is "tumor_tissue".
tissue_lung_region_scores = (
    plasma_tissue_lung_region_scores
    .query(' specimen_type == "tumor_tissue" ')
    .drop(columns=meta_columns)
)

# Rows whose specimen_type is "plasma_patient".
plasma_lung_region_scores = (
    plasma_tissue_lung_region_scores
    .query(' specimen_type == "plasma_patient" ')
    .drop(columns=meta_columns)
)
