In [None]:
import pandas as pd
import glob
import os
import pybedtools

from multiprocessing import Pool

import altair as alt
alt.data_transformers.enable('default', max_rows=None)

In [None]:
# Cancer/study abbreviation -> full descriptive name.
# Not referenced by the other cells visible in this notebook; the short-label
# variant (cancer_code_simple_dict) is the one the plotting helpers map with.
cancer_code_dict = {
    'LAML': 'Acute Myeloid Leukemia',
    'ACC': 'Adrenocortical carcinoma',
    'BLCA': 'Bladder Urothelial Carcinoma',
    'LGG': 'Brain Lower Grade Glioma',
    'BRCA': 'Breast invasive carcinoma',
    'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
    'CHOL': 'Cholangiocarcinoma',
    'LCML': 'Chronic Myelogenous Leukemia',
    'COAD': 'Colon adenocarcinoma',
    'CNTL': 'Controls',
    'ESCA': 'Esophageal carcinoma',
    'FPPP': 'FFPE Pilot Phase II',
    'GBM': 'Glioblastoma multiforme',
    'HNSC': 'Head and Neck squamous cell carcinoma',
    'KICH': 'Kidney Chromophobe',
    'KIRC': 'Kidney renal clear cell carcinoma',
    'KIRP': 'Kidney renal papillary cell carcinoma',
    'LIHC': 'Liver hepatocellular carcinoma',
    'LUAD': 'Lung adenocarcinoma',
    'LUSC': 'Lung squamous cell carcinoma',
    'DLBC': 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
    'MESO': 'Mesothelioma',
    'MISC': 'Miscellaneous',
    'OV': 'Ovarian serous cystadenocarcinoma',
    'PAAD': 'Pancreatic adenocarcinoma',
    'PCPG': 'Pheochromocytoma and Paraganglioma',
    'PRAD': 'Prostate adenocarcinoma',
    'READ': 'Rectum adenocarcinoma',
    'SARC': 'Sarcoma',
    'SKCM': 'Skin Cutaneous Melanoma',
    'STAD': 'Stomach adenocarcinoma',
    'TGCT': 'Testicular Germ Cell Tumors',
    'THYM': 'Thymoma',
    'THCA': 'Thyroid carcinoma',
    'UCS': 'Uterine Carcinosarcoma',
    'UCEC': 'Uterine Corpus Endometrial Carcinoma',
    'UVM': 'Uveal Melanoma',
}

# Cancer/study abbreviation -> short tissue-style label.
# The plotting helpers map `cancer_code` through this dict and then take the
# first word of the label as the "cancer stem" that is compared with the
# target cell type.
cancer_code_simple_dict = {
    'LAML': 'AML',
    'ACC': 'Adrenocortical',
    'BLCA': 'Bladder',
    'LGG': 'Lower Grade Glioma',
    'BRCA': 'Breast',
    'CESC': 'Cervix',
    'CHOL': 'Bile duct',
    'LCML': 'CML',
    'COAD': 'Colon',
    'CNTL': 'Controls',
    'ESCA': 'Esophagus',
    'FPPP': 'FFPE',
    'GBM': 'Glioblastoma',
    'HNSC': 'Head and Neck',
    'KICH': 'Kidney Chromophobe',
    'KIRC': 'Kidney clear cell',
    'KIRP': 'Kidney papillary cell',
    'LIHC': 'Liver',
    'LUAD': 'Lung adeno',
    'LUSC': 'Lung squamous',
    'DLBC': 'Lymphoma',
    'MESO': 'Mesothelioma',
    'MISC': 'Miscellaneous',
    'OV': 'Ovary',
    'PAAD': 'Pancreas',
    'PCPG': 'Neuroendocrine',
    'PRAD': 'Prostate',
    'READ': 'Rectum',
    'SARC': 'Sarcoma',
    'SKCM': 'Melanoma',
    'STAD': 'Stomach',
    'TGCT': 'Testicular',
    'THYM': 'Thymus',
    'THCA': 'Thyroid',
    'UCS': 'Uterus',
    'UCEC': 'Endometrium',
    'UVM': 'Uveal Melanoma',
}

# Applied to the leading token of a target cell-type label (the text before
# the first '_' or '-'), so that related cell types share one tissue stem.
tissue_map_dict = {'Gastric': 'Stomach', 'Small': 'Colon', 'Oligodend': 'Neuron'}

# Applied to the first word of the short cancer label, so that cancers are
# compared against the tissue stem they are treated as belonging to.
cancer_stem_map_dict = {'Lower': 'Neuron', 'Glioblastoma': 'Neuron', 'Rectum': 'Colon', 'Bile': 'Liver'}

# Read in data and process

In [None]:
# Input data root and output locations.
data_dir = "/home/ubuntu/data"
workdir = "../work"
output_dir = f"{workdir}/tcga_summaries/for_alans_regions"

# Every beta-value table found under the TCGA 450K directory tree.
tcga_files = glob.glob(f"{data_dir}/tcga/*/HumanMethylation450K/*_beta_values.tsv.gz")

# Region sets (tab-separated) and the probe-coordinate BED file.
loyfer = pd.read_csv(f"{data_dir}/Loyfer2022/loyfer2022_atlas_final_regions_top25.tsv", sep="\t")
alan_hyper = pd.read_csv(
    f"{data_dir}/2023_07_10_HYPER_design_AS/regions_selection/02_08012023_HYPER_regions_hg19_high60_low40_fdr_5e2_minCpG_7.tsv",
    sep="\t",
)  # clusters not merged
cpgs_path = f"{data_dir}/2023_08_01_SRT_tcga_mced_discovery_MJG/data/probes_450k/probe_to_coord.sorted.bed"

# Tumor purity table: the first three rows of the file are skipped, then three
# columns are kept and given short snake_case names.
tumor_purity_raw = pd.read_csv(
    f"{data_dir}/2023_08_01_SRT_tcga_mced_discovery_MJG/data/tumor_purity/arun_2015.tsv",
    sep="\t",
    skiprows=3,
)
tumor_purity = (
    tumor_purity_raw[['Sample ID', 'Cancer type', 'ESTIMATE']]
    .rename(columns={'Sample ID': 'sample_id', 'Cancer type': 'cancer_code', 'ESTIMATE': 'purity'})
)

### Process regions

In [None]:
# pybedtools refuses a temp directory that does not exist, so create it first
# (a fresh checkout would otherwise fail on this cell).
pybedtools_tmpdir = "../pybedtools_tmp"
os.makedirs(pybedtools_tmpdir, exist_ok=True)
pybedtools.set_tempdir(pybedtools_tmpdir)

Merge clusters in Alan's identified regions

In [None]:
# Sort by chromosome and start, then merge overlapping intervals.
# c=5 / o="distinct" carries the distinct values of input column 5 onto each merged interval.
# NOTE(review): presumably column 5 of alan_hyper is the target cell type — confirm against the TSV header.
merged = pybedtools.BedTool.from_dataframe(alan_hyper.sort_values(['chr', 'start'])).merge(c=5, o="distinct")
hyper_merged_df = merged.to_dataframe()
# Rename to the column names used by the Loyfer table so the two region sets can be concatenated.
hyper_merged_df.columns = ['region_chr', 'region_start', 'region_end', 'target_celltype']
# region_id has the form "<target_celltype>_<chr>:<start>-<end>"
hyper_merged_df['region_id'] = hyper_merged_df['target_celltype'] + "_" + hyper_merged_df['region_chr'] + ":" + hyper_merged_df['region_start'].astype(str) + "-" + hyper_merged_df['region_end'].astype(str)
hyper_merged_df['direction'] = 'hyper'

Combine published hypo and Alan's merged hyper regions into a single data frame

In [None]:
# Published regions get direction 'hypo'; they are stacked with the merged
# hyper regions and position-sorted (chromosome, then start).
loyfer_simple = (
    loyfer
    .loc[:, ['region_chr', 'region_start', 'region_end', 'target_celltype', 'region_id']]
    .assign(direction='hypo')
)
regions = (
    pd.concat([loyfer_simple, hyper_merged_df], ignore_index=True)
    .sort_values(by=['region_chr', 'region_start'])
)

Merge probes and regions

In [None]:
# Intersect regions with the probe coordinates. wo=True reports both records plus the overlap size,
# i.e. the 6 region columns, 4 probe columns and 1 overlap column named below.
# sorted=True relies on both inputs being position-sorted: `regions` is sorted when it is built;
# NOTE(review): the probe BED is presumably sorted too (file name says ".sorted") — confirm.
intersect = pybedtools.BedTool.from_dataframe(regions).intersect(pybedtools.BedTool(cpgs_path), wo=True, sorted=True)
region_cpg_df = intersect.to_dataframe()
region_cpg_df.columns = ['region_chr', 'region_start', 'region_end', 'target_celltype', 'region_id', 'direction', 'cpg_chr', 'cpg_start', 'cpg_end', 'probe', 'overlap']
# Keep only the region annotation and the probe identifier; one row per (region, probe) pair.
region_cpg_df = region_cpg_df[['target_celltype', 'region_id', 'direction', 'probe']]

### Check proportion of regions that overlap with TCGA probes

In [None]:
# Flag each region with overlap=1 if at least one probe falls inside it.
subcols = ['target_celltype', 'region_id', 'direction']
probe_overlap_bool_df = (region_cpg_df[subcols]
                         .drop_duplicates()
                         .assign(overlap=1))
# Left-join back onto the full region list: regions without any probe come out of the merge
# with a missing `overlap`, which fillna(0) turns into 0 (fillna applies to every column).
probe_overlap_bool_df = regions[subcols].merge(probe_overlap_bool_df, on=subcols, how='left').fillna(0)

In [None]:
# Count regions per (cell type, direction) with and without an overlapping probe,
# then spread the two counts into side-by-side columns.
probe_overlap_summary = (probe_overlap_bool_df
                         .groupby(['target_celltype', 'direction', 'overlap'])
                         .size()
                         .reset_index(name='n_regions')
                         .pivot_table(index=['target_celltype', 'direction'], columns='overlap', values='n_regions', fill_value=0)
                         # guarantee both count columns exist even when every region (or none) overlaps a probe
                         .reindex(columns=[0, 1], fill_value=0)
                         .reset_index()
                         .rename(columns={0: 'no_overlap', 1: 'overlap'}))
probe_overlap_summary['n_regions'] = probe_overlap_summary['no_overlap'] + probe_overlap_summary['overlap']
probe_overlap_summary['perc_overlap'] = probe_overlap_summary['overlap'] / probe_overlap_summary['n_regions'] * 100
# remove target cell types with , or :
# (.copy() so the column assignment below acts on an independent frame, not a slice of the unfiltered one)
probe_overlap_summary = probe_overlap_summary[~probe_overlap_summary['target_celltype'].str.contains("[,:]")].copy()
# use '-' as the separator in every label (literal replacement, no regex needed)
probe_overlap_summary['target_celltype'] = probe_overlap_summary['target_celltype'].str.replace('_', '-', regex=False)

probe_overlap_summary

In [None]:
# Distribution, across target cell types, of the percentage of regions that
# contain at least one probe; one box per region direction.
alt.Chart(probe_overlap_summary).mark_boxplot().encode(
    alt.X("direction", title='Region type'),
    alt.Y("perc_overlap", title='% regions overlapping TCGA probes'),
    alt.Color("direction", legend=None),
).properties(height=200, width=100)

In [None]:
# Distribution, across target cell types, of the number of regions;
# one box per region direction.
alt.Chart(probe_overlap_summary).mark_boxplot().encode(
    alt.X("direction", title='Region type'),
    alt.Y("n_regions", title='Number of regions per tissue'),
    alt.Color("direction", legend=None),
).properties(height=200, width=100)

In [None]:
# Region and overlap counts for four cell types; labels use '-' because '_' was replaced when the summary was built.
probe_overlap_summary.query("target_celltype == ['Colon-Ep', 'Kidney-Ep', 'Liver-Hep', 'Thyroid-Ep']")[['target_celltype', 'direction', 'n_regions', 'overlap']]

### Check tumor purity distributions
Use this to decide a reasonable tumor purity cutoff for this analysis

In [None]:
# Purity estimate distribution per cancer code.
alt.Chart(tumor_purity).mark_boxplot().encode(x='cancer_code', y='purity')

In [None]:
# Purity cutoff used for the rest of the notebook (sample filter and output file names).
purity = 0.8
alt.Chart(tumor_purity.query(f"purity > {purity}")).mark_bar().encode(
    x=alt.X('cancer_code'),
    y=alt.Y("count()"),
).properties(
    # derive the title from the cutoff so the two cannot drift apart
    title=f"Number of samples with tumor purity > {purity}"
)

In [None]:
# Sample IDs above the purity cutoff; read_and_summarize_beta uses this list to filter tumor samples.
high_purity_samples = tumor_purity.query(f"purity > {purity}")['sample_id'].tolist()

### Summarize methylation over regions for each cancer type

In [None]:
def read_and_summarize_beta(filepath):
    """Summarize one beta-value table into a median beta per region and sample.

    The file name (minus the ``_beta_values.tsv.gz`` suffix) is split on ``_``
    into ``cancer_code`` and ``sample_type``. Tumor tables are restricted to the
    samples in the notebook-level ``high_purity_samples`` list. Probes are joined
    to regions through ``region_cpg_df`` and the median ``beta_value`` is taken
    per (target_celltype, region_id, direction, sample_id). The result is split
    by direction and written as two gzipped TSV files under ``output_dir``.

    Parameters
    ----------
    filepath : str
        Path to a ``<cancer_code>_<sample_type>_beta_values.tsv.gz`` file.

    Returns
    -------
    None
        Output is written to disk. Nothing is written when the summary is empty,
        and nothing is recomputed when both output files already exist.
    """
    cancer_code, sample_type = os.path.basename(filepath).replace("_beta_values.tsv.gz", "").split('_')
    outfiles = {
        direction: f"{output_dir}/{direction}_{sample_type}_{cancer_code}_purity{purity}_median_beta_per_region.tsv.gz"
        for direction in ['hyper', 'hypo']
    }
    # skip processing only if every output already exists; a run interrupted
    # between the two writes is redone instead of being treated as finished
    if all(os.path.isfile(outfile) for outfile in outfiles.values()):
        return

    print(f"Reading {cancer_code} {sample_type}")
    beta_df = pd.read_table(filepath)
    # the first 16 characters of the barcode are used as the sample ID
    # NOTE(review): presumably this matches the 'Sample ID' format of the purity table — confirm
    beta_df['sample_id'] = beta_df['sample_barcode'].str.slice(0, 16)
    if sample_type == 'tumor':
        beta_df = beta_df[beta_df['sample_id'].isin(high_purity_samples)]
    # merge() joins on the columns shared with region_cpg_df
    # NOTE(review): presumably only the probe identifier column is shared — confirm
    beta_summary_df = (beta_df
                       .merge(region_cpg_df)
                       .groupby(['target_celltype', 'region_id', 'direction', 'sample_id'])['beta_value'].median()
                       .reset_index())
    beta_summary_df['cancer_code'] = cancer_code
    beta_summary_df['sample_type'] = sample_type

    if beta_summary_df.empty:
        print(f"Nothing to write for {cancer_code} {sample_type}")
        return

    # split hyper and hypo
    print(f"Writing {cancer_code} {sample_type}")
    os.makedirs(output_dir, exist_ok=True)
    for direction, outfile in outfiles.items():
        beta_summary_df.query(f"direction == '{direction}'").to_csv(outfile, sep="\t", index=False)
    return
    

In [None]:
# # ran on r5a.8xlarge
# with Pool(12) as p:
#     p.map(read_and_summarize_beta, tcga_files)

Read in summaries

In [None]:
def read_summaries(prefix):
    """Load and stack every summary file in ``output_dir`` whose name starts with ``prefix``.

    Parameters
    ----------
    prefix : str
        File-name prefix, e.g. ``'hyper_tumor'``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files, with a fresh index.

    Raises
    ------
    ValueError
        If no file matches; the message names the pattern that was searched.
    """
    pattern = f"{output_dir}/{prefix}*"
    # sorted so the row order of the result does not depend on filesystem listing order
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        raise ValueError(f"No summary files found matching {pattern}")
    return pd.concat([pd.read_table(f) for f in file_list], ignore_index=True)

In [None]:
hyper_tumor = read_summaries('hyper_tumor')
hypo_tumor = read_summaries('hypo_tumor')
hyper_norm = read_summaries('hyper_norm')
hypo_norm = read_summaries('hypo_norm')

# Analysis

In [None]:
def mbd_theme(*args, **kwargs):
    """Return the chart theme used by this notebook as a config dict.

    Any positional or keyword arguments are accepted and ignored.

    Returns
    -------
    dict
        A ``{"config": {...}}`` mapping with font sizes for legends, axes,
        facet headers and chart titles.
    """
    return {
        "config": {
            # labelLimit 0 removes the truncation limit on legend labels
            "legend": {"titleFontSize": 12, "labelFontSize": 12, "labelLimit": 0},
            "axis": {"titleFontSize": 12, "labelFontSize": 12},
            "header": {"titleFontSize": 14, "labelFontSize": 14},
            # the title config's size key is `fontSize`; `titleFontSize` /
            # `labelFontSize` are not title properties and were silently ignored
            "title": {"fontSize": 30},
        }
    }


# Register the theme under a name and switch it on; charts rendered after this cell use it.
alt.themes.register("mbd_theme", mbd_theme)
alt.themes.enable("mbd_theme")

In [None]:
def plot_by_tissue(df, title, ncols):
    """Scatter of median beta value per cancer type, one facet per target cell type.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns cancer_type, beta_value, target_celltype and match.
    title : str
        Chart title.
    ncols : int
        Number of facet columns. When it equals 5 each facet gets its own x scale.

    Returns
    -------
    altair chart
    """
    chart = (
        alt.Chart(df)
        .mark_circle()
        .encode(
            x=alt.X("cancer_type", title=""),
            y=alt.Y("beta_value", title="Median beta value"),
            facet=alt.Facet('target_celltype', title="Target celltype", columns=ncols),
            color=alt.Color("match", legend=None),
        )
        .properties(width=200, height=200, title=title)
    )
    # ncols doubles as the switch: the default of 5 is used for the full plots,
    # which get independent x axes; the subset plots pass another value
    if ncols != 5:
        return chart
    return chart.resolve_scale(x='independent')

def make_full_and_filt_plots(df, title_prefix, ncols=5, max_offtarget_beta=0.3):
    """Build the 'all regions' and 'filtered regions' faceted plots for one summary table.

    Regions whose target label contains ',' or ':' are dropped. Beta values are
    collapsed to a median per (target_celltype, cancer_code, region_id). Each row
    is marked as a match when the cancer's tissue stem equals the stem of the
    target cell type. The filtered plot removes every region that has a
    non-matching row with a median beta above ``max_offtarget_beta``.

    NOTE(review): the filter flags *high* beta in non-matching cancers, which
    suits hyper regions; for hypo input the filtered plot is probably not
    meaningful (the subset cells in this notebook discard it).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns target_celltype, cancer_code, region_id and beta_value.
    title_prefix : str
        Prefix for both chart titles.
    ncols : int, default 5
        Number of facet columns, passed to ``plot_by_tissue``.
    max_offtarget_beta : float, default 0.3
        Median beta above which a non-matching row marks its region as bad.

    Returns
    -------
    tuple
        ``(all_regions_chart, filtered_regions_chart)``
    """
    # keep only regions assigned to a single cell type
    is_single = ~df['target_celltype'].str.contains(',|:')
    single = (df[is_single]
              .groupby(['target_celltype', 'cancer_code', 'region_id'])['beta_value'].median()
              .reset_index())
    single['cancer_type'] = single['cancer_code'].map(cancer_code_simple_dict)
    # stem = text before the first '_' or '-', then harmonised via the mapping dicts
    single['celltype_stem'] = (single['target_celltype']
                               .str.replace('_.*', '', regex=True)
                               .str.replace('-.*', '', regex=True)
                               .replace(tissue_map_dict))
    single['cancer_stem'] = (single['cancer_type']
                             .str.replace(' .*', '', regex=True)
                             .replace(cancer_stem_map_dict))
    single['match'] = single['cancer_stem'] == single['celltype_stem']
    p1 = plot_by_tissue(single, title=f'{title_prefix} - all regions', ncols=ncols)

    is_bad = ~single['match'] & (single['beta_value'] > max_offtarget_beta)
    bad_regions = single.loc[is_bad, 'region_id'].unique()
    single_filt = single[~single['region_id'].isin(bad_regions)]
    p2 = plot_by_tissue(single_filt, title=f'{title_prefix} - filtered regions', ncols=ncols)

    return p1, p2

In [None]:
# Suffix 1 = all regions, suffix 2 = filtered regions.
# hypo_tumor2 and hypo_norm2 are created here but not displayed in the cells that follow.
hyper_tumor1, hyper_tumor2 = make_full_and_filt_plots(hyper_tumor, 'Tumor: hyper')
hypo_tumor1, hypo_tumor2 = make_full_and_filt_plots(hypo_tumor, 'Tumor: hypo')
hyper_norm1, hyper_norm2 = make_full_and_filt_plots(hyper_norm, 'Normal: hyper')
hypo_norm1, hypo_norm2 = make_full_and_filt_plots(hypo_norm, 'Normal: hypo')

In [None]:
hyper_tumor1

In [None]:
hyper_tumor2

In [None]:
hypo_tumor1

In [None]:
hyper_norm1

In [None]:
hyper_norm2

In [None]:
hypo_norm1

### Make subsetted versions of the same plots

In [None]:
# Cell types kept in the subset plots. The two lists differ in separator:
# hyper labels are written with '_' and hypo labels with '-'.
# NOTE(review): presumably these match the labels of the respective region sources — confirm.
hyper_celltypes = ['Bladder_Ep', 'Breast_Basal_Ep', 'Breast_Luminal_Ep', 'Colon_Ep', 'Head_Neck_Ep', 'Kidney_Ep', 'Liver_Hep', 
                   'Lung_Ep_Alveo', 'Neuron_plus_Oligodend', 'Prostate_Ep', 'Small_Int_Ep', 'Thyroid_Ep']
hypo_celltypes = ['Bladder-Ep', 'Breast-Basal-Ep', 'Breast-Luminal-Ep', 'Colon-Ep', 'Kidney-Ep', 'Liver-Hep', 
                  'Lung-Ep-Alveo', 'Neuron', 'Oligodend', 'Pancreas-Acinar', 'Prostate-Ep', 'Thyroid-Ep']

In [None]:
# Same plots restricted to the selected cell types, laid out in 6 columns.
# For hypo regions the filtered plot is discarded (assigned to `_`).
hyper_tumor1filt, hyper_tumor2filt = make_full_and_filt_plots(hyper_tumor.query(f"target_celltype == {hyper_celltypes}"), 'Tumor: hyper', ncols=6)
hypo_tumor1filt, _ = make_full_and_filt_plots(hypo_tumor.query(f"target_celltype == {hypo_celltypes}"), 'Tumor: hypo', ncols=6)
hyper_norm1filt, hyper_norm2filt = make_full_and_filt_plots(hyper_norm.query(f"target_celltype == {hyper_celltypes}"), 'Normal: hyper', ncols=6)
hypo_norm1filt, _ = make_full_and_filt_plots(hypo_norm.query(f"target_celltype == {hypo_celltypes}"), 'Normal: hypo', ncols=6)

In [None]:
hyper_tumor1filt

In [None]:
hyper_tumor2filt

In [None]:
hyper_norm1filt

In [None]:
hyper_norm2filt

In [None]:
hypo_tumor1filt

In [None]:
hypo_norm1filt

### Make clean versions of the plots for overview presentation

In [None]:
def plot_by_tissue2(df, title, ytitle, ncols):
    """Presentation variant of the per-tissue scatter with simplified facet labels.

    Each target cell-type label is cut at its first '_' or '-' and suffixed with
    ' regions' before faceting; the input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns cancer_type, beta_value, target_celltype and match.
    title : str
        Chart title.
    ytitle : str
        Y-axis title.
    ncols : int
        Number of facet columns. When it equals 5 each facet gets its own x scale.

    Returns
    -------
    altair chart
    """
    tissue = df['target_celltype'].str.replace('_.*', '', regex=True).str.replace('-.*', '', regex=True)
    labelled = df.assign(target_celltype=tissue + ' regions')
    chart = (
        alt.Chart(labelled)
        .mark_circle()
        .encode(
            x=alt.X("cancer_type", title=""),
            y=alt.Y("beta_value", title=ytitle),
            # Colon and Liver facets come first
            facet=alt.Facet('target_celltype', title="", columns=ncols, sort=['Colon regions', 'Liver regions']),
            color=alt.Color("match", legend=None),
        )
        .properties(width=200, height=150, title=title)
    )
    # ncols doubles as the switch for independent x axes (only for the value 5)
    if ncols != 5:
        return chart
    return chart.resolve_scale(x='independent')

def make_full_and_filt_plots2(df, title_prefix, ncols=5, hypo=False, max_offtarget_beta=0.3):
    """Build presentation-ready 'all regions' and 'filtered regions' plots for four cell types.

    Only the Colon, Kidney, Liver and Thyroid targets are kept. Beta values are
    collapsed to a median per (target_celltype, cancer_code, region_id). With
    ``hypo=True`` the medians are flipped to ``1 - beta`` so that a high value
    means unmethylated. The filtered plot removes every region that has a
    non-matching row with a (possibly flipped) value above ``max_offtarget_beta``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns target_celltype, cancer_code, region_id and beta_value.
    title_prefix : str
        Prefix for both chart titles.
    ncols : int, default 5
        Number of facet columns, passed to ``plot_by_tissue2``.
    hypo : bool, default False
        Selects the '-'-separated target labels and the flipped y values.
    max_offtarget_beta : float, default 0.3
        Value above which a non-matching row marks its region as bad.

    Returns
    -------
    tuple
        ``(all_regions_chart, filtered_regions_chart)``
    """
    # for these cleaned up plots, limit to specific cell types
    if hypo:
        targets_single = ['Colon-Ep', 'Kidney-Ep', 'Liver-Hep', 'Thyroid-Ep']
        ytitle = 'Median UNmethylation rate'
    else:
        targets_single = ['Colon_Ep', 'Kidney_Ep', 'Liver_Hep', 'Thyroid_Ep']
        ytitle = 'Median methylation rate'
    single = (df[df['target_celltype'].isin(targets_single)]
              .groupby(['target_celltype', 'cancer_code', 'region_id'])['beta_value'].median()
              .reset_index())
    if hypo:
        single['beta_value'] = 1 - single['beta_value']
    single['cancer_type'] = single['cancer_code'].map(cancer_code_simple_dict)
    # stem = text before the first '_' or '-', then harmonised via the mapping dicts
    single['celltype_stem'] = (single['target_celltype']
                               .str.replace('_.*', '', regex=True)
                               .str.replace('-.*', '', regex=True)
                               .replace(tissue_map_dict))
    single['cancer_stem'] = (single['cancer_type']
                             .str.replace(' .*', '', regex=True)
                             .replace(cancer_stem_map_dict))
    single['match'] = single['cancer_stem'] == single['celltype_stem']
    p1 = plot_by_tissue2(single, title=f'{title_prefix} - all regions', ytitle=ytitle, ncols=ncols)

    is_bad = ~single['match'] & (single['beta_value'] > max_offtarget_beta)
    bad_regions = single.loc[is_bad, 'region_id'].unique()
    single_filt = single[~single['region_id'].isin(bad_regions)]
    p2 = plot_by_tissue2(single_filt, title=f'{title_prefix} - filtered regions', ytitle=ytitle, ncols=ncols)

    return p1, p2

In [None]:
# NOTE(review): these names are also assigned by the earlier subset-plot cell, so running this cell
# overwrites those charts; what a display cell shows depends on which of the two cells ran last.
hyper_tumor1filt, hyper_tumor2filt = make_full_and_filt_plots2(hyper_tumor, 'Tumor: hyper')
hypo_tumor1filt, _ = make_full_and_filt_plots2(hypo_tumor, 'Tumor: hypo',  hypo=True)
hyper_norm1filt, hyper_norm2filt = make_full_and_filt_plots2(hyper_norm, 'Normal: hyper')
hypo_norm1filt, _ = make_full_and_filt_plots2(hypo_norm, 'Normal: hypo', hypo=True)

In [None]:
hyper_tumor1filt

In [None]:
hyper_tumor2filt

In [None]:
hypo_tumor1filt

In [None]:
hyper_norm1filt

In [None]:
hyper_norm2filt

In [None]:
hypo_norm1filt