#### Select hyper DMRs for tissue cell types

### Initialization

In [None]:
import os
import numpy as np
import pandas as pd
import glob
import pybedtools

import altair as alt
# Use Altair's default data transformer with the row limit removed (max_rows=None).
alt.data_transformers.enable('default', max_rows=None)

# Raise the pandas display limits so wide/long frames are shown in full.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500

In [None]:
def mbd_theme(*args, **kwargs):
    """
    Build the Altair theme dictionary used by this notebook.

    Any positional or keyword arguments are accepted and ignored, so the
    function can be registered as a theme callable.

    Returns:
        dict: ``{"config": {...}}`` with font-size settings for the legend,
        axis, header and title sections (legend additionally sets
        ``labelLimit`` to 0).
    """
    def _font_sizes(size):
        # Same size for both the title font and the label font of a section.
        return {"titleFontSize": size, "labelFontSize": size}

    legend_cfg = _font_sizes(12)
    legend_cfg["labelLimit"] = 0

    # NOTE(review): the "title" section reuses the titleFontSize/labelFontSize
    # keys; confirm these are the keys Vega-Lite honours for chart titles.
    config = {
        "legend": legend_cfg,
        "axis": _font_sizes(12),
        "header": _font_sizes(14),
        "title": _font_sizes(30),
    }
    return {"config": config}


# Register the theme callable under the name "mbd_theme" and make it the
# active Altair theme.
alt.themes.register("mbd_theme", mbd_theme)
alt.themes.enable("mbd_theme")

### Parameters and file paths

In [None]:
# --- Parameters
# Name of the region set; it is interpolated into METH_DIR below.
REGIONS = 'hg19_cpg_clusters_k3_s150_w150'

# Minimum `region_number_total` a (sample, region) row needs to be kept
# by get_meth_data.
FILTER_COV = 10

BED_COLS = ['region_chr', 'region_start', 'region_end', 'region_id']

# --- Local paths
ROOT_DIR = '/home/ubuntu/git/etsang/projects'
PROJECT_SLUG = '2023_10_10_SRT_hyper_tissue_dmr_selection_EKT'
PROJECT_DIR = ROOT_DIR + '/' + PROJECT_SLUG + '/work'

ALAN_PROJECT_DIR = "/home/ubuntu/data/2023_07_10_HYPER_design_AS"

# Samples
SAMPLES_PATH = f"{ALAN_PROJECT_DIR}/stage/metadata/loyfer2022_samples_with_blueprint.tsv"

# Regions
REGIONS_PATH = f"{PROJECT_DIR}/hyper_dmr_selected"

# Methylation data
METH_DIR = f"{PROJECT_DIR}/bp_loyfer_meth_summaries/standard-{REGIONS}.filtered"

# Where to store graphics (note the trailing slash: file names are appended directly)
RESULTS_PATH = f"{PROJECT_DIR}/plots/"


In [None]:
def get_meth_data(regions, samples_df, raw_meth_df, min_cov=None, max_missing_frac=0.1):
    """
    Build a samples x regions methylation matrix plus aligned sample metadata.

    Rows of ``raw_meth_df`` are kept only when the sample is listed in
    ``samples_df``, the region is listed in ``regions`` and
    ``region_number_total`` is at least ``min_cov``. The surviving rows are
    pivoted on ``region_meth_rate`` (duplicate sample/region pairs are
    averaged by ``pivot_table``). Samples with too many missing regions are
    then dropped; regions that still contain missing values are kept as NaN.

    Args:
        regions (pandas.DataFrame): Regions of interest; must have a
            ``region_id`` column.
        samples_df (pandas.DataFrame): Sample metadata; must have the columns
            ``sample_id``, ``germ_layer``, ``group``, ``sample_group`` and
            ``cell_type``.
        raw_meth_df (pandas.DataFrame): Long-format methylation data with the
            columns ``sample_id``, ``region_id``, ``region_number_total`` and
            ``region_meth_rate``.
        min_cov (number, optional): Minimum ``region_number_total`` for a row
            to be used. Defaults to the module-level ``FILTER_COV``.
        max_missing_frac (float, optional): A sample is kept only if its
            number of missing regions is strictly below this fraction of the
            pivoted regions. Defaults to 0.1.

    Returns:
        tuple: ``(meth_df, meta_df)`` where
            - meth_df (pandas.DataFrame): methylation rates with one row per
              sample and one column per region; rows follow the order of
              ``meta_df``.
            - meta_df (pandas.DataFrame): metadata of the retained samples,
              sorted by ``sample_group``.
    """
    if min_cov is None:
        # Keep the historical behaviour: fall back to the notebook-wide default.
        min_cov = FILTER_COV

    keep_rows = (
        raw_meth_df['sample_id'].isin(samples_df['sample_id'])
        & raw_meth_df['region_id'].isin(regions['region_id'])
        & (raw_meth_df['region_number_total'] >= min_cov)
    )
    # regions as rows, samples as columns (transposed before returning)
    meth_df = raw_meth_df[keep_rows].pivot_table(
        index='region_id', columns='sample_id', values='region_meth_rate'
    )

    # Remove samples with many missing values
    n_missing = meth_df.isna().sum(axis=0)
    keep_samples = n_missing < (max_missing_frac * meth_df.shape[0])
    meth_df = meth_df.loc[:, keep_samples].copy()

    # Align with sample metadata: restrict to retained samples and order the
    # matrix rows exactly like the (sorted) metadata rows.
    meta_cols = ['sample_id', 'germ_layer', 'group', 'sample_group', 'cell_type']
    in_matrix = samples_df['sample_id'].isin(meth_df.columns)
    meta_df = samples_df.loc[in_matrix, meta_cols].sort_values('sample_group')
    meth_df = meth_df.loc[:, meta_df['sample_id']].transpose()

    return meth_df, meta_df

### Sample Metadata

In [None]:
samples_df = pd.read_csv(SAMPLES_PATH, sep='\t')

# Derive `sample_group` from `super_group` by replacing '-' with '_' and
# '+' with '_plus_' (literal replacements, no regex).
dashes_replaced = samples_df['super_group'].str.replace('-', '_', regex=False)
samples_df['sample_group'] = dashes_replaced.str.replace('+', '_plus_', regex=False)

# Row filters: require a group label, exclude groups whose label starts with
# 'Blueprint', and drop umbilical endothelium.
has_group = samples_df['sample_group'].notna()
is_blueprint = samples_df['sample_group'].str.startswith('Blueprint', na=False)
is_umbilical_endothelium = samples_df['super_group'] == 'Umbilical-Endothelium'

samples_df = samples_df[has_group & ~is_blueprint & ~is_umbilical_endothelium].copy()

### Region Data

In [None]:
# Load the three selected-region tables (tab-separated) from REGIONS_PATH.
tcga_filt_clusters = pd.read_csv(
    f"{REGIONS_PATH}/tcga_filtered_regions_clusters.tsv", sep='\t'
)
non_tcga_clusters = pd.read_csv(
    f"{REGIONS_PATH}/non_tcga_filtered_regions_0or1_exceptions_clusters.tsv", sep='\t'
)
top20_clusters = pd.read_csv(
    f"{REGIONS_PATH}/top20_regions_per_celltype_clusters.tsv", sep='\t'
)

### Methylation Data

In [None]:
%%time
# Collect the per-file methylation summaries. Sorting makes the row order of
# the concatenated frame reproducible (glob returns files in arbitrary order).
meth_files = sorted(glob.glob(METH_DIR + '/*.csv'))
if not meth_files:
    # Without this check pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No methylation summary files (*.csv) found in {METH_DIR}"
    )

# Files are read as tab-separated; column names are supplied here.
# NOTE(review): passing `names` without `header` makes pandas treat the first
# line as data - confirm the files really have no header row.
raw_meth_df = pd.concat([
    pd.read_csv(ifile, sep='\t',
                names=['sample_id', 'region_id',
                       'region_number_total',
                       'region_meth_rate'])
    for ifile in meth_files
])

## Plot selected regions

In [None]:
%load_ext rpy2.ipython

In [None]:
%%time
# Choose ONE region set to plot together with its plot title; the other
# options are kept commented out for quick switching.
# region_df, title = top20_clusters, 'Top 20 regions per cell type'
# region_df, title = non_tcga_clusters, 'Regions not filtered by TCGA'
region_df, title = tcga_filt_clusters, 'TCGA filtered regions'

meth_df, meta_df = get_meth_data(region_df, samples_df, raw_meth_df)

# Heatmap column order: by target cell type (ascending), then by
# meth_base_min (descending); each region_id appears once.
sort_keys = ['target_celltype', 'meth_base_min']
regions_ordered = (
    region_df
    .sort_values(by=sort_keys, ascending=[True, False])['region_id']
    .drop_duplicates()
)

# NOTE(review): .loc raises KeyError if a selected region is absent from
# meth_df (e.g. no sample passed the coverage filter) - confirm this
# fail-fast behaviour is wanted.
meth_df = meth_df.loc[:, regions_ordered]
meth_df.shape

In [None]:
%%R -i meth_df,meta_df,RESULTS_PATH,title

library(tidyverse)
library(pheatmap)
library(pals)

# Draw a pheatmap result into a PDF file.
#
# Args:
#   x:        object whose `gtable` element is drawn with grid::grid.draw()
#             (here: the value returned by pheatmap()).
#   filename: path of the PDF file to write.
#   width, height: page size passed on to pdf().
#
# Returns `x` invisibly. The PDF device is closed when the function exits,
# including when drawing fails, so an error cannot leave the device open.
save_pheatmap_pdf <- function(x, filename, width = 7, height = 10) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  pdf(filename, width = width, height = height)
  # Registered right after the device is opened so it is always closed.
  on.exit(dev.off(), add = TRUE)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  invisible(x)
}

# Row annotation: one row per sample, labelled by sample_id so it can be
# matched against the row names of meth_df.
row_annot <- data.frame(sample_group = meta_df$sample_group)
rownames(row_annot) <- meta_df$sample_id

# One colour per distinct sample group, taken in order from the combined
# pals::stepped() / pals::stepped2() palettes.
# NOTE(review): if there are more groups than palette colours the extra
# groups get NA colours - confirm the palette is large enough.
sample_groups <- unique(meta_df$sample_group)
sample_group_colors <- c(stepped(), stepped2())[seq_along(sample_groups)]
names(sample_group_colors) <- sample_groups
row_colors <- list(sample_group = sample_group_colors)

# Rows and columns are drawn in the order given (no clustering); missing
# values are shown in black.
p <- pheatmap(
  meth_df,
  cluster_rows = FALSE, show_rownames = FALSE,
  cluster_cols = FALSE, show_colnames = FALSE,
  annotation_row = row_annot, annotation_colors = row_colors,
  annotation_names_row = FALSE,
  scale = "none", na_col = "black", main = title
)

# File-name-friendly version of the title: lower case, spaces -> underscores.
title_simple <- gsub(" ", "_", tolower(title))

# `filename` is spelled out in full rather than relying on partial matching
# of `file`.
save_pheatmap_pdf(
  p,
  filename = paste0(RESULTS_PATH, "selected__", title_simple, ".pdf")
)
