#### Select hyper DMRs for tissue cell types

### Initialization

In [1]:
import io
import subprocess
import os
import glob
import sys
import pwd

import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA
import pickle
import re
import math
import plotnine as pn
import seaborn as sns
import glob
import subprocess
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from numpy import log
from collections import defaultdict,Counter
import re
import statsmodels
import warnings
import statsmodels.stats.multitest as smm
import json

from pathlib import Path
from midas import m
from datetime import datetime, date
import altair as alt

# gh specific
sys.path.append('/ghsfa/projects/pharma/shared_scripts/')
import bibp.functions as functions
import hgvslib.pHGVS as pHGVS  ## do a pull on the hgvslib git repo

# !pip install statannotations==0.4.0
# #!pip uninstall statannotations --yes
# from statannotations.Annotator import Annotator

# Notebook-wide settings: silence all warnings and let pandas print
# DataFrames without truncating rows, columns or cell contents.
warnings.filterwarnings("ignore")
for _display_option in ("max_rows", "max_columns", "max_colwidth"):
    pd.set_option(f"display.{_display_option}", None)

# Root directory that every project path below is derived from.
work_dir = "/ghsfa/projects/pharma/projects/sirius_pharma/hazhang_projects/Hyper_lung_tissue_selection_Emily_10312024"

In [2]:
import os
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import pybedtools

import altair as alt
# Use Altair's default data transformer with the row limit switched off.
alt.data_transformers.enable('default', max_rows=None)

# Cap the pandas display size and use seaborn's 'talk' plotting context.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
sns.set_context('talk')

In [3]:
def mbd_theme(*args, **kwargs):
    """Build the Altair theme dictionary registered as ``mbd_theme``.

    Any positional or keyword arguments are accepted and ignored.

    Returns:
        dict: A ``{"config": ...}`` mapping with font-size settings for the
        legend, axis, header and title sections.
    """
    small, medium, large = 12, 14, 30

    legend_cfg = {"titleFontSize": small, "labelFontSize": small, "labelLimit": 0}
    axis_cfg = {"titleFontSize": small, "labelFontSize": small}
    header_cfg = {"titleFontSize": medium, "labelFontSize": medium}
    title_cfg = {"titleFontSize": large, "labelFontSize": large}

    config = {
        "legend": legend_cfg,
        "axis": axis_cfg,
        "header": header_cfg,
        "title": title_cfg,
    }
    return {"config": config}


alt.themes.register("mbd_theme", mbd_theme)
alt.themes.enable("mbd_theme")

ThemeRegistry.enable('mbd_theme')

### Parameters and file paths

In [4]:
# --- Parameters
# Name of the CpG-cluster region set; it is embedded in every data path below.
REGIONS = 'hg19_cpg_clusters_k3_s150_w150'

FILTER_COV = 10            # minimum region_number_total for a sample/region value to be used
FILTER_RATE_TARGET = 0.6   # minimum meth_base (target cell type)
FILTER_RATE_OTHER = 0.5    # maximum meth_other (compared cell type)
# added this instead of using the filter rate other; negative because looking
# for hypermethylated regions (meth_other = meth_base + meth_delta)
FILTER_RATE_DELTA = -0.2
FILTER_FDR = 5e-2          # note: already applied this filter to output generated upstream
FILTER_BROAD_FDR = 5e-2
FILTER_MIN_CPGs_REGION = 10

BED_COLS = ['region_chr', 'region_start', 'region_end', 'region_id']
BLOOD_CELLTYPES = [
    'Blood_B',
    'Blood_Granul',
    'Blood_Mega_plus_Eryth',
    'Blood_Mono_plus_Macro',
    'Blood_NK',
    'Blood_T',
]
# note umbilical endothelium is already being excluded before explicit
# filtering on this variable
REMOVE_CELLTYPES = ['Eryth_prog', 'Umbilical_Endothelium']

# --- Local paths
# Earlier local layout, kept for reference:
# ROOT_DIR = '/home/ubuntu/git/etsang/projects'
# PROJECT_SLUG = '2023_10_10_SRT_hyper_tissue_dmr_selection_EKT'
# PROJECT_DIR = f"{ROOT_DIR}/{PROJECT_SLUG}/work"
# ALAN_PROJECT_DIR = "/home/ubuntu/data/2023_07_10_HYPER_design_AS"
PROJECT_DIR = f"{work_dir}/2024_11_01_SRT_hyper_tissue_dmr_selection_EKT"
ALAN_PROJECT_DIR = f"{work_dir}/2023_07_10_HYPER_design_AS"

# Samples
SAMPLES_PATH = f"{ALAN_PROJECT_DIR}/stage/metadata/loyfer2022_samples_with_blueprint.tsv"
# Units/regions file
REGION_PATH = f"{ALAN_PROJECT_DIR}/stage/metadata/{REGIONS}.ov_cpg_loci.tsv.gz"
# Methylation data
METH_DIR = f"{PROJECT_DIR}/bp_loyfer_meth_summaries/standard-{REGIONS}.filtered"
# Differential methylation data (glob pattern, one file per cell-type pair)
DIFFMETH_PATH = f"{PROJECT_DIR}/diff_meth/standard_diffmeth_{REGIONS}_*_minus_*.tsv.gz"

# Where to store graphics
RESULTS_PATH = f"{PROJECT_DIR}/plots/"


In [9]:
def get_meth_data(regions, samples_df, raw_meth_df, min_cov=None):
    """
    Extracts a samples x regions methylation matrix for a set of regions and samples.

    Rows of ``raw_meth_df`` are kept when their sample is in ``samples_df``, their
    region is in ``regions`` and ``region_number_total`` is at least ``min_cov``.
    The kept rows are pivoted to a region x sample table (duplicate
    region/sample pairs are averaged by ``pivot_table``), samples whose number of
    missing regions is not below 10% of the regions are dropped, and the result is
    transposed so that rows are samples ordered like ``meta_df``.

    Args:
        regions (pandas.DataFrame): Regions of interest; needs a ``region_id`` column.
        samples_df (pandas.DataFrame): Sample metadata; needs ``sample_id``,
            ``germ_layer``, ``group``, ``sample_group`` and ``cell_type`` columns.
        raw_meth_df (pandas.DataFrame): Long-format methylation data with
            ``sample_id``, ``region_id``, ``region_number_total`` and
            ``region_meth_rate`` columns.
        min_cov (int, optional): Minimum ``region_number_total`` required for a
            value to be used. Defaults to the module-level ``FILTER_COV``.

    Returns:
        tuple: A tuple containing two dataframes:
            - meth_df (pandas.DataFrame): Methylation rates, one row per sample and
              one column per region. Regions are NOT dropped for missing values, so
              NaNs can remain.
            - meta_df (pandas.DataFrame): Metadata for the samples in meth_df,
              sorted by ``sample_group``; row order matches meth_df.
    """
    if min_cov is None:
        min_cov = FILTER_COV

    keep_rows = (
        raw_meth_df['sample_id'].isin(samples_df['sample_id'])
        & raw_meth_df['region_id'].isin(regions['region_id'])
        & (raw_meth_df['region_number_total'] >= min_cov)
    )
    region_by_sample = raw_meth_df[keep_rows].pivot_table(
        index='region_id', columns='sample_id', values='region_meth_rate')

    # Remove samples with many missing values (10% of regions or more).
    n_missing = region_by_sample.isna().sum(axis=0)
    well_covered = n_missing < (0.1 * region_by_sample.shape[0])
    region_by_sample = region_by_sample.loc[:, well_covered].copy()

    # Align with sample metadata: keep only surviving samples, grouped by
    # sample_group, and reorder the matrix columns to match.
    has_data = samples_df['sample_id'].isin(region_by_sample.columns)
    meta_columns = ['sample_id', 'germ_layer', 'group', 'sample_group', 'cell_type']
    meta_df = samples_df[has_data][meta_columns].sort_values('sample_group')
    meth_df = region_by_sample.loc[:, meta_df['sample_id']].transpose()

    return (meth_df, meta_df)

### Sample Metadata

In [7]:
# Load the sample sheet and derive an identifier-friendly group label
# ('-' -> '_', '+' -> '_plus_') from super_group.
samples_df = pd.read_csv(SAMPLES_PATH, sep='\t')
_group_label = samples_df['super_group'].str.replace('-', '_', regex=False)
samples_df['sample_group'] = _group_label.str.replace('+', '_plus_', regex=False)

# Drop samples without a group and the Blueprint groups ...
_no_group = samples_df['sample_group'].isna()
_is_blueprint = samples_df['sample_group'].str.startswith('Blueprint')
ridxs = ~(_no_group | _is_blueprint)
# ... and drop umbilical endothelium
_not_umbilical = samples_df['super_group'] != 'Umbilical-Endothelium'
ridxs = ridxs & _not_umbilical
samples_df = samples_df[ridxs].copy()

In [8]:
summary = samples_df\
    .fillna('none')\
    .groupby(['group', 'refined_group', 'cell_type'])\
    .size()
summary

group              refined_group      cell_type                                                     
Adipocytes         Adipocytes         Adipocytes                                                         3
Bladder-Ep         Bladder-Ep         Epithelium                                                         5
Blood-B            Blood-B            B cells                                                            3
                                      Memory B cells                                                     2
Blood-Granul       Blood-Granul       Granulocytes                                                       3
Blood-Mega+Eryth   Blood-Mega+Eryth   CD34-negative, CD41-positive, CD42-positive megakaryocyte cell     2
                                      erythroblast                                                       2
Blood-Mono+Macro   Blood-Mono+Macro   Macrophages                                                        8
                                      Monoc

In [10]:
# All sample groups present after filtering, and the subset that is not blood.
# NOTE(review): REMOVE_CELLTYPES is not applied here, so 'Eryth_prog' stays
# in NONBLOOD_CELLTYPES — confirm this is intended.
SAMPLE_GROUPS = sorted(samples_df['sample_group'].unique())
_blood_lookup = set(BLOOD_CELLTYPES)
NONBLOOD_CELLTYPES = [group for group in SAMPLE_GROUPS if group not in _blood_lookup]
(len(SAMPLE_GROUPS), len(BLOOD_CELLTYPES), len(NONBLOOD_CELLTYPES))

(30, 6, 24)

In [32]:
NONBLOOD_CELLTYPES

['Adipocytes',
 'Bladder_Ep',
 'Breast_Basal_Ep',
 'Breast_Luminal_Ep',
 'Colon_Ep',
 'Endothelium',
 'Eryth_prog',
 'Esophagus_Ep',
 'Fibro_Musc',
 'Gastric_Ep',
 'Head_Neck_Ep',
 'Heart_Cardio',
 'Kidney_Ep',
 'Liver_Hep',
 'Lung_Ep_Alveo',
 'Lung_Ep_Bron',
 'Neuron_plus_Oligodend',
 'Ovary_Ep',
 'Pancreas',
 'Pancreas_Acinar',
 'Pancreas_Duct',
 'Prostate_Ep',
 'Small_Int_Ep',
 'Thyroid_Ep']

### Methylation Data

In [12]:
%%time
def _read_meth_summary(path):
    """Read one header-less, tab-separated methylation summary file."""
    columns = ['sample_id', 'region_id', 'region_number_total', 'region_meth_rate']
    return pd.read_csv(path, sep='\t', names=columns)


# Stack every per-file summary found in METH_DIR into one long table.
_meth_files = glob.glob(METH_DIR+'/*.csv')
raw_meth_df = pd.concat([_read_meth_summary(ifile) for ifile in _meth_files])

CPU times: user 2min 55s, sys: 19.7 s, total: 3min 15s
Wall time: 3min 24s


In [13]:
%%time
def _read_diffmeth(path):
    """Read the needed columns of one differential-methylation file and tag rows with its base name."""
    wanted = ['region_id', 'fdr', 'meth_base', 'meth_delta']
    table = pd.read_csv(path, sep='\t', usecols=wanted)
    return table.assign(source=os.path.basename(path))


# One file per cell-type pair; `source` is later parsed into ct1/ct2.
_diffmeth_files = glob.glob(DIFFMETH_PATH)
diffmeth_df = pd.concat([_read_diffmeth(ifile) for ifile in _diffmeth_files])

CPU times: user 2min 1s, sys: 6.47 s, total: 2min 8s
Wall time: 2min 11s


In [14]:
def extract_ct1(filename, regions=None):
    """
    Extracts the cell type that follows ``_minus_`` in a differential methylation filename.

    File names have the form ``standard_diffmeth_<regions>_<A>_minus_<B>.tsv.gz``.
    This returns ``<B>``, i.e. the cell type written SECOND in the file name;
    the notebook stores it in the ``ct1`` column.

    Args:
        filename (str): Base name of the differential methylation file.
        regions (str, optional): Region-set label embedded in the file name.
            Defaults to the module-level ``REGIONS``.

    Returns:
        str: The cell type name found after ``_minus_``.

    Raises:
        IndexError: If the name does not contain ``_minus_``.
    """
    if regions is None:
        regions = REGIONS
    # Strip the fixed prefix and extension, leaving "<A>_minus_<B>".
    pair = filename.replace('standard_diffmeth_%s_' % regions, '').replace('.tsv.gz', '')
    return pair.split('_minus_')[1]


def extract_ct2(filename, regions=None):
    """
    Extracts the cell type that precedes ``_minus_`` in a differential methylation filename.

    File names have the form ``standard_diffmeth_<regions>_<A>_minus_<B>.tsv.gz``.
    This returns ``<A>``, i.e. the cell type written FIRST in the file name;
    the notebook stores it in the ``ct2`` column.

    Args:
        filename (str): Base name of the differential methylation file.
        regions (str, optional): Region-set label embedded in the file name.
            Defaults to the module-level ``REGIONS``.

    Returns:
        str: The cell type name found before ``_minus_`` (the whole stripped
        name if ``_minus_`` is absent).
    """
    if regions is None:
        regions = REGIONS
    # Strip the fixed prefix and extension, leaving "<A>_minus_<B>".
    pair = filename.replace('standard_diffmeth_%s_' % regions, '').replace('.tsv.gz', '')
    return pair.split('_minus_')[0]

# Parse each distinct file name once, then broadcast the parsed cell types
# onto the rows through a lookup (cheaper than parsing per row).
sources = list(diffmeth_df['source'].unique())
SOURCE_TO_CT1_MAP = {source: extract_ct1(source) for source in sources}
SOURCE_TO_CT2_MAP = {source: extract_ct2(source) for source in sources}
for _column, _lookup in (('ct1', SOURCE_TO_CT1_MAP), ('ct2', SOURCE_TO_CT2_MAP)):
    diffmeth_df[_column] = diffmeth_df['source'].map(_lookup)

In [15]:
sorted(diffmeth_df['ct1'].unique())

['Adipocytes',
 'Bladder_Ep',
 'Blood_B',
 'Blood_Granul',
 'Blood_Mono_plus_Macro',
 'Blood_NK',
 'Blood_T',
 'Breast_Basal_Ep',
 'Breast_Luminal_Ep',
 'Colon_Ep',
 'Endothelium',
 'Eryth_prog',
 'Esophagus_Ep',
 'Fibro_Musc',
 'Gastric_Ep',
 'Head_Neck_Ep',
 'Heart_Cardio',
 'Kidney_Ep',
 'Liver_Hep',
 'Lung_Ep_Alveo',
 'Lung_Ep_Bron',
 'Neuron_plus_Oligodend',
 'Ovary_Ep',
 'Pancreas',
 'Pancreas_Acinar',
 'Pancreas_Duct',
 'Prostate_Ep',
 'Small_Int_Ep']

In [16]:
sorted(diffmeth_df['ct2'].unique())

['Bladder_Ep',
 'Blood_B',
 'Blood_Granul',
 'Blood_Mono_plus_Macro',
 'Blood_NK',
 'Blood_T',
 'Breast_Basal_Ep',
 'Breast_Luminal_Ep',
 'Colon_Ep',
 'Endothelium',
 'Eryth_prog',
 'Esophagus_Ep',
 'Fibro_Musc',
 'Gastric_Ep',
 'Head_Neck_Ep',
 'Heart_Cardio',
 'Kidney_Ep',
 'Liver_Hep',
 'Lung_Ep_Alveo',
 'Lung_Ep_Bron',
 'Neuron_plus_Oligodend',
 'Ovary_Ep',
 'Pancreas',
 'Pancreas_Acinar',
 'Pancreas_Duct',
 'Prostate_Ep',
 'Small_Int_Ep',
 'Thyroid_Ep']

##### add reversed tests 

In [17]:
%%time
# Build the mirror image of every pairwise test so each pair is available
# with either cell type in the ct1 role: swap the ct1/ct2 labels ...
add_df = diffmeth_df\
    .rename(columns={'ct1': 'ct2', 'ct2': 'ct1'})
# ... move the baseline to the other cell type (base + delta). This must
# happen BEFORE the delta is negated on the next line.
add_df['meth_base'] = add_df['meth_base']+add_df['meth_delta']
# ... and flip the sign of the difference.
add_df['meth_delta'] = -add_df['meth_delta']
# Original and mirrored rows are stacked (columns are matched by name;
# the row index is not reset).
diffmeth_df = pd.concat([diffmeth_df, add_df])
# Methylation of the compared (ct2) cell type for every row.
diffmeth_df['meth_other'] = diffmeth_df['meth_base']+diffmeth_df['meth_delta']


CPU times: user 8.48 s, sys: 6.05 s, total: 14.5 s
Wall time: 14 s


In [18]:
# HYPER criteria: high in the target (meth_base), low in the compared cell
# type (meth_other), a sufficiently negative delta, and significant.
_target_is_high = diffmeth_df['meth_base'] >= FILTER_RATE_TARGET
_other_is_low = diffmeth_df['meth_other'] <= FILTER_RATE_OTHER
_delta_is_large = diffmeth_df['meth_delta'] <= FILTER_RATE_DELTA
_is_significant = diffmeth_df['fdr'] <= FILTER_FDR
ridxs = _target_is_high & _other_is_low & _delta_is_large & _is_significant
diffmeth_df_filtered = diffmeth_df[ridxs].copy()

In [19]:
print(f"Number of regions/celltype pairs before filtering: {diffmeth_df.shape[0]}\nAfter filtering: {diffmeth_df_filtered.shape[0]}")

Number of regions/celltype pairs before filtering: 102844354
After filtering: 20138651


### Add CpG count and filter

Only keep regions with FILTER_MIN_CPGs_REGION or more CpGs

In [20]:
# Count the rows of the regions file per region_id and store it as n_cpgs.
_region_rows = pd.read_csv(REGION_PATH, sep='\t')
region_df = _region_rows.groupby('region_id').size().reset_index(name='n_cpgs')

In [21]:
# Attach n_cpgs to every test (each region must match at most one row of
# region_df) and keep only regions with enough CpGs.
diffmeth_df_filtered = (
    diffmeth_df_filtered
    .merge(region_df, on='region_id', validate='m:1')
    .query(f"n_cpgs>={FILTER_MIN_CPGs_REGION}")
)

In [22]:
print(f"Number of regions/celltype pairs left after requiring at least {FILTER_MIN_CPGs_REGION} CpGs: {diffmeth_df_filtered.shape[0]}")

Number of regions/celltype pairs left after requiring at least 10 CpGs: 10609976


# Actual selection of regions

### Look at pairwise number of hyper regions found
Use this to determine which cell types to combine

In [23]:
# Number of passing clusters for every (ct1, ct2) pair, reshaped into a
# ct1 x ct2 matrix; pairs with no passing cluster become 0.
_pair_counts = (
    diffmeth_df_filtered
    .groupby(['ct1', 'ct2'])
    .size()
    .reset_index(name='n_clusters')
)
_pair_matrix = _pair_counts.pivot_table(index='ct1', columns='ct2', values='n_clusters')
hyper_pairwise_df = _pair_matrix.reset_index().fillna(0)

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R -i hyper_pairwise_df,RESULTS_PATH

# Clustered heatmap of the number of hyper clusters per cell-type pair.
library(tidyverse)
library(pheatmap)
library(pals)

# Write a pheatmap object `x` to the PDF file `filename`.
# Both arguments are required; width/height are passed on to pdf().
save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}

# Turn the data frame into a numeric matrix with ct1 as row names.
hyper_pairwise_mat = as.matrix(hyper_pairwise_df %>% select(-ct1))
rownames(hyper_pairwise_mat) = hyper_pairwise_df %>% pull(ct1)

# Cluster both rows and columns and label both.
p_cluster = pheatmap(
    hyper_pairwise_mat, 
    cluster_rows=TRUE, show_rownames=TRUE, 
    cluster_cols=TRUE, show_colnames=TRUE, 
)
p_cluster
# NOTE: `file=` reaches the `filename` parameter through R's partial argument matching.
save_pheatmap_pdf(p_cluster, file=paste0(RESULTS_PATH, 'heatmap_pairwise_hyper_test_number_clustered.pdf'))

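### For each region and cell type pair, count the number of blood cell types it is differentially methylated against
Diff with all blood cell types would be a count of 5 (BLOOD_CELLTYPES lists 6 groups, but Blood_Mega_plus_Eryth does not appear among the ct1/ct2 values of the differential methylation tests listed above, so 5 is the maximum)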

In [45]:
# For every non-blood target (ct1) and region, count how many blood cell
# types pass the hyper criteria against it.
_other_is_blood = diffmeth_df_filtered['ct2'].isin(BLOOD_CELLTYPES)
_target_is_tissue = diffmeth_df_filtered['ct1'].isin(NONBLOOD_CELLTYPES)
ridxs = _other_is_blood & _target_is_tissue
blood_differential = (
    diffmeth_df_filtered.loc[ridxs]
    .groupby(['ct1', 'region_id'])
    .size()
    .reset_index(name='n_blood_diff')
)

In [46]:
blood_differential.describe()

Unnamed: 0,n_blood_diff
count,394206.0
mean,3.206689
std,1.725217
min,1.0
25%,1.0
50%,3.0
75%,5.0
max,5.0


### Select non-blood regions
First try 1 vs most and require diff wrt blood

In [47]:
# Restrict to tests between two non-blood cell types and summarise, per
# target cell type and region, which other cell types pass the criteria.
_both_tissue = (
    diffmeth_df_filtered['ct1'].isin(NONBLOOD_CELLTYPES)
    & diffmeth_df_filtered['ct2'].isin(NONBLOOD_CELLTYPES)
)
ridxs = _both_tissue
_per_region_summary = {
    'number_other': ('ct2', 'count'),        # how many other cell types pass
    'celltypes_other': ('ct2', ','.join),    # which ones, comma-separated
    'meth_base_min': ('meth_base', 'min'),
    'meth_delta_max': ('meth_delta', 'max'),
}
select_nonblood = (
    diffmeth_df_filtered[ridxs]
    .groupby(['ct1', 'region_id', 'n_cpgs'])
    .agg(**_per_region_summary)
    .reset_index()
)

Filter out regions that are not differential with all blood cell types and at least 19 "other" cell types

In [48]:
# Original filter rule: differential against 5 blood cell types and against
# at least len(NONBLOOD_CELLTYPES) - 5 other non-blood cell types.
# Regions with no blood comparison get n_blood_diff = 0 via fillna.
# NOTE(review): number_other excludes the target itself, so this threshold
# presumably allows at most 4 non-differential cell types — confirm.
_min_other = len(NONBLOOD_CELLTYPES) -5
_with_blood = select_nonblood.merge(blood_differential, on=['ct1', 'region_id'], how='left').fillna(0)
candidates_v_many = _with_blood.query(f"n_blood_diff == 5 & number_other >= {_min_other}")

# region_id ends in "<chrom>:<start>-<stop>" after its last underscore;
# split that suffix into three columns.
_locus = candidates_v_many['region_id'].str.replace('.*_', '', regex=True)
candidates_v_many[['chrom','start','stop']] = _locus.str.split(pat='[:|-]', expand=True, regex=True)
# shift start down by one (0-based, BED-like) and make both coordinates int
candidates_v_many['start'] = candidates_v_many['start'].astype(int) - 1
candidates_v_many['stop'] = candidates_v_many['stop'].astype(int)

In [50]:
# Haiyang modified filters. Both require the target to pass the hyper criteria
# against 5 blood cell types (n_blood_diff == 5) and against both breast
# epithelium types; they differ in how many OTHER non-blood cell types must
# also pass:
#   loose rule 1: no requirement on number_other
#   loose rule 2: number_other >= len(NONBLOOD_CELLTYPES) // 2


def _add_bed_coords(candidates):
    """Return a copy of `candidates` with chrom/start/stop parsed from region_id.

    The text after the last underscore of region_id is split on ':' / '|' / '-'
    into three parts; start is shifted down by one (0-based, BED-like) and
    both coordinates are cast to int.
    """
    candidates = candidates.copy()
    candidates[['chrom','start','stop']] = (candidates['region_id']
                                            .str.replace('.*_', '', regex=True)
                                            .str.split(pat='[:|-]', expand=True, regex=True))
    candidates['start'] = candidates['start'].astype(int) - 1
    candidates['stop'] = candidates['stop'].astype(int)
    return candidates


# Count, per target cell type and region, how many breast tissue types pass
# the hyper criteria against it (2 means both).
breast_tissue_types = ['Breast_Basal_Ep', 'Breast_Luminal_Ep']
ridxs_breast = diffmeth_df_filtered['ct2'].isin(breast_tissue_types)
breast_differential = (diffmeth_df_filtered.loc[ridxs_breast]
                       .groupby(['ct1', 'region_id'])
                       .size().to_frame('n_breast_diff').reset_index())

# Both rules start from the same merged table, so build it once.
# Regions without any blood/breast comparison get a count of 0.
_with_blood_and_breast = (select_nonblood.merge(blood_differential, on=['ct1', 'region_id'], how='left')
                          .merge(breast_differential, on=['ct1', 'region_id'], how='left')
                          .fillna(0))

# For loosen rule 1
candidates_v_many_loose1 = _add_bed_coords(
    _with_blood_and_breast.query("n_blood_diff == 5 & n_breast_diff >= 2"))

# For loosen rule 2
candidates_v_many_loose2 = _add_bed_coords(
    _with_blood_and_breast.query(f"n_blood_diff == 5 & n_breast_diff >= 2 & number_other >= {len(NONBLOOD_CELLTYPES) // 2}"))


In [51]:
print (candidates_v_many_loose1.shape)
print (candidates_v_many_loose2.shape)

(50344, 12)
(45939, 12)


Add a column for exceptions based on what's in the celltypes other column

In [52]:
def identify_exceptions(celltypes_other, target_celltype, all_celltypes=None):
    """
    Lists the cell types that are neither the target nor among the differential ones.

    Args:
        celltypes_other (str): Comma-separated cell types that are differentially
            methylated relative to the target.
        target_celltype (str): The target cell type itself.
        all_celltypes (iterable of str, optional): The full set of cell types to
            check. Defaults to the module-level ``NONBLOOD_CELLTYPES``.

    Returns:
        str: The remaining cell types ("exceptions"), sorted and comma-separated;
        an empty string when there are none.
    """
    if all_celltypes is None:
        all_celltypes = NONBLOOD_CELLTYPES
    # Build the lookup once instead of concatenating lists for every candidate.
    accounted_for = set(celltypes_other.split(','))
    accounted_for.add(target_celltype)
    exception_list = [x for x in all_celltypes if x not in accounted_for]
    return ','.join(sorted(exception_list))

def _row_exceptions(row):
    """Exceptions for one candidate row (its ct1 is the target)."""
    return identify_exceptions(row['celltypes_other'], row['ct1'])


# Replace the list of differential cell types by its complement.
candidates_v_many['exceptions'] = candidates_v_many.apply(_row_exceptions, axis=1)
del candidates_v_many['celltypes_other']

In [53]:
# Same exceptions column for loosen rule 1, then loosen rule 2; the shape of
# each table is printed once it has been updated.
for _loose_df in (candidates_v_many_loose1, candidates_v_many_loose2):
    _loose_df['exceptions'] = _loose_df.apply(
        lambda row: identify_exceptions(row['celltypes_other'], row['ct1']),
        axis=1,
    )
    _loose_df.drop(columns='celltypes_other', inplace=True)
    print(_loose_df.shape)


(50344, 12)
(45939, 12)


Merge clusters into regions (separately for each cell type, number of "other" cell types, and set of exceptions)

In [None]:
# Directory for pybedtools' intermediate files, relative to the notebook's
# working directory. Create it first: pybedtools raises if the directory it
# is pointed at does not exist.
_PYBEDTOOLS_TMP = "../pybedtools_tmp"
os.makedirs(_PYBEDTOOLS_TMP, exist_ok=True)
pybedtools.set_tempdir(_PYBEDTOOLS_TMP)

In [110]:
# this is a trick to get the merge separately for each cell type and exceptions: add them to the chromosome name
# (fields joined with '__' so they can be split apart again after merging)
_key_parts = [
    candidates_v_many['chrom'],
    candidates_v_many['ct1'],
    candidates_v_many['number_other'].astype(str),
    candidates_v_many['exceptions'],
]
candidates_v_many['composite_chrom'] = _key_parts[0] + '__' + _key_parts[1] + '__' + _key_parts[2] + '__' + _key_parts[3]

In [54]:
# this is a trick to get the merge separately for each cell type and exceptions: add them to the chromosome name
# Applied to loosen rule 1 and loosen rule 2 in turn.
for _loose_df in (candidates_v_many_loose1, candidates_v_many_loose2):
    _n_other_text = _loose_df['number_other'].astype(str)
    _loose_df['composite_chrom'] = (
        _loose_df['chrom'] + '__' + _loose_df['ct1'] + '__' + _n_other_text + '__' + _loose_df['exceptions']
    )

In [111]:
# Merge overlapping/adjacent clusters within each composite "chromosome",
# keeping the min meth_base_min, max meth_delta_max and min n_cpgs.
_bed_columns = ['composite_chrom', 'start', 'stop', 'meth_base_min', 'meth_delta_max', 'n_cpgs']
candidates_for_merge = candidates_v_many[_bed_columns].sort_values(['composite_chrom', 'start'])
merged = pybedtools.BedTool.from_dataframe(candidates_for_merge).merge(c="4,5,6", o="min,max,min")

candidate_regions_summary_df = merged.to_dataframe()
candidate_regions_summary_df.columns = ['composite_chr', 'region_start', 'region_end', 'meth_base_min', 'meth_delta_max', 'min_n_cpgs']
# Unpack the composite name back into its four fields.
_unpacked = candidate_regions_summary_df['composite_chr'].str.split('__', expand=True)
candidate_regions_summary_df[['region_chrom', 'ct1', 'number_other', 'exceptions']] = _unpacked
# Number of non-blood cell types (other than the target) that are not differential.
_n_possible_other = len(NONBLOOD_CELLTYPES) - 1
candidate_regions_summary_df['n_non_diff'] = _n_possible_other - candidate_regions_summary_df['number_other'].astype(int)

In [55]:
# For loosen rule 1
candidates_for_merge_loose1 = (candidates_v_many_loose1[['composite_chrom', 'start', 'stop', 'meth_base_min', 'meth_delta_max', 'n_cpgs']]
                        .sort_values(['composite_chrom', 'start']))
merged = pybedtools.BedTool.from_dataframe(candidates_for_merge_loose1).merge(c="4,5,6", o="min,max,min")
candidate_regions_summary_loose1_df = merged.to_dataframe()
candidate_regions_summary_loose1_df.columns = ['composite_chr', 'region_start', 'region_end', 'meth_base_min', 'meth_delta_max', 'min_n_cpgs']
candidate_regions_summary_loose1_df[['region_chrom', 'ct1', 'number_other', 'exceptions']] = candidate_regions_summary_loose1_df['composite_chr'].str.split('__', expand=True)
candidate_regions_summary_loose1_df['n_non_diff'] = (len(NONBLOOD_CELLTYPES) - 1) - candidate_regions_summary_loose1_df['number_other'].astype(int)

# For loosen rule 2
candidates_for_merge_loose2 = (candidates_v_many_loose2[['composite_chrom', 'start', 'stop', 'meth_base_min', 'meth_delta_max', 'n_cpgs']]
                        .sort_values(['composite_chrom', 'start']))
merged = pybedtools.BedTool.from_dataframe(candidates_for_merge_loose2).merge(c="4,5,6", o="min,max,min")
candidate_regions_summary_loose2_df = merged.to_dataframe()
candidate_regions_summary_loose2_df.columns = ['composite_chr', 'region_start', 'region_end', 'meth_base_min', 'meth_delta_max', 'min_n_cpgs']
candidate_regions_summary_loose2_df[['region_chrom', 'ct1', 'number_other', 'exceptions']] = candidate_regions_summary_loose2_df['composite_chr'].str.split('__', expand=True)
candidate_regions_summary_loose2_df['n_non_diff'] = (len(NONBLOOD_CELLTYPES) - 1) - candidate_regions_summary_loose2_df['number_other'].astype(int)

In [35]:
# Stacked bars: merged regions per target cell type, split by the number of
# non-differential cell types (original rule).
candidate_plot_df = (
    candidate_regions_summary_df
    .groupby(['ct1', 'n_non_diff'])
    .size()
    .reset_index(name='num_candidates')
)
_bar_encoding = dict(
    x=alt.X('ct1', title='Target cell type'),
    y=alt.Y('num_candidates', title='Number of regions'),
    order=alt.Order('n_non_diff:N', sort='ascending'),
    color=alt.Color('n_non_diff:N', title='Num cell types not diff meth'),
)
alt.Chart(candidate_plot_df).mark_bar().encode(**_bar_encoding).properties(height=300, width=400)

In [105]:
# For loose rule 1: merged regions per target cell type, split by the number
# of non-differential cell types.
candidate_plot_df = (
    candidate_regions_summary_loose1_df
    .groupby(['ct1', 'n_non_diff'])
    .size()
    .reset_index(name='num_candidates')
)
_bar_encoding = dict(
    x=alt.X('ct1', title='Target cell type'),
    y=alt.Y('num_candidates', title='Number of regions'),
    order=alt.Order('n_non_diff:N', sort='ascending'),
    color=alt.Color('n_non_diff:N', title='Num cell types not diff meth'),
)
alt.Chart(candidate_plot_df).mark_bar().encode(**_bar_encoding).properties(height=300, width=400)

In [108]:
# For loose rule 2: merged regions per target cell type, split by the number
# of non-differential cell types.
candidate_plot_df = (
    candidate_regions_summary_loose2_df
    .groupby(['ct1', 'n_non_diff'])
    .size()
    .reset_index(name='num_candidates')
)
_bar_encoding = dict(
    x=alt.X('ct1', title='Target cell type'),
    y=alt.Y('num_candidates', title='Number of regions'),
    order=alt.Order('n_non_diff:N', sort='ascending'),
    color=alt.Color('n_non_diff:N', title='Num cell types not diff meth'),
)
alt.Chart(candidate_plot_df).mark_bar().encode(**_bar_encoding).properties(height=300, width=400)

## Select top regions and plot heatmap
First just look at 1 v all

In [75]:
%%time
# Candidates that are differential against every other non-blood cell type.
one_v_all = candidates_v_many.query(f'number_other == {len(NONBLOOD_CELLTYPES)-1}')
meth_df, meta_df = get_meth_data(one_v_all, samples_df, raw_meth_df)
# Column order for the heatmap: by target cell type, then by decreasing
# meth_base_min; each region appears once.
_sorted_candidates = one_v_all.sort_values(by=['ct1', 'meth_base_min'], ascending=[True, False])
regions_ordered = _sorted_candidates['region_id'].drop_duplicates()
meth_df = meth_df.loc[:, regions_ordered]
meth_df.shape

CPU times: user 19.9 s, sys: 560 ms, total: 20.5 s
Wall time: 19.8 s


(208, 4793)

In [None]:
%%R -i meth_df,meta_df,RESULTS_PATH

# Unclustered heatmap of meth_df, with rows annotated by sample group.
library(tidyverse)
library(pheatmap)
library(pals)

# Write a pheatmap object `x` to the PDF file `filename`.
# Both arguments are required; width/height are passed on to pdf().
save_pheatmap_pdf <- function(x, filename, width=7, height=10) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}


# One annotation row per sample, keyed by sample_id.
row_annot <- data.frame(sample_group=meta_df$sample_group)
rownames(row_annot) <- meta_df$sample_id
# One color per distinct sample group, taken from the two pals 'stepped' palettes.
sample_group_colors = c(stepped(), stepped2())[1:length(unique(meta_df$sample_group))]
names(sample_group_colors) = unique(meta_df$sample_group)
row_colors = list(sample_group=sample_group_colors)
# Keep the given row/column order (no clustering, no scaling); missing values are drawn in black.
p <- pheatmap(
    meth_df, 
    cluster_rows=FALSE, show_rownames=FALSE, 
    cluster_cols=FALSE, show_colnames=FALSE, 
    annotation_row=row_annot, annotation_colors=row_colors, annotation_names_row=FALSE,
    scale='none', na_col='black'
)
# NOTE: `file=` reaches the `filename` parameter through R's partial argument matching.
save_pheatmap_pdf(p, file=paste0(RESULTS_PATH, 'heatmap_hyper_candidates_one_vs_all.pdf'))


Also check out 2 v rest

In [76]:
%%time
# Candidates that are differential against all but one other non-blood cell type.
two_v_rest = candidates_v_many.query(f'number_other == {len(NONBLOOD_CELLTYPES)-2}')
meth_df, meta_df = get_meth_data(two_v_rest, samples_df, raw_meth_df)
# Column order for the heatmap: by target cell type, then by decreasing
# meth_base_min; each region appears once.
_sorted_candidates = two_v_rest.sort_values(by=['ct1', 'meth_base_min'], ascending=[True, False])
regions_ordered = _sorted_candidates['region_id'].drop_duplicates()
meth_df = meth_df.loc[:, regions_ordered]
meth_df.shape

CPU times: user 18.1 s, sys: 525 ms, total: 18.6 s
Wall time: 18 s


(208, 7360)

In [None]:
%%R -i meth_df,meta_df,RESULTS_PATH

# Unclustered heatmap of meth_df, with rows annotated by sample group.
library(tidyverse)
library(pheatmap)
library(pals)

# Write a pheatmap object `x` to the PDF file `filename`.
# Both arguments are required; width/height are passed on to pdf().
save_pheatmap_pdf <- function(x, filename, width=7, height=10) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}


# One annotation row per sample, keyed by sample_id.
row_annot <- data.frame(sample_group=meta_df$sample_group)
rownames(row_annot) <- meta_df$sample_id
# One color per distinct sample group, taken from the two pals 'stepped' palettes.
sample_group_colors = c(stepped(), stepped2())[1:length(unique(meta_df$sample_group))]
names(sample_group_colors) = unique(meta_df$sample_group)
row_colors = list(sample_group=sample_group_colors)
# Keep the given row/column order (no clustering, no scaling); missing values are drawn in black.
p <- pheatmap(
    meth_df, 
    cluster_rows=FALSE, show_rownames=FALSE, 
    cluster_cols=FALSE, show_colnames=FALSE, 
    annotation_row=row_annot, annotation_colors=row_colors, annotation_names_row=FALSE,
    scale='none', na_col='black'
)
# NOTE: `file=` reaches the `filename` parameter through R's partial argument matching.
save_pheatmap_pdf(p, file=paste0(RESULTS_PATH, 'heatmap_hyper_candidates_one_vs_all_but_one.pdf'))

## Write candidates
Will continue filtering outside this analysis (with TCGA)

In [38]:
# Write the original-rule candidates: one table of individual clusters and
# one of merged regions. The file names encode the filter settings used.
_candidates_dir = f"{PROJECT_DIR}/hyper_dmr_candidates"
# to_csv does not create missing directories, so make sure it exists.
os.makedirs(_candidates_dir, exist_ok=True)
_filter_tag = f"{REGIONS}_one_vs_many_cpg{FILTER_MIN_CPGs_REGION}_target{FILTER_RATE_TARGET}_other{FILTER_RATE_OTHER}_delta{FILTER_RATE_DELTA}_fdr{FILTER_FDR}"

cluster_outfile = f"{_candidates_dir}/{_filter_tag}.tsv"
candidates_v_many.to_csv(cluster_outfile, sep="\t", index=False)

region_outfile = f"{_candidates_dir}/REGIONS_{_filter_tag}.tsv"
candidate_regions_summary_df.to_csv(region_outfile, sep="\t", index=False)

In [113]:
def _lung_rows(summary_df):
    """Rows whose target cell type (ct1) contains 'lung', case-insensitively."""
    is_lung = summary_df['ct1'].str.contains('lung', case=False, na=False)
    return summary_df[is_lung]


# Check the lung region numbers
candidate_regions_summary_df_lung = _lung_rows(candidate_regions_summary_df)
print(candidate_regions_summary_df_lung.shape)

# For loose rule 1
candidate_regions_summary_loose1_df_lung = _lung_rows(candidate_regions_summary_loose1_df)
print(candidate_regions_summary_loose1_df_lung.shape)

# For loose rule 2
candidate_regions_summary_loose2_df_lung = _lung_rows(candidate_regions_summary_loose2_df)
print(candidate_regions_summary_loose2_df_lung.shape)

(265, 11)
(655, 11)
(463, 11)


In [114]:
# Save the cluster and merged-region tables for loose rule 1 and loose rule 2.
# File names match the original-rule outputs plus a "_loose1"/"_loose2" suffix.
_candidates_dir = f"{PROJECT_DIR}/hyper_dmr_candidates"
# to_csv does not create missing directories, so make sure it exists.
os.makedirs(_candidates_dir, exist_ok=True)
_filter_tag = f"{REGIONS}_one_vs_many_cpg{FILTER_MIN_CPGs_REGION}_target{FILTER_RATE_TARGET}_other{FILTER_RATE_OTHER}_delta{FILTER_RATE_DELTA}_fdr{FILTER_FDR}"

_loose_outputs = (
    ("loose1", candidates_v_many_loose1, candidate_regions_summary_loose1_df),
    ("loose2", candidates_v_many_loose2, candidate_regions_summary_loose2_df),
)
for _rule, _clusters_df, _regions_df in _loose_outputs:
    cluster_outfile = f"{_candidates_dir}/{_filter_tag}_{_rule}.tsv"
    _clusters_df.to_csv(cluster_outfile, sep="\t", index=False)

    region_outfile = f"{_candidates_dir}/REGIONS_{_filter_tag}_{_rule}.tsv"
    _regions_df.to_csv(region_outfile, sep="\t", index=False)