In [1]:
import pandas as pd
import glob
import numpy as np
import itertools
import functools
import os
import regex as re
import random

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

# Widen pandas' display limits so large frames are shown without truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [2]:
# UPDATE HOME! SPARK_HOME must point at the pyspark package of the active environment.
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# THIS needs to be set-up before running the notebook (Spark scratch directory)
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The Parquet push-down switch lives under the `spark.sql.` namespace; the
# previous key ("spark.parquet.filterPushdown") is not a Spark setting and was
# silently ignored.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)



In [3]:
# Name of the region panel (BED file stem) and the columns read from it
REGIONS = 'deconvolution_v2.v23_conv.with_cpg_index'
REGION_BED_COLS = [
    'region_chr',
    'region_start',
    'region_end',
    'region_cpg_index_min',
    'region_cpg_index_max',
    'region_id',
]
# Minimum observed CpGs per molecule / minimum CpGs per region
FILTER_CG_COUNT = 3
FILTER_CG_COUNT_REGION = 1

#--- Local paths
ROOT_DIR = '/analysis/gh-msun/projects'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}'

# Regions
REGION_PATH = f'{PROJECT_DIR}/stage/panel_data/{REGIONS}.bed'

# CpG map; genomic coordinate to CpG index
CPG_MAP_PATH = f'{PROJECT_DIR}/stage/cpg_loci/cpg_loci_hg19.combined_annot.tsv.gz'

#--- Where to store results
RESULT_PATH = f'{PROJECT_DIR}/output/methyl_score/'

### CpG Map

In [4]:
# Load only the coordinate and CpG-index columns of the tab-separated CpG table.
cpg_map = pd.read_csv(
    CPG_MAP_PATH,
    sep='\t',
    usecols=['chr', 'start', 'end', 'cpg_index', 'cpg_index_hg38'],
)

In [5]:
%%time
# Build hg19 <-> hg38 CpG-index lookup tables from the rows that have an hg38 index.
ridxs = cpg_map['cpg_index_hg38'].notna()
# Apply the mask once per column instead of re-filtering the whole table for
# every dict argument.
_hg19_idx = cpg_map.loc[ridxs, 'cpg_index']
_hg38_idx = cpg_map.loc[ridxs, 'cpg_index_hg38'].astype(int)
# Both columns come from the same rows, so plain zip is the right pairing;
# zip_longest would silently pad with None if the lengths ever differed.
hg19_hg38_map = dict(zip(_hg19_idx, _hg38_idx))
hg38_hg19_map = dict(zip(_hg38_idx, _hg19_idx))
del _hg19_idx, _hg38_idx


CPU times: user 15.4 s, sys: 7.43 s, total: 22.8 s
Wall time: 22.8 s


### Regions

In [6]:
# Read the first six BED columns of the region panel.
region_df = pd.read_csv(REGION_PATH, sep='\t', usecols=range(0, 6), names=REGION_BED_COLS)

# Shift the upper CpG index down by one (presumably the file stores an
# exclusive upper bound -- TODO confirm), then order regions by their first CpG.
region_df = (
    region_df
    .assign(region_cpg_index_max=region_df['region_cpg_index_max'] - 1)
    .sort_values('region_cpg_index_min')
)

# Translate both region bounds from hg19 to hg38 CpG indices; bounds missing
# from the map become NaN.
region_df = region_df.assign(
    region_cpg_index_min_hg38=region_df['region_cpg_index_min'].map(hg19_hg38_map),
    region_cpg_index_max_hg38=region_df['region_cpg_index_max'].map(hg19_hg38_map),
)

region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [7]:
# Keep only regions whose two bounds both received an hg38 CpG index.
ridxs = (
    region_df[['region_cpg_index_min_hg38', 'region_cpg_index_max_hg38']]
    .notna()
    .all(axis=1)
)
region_df = region_df.loc[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [8]:
# Number of CpGs spanned by each region in either index space (bounds inclusive).
cg_count_hg19 = (region_df['region_cpg_index_max'] - region_df['region_cpg_index_min']).add(1)
cg_count_hg38 = (region_df['region_cpg_index_max_hg38'] - region_df['region_cpg_index_min_hg38']).add(1)

# Keep regions that span the same number of CpGs in both index spaces and
# contain at least FILTER_CG_COUNT_REGION CpGs.
ridxs = cg_count_hg19.eq(cg_count_hg38) & cg_count_hg19.ge(FILTER_CG_COUNT_REGION)
region_df = region_df.loc[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1657, 1657)

In [9]:
# The hg38 bounds became float when mapped (NaN for misses); with unmapped rows
# already removed they can be cast back to integers.
region_df = region_df.astype({
    'region_cpg_index_min_hg38': int,
    'region_cpg_index_max_hg38': int,
})

In [10]:
### Restrict to immune regions
#-------------- CHANGE HERE FOR DIFFERENT REGION SUBSET ----------------------
# BLUEPRINT immune regions
ATLAS_PATH = PROJECT_DIR + '/output/deconv_inhouse_v2.atlas.tsv.gz'
atlas = pd.read_csv(ATLAS_PATH, sep='\t')
subset_region_set = set(atlas.region_id)
#-----------------------------------------------------------------------------

# Filter regions down to the regions of interest. Take an explicit copy so that
# columns added later (e.g. the 'batch' column assigned during scoring) are
# written to an independent frame instead of a slice, which would raise
# SettingWithCopyWarning.
region_df = region_df[region_df['region_id'].isin(subset_region_set)].copy()
region_df.head()


Unnamed: 0,region_chr,region_start,region_end,region_cpg_index_min,region_cpg_index_max,region_id,region_cpg_index_min_hg38,region_cpg_index_max_hg38
0,chr1,1114771,1114971,20117,20129,Immune_Broad_B-chr1:1114772-1114971,21119,21131
1,chr1,1157450,1157720,21684,21703,Immune_Broad_NK-chr1:1157451-1157720,22686,22705
2,chr1,1157879,1158277,21710,21726,Immune_Broad_NK-chr1:1157880-1158277,22712,22728
14,chr1,6341182,6341377,140667,140681,Immune_Broad_Eosi-chr1:6341183-6341377,142368,142382
19,chr1,9147788,9147871,188605,188608,Immune_Broad_Neutro-chr1:9147789-9147871,190307,190310


In [11]:
# Sanity check: (number of regions kept after the atlas subset, number of columns).
region_df.shape

(280, 8)

## Fragment Level Scoring

In [39]:
def get_file_paths(directory):
    """
    List the entries of ``directory`` as absolute paths, in sorted order.

    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order;
    sorting makes the processing order (and therefore the logs) reproducible
    between runs and machines.

    Parameters
    ----------
    directory : str
        Directory to list. Every entry is returned (files and sub-directories).

    Returns
    -------
    list of str
        Absolute paths of the entries, sorted by entry name.
    """
    return [
        os.path.abspath(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    ]


def compute_frag_scores(cpg_number_cutoff: int) -> "Callable[[pd.DataFrame], pd.DataFrame]":
    """
    Build the per-region scoring function used with ``groupby(...).applyInPandas``.

    The returned closure reads the module-level ``KMERS``, ``RATES_LEQ``,
    ``RATES_GEQ`` and ``RETURN_SCHEMA`` when it is *called*, so those must be
    defined before scoring starts. ``KMERS`` must contain 1 because the
    per-molecule methylation fraction (alpha) is derived from the k=1 counts,
    and ``RETURN_SCHEMA`` must list the columns produced from that configuration.

    Parameters
    ----------
    cpg_number_cutoff : int
        Minimum number of observed CpGs ('C' or 'T' characters) a molecule must
        have inside the region to be scored.

    Returns
    -------
    callable
        Function mapping a pandas DataFrame of molecules overlapping regions to
        one row of aggregated scores per ``region_id``, with the columns of
        ``RETURN_SCHEMA``.
    """

    def compute_frag_scores_inner(pat_df: pd.DataFrame) -> pd.DataFrame:
        """Score the molecules in ``pat_df`` and aggregate them per region."""
        data = pat_df.copy()

        # Slice each pattern string down to the part that lies inside the region.
        data['offset_min'] = (data['region_cpg_index_min'] - data['cpg_index_min']).clip(lower=0)
        data['offset_max'] = np.minimum(
            data['region_cpg_index_max'] - data['cpg_index_min'],
            data['cpg_index_max'] - data['cpg_index_min'])
        # A zip-based comprehension replaces the row-wise DataFrame.apply: it is
        # faster and, unlike apply(axis=1), also behaves on an empty frame.
        data['trimmed_pat'] = pd.Series(
            [
                pat[start:(stop + 1)]
                for pat, start, stop in zip(
                    data['pat_string'], data['offset_min'], data['offset_max'])
            ],
            index=data.index, dtype=object)

        #--- Filter molecules based on observed CpG loci
        observed_cpg_number = (data['trimmed_pat'].str.count('C') + data['trimmed_pat'].str.count('T'))
        data = data[observed_cpg_number >= cpg_number_cutoff].copy()
        if data.shape[0] == 0:
            rv = pd.DataFrame(columns=RETURN_SCHEMA.names)
            return rv[RETURN_SCHEMA.names]

        # Compute k-mer methylation states: overlapping runs of k methylated
        # ('C'), unmethylated ('T') or observed ('T' or 'C') CpGs.
        count_cols = []
        for k in KMERS:
            for prefix, alphabet in (('meth', 'C'), ('unmeth', 'T'), ('total', 'TC')):
                pattern = '[%s]{%i}' % (alphabet, k)
                name = '%s_k%i' % (prefix, k)
                data[name] = data['trimmed_pat'].apply(
                    lambda x, pattern=pattern: len(re.findall(pattern, x, overlapped=True)))
                count_cols.append(name)

        # Compute alpha distribution metrics (alpha = methylated / observed CpGs).
        # round() rather than %i truncation so that e.g. rate=0.29 is labelled
        # "29pct" (100*0.29 is 28.999... in floating point).
        data['alpha'] = data['meth_k1'] / data['total_k1']
        frac_cols = []
        for rate in RATES_LEQ:
            name = 'frac_alpha_leq_%ipct' % round(100 * rate)
            data[name] = np.where(data['alpha'] <= rate, 1, 0)
            frac_cols.append(name)
        for rate in RATES_GEQ:
            name = 'frac_alpha_geq_%ipct' % round(100 * rate)
            data[name] = np.where(data['alpha'] >= rate, 1, 0)
            frac_cols.append(name)

        # De-duplicate (order preserved) so a repeated k or rate is weighted once.
        frac_cols = list(dict.fromkeys(frac_cols))
        metric_cols = list(dict.fromkeys(count_cols + frac_cols))

        # Rows that stand for several identical molecules are weighted by their
        # multiplicity; this equals expanding each row into `number_molecules`
        # copies before summing. The previous list-explode left one row for a
        # non-positive count, so such rows keep a weight of 1.
        weights = data['number_molecules'].clip(lower=1)
        for name in metric_cols:
            data[name] = data[name] * weights
        data['number_molecules'] = weights

        # Aggregate metrics per region
        rv = data.groupby(['region_id'])[metric_cols + ['number_molecules']].sum().reset_index()
        # Turn the weighted indicator sums into fractions of molecules.
        for name in frac_cols:
            rv[name] = rv[name] / rv['number_molecules']

        return rv[RETURN_SCHEMA.names]

    return compute_frag_scores_inner


def score_matrix(parquet_path, result_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    Score one parquet file of molecules against every region:
    one parquet file --> one score matrix.

    Uses the module-level ``spark`` session and ``compute_frag_scores_udf``.

    Parameters
    ----------
    parquet_path : str
        Parquet file to read.
    result_path : str
        Directory the score matrix is written to when ``save`` is True.
    pat_cols : list of str
        Columns selected from the parquet file.
    region_df : pd.DataFrame
        Regions to score; needs ``region_id``, ``region_cpg_index_min_hg38``
        and ``region_cpg_index_max_hg38``. It is not modified.
    batch_size : int
        Number of regions unioned into one Spark job.
    schema : StructType
        Output schema passed to ``applyInPandas``.
    save : bool
        If True, write ``<parquet file stem>.tsv.gz`` into ``result_path``.
    verbose : bool
        If True, print a line per batch.

    Returns
    -------
    pd.DataFrame
        Concatenated per-region scores (empty, with the schema's column
        names, when ``region_df`` has no rows).
    '''
    # Load single parquet file
    pat_df = spark.read.parquet(parquet_path).select(*pat_cols)

    # Compute scores by batch. The batch id is added to a private copy so the
    # caller's region_df is not mutated (and no SettingWithCopyWarning is
    # raised when it is a slice of another frame).
    batched_regions = region_df.assign(
        batch=(np.arange(region_df.shape[0]) // batch_size).astype(int))
    rv_scores = list()

    for batch, batch_region_df in batched_regions.groupby('batch'):
        rv_ov = list()
        if verbose: print('--------------> Processing batch %i...' % batch)
        # iterrows on this mixed-dtype frame yields plain Python scalars, which
        # is what lit() and the column comparisons below are given.
        for _, row in batch_region_df.iterrows():
            # Molecules whose CpG span overlaps the region, tagged with the region.
            ov_ddf = pat_df.filter(col('cpg_index_min')<=row['region_cpg_index_max_hg38'])\
                .filter(col('cpg_index_max') >= row['region_cpg_index_min_hg38'])\
                .withColumn('region_id', lit(row['region_id']))\
                .withColumn('region_cpg_index_min', lit(row['region_cpg_index_min_hg38']))\
                .withColumn('region_cpg_index_max', lit(row['region_cpg_index_max_hg38']))
            rv_ov.append(ov_ddf)
        # Honour the `schema` argument instead of the global RETURN_SCHEMA.
        scores_df = functools.reduce(DataFrame.union, rv_ov)\
            .groupby('region_id')\
            .applyInPandas(compute_frag_scores_udf, schema=schema)\
            .toPandas()
        rv_scores.append(scores_df)

    # pd.concat raises on an empty list, which happens when region_df is empty.
    if rv_scores:
        scores_df = pd.concat(rv_scores, ignore_index=True)
    else:
        scores_df = pd.DataFrame(columns=schema.names)

    if save:
        file_name = os.path.basename(parquet_path)
        file_name_without_ext = os.path.splitext(file_name)[0]
        save_path = os.path.join(result_path, file_name_without_ext + '.tsv.gz')
        scores_df.to_csv(save_path, sep='\t', index=False)

    return scores_df


def score_matrix_n_times(mix_dir_path, result_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    mixture directory of replicate mixture parquets --> score matrix per replicate mixture parquet

    Every entry of ``mix_dir_path`` is passed to ``score_matrix``; the
    remaining arguments are forwarded unchanged.
    '''

    # Create the result directory. makedirs(exist_ok=True) also creates missing
    # parent directories and has no check-then-create race, unlike
    # os.path.exists + os.mkdir.
    os.makedirs(result_path, exist_ok=True)

    # Given the directory path, collect the path of every entry in it. Sorting
    # makes the processing order reproducible (os.listdir order is arbitrary).
    list_parquet_paths = sorted(get_file_paths(mix_dir_path))

    # for each parquet in the list run score_matrix
    for path in list_parquet_paths:
        file_name = os.path.basename(path)
        file_name_without_ext = os.path.splitext(file_name)[0]
        print(f'--------> Computing score matrix for {file_name_without_ext}')

        score_matrix(parquet_path=path,
                     result_path=result_path,
                     pat_cols=pat_cols,
                     region_df=region_df,
                     batch_size=batch_size,
                     schema=schema,
                     save=save,
                     verbose=verbose)
    print('\n')


def score_matrix_from_mixture_directory(path_to_mixture_dir, result_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    Compute score matrices for every mixture directory of an experiment.

    ``path_to_mixture_dir`` holds one sub-directory per mixture proportion.
    Results go to ``<result_path>/methyl_score/<mixture directory name>/``.
    The remaining arguments are forwarded to ``score_matrix_n_times``.
    '''
    print('>>> Start computing score matrices <<< \n')

    # Create the result directory. os.path.join works whether or not
    # result_path ends with a separator; makedirs also creates missing parents.
    result_dir_path = os.path.join(result_path, 'methyl_score')
    os.makedirs(result_dir_path, exist_ok=True)

    # Given the directory path, grab all mixture directories containing parquet
    # files, in a reproducible order.
    list_mixture_dir_paths = sorted(get_file_paths(path_to_mixture_dir))

    # iterate through each mixture proportion directory
    for path in list_mixture_dir_paths:

        # Skip stray files (e.g. hidden metadata files): listing them as a
        # directory further down would raise NotADirectoryError.
        if not os.path.isdir(path):
            continue

        # Use the directory name as-is. Stripping an "extension" from a
        # directory name would truncate names containing a dot and could make
        # two mixtures write into the same result directory.
        mixture_dir_name = os.path.basename(path)
        save_path = os.path.join(result_dir_path, mixture_dir_name)

        print(f'--> {mixture_dir_name}')

        score_matrix_n_times(mix_dir_path=path,
                             result_path=save_path,
                             pat_cols=pat_cols,
                             region_df=region_df,
                             batch_size=batch_size,
                             schema=schema,
                             save=save,
                             verbose=verbose)

    print('>>> Complete. <<< \n')

    

In [40]:
%%time

# Columns read from each mixture parquet file
PAT_COLS = [
    'molecule_id',
    'chr',
    'number_molecules',
    'cpg_index_min',
    'cpg_index_max',
    'pat_string',
]

# Scoring configuration; KMERS / RATES_* are read by the scoring closure at call time
QUANTILES = [0.1, 0.25, 0.75, 0.9]
KMERS = [1, 3, 4]
RATES_LEQ = [0.25]
RATES_GEQ = [0.75]

# Columns (and Spark types) of the per-region score table
RETURN_SCHEMA = (
    StructType()
    .add('region_id', 'string')
    .add('number_molecules', 'integer')
    .add('meth_k1', 'integer')
    .add('unmeth_k1', 'integer')
    .add('total_k1', 'integer')
    .add('meth_k3', 'integer')
    .add('unmeth_k3', 'integer')
    .add('total_k3', 'integer')
    .add('meth_k4', 'integer')
    .add('unmeth_k4', 'integer')
    .add('total_k4', 'integer')
    .add('frac_alpha_leq_25pct', 'float')
    .add('frac_alpha_geq_75pct', 'float')
)

compute_frag_scores_udf = compute_frag_scores(cpg_number_cutoff=FILTER_CG_COUNT)

#--- Local paths
# NOTE: these rebind ROOT_DIR / PROJECT_SLUG / PROJECT_DIR / RESULT_PATH from
# the configuration cell; here PROJECT_SLUG (and so PROJECT_DIR) ends with '/'.
ROOT_DIR = '/analysis/gh-msun/projects'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS/'
PROJECT_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}'
EXPERIMENT_NAME = 'BLUEPRINT_B'
PATH_TO_MIXTURE_DIR = f'{PROJECT_DIR}output/experiment/{EXPERIMENT_NAME}/mixture/'
RESULT_PATH = f'{PROJECT_DIR}output/experiment/{EXPERIMENT_NAME}/'

# Compute the methyl score for all parquet files of the experiment
score_matrix_from_mixture_directory(
    path_to_mixture_dir=PATH_TO_MIXTURE_DIR,
    result_path=RESULT_PATH,
    pat_cols=PAT_COLS,
    region_df=region_df,
    batch_size=20,
    schema=RETURN_SCHEMA,
    save=True,
    verbose=False,
)

>>> Start computing score matrices <<< 

--> E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO
--------> Computing score matrix for mix0_seed512070
--------> Computing score matrix for mix1_seed150400
--------> Computing score matrix for mix2_seed53691


--> E01B_E198CD4_E198CD8_E198NK_E198MONO_E198NEUTRO
--------> Computing score matrix for mix0_seed776570
--------> Computing score matrix for mix1_seed581495
--------> Computing score matrix for mix2_seed787335


--> E001B_E1998CD4_E1998CD8_E1998NK_E1998MONO_E1998NEUTRO
--------> Computing score matrix for mix0_seed372142
--------> Computing score matrix for mix1_seed285922
--------> Computing score matrix for mix2_seed689569


>>> Complete. <<< 

CPU times: user 5.58 s, sys: 1.26 s, total: 6.84 s
Wall time: 4min 34s


In [None]:
# PCA sanity check