In [1]:
import functools
import glob
import itertools
import os
import random
from typing import Callable

import numpy as np
import pandas as pd
import regex as re

from pyspark import SparkConf, SparkContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.types import StructType

# Raise the pandas display limits so long / wide frames are not truncated
# when rendered in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# UPDATE HOME!
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# Spark scratch directory: THIS needs to be set up (created) before running the notebook.
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The Parquet predicate-pushdown option lives in the SQL namespace; the
# previously used key "spark.parquet.filterPushdown" is not a Spark option
# and was silently ignored.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)



In [3]:
#--- Region panel and filtering thresholds
REGIONS = 'deconvolution_v2.v23_conv.with_cpg_index'
REGION_BED_COLS = [
    'region_chr',
    'region_start',
    'region_end',
    'region_cpg_index_min',
    'region_cpg_index_max',
    'region_id',
]
# Passed as the per-fragment cutoff to compute_frag_scores.
FILTER_CG_COUNT = 3
# Minimum CpG count a region must span to be kept.
FILTER_CG_COUNT_REGION = 1

#--- Local paths
ROOT_DIR = '/analysis/gh-msun/projects'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}'

# Regions (BED-like file of the panel)
REGION_PATH = f'{PROJECT_DIR}/stage/panel_data/{REGIONS}.bed'

# CpG map; genomic coordinate to CpG index
CPG_MAP_PATH = f'{PROJECT_DIR}/stage/cpg_loci/cpg_loci_hg19.combined_annot.tsv.gz'

#--- Where to store results
RESULT_PATH = f'{PROJECT_DIR}/output/methyl_score/'

### CpG Map

In [4]:
# Load only the coordinate and CpG-index columns of the tab-separated CpG map.
cpg_map = pd.read_csv(
    CPG_MAP_PATH,
    sep='\t',
    usecols=['chr', 'start', 'end', 'cpg_index', 'cpg_index_hg38'],
)

In [5]:
%%time
ridxs = ~cpg_map['cpg_index_hg38'].isna()
hg19_hg38_map = dict(itertools.zip_longest(cpg_map[ridxs]['cpg_index'], cpg_map[ridxs]['cpg_index_hg38'].astype(int)))
hg38_hg19_map = dict(itertools.zip_longest(cpg_map[ridxs]['cpg_index_hg38'].astype(int), cpg_map[ridxs]['cpg_index']))


CPU times: user 15.4 s, sys: 5.06 s, total: 20.4 s
Wall time: 20.4 s


### Regions

In [6]:
# Read the first six columns of the region file, shift the upper CpG index
# down by one, order regions by their first CpG index, and translate both
# boundary indices through the hg19 -> hg38 lookup (unmapped indices become NaN).
region_df = (
    pd.read_csv(REGION_PATH, sep='\t', usecols=range(0, 6), names=REGION_BED_COLS)
    .assign(region_cpg_index_max=lambda df: df['region_cpg_index_max'] - 1)
    .sort_values('region_cpg_index_min')
    .assign(
        region_cpg_index_min_hg38=lambda df: df['region_cpg_index_min'].map(hg19_hg38_map),
        region_cpg_index_max_hg38=lambda df: df['region_cpg_index_max'].map(hg19_hg38_map),
    )
)

# Row count vs. number of distinct region ids.
region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [7]:
# Keep only regions whose two boundary CpG indices both have an hg38 counterpart.
ridxs = (
    region_df['region_cpg_index_min_hg38'].notna()
    & region_df['region_cpg_index_max_hg38'].notna()
)
region_df = region_df.loc[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [8]:
# Number of CpG indices spanned by each region (inclusive) in each index space.
cg_count_hg19 = region_df['region_cpg_index_max'].sub(region_df['region_cpg_index_min']).add(1)
cg_count_hg38 = region_df['region_cpg_index_max_hg38'].sub(region_df['region_cpg_index_min_hg38']).add(1)
# Keep regions whose span is identical in both index spaces and that reach
# the minimum region CpG count.
ridxs = cg_count_hg19.eq(cg_count_hg38) & cg_count_hg19.ge(FILTER_CG_COUNT_REGION)
region_df = region_df.loc[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1657, 1657)

In [9]:
# The hg38 boundary columns no longer contain NaN after the filters above,
# so they can be stored as integers.
region_df = region_df.astype({
    'region_cpg_index_min_hg38': int,
    'region_cpg_index_max_hg38': int,
})

In [10]:
### Restrict to immune regions
#-------------- CHANGE HERE FOR DIFFERENT REGION SUBSET ----------------------
# BLUEPRINT immune regions
ATLAS_PATH = PROJECT_DIR + '/output/deconv_inhouse_v2.atlas.tsv.gz'
atlas = pd.read_csv(ATLAS_PATH, sep='\t')
subset_region_set = set(atlas['region_id'])
#-----------------------------------------------------------------------------

# filter regions down to regions of interest
# .copy() makes the subset an independent frame, so adding columns to it later
# does not write to a slice of the unfiltered table (SettingWithCopyWarning).
region_df = region_df[region_df['region_id'].isin(subset_region_set)].copy()
region_df.head()


Unnamed: 0,region_chr,region_start,region_end,region_cpg_index_min,region_cpg_index_max,region_id,region_cpg_index_min_hg38,region_cpg_index_max_hg38
0,chr1,1114771,1114971,20117,20129,Immune_Broad_B-chr1:1114772-1114971,21119,21131
1,chr1,1157450,1157720,21684,21703,Immune_Broad_NK-chr1:1157451-1157720,22686,22705
2,chr1,1157879,1158277,21710,21726,Immune_Broad_NK-chr1:1157880-1158277,22712,22728
14,chr1,6341182,6341377,140667,140681,Immune_Broad_Eosi-chr1:6341183-6341377,142368,142382
19,chr1,9147788,9147871,188605,188608,Immune_Broad_Neutro-chr1:9147789-9147871,190307,190310


In [11]:
# Dimensions of the region table after restricting it to the atlas region ids.
region_df.shape

(280, 8)

## Fragment Level Scoring

In [17]:
def get_file_paths(directory):
    """
    Return the absolute paths of all entries directly inside ``directory``.

    Entries are returned sorted by name. ``os.listdir`` gives no ordering
    guarantee, so sorting makes the processing order reproducible between runs
    and machines. No filtering is applied: files and sub-directories are both
    included.

    Parameters
    ----------
    directory : str
        Directory to list.

    Returns
    -------
    list of str
        One absolute path per directory entry; empty if the directory is empty.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` (e.g. the path does not exist or is not
        a directory).
    """
    return [
        os.path.abspath(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    ]


def compute_frag_scores(cpg_number_cutoff: int) -> Callable[[pd.DataFrame], pd.DataFrame]:
    """
    Build the per-group scoring function that is handed to ``applyInPandas``.

    The returned function closes over ``cpg_number_cutoff`` and reads the
    module-level ``KMERS``, ``RATES_LEQ``, ``RATES_GEQ`` and ``RETURN_SCHEMA``
    at call time (not when it is built).

    Parameters
    ----------
    cpg_number_cutoff : int
        Minimum number of 'C' plus 'T' characters a fragment must have inside
        the region (after trimming) to be scored.

    Returns
    -------
    callable
        ``f(pat_df) -> pd.DataFrame`` producing one row per ``region_id`` with
        the columns listed in ``RETURN_SCHEMA.names``.
    """

    def compute_frag_scores_inner(pat_df: pd.DataFrame) -> pd.DataFrame:
        """
        Score the fragments of one group.

        Reads the columns ``region_id``, ``region_cpg_index_min``,
        ``region_cpg_index_max``, ``cpg_index_min``, ``cpg_index_max``,
        ``pat_string`` and ``number_molecules``. Returns an empty frame with
        the output columns when no fragment passes the cutoff.
        """
        out_cols = RETURN_SCHEMA.names
        if pat_df.shape[0] == 0:
            # A row-wise pass over an empty frame is ill-defined; nothing to score.
            return pd.DataFrame(columns=out_cols)

        data = pat_df.copy()
        # Offsets (relative to the fragment's first CpG index) of the part of
        # pat_string that lies inside the region.
        data['offset_min'] = (data['region_cpg_index_min'] - data['cpg_index_min']).clip(lower=0)
        data['offset_max'] = np.minimum(
            data['region_cpg_index_max'] - data['cpg_index_min'],
            data['cpg_index_max'] - data['cpg_index_min'])
        data['trimmed_pat'] = [
            pat[lo:(hi + 1)]
            for pat, lo, hi in zip(data['pat_string'], data['offset_min'], data['offset_max'])
        ]

        #--- Filter molecules based on observed CpG loci ('C' or 'T' calls)
        data['meth_cpg'] = data['trimmed_pat'].str.count('C')
        data['observed_cpg'] = data['meth_cpg'] + data['trimmed_pat'].str.count('T')
        data = data[data['observed_cpg'] >= cpg_number_cutoff].copy()
        if data.shape[0] == 0:
            return pd.DataFrame(columns=out_cols)

        # Compute k-mer methylation states: overlapping runs of k 'C' (meth),
        # k 'T' (unmeth) and k 'T'/'C' (total) characters.
        kmer_cols = []
        for k in KMERS:
            for state, symbols in (('meth', 'C'), ('unmeth', 'T'), ('total', 'TC')):
                column = '%s_k%i' % (state, k)
                pattern = '[%s]{%i}' % (symbols, k)
                data[column] = data['trimmed_pat'].apply(
                    lambda x, pattern=pattern: len(re.findall(pattern, x, overlapped=True)))
                kmer_cols.append(column)

        # Compute alpha distribution metrics. alpha is the fraction of observed
        # CpG calls that are 'C' (identical to meth_k1 / total_k1, but it does
        # not require 1 to be listed in KMERS).
        data['alpha'] = data['meth_cpg'] / data['observed_cpg']
        frac_cols = []
        for rate in RATES_LEQ:
            column = 'frac_alpha_leq_%ipct' % (100*rate)
            data[column] = np.where(data['alpha'] <= rate, 1, 0)
            frac_cols.append(column)
        for rate in RATES_GEQ:
            column = 'frac_alpha_geq_%ipct' % (100*rate)
            data[column] = np.where(data['alpha'] >= rate, 1, 0)
            frac_cols.append(column)

        # Entries may stand for several molecules. Weighting each metric by
        # number_molecules gives the same sums as duplicating the row once per
        # molecule, without materialising the duplicates (and a row with zero
        # molecules contributes nothing).
        sum_cols = kmer_cols + frac_cols
        n_molecules = data['number_molecules']
        for column in sum_cols:
            data[column] = data[column] * n_molecules

        # Aggregate metrics per region
        rv = data.groupby('region_id')[sum_cols + ['number_molecules']].sum().reset_index()
        # Turn the weighted indicator sums into fractions of molecules.
        for column in frac_cols:
            rv[column] = rv[column] / rv['number_molecules']

        return rv[out_cols]

    return compute_frag_scores_inner


def score_matrix(parquet_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    one parquet file --> one score matrix

    Reads ``parquet_path`` through the module-level ``spark`` session, keeps
    the fragments overlapping each region of ``region_df`` (by hg38 CpG index),
    and scores them per region with the module-level ``compute_frag_scores_udf``.
    Regions are processed ``batch_size`` at a time; each batch is collected to
    pandas and the batches are concatenated.

    Parameters
    ----------
    parquet_path : str
        Parquet file (or directory) to read.
    pat_cols : list of str
        Columns to select from the parquet data.
    region_df : pd.DataFrame
        Must provide ``region_id``, ``region_cpg_index_min_hg38`` and
        ``region_cpg_index_max_hg38``. It is not modified.
    batch_size : int
        Number of regions per Spark job.
    schema : StructType
        Output schema passed to ``applyInPandas``. It has to describe the frame
        returned by ``compute_frag_scores_udf``.
    save : bool
        Unused here; kept for signature compatibility with the callers.
    verbose : bool
        Print a progress line per batch.

    Returns
    -------
    pd.DataFrame
        Concatenated per-region scores; an empty frame when ``region_df`` has
        no rows.
    '''
    # Load single parquet file
    pat_df = spark.read.parquet(parquet_path).select(*pat_cols)

    # Assign batches on a private copy so the caller's frame does not gain a
    # 'batch' column as a side effect.
    batched_region_df = region_df.assign(
        batch=(np.arange(region_df.shape[0])/batch_size).astype(int))
    rv_scores = list()

    for batch, batch_region_df in batched_region_df.groupby('batch'):
        rv_ov = list()
        if verbose: print('----------> Processing batch %i...' % batch)
        for _, row in batch_region_df.iterrows():
            # Fragments whose CpG-index interval overlaps the region, tagged
            # with the region's id and hg38 boundaries.
            ov_ddf = pat_df.filter(col('cpg_index_min')<=row['region_cpg_index_max_hg38'])\
                .filter(col('cpg_index_max') >= row['region_cpg_index_min_hg38'])\
                .withColumn('region_id', lit(row['region_id']))\
                .withColumn('region_cpg_index_min', lit(row['region_cpg_index_min_hg38']))\
                .withColumn('region_cpg_index_max', lit(row['region_cpg_index_max_hg38']))
            rv_ov.append(ov_ddf)
        # Use the schema the caller passed in rather than the RETURN_SCHEMA global.
        scores_df = functools.reduce(DataFrame.union, rv_ov)\
            .groupby('region_id')\
            .applyInPandas(compute_frag_scores_udf, schema=schema)\
            .toPandas()
        rv_scores.append(scores_df)

    if not rv_scores:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=getattr(schema, 'names', None))

    scores_df = pd.concat(rv_scores)

    return scores_df


def score_matrix_from_mixture_directory(dir_path, result_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    mixture directory or replicate mixture parquets --> score matrix per replicate mixture parquet

    Every entry of ``dir_path`` whose name ends in ``.parquet`` is scored with
    ``score_matrix``. When ``save`` is true, each score matrix is written to
    ``result_path`` as ``<parquet name without extension>.tsv.gz``
    (tab-separated, no index).

    Parameters
    ----------
    dir_path : str
        Directory holding the replicate parquet files.
    result_path : str
        Output directory; created (including parents) if missing.
    pat_cols, region_df, batch_size, schema, verbose
        Forwarded to ``score_matrix``.
    save : bool
        Write each score matrix to disk. When false the matrices are computed
        and discarded.

    Returns
    -------
    None
    '''

    # create result directory
    result_dir_path = result_path

    if not os.path.exists(result_dir_path):
        # makedirs also creates missing parent directories (os.mkdir would fail).
        os.makedirs(result_dir_path, exist_ok=True)
        print("Folder %s created!" % result_dir_path)
    else:
        print("Folder %s already exists" % result_dir_path)

    # given directory path grab all parquet --> load path strings into a list
    # Only *.parquet entries are kept, so stray files (checkpoints, markers, ...)
    # are not handed to the parquet reader; sorted for a reproducible order.
    list_parquet_paths = []
    for path in sorted(get_file_paths(dir_path)):
        if path.endswith('.parquet'):
            list_parquet_paths.append(path)
        elif verbose:
            print(f'-----> Skipping non-parquet entry {path}')

    # for each parquet in the list run score_matrix
    for path in list_parquet_paths:
        if verbose: print(f'-----> Computing score matrix for {path}')
        score_df = score_matrix(parquet_path=path,
                                pat_cols=pat_cols,
                                region_df=region_df,
                                batch_size=batch_size,
                                schema=schema,
                                save=save,
                                verbose=verbose)

        # save each score df for each replicate mixture
        if save:
            file_name = os.path.basename(path)
            file_name_without_ext = os.path.splitext(file_name)[0]
            save_path = os.path.join(result_dir_path, file_name_without_ext + '.tsv.gz')
            print(save_path)
            score_df.to_csv(save_path,
                 sep='\t',
                 index=False)

    print('>>> Mixture score matrices complete. <<<')


def score_matrix_from_experiment_directory(dir_path_to_experiment, result_path, pat_cols, region_df, batch_size, schema, save=False, verbose=False):
    '''
    experiment directory --> score matrices for every mixture directory inside it

    Each sub-directory of ``dir_path_to_experiment`` is treated as a mixture
    directory and processed with ``score_matrix_from_mixture_directory``.
    Results go to ``<result_path>/<experiment dir name>/<mixture dir name>/``.

    Parameters
    ----------
    dir_path_to_experiment : str
        Directory whose sub-directories contain the replicate parquet files.
    result_path : str
        Root output directory (with or without a trailing slash).
    pat_cols, region_df, batch_size, schema, save, verbose
        Forwarded to ``score_matrix_from_mixture_directory``.

    Returns
    -------
    None
    '''

    # create result directory
    dir_name = os.path.basename(os.path.normpath(dir_path_to_experiment))
    # os.path.join works whether or not result_path ends with a separator.
    result_dir_path = os.path.join(result_path, dir_name)

    if not os.path.exists(result_dir_path):
        # makedirs also creates missing parent directories (os.mkdir would fail).
        os.makedirs(result_dir_path, exist_ok=True)
        print("Folder %s created!" % result_dir_path)
    else:
        print("Folder %s already exists" % result_dir_path)

    # given directory path grab all mixture directories containing parquet
    # Plain files at this level are skipped: they cannot be listed as a mixture
    # directory. Sorted for a reproducible processing order.
    list_mixture_dir_paths = [
        path for path in sorted(get_file_paths(dir_path_to_experiment))
        if os.path.isdir(path)
    ]

    # iterate through each mixture directory
    for path in list_mixture_dir_paths:

        if verbose: print(f'--> Computing score matrices for {path}')

        mixture_dir_name = os.path.basename(path)
        # Kept for backward-compatible output paths. Note that splitext drops
        # everything after the last dot of a directory name, if it has one.
        file_name_without_ext = os.path.splitext(mixture_dir_name)[0]
        save_path = result_dir_path + '/' + file_name_without_ext + '/'

        score_matrix_from_mixture_directory(dir_path=path,
                                            result_path=save_path,
                                            pat_cols=pat_cols,
                                            region_df=region_df,
                                            batch_size=batch_size,
                                            schema=schema,
                                            save=save,
                                            verbose=verbose)

    print('>>>>> Experiment score matrices complete. <<<<<')

    

In [15]:
# Number of regions scored per Spark job.
BATCH_SIZE = 20

# Columns read from each mixture parquet file.
PAT_COLS = [
    'molecule_id', 'chr', 'number_molecules',
    'cpg_index_min', 'cpg_index_max', 'pat_string'
]

QUANTILES = [0.1, 0.25, 0.75, 0.9]
# k-mer lengths for the meth / unmeth / total run counts.
KMERS = [1, 3, 4]
# alpha thresholds for the "fraction of molecules with alpha <= / >= rate" metrics.
RATES_LEQ = [0.25]
RATES_GEQ = [0.75]


def _build_return_schema(kmers, rates_leq, rates_geq):
    """
    Build the score-matrix schema from the k-mer and rate settings.

    Field names use the same format strings as the scoring code, so the schema
    follows KMERS / RATES_LEQ / RATES_GEQ instead of repeating their values by
    hand. Order: region_id, number_molecules, then meth/unmeth/total per k,
    then the <= fractions, then the >= fractions.
    """
    schema = StructType()\
        .add('region_id', 'string')\
        .add('number_molecules', 'integer')
    for k in kmers:
        for state in ('meth', 'unmeth', 'total'):
            schema = schema.add('%s_k%i' % (state, k), 'integer')
    for rate in rates_leq:
        schema = schema.add('frac_alpha_leq_%ipct' % (100*rate), 'float')
    for rate in rates_geq:
        schema = schema.add('frac_alpha_geq_%ipct' % (100*rate), 'float')
    return schema


RETURN_SCHEMA = _build_return_schema(KMERS, RATES_LEQ, RATES_GEQ)


compute_frag_scores_udf = compute_frag_scores(cpg_number_cutoff=FILTER_CG_COUNT)


In [18]:
%%time
score_matrix_from_experiment_directory(dir_path_to_experiment = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/', 
                                       result_path = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/methyl_score/', 
                                       pat_cols = PAT_COLS, 
                                       region_df = region_df, 
                                       batch_size = 20, 
                                       schema = RETURN_SCHEMA, 
                                       save=True, 
                                       verbose=True)


Folder /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/methyl_score/mixture already exists
--> Computing score matrices for /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888
Folder /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/methyl_score/mixture/mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888/ already exists
-----> Computing score matrix for /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888/mix0_seed_83723.parquet
----------> Processing batch 0...
----------> Processing batch 1...
----------> Processing batch 2...
----------> Processing batch 3...
----------> Processing batch 4...
----------> Processing batch 5...
----------> Processing batch 6...
----------> Processing batch 7...
----------> Processing batch 8...
----------> Processing batch 9...
----------> Processing batch 10...
----------> Proce

In [19]:
# %%time
# DIR_PATH = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/'+'mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888'
# scores_df_list = score_matrix_from_mixture_directory(dir_path=DIR_PATH, 
#                                                     result_path=RESULT_PATH,
#                                                     pat_cols=PAT_COLS, 
#                                                     region_df=region_df, 
#                                                     batch_size=BATCH_SIZE, 
#                                                     schema=RETURN_SCHEMA, 
#                                                     save=False,
#                                                     verbose=True)

In [None]:
# PCA sanity check