In [1]:
import pandas as pd
import glob
import numpy as np
import itertools
import functools
import os
import regex as re
import random
import importlib
import sys

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

# UPDATE HOME!
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# THIS needs to be set-up before running the notebook
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

# Spark resource configuration for this notebook.
spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The Parquet push-down switch lives under the "spark.sql." namespace;
# "spark.parquet.filterPushdown" is not a key Spark reads.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate() makes this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises, whereas the builder returns the
# already-running session when there is one.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Wide pandas display so large frames are readable in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)



In [2]:
# Local paths: everything is derived from ROOT_DIR / PROJECT_SLUG / EXPERIMENT_NAME.
EXPERIMENT_NAME = 'TESTING'
ROOT_DIR = '/analysis/gh-msun/'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = f'{ROOT_DIR}projects/{PROJECT_SLUG}/'
EXPERIMENT_DIR = f'{PROJECT_DIR}output/experiment/{EXPERIMENT_NAME}/'

# Custom project scripts; SCRIPT_DIR is put on sys.path so they import as modules.
SCRIPT_DIR = f'{PROJECT_DIR}scripts/'
SCRIPT_MIXTURE = f'{SCRIPT_DIR}create_mixture.py'
SCRIPT_SCORE = f'{SCRIPT_DIR}score_matrix.py'
SCRIPT_DECONVOLUTION = f'{SCRIPT_DIR}deconvolution.py'
sys.path.append(SCRIPT_DIR)

# These imports must come after the sys.path.append above.
import create_mixture as cm
import score_matrix as sm
import deconvolution as dcnv

# pandas display limits
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

In [34]:
# Development helper: re-import the project modules (cm, sm, dcnv) so that
# edits to the scripts on disk take effect without restarting the kernel.
# The cell displays the module returned by the last reload call.
importlib.reload(cm)
importlib.reload(sm)
importlib.reload(dcnv)

<module 'deconvolution' from '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/scripts/deconvolution.py'>

## Create mixture

In [3]:
############################
#   Parameters and paths   # 
############################

# Input parquet directory under the project output folder.
# Experiment-level alternative, kept for reference:
# PARQUET_PATH = EXPERIMENT_DIR + 'mixture_source/'
PARQUET_PATH = f'{PROJECT_DIR}output/mixture_source/'
# Destination directory passed to the mixing step as result_path.
RESULT_PATH = f'{EXPERIMENT_DIR}mixture/'

def punif(p, n):
    """Return the equal share each of ``n`` components receives of ``1 - p``.

    Parameters
    ----------
    p : float
        Proportion already assigned to the component of interest.
    n : int
        Number of remaining components that split the rest evenly.

    Returns
    -------
    float
        ``(1 - p) / n``.
    """
    return (1 - p) / n


N = 3
SEED = 888
TOTAL_READS_TO_SAMPLE = 25000
CELLTYPES = ['Blueprint-B', 'Blueprint-CD4', 'Blueprint-CD8', 'Blueprint-NK', 'Blueprint-Mono', 'Blueprint-Neutro']
CELLTYPES_ABRIDGED_NAME = ['B', 'CD4', 'CD8', 'NK', 'Mono', 'Neutro']

# The first cell type gets the titrated proportion; the remaining k cell types
# share the rest uniformly. Deriving k from CELLTYPES keeps the proportion
# vectors the right length if the cell-type list is edited.
k = len(CELLTYPES) - 1
p1, p2, p3 = 0.1, 0.01, 0.001
p1_, p2_, p3_ = punif(p1, k), punif(p2, k), punif(p3, k)

# One proportion vector per titration level: [target, background * k].
PROPORTIONS = [
    np.array([target] + [background] * k)
    for target, background in ((p1, p1_), (p2, p2_), (p3, p3_))
]

# Sanity checks: names line up and each vector is a valid composition.
assert len(CELLTYPES) == len(CELLTYPES_ABRIDGED_NAME), "abridged names must match CELLTYPES"
assert all(len(prop) == len(CELLTYPES) for prop in PROPORTIONS), "one proportion per cell type"
assert all(np.isclose(prop.sum(), 1.0) for prop in PROPORTIONS), "proportions must sum to 1"

In [4]:
%%time
# Step 1: load the parquet files of every cell type and count their rows.
parquet_df, total_reads_per_celltype = cm.load_parquet_dataframe(
    parquet_path=PARQUET_PATH,
    cell_types=CELLTYPES,
    spark=spark,
    verbose=True,
)

# Step 2: mix the cell types once per proportion vector in PROPORTIONS
# (n=N mixtures each) and save the results under RESULT_PATH.
cm.mix_celltypes_multiple_proportions(
    parquet_df=parquet_df,
    total_reads_per_celltype=total_reads_per_celltype,
    n=N,
    cell_types=CELLTYPES,
    cell_type_abridged_name=CELLTYPES_ABRIDGED_NAME,
    total_reads_to_sample=TOTAL_READS_TO_SAMPLE,
    list_of_proportions=PROPORTIONS,
    seed=SEED,
    result_path=RESULT_PATH,
    spark=spark,
    verbose=False,
    save=True,
)

>>> Load parquet files and count rows... <<<
----------> Loading cell type: Blueprint-B
----------> Loading cell type: Blueprint-CD4
----------> Loading cell type: Blueprint-CD8
----------> Loading cell type: Blueprint-NK
----------> Loading cell type: Blueprint-Mono
----------> Loading cell type: Blueprint-Neutro
>>> Complete. <<< 

>>> Start mixing... <<<
--> PROPORTION: [0.1  0.18 0.18 0.18 0.18 0.18]
----------> Creating mixture 0... 
----------> Creating mixture 1... 
----------> Creating mixture 2... 
--> PROPORTION: [0.01  0.198 0.198 0.198 0.198 0.198]
----------> Creating mixture 0... 
----------> Creating mixture 1... 
----------> Creating mixture 2... 
--> PROPORTION: [0.001  0.1998 0.1998 0.1998 0.1998 0.1998]
----------> Creating mixture 0... 
----------> Creating mixture 1... 
----------> Creating mixture 2... 
>>> Complete. <<< 



## Compute score matrix

In [39]:
############################
#   Parameters and paths   # 
############################

REGIONS = 'deconvolution_v2.v23_conv.with_cpg_index'
# Column names applied to the first six columns of the region BED file.
REGION_BED_COLS = [
    'region_chr', 'region_start', 'region_end',
    'region_cpg_index_min', 'region_cpg_index_max', 'region_id'
]

# Regions
# PROJECT_DIR already ends with '/', so sub-paths are appended without a
# leading slash (avoids '//' in the result). Only the suffix is formatted, so
# braces in PROJECT_DIR could never be misread as format fields.
REGION_PATH = PROJECT_DIR + f'stage/panel_data/{REGIONS}.bed'

# CpG map; genomic coordinate to CpG index;
CPG_MAP_PATH = PROJECT_DIR + 'stage/cpg_loci/cpg_loci_hg19.combined_annot.tsv.gz'

#--- Where to store results
PATH_TO_MIXTURE_DIR = EXPERIMENT_DIR + 'mixture/'
# NOTE: this rebinds RESULT_PATH, which the mixture section set to
# EXPERIMENT_DIR + 'mixture/'. Re-run this cell before scoring if the
# mixture parameter cell was executed afterwards.
RESULT_PATH = EXPERIMENT_DIR


In [6]:
# Read only the coordinate and CpG-index columns of the tab-separated CpG map.
cpg_map = pd.read_csv(
    CPG_MAP_PATH,
    sep='\t',
    usecols=['chr', 'start', 'end', 'cpg_index', 'cpg_index_hg38'],
)

In [7]:
%%time
# Rows of the CpG map that have an hg38 index.
ridxs = cpg_map['cpg_index_hg38'].notna()

# Select each column once instead of boolean-filtering the whole frame for
# every use; the hg38 index is cast to int after the NaN rows are dropped.
hg19_index = cpg_map.loc[ridxs, 'cpg_index']
hg38_index = cpg_map.loc[ridxs, 'cpg_index_hg38'].astype(int)

# Both series come from the same mask and so have equal length: plain zip is
# the right pairing (zip_longest would only matter for unequal lengths, where
# it would silently insert None keys/values).
hg19_hg38_map = dict(zip(hg19_index, hg38_index))
hg38_hg19_map = dict(zip(hg38_index, hg19_index))

CPU times: user 15.9 s, sys: 23.2 s, total: 39.1 s
Wall time: 39.5 s


In [8]:
# Load the first six columns of the region BED file under REGION_BED_COLS names.
region_df = pd.read_csv(REGION_PATH, sep='\t', names=REGION_BED_COLS, usecols=range(6))

# Shift the max CpG index down by one (presumably turning an exclusive bound
# into an inclusive one -- TODO confirm), then order regions by their min index.
region_df = (
    region_df
    .assign(region_cpg_index_max=region_df['region_cpg_index_max'] - 1)
    .sort_values('region_cpg_index_min')
)

# Translate both hg19 CpG index bounds through hg19_hg38_map; indices absent
# from the map become NaN.
region_df = region_df.assign(
    region_cpg_index_min_hg38=region_df['region_cpg_index_min'].map(hg19_hg38_map),
    region_cpg_index_max_hg38=region_df['region_cpg_index_max'].map(hg19_hg38_map),
)

region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [9]:
# Keep only regions whose min AND max CpG index both have an hg38 value.
ridxs = (
    region_df[['region_cpg_index_min_hg38', 'region_cpg_index_max_hg38']]
    .notna()
    .all(axis=1)
)
region_df = region_df[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1658, 1658)

In [10]:
# Minimum number of CpGs a region must span to be kept. Defined here because
# this is its first use; a later cell assigns the same value, but relying on
# that would raise NameError when the notebook is run top to bottom.
FILTER_CG_COUNT_REGION = 1

# CpG count per region in each coordinate system (bounds are inclusive).
cg_count_hg19 = region_df['region_cpg_index_max'] - region_df['region_cpg_index_min'] + 1
cg_count_hg38 = region_df['region_cpg_index_max_hg38'] - region_df['region_cpg_index_min_hg38'] + 1

# Keep regions whose CpG count is the same in both systems and meets the minimum.
ridxs = (cg_count_hg19 == cg_count_hg38)
ridxs &= (cg_count_hg19 >= FILTER_CG_COUNT_REGION)
region_df = region_df[ridxs].copy()
region_df.shape[0], region_df['region_id'].nunique()

(1657, 1657)

In [11]:
# The hg38 bounds were produced by a mapping that can yield NaN; cast them to
# integers now that the unmapped rows have been filtered out.
for hg38_col in ('region_cpg_index_min_hg38', 'region_cpg_index_max_hg38'):
    region_df[hg38_col] = region_df[hg38_col].astype(int)

In [13]:
### Restrict to immune regions
#-------------- CHANGE HERE FOR DIFFERENT REGION SUBSET ----------------------
# BLUEPRINT immune regions
# PROJECT_DIR already ends with '/'; no leading slash and no f-prefix needed
# (this matches how the same atlas path is built in the Deconvolution section).
ATLAS_PATH = PROJECT_DIR + 'output/reference/deconv_inhouse_v2.atlas.tsv.gz'
atlas = pd.read_csv(ATLAS_PATH, sep='\t')
subset_region_set = set(atlas['region_id'])
#-----------------------------------------------------------------------------

# filter regions down to regions of interest
region_df = region_df[region_df['region_id'].isin(subset_region_set)]
region_df.head()

Unnamed: 0,region_chr,region_start,region_end,region_cpg_index_min,region_cpg_index_max,region_id,region_cpg_index_min_hg38,region_cpg_index_max_hg38
0,chr1,1114771,1114971,20117,20129,Immune_Broad_B-chr1:1114772-1114971,21119,21131
1,chr1,1157450,1157720,21684,21703,Immune_Broad_NK-chr1:1157451-1157720,22686,22705
2,chr1,1157879,1158277,21710,21726,Immune_Broad_NK-chr1:1157880-1158277,22712,22728
14,chr1,6341182,6341377,140667,140681,Immune_Broad_Eosi-chr1:6341183-6341377,142368,142382
19,chr1,9147788,9147871,188605,188608,Immune_Broad_Neutro-chr1:9147789-9147871,190307,190310


In [None]:
%%time

# Thresholds: FILTER_CG_COUNT is handed to the fragment scorer as its CpG
# cutoff; FILTER_CG_COUNT_REGION is the per-region minimum CpG count.
FILTER_CG_COUNT = 3
FILTER_CG_COUNT_REGION = 1

# Column names passed to the scoring step as pat_cols.
PAT_COLS = [
    'molecule_id',
    'chr',
    'number_molecules',
    'cpg_index_min',
    'cpg_index_max',
    'pat_string',
]

QUANTILES = [0.1, 0.25, 0.75, 0.9]
KMERS = [1, 3, 4]
RATES_LEQ = [0.25]
RATES_GEQ = [0.75]

# Output schema of the scoring UDF: per-region molecule count, then
# meth/unmeth/total counts for each k in KMERS, then the two alpha fractions.
RETURN_SCHEMA = (
    StructType()
    .add('region_id', 'string')
    .add('number_molecules', 'integer')
    .add('meth_k1', 'integer')
    .add('unmeth_k1', 'integer')
    .add('total_k1', 'integer')
    .add('meth_k3', 'integer')
    .add('unmeth_k3', 'integer')
    .add('total_k3', 'integer')
    .add('meth_k4', 'integer')
    .add('unmeth_k4', 'integer')
    .add('total_k4', 'integer')
    .add('frac_alpha_leq_25pct', 'float')
    .add('frac_alpha_geq_75pct', 'float')
)

compute_frag_scores_udf = sm.compute_frag_scores(
    cpg_number_cutoff=FILTER_CG_COUNT,
    schema=RETURN_SCHEMA,
    kmers=KMERS,
    rates_leq=RATES_LEQ,
    rates_geq=RATES_GEQ,
)

# compute methyl score for all parquet files in the mixture directory
sm.score_matrix_from_mixture_directory(
    path_to_mixture_dir=PATH_TO_MIXTURE_DIR,
    result_path=RESULT_PATH,
    pat_cols=PAT_COLS,
    region_df=region_df,
    batch_size=20,
    schema=RETURN_SCHEMA,
    spark=spark,
    compute_frag_scores_udf=compute_frag_scores_udf,
    save=True,
    verbose=False,
)

>>> Start computing score matrices <<< 

--> E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO
--------> Computing score matrix for mix0_seed512070


In [36]:
# Sanity check: display the directory the score matrices are written to.
RESULT_PATH 

'/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/experiment/BLUEPRINT_B/'

## Deconvolution

In [None]:
############################
#   Parameters and paths   # 
############################

# Name of the score passed to the deconvolution step as score_type.
SCORE_VAR = 'frac_alpha_leq_25pct'

# Reference matrix
ATLAS_PATH = f'{PROJECT_DIR}output/reference/deconv_inhouse_v2.atlas.tsv.gz'
REGION_PATH = f'{PROJECT_DIR}output/reference/deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'

# Methylation score matrix
SCORE_DIR = f'{EXPERIMENT_DIR}methyl_score/'

# Example of a single score file inside SCORE_DIR, kept for reference:
# SCORE_PATH = (
#     SCORE_DIR + \
#     'E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO/' + \
#     'mix0_seed512070.tsv.gz'
# )

In [None]:
# Load the reference atlas, index it by region and label the column axis.
atlas = (
    pd.read_csv(ATLAS_PATH, sep='\t', index_col=None)
    .set_index('region_id')
    .rename_axis('ref_celltype', axis='columns')
)
atlas.shape

In [None]:
# Region table read from REGION_PATH (tab-separated); show its row count.
ref_region_df = pd.read_csv(REGION_PATH, sep='\t')
len(ref_region_df)

In [None]:
# Restrict (and order) the atlas rows to the region ids listed in
# ref_region_df; .loc raises KeyError if any id is missing from the atlas.
reference_region_ids = ref_region_df['region_id']
atlas = atlas.loc[reference_region_ids].copy()
atlas.shape

In [None]:
%%time
# Run the deconvolution over the score files under SCORE_DIR, using the
# SCORE_VAR score and the reference atlas.
testing = dcnv.compute_deconvolution_methyl_score_dir(
    path_to_methyl_score_dir=SCORE_DIR,
    score_type=SCORE_VAR,
    atlas=atlas,
    match=False,
)
