In [None]:
import pandas as pd
import glob
import numpy as np
import itertools
import functools
import os
import regex as re
import random

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# UPDATE HOME!
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# THIS needs to be set-up before running the notebook
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
spark_conf.set("spark.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")
spark_conf.getAll()

sc = SparkContext(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

In [3]:
# Local paths
EXPERIMENT_NAME = 'TESTING'
ROOT_DIR = '/analysis/gh-msun/'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = ROOT_DIR + f'projects/{PROJECT_SLUG}/'
EXPERIMENT_DIR = PROJECT_DIR + f'output/experiment/{EXPERIMENT_NAME}/'

# Load all custom scripts
SCRIPT_DIR = ROOT_DIR + 'scripts/'
SCRIPT_SCORE = SCRIPT_DIR + 'create_mixture.py'
SCRIPT_SCORE = SCRIPT_DIR + 'score_matrix.py'
SCRIPT_DECONVOLUTION = SCRIPT_DIR + 'deconvolution.py'

import SCRIPT_SCORE


ModuleNotFoundError: No module named 'SCRIPT_SCORE'

In [None]:
# create experiment directory if it doesn't already exist

## Create mixture

In [None]:
############################
#   Parameters and paths   # 
############################

PARQUET_PATH = EXPERIMENT_DIR + 'mixture_source/'
RESULT_PATH = EXPERIMENT_DIR + 'mixture/'

## Compute score matrix

In [None]:
############################
#   Parameters and paths   # 
############################

FILTER_CG_COUNT = 3
FILTER_CG_COUNT_REGION = 1

REGIONS = 'deconvolution_v2.v23_conv.with_cpg_index'
REGION_BED_COLS = [
    'region_chr', 'region_start', 'region_end', 
    'region_cpg_index_min', 'region_cpg_index_max', 'region_id'
]

# Regions
REGION_PATH = (
    PROJECT_DIR + '/stage/panel_data/{regions}.bed'
).format(regions=REGIONS)

# CpG map; genomic coordinate to CpG index;
CPG_MAP_PATH = PROJECT_DIR + '/stage/cpg_loci/cpg_loci_hg19.combined_annot.tsv.gz'

#--- Where to store results
RESULT_PATH = EXPERIMENT_DIR + 'methyl_score/'

## Deconvolution

In [None]:
############################
#   Parameters and paths   # 
############################

SCORE_VAR = 'frac_alpha_leq_25pct'

# Reference matrix
ATLAS_PATH = PROJECT_DIR + 'output/reference/deconv_inhouse_v2.atlas.tsv.gz'
REGION_PATH = PROJECT_DIR + 'output/reference/deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'

# Methylation score matrix
SCORE_DIR = EXPERIMENT_DIR + 'methyl_score/'

# SCORE_PATH = (
#     SCORE_DIR + \
#     'E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO/' + \
#     'mix0_seed512070.tsv.gz'
# )