In [1]:
import pandas as pd
import glob
import numpy as np
import itertools
import functools
import os
import regex as re

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

# Raise pandas' display limits so wide/long frames are rendered without truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [16]:
# --- Local paths: everything hangs off the project directory.
ROOT_DIR = '/analysis/gh-msun/projects'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}'
DATA_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}/stage'
SAMPLE_PATH = f'{DATA_DIR}/metadata/samples_wgbs.20230329.tsv'

# --- Parquet inputs (one or more hg38 PAT parquet datasets).
PARQUET_PATH_LIST_HG38 = ['/analysis/hg38_20160816.pat.db_version.parquet']

# --- Regions: name of the panel BED file (without extension).
REGIONS = 'deconvolution_v2.v23_conv.with_cpg_index'

# Column names assigned to the first six columns of the region BED file.
REGION_BED_COLS = [
    'region_chr',
    'region_start',
    'region_end',
    'region_cpg_index_min',
    'region_cpg_index_max',
    'region_id',
]

# Full path to the region BED file for the chosen panel.
REGION_PATH = f'{PROJECT_DIR}/stage/panel_data/{REGIONS}.bed'

# Subset parquets to immune regions

### Load regions we want to subset
Load the regions of interest that the parquet data will be subset to. For example, the ATLAS dataframe used below contains BLUEPRINT immune regions only. The regions to keep must be represented as a set of region IDs named `subset_region_set`.

In [3]:
#-------------- CHANGE HERE FOR DIFFERENT REGION SUBSET ----------------------
# The atlas table supplies the region ids to keep (BLUEPRINT immune regions).
ATLAS_PATH = PROJECT_DIR + '/output/deconv_inhouse_v2.atlas.tsv.gz'
atlas = pd.read_csv(ATLAS_PATH, sep='\t')
subset_region_set = set(atlas['region_id'])
#-----------------------------------------------------------------------------

# Read the first six BED columns of the full panel, then keep only the rows
# whose region_id is in the subset; the cell displays the resulting shape.
region_df = pd.read_csv(
    REGION_PATH,
    sep='\t',
    usecols=range(6),
    names=REGION_BED_COLS,
)
region_df_subset = region_df.loc[region_df['region_id'].isin(subset_region_set)]
region_df_subset.shape

(280, 6)

In [4]:
# Preview the first rows of the region subset.
region_df_subset.head()

Unnamed: 0,region_chr,region_start,region_end,region_cpg_index_min,region_cpg_index_max,region_id
0,chr1,1114771,1114971,20117,20130,Immune_Broad_B-chr1:1114772-1114971
1,chr1,1157450,1157720,21684,21704,Immune_Broad_NK-chr1:1157451-1157720
2,chr1,1157879,1158277,21710,21727,Immune_Broad_NK-chr1:1157880-1158277
14,chr1,6341182,6341377,140667,140682,Immune_Broad_Eosi-chr1:6341183-6341377
19,chr1,9147788,9147871,188605,188609,Immune_Broad_Neutro-chr1:9147789-9147871


### Load parquet file as a pyspark dataframe

In [5]:
## this works for PySpark v3.3.1 - only need to run this once
# Ask spark-submit to pull the AWS SDK and hadoop-aws packages when the JVM starts.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {aws_java},{aws_hadoop} pyspark-shell".format(
    aws_java="com.amazonaws:aws-java-sdk-bundle:1.11.271",
    aws_hadoop="org.apache.hadoop:hadoop-aws:3.1.2",
)
#####

# UPDATE HOME!
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# THIS needs to be set-up before running the notebook
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

spark_conf = SparkConf()
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The documented key lives under `spark.sql.`; the previous
# "spark.parquet.filterPushdown" is not a Spark option and had no effect.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate() reuses a running context, so re-running this cell does not fail
# with "Cannot run multiple SparkContexts at once". An existing context keeps
# the configuration it was started with.
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)



In [6]:
# Columns selected from each PAT parquet dataset.
PAT_COLS = [
    'sample_id',
    'molecule_id',
    'chr',
    'number_molecules',
    'cpg_index_min',
    'cpg_index_max',
    'pat_string',
]

In [7]:
# Read every hg38 PAT parquet dataset, keeping only the PAT columns ...
pat_parquet_files = list(
    map(
        lambda parquet_path: spark.read.parquet(parquet_path).select(*PAT_COLS),
        PARQUET_PATH_LIST_HG38,
    )
)
# ... then stack them into a single Spark DataFrame, matching columns by name.
pat_hg38_ddf = functools.reduce(
    lambda left, right: left.unionByName(right),
    pat_parquet_files,
)
pat_hg38_ddf.printSchema()

root
 |-- sample_id: string (nullable = true)
 |-- molecule_id: string (nullable = true)
 |-- chr: string (nullable = true)
 |-- number_molecules: integer (nullable = true)
 |-- cpg_index_min: long (nullable = true)
 |-- cpg_index_max: long (nullable = true)
 |-- pat_string: string (nullable = true)



In [11]:
# Print the first 5 rows of the combined PAT DataFrame.
pat_hg38_ddf.show(5)

+---------+-----------+----+----------------+-------------+-------------+--------------------+
|sample_id|molecule_id| chr|number_molecules|cpg_index_min|cpg_index_max|          pat_string|
+---------+-----------+----+----------------+-------------+-------------+--------------------+
|ERS337091|          2|chr1|               1|           17|           57|TTCTCCTTTCCCCCTTC...|
|ERS337091|          1|chr1|               1|           19|           58|TTCCCCCTTCTCCTTTC...|
|ERS337091|          3|chr1|               1|           99|          115|   CCCCCCCCCCCCCCCTT|
|ERS337091|          6|chr1|               1|          101|          120|CCTCCCCCCCC....TTCTT|
|ERS337091|          5|chr1|               1|          108|          118|         CCCCCCTCCTC|
+---------+-----------+----+----------------+-------------+-------------+--------------------+
only showing top 5 rows



### Create a dataframe of reads that fall in the subsetted regions

In [9]:
# For each region of interest, keep the reads whose CpG-index span overlaps the
# region's CpG-index span (closed intervals: read_min <= region_max and
# read_max >= region_min), and tag each kept read with the region it overlaps.
# One filter per region is kept on purpose: the comparisons are against
# literals. Values are cast to built-in ints so lit() gets plain Python scalars.
subset_fragment_df_list = [
    pat_hg38_ddf
    .filter(col('cpg_index_min') <= int(region.region_cpg_index_max))
    .filter(col('cpg_index_max') >= int(region.region_cpg_index_min))
    .withColumn('region_id', lit(region.region_id))
    .withColumn('region_cpg_index_min', lit(int(region.region_cpg_index_min)))
    .withColumn('region_cpg_index_max', lit(int(region.region_cpg_index_max)))
    for region in region_df_subset.itertuples(index=False)
]

# Fail with a clear message instead of an IndexError when no region was selected.
if not subset_fragment_df_list:
    raise ValueError(
        'region_df_subset is empty: no region_id in subset_region_set '
        'matches the region BED file'
    )

# Union the per-region results into one Spark DataFrame, matching columns by
# name. A read that overlaps several regions appears once per region.
subset_parquet_df = functools.reduce(DataFrame.unionByName, subset_fragment_df_list)

In [12]:
# Print the first 5 rows of the region-subsetted reads.
subset_parquet_df.show(5)

+---------+-----------+----+----------------+-------------+-------------+----------+--------------------+--------------------+--------------------+
|sample_id|molecule_id| chr|number_molecules|cpg_index_min|cpg_index_max|pat_string|           region_id|region_cpg_index_min|region_cpg_index_max|
+---------+-----------+----+----------------+-------------+-------------+----------+--------------------+--------------------+--------------------+
|ERS337091|      61653|chr1|               1|        20108|        20117|CCCC..CCCC|Immune_Broad_B-ch...|               20117|               20130|
|ERS337091|      61654|chr1|               1|        20111|        20117|   CCCCCCC|Immune_Broad_B-ch...|               20117|               20130|
|ERS337091|      61655|chr1|               1|        20111|        20117|   CCCCCCC|Immune_Broad_B-ch...|               20117|               20130|
|ERS337091|      61659|chr1|               1|        20111|        20117|   CCC.CCC|Immune_Broad_B-ch...|       

In [14]:
# Number of rows (read/region overlaps) per sample, smallest count first.
read_count_by_sample = (
    subset_parquet_df
    .groupBy('sample_id')
    .count()
    .orderBy('count')
)
read_count_by_sample.show(6)

### Create parquet file for each cell type

In [38]:
# `sample_group` values of the reference cell types to keep.
cellType = [
    'Blueprint-B', 'Blueprint-CD4', 'Blueprint-CD8', 'Blueprint-NK',
    'Blueprint-Dend', 'Blueprint-Macro', 'Blueprint-Mono', 'Blueprint-Eosi',
    'Blueprint-Neutro', 'Blueprint-Eryth', 'Blueprint-Mega', 'Eryth-prog',
]

# Sample metadata provides the mapping between sample and cell type.
sample_df = pd.read_csv(SAMPLE_PATH, sep='\t')

# Keep rows from the blueprint_loyfer2022 source whose group is one of the
# cell types above; copy so the result is independent of `sample_df`.
ridxs = sample_df['source'].eq('blueprint_loyfer2022') & sample_df['sample_group'].isin(cellType)
ref_sample_df = sample_df.loc[ridxs].copy()
ref_sample_df.head()

Unnamed: 0,sample_id,patient_id,cell_type,sample_group,age,source,stage_group,tumor_purity
98,GSM5652274,Erythrocyte_progenitors,Erythrocyte progenitors,Eryth-prog,60.0,blueprint_loyfer2022,,
99,GSM5652275,Erythrocyte_progenitors,Erythrocyte progenitors,Eryth-prog,53.0,blueprint_loyfer2022,,
100,GSM5652276,Erythrocyte_progenitors,Erythrocyte progenitors,Eryth-prog,64.0,blueprint_loyfer2022,,
211,ERS661049,BM030613,band form neutrophil,Blueprint-Neutro,65 - 70,blueprint_loyfer2022,,
212,ERS661048,BM030613,neutrophilic metamyelocyte,Blueprint-Neutro,65 - 70,blueprint_loyfer2022,,


In [44]:
# Count the reference samples available for each cell type.
ref_sample_df.groupby('sample_group')['sample_id'].count()

sample_group
Blueprint-B         17
Blueprint-CD4       14
Blueprint-CD8       10
Blueprint-Dend       2
Blueprint-Eosi       2
Blueprint-Eryth      2
Blueprint-Macro     18
Blueprint-Mega       2
Blueprint-Mono       8
Blueprint-NK         4
Blueprint-Neutro    21
Eryth-prog           3
Name: sample_id, dtype: int64

In [None]:
# TODO: try saving the parquet file for just one cell type and time it.
# TODO: count the number of molecules per region, by cell type and per sample.

In [57]:
# Cell type to extract, and the ids of the reference samples in that group.
CELLTYPE = 'Blueprint-B'
samples_by_celltype = ref_sample_df.loc[
    ref_sample_df['sample_group'] == CELLTYPE, 'sample_id'
].tolist()

In [58]:
# Display the sample ids selected for CELLTYPE.
samples_by_celltype

['ERS666927',
 'ERS337605',
 'ERS337607',
 'ERS666931',
 'ERS214672',
 'ERS214675',
 'ERS222206',
 'ERS222208',
 'ERS568736',
 'ERS666930',
 'ERS666929',
 'ERS822885',
 'ERS1022343',
 'ERS222266',
 'ERS523616',
 'ERS523625',
 'ERS763500']

In [61]:
# Restrict the region-subsetted reads to the samples of the selected cell type.
df_celltype = subset_parquet_df.where(
    col('sample_id').isin(samples_by_celltype)
)

In [None]:
# Print the first 5 rows of the reads for the selected cell type.
df_celltype.show(5)