This notebook creates read mixtures from the per-cell-type parquet files produced by notebook `01_combine_subjects_by_celltype.ipynb`.

In [1]:
import functools
import glob
import itertools
import os
import random

import numpy as np
import pandas as pd
import regex as re

from pyspark import SparkConf, SparkContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col
from pyspark.sql.functions import countDistinct
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.types import StructType

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# UPDATE HOME! Point SPARK_HOME at the pyspark package of the active environment.
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# The scratch directory below needs to be set up before running the notebook.
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The Parquet push-down switch lives under the `spark.sql.` namespace;
# the previous key `spark.parquet.filterPushdown` is not a Spark option.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate() keeps this cell re-runnable: constructing a second
# SparkContext in the same kernel raises, whereas getOrCreate() returns the
# context that is already running.
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession.builder.getOrCreate()



### Sampling in PySpark
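The `sample()` function takes the fraction of reads to sample, not the number of reads to sample. \
We can compute the fraction from the total number of reads and the desired number of reads to sample. \
Because `sample()` does not guarantee an exact row count, the number of reads returned is only approximately `N` (e.g. 12479 instead of 12500). \
Mapping: `(N rows to sample) --> (F fraction to sample)`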
```
N rows to sample = fraction * total reads
fraction = N rows to sample / total reads
```

In [6]:
def generate_mixture_dir_name_string(list_celltype_name, list_proportion):
    '''
    Build the proportion/cell-type part of a mixture directory name.

    Every cell type contributes '<percent><name>', where percent is the
    proportion expressed as a whole-number percentage, zero-padded to at least
    two digits. The pieces are joined with '_'.

    Arguments:
    list_celltype_name -- list of cell type names,
                          e.g. ['B', 'CD4', 'CD8', 'NK', 'Mono', 'Neutro']
    list_proportion -- list or np.array of proportions in the same order,
                       e.g. np.array([0.5, 0.5, 0, 0, 0, 0])

    Output:
    dir_name -- string, e.g. '50B_50CD4_00CD8_00NK_00Mono_00Neutro'

    Raises:
    ValueError -- if the two inputs do not have the same length
    '''
    if len(list_celltype_name) != len(list_proportion):
        raise ValueError(
            f'Got {len(list_celltype_name)} cell type names but '
            f'{len(list_proportion)} proportions.'
        )

    # Use the argument rather than a notebook-level global, and round instead
    # of truncating: 0.29 * 100 evaluates to 28.999..., which int() turns into 28.
    percentages = [int(round(float(p) * 100)) for p in list_proportion]
    names = [f"{pct:02d}{name}" for name, pct in zip(list_celltype_name, percentages)]
    return '_'.join(names)


def mix_celltypes_helper(parquet_df, total_reads_per_celltype, cell_types, total_reads_to_sample, proportions, seed, parquet_path, result_path, verbose, save=False, itr=None):
    ''' Mix reads from different cell types based on given proportion and total reads to sample.
    Data is loaded once in mix_celltypes() to avoid loading dataframes repeatedly.

    Arguments:
    parquet_df -- list of dataframes loaded in mix_celltypes(), one per cell type
    total_reads_per_celltype -- row count of each dataframe, in the same order
    cell_types -- list of cell type to mix
    total_reads_to_sample -- integer representing the total number of reads to sample across all cell types
    proportions -- list or np.array of proportions to sample for each cell type
    seed -- seed for .sample()
    parquet_path -- not used by this function; kept so existing callers keep working
    result_path -- directory the mixture parquet is written into when save is True
    verbose -- if True, print progress and the number of rows sampled per cell type
    save -- if True, write the mixture to '<result_path>/mix<itr>_seed_<seed>.parquet/'
    itr -- mixture iteration for creating multiple mixtures (used in the file name)

    Output:
    mixture -- pyspark.sql.dataframe.DataFrame

    Raises:
    ValueError -- if the per-cell-type inputs differ in length or are empty, or
                  if a sampling fraction falls outside [0, 1]
    '''
    if not (len(parquet_df) == len(total_reads_per_celltype) == len(cell_types) == len(proportions)):
        raise ValueError(
            'parquet_df, total_reads_per_celltype, cell_types and proportions '
            'must all have the same length.'
        )
    if len(cell_types) == 0:
        raise ValueError('At least one cell type is required to build a mixture.')

    if verbose: print(f'--> seed: {seed}')

    # compute fraction to sample for each cell type;
    # np.asarray lets plain Python lists work (list * int would repeat the list)
    proportions = np.asarray(proportions, dtype=float)
    total_reads_per_celltype = np.asarray(total_reads_per_celltype)
    n_reads_to_sample = proportions * total_reads_to_sample
    sampling_fraction = n_reads_to_sample / total_reads_per_celltype
    if verbose: print(f'Sampling fraction: {sampling_fraction}')

    # Sampling is done without replacement, so a fraction above 1 means more
    # reads were requested than the cell type has. NaN (0/0) also fails here.
    fraction_ok = (sampling_fraction >= 0) & (sampling_fraction <= 1)
    if not np.all(fraction_ok):
        offenders = [ct for ct, ok in zip(cell_types, fraction_ok) if not ok]
        raise ValueError(
            f'Sampling fraction outside [0, 1] for cell types {offenders}: '
            f'{sampling_fraction}'
        )

    # sample reads from each cell type
    sampled_df = []

    if verbose: print('--> Sample rows for each cell type...')
    for cell_type, df, frac in zip(cell_types, parquet_df, sampling_fraction):
        if verbose: print(f'----------> Sampling cell type: {cell_type}')
        df_sample = df.sample(False, float(frac), seed)
        sampled_df.append(df_sample)
        if verbose:
            # count() launches a Spark job, so only run it when the number is printed
            n_sampled = df_sample.count()
            print(f'----------> {n_sampled}')

    # combine reads
    if verbose: print('--> Combining sampled reads into one dataframe...')
    mixture = functools.reduce(lambda left, right: left.union(right), sampled_df)

    if save:
        # create file name
        file_name = f'mix{itr}_seed_{int(seed)}.parquet/'

        if verbose: print('--> Saving parquet file...')
        # os.path.join tolerates a result_path with or without a trailing slash
        save_path = os.path.join(result_path, file_name)
        mixture.write.mode('overwrite').parquet(save_path)
        if verbose: print(f'--> Saved to: {save_path}')

    return(mixture)


def mix_celltypes(n, cell_types, total_reads_to_sample, proportions, seed, parquet_path, result_path, verbose, save=False, celltype_names=None):
    '''Create n mixtures by mixing reads from different cell types based on given proportion and total reads to sample.
    Calls mix_celltypes_helper() n times.

    Arguments:
    n -- total number of mixtures to make
    cell_types -- list of cell type to mix
    total_reads_to_sample -- integer representing the total number of reads to sample across all cell types
    proportions -- list or np.array of proportions to sample for each cell type
    seed -- master seed; one .sample() seed per mixture is derived from it
    parquet_path -- string of path to the directory with source cell type reads to mix from
    result_path -- string of path to the directory the mixture folder is created in
    verbose -- if True, print progress messages
    save -- if True, each mixture is written to the mixture folder as parquet
    celltype_names -- optional short names (same order as cell_types) used in
                      the mixture folder name. When omitted, the notebook-level
                      CELLTYPES_ABRIDGED_NAME is used if it exists with a
                      matching length, otherwise cell_types itself.

    Output:
    mixtures -- list(pyspark.sql.dataframe.DataFrame)

    Raises:
    ValueError -- if cell_types, proportions and celltype_names differ in length
    '''

    if len(cell_types) != len(proportions):
        raise ValueError(
            f'Got {len(cell_types)} cell types but {len(proportions)} proportions.'
        )

    if celltype_names is None:
        # Earlier versions always read this notebook-level constant; keep using
        # it when present so existing calls produce the same folder name.
        celltype_names = globals().get('CELLTYPES_ABRIDGED_NAME', cell_types)
        if len(celltype_names) != len(cell_types):
            celltype_names = cell_types
    elif len(celltype_names) != len(cell_types):
        raise ValueError(
            f'Got {len(celltype_names)} cell type names but {len(cell_types)} cell types.'
        )

    # Create output directory (named from the arguments, not from globals)
    dir_name = generate_mixture_dir_name_string(celltype_names, proportions)
    dir_name = 'mix_' + dir_name + '_seed_' + str(seed) + '/'
    dir_path = os.path.join(result_path, dir_name)

    if not os.path.exists(dir_path):
        # makedirs also creates missing parent directories
        os.makedirs(dir_path)
        print("Folder %s created!" % dir_path)
    else:
        print("Folder %s already exists" % dir_path)

    # Generate seeds (between 0 and 1 million). A private Random instance gives
    # the same sequence as random.seed(seed) without reseeding the global RNG.
    rng = random.Random(seed)
    seeds = [rng.randint(0, 10**6) for _ in range(n)]

    # Only load data that has nonzero proportion to sample
    nonzero = [(ct, p) for ct, p in zip(cell_types, proportions) if p != 0]
    active_cell_types = [ct for ct, _ in nonzero]
    active_proportions = np.array([p for _, p in nonzero])

    # Load the parquet files for selected cell types & count rows
    parquet_df = []
    total_reads_per_celltype = []

    if verbose: print('--> Load parquet files and count rows...')
    for cell_type in active_cell_types:
        if verbose: print(f'----------> Loading cell type: {cell_type}')
        df = spark.read.parquet(f'{parquet_path}collapsed_reads_{cell_type}/')
        parquet_df.append(df)
        total_reads_per_celltype.append(df.count())

    total_reads_per_celltype = np.array(total_reads_per_celltype)
    print(f'TOTAL_READS: {total_reads_per_celltype}')

    # Create n mixtures
    mixtures = []

    for i in range(n):

        print(f'################ Creating mixture {i}... ################')
        mixture = mix_celltypes_helper(parquet_df=parquet_df,
                                       total_reads_per_celltype=total_reads_per_celltype,
                                       cell_types=active_cell_types,
                                       total_reads_to_sample=total_reads_to_sample,
                                       proportions=active_proportions,
                                       seed=seeds[i],
                                       parquet_path=parquet_path,
                                       result_path=dir_path,
                                       save=save,
                                       itr=i,
                                       verbose=verbose)
        mixtures.append(mixture)
        print(' ')

    print('>>> Complete. <<<')
    return(mixtures)

In [5]:
%%time

# Build one mixture design and replicate it N times.

# Where the per-cell-type source reads live and where mixtures are written
RESULT_PATH = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/'
PARQUET_PATH = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture_source/'

# Mixture design: cell types (full and abridged names) with their proportions
CELLTYPES_ABRIDGED_NAME = ['B', 'CD4', 'CD8', 'NK', 'Mono', 'Neutro']
CELLTYPES = ['Blueprint-B', 'Blueprint-CD4', 'Blueprint-CD8', 'Blueprint-NK', 'Blueprint-Mono', 'Blueprint-Neutro']
PROPORTION = np.array([0.5, 0.5, 0, 0, 0, 0])
TOTAL_READS_TO_SAMPLE = 25000

# Replication: number of mixtures and the master seed
SEED = 888
N = 10

test_mixtures = mix_celltypes(
    N,
    CELLTYPES,
    TOTAL_READS_TO_SAMPLE,
    PROPORTION,
    SEED,
    PARQUET_PATH,
    RESULT_PATH,
    verbose=True,
    save=True,
)

Folder /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888/ already exists
--> Load parquet files and count rows...
----------> Loading cell type: Blueprint-B
----------> Loading cell type: Blueprint-CD4
TOTAL_READS: [2963964 1965191]
################ Creating mixture 0... ################
--> seed: 83723
total_reads_to_sample: [2963964 1965191]
Sampling fraction: [0.00421733 0.0063607 ]
--> Sample rows for each cell type...
----------> Sampling cell type: Blueprint-B
----------> 12479
----------> Sampling cell type: Blueprint-CD4
----------> 12431
--> Combining sampled reads into one dataframe...
--> Saving parquet file...
--> Saved to: /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/mix_50B_50CD4_00CD8_00NK_00Mono_00Neutro_seed_888/mix0_seed_83723.parquet/
 
################ Creating mixture 1... ################
--> seed: 452891
total_reads_to_sample: [2963964 1965191]
Sampling fraction

In [14]:
# Sanity check: number of distinct sample_id values in the first mixture.
# (countDistinct is imported with the other pyspark functions in the top import cell.)
test_mixtures[0].select(countDistinct('sample_id')).show()

+-------------------------+
|count(DISTINCT sample_id)|
+-------------------------+
|                       31|
+-------------------------+



In [15]:
# Sanity check: number of distinct region_id values in the first mixture.
test_mixtures[0].select(countDistinct('region_id')).show()

+-------------------------+
|count(DISTINCT region_id)|
+-------------------------+
|                     1655|
+-------------------------+



In [270]:
# Preview the first five rows of the third mixture (index 2).
test_mixtures[2].show(5)

+---------+-----------+-----+----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+
|sample_id|molecule_id|  chr|number_molecules|cpg_index_min|cpg_index_max|          pat_string|           region_id|region_cpg_index_min|region_cpg_index_max|
+---------+-----------+-----+----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+
|ERS666930|  410443726|chr16|               1|     21773402|     21773416|     CCCCCCCCCCCCCCC|Loyfer2022_Prepri...|            21773392|            21773411|
|ERS763500|  420348869|chr15|               1|     21491080|     21491103|CCCCCCCCCCCCCCCCC...|Umbilical_Endothe...|            21491086|            21491152|
|ERS666930|  410292996|chr16|               1|     21756236|     21756247|        CCCCCCCCCCCC|Neuron_plus_Oligo...|            21756240|            21756264|
|ERS666930|  410293129|chr16|               1|