In [109]:
import pandas as pd
import glob
import numpy as np
import itertools
import functools
import os
import regex as re
import random

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import IntegerType, LongType, ArrayType, StringType, DoubleType
from pyspark.sql.functions import udf, explode, broadcast, count, lit, length, col, monotonically_increasing_id
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

# Raise pandas' display limits so wide / long frames are printed without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [3]:
# UPDATE HOME!
os.environ["SPARK_HOME"] = "/home/ec2-user/mambaforge/envs/2023_06_26_SRT_deconvolution_MS/lib/python3.7/site-packages/pyspark"
# THIS needs to be set-up before running the notebook
os.environ["SPARK_LOCAL_DIRS"] = "/temp"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

# Spark configuration for this notebook (resources, scratch dir, parquet options).
spark_conf = SparkConf()
spark_conf.set("spark.ui.showConsoleProgress", "True")
spark_conf.set("spark.executor.instances", "2")
spark_conf.set("spark.executor.cores", "2")
spark_conf.set("spark.executor.memory", "16g")
spark_conf.set("spark.driver.memory", "64g")
spark_conf.set("spark.driver.maxResultSize", "32g")
# The parquet push-down option lives under the `spark.sql.` namespace;
# the previous key `spark.parquet.filterPushdown` was silently ignored.
spark_conf.set("spark.sql.parquet.filterPushdown", "true")
spark_conf.set("spark.local.dir", "/temp")

# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in the
# same kernel raises. Note that if a context already exists it is reused as-is and
# `spark_conf` is NOT re-applied -- restart the kernel to change the configuration.
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)



### Sampling in PySpark
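The `sample()` function takes the fraction of reads to sample, not the number of reads to sample. \
We can compute that fraction from the total number of reads and the desired number of reads to sample. \
Note that `sample()` is approximate: the number of rows returned is only close to `fraction * total reads` (e.g. 4998 and 5030 for a target of 5000 in the test run below). \
Mapping: `(N rows to sample) --> (F fraction to sample)`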
```
N rows to sample = fraction * total reads
fraction = N rows to sample / total reads
```

In [178]:
def mix_celltypes_helper(cell_types, total_reads_to_sample, proportions, seed, parquet_path, result_path, save=False, itr=None):
    ''' Mix reads from different cell types based on given proportion and total reads to sample.

    Arguments:
    cell_types -- list of cell type to mix
    total_reads_to_sample -- integer representing the total number of reads to sample across all cell types
    proportions -- list (or numpy array) of proportions to sample, one per cell type
    seed -- seed for .sample()
    parquet_path -- string of path to the directory with source cell type reads to mix from
    result_path -- string of path to the directory the output parquet is written to
    save -- if True, write the combined mixture to a parquet under result_path
    itr -- mixture iteration for creating multiple mixtures; used as the `M{itr}_`
           file-name prefix (the prefix is omitted when itr is None)

    Output:
    combined_df -- pyspark.sql.dataframe.DataFrame

    Raises:
    ValueError -- if cell_types is empty, proportions does not have one entry per
                  cell type, or a cell type does not hold enough reads for the
                  requested sample (sampling fraction outside [0, 1])
    '''

    if len(cell_types) == 0:
        raise ValueError('cell_types must contain at least one cell type')

    # `list * int` repeats the list instead of scaling it, so always work on an array.
    proportion_array = np.asarray(proportions, dtype=float)
    if proportion_array.shape != (len(cell_types),):
        raise ValueError(
            f'proportions must have one entry per cell type: got shape '
            f'{proportion_array.shape} for {len(cell_types)} cell types'
        )

    print(f'--> seed: {seed}')

    # load the parquet files for selected cell types & count rows
    parquet_df = []
    total_reads_per_celltype = []

    print('--> Load parquet files and count rows...')
    for cell_type in cell_types:
        print(f'----------> Loading cell type: ### {cell_type}')
        df = spark.read.parquet(os.path.join(parquet_path, f'collapsed_reads_{cell_type}/'))
        parquet_df.append(df)
        total_reads_per_celltype.append(df.count())

    total_reads_per_celltype = np.array(total_reads_per_celltype)

    # compute fraction to sample for each cell type:
    #   fraction = N rows to sample / total reads
    n_reads_to_sample = proportion_array * total_reads_to_sample
    with np.errstate(divide='ignore', invalid='ignore'):
        sampling_fraction = n_reads_to_sample / total_reads_per_celltype
    print(f'total_reads_per_celltype: {total_reads_per_celltype}')
    print(f'Sampling fraction: {sampling_fraction}')

    # Sampling without replacement needs a fraction in [0, 1]; fail early with a
    # readable message instead of an error from inside a Spark job.
    invalid = ~np.isfinite(sampling_fraction) | (sampling_fraction < 0) | (sampling_fraction > 1)
    if invalid.any():
        offending = [cell_types[idx] for idx in np.flatnonzero(invalid)]
        raise ValueError(
            f'Cannot sample the requested number of reads for cell type(s) {offending}: '
            f'sampling fraction {sampling_fraction} must lie in [0, 1]'
        )

    # sample reads from each cell type
    sampled_df = []

    print('--> Sample rows for each cell type...')
    for i, (df, frac) in enumerate(zip(parquet_df, sampling_fraction)):
        print(f'----------> Sampling cell type: {i}')
        df_sample = df.sample(False, float(frac), seed)
        sampled_df.append(df_sample)
        # sample() is approximate, so report how many rows were actually drawn
        n_sampled = df_sample.count()
        print(f'----------> {n_sampled}')

    # combine reads
    print('--> Combining sampled reads into one dataframe...')
    combined_df = functools.reduce(DataFrame.union, sampled_df)

    if save:
        # create file name (built from the caller's original proportion values)
        seed_string = str(int(seed))
        celltype_string = '_'.join(cell_types)
        proportion_string = '_'.join(str(p) for p in proportions)
        mixture_itr = f'M{itr}_' if itr is not None else ''
        file_name = (f'{mixture_itr}{celltype_string}'
                     f'_proportions_{proportion_string}'
                     f'_seed_{seed_string}.parquet/')

        print('--> Saving parquet file...')
        save_path = os.path.join(result_path, file_name)
        combined_df.write.mode('overwrite').parquet(save_path)
        print(f'--> Saved to: {save_path}')

    return combined_df


def mix_celltypes(n, cell_types, total_reads_to_sample, proportions, seed, parquet_path, result_path, save=False):
    ''' Create n mixtures of reads from different cell types, each drawn with its own seed.

    Arguments:
    n -- total number of mixtures to make
    cell_types -- list of cell type to mix
    total_reads_to_sample -- integer representing the total number of reads to sample across all cell types
    proportions -- list of proportions to sample for each cell type
    seed -- master seed; the per-mixture seeds passed to .sample() are derived from it
    parquet_path -- string of path to the directory with source cell type reads to mix from
    result_path -- string of path to output parquet file
    save -- if True, each mixture is also written to a parquet under result_path

    Output:
    mixtures -- list of n pyspark.sql.dataframe.DataFrame, one per mixture (in creation order)
    '''

    # Generate seeds (assume we want values between 0 and 1 million).
    # A private generator yields the same sequence as random.seed(seed) followed by
    # random.randint(...), without reseeding the process-wide RNG as a side effect.
    rng = random.Random(seed)
    seeds = [rng.randint(0, 10**6) for _ in range(n)]

    # Create n mixtures
    mixtures = []

    for i, mixture_seed in enumerate(seeds):

        print(f'################ Creating mixture {i}... ################')
        mixture = mix_celltypes_helper(cell_types=cell_types,
                                       total_reads_to_sample=total_reads_to_sample,
                                       proportions=proportions,
                                       seed=mixture_seed,
                                       parquet_path=parquet_path,
                                       result_path=result_path,
                                       save=save,
                                       itr=i)
        mixtures.append(mixture)
        print(' ')

    print('>>> Complete. <<<')
    return mixtures



In [144]:
# def mix_celltypes_index(cell_types, total_reads_to_sample, proportions, seed, parquet_path, result_path, save=False):
#     ''' Mix reads from different cell types based on given proportion and total reads to sample.
    
#     Arguments:
#     cell_types -- list of cell type to mix
#     total_reads_to_sample -- integer representing the total number of reads to sample across all cell types
#     proportions -- list of proportions to sample for each cell type
#     seed -- seed for .sample()
#     parquet_path -- string of path to the directory with source cell type reads to mix from
#     result_path -- string of path to output parquet file
    
#     Output:
#     combined_df -- pyspark.sql.dataframe.DataFrame
#     '''
    
#     # load the parquet files for selected cell types & count rows
#     sampled_df = []
#     i = 0
    
#     for cell_type in cell_types:
#         print(f'--> Loading cell type: ### {cell_type}')
#         df = spark.read.parquet(f'{parquet_path}collapsed_reads_{cell_type}/')
        
#         print(f'--> Sampling cell type: {cell_type}')
#         df = df.withColumn("index", monotonically_increasing_id())
#         df_nrow = df.count()
#         n_sample = round(df_nrow * proportions[i])
#         indices_to_sample_from = list(range(0, df_nrow))
#         indices_sampled = random.sample(indices_to_sample_from, n_sample)
#         df_sampled = df[df.index.isin(indices_sampled)]
        
#         sampled_df.append(df_sampled)
#         i+=1

#     # combine reads
#     print('--> Combining sampled reads into one dataframe...')
#     combined_df = functools.reduce(DataFrame.union, sampled_df)
    
#     if save:
#         # create file name 
#         seed_string = str(int(seed))
#         celltype_string = '_'.join(cell_types)
#         proportion_str = [str(i) for i in proportions]
#         proportion_string = '_'.join(proportion_str)
#         file_name =  celltype_string + \
#                     '_proportions_' + proportion_string + \
#                     f'_seed_{seed_string}' + \
#                     '.parquet/'

#         print('--> Saving parquet file...')
#         save_path = result_path + file_name
#         combined_df.write.mode('overwrite').parquet(save_path)
#         print(f'--> Saved to: {save_path}')
    
#     return(combined_df)


In [179]:
# Parameters for a small test run of the mixing functions.
# number of mixtures to create (passed as `n`)
N=2
# cell types to mix; each is read from `<PARQUET_PATH>collapsed_reads_<cell type>/`
CELLTYPES = ['Blueprint-B', 'Blueprint-CD4']
# total number of reads per mixture, across all cell types
TOTAL_READS_TO_SAMPLE = 10000
# share of TOTAL_READS_TO_SAMPLE drawn from each cell type, in CELLTYPES order
PROPORTION = np.array([0.5, 0.5])
# master seed from which the per-mixture sampling seeds are derived
SEED = 888
# source directory of per-cell-type reads and destination directory for the mixtures
PARQUET_PATH = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture_source/'
RESULT_PATH = '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/'

# Build the N mixtures; save=True also writes each one as a parquet under RESULT_PATH.
test_mixtures = mix_celltypes(n=N,
                             cell_types=CELLTYPES,
                             total_reads_to_sample=TOTAL_READS_TO_SAMPLE, 
                             proportions=PROPORTION, 
                             seed=SEED,
                             parquet_path=PARQUET_PATH,
                             result_path=RESULT_PATH,
                             save=True)

################ Creating mixture 0... ################
--> seed: 83723
--> Load parquet files and count rows...
----------> Loading cell type: ### Blueprint-B
----------> Loading cell type: ### Blueprint-CD4
total_reads_to_sample: [2963964 1965191]
Sampling fraction: [0.00168693 0.00254428]
--> Sample rows for each cell type...
----------> Sampling cell type: 0
----------> 4998
----------> Sampling cell type: 1
----------> 5030
--> Combining sampled reads into one dataframe...
--> Saving parquet file...
--> Saved to: /analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/mixture/M0_Blueprint-B_Blueprint-CD4_proportions_0.5_0.5_seed_83723.parquet/
 
################ Creating mixture 1... ################
--> seed: 452891
--> Load parquet files and count rows...
----------> Loading cell type: ### Blueprint-B
----------> Loading cell type: ### Blueprint-CD4
total_reads_to_sample: [2963964 1965191]
Sampling fraction: [0.00168693 0.00254428]
--> Sample rows for each cell type...

In [180]:
# Display the list of mixture DataFrames returned by mix_celltypes (schema only).
test_mixtures

[DataFrame[sample_id: string, molecule_id: string, chr: string, number_molecules: bigint, cpg_index_min: bigint, cpg_index_max: bigint, pat_string: string, region_id: string, region_cpg_index_min: bigint, region_cpg_index_max: bigint],
 DataFrame[sample_id: string, molecule_id: string, chr: string, number_molecules: bigint, cpg_index_min: bigint, cpg_index_max: bigint, pat_string: string, region_id: string, region_cpg_index_min: bigint, region_cpg_index_max: bigint]]

In [181]:
# Sanity check: Blueprint-CD4 read count x its sampling fraction should be ~5000 (0.5 * 10000).
1965191 * 0.00254428

4999.99615748

In [182]:
# Sanity check: Blueprint-B read count x its sampling fraction should be ~5000 (0.5 * 10000).
2963964 * 0.00168693

4999.99979052

In [183]:
# Preview the first 5 rows of the first mixture.
test_mixtures[0].show(5)

+---------+-----------+-----+----------------+-------------+-------------+----------------+--------------------+--------------------+--------------------+
|sample_id|molecule_id|  chr|number_molecules|cpg_index_min|cpg_index_max|      pat_string|           region_id|region_cpg_index_min|region_cpg_index_max|
+---------+-----------+-----+----------------+-------------+-------------+----------------+--------------------+--------------------+--------------------+
|ERS666930|  410430713|chr16|               1|     21772142|     21772148|         CCC.CCC|Immune_Lymph_B_Na...|            21772140|            21772145|
|ERS523625|  246240980|chr16|               1|     21773396|     21773407|    CCCCCCCCCCCC|Loyfer2022_Prepri...|            21773392|            21773411|
|ERS214675|  225068591|chr15|               1|     21491132|     21491142|     CCCCCCCCCCC|Umbilical_Endothe...|            21491086|            21491152|
|ERS214675|  225068617|chr15|               1|     21491138|     21491

In [184]:
# Preview the first 5 rows of the second mixture.
test_mixtures[1].show(5)

+---------+-----------+-----+----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+
|sample_id|molecule_id|  chr|number_molecules|cpg_index_min|cpg_index_max|          pat_string|           region_id|region_cpg_index_min|region_cpg_index_max|
+---------+-----------+-----+----------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+
|ERS222266|  218899314|chr16|               1|     21773397|     21773413|   TCCCTCCCCCCTCCCCC|Loyfer2022_Prepri...|            21773392|            21773411|
|ERS222206|  319856146|chr16|               1|     21773397|     21773418|CCCCCCCT......CTC...|Loyfer2022_Prepri...|            21773392|            21773411|
|ERS666927|  429426016|chr16|               1|     21773402|     21773416|     CCCCCCTCCCCCTCC|Loyfer2022_Prepri...|            21773392|            21773411|
|ERS337607|  395958219|chr16|               1|

In [185]:
# PCA sanity check