In [1]:
import logging
import os
from pprint import pprint

import helpers
import helpers.creating_mixtures
import numpy as np
import pandas as pd
from cloudpathlib import AnyPath as Path

In [2]:
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
logger.setLevel("DEBUG")

In [3]:
from importlib import reload

reload(helpers)
reload(helpers.creating_mixtures)

<module 'helpers.creating_mixtures' from '/home/jupyter/deconv/helpers/creating_mixtures.py'>

In [4]:
sc_jerby_arnon, sc_jerby_arnon_metadata = helpers.datasets.load_jerby_arnon(
    "hg19", "tpm"
)

In [5]:
tcga_skcm_fractions_from_csx = helpers.datasets.load_tcga_skcm_fractions_from_csx()

In [6]:
logger.setLevel("DEBUG")

rng = np.random.default_rng(0)
uri_base = Path("gs://liulab/data/pseudobulk_optimization_3")
print(uri_base)

for n_cells in range(1, 21):
    for malignant_from_one_sample in (True, False):
        logger.info(f"{n_cells}, {malignant_from_one_sample}")
        logger.debug("making")
        mixtures, cell_type_geps = helpers.creating_mixtures.make_mixtures(
            sc_data=sc_jerby_arnon,
            sc_metadata=sc_jerby_arnon_metadata,
            sample_fractions=tcga_skcm_fractions_from_csx,
            n_cells_per_gep=n_cells,
            malignant_from_one_sample=malignant_from_one_sample,
            rng=rng,
        )
        logger.debug("reformatting")
        mixtures_out = (
            mixtures.rename_axis(columns="tcga_aliquot_barcode_for_fractions")
            .stack()
            .to_frame(name="tpm")
            .reset_index()
            .astype(
                dict(
                    gene_symbol="category",
                    tcga_aliquot_barcode_for_fractions="category",
                )
            )
        )
        logger.debug("writing mixtures")
        uri_mixtures = (
            uri_base
            / "mixtures"
            / f"n_cells={n_cells}"
            / f"malignant_from_one_sample={malignant_from_one_sample}"
            / "data.parquet"
        )
        mixtures_out.to_parquet(
            str(uri_mixtures),
            engine="pyarrow",
        )
        # logger.debug("writing cell type GEPs")
        # for sample_name, cell_type_gep in cell_type_geps.items():
        #     uri_cell_type_geps = (
        #         uri_base
        #         / "cell_type_geps"
        #         / f"sample_name={sample_name}"
        #         / f"n_cells={n_cells}"
        #         / f"malignant_from_one_sample={malignant_from_one_sample}"
        #         / "data.parquet"
        #     )
        #     cell_type_gep.to_parquet(str(uri_cell_type_geps), engine="pyarrow")

2022-05-26 04:28:40,171 - __main__ - INFO - 1, True
2022-05-26 04:28:40,172 - __main__ - DEBUG - making


gs://liulab/data/pseudobulk_optimization_3


2022-05-26 04:28:59,499 - __main__ - DEBUG - reformatting
2022-05-26 04:29:02,592 - __main__ - DEBUG - writing mixtures
2022-05-26 04:29:04,165 - __main__ - INFO - 1, False
2022-05-26 04:29:04,166 - __main__ - DEBUG - making
2022-05-26 04:29:26,273 - __main__ - DEBUG - reformatting
2022-05-26 04:29:29,198 - __main__ - DEBUG - writing mixtures
2022-05-26 04:29:30,850 - __main__ - INFO - 2, True
2022-05-26 04:29:30,851 - __main__ - DEBUG - making
2022-05-26 04:29:55,817 - __main__ - DEBUG - reformatting
2022-05-26 04:29:58,174 - __main__ - DEBUG - writing mixtures
2022-05-26 04:29:59,884 - __main__ - INFO - 2, False
2022-05-26 04:29:59,885 - __main__ - DEBUG - making
2022-05-26 04:30:11,971 - __main__ - DEBUG - reformatting
2022-05-26 04:30:13,413 - __main__ - DEBUG - writing mixtures
2022-05-26 04:30:15,095 - __main__ - INFO - 3, True
2022-05-26 04:30:15,096 - __main__ - DEBUG - making
2022-05-26 04:30:39,266 - __main__ - DEBUG - reformatting
2022-05-26 04:30:42,149 - __main__ - DEBUG -

In [201]:
pd.read_parquet(str(uri_base / "mixtures"))

Unnamed: 0,gene_symbol,tcga_aliquot_barcode_for_fractions,tpm,n_cells,malignant_from_one_sample
0,A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,40872.028354,1,True
1,A1BG,TCGA-D3-A8GS-06A-12R-A37K-07,0.0,1,True
2,A1BG,TCGA-EE-A182-06A-11R-A18T-07,13962.992095,1,True
3,A1BG,TCGA-ER-A19D-06A-11R-A18S-07,1091.266981,1,True
4,A1BG,TCGA-GN-A265-06A-21R-A18T-07,2244.322433,1,True
5,ACSM5,TCGA-3N-A9WB-06A-11R-A38C-07,163.196815,1,True
6,ACSM5,TCGA-D3-A8GS-06A-12R-A37K-07,0.0,1,True
7,ACSM5,TCGA-EE-A182-06A-11R-A18T-07,422.92542,1,True
8,ACSM5,TCGA-ER-A19D-06A-11R-A18S-07,2362.746966,1,True
9,ACSM5,TCGA-GN-A265-06A-21R-A18T-07,0.0,1,True


In [203]:
pd.read_parquet(str(uri_base / "cell_type_geps")).info()

<class 'pandas.core.frame.DataFrame'>
Index: 119 entries, A1BG to ZNF850
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   B                          119 non-null    float64 
 1   CAF                        119 non-null    float64 
 2   Endothelial                119 non-null    float64 
 3   Macrophage                 119 non-null    float64 
 4   Malignant                  119 non-null    float64 
 5   NK                         119 non-null    float64 
 6   T                          119 non-null    float64 
 7   T CD4                      119 non-null    float64 
 8   T CD8                      119 non-null    float64 
 9   sample_name                119 non-null    category
 10  n_cells                    119 non-null    category
 11  malignant_from_one_sample  119 non-null    category
dtypes: category(3), float64(9)
memory usage: 10.0+ KB


In [195]:
df

Unnamed: 0,gene_symbol,tcga_aliquot_barcode_for_fractions,tpm,n_cells,malignant_from_one_sample
0,A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,189.209107,1,False
1,A1BG,TCGA-D3-A8GS-06A-12R-A37K-07,19637.456039,1,False
2,A1BG,TCGA-EE-A182-06A-11R-A18T-07,1942.381001,1,False
3,A1BG,TCGA-ER-A19D-06A-11R-A18S-07,0.000000,1,False
4,A1BG,TCGA-GN-A265-06A-21R-A18T-07,0.000000,1,False
...,...,...,...,...,...
995,BTNL8,TCGA-3N-A9WB-06A-11R-A38C-07,1.669341,9,True
996,BTNL8,TCGA-D3-A8GS-06A-12R-A37K-07,444.006178,9,True
997,BTNL8,TCGA-EE-A182-06A-11R-A18T-07,231.341338,9,True
998,BTNL8,TCGA-ER-A19D-06A-11R-A18S-07,665.381362,9,True


In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   gene_symbol                         1000 non-null   category
 1   tcga_aliquot_barcode_for_fractions  1000 non-null   category
 2   tpm                                 1000 non-null   float64 
 3   n_cells                             1000 non-null   category
 4   malignant_from_one_sample           1000 non-null   category
dtypes: category(4), float64(1)
memory usage: 12.9 KB


In [7]:
cell_type_geps

{'TCGA-3N-A9WB-06A-11R-A38C-07':                       B         CAF  Endothelial  Macrophage   Malignant  \
 gene_symbol                                                                
 A1BG           3.505570   46.877778     4.295925   30.007080   50.205115   
 A1BG-AS1      18.006135   11.725681     0.637383    0.000000   13.166631   
 A1CF           1.336772    0.835496     1.221744    2.148557    2.162766   
 A2M            0.000000   92.200466   472.781862  273.889255  294.069488   
 A2M-AS1       11.409626    0.000000     3.732920   12.454472    4.409978   
 ...                 ...         ...          ...         ...         ...   
 ZYG11A       121.250688   41.554388    57.971899   62.422994   29.900737   
 ZYG11B       100.145649   74.736406    69.294240   51.277426   40.839420   
 ZYX           47.501153  182.804549   101.383782  239.211125   84.660674   
 ZZEF1         47.148589    7.977692    25.803989   50.017868   21.224151   
 ZZZ3         112.461610   48.145130     7.0