In [1]:
import numpy as np
import pandas as pd

In [2]:
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io

# Use two plotly renderers at once ("+" combines them): the JupyterLab renderer
# plus a PNG one. NOTE(review): presumably the PNG copy is there so figures stay
# visible where the notebook is viewed without running JavaScript — confirm.
plotly.io.renderers.default = "jupyterlab+png"

In [3]:
import helpers

In [4]:
import logging

# Name used to recognise, on a later re-run, the handler this cell installed.
_NOTEBOOK_HANDLER_NAME = "notebook-stream"

handler = logging.StreamHandler()
handler.set_name(_NOTEBOOK_HANDLER_NAME)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)

# Re-running this cell used to attach one more StreamHandler to the root logger
# each time, so every record was printed once per run. Remove the handler left
# behind by a previous run before attaching the fresh one; handlers installed
# by anything else are left alone.
_root_logger = logging.getLogger()
for _stale_handler in [
    h for h in _root_logger.handlers if h.get_name() == _NOTEBOOK_HANDLER_NAME
]:
    _root_logger.removeHandler(_stale_handler)
_root_logger.addHandler(handler)

# Let DEBUG records through for the "helpers" logger hierarchy and for this
# notebook's own logger.
logging.getLogger("helpers").setLevel("DEBUG")
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

In [5]:
# One seeded generator, handed to the helpers.creating_mixtures calls that
# accept an `rng` argument, so their random draws start from a fixed state.
rng = np.random.default_rng(0)

# load data

## TCGA SKCM bulk RNA-seq

In [6]:
# Load the TCGA SKCM expression table and normalize it in one step; the second
# argument is the same 1_000_000 value used for the single-cell data.
mixtures_tcga_skcm = helpers.creating_mixtures.normalize_expression(
    helpers.datasets.load_tcga_skcm(),
    1_000_000,
)

In [7]:
# Preview the normalized TCGA SKCM table (gene_symbol rows, sample_id columns).
mixtures_tcga_skcm

sample_id,TCGA.3N.A9WB.06A.11R.A38C.07,TCGA.3N.A9WC.06A.11R.A38C.07,TCGA.3N.A9WD.06A.11R.A38C.07,TCGA.BF.A1PU.01A.11R.A18S.07,TCGA.BF.A1PV.01A.11R.A18U.07,TCGA.BF.A1PX.01A.12R.A18T.07,TCGA.BF.A1PZ.01A.11R.A18S.07,TCGA.BF.A1Q0.01A.21R.A18S.07,TCGA.BF.A3DJ.01A.11R.A20F.07,TCGA.BF.A3DL.01A.11R.A20F.07,...,TCGA.XV.AB01.06A.12R.A40A.07,TCGA.YD.A89C.06A.11R.A37K.07,TCGA.YD.A9TA.06A.11R.A39D.07,TCGA.YD.A9TB.06A.12R.A40A.07,TCGA.YG.AA3N.01A.11R.A38C.07,TCGA.YG.AA3O.06A.11R.A38C.07,TCGA.YG.AA3P.06A.11R.A38C.07,TCGA.Z2.A8RT.06A.11R.A37K.07,TCGA.Z2.AA3S.06A.11R.A39D.07,TCGA.Z2.AA3V.06A.11R.A39D.07
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,19.740875,10.369330,14.257515,8.356213,12.480939,13.381595,21.218028,15.880855,11.437546,15.883137,...,16.366698,8.811148,4.867023,15.167823,11.002386,11.587025,1.617723,17.433731,18.990391,26.051613
A1CF,0.000000,0.000000,0.028019,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.074203,0.000000,0.000000,0.000000,0.000000,0.020087,0.000000,0.000000
A2BP1,0.000000,0.000000,0.252178,0.061521,0.000000,0.000000,0.000000,0.000000,0.000000,0.020444,...,0.000000,0.000000,0.000000,0.000000,0.025433,0.018830,3.184092,0.000000,0.015651,0.023752
A2LD1,12.961332,8.540326,3.840093,7.732546,3.503615,5.900259,3.942612,3.279326,4.576888,4.165400,...,1.813268,4.891449,2.033552,8.392148,4.064287,4.806950,2.411947,1.933933,8.313469,1.843167
A2M,114.462646,8990.935642,721.327094,318.164904,100.181732,1506.774877,160.847095,313.319162,18599.722553,322.278302,...,665.860798,985.532588,682.319652,1460.927230,474.432192,2079.134869,396.044863,1146.847675,219.565726,1674.906699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYX,64.044031,190.962875,129.198585,243.114657,295.161620,219.491335,158.803870,531.532912,149.791155,300.282398,...,439.809043,252.882485,203.726404,173.244503,468.914976,174.115119,92.800828,148.479867,116.130025,429.795422
ZZEF1,32.262626,46.946197,43.682628,85.083208,63.164141,65.641810,82.394776,62.370800,65.757418,61.430229,...,55.869623,40.712054,68.527168,61.857779,108.823473,148.827285,71.359575,95.853073,89.147794,51.328428
ZZZ3,37.367734,60.447760,30.401313,13.549949,95.485875,24.458629,32.554792,41.569166,47.379281,12.551798,...,8.567026,35.496273,52.777417,42.540575,49.134598,52.402116,48.377633,23.541452,78.927726,3.705338
psiTPTE22,0.146456,0.255952,7.789461,0.123041,0.103210,15.419219,0.045811,0.272809,0.237576,0.306638,...,0.788378,0.041394,0.241164,0.305170,0.915551,0.207124,0.539243,0.281214,0.172162,1.163858


## Jerby-Arnon scRNA-seq

In [8]:
# Load the Jerby-Arnon single-cell expression table together with its per-cell
# metadata table.
sc_data, sc_metadata = helpers.datasets.load_jerby_arnon()
# Rebind sc_data to its normalized version (the un-normalized table is not kept).
# NOTE(review): 1_000_000 is presumably the per-column total to scale to — confirm
# against helpers.creating_mixtures.normalize_expression.
sc_data = helpers.creating_mixtures.normalize_expression(sc_data, 1_000_000)

In [9]:
# Preview the normalized single-cell table (gene_symbol rows, single_cell_id columns).
sc_data

single_cell_id,CY106_CD45neg_CD90neg_10cells_S289,CY106_CD45neg_CD90neg_S291,CY106_CD45neg_CD90neg_S292,CY106_CD45neg_CD90neg_S294,CY106_CD45neg_CD90neg_S297,CY106_CD45neg_CD90neg_S301,CY106_CD45neg_CD90neg_S302,CY106_CD45neg_CD90neg_S308,CY106_CD45neg_CD90neg_S310,CY106_CD45neg_CD90neg_S320,...,monika_E9_S143_comb_BCD8_3,monika_F10_S145_comb_BCD8_3,monika_F1_S144_comb_BCD8_3,monika_F2_S146_comb_BCD8_3,monika_F4_S147_comb_BCD8_3,monika_F5_S148_comb_BCD8_3,monika_F6_S149_comb_BCD8_3,monika_F7_S150_comb_BCD8_3,monika_F8_S151_comb_BCD8_3,monika_F9_S152_comb_BCD8_3
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,109.690164,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,44.715324,0.000000,0.000000
A1BG-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,30.862239,0.000000,0.000000,0.000000,318.405674,333.293439,0.000000,0.000000
A1CF,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.701365,0.000000,0.000000,...,6.686230,195.954807,2.015934,0.000000,2.853006,7.229537,3.031841,5.374203,0.000000,7.835906
A2M,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,60.037219,0.000000,0.000000,0.000000,0.000000,0.000000,537.103146,0.000000
A2M-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,27.910829,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,26.336739,4.878462,0.000000,3.340233,13.913700,0.000000,9.737023,10.733908,4.998714,7.858778,...,134.169044,202.740387,94.258443,117.146827,167.886040,85.855966,96.179055,110.933276,73.842810,217.099927
ZYG11B,0.000000,0.000000,1.539584,3.340233,0.000000,0.175678,0.000000,4.509613,2.414791,10.205666,...,84.379812,138.192466,46.234008,75.325176,96.010499,30.794973,54.030149,70.022617,58.347279,179.323137
ZYX,51.999381,155.751237,41.187557,0.000000,193.098546,0.000000,145.992385,349.589839,0.000000,0.000000,...,0.000000,222.404376,392.647025,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,53.343344,189.588163,0.000000,320.873816,0.000000,0.000000,0.000000,0.000000,0.000000


In [10]:
# Preview the per-cell metadata; its columns include samples, cell.types,
# treatment.group and Cohort.
sc_metadata

Unnamed: 0_level_0,cells,samples,cell.types,treatment.group,Cohort,no.of.genes,no.of.reads
single_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,cy78_CD45_neg_1_B04_S496_comb,Mel78,Malignant,post.treatment,Tirosh,8258,357919
1,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,Mel79,Malignant,treatment.naive,Tirosh,2047,5727
2,CY88_5_B10_S694_comb,Mel88,Malignant,post.treatment,Tirosh,5375,139218
3,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,Mel79,Malignant,treatment.naive,Tirosh,5648,73996
4,cy78_CD45_neg_3_H06_S762_comb,Mel78,Malignant,post.treatment,Tirosh,7409,380341
...,...,...,...,...,...,...,...
7181,CY75_1_CD45_CD8_3__S168_comb_BCD8,Mel75,T CD8,post.treatment,Tirosh,3530,137245
7182,CY75_1_CD45_CD8_8__S338_comb_BCD8,Mel75,T CD8,post.treatment,Tirosh,3872,106432
7183,monika_D7_S132_comb_BCD8_3,Mel75,T CD8,post.treatment,Tirosh,4589,908173
7184,CY75_1_CD45_CD8_8__S289_comb_BCD8,Mel75,T CD8,post.treatment,Tirosh,4614,140903


## pseudobulks from Jerby-Arnon scRNA-seq

### fractions from csx of TCGA SKCM

In [11]:
# Load the cell-type fractions for the TCGA SKCM samples.
# NOTE(review): "csx" presumably stands for CIBERSORTx — confirm in helpers.datasets.
fractions = helpers.datasets.load_tcga_skcm_fractions_from_csx()

In [12]:
# Preview the fractions table (sample_id rows, one column per cell_type).
fractions

cell_type,B,CAF,Endothelial,Macrophage,Malignant,NK,T,T CD4,T CD8
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA.3N.A9WB.06A.11R.A38C.07,0.000000,0.018483,0.004211,0.000000,0.971059,0.000000,0.006247,0.000000,0.000000
TCGA.3N.A9WC.06A.11R.A38C.07,0.035060,0.056463,0.004178,0.103408,0.669623,0.000000,0.000000,0.082836,0.048432
TCGA.3N.A9WD.06A.11R.A38C.07,0.044227,0.105762,0.070834,0.036322,0.703525,0.000000,0.000000,0.039330,0.000000
TCGA.BF.A1PU.01A.11R.A18S.07,0.020758,0.048434,0.015335,0.000000,0.912796,0.002677,0.000000,0.000000,0.000000
TCGA.BF.A1PV.01A.11R.A18U.07,0.003591,0.042464,0.000000,0.000000,0.944365,0.000000,0.000000,0.009580,0.000000
...,...,...,...,...,...,...,...,...,...
TCGA.YG.AA3O.06A.11R.A38C.07,0.007658,0.060723,0.000000,0.026970,0.896399,0.000000,0.008250,0.000000,0.000000
TCGA.YG.AA3P.06A.11R.A38C.07,0.000000,0.000000,0.002483,0.008485,0.969895,0.000000,0.019137,0.000000,0.000000
TCGA.Z2.A8RT.06A.11R.A37K.07,0.029734,0.026755,0.002953,0.044928,0.878121,0.000000,0.000000,0.017508,0.000000
TCGA.Z2.AA3S.06A.11R.A39D.07,0.018830,0.000000,0.000000,0.025155,0.956015,0.000000,0.000000,0.000000,0.000000


### compute pseudobulks

In [13]:
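# Development aid, disabled by default: uncomment the two lines below to pick up
# edits to helpers/creating_mixtures.py without restarting the kernel.
# import importlib
# importlib.reload(helpers.creating_mixtures)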

<module 'helpers.creating_mixtures' from '/home/jupyter/deconv/helpers/creating_mixtures.py'>

In [14]:
# Build the in-silico mixtures from the single-cell data and the loaded
# fractions. The call returns two objects; randomness comes from the shared `rng`.
mixtures_in_silico, cell_type_geps = helpers.creating_mixtures.make_mixtures(
    sc_data,
    sc_metadata,
    fractions,
    n_cells_per_gep=5,
    normalization_factor=1_000_000,
    rng=rng,
)

2022-02-24 22:31:46,748 - helpers.creating_mixtures - DEBUG - using np.random.Generator with BitGenerator state {'state': 35399562948360463058890781895381311971, 'inc': 87136372517582989555478159403783844777}
2022-02-24 22:31:46,751 - helpers.creating_mixtures - DEBUG - randomly chose Mel98 for malignant cells
2022-02-24 22:31:46,798 - helpers.creating_mixtures - DEBUG - randomly chose Mel194 for malignant cells
2022-02-24 22:31:46,848 - helpers.creating_mixtures - DEBUG - randomly chose Mel80 for malignant cells
2022-02-24 22:31:46,895 - helpers.creating_mixtures - DEBUG - randomly chose Mel110 for malignant cells
2022-02-24 22:31:46,930 - helpers.creating_mixtures - DEBUG - randomly chose Mel129pa for malignant cells
2022-02-24 22:31:46,963 - helpers.creating_mixtures - DEBUG - randomly chose Mel53 for malignant cells
2022-02-24 22:31:46,998 - helpers.creating_mixtures - DEBUG - randomly chose Mel112 for malignant cells
2022-02-24 22:31:47,041 - helpers.creating_mixtures - DEBUG - ra

In [15]:
# Compute one set of cell-type GEPs and display it (the result is not stored).
# NOTE(review): this draws from the same `rng` object that is passed to
# make_mixtures, so the table shown here depends on how many draws were made
# before this cell ran — i.e. on execution order.
helpers.creating_mixtures.make_cell_type_geps(sc_data, sc_metadata, rng=rng)

2022-02-24 22:32:03,847 - helpers.creating_mixtures - DEBUG - randomly chose Mel78 for malignant cells


Unnamed: 0_level_0,B,CAF,Endothelial,Macrophage,Malignant,NK,T,T CD4,T CD8
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A1BG,0.000000,62.996894,0.000000,9.771765,112.437307,0.000000,84.316614,0.000000,0.000000
A1BG-AS1,0.000000,22.474877,0.000000,0.000000,18.455507,6.145154,0.000000,28.359123,52.012063
A1CF,0.676039,0.000000,3.719928,0.000000,0.000000,11.644783,0.000000,0.000000,2.392596
A2M,12.167897,128.128101,506.814231,226.973070,0.000000,109.114517,28.360820,0.000000,0.000000
A2M-AS1,99.002692,0.000000,0.000000,29.868196,0.000000,73.467602,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
ZYG11A,116.183851,47.425635,56.806713,21.803064,40.526738,138.686659,101.809121,96.842362,56.959468
ZYG11B,80.253607,138.092907,54.188197,34.721422,74.148140,76.393491,109.585026,59.788379,71.331588
ZYX,2.920403,201.446748,149.652237,206.426965,41.074223,72.210413,52.066820,188.958209,35.498276
ZZEF1,43.866943,16.729796,43.107382,70.237758,50.105194,0.000000,18.170692,82.462693,11.997631


In [16]:
# Group the per-cell metadata rows by their "cell.types" value.
# NOTE(review): `x` is a scratch name; give it a descriptive one if this cell is kept.
x = sc_metadata.groupby("cell.types")

In [17]:
# Show the mapping from each cell type to the sc_metadata index labels in that group.
x.groups

{'B': [1375, 1404, 1655, 1814, 1824, 2318, 2480, 2533, 2746, 2748, 2751, 2753, 2755, 2757, 2758, 2760, 2768, 2769, 2771, 2772, 2773, 2777, 2778, 2782, 2783, 2789, 2791, 2795, 2796, 2800, 2801, 2802, 2805, 2806, 2807, 2810, 2811, 2814, 2818, 2819, 2823, 2825, 2827, 2828, 2829, 2830, 2831, 2833, 2834, 2835, 2838, 2839, 2842, 2844, 2847, 2849, 2852, 2863, 2866, 2869, 2870, 2879, 2880, 2881, 2884, 2885, 2886, 2887, 2889, 2890, 2894, 2895, 2898, 2900, 2901, 2904, 2907, 2908, 2914, 2917, 2920, 2921, 2922, 2924, 2928, 2929, 2931, 2932, 2941, 2945, 2950, 2951, 2952, 2953, 2954, 2955, 2957, 2958, 2959, 2960, ...], 'CAF': [408, 837, 906, 1039, 1082, 1091, 1127, 1141, 2750, 2766, 2767, 2821, 2846, 2851, 2862, 2872, 2882, 2905, 2919, 2938, 2942, 2946, 2962, 2978, 2979, 3062, 3066, 3109, 3116, 3156, 3169, 3171, 3182, 3193, 3203, 3218, 3289, 3321, 3349, 3363, 3392, 3413, 3416, 3419, 3472, 3475, 3494, 3514, 3527, 3565, 3653, 3682, 3701, 3703, 3758, 3799, 3811, 3822, 3897, 4698, 4793, 4903, 4913, 4933