In [1]:
import glob
import os

import numpy as np
import pandas as pd

from helpers.cell_type_naming import nice_to_weirds, weird_to_nice
from helpers.creating_mixtures import make_mixture
from helpers.loading_single_cell_cohorts import load_tirosh

In [2]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io

# Make "jupyterlab+png" the default renderer for every Plotly figure shown in this notebook.
# NOTE(review): presumably the "+png" part is there so a static image is stored next to the
# interactive JupyterLab view — confirm this is still needed.
plotly.io.renderers.default = "jupyterlab+png"

# Load single-cell data

In [3]:
# load_tirosh (imported from helpers.loading_single_cell_cohorts) returns two objects,
# unpacked here as the per-cell data table and its accompanying metadata table.
# Later cells read the "cells" and "cell.types" columns of sc_metadata.
sc_data, sc_metadata = load_tirosh()

# Display the data table: rows are indexed by GeneSymbol, with one column per cell identifier.
sc_data

cells,cy78_CD45_neg_1_B04_S496_comb,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,CY88_5_B10_S694_comb,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,cy78_CD45_neg_3_H06_S762_comb,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb,cy79_p1_CD45_neg_PDL1_neg_AS_C4_R1_D09_S141_comb,CY88_3_D02_S614_comb,cy79_p1_CD45_neg_PDL1_neg_AS_C4_R1_D06_S138_comb,cy53_1_CD45_neg_C06_S318_comb,...,monika_C11_S119_comb_BCD8_3,CY75_1_CD45_CD8_8__S331_comb_BCD8,CY75_1_CD45_CD8_7__S210_comb_BCD8,CY75_1_CD45_CD8_3__S142_comb_BCD8,CY75_1_CD45_CD8_7__S280_comb_BCD8,CY75_1_CD45_CD8_3__S168_comb_BCD8,CY75_1_CD45_CD8_8__S338_comb_BCD8,monika_D7_S132_comb_BCD8_3,CY75_1_CD45_CD8_8__S289_comb_BCD8,CY75_1_CD45_CD8_8__S351_comb_BCD8
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,8.144184,5.915091,7.243164,6.019502,7.974753,5.557257,4.639058,7.655502,3.554834,8.791371,...,8.116349,5.756490,0.000000,5.258820,3.930454,5.945186,6.661408,6.792322,0.000000,4.820741
ELMO2,2.639232,0.000000,0.732052,3.687956,2.599318,1.691087,2.019346,0.136191,0.000000,1.724650,...,1.580145,0.000000,1.075875,0.000000,5.616211,0.000000,0.000000,0.000000,0.000000,6.166655
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PNMA1,3.656496,0.000000,0.536053,3.041418,4.132741,2.827006,0.000000,2.873420,0.000000,4.787694,...,0.000000,1.407081,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.936591,5.761498,0.000000,0.000000,2.034216,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,5.018011,0.000000,1.491597,0.000000,0.000000,4.510139
SNRPD2,5.374866,0.000000,5.437661,6.240581,5.946754,4.628132,4.648580,4.330415,4.559981,5.664966,...,5.350462,0.000000,5.225853,3.966523,5.537203,6.792582,0.000000,0.000000,3.525944,6.218684
SLC39A6,3.326681,0.000000,1.308011,3.516141,3.868390,1.791189,0.000000,2.948601,4.368279,0.000000,...,0.000000,4.774840,4.881518,5.688656,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CTSC,2.612352,0.000000,2.361768,1.094236,1.934705,0.852798,0.532068,1.904966,0.999278,1.456806,...,1.851199,6.475069,3.449957,0.000000,0.145351,6.871634,1.837540,3.094236,6.253554,3.087463


# Generate in silico mixtures

In [9]:
# params
# Number of in silico samples to generate (passed to make_cohort further down).
n_samples = 100
# Number of single cells drawn per cell type when sampling from sc_metadata.
n_cells_per_gep = 5
# Seeded NumPy generator shared by the sampling steps so that draws are reproducible
# when the cells are run in order from a fresh kernel.
rng = np.random.default_rng(seed=0)

In [30]:
# Draw n_cells_per_gep cell identifiers for every cell type listed in sc_metadata.
# The result maps each "cell.types" value to a list of sampled "cells" identifiers.
#
# Only the "cells" column is selected before iterating, so the sampling never touches the
# grouping column (DataFrameGroupBy.apply on the full group is deprecated in recent pandas).
# Groups are visited in the same sorted-key order as before, so the rng draws are unchanged.
#
# NOTE(review): rng.choice samples WITH replacement by default, so the same cell can be
# drawn more than once for a cell type — confirm this is intended.
sampled_single_cells_per_type = {
    cell_type: list(rng.choice(cells_of_type, n_cells_per_gep))
    for cell_type, cells_of_type in sc_metadata.groupby("cell.types")["cells"]
}

sampled_single_cells_per_type

In [37]:
# For each cell type, pick the columns of its sampled cells and add them up row by row,
# giving one summed profile (indexed like sc_data's rows) per cell type.
summed_profile_by_type = {
    cell_type: sc_data.loc[:, sampled_cells].sum(axis=1)
    for cell_type, sampled_cells in sampled_single_cells_per_type.items()
}

# Place the per-type profiles side by side: one column per cell type.
cell_type_geps = pd.concat(summed_profile_by_type, axis=1)

cell_type_geps

In [None]:
# Build the whole in silico cohort in one call, unpacking three results.
# NOTE(review): make_cohort is not imported in the cells above — only make_mixture is imported
# from helpers.creating_mixtures. Confirm where make_cohort is defined before running this cell.
# NOTE(review): this rebinds cell_type_geps, replacing the value computed in the previous cell.
mixtures, fractions, cell_type_geps = make_cohort(sc_data, sc_metadata, rng, n_samples, n_cells_per_gep)

# Sketch: what the mixture-generation workflow should look like

In [None]:
# Sketch of the intended end-to-end workflow.
# NOTE(review): load_sc, make_geps, make_fractions and cell_types are not defined or imported
# in the visible cells, and 'gs://...' is a placeholder path — this cell cannot run as written.
n_cells_per_gep = 5
n_samples = 100
sc_samples = load_sc('gs://...')

in_silico_sample_geps = make_geps(sc_samples, n_samples, n_cells_per_gep)
in_silico_sample_fractions = make_fractions(cell_types)
# Combine profiles and fractions with a matrix product.
# NOTE(review): assumes the product has one row per gene and one column per sample — confirm
# against make_geps / make_fractions once they exist.
in_silico_sample_mixtures = in_silico_sample_geps @ in_silico_sample_fractions
n_genes = len(in_silico_sample_geps)
# Scale every entry by an independent factor drawn uniformly from [0.9, 1.1).
# The noise matrix is sized with n_genes (the original referenced an undefined name `g`).
multiplicative_noise = rng.uniform(low=0.9, high=1.1, size=(n_genes, n_samples))
in_silico_sample_mixtures *= multiplicative_noise