In [1]:
import numpy as np
import pandas as pd

from helpers.cell_type_naming import weird_to_nice
from helpers import creating_mixtures
from helpers.loading_single_cell_cohorts import load_jerby_arnon

In [2]:
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io

# Default renderer for every figure shown in this notebook: the "jupyterlab"
# and "png" renderers, combined with "+".
plotly.io.renderers.default = "jupyterlab+png"

In [3]:
rng = np.random.default_rng(seed=0)

# load & prep data

In [4]:
!gsutil ls gs://liulab/downloaded_manually/derek_csx_tcga_skcm

gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job6_cell_type_sourceGEP.txt
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job6_jerbyarnon_for_cibersortx_sigmatrix_inferred_phenoclasses.CIBERSORTx_Job6_jerbyarnon_for_cibersortx_sigmatrix_inferred_refsample.bm.K999.txt
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_Results.csv
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_Results.html
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_Results.pdf
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_Results.txt
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_error_log.txt
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_jerbyarnon_for_cibersortx_sigmatrix_inferred_phenoclasses.CIBERSORTx_Job7_jerbyarnon_for_cibersortx_sigmatrix_inferred_refsample.bm.K999.txt
gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job7_runtime_log.txt
gs:

## load (real) TCGA SKCM mixtures

In [5]:
# mixtures_tcga_skcm = helpers.datasets.load_tcga_skcm_mixtures()

In [6]:
%time

path = "gs://liulab/downloaded_manually/derek_csx_tcga_skcm/skcm_rnaseqv2_normalized_clean.txt"

mixtures_tcga_skcm = pd.read_csv(
    path,
    sep="\t",
    # nrows=1000,
    index_col=0
)

mixtures_tcga_skcm *= 1_000_000 / mixtures_tcga_skcm.sum()
# mixtures_tcga_skcm = helpers.creating_mixtures.normalize_to_tp100k(mixtures_tcga_skcm)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [7]:
mixtures_tcga_skcm

Unnamed: 0_level_0,TCGA.3N.A9WB.06A.11R.A38C.07,TCGA.3N.A9WC.06A.11R.A38C.07,TCGA.3N.A9WD.06A.11R.A38C.07,TCGA.BF.A1PU.01A.11R.A18S.07,TCGA.BF.A1PV.01A.11R.A18U.07,TCGA.BF.A1PX.01A.12R.A18T.07,TCGA.BF.A1PZ.01A.11R.A18S.07,TCGA.BF.A1Q0.01A.21R.A18S.07,TCGA.BF.A3DJ.01A.11R.A20F.07,TCGA.BF.A3DL.01A.11R.A20F.07,...,TCGA.XV.AB01.06A.12R.A40A.07,TCGA.YD.A89C.06A.11R.A37K.07,TCGA.YD.A9TA.06A.11R.A39D.07,TCGA.YD.A9TB.06A.12R.A40A.07,TCGA.YG.AA3N.01A.11R.A38C.07,TCGA.YG.AA3O.06A.11R.A38C.07,TCGA.YG.AA3P.06A.11R.A38C.07,TCGA.Z2.A8RT.06A.11R.A37K.07,TCGA.Z2.AA3S.06A.11R.A39D.07,TCGA.Z2.AA3V.06A.11R.A39D.07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,19.740875,10.369330,14.257515,8.356213,12.480939,13.381595,21.218028,15.880855,11.437546,15.883137,...,16.366698,8.811148,4.867023,15.167823,11.002386,11.587025,1.617723,17.433731,18.990391,26.051613
A1CF,0.000000,0.000000,0.028019,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.074203,0.000000,0.000000,0.000000,0.000000,0.020087,0.000000,0.000000
A2BP1,0.000000,0.000000,0.252178,0.061521,0.000000,0.000000,0.000000,0.000000,0.000000,0.020444,...,0.000000,0.000000,0.000000,0.000000,0.025433,0.018830,3.184092,0.000000,0.015651,0.023752
A2LD1,12.961332,8.540326,3.840093,7.732546,3.503615,5.900259,3.942612,3.279326,4.576888,4.165400,...,1.813268,4.891449,2.033552,8.392148,4.064287,4.806950,2.411947,1.933933,8.313469,1.843167
A2M,114.462646,8990.935642,721.327094,318.164904,100.181732,1506.774877,160.847095,313.319162,18599.722553,322.278302,...,665.860798,985.532588,682.319652,1460.927230,474.432192,2079.134869,396.044863,1146.847675,219.565726,1674.906699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.376607,0.042661,0.616431,0.000000,0.000000,0.261064,0.061078,0.460366,0.084846,0.000000,...,0.000000,0.020699,0.055655,0.000000,0.025433,0.056486,0.000000,0.100434,0.000000,0.000000
ZYG11B,33.392442,36.665386,33.567535,20.655596,57.074786,23.495951,36.860822,49.088446,46.564736,7.543344,...,5.571194,36.324173,50.514204,58.989190,26.042356,55.377154,44.448874,24.786819,88.553059,2.755252
ZYX,64.044031,190.962875,129.198585,243.114657,295.161620,219.491335,158.803870,531.532912,149.791155,300.282398,...,439.809043,252.882485,203.726404,173.244503,468.914976,174.115119,92.800828,148.479867,116.130025,429.795422
ZZEF1,32.262626,46.946197,43.682628,85.083208,63.164141,65.641810,82.394776,62.370800,65.757418,61.430229,...,55.869623,40.712054,68.527168,61.857779,108.823473,148.827285,71.359575,95.853073,89.147794,51.328428


## generate in silico mixtures

### load single cell cohort (Jerby-Arnon)

In [None]:
%time

sc_data, sc_metadata = load_jerby_arnon(
    # n_genes_if_not_all=1000
)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [None]:
sc_data

In [None]:
sc_metadata

In [None]:
sc_metadata["cell.types"].value_counts(dropna=False)

### load fractions produced by derek

In [None]:
# !gsutil ls gs://liulab/downloaded_manually/derek_csx_tcga_skcm

In [None]:
%time

path = "gs://liulab/downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job8_Results.txt"

csx_fraction_output_tcga_skcm = pd.read_csv(
    path,
    sep="\t",
    # nrows=10,
    index_col=0
)
csx_fraction_output_tcga_skcm.rename(columns=weird_to_nice, inplace=True)
columns = list(sorted(csx_fraction_output_tcga_skcm.columns[:-3]))
csx_fraction_output_tcga_skcm = csx_fraction_output_tcga_skcm[columns]

In [None]:
csx_fraction_output_tcga_skcm

In [None]:
# Bar chart of 50 randomly chosen rows. A fixed `random_state` keeps the same
# rows (and therefore the same figure) on every run of the notebook.
px.bar(csx_fraction_output_tcga_skcm.sample(50, random_state=0))

In [None]:
# QA: every row label should start with "TCGA".
# Collect the offenders explicitly, so that a falsy label (e.g. an empty
# string) is still caught and the failure message shows what went wrong.
non_tcga_names = [
    name
    for name in csx_fraction_output_tcga_skcm.index
    if not name.startswith("TCGA")
]
assert not non_tcga_names, f"row names not starting with 'TCGA': {non_tcga_names[:5]}"

### compute mixtures

In [None]:
# Development convenience: re-execute helpers.creating_mixtures so that edits
# to that module are picked up without restarting the kernel.
import importlib
import helpers
importlib.reload(helpers.creating_mixtures)

In [None]:
# params
rng = np.random.default_rng(seed=0)

In [None]:
%time

mixtures_in_silico, cell_type_geps = creating_mixtures.make_mixtures(
    sc_data,
    sc_metadata,
    sample_fractions=csx_fraction_output_tcga_skcm,
    n_cells_per_gep=5,
    normalization_factor=1_000_000,
    rng=rng
)

In [None]:
mixtures_in_silico

In [None]:
sample_name = "TCGA.3N.A9WB.06A.11R.A38C.07"
cell_type_geps[sample_name]

## data checks

### similar normalization?

In [None]:
mixtures_tcga_skcm.sum()

In [None]:
mixtures_in_silico.sum()

In [None]:
mixtures_tcga_skcm.mean(axis=1).describe()

In [None]:
mixtures_in_silico.mean(axis=1).describe()

### same genes in real & in silico?

Not quite: the two gene sets only partially overlap. Number of rows (genes) in each expression table:
- TCGA SKCM: 20,531
- in silico mixtures (jerby-arnon): 23,686
- overlap: 18,762

In [None]:
mixtures_tcga_skcm.shape

In [None]:
mixtures_in_silico.shape

In [None]:
# Number of distinct row labels present in both the real and the in silico mixtures.
len(set(mixtures_tcga_skcm.index).intersection(mixtures_in_silico.index))

random genes in jerby-arnon, not in TCGA SKCM:

In [None]:
#todo

random genes in TCGA SKCM, not in jerby-arnon:

In [None]:
#todo

# output 1: check distribution (in silico vs real)

## plot summary stats

by gene, in silico vs real

In [None]:
mixtures_tcga_skcm.mean(axis=1)

In [None]:
def prep_for_plots(mixtures):
    """Return the log-scaled mean of each row of ``mixtures``.

    Args:
        mixtures: DataFrame whose rows are averaged across columns.

    Returns:
        Series indexed like ``mixtures``, holding ``log(row_mean + 1)``.
    """
    # Use the argument, not the global `mixtures_tcga_skcm`; otherwise every
    # call returns the same series regardless of what was passed in.
    mean_expression_per_gene = mixtures.mean(axis=1)
    return np.log(mean_expression_per_gene + 1)


# One prep_for_plots() result per mixture set, in the same order as the
# ["tcga", "in silico"] labels passed to create_distplot in the next cell.
series = [
    prep_for_plots(mixtures_tcga_skcm),
    prep_for_plots(mixtures_in_silico),
]

In [None]:
# Overlaid histograms (rug plot disabled) of the two prepared series.
ff.create_distplot(
    hist_data=series,
    group_labels=["tcga", "in silico"],
    show_hist=True,
    show_rug=False,
)

In [None]:
def compute_means(mixtures):
    """Return the log-scaled mean of each row of ``mixtures``.

    Args:
        mixtures: DataFrame whose rows are averaged across columns.

    Returns:
        Series indexed like ``mixtures``, holding ``log(row_mean + 1)``.
    """
    # Use the argument, not the global `mixtures_tcga_skcm`; otherwise the
    # "TCGA SKCM" and "in silico" results are always identical.
    base_means = mixtures.mean(axis=1)
    return np.log(base_means + 1)


# One compute_means() result per mixture set, side by side as named columns.
# Building a DataFrame from Series aligns them on their index labels.
mean_expressions = pd.DataFrame({
    "TCGA SKCM": compute_means(mixtures_tcga_skcm),
    "in silico": compute_means(mixtures_in_silico),
})

In [None]:
compute_means(mixtures_tcga_skcm)

In [None]:
compute_means(mixtures_in_silico)

In [None]:
px.box(compute_means(mixtures_tcga_skcm))

In [None]:
px.box([compute_means(mixtures_tcga_skcm), compute_means(mixtures_in_silico)])

# output 2: estimated cell type fractions

## run cibersortx fractions on in silico mixtures

In [None]:
import tempfile

In [None]:
# TemporaryDirectory() creates a fresh directory and returns a wrapper object;
# the path string itself is available as temp_dir.name.
temp_dir = tempfile.TemporaryDirectory()

# Prints the wrapper object's repr, not the bare path.
print(temp_dir)

In [None]:
from helpers.running_csx import DockerJob, Experiment, InputFile

In [None]:
experiments_root = "gs://liulab/csx_experiments/tcga_skcm_fractions"
experiment = Experiment(experiments_root, "test1")

In [None]:
!tree -h {experiment.local_path}/..

In [None]:
# Input files for the job, keyed by name ("mixture", "sigmatrix").
input_file_args = {
    "mixture": InputFile("mixturestirosh.txt", "gs://liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/mixture_melanoma_Tirosh_SuppFig_3b-d.txt"),
    # NOTE(review): InputFile() is called with no arguments here, unlike the
    # "mixture" entry above — looks unfinished; confirm what it should point to.
    "sigmatrix": InputFile()
}


In [None]:

job = DockerJob(temp_dir, input_file_args, other_args)
    for copy_command in job.make_copy_commands():
        !{copy_command}
    !tree -h {job.path}
    docker_command = job.make_docker_command()
    print(docker_command)
    !{docker_command}
    !tree -h {job.path}
    !gsutil -m rsync -r -d {job.path} {experiment.gcs_uri}


### prep input files

In [None]:
from helpers.csx_docker import save_valid_mixture_file, save_valid_fractions_file

# NOTE(review): `mixtures` and `fractions` are not defined in any cell shown in
# this notebook — presumably they should be `mixtures_in_silico` and
# `csx_fraction_output_tcga_skcm`; confirm before running.
# Both files are written to the current working directory.
with open("mixture.txt", "w") as f:
    save_valid_mixture_file(mixtures, f)
with open("fractions.txt", "w") as f:
    save_valid_fractions_file(fractions, f)

### run cibersortx

In [None]:
# NOTE(review): `run_csx` is not imported or defined in any cell shown in this
# notebook, and path='...' is a placeholder — fill both in before running.
run_csx(path='...')

### load and process outputs

## evaluate results

# output 3: estimated gene expression by cell type

## run cibersortx expression imputation on in silico mixtures

## volcano plot outputs

In [None]:
# for each cell type, scatter plot (effect_size, p_value) for each gene