In [1]:
import os
import subprocess

import pandas as pd

from helpers.cell_type_naming import nice_to_weirds, weird_to_nice

In [2]:
import logging

logging.getLogger('root').addHandler(logging.NullHandler())

# configure a handler: timestamp, logger name, level, then the message
format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_str))

# make a logger for this notebook
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack a second StreamHandler and print every
# message twice. Drop handlers left over from a previous run first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(handler)

In [3]:
logger.debug("reading single cells used for sigmat generation")
# Tab-separated file; the first column becomes the row index.
sc_refsample_uri = "gs://liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt"
read_options = dict(sep="\t", index_col=0)
# For quick debugging, add nrows=1000 or skiprows=lambda i: i % 500 to read_options.
sc_refsample = pd.read_csv(sc_refsample_uri, **read_options)

2021-08-10 22:38:07,503 - __main__ - DEBUG - reading single cells used for sigmat generation


In [4]:
logger.debug("loading original column names (which have duplicates so pandas messes them up)")
from google.cloud import storage

# Re-read only the header line of the same file that was loaded into sc_refsample.
sc_refsample_blob_name = "csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt"
client = storage.Client()
bucket = client.bucket("liulab")
blob = bucket.get_blob(sc_refsample_blob_name)
if blob is None:
    # get_blob returns None (it does not raise) when the object is missing;
    # fail here with a clear message instead of an AttributeError on blob.open.
    raise FileNotFoundError(f"gs://liulab/{sc_refsample_blob_name} does not exist")
with blob.open("rt") as f:
    # Remove only the line terminator. A bare strip() would also remove
    # leading/trailing tabs, dropping empty header fields and shifting the
    # names relative to the data columns.
    original_column_names = f.readline().rstrip("\r\n").split("\t")

logger.debug("replacing column names with good ones")
# Names without an entry in weird_to_nice are kept unchanged.
good_column_names = [weird_to_nice.get(column, column) for column in original_column_names]
sc_refsample.columns = good_column_names[1:]  # exclude index column name

sc_refsample

2021-08-10 22:38:12,914 - __main__ - DEBUG - loading original column names (which have duplicates so pandas messes them up)
2021-08-10 22:38:13,307 - __main__ - DEBUG - replacing column names with good ones


Unnamed: 0_level_0,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,...,B,B,B,B,B,B,B,B,B,B
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,135.788203,153.354101,296.923889,283.818688,313.256767,323.528192,292.029073,320.594740,148.106568,168.636169,...,222.851783,620.452365,308.307928,969.846904,214.314251,418.579389,263.184131,327.511485,321.778349,840.663649
ELMO2,0.000000,13.149704,0.357997,5.159017,0.000000,0.927005,4.695806,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.274004,0.000000,7.250034,9.348143,0.886994,0.000000,0.000000
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PNMA1,1.780029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.870904,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.345004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
SNRPD2,56.004346,25.364048,82.975699,51.746373,171.888728,91.655197,71.564901,66.761786,82.349405,137.131639,...,0.000000,0.000000,69.063930,0.000000,0.000000,23.430306,45.850742,65.312089,28.601533,5.083915
SLC39A6,30.710748,17.722485,17.310509,6.795816,12.853705,7.719830,28.671377,2.438073,6.099906,7.051737,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.554878,0.000000,0.000000,0.000000
CTSC,15.550467,4.543127,7.130252,15.113522,5.018901,1.876072,22.271624,21.019350,6.805007,14.943539,...,1.770987,0.859997,0.403997,7.064024,3.321104,0.477000,1.457004,6.552739,1.189043,2.990031


In [5]:
# Collect whatever iterating nice_to_weirds yields (the cell type names).
cell_types = [*nice_to_weirds]
cell_types

['Malignant', 'Endothelial', 'CAF', 'T CD8', 'NK', 'Macrophage', 'T CD4', 'B']

In [6]:
# List every regular file under the mounted derek/ folder whose name contains
# "0.5sd", with a long (ls -l) listing for each match.
!find /mnt/buckets/liulab/derek/ -type f -name "*0.5sd*" -exec ls -l {} \;

-rw-r--r-- 1 jupyter jupyter 28141097 Jun 25 20:59 /mnt/buckets/liulab/derek/simulations/experiments/cibersortx_sim_0.5sd.txt
-rw-r--r-- 1 jupyter jupyter 14111 Jun 25 20:58 /mnt/buckets/liulab/derek/simulations/experiments/generate_cohorts_for_emma/ctp_sim_0.5sd.txt
-rw-r--r-- 1 jupyter jupyter 28125676 Jun 25 20:58 /mnt/buckets/liulab/derek/simulations/experiments/generate_cohorts_for_emma/sim_0.5sd.txt


In [7]:
# WARNING: destructive. Recursively deletes everything under the experiment
# prefix in the bucket, including results of previous runs.
!gsutil -m rm -r gs://liulab/csx_experiments_excluding_cell_types

Removing gs://liulab/csx_experiments_excluding_cell_types/without_B/cibersortx_sim_0.5sd.txt#1628635040307224...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_B/sc_refsample_modified.txt#1628635039148273...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_CAF/cibersortx_sim_0.5sd.txt#1628635039089927...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_CAF/sc_refsample_modified.txt#1628635037837056...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/cibersortx_sim_0.5sd.txt#1628635038781584...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/sc_refsample_modified.txt#1628635037606228...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Macrophage/cibersortx_sim_0.5sd.txt#1628635039851447...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Macrophage/sc_refsample_modified.txt#1628635038621747...
Removing gs://liulab/csx_experiments_excluding_cel

In [8]:
# For each cell type, write a copy of the single-cell reference with that cell
# type's columns removed, and copy the shared mixture file next to it.
mixture_source = "gs://liulab/derek/simulations/experiments/cibersortx_sim_0.5sd.txt"
gsutil_calls = list()

for cell_type in cell_types:
    logger.debug(f"exluding {cell_type}")
    # Columns are selected by position rather than by label.
    # NOTE(review): this is a substring test, so a cell type whose name occurs
    # inside another column label would remove those columns too. Confirm
    # that the labels never overlap.
    desired_column_positions = [
        i for (i, column_name) in enumerate(sc_refsample.columns)
        if cell_type not in column_name]
    logger.debug(f"limiting to {len(desired_column_positions)} single cells")
    base_path = f"gs://liulab/csx_experiments_excluding_cell_types/without_{cell_type}".replace(" ", "_")
    filepath_sc_refsample = os.path.join(base_path, "sc_refsample_modified.txt")
    logger.debug(f"saving to {filepath_sc_refsample}")
    sc_refsample.iloc[:, desired_column_positions].to_csv(filepath_sc_refsample, sep="\t")
    filepath_mixture = os.path.join(base_path, "cibersortx_sim_0.5sd.txt")
    logger.debug(f"copying mixture file to {filepath_mixture}")
    # An argument list (no shell) avoids quoting problems. Popen returns
    # immediately, so the copy runs while the next reference file is written.
    copy_mixture_command = ["gsutil", "cp", mixture_source, filepath_mixture]
    gsutil_calls.append(subprocess.Popen(copy_mixture_command))

# Wait for every background copy, then fail loudly if any exited non-zero so a
# missing mixture file is noticed here rather than when CIBERSORTx runs.
failed_copies = [call.args for call in gsutil_calls if call.wait() != 0]
if failed_copies:
    raise RuntimeError(f"gsutil copy failed for: {failed_copies}")

2021-08-10 22:38:16,188 - __main__ - DEBUG - exluding Malignant
2021-08-10 22:38:16,189 - __main__ - DEBUG - limiting to 495 single cells
2021-08-10 22:38:16,189 - __main__ - DEBUG - saving to gs://liulab/csx_experiments_excluding_cell_types/without_Malignant/sc_refsample_modified.txt
2021-08-10 22:38:23,195 - __main__ - DEBUG - copying mixture file to gs://liulab/csx_experiments_excluding_cell_types/without_Malignant/cibersortx_sim_0.5sd.txt
2021-08-10 22:38:23,207 - __main__ - DEBUG - exluding Endothelial
2021-08-10 22:38:23,208 - __main__ - DEBUG - limiting to 708 single cells
2021-08-10 22:38:23,208 - __main__ - DEBUG - saving to gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/sc_refsample_modified.txt
2021-08-10 22:38:33,946 - __main__ - DEBUG - copying mixture file to gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/cibersortx_sim_0.5sd.txt
2021-08-10 22:38:33,958 - __main__ - DEBUG - exluding CAF
2021-08-10 22:38:33,959 - __main__ - DEBUG

In [9]:
# Inspect the subset from the final loop iteration: desired_column_positions
# still holds the positions computed for the last cell type in the loop above.
sc_refsample.iloc[:, desired_column_positions]

Unnamed: 0_level_0,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,...,T CD4,T CD4,T CD4,T CD4,T CD4,T CD4,T CD4,T CD4,T CD4,T CD4
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,135.788203,153.354101,296.923889,283.818688,313.256767,323.528192,292.029073,320.594740,148.106568,168.636169,...,303.669232,560.211204,385.868646,439.500547,341.248403,563.879761,467.790655,1045.241365,372.017628,64.132243
ELMO2,0.000000,13.149704,0.357997,5.159017,0.000000,0.927005,4.695806,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,20.034710,0.000000,0.000000,0.000000,128.490280,0.000000,0.000000
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.101035
PNMA1,1.780029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.870904,0.000000,...,0.000000,0.000000,0.000000,22.544203,0.000000,59.171149,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,37.282151,43.988988,13.487131,44.727812,2.069918,66.130664,137.198678,2.439980,18.475656
SNRPD2,56.004346,25.364048,82.975699,51.746373,171.888728,91.655197,71.564901,66.761786,82.349405,137.131639,...,79.020625,0.000000,0.000000,71.902683,43.582327,53.712554,51.269592,139.507176,27.936128,0.000000
SLC39A6,30.710748,17.722485,17.310509,6.795816,12.853705,7.719830,28.671377,2.438073,6.099906,7.051737,...,3.013052,0.000000,0.000000,31.979554,0.000000,0.000000,4.784130,0.000000,0.000000,0.000000
CTSC,15.550467,4.543127,7.130252,15.113522,5.018901,1.876072,22.271624,21.019350,6.805007,14.943539,...,0.913999,65.690067,1.826075,9.488235,0.990996,1.069956,9.913939,2.766958,10.463693,84.936553


In [10]:
# Show the experiment folders through the mounted bucket path, with
# human-readable file sizes.
!tree -h /mnt/buckets/liulab/csx_experiments_excluding_cell_types/

[01;34m/mnt/buckets/liulab/csx_experiments_excluding_cell_types/[00m
├── [   0]  [01;34mwithout_B[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [105M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_CAF[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [110M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Endothelial[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [110M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Macrophage[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [108M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Malignant[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [ 72M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_NK[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [112M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_T_CD4[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [ 84M]  sc_refsample_modified.txt
└── [   0]  [01;34mwithout_T_CD8[00m
    ├── [ 27M]  cibersortx_sim

In [11]:
# Recursive long listing of the same prefix via gsutil (sizes and timestamps).
!gsutil ls -hlR gs://liulab/csx_experiments_excluding_cell_types

gs://liulab/csx_experiments_excluding_cell_types/:

gs://liulab/csx_experiments_excluding_cell_types/without_B/:
 26.84 MiB  2021-08-10T22:39:34Z  gs://liulab/csx_experiments_excluding_cell_types/without_B/cibersortx_sim_0.5sd.txt
104.68 MiB  2021-08-10T22:39:33Z  gs://liulab/csx_experiments_excluding_cell_types/without_B/sc_refsample_modified.txt

gs://liulab/csx_experiments_excluding_cell_types/without_CAF/:
 26.84 MiB  2021-08-10T22:38:45Z  gs://liulab/csx_experiments_excluding_cell_types/without_CAF/cibersortx_sim_0.5sd.txt
109.52 MiB  2021-08-10T22:38:44Z  gs://liulab/csx_experiments_excluding_cell_types/without_CAF/sc_refsample_modified.txt

gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/:
 26.84 MiB  2021-08-10T22:38:35Z  gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/cibersortx_sim_0.5sd.txt
109.68 MiB  2021-08-10T22:38:33Z  gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial/sc_refsample_modified.txt

gs://liulab/csx

# run CIBERSORTx on these folders

In [12]:
# Launch one detached CIBERSORTx Fractions container per experiment folder.
#
# Credentials are read from the environment so they are not stored in the
# notebook. Set CIBERSORTX_USERNAME and CIBERSORTX_TOKEN before running.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# exposed and rotate it.
try:
    csx_username = os.environ["CIBERSORTX_USERNAME"]
    csx_token = os.environ["CIBERSORTX_TOKEN"]
except KeyError as missing_variable:
    raise RuntimeError(
        f"environment variable {missing_variable} must be set to run CIBERSORTx"
    ) from missing_variable

for cell_type in cell_types:
    logger.debug(f"setting up docker command for {cell_type}")
    experiment_path = f"/mnt/buckets/liulab/csx_experiments_excluding_cell_types/without_{cell_type}".replace(" ", "_")
    # The output directory is bind-mounted below, so make sure it exists first.
    os.makedirs(os.path.join(experiment_path, "out"), exist_ok=True)
    # An argument list (no shell) keeps paths and credentials from being
    # re-parsed by a shell. os.getuid()/os.getgid() replace "$(id -u):$(id -g)".
    command = [
        "docker", "run",
        "--rm",
        "-d",
        "-v", f"{experiment_path}:/src/data",
        "-v", f"{experiment_path}/out:/src/outdir",
        "--user", f"{os.getuid()}:{os.getgid()}",
        "cibersortx/fractions:latest",
        "--username", csx_username,
        "--token", csx_token,
        "--single_cell", "TRUE",
        "--refsample", "sc_refsample_modified.txt",
        "--mixture", "cibersortx_sim_0.5sd.txt",
        "--replicates", "5",
        "--sampling", "0.5",
        "--fraction", "0.75",
        "--k.max", "999",
        "--q.value", "0.01",
        "--G.min", "300",
        "--G.max", "500",
        "--filter", "FALSE",
        "--verbose", "TRUE",
        "--QN", "FALSE",
    ]
    logger.debug(f"calling docker run for {cell_type}")
    # With -d the call returns once the container has started; check=True
    # raises CalledProcessError if docker could not start it.
    subprocess.run(command, check=True)


2021-08-10 22:39:36,528 - __main__ - DEBUG - setting up docker command for Malignant
2021-08-10 22:39:36,779 - __main__ - DEBUG - calling docker run for Malignant
2021-08-10 22:39:37,879 - __main__ - DEBUG - setting up docker command for Endothelial
2021-08-10 22:39:38,117 - __main__ - DEBUG - calling docker run for Endothelial
2021-08-10 22:39:38,752 - __main__ - DEBUG - setting up docker command for CAF
2021-08-10 22:39:38,976 - __main__ - DEBUG - calling docker run for CAF
2021-08-10 22:39:39,584 - __main__ - DEBUG - setting up docker command for T CD8
2021-08-10 22:39:39,878 - __main__ - DEBUG - calling docker run for T CD8
2021-08-10 22:39:40,470 - __main__ - DEBUG - setting up docker command for NK
2021-08-10 22:39:40,658 - __main__ - DEBUG - calling docker run for NK
2021-08-10 22:39:41,236 - __main__ - DEBUG - setting up docker command for Macrophage
2021-08-10 22:39:41,464 - __main__ - DEBUG - calling docker run for Macrophage
2021-08-10 22:39:42,106 - __main__ - DEBUG - setti

In [13]:
# List running containers to confirm the detached CIBERSORTx runs have started.
!docker ps

CONTAINER ID   IMAGE                          COMMAND                  CREATED         STATUS                  PORTS     NAMES
97b415468a4b   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   1 second ago    Up Less than a second             sleepy_blackwell
36f0bea2f27b   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   2 seconds ago   Up 1 second                       silly_lichterman
d814a30ed567   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   3 seconds ago   Up 1 second                       busy_moser
7833c98be0cb   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   4 seconds ago   Up 2 seconds                      hopeful_fermat
ff1f6b3b911f   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   5 seconds ago   Up 3 seconds                      recursing_curran
a935e1aac14a   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   5 seconds ago   Up 4 seconds                      vigorous_morse
b16399c737f9   cibersortx/fractions:latest    "./CIBERS

# appendix

In [14]:
# Load the GSE115978 cell annotation CSV from the bucket and display it.
annotations_uri = "gs://liulab/ftp/GSE115978/GSE115978_cell.annotations.csv"
sc_annotations = pd.read_csv(annotations_uri)
sc_annotations

Unnamed: 0,cells,samples,cell.types,treatment.group,Cohort,no.of.genes,no.of.reads
0,cy78_CD45_neg_1_B04_S496_comb,Mel78,Mal,post.treatment,Tirosh,8258,357919
1,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,Mel79,Mal,treatment.naive,Tirosh,2047,5727
2,CY88_5_B10_S694_comb,Mel88,Mal,post.treatment,Tirosh,5375,139218
3,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,Mel79,Mal,treatment.naive,Tirosh,5648,73996
4,cy78_CD45_neg_3_H06_S762_comb,Mel78,Mal,post.treatment,Tirosh,7409,380341
...,...,...,...,...,...,...,...
7181,CY75_1_CD45_CD8_3__S168_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,3530,137245
7182,CY75_1_CD45_CD8_8__S338_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,3872,106432
7183,monika_D7_S132_comb_BCD8_3,Mel75,T.CD8,post.treatment,Tirosh,4589,908173
7184,CY75_1_CD45_CD8_8__S289_comb_BCD8,Mel75,T.CD8,post.treatment,Tirosh,4614,140903


In [15]:
# cell_types = list(sc_annotations["cell.types"].unique())
# cell_types.remove("?")
# cell_types

In [16]:
# from google.cloud import storage
# client = storage.Client()
# bucket = client.bucket("liulab")
# blob = bucket.get_blob("csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt")
# with blob.open("rt") as f:
#     original_column_names = f.readline().strip().split("\t")
# melanoma_sigmatrix_cells.columns = original_column_names[1:]  # exclude index column name