In [14]:
import os
import subprocess

import pandas as pd

In [2]:
import logging

logging.getLogger('root').addHandler(logging.NullHandler())

# configure a handler
format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_str))

# make a logger for this notebook
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
logger.addHandler(handler)

In [3]:
logger.debug("loading original column names (which have duplicates so pandas messes them up)")
from google.cloud import storage
client = storage.Client()
bucket = client.bucket("liulab")
blob = bucket.get_blob("csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt")
with blob.open("rt") as f:
    original_column_names = f.readline().strip().split("\t")

logger.debug("reading single cells used for sigmat generation")
sc_refsample = pd.read_csv(
    "gs://liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt",
    sep="\t",
    index_col=0,
#     nrows=1000  # for debugging
#     skiprows=lambda i: i % 500
)
sc_refsample.columns = original_column_names[1:]  # exclude index column name
sc_refsample

2021-08-10 00:18:33,156 - __main__ - DEBUG - loading original column names (which have duplicates so pandas messes them up)
2021-08-10 00:18:33,490 - __main__ - DEBUG - reading single cells used for sigmat generation


Unnamed: 0_level_0,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,...,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,135.788203,153.354101,296.923889,283.818688,313.256767,323.528192,292.029073,320.594740,148.106568,168.636169,...,222.851783,620.452365,308.307928,969.846904,214.314251,418.579389,263.184131,327.511485,321.778349,840.663649
ELMO2,0.000000,13.149704,0.357997,5.159017,0.000000,0.927005,4.695806,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.274004,0.000000,7.250034,9.348143,0.886994,0.000000,0.000000
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PNMA1,1.780029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.870904,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.345004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
SNRPD2,56.004346,25.364048,82.975699,51.746373,171.888728,91.655197,71.564901,66.761786,82.349405,137.131639,...,0.000000,0.000000,69.063930,0.000000,0.000000,23.430306,45.850742,65.312089,28.601533,5.083915
SLC39A6,30.710748,17.722485,17.310509,6.795816,12.853705,7.719830,28.671377,2.438073,6.099906,7.051737,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.554878,0.000000,0.000000,0.000000
CTSC,15.550467,4.543127,7.130252,15.113522,5.018901,1.876072,22.271624,21.019350,6.805007,14.943539,...,1.770987,0.859997,0.403997,7.064024,3.321104,0.477000,1.457004,6.552739,1.189043,2.990031


In [4]:
cell_types = list(set(sc_refsample.columns))
cell_types

['CAF',
 'Malignant',
 'T cells CD4',
 'NK cells',
 'Endothelial cells',
 'Macrophages',
 'B cells',
 'T cells CD8']

In [5]:
!find /mnt/buckets/liulab/derek/ -type f -name "*0.5sd*" -exec ls -l {} \;

-rw-r--r-- 1 jupyter jupyter 28141097 Jun 25 20:59 /mnt/buckets/liulab/derek/simulations/experiments/cibersortx_sim_0.5sd.txt
-rw-r--r-- 1 jupyter jupyter 14111 Jun 25 20:58 /mnt/buckets/liulab/derek/simulations/experiments/generate_cohorts_for_emma/ctp_sim_0.5sd.txt
-rw-r--r-- 1 jupyter jupyter 28125676 Jun 25 20:58 /mnt/buckets/liulab/derek/simulations/experiments/generate_cohorts_for_emma/sim_0.5sd.txt


In [6]:
!gsutil -m rm -r gs://liulab/csx_experiments_excluding_cell_types

Removing gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/cibersortx_sim_0.5sd.txt#1628553934733460...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/out/#1628553938740777...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/sc_refsample_modified.txt#1628553933659512...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_CAF/cibersortx_sim_0.5sd.txt#1628553935599777...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_CAF/sc_refsample_modified.txt#1628553934407592...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_CAF/out/#1628553939371781...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/cibersortx_sim_0.5sd.txt#1628553936208452...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/out/#1628553939990065...
Removing gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/sc_refsample_m

In [7]:
gsutil_calls = list()

for cell_type in cell_types:
    logger.debug(f"exluding {cell_type}")
    desired_column_positions = [
        i for (i, column_name) in enumerate(sc_refsample.columns)
        if cell_type not in column_name]
    logger.debug(f"limiting to {len(desired_column_positions)} single cells")
    base_path = f"gs://liulab/csx_experiments_excluding_cell_types/without_{cell_type}".replace(" ", "_")
    filepath_sc_refsample = os.path.join(base_path, "sc_refsample_modified.txt")
    logger.debug(f"saving to {filepath_sc_refsample}")
    sc_refsample.iloc[:, desired_column_positions].to_csv(filepath_sc_refsample, sep="\t")
    filepath_mixture = os.path.join(base_path, "cibersortx_sim_0.5sd.txt")
    logger.debug(f"copying mixture file to {filepath_mixture}")
    copy_mixture_command = [
        "gsutil",
        "cp",
        "gs://liulab/derek/simulations/experiments/cibersortx_sim_0.5sd.txt",
        filepath_mixture]
    gsutil_calls.append(subprocess.Popen(copy_mixture_command))

for call in gsutil_calls:
    call.wait()

2021-08-10 00:18:39,505 - __main__ - DEBUG - exluding CAF
2021-08-10 00:18:39,506 - __main__ - DEBUG - limiting to 712 single cells
2021-08-10 00:18:39,507 - __main__ - DEBUG - saving to gs://liulab/csx_experiments_excluding_cell_types/without_CAF/sc_refsample_modified.txt
2021-08-10 00:18:49,431 - __main__ - DEBUG - copying mixture file to gs://liulab/csx_experiments_excluding_cell_types/without_CAF/cibersortx_sim_0.5sd.txt
2021-08-10 00:18:49,442 - __main__ - DEBUG - exluding Malignant
2021-08-10 00:18:49,443 - __main__ - DEBUG - limiting to 495 single cells
2021-08-10 00:18:49,444 - __main__ - DEBUG - saving to gs://liulab/csx_experiments_excluding_cell_types/without_Malignant/sc_refsample_modified.txt
2021-08-10 00:18:56,128 - __main__ - DEBUG - copying mixture file to gs://liulab/csx_experiments_excluding_cell_types/without_Malignant/cibersortx_sim_0.5sd.txt
2021-08-10 00:18:56,139 - __main__ - DEBUG - exluding T cells CD4
2021-08-10 00:18:56,140 - __main__ - DEBUG - limiting to 5

In [8]:
sc_refsample.iloc[:, desired_column_positions]

Unnamed: 0_level_0,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,Malignant,...,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells,B cells
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,135.788203,153.354101,296.923889,283.818688,313.256767,323.528192,292.029073,320.594740,148.106568,168.636169,...,222.851783,620.452365,308.307928,969.846904,214.314251,418.579389,263.184131,327.511485,321.778349,840.663649
ELMO2,0.000000,13.149704,0.357997,5.159017,0.000000,0.927005,4.695806,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.274004,0.000000,7.250034,9.348143,0.886994,0.000000,0.000000
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PNMA1,1.780029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.870904,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.345004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
SNRPD2,56.004346,25.364048,82.975699,51.746373,171.888728,91.655197,71.564901,66.761786,82.349405,137.131639,...,0.000000,0.000000,69.063930,0.000000,0.000000,23.430306,45.850742,65.312089,28.601533,5.083915
SLC39A6,30.710748,17.722485,17.310509,6.795816,12.853705,7.719830,28.671377,2.438073,6.099906,7.051737,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.554878,0.000000,0.000000,0.000000
CTSC,15.550467,4.543127,7.130252,15.113522,5.018901,1.876072,22.271624,21.019350,6.805007,14.943539,...,1.770987,0.859997,0.403997,7.064024,3.321104,0.477000,1.457004,6.552739,1.189043,2.990031


In [9]:
!tree -h /mnt/buckets/liulab/csx_experiments_excluding_cell_types/

[01;34m/mnt/buckets/liulab/csx_experiments_excluding_cell_types/[00m
├── [   0]  [01;34mwithout_B_cells[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [105M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_CAF[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [110M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Endothelial_cells[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [110M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Macrophages[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [108M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_Malignant[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [ 72M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_NK_cells[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [112M]  sc_refsample_modified.txt
├── [   0]  [01;34mwithout_T_cells_CD4[00m
│   ├── [ 27M]  cibersortx_sim_0.5sd.txt
│   └── [ 84M]  sc_refsample_modified.txt
└── [   0]  [01;34mwithout_T_cells_CD8[00m

In [10]:
!gsutil ls -hlR gs://liulab/csx_experiments_excluding_cell_types

gs://liulab/csx_experiments_excluding_cell_types/:

gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/:
 26.84 MiB  2021-08-10T00:19:44Z  gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/cibersortx_sim_0.5sd.txt
104.69 MiB  2021-08-10T00:19:43Z  gs://liulab/csx_experiments_excluding_cell_types/without_B_cells/sc_refsample_modified.txt

gs://liulab/csx_experiments_excluding_cell_types/without_CAF/:
 26.84 MiB  2021-08-10T00:18:50Z  gs://liulab/csx_experiments_excluding_cell_types/without_CAF/cibersortx_sim_0.5sd.txt
109.52 MiB  2021-08-10T00:18:49Z  gs://liulab/csx_experiments_excluding_cell_types/without_CAF/sc_refsample_modified.txt

gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/:
 26.84 MiB  2021-08-10T00:19:25Z  gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/cibersortx_sim_0.5sd.txt
109.68 MiB  2021-08-10T00:19:24Z  gs://liulab/csx_experiments_excluding_cell_types/without_Endothelial_cells/sc_ref

# run CIBERSORTx on these folders

In [47]:
for cell_type in cell_types:
    logger.debug(f"setting up docker command for {cell_type}")
    experiment_path = f"/mnt/buckets/liulab/csx_experiments_excluding_cell_types/without_{cell_type}".replace(" ", "_")
    os.makedirs(os.path.join(experiment_path, "out"), exist_ok=True)
    command = f'''docker run \
    --rm \
    -d \
    -v {experiment_path}:/src/data \
    -v {experiment_path}/out:/src/outdir \
    --user "$(id -u):$(id -g)" \
    cibersortx/fractions:latest \
    --username lyronctk@stanford.edu \
    --token dfeba2c8b9d61daebee5fa87026b8e56 \
    --single_cell TRUE \
    --refsample sc_refsample_modified.txt \
    --mixture cibersortx_sim_0.5sd.txt \
    --replicates 5 \
    --sampling 0.5 \
    --fraction 0.75 \
    --k.max 999 \
    --q.value 0.01 \
    --G.min 300 \
    --G.max 500 \
    --filter FALSE \
    --verbose TRUE \
    --QN FALSE
    '''
    logger.debug(f"calling docker run for {cell_type}")
    subprocess.run(command, shell=True)


2021-08-10 01:00:01,434 - __main__ - DEBUG - setting up docker command for CAF
2021-08-10 01:00:02,902 - __main__ - DEBUG - setting up docker command for Malignant
2021-08-10 01:00:04,316 - __main__ - DEBUG - setting up docker command for T cells CD4
2021-08-10 01:00:05,639 - __main__ - DEBUG - setting up docker command for NK cells
2021-08-10 01:00:06,997 - __main__ - DEBUG - setting up docker command for Endothelial cells
2021-08-10 01:00:08,367 - __main__ - DEBUG - setting up docker command for Macrophages
2021-08-10 01:00:09,796 - __main__ - DEBUG - setting up docker command for B cells
2021-08-10 01:00:11,253 - __main__ - DEBUG - setting up docker command for T cells CD8


In [48]:
!docker ps

CONTAINER ID   IMAGE                          COMMAND                  CREATED          STATUS                  PORTS     NAMES
043f97ef977b   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   2 seconds ago    Up Less than a second             stupefied_jennings
94895314247f   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   3 seconds ago    Up 2 seconds                      silly_kalam
2ad5c322a888   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   5 seconds ago    Up 3 seconds                      loving_shockley
9e87fe310ea0   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   6 seconds ago    Up 5 seconds                      beautiful_tu
a95ab376d7be   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   7 seconds ago    Up 6 seconds                      crazy_hamilton
febcecfa0f14   cibersortx/fractions:latest    "./CIBERSORTxFractio…"   9 seconds ago    Up 7 seconds                      vigilant_panini
994e42937f6b   cibersortx/fractions:latest    "./

# appendix

In [None]:
sc_annotations = pd.read_csv("gs://liulab/ftp/GSE115978/GSE115978_cell.annotations.csv")
sc_annotations

In [None]:
# cell_types = list(sc_annotations["cell.types"].unique())
# cell_types.remove("?")
# cell_types

In [None]:
# from google.cloud import storage
# client = storage.Client()
# bucket = client.bucket("liulab")
# blob = bucket.get_blob("csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt")
# with blob.open("rt") as f:
#     original_column_names = f.readline().strip().split("\t")
# melanoma_sigmatrix_cells.columns = original_column_names[1:]  # exclude index column name