# tut 1

Single Cell RNA-Seq Melanoma (Fig. 3c-e) (Tutorial 1):

This example builds a signature matrix from single cell RNA sequencing data from melanoma (Tirosh et al., Science, 2016) and enumerates the proportions of the different cell types in bulk melanoma tumors reconstituted from single cell RNA-Seq data.

```
docker run \
    -v absolute/path/to/input/dir:/src/data \
    -v absolute/path/to/output/dir:/src/outdir \
    cibersortx/fractions \
    --username email_address_registered_on_CIBERSORTx_website \
    --token token_obtained_from_CIBERSORTx_website \
    --single_cell TRUE \
    --refsample scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt \
    --mixture mixture_melanoma_Tirosh_SuppFig_3b-d.txt

```

## import stuff

In [1]:
import os

In [2]:
import pandas as pd

In [3]:
import numpy as np

np.set_printoptions(precision=6, suppress=True)  # don't use scientific notation

In [4]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io

plotly.io.renderers.default = 'jupyterlab+png'  # makes plots visible on github

### find input files

In [24]:
!find /mnt/buckets/liulab/ -type f -iname '*tirosh*' -o -type f -iname '*melanoma*'

/mnt/buckets/liulab/csx_example_files/RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt
/mnt/buckets/liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/mixture_melanoma_Tirosh_SuppFig_3b-d.txt
/mnt/buckets/liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt
/mnt/buckets/liulab/csx_example_files/groundtruth_Melanoma_Tirosh_et_al_SuppFig3b-d.txt


### read data into dataframes

In [3]:
import pandas as pd

In [4]:
path = (
    "/mnt/buckets/liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/"
    "scRNA-Seq_reference_melanoma_Tirosh_SuppFig_3b-d.txt"
)

tirosh_sc = pd.read_csv(
    path,
    sep='\t',
    index_col=0
)

tirosh_sc

Unnamed: 0_level_0,Malignant,Malignant.1,Malignant.2,Malignant.3,Malignant.4,Malignant.5,Malignant.6,Malignant.7,Malignant.8,Malignant.9,...,B cells.55,B cells.56,B cells.57,B cells.58,B cells.59,B cells.60,B cells.61,B cells.62,B cells.63,B cells.64
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9orf152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RPS11,135.788203,153.354101,296.923889,283.818688,313.256767,323.528192,292.029073,320.594740,148.106568,168.636169,...,222.851783,620.452365,308.307928,969.846904,214.314251,418.579389,263.184131,327.511485,321.778349,840.663649
ELMO2,0.000000,13.149704,0.357997,5.159017,0.000000,0.927005,4.695806,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.274004,0.000000,7.250034,9.348143,0.886994,0.000000,0.000000
CREB3L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PNMA1,1.780029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.870904,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.345004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
SNRPD2,56.004346,25.364048,82.975699,51.746373,171.888728,91.655197,71.564901,66.761786,82.349405,137.131639,...,0.000000,0.000000,69.063930,0.000000,0.000000,23.430306,45.850742,65.312089,28.601533,5.083915
SLC39A6,30.710748,17.722485,17.310509,6.795816,12.853705,7.719830,28.671377,2.438073,6.099906,7.051737,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.554878,0.000000,0.000000,0.000000
CTSC,15.550467,4.543127,7.130252,15.113522,5.018901,1.876072,22.271624,21.019350,6.805007,14.943539,...,1.770987,0.859997,0.403997,7.064024,3.321104,0.477000,1.457004,6.552739,1.189043,2.990031


In [5]:
tirosh_sc.sum(axis=0).sort_values()

T cells CD8.89      99913.914496
Macrophages.23      99941.708622
T cells CD4.128     99955.137832
T cells CD8.87      99959.006249
T cells CD4.208     99969.390404
                       ...      
T cells CD8.50     100006.336377
Malignant.95       100007.098909
Malignant.151      100008.212478
Malignant.162      100008.581029
B cells.61         100008.687798
Length: 737, dtype: float64

In [6]:
path = (
    "/mnt/buckets/liulab/csx_example_files/Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d/"
    "mixture_melanoma_Tirosh_SuppFig_3b-d.txt"
)

tirosh_mixtures = pd.read_csv(
    path,
    sep='\t',
    index_col=0
)

tirosh_mixtures

Unnamed: 0_level_0,53,58,59,60,65,67,71,72,74,75,78,79,80,81,82,84,88,89,94
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C9orf152,0.289337,9.801102,6.082935,0.000000,0.000000,28.169763,0.000000,0.000000,0.000000,0.483223,2.690996,1.978439,0.308715,0.291315,0.000000,0.358862,0.367347,0.401188,0.288157
RPS11,47840.775296,51131.456789,40491.903396,42780.017704,51511.313553,44861.720581,66723.389402,69217.689206,38093.350291,12916.004346,43495.724667,23180.732926,35476.850476,34404.065667,25109.440304,36166.236120,45072.841711,40886.412031,45779.352110
ELMO2,648.150664,466.285097,589.063687,602.165025,414.575624,336.606670,546.312859,344.113719,522.410148,1303.223963,409.339491,579.531637,561.839134,373.884862,759.849151,739.386814,661.007565,391.013130,363.486614
CREB3L1,50.630855,0.000000,0.000000,0.000000,0.000000,3.376491,0.000000,0.000000,0.558999,7.652052,0.000000,0.506990,41.938175,0.000000,6.000351,1.713570,11.081761,6.207048,22.073150
PNMA1,463.297460,419.045738,248.376826,328.087061,257.670978,189.640521,347.080395,392.563825,809.961976,757.164439,888.541620,574.500515,289.133662,220.417536,257.028939,434.138882,560.953526,369.350573,246.152971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,2713.748642,4402.832732,194.331248,3587.371996,8941.159658,3300.708543,2488.001069,11934.354369,3970.364331,3600.616577,466.602158,4418.954755,4097.465832,2436.136866,808.699677,3142.832161,2452.706611,4578.102211,9889.934054
SNRPD2,8173.991503,7516.121157,6537.172031,4487.384239,5641.659192,5059.084437,7911.192146,5566.143170,7139.645943,2751.262303,7238.229045,3566.115118,7119.707552,5741.684765,3404.033429,4484.113405,4120.083442,4513.201611,4330.713365
SLC39A6,158.322703,219.809048,625.323786,817.165852,462.766297,360.820284,1406.541577,42.370089,544.624915,419.588179,1376.010654,817.723630,717.143522,509.528484,412.829590,612.605765,841.827789,810.463987,558.184691
CTSC,4317.942793,3957.807681,725.196785,3804.604704,1381.900617,5119.496318,1204.572114,1004.547908,2402.465430,4161.031538,1394.348437,1355.233992,2774.711130,1580.878261,3874.538653,4851.670095,2728.713435,1991.983612,1818.908918


## run csx with docker

In [7]:
!./scripts/tut1-csx_fractions_melanoma.sh

setting up CIBERSORTx in directory:
/mnt/buckets/liulab/csx-runs/20210715_160345
[01;34m/mnt/buckets/liulab/csx-runs/20210715_160345[00m
├── [   0]  [01;34min[00m
│   ├── [6.0M]  mymixture.txt
│   └── [ 88M]  myrefsample.txt
└── [   0]  [01;34mout[00m

2 directories, 2 files
>Running CIBERSORTxFractions...
>[Options] username: lyronctk@stanford.edu
>[Options] token: dfeba2c8b9d61daebee5fa87026b8e56
>[Options] single_cell: TRUE
>[Options] refsample: myrefsample.txt
>[Options] mixture: mymixture.txt
>[Options] rmbatchSmode: TRUE
>[Options] verbose: TRUE
>Making reference sample file.
>Making phenotype class file.
>single_cell is set to TRUE, so quantile normalization is set to FALSE, and the default parameters for building the signature matrix have been set to the following values:
	- G.min <- 300
	- G.max <- 500
	- q.value <- 0.01
>Pure samples file: /src/outdir//CIBERSORTx_myrefsample_inferred_refsample.txt
>Phenotype classes file: /src/outdir//CIBERSORTx_myrefsample_inferred_phe

### check csx results

In [8]:
csx_run_dir = "/mnt/buckets/liulab/csx-runs/20210715_160345"

In [11]:
path = os.path.join(csx_run_dir, "out", "CIBERSORTx_sigmatrix_Adjusted.txt")
print(path)

learned_sigmatrix = pd.read_csv(
    path,
    sep='\t',
    index_col=0
)

/mnt/buckets/liulab/csx-runs/20210715_160345/out/CIBERSORTx_sigmatrix_Adjusted.txt


In [12]:
learned_sigmatrix.sample(10)

Unnamed: 0_level_0,Malignant,Endothelial cells,CAF,T cells CD8,NK cells,Macrophages,T cells CD4,B cells
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RASSF9,0.0,45.506067,1.888994,0.0,8.979182,1.694934,0.0,0.0
CD200R1,0.832775,40.191135,0.0,134.52258,0.0,0.0,4.626407,6.665824
MFN1,25.902962,24.5542,0.0,26.85817,0.0,0.388622,7.42369,42.770807
CFL2,26.38059,37.527303,278.956131,0.0,34.190293,6.526353,0.573245,6.976067
ZDHHC21,0.0,3.138619,0.0,0.696663,1.271348,6.902824,6.830183,22.744323
SERPINE2,2690.120294,0.0,536.728948,0.0,0.0,0.0,0.0,0.0
ST6GALNAC3,37.829718,70.935883,0.0,19.11002,0.0,0.0,3.714777,0.0
RSBN1,1.38891,0.0,9.515906,40.323353,0.0,6.28275,25.397633,10.529982
ARMC7,4.412891,0.0,31.886964,31.314065,34.672157,1.914907,11.132449,1.992761
PGM5,0.0,54.145076,0.0,0.0,0.30922,0.0,0.0,0.0


In [13]:
path = os.path.join(csx_run_dir, "in", "mymixture.txt")
print(path)

mixtures = pd.read_csv(
    path,
    sep='\t',
    index_col=0
)

/mnt/buckets/liulab/csx-runs/20210715_160345/in/mymixture.txt


In [14]:
mixtures

Unnamed: 0_level_0,53,58,59,60,65,67,71,72,74,75,78,79,80,81,82,84,88,89,94
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C9orf152,0.289337,9.801102,6.082935,0.000000,0.000000,28.169763,0.000000,0.000000,0.000000,0.483223,2.690996,1.978439,0.308715,0.291315,0.000000,0.358862,0.367347,0.401188,0.288157
RPS11,47840.775296,51131.456789,40491.903396,42780.017704,51511.313553,44861.720581,66723.389402,69217.689206,38093.350291,12916.004346,43495.724667,23180.732926,35476.850476,34404.065667,25109.440304,36166.236120,45072.841711,40886.412031,45779.352110
ELMO2,648.150664,466.285097,589.063687,602.165025,414.575624,336.606670,546.312859,344.113719,522.410148,1303.223963,409.339491,579.531637,561.839134,373.884862,759.849151,739.386814,661.007565,391.013130,363.486614
CREB3L1,50.630855,0.000000,0.000000,0.000000,0.000000,3.376491,0.000000,0.000000,0.558999,7.652052,0.000000,0.506990,41.938175,0.000000,6.000351,1.713570,11.081761,6.207048,22.073150
PNMA1,463.297460,419.045738,248.376826,328.087061,257.670978,189.640521,347.080395,392.563825,809.961976,757.164439,888.541620,574.500515,289.133662,220.417536,257.028939,434.138882,560.953526,369.350573,246.152971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,2713.748642,4402.832732,194.331248,3587.371996,8941.159658,3300.708543,2488.001069,11934.354369,3970.364331,3600.616577,466.602158,4418.954755,4097.465832,2436.136866,808.699677,3142.832161,2452.706611,4578.102211,9889.934054
SNRPD2,8173.991503,7516.121157,6537.172031,4487.384239,5641.659192,5059.084437,7911.192146,5566.143170,7139.645943,2751.262303,7238.229045,3566.115118,7119.707552,5741.684765,3404.033429,4484.113405,4120.083442,4513.201611,4330.713365
SLC39A6,158.322703,219.809048,625.323786,817.165852,462.766297,360.820284,1406.541577,42.370089,544.624915,419.588179,1376.010654,817.723630,717.143522,509.528484,412.829590,612.605765,841.827789,810.463987,558.184691
CTSC,4317.942793,3957.807681,725.196785,3804.604704,1381.900617,5119.496318,1204.572114,1004.547908,2402.465430,4161.031538,1394.348437,1355.233992,2774.711130,1580.878261,3874.538653,4851.670095,2728.713435,1991.983612,1818.908918


In [15]:
pd.merge(learned_sigmatrix, mixtures['53'], left_index=True, right_index=True)

Unnamed: 0_level_0,Malignant,Endothelial cells,CAF,T cells CD8,NK cells,Macrophages,T cells CD4,B cells,53
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A2M,177.148396,116.535731,597.837370,0.000000,134.321693,811.058033,0.000000,0.000000,2509.057478
A4GALT,0.000000,49.132546,14.326610,0.000000,5.426864,0.000000,0.000000,0.912482,97.725277
AARSD1,56.985390,53.597910,10.911375,0.000000,0.000000,33.308533,33.107883,21.192432,238.314902
ABCA1,7.425631,67.044475,19.820048,0.000000,0.000000,113.857187,0.195908,8.242122,33.985269
ABCB5,133.577349,0.000000,0.000000,2.977293,34.036859,2.726093,0.000000,0.000000,82.310487
...,...,...,...,...,...,...,...,...,...
ZRSR2,0.519609,0.000000,21.196001,34.894634,0.000000,0.000000,34.167554,13.071341,401.338682
ZSCAN12,1.293517,23.937032,0.000000,31.913763,0.085353,0.000000,7.642947,21.654681,76.147909
ZSCAN16,0.000000,17.775482,0.000000,15.599247,97.035793,0.000000,43.473574,29.006506,97.981741
ZWINT,20.752687,0.000000,27.830771,54.575860,44.165835,11.479189,7.066642,45.198430,368.148809


# attempt inferring fractions myself with sigmatrix, mixture

In [18]:
import numpy as np

from sklearn.svm import NuSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [19]:
np.set_printoptions(precision=6, suppress=True)

In [20]:
_combined_data = pd.merge(learned_sigmatrix, mixtures['53'], left_index=True, right_index=True)
y = _combined_data.values[:, -1]
X = _combined_data.values[:, :-1]
y.shape, X.shape

((3082,), (3082, 8))

In [21]:
regr = make_pipeline(StandardScaler(), NuSVR(nu=0.5, kernel='linear'))
regr.fit(X, y)
_ = regr.named_steps['nusvr'].coef_
_ / np.sum(_)

array([[0.028042, 0.062896, 0.095515, 0.207445, 0.195229, 0.127134,
        0.238468, 0.045271]])

# check fractions inferred by csx

In [32]:
path = os.path.join(csx_run_dir, "out", "CIBERSORTx_Adjusted.txt")
print(path)

inferred_fractions = pd.read_csv(
    path,
    sep='\t',
    index_col=0
)

inferred_fractions

/mnt/buckets/liulab/csx-runs/20210715_160345/out/CIBERSORTx_Adjusted.txt


Unnamed: 0_level_0,Malignant,Endothelial cells,CAF,T cells CD8,NK cells,Macrophages,T cells CD4,B cells,P-value,Correlation,RMSE
Mixture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
53,0.046912,0.048921,0.056016,0.28846,0.046088,0.107681,0.404889,0.001032,9999.0,0.871528,0.649739
58,0.000719,0.000682,0.000201,0.469386,0.010466,0.045708,0.412112,0.060726,9999.0,0.96467,0.531574
59,0.655298,0.009818,0.301073,0.0,0.000472,0.026106,0.007233,0.0,9999.0,0.652095,0.780055
60,0.034117,0.004892,0.0,0.178893,0.060326,0.081418,0.250973,0.389381,9999.0,0.946328,0.455591
65,0.06895,0.002214,0.000675,0.199426,0.02468,0.055358,0.556732,0.091964,9999.0,0.901685,0.646823
67,0.000816,0.0,0.0,0.137107,0.031391,0.014096,0.582954,0.233635,9999.0,0.901385,0.624561
71,0.402557,0.009144,0.002694,0.229847,0.032193,0.062506,0.258383,0.002676,9999.0,0.731109,0.767737
72,0.001155,0.001563,0.000994,0.0,0.009979,0.032146,0.708719,0.245443,9999.0,0.88088,0.63813
74,0.003817,0.001437,0.001734,0.454665,0.012862,0.090057,0.34899,0.086437,9999.0,0.942353,0.536504
75,0.024983,0.002253,0.0,0.499056,0.016627,0.044156,0.395921,0.017004,9999.0,0.811784,0.65926


# check groundtruth fractions

did they have data for this?