# CIBERSORTx tutorial 1

NSCLC PBMCs Single Cell RNA-Seq (Fig. 2a,b):
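* This example builds a signature matrix from single-cell RNA sequencing data from NSCLC PBMCs and enumerates the proportions of the different cell types in an RNA-seq dataset profiled from whole blood, using S-mode batch correction.
* Note: the docker run in this notebook uses the Tirosh melanoma RNA-seq mixture (`RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt`) as the mixture file rather than the whole-blood dataset.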


## Setup

In [42]:
import logging

In [49]:
# Attach a default stream handler to the root logger so log records emitted by
# later cells are actually displayed.
logging.basicConfig()

## download data

In [20]:
%%bash
# Download the CIBERSORTx example mixture into the shared example-file directory
# (skipped when the file is already there, so the cell can be re-run) and list
# the directory contents.
set -euo pipefail

BASE_URL="https://cibersortx.stanford.edu/inc/inc.download.page.handler.php"
DEST_DIR="/mnt/liulab/csx_example_files"
MIXTURE_FILE="RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt"

# With `set -e`, a failed pushd aborts the cell instead of downloading into
# whatever directory the notebook happens to be running from.
pushd "$DEST_DIR"

# One-off steps, kept for reference:
# curl --fail -O -J -L "${BASE_URL}?file=NSCLC_PBMCs_Single_Cell_RNA-Seq_Fig2ab.zip"
# unzip NSCLC_PBMCs_Single_Cell_RNA-Seq_Fig2ab.zip

# The URL is quoted and uses ${VAR} expansion. The previous unquoted
# `{$BASE_URL}?file=...` form left literal braces and a shell glob character (`?`)
# in the argument and relied on curl's URL globbing to strip the braces.
if [[ ! -f "$MIXTURE_FILE" ]]; then
    curl --fail -L -o "$MIXTURE_FILE" "${BASE_URL}?file=${MIXTURE_FILE}"
fi

tree -h
popd

/mnt/liulab/csx_example_files ~/deconv-data-exploration
curl: Saved to filename 'RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt'
.
├── [   0]  Expression_datasets
│   ├── [ 52M]  Fig2a-NSCLC_PBMCs_scRNAseq_matrix.txt
│   ├── [4.1M]  Fig2b-WholeBlood_RNAseq.txt
│   ├── [ 835]  Fig2b_ground_truth_whole_blood.txt
│   ├── [1.0M]  Fig3b-f-FL-arrays-groundtruth.RMA.txt
│   ├── [ 67M]  Fig3b-f-FL-arrays-mixture.txt
│   ├── [ 36M]  Fig3g_NSCLC_RNASeq_bulksortedpopulation.txt
│   ├── [1.8M]  Fig3g_groundtruth_NSCLCsubsets_Fig3g.txt
│   ├── [8.2M]  Fig3g_mixture_NSCLCbulk.txt
│   └── [2.1K]  README.txt
├── [   0]  Fig2ab-NSCLC_PBMCs
│   ├── [ 52M]  Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt
│   ├── [186K]  Fig2ab-NSCLC_PBMCs_scRNAseq_sigmatrix.txt
│   └── [4.1M]  Fig2b-WholeBlood_RNAseq.txt
├── [ 835]  Fig2b_ground_truth_whole_blood.txt
├── [143K]  LM22.txt
├── [ 12M]  NSCLC_PBMCs_Single_Cell_RNA-Seq_Fig2ab.zip
├── [6.0M]  RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt
├── [   0]  Single_Cell_RNA-Seq_M

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6117k    0 6117k    0     0  1324k      0 --:--:--  0:00:04 --:--:-- 1324k


### read data into dataframes

In [2]:
import pandas as pd

# Set the threshold of the logger named "pandas" to DEBUG.
logging.getLogger("pandas").setLevel(logging.DEBUG)

In [63]:
# Load the NSCLC PBMC single-cell reference sample file (tab-separated, first
# column used as the row index). Column labels that repeat in the header come
# back with pandas' ".N" de-duplication suffixes (e.g. "T cells CD8.1").
path = "/mnt/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs/Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt"
nsclc_pbmc_sc = pd.read_csv(path, sep="\t", index_col=0)

In [35]:
# Total of each column of the single-cell reference table (summing down the rows).
nsclc_pbmc_sc.sum(axis="index")

T cells CD8        1000000.0
T cells CD8.1      1000000.0
T cells CD8.2      1000000.0
Monocytes          1000000.0
Monocytes.1        1000000.0
                     ...    
Monocytes.456      1000000.0
Monocytes.457      1000000.0
NKT cells.81       1000000.0
T cells CD8.235    1000000.0
Monocytes.458      1000000.0
Length: 1054, dtype: float64

In [36]:
# Confirm the downloaded mixture file is present and check its size.
!ls -l /mnt/liulab/csx_example_files/RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt

-rw-r--r-- 1 jupyter jupyter 6264562 Jul 13 08:39 /mnt/liulab/csx_example_files/RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt


In [37]:
# Load the Tirosh melanoma RNA-seq mixture table (tab-separated, first column
# used as the row index) and display it.
path = "/mnt/liulab/csx_example_files/RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt"
tirosh_tumor_mixtures = pd.read_csv(path, sep="\t", index_col=0)

tirosh_tumor_mixtures

Unnamed: 0_level_0,53,58,59,60,65,67,71,72,74,75,78,79,80,81,82,84,88,89,94
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C9orf152,0.289337,9.801102,6.082935,0.000000,0.000000,28.169763,0.000000,0.000000,0.000000,0.483223,2.690996,1.978439,0.308715,0.291315,0.000000,0.358862,0.367347,0.401188,0.288157
RPS11,47840.775296,51131.456789,40491.903396,42780.017704,51511.313553,44861.720581,66723.389402,69217.689206,38093.350291,12916.004346,43495.724667,23180.732926,35476.850476,34404.065667,25109.440304,36166.236120,45072.841711,40886.412031,45779.352110
ELMO2,648.150664,466.285097,589.063687,602.165025,414.575624,336.606670,546.312859,344.113719,522.410148,1303.223963,409.339491,579.531637,561.839134,373.884862,759.849151,739.386814,661.007565,391.013130,363.486614
CREB3L1,50.630855,0.000000,0.000000,0.000000,0.000000,3.376491,0.000000,0.000000,0.558999,7.652052,0.000000,0.506990,41.938175,0.000000,6.000351,1.713570,11.081761,6.207048,22.073150
PNMA1,463.297460,419.045738,248.376826,328.087061,257.670978,189.640521,347.080395,392.563825,809.961976,757.164439,888.541620,574.500515,289.133662,220.417536,257.028939,434.138882,560.953526,369.350573,246.152971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PIK3IP1,2713.748642,4402.832732,194.331248,3587.371996,8941.159658,3300.708543,2488.001069,11934.354369,3970.364331,3600.616577,466.602158,4418.954755,4097.465832,2436.136866,808.699677,3142.832161,2452.706611,4578.102211,9889.934054
SNRPD2,8173.991503,7516.121157,6537.172031,4487.384239,5641.659192,5059.084437,7911.192146,5566.143170,7139.645943,2751.262303,7238.229045,3566.115118,7119.707552,5741.684765,3404.033429,4484.113405,4120.083442,4513.201611,4330.713365
SLC39A6,158.322703,219.809048,625.323786,817.165852,462.766297,360.820284,1406.541577,42.370089,544.624915,419.588179,1376.010654,817.723630,717.143522,509.528484,412.829590,612.605765,841.827789,810.463987,558.184691
CTSC,4317.942793,3957.807681,725.196785,3804.604704,1381.900617,5119.496318,1204.572114,1004.547908,2402.465430,4161.031538,1394.348437,1355.233992,2774.711130,1580.878261,3874.538653,4851.670095,2728.713435,1991.983612,1818.908918


## run csx with docker

```
docker run \
    -v absolute/path/to/input/dir:/src/data \
    -v absolute/path/to/output/dir:/src/outdir \
    cibersortx/fractions \
    --username email_address_registered_on_CIBERSORTx_website \
    --token token_obtained_from_CIBERSORTx_website \
    --single_cell TRUE \
    --refsample Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt \
    --mixture Fig2b-WholeBlood_RNAseq.txt \
    --fraction 0 \
    --rmbatchSmode TRUE 

```

In [64]:
# List the example files available as inputs for the docker run.
!ls -l /mnt/liulab/csx_example_files

total 18239
drwxr-xr-x 1 jupyter jupyter        0 Jul 13 08:07 Expression_datasets
drwxr-xr-x 1 jupyter jupyter        0 Jul 13 08:07 Fig2ab-NSCLC_PBMCs
-rw-r--r-- 1 jupyter jupyter      835 Jul  2 21:48 Fig2b_ground_truth_whole_blood.txt
-rw-r--r-- 1 jupyter jupyter   146759 Jul  3 04:39 LM22.txt
-rw-r--r-- 1 jupyter jupyter 12259563 Jul 13 08:06 NSCLC_PBMCs_Single_Cell_RNA-Seq_Fig2ab.zip
-rw-r--r-- 1 jupyter jupyter  6264562 Jul 13 08:39 RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt
drwxr-xr-x 1 jupyter jupyter        0 Jul 13 08:07 Single_Cell_RNA-Seq_Melanoma_SuppFig_3b-d
-rw-r--r-- 1 jupyter jupyter     1974 Jul  2 21:48 groundtruth_HNSCC_Puram_et_al_Fig2cd.txt
-rw-r--r-- 1 jupyter jupyter     1216 Jul  2 21:48 groundtruth_Melanoma_Tirosh_et_al_SuppFig3b-d.txt


In [150]:
%%bash
# Stage the mixture and single-cell reference files, then run the CIBERSORTx
# "fractions" container with S-mode batch correction.
#
# Credentials are read from the environment (CSX_USERNAME / CSX_TOKEN, inherited
# from the kernel's environment) instead of being written into the notebook.
# The token that used to be hard-coded in this cell has been exposed and should
# be revoked and reissued.
# NOTE: the container echoes its options, including the token, so clear this
# cell's output before sharing the notebook.
set -euo pipefail

: "${CSX_USERNAME:?set CSX_USERNAME to the e-mail registered on the CIBERSORTx website}"
: "${CSX_TOKEN:?set CSX_TOKEN to the token obtained from the CIBERSORTx website}"

CSX_INPUT_DIR="/home/jupyter/csx/input"
CSX_OUTPUT_DIR="/home/jupyter/csx/output"

MIXTURE_FILE="RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt"
REFSAMPLE_FILE="Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt"

# The container runs as the current user (--user below), so the bind-mounted
# directories must already exist and be writable by that user.
mkdir -p "$CSX_INPUT_DIR" "$CSX_OUTPUT_DIR"

# Copy the first file named $1 found under /mnt/liulab/ to $CSX_INPUT_DIR/$2.
# Taking only the first match keeps rsync from receiving several sources for a
# single destination file; a missing input is reported instead of silently
# running rsync with no source.
stage_input() {
    local src
    src="$(find /mnt/liulab/ -name "$1" -print -quit || true)"
    if [[ -z "$src" ]]; then
        echo "input file not found under /mnt/liulab/: $1" >&2
        return 1
    fi
    rsync -v "$src" "$CSX_INPUT_DIR/$2"
}

stage_input "$MIXTURE_FILE" mixture.txt
stage_input "$REFSAMPLE_FILE" refsample.txt

ls -hl "$CSX_INPUT_DIR"

docker run \
    --rm \
    -v "$CSX_INPUT_DIR":/src/data \
    -v "$CSX_OUTPUT_DIR":/src/outdir \
    --user "$(id -u):$(id -g)" \
    cibersortx/fractions:latest \
    --username "$CSX_USERNAME" \
    --token "$CSX_TOKEN" \
    --single_cell TRUE \
    --refsample refsample.txt \
    --mixture mixture.txt \
    --rmbatchSmode TRUE \
    --verbose TRUE

# Optional flags not used in this run:
#     --perm 10 \
#     --fraction 0 \
#     --sourceGEPs signature_matrix.txt

RNA-Seq_mixture_melanoma_Tirosh_Fig2b-d.txt

sent 6,266,206 bytes  received 35 bytes  12,532,482.00 bytes/sec
total size is 6,264,562  speedup is 1.00
Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt

sent 54,724,713 bytes  received 35 bytes  36,483,165.33 bytes/sec
total size is 54,711,251  speedup is 1.00
total 59M
-rw-r--r-- 1 jupyter jupyter 6.0M Jul 13 11:54 mixture.txt
-rw-r--r-- 1 jupyter jupyter  53M Jul 13 11:54 refsample.txt
>Running CIBERSORTxFractions...
>[Options] username: lyronctk@stanford.edu
>[Options] token: dfeba2c8b9d61daebee5fa87026b8e56
>[Options] single_cell: TRUE
>[Options] refsample: refsample.txt
>[Options] mixture: mixture.txt
>[Options] rmbatchSmode: TRUE
>[Options] verbose: TRUE
>Mixture file: /src/data/mixture.txt
>Signature matrix file: /src/outdir//CIBERSORTx_refsample_inferred_phenoclasses.CIBERSORTx_refsample_inferred_refsample.bm.K999.txt
>Enable verbose output
>Do S-mode batch correction
>Adjusting mixtures with S-mode batch correction.
>Running CIBERSORTx 

In [153]:
# Show the staged inputs and the files the container wrote to the output directory.
!tree -h /home/jupyter/csx/

[01;34m/home/jupyter/csx/[00m
├── [4.0K]  [01;34minput[00m
│   ├── [6.0M]  mixture.txt
│   └── [ 52M]  refsample.txt
└── [4.0K]  [01;34moutput[00m
    ├── [3.5K]  CIBERSORTx_Adjusted.txt
    ├── [4.2M]  CIBERSORTx_Mixtures_Adjusted.txt
    ├── [2.0M]  CIBERSORTx_cell_type_sourceGEP.txt
    ├── [116K]  CIBERSORTx_refsample_inferred_phenoclasses.CIBERSORTx_refsample_inferred_refsample.bm.K999.pdf
    ├── [302K]  CIBERSORTx_refsample_inferred_phenoclasses.CIBERSORTx_refsample_inferred_refsample.bm.K999.txt
    ├── [ 421]  CIBERSORTx_refsample_inferred_phenoclasses.txt
    ├── [9.8M]  CIBERSORTx_refsample_inferred_refsample.txt
    └── [302K]  CIBERSORTx_sigmatrix_Adjusted.txt

2 directories, 10 files


In [102]:
# Load the adjusted signature matrix written by the CIBERSORTx run
# (tab-separated, first column used as the row index).
path = "/home/jupyter/csx/output/CIBERSORTx_sigmatrix_Adjusted.txt"
learned_sigmatrix = pd.read_csv(path, sep="\t", index_col=0)

In [103]:
# Display the signature matrix read from CIBERSORTx_sigmatrix_Adjusted.txt.
learned_sigmatrix

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2M.AS1,52.724120,0.000000,1.995144,25.644416,11.269950,5.772054
AAK1,64.724348,14.473957,108.544393,88.335511,0.000000,61.448091
ABCA1,1.768895,19.596316,1.426439,0.000000,0.000000,0.000000
ABCB1,14.914946,0.000000,0.000000,9.038372,0.000000,63.962007
ABCB4,2.823775,0.323269,0.000000,0.000000,26.493081,3.787072
...,...,...,...,...,...,...
ZSCAN18,9.753607,0.000000,10.196358,0.000000,39.915180,0.000000
ZSWIM1,1.976944,9.358320,22.511495,0.000000,0.000000,6.381458
ZSWIM6,5.203981,18.791859,0.000000,1.411373,4.156974,3.525318
ZWILCH,11.661740,1.457614,8.390918,33.626264,1.188049,5.413562


In [106]:
# Column '53' of the mixture table (the column label is the string '53').
tirosh_tumor_mixtures['53']

GeneSymbol
C9orf152        0.289337
RPS11       47840.775296
ELMO2         648.150664
CREB3L1        50.630855
PNMA1         463.297460
                ...     
PIK3IP1      2713.748642
SNRPD2       8173.991503
SLC39A6       158.322703
CTSC         4317.942793
AQP7            8.216634
Name: 53, Length: 23684, dtype: float64

In [108]:
# Inner join on the row index: keep only rows whose labels appear in both the
# signature matrix and mixture column '53', and place them side by side.
learned_sigmatrix.merge(
    tirosh_tumor_mixtures["53"],
    how="inner",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells,53
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A2M.AS1,52.724120,0.000000,1.995144,25.644416,11.269950,5.772054,165.340434
AAK1,64.724348,14.473957,108.544393,88.335511,0.000000,61.448091,229.191239
ABCA1,1.768895,19.596316,1.426439,0.000000,0.000000,0.000000,33.985269
ABCB1,14.914946,0.000000,0.000000,9.038372,0.000000,63.962007,159.774577
ABCB4,2.823775,0.323269,0.000000,0.000000,26.493081,3.787072,15.782543
...,...,...,...,...,...,...,...
ZSCAN18,9.753607,0.000000,10.196358,0.000000,39.915180,0.000000,73.089754
ZSWIM1,1.976944,9.358320,22.511495,0.000000,0.000000,6.381458,332.613603
ZSWIM6,5.203981,18.791859,0.000000,1.411373,4.156974,3.525318,87.861743
ZWILCH,11.661740,1.457614,8.390918,33.626264,1.188049,5.413562,101.294953


In [112]:
# NOTE(review): this cell is identical to the previous merge cell and produces
# the same table; it is a candidate for removal.
pd.merge(learned_sigmatrix, tirosh_tumor_mixtures['53'], left_index=True, right_index=True)

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells,53
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A2M.AS1,52.724120,0.000000,1.995144,25.644416,11.269950,5.772054,165.340434
AAK1,64.724348,14.473957,108.544393,88.335511,0.000000,61.448091,229.191239
ABCA1,1.768895,19.596316,1.426439,0.000000,0.000000,0.000000,33.985269
ABCB1,14.914946,0.000000,0.000000,9.038372,0.000000,63.962007,159.774577
ABCB4,2.823775,0.323269,0.000000,0.000000,26.493081,3.787072,15.782543
...,...,...,...,...,...,...,...
ZSCAN18,9.753607,0.000000,10.196358,0.000000,39.915180,0.000000,73.089754
ZSWIM1,1.976944,9.358320,22.511495,0.000000,0.000000,6.381458,332.613603
ZSWIM6,5.203981,18.791859,0.000000,1.411373,4.156974,3.525318,87.861743
ZWILCH,11.661740,1.457614,8.390918,33.626264,1.188049,5.413562,101.294953


# Attempt to infer the fractions myself from the signature matrix and a mixture

In [114]:
from sklearn.svm import NuSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [124]:
# Align the signature matrix with mixture column '53' on their shared row labels.
# The mixture column is appended last, so it becomes the regression target y and
# the remaining (signature) columns become the design matrix X.
_combined_data = learned_sigmatrix.merge(
    tirosh_tumor_mixtures["53"],
    left_index=True,
    right_index=True,
)
X, y = _combined_data.values[:, :-1], _combined_data.values[:, -1]
(y.shape, X.shape)

((2118,), (2118, 6))

In [130]:
# Linear-kernel nu-SVR applied to standardized signature columns. fit() returns
# the pipeline itself, so the fitted estimator is bound to `regr` and then shown.
regr = make_pipeline(StandardScaler(), NuSVR(kernel="linear")).fit(X, y)
regr

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('nusvr', NuSVR(kernel='linear'))])

In [143]:
import numpy as np

# Turn the fitted linear-SVR weights into fractions that sum to 1.
# A descriptive name is used instead of `_`, which IPython reserves for the
# previous cell's output; assigning to it shadows that feature.
svr_weights = regr.named_steps['nusvr'].coef_

# Negative weights cannot be cell fractions; the CIBERSORT procedure sets them
# to zero before normalizing.
nonneg_weights = np.clip(svr_weights, 0, None)

# NOTE(review): these weights are relative to the StandardScaler-standardized
# signature columns, and y is not standardized — presumably part of why the
# result differs from the CIBERSORTx fractions; confirm before relying on it.
nonneg_weights / nonneg_weights.sum()

array([[0.20950845, 0.20899131, 0.12255509, 0.20169151, 0.06718878,
        0.19006486]])

# check fractions inferred by csx

In [154]:
# List the text files the CIBERSORTx run wrote to the output directory.
!find /home/jupyter/csx/output -name '*txt'

/home/jupyter/csx/output/CIBERSORTx_refsample_inferred_phenoclasses.txt
/home/jupyter/csx/output/CIBERSORTx_sigmatrix_Adjusted.txt
/home/jupyter/csx/output/CIBERSORTx_cell_type_sourceGEP.txt
/home/jupyter/csx/output/CIBERSORTx_refsample_inferred_phenoclasses.CIBERSORTx_refsample_inferred_refsample.bm.K999.txt
/home/jupyter/csx/output/CIBERSORTx_Mixtures_Adjusted.txt
/home/jupyter/csx/output/CIBERSORTx_refsample_inferred_refsample.txt
/home/jupyter/csx/output/CIBERSORTx_Adjusted.txt


In [157]:
# Read the CIBERSORTx result table and show the row labelled 53
# (the row label is the integer 53 here, not the string '53').
path = "/home/jupyter/csx/output/CIBERSORTx_Adjusted.txt"
pd.read_csv(path, sep="\t", index_col=0).loc[53]

T cells CD8       0.119403
Monocytes         0.340806
T cells CD4       0.182638
NKT cells         0.168883
B cells           0.070724
NK cells          0.117547
P-value        9999.000000
Correlation       0.411767
RMSE              0.911568
Name: 53, dtype: float64

# Extra: signature matrix and whole-blood mixture shipped with the example files

In [10]:
# Display the signature matrix file shipped in the Fig2ab example folder
# (tab-separated, first column used as the row index).
pd.read_csv(
    "/mnt/liulab/csx_example_files/"
    "Fig2ab-NSCLC_PBMCs/Fig2ab-NSCLC_PBMCs_scRNAseq_sigmatrix.txt",
    index_col=0,
    sep="\t",
)

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2M.AS1,42.389248,3.034278,8.406227,47.115313,1.0,1.000000
AAK1,171.246018,53.947719,245.690249,189.969058,1.0,176.336120
ABCA1,3.097667,14.269549,1.000000,1.000000,1.0,1.000000
ABCA2,22.847189,2.331345,3.134736,21.462190,1.0,34.812778
ABCA3,1.000000,1.653714,1.000000,8.154662,1.0,11.566915
...,...,...,...,...,...,...
ZSWIM1,1.000000,1.461999,6.619884,1.000000,1.0,1.000000
ZSWIM6,12.516047,33.538731,2.002665,6.003127,1.0,1.000000
ZXDA,4.640282,1.000000,1.000000,1.000000,1.0,1.000000
ZXDB,6.661040,1.751215,8.746464,1.000000,1.0,1.000000


In [9]:
# Display the whole-blood RNA-seq mixture file shipped in the Fig2ab example
# folder (tab-separated, first column used as the row index).
pd.read_csv(
    "/mnt/liulab/csx_example_files/"
    "Fig2ab-NSCLC_PBMCs/Fig2b-WholeBlood_RNAseq.txt",
    index_col=0,
    sep="\t",
)

Unnamed: 0_level_0,W070517001156,W070517001157,W070517001159,W070517001160,W070517001161,W070517001162,W070517102034,W070517102035,W070517102036,W070517102037,W070517102038,W070517102051
GeneSym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.304710,0.000000,0.000000,0.752697,0.000000
7SK,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.181943,0.000000,0.000000
A1BG,1.524589,1.198209,2.281101,2.510963,1.752686,3.467098,2.523853,1.634724,2.687471,3.385051,2.195180,1.912779
A1BG-AS1,0.210020,0.263073,0.410865,0.571484,0.139725,0.142219,0.348219,0.294046,0.732450,0.595088,0.424970,0.272239
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,18.753000,10.084024,8.159590,12.489620,5.222887,6.192270,7.825120,12.366960,7.205970,7.896432,9.496550,8.637130
ZYX,200.613353,140.107566,144.816461,134.412477,81.341464,107.785758,62.656594,265.309460,88.768774,94.147450,194.531694,127.203111
ZYXP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,76.731847,61.515861,51.289958,80.723373,48.363048,44.139845,51.296040,63.324805,40.328860,49.242034,73.869547,62.659012
