# Tutorial 1: CIBERSORTx fractions from a single-cell signature matrix

NSCLC PBMCs Single Cell RNA-Seq (Fig. 2a,b):
* This example builds a signature matrix from single-cell RNA sequencing data from NSCLC PBMCs and enumerates the proportions of the different cell types in an RNA-seq dataset profiled from whole blood, using S-mode batch correction.


# example 1: generate signature matrix

### NSCLC PBMCs Single Cell RNA-Seq (Fig. 2a,b):

This example builds a signature matrix from single-cell RNA sequencing data from NSCLC PBMCs and enumerates the proportions of the different cell types in an RNA-seq dataset profiled from whole blood, using S-mode batch correction.

```
docker run \
    -v absolute/path/to/input/dir:/src/data \
    -v absolute/path/to/output/dir:/src/outdir \
    cibersortx/fractions \
    --username email_address_registered_on_CIBERSORTx_website \
    --token token_obtained_from_CIBERSORTx_website \
    --single_cell TRUE \
    --refsample Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt \
    --mixture Fig2b-WholeBlood_RNAseq.txt \
    --fraction 0 \
    --rmbatchSmode TRUE 
```

## Setup: logging and input data

In [2]:
import logging

In [3]:
# Configure the root logger with the standard-library defaults (a stream handler
# with the default format), so log records emitted later are actually shown.
logging.basicConfig()

### download data

In [4]:
# One-time setup: uncomment the next line to run the example-data download script.
# !./scripts/download_csx_example_data.sh

### read data into dataframes

In [5]:
import pandas as pd

# Lower the threshold of the logger named 'pandas' to DEBUG.
pandas_logger = logging.getLogger('pandas')
pandas_logger.setLevel('DEBUG')

In [6]:
# Single-cell reference profiles: tab-separated text, first column used as the row index.
path = (
    "/mnt/buckets/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs/"
    + "Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt"
)
nsclc_pbmc_sc = pd.read_csv(path, sep='\t', index_col=0)

# Last expression of the cell, so the frame is rendered below.
nsclc_pbmc_sc

Unnamed: 0_level_0,T cells CD8,T cells CD8.1,T cells CD8.2,Monocytes,Monocytes.1,T cells CD4,T cells CD8.3,Monocytes.2,Monocytes.3,Monocytes.4,...,T cells CD8.233,T cells CD8.234,NKT cells.80,Monocytes.454,Monocytes.455,Monocytes.456,Monocytes.457,NKT cells.81,T cells CD8.235,Monocytes.458
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11.34P13.7,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AP006222.2,0.0,0.0,0.0,0.0,0.0,0.0,216.59086,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP4.669L17.10,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP5.857K21.3,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC011841.1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL354822.1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIR2DL2,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PNRC2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Total of every column (one total per single-cell profile), sorted ascending,
# to check how the reference profiles are normalized.
nsclc_pbmc_sc.sum(axis="index").sort_values()

T cells CD4.127    1000000.0
Monocytes.175      1000000.0
Monocytes.295      1000000.0
T cells CD4.82     1000000.0
T cells CD8.6      1000000.0
                     ...    
Monocytes.319      1000000.0
Monocytes.205      1000000.0
T cells CD8.81     1000000.0
Monocytes.81       1000000.0
Monocytes.230      1000000.0
Length: 1054, dtype: float64

In [8]:
# Whole-blood bulk RNA-seq mixtures: tab-separated text, first column used as the row index.
path = (
    "/mnt/buckets/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs/"
    + "Fig2b-WholeBlood_RNAseq.txt"
)
nsclc_wholeblood_mixtures = pd.read_csv(path, sep='\t', index_col=0)

# Last expression of the cell, so the frame is rendered below.
nsclc_wholeblood_mixtures

Unnamed: 0_level_0,W070517001156,W070517001157,W070517001159,W070517001160,W070517001161,W070517001162,W070517102034,W070517102035,W070517102036,W070517102037,W070517102038,W070517102051
GeneSym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.304710,0.000000,0.000000,0.752697,0.000000
7SK,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.181943,0.000000,0.000000
A1BG,1.524589,1.198209,2.281101,2.510963,1.752686,3.467098,2.523853,1.634724,2.687471,3.385051,2.195180,1.912779
A1BG-AS1,0.210020,0.263073,0.410865,0.571484,0.139725,0.142219,0.348219,0.294046,0.732450,0.595088,0.424970,0.272239
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,18.753000,10.084024,8.159590,12.489620,5.222887,6.192270,7.825120,12.366960,7.205970,7.896432,9.496550,8.637130
ZYX,200.613353,140.107566,144.816461,134.412477,81.341464,107.785758,62.656594,265.309460,88.768774,94.147450,194.531694,127.203111
ZYXP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,76.731847,61.515861,51.289958,80.723373,48.363048,44.139845,51.296040,63.324805,40.328860,49.242034,73.869547,62.659012


## run csx with docker

```
docker run \
    -v absolute/path/to/input/dir:/src/data \
    -v absolute/path/to/output/dir:/src/outdir \
    cibersortx/fractions \
    --username email_address_registered_on_CIBERSORTx_website \
    --token token_obtained_from_CIBERSORTx_website \
    --single_cell TRUE \
    --refsample Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt \
    --mixture Fig2b-WholeBlood_RNAseq.txt \
    --fraction 0 \
    --rmbatchSmode TRUE 

```

In [10]:
# Shell escape: list the files in the Fig2ab-NSCLC_PBMCs example directory.
!ls -l /mnt/buckets/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs

total 57864
-rw-r--r-- 1 jupyter jupyter 54711251 Jul 12  2018 Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt
-rw-r--r-- 1 jupyter jupyter   190077 Jul 12  2018 Fig2ab-NSCLC_PBMCs_scRNAseq_sigmatrix.txt
-rw-r--r-- 1 jupyter jupyter  4350725 Jul 12  2018 Fig2b-WholeBlood_RNAseq.txt


In [11]:
# Shell escape: run the project's CIBERSORTx wrapper script. Per the log below it
# sets up a timestamped run directory with in/ and out/ and runs CIBERSORTxFractions.
!./scripts/run_csx_fractions.sh

setting up CIBERSORTx in directory:
/mnt/buckets/liulab/csx-runs/20210714_165603
[01;34m/mnt/buckets/liulab/csx-runs/20210714_165603[00m
├── [   0]  [01;34min[00m
│   ├── [4.1M]  mymixture.txt
│   └── [ 52M]  myrefsample.txt
└── [   0]  [01;34mout[00m

2 directories, 2 files
>Running CIBERSORTxFractions...
>[Options] username: <redacted>
>[Options] token: <redacted>
>[Options] single_cell: TRUE
>[Options] refsample: myrefsample.txt
>[Options] mixture: mymixture.txt
>[Options] rmbatchSmode: TRUE
>[Options] verbose: TRUE
>Making reference sample file.
>Making phenotype class file.
>single_cell is set to TRUE, so quantile normalization is set to FALSE, and the default parameters for building the signature matrix have been set to the following values:
	- G.min <- 300
	- G.max <- 500
	- q.value <- 0.01
>Pure samples file: /src/outdir//CIBERSORTx_myrefsample_inferred_refsample.txt
>Phenotype classes file: /src/outdir//CIBERSORTx_myrefsample_inferred_phe

In [16]:
# Signature matrix written to the run's out/ directory: tab-separated text,
# first column used as the row index.
path = (
    "/mnt/buckets/liulab/csx-runs/20210714_165603/out/"
    + "CIBERSORTx_sigmatrix_Adjusted.txt"
)
learned_sigmatrix = pd.read_csv(path, sep='\t', index_col=0)

In [17]:
# Peek at 10 random rows of the signature matrix. The fixed random_state makes
# the same rows appear on every Restart & Run All.
learned_sigmatrix.sample(10, random_state=0)

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F13A1,31.582554,116.29147,16.543526,19.393538,0.0,16.128564
GNLY,204.601176,89.981023,101.01992,1603.024312,18.236959,2796.002095
IGSF6,10.61404,115.769768,6.293894,13.371633,46.869165,25.520371
PRRC2B,41.120586,22.222028,25.461029,18.329093,23.464044,17.333819
CD79B,0.0,25.660625,43.955907,7.260332,820.412897,18.707355
SCML4,36.19252,3.996084,44.85198,56.727066,15.05321,34.545712
VPS13C,87.172807,111.992892,43.918278,63.700344,0.0,32.793671
HMGCL,26.608174,12.36409,16.029836,25.671054,63.150202,36.013538
ZCCHC6,65.567998,91.804041,32.839599,38.819955,34.748406,47.654076
SDF2L1,14.30975,9.338532,4.656168,28.555987,11.328592,35.662218


In [25]:
# Mixture file from the run's in/ directory: tab-separated text,
# first column used as the row index.
path = (
    "/mnt/buckets/liulab/csx-runs/20210714_165603/in/"
    + "mymixture.txt"
)
mixtures = pd.read_csv(path, sep='\t', index_col=0)

In [26]:
# Render the mixtures frame (rows and columns are abbreviated in the output below).
mixtures

Unnamed: 0_level_0,W070517001156,W070517001157,W070517001159,W070517001160,W070517001161,W070517001162,W070517102034,W070517102035,W070517102036,W070517102037,W070517102038,W070517102051
GeneSym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.304710,0.000000,0.000000,0.752697,0.000000
7SK,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.181943,0.000000,0.000000
A1BG,1.524589,1.198209,2.281101,2.510963,1.752686,3.467098,2.523853,1.634724,2.687471,3.385051,2.195180,1.912779
A1BG-AS1,0.210020,0.263073,0.410865,0.571484,0.139725,0.142219,0.348219,0.294046,0.732450,0.595088,0.424970,0.272239
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,18.753000,10.084024,8.159590,12.489620,5.222887,6.192270,7.825120,12.366960,7.205970,7.896432,9.496550,8.637130
ZYX,200.613353,140.107566,144.816461,134.412477,81.341464,107.785758,62.656594,265.309460,88.768774,94.147450,194.531694,127.203111
ZYXP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,76.731847,61.515861,51.289958,80.723373,48.363048,44.139845,51.296040,63.324805,40.328860,49.242034,73.869547,62.659012


In [27]:
# Preview: signature-matrix columns side by side with one mixture sample,
# joined on the row index of both objects.
learned_sigmatrix.merge(
    mixtures['W070517001156'],
    left_index=True,
    right_index=True,
)

Unnamed: 0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells,W070517001156
A1BG,9.279932,19.968371,7.059486,5.317669,5.934505,18.955470,1.524589
AATF,40.693993,27.234785,18.720587,74.402640,21.070055,52.783887,40.336661
ABCF1,52.575306,20.556804,43.116876,43.514392,16.124892,81.494719,40.893660
ABHD14A,13.013968,3.411371,5.650299,14.566843,0.000000,0.577246,0.824229
ABHD2,31.165896,100.307776,20.938952,48.473854,0.000000,30.596787,145.571646
...,...,...,...,...,...,...,...
ZNF800,65.349101,24.597513,37.836547,73.230805,39.224281,71.555645,52.661468
ZNF92,20.641427,5.142733,11.595737,12.037755,31.046507,7.320498,3.499944
ZNRD1,20.728089,10.430958,15.253098,27.972517,0.000000,22.661131,3.807222
ZRSR2,64.781923,34.281854,47.319957,34.485672,66.667349,65.680700,59.136600


# Attempt to infer the fractions directly from the signature matrix and one mixture sample

In [37]:
import numpy
# The cells below call `np.set_printoptions` and `np.sum`; without this alias
# they raise NameError on a fresh kernel.
import numpy as np

from sklearn.svm import NuSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [39]:
# Print NumPy floats with 6 digits of precision and without scientific notation.
np.set_printoptions(precision=6, suppress=True)

In [29]:
# Join the signature matrix with one mixture sample on the row index, then split
# the resulting array: every column but the last is the design matrix X (the
# signature columns), the last column is the target y (the mixture sample).
_combined_data = learned_sigmatrix.merge(
    mixtures['W070517001156'],
    left_index=True,
    right_index=True,
)
_values = _combined_data.values
X, y = _values[:, :-1], _values[:, -1]
y.shape, X.shape

((1759,), (1759, 6))

In [44]:
# Regress the mixture profile (y) on the signature columns (X) with a linear
# nu-SVR, then rescale the fitted weights so they sum to 1.
# NOTE(review): StandardScaler standardizes each signature column separately, so
# the weights are in standardized units, and negative weights are not clipped.
# Confirm this is intended before reading the result as cell fractions.
regr = make_pipeline(StandardScaler(), NuSVR(nu=0.5, kernel='linear'))
regr.fit(X, y)
# Use a descriptive name rather than `_`: assigning to `_` overrides IPython's
# last-output variable and stops IPython from updating it.
svr_weights = regr.named_steps['nusvr'].coef_
svr_weights / svr_weights.sum()

array([[0.085325, 0.671353, 0.09712 , 0.015047, 0.077711, 0.053444]])

# check fractions inferred by csx

In [50]:
# Fractions file written to the run's out/ directory: tab-separated text,
# first column used as the row index.
path = (
    "/mnt/buckets/liulab/csx-runs/20210714_165603/out/"
    + "CIBERSORTx_Adjusted.txt"
)
inferred_fractions = pd.read_csv(path, sep='\t', index_col=0)

# Pull one sample's values for the six cell-type columns, in this fixed order.
cell_types = [
    'T cells CD8',
    'Monocytes',
    'T cells CD4',
    'NKT cells',
    'B cells',
    'NK cells',
]
inferred_fractions.loc['W070517001156', cell_types].values

array([0.131738, 0.549439, 0.179788, 0.009661, 0.090603, 0.038769])

# Extra: inspect the signature matrix and mixture files in the example directory

In [51]:
# Display-only: load the signature-matrix file from the example directory
# (tab-separated, first column used as the row index).
pd.read_csv(
    "/mnt/buckets/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs/"
    "Fig2ab-NSCLC_PBMCs_scRNAseq_sigmatrix.txt",
    sep="\t",
    index_col=0,
)

Unnamed: 0_level_0,T cells CD8,Monocytes,T cells CD4,NKT cells,B cells,NK cells
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A2M.AS1,42.389248,3.034278,8.406227,47.115313,1.0,1.000000
AAK1,171.246018,53.947719,245.690249,189.969058,1.0,176.336120
ABCA1,3.097667,14.269549,1.000000,1.000000,1.0,1.000000
ABCA2,22.847189,2.331345,3.134736,21.462190,1.0,34.812778
ABCA3,1.000000,1.653714,1.000000,8.154662,1.0,11.566915
...,...,...,...,...,...,...
ZSWIM1,1.000000,1.461999,6.619884,1.000000,1.0,1.000000
ZSWIM6,12.516047,33.538731,2.002665,6.003127,1.0,1.000000
ZXDA,4.640282,1.000000,1.000000,1.000000,1.0,1.000000
ZXDB,6.661040,1.751215,8.746464,1.000000,1.0,1.000000


In [52]:
# Display-only: load the whole-blood mixture file from the example directory
# (tab-separated, first column used as the row index).
pd.read_csv(
    "/mnt/buckets/liulab/csx_example_files/Fig2ab-NSCLC_PBMCs/"
    "Fig2b-WholeBlood_RNAseq.txt",
    sep="\t",
    index_col=0,
)

Unnamed: 0_level_0,W070517001156,W070517001157,W070517001159,W070517001160,W070517001161,W070517001162,W070517102034,W070517102035,W070517102036,W070517102037,W070517102038,W070517102051
GeneSym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.304710,0.000000,0.000000,0.752697,0.000000
7SK,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.181943,0.000000,0.000000
A1BG,1.524589,1.198209,2.281101,2.510963,1.752686,3.467098,2.523853,1.634724,2.687471,3.385051,2.195180,1.912779
A1BG-AS1,0.210020,0.263073,0.410865,0.571484,0.139725,0.142219,0.348219,0.294046,0.732450,0.595088,0.424970,0.272239
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,18.753000,10.084024,8.159590,12.489620,5.222887,6.192270,7.825120,12.366960,7.205970,7.896432,9.496550,8.637130
ZYX,200.613353,140.107566,144.816461,134.412477,81.341464,107.785758,62.656594,265.309460,88.768774,94.147450,194.531694,127.203111
ZYXP1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,76.731847,61.515861,51.289958,80.723373,48.363048,44.139845,51.296040,63.324805,40.328860,49.242034,73.869547,62.659012
