In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# One shared, seeded generator so every random draw below is reproducible.
rng = np.random.default_rng(0)

In [3]:
s = 100  # number of samples
# Sample ids are zero-padded to the number of digits in s.
samples = pd.Series(
    [f"sample_{str(idx).zfill(len(str(s)))}" for idx in range(s)],
    name="Mixture",
)
samples

0     sample_000
1     sample_001
2     sample_002
3     sample_003
4     sample_004
         ...    
95    sample_095
96    sample_096
97    sample_097
98    sample_098
99    sample_099
Name: Mixture, Length: 100, dtype: object

In [4]:
c = 15  # number of cell types
# Cell type labels are zero-padded to the number of digits in c.
cell_types = pd.Series(
    ["cell_{:0{width}d}".format(k, width=len(str(c))) for k in range(c)],
    name="cell_type",
)
cell_types

0     cell_00
1     cell_01
2     cell_02
3     cell_03
4     cell_04
5     cell_05
6     cell_06
7     cell_07
8     cell_08
9     cell_09
10    cell_10
11    cell_11
12    cell_12
13    cell_13
14    cell_14
Name: cell_type, dtype: object

In [5]:
make_fake_data = True

if make_fake_data:
    # Synthetic signature matrix: one row per gene, one column per cell type,
    # each value drawn independently from Uniform[20, 200).
    g = 20000  # number of genes
    genes = pd.Series([f"FAKE{i:0{len(str(g))}d}" for i in range(g)], name="GeneSymbol")
    cell_type_geps_known = pd.DataFrame(
        rng.uniform(low=20.0, high=200.0, size=(g, c)), columns=cell_types, index=genes
    )
else:
    # TODO: this looks like a bucket root rather than a file; point it at the
    # actual tab-separated GEP table before setting make_fake_data = False.
    path = "gs://liulab/"
    cell_type_geps_known = pd.read_csv(path, sep="\t", index_col=0)
    g = len(cell_type_geps_known)
    genes = pd.Series(cell_type_geps_known.index)
    # Re-derive the cell types from the loaded table. Otherwise the fractions
    # built later keep the synthetic labels and the matrix product with
    # cell_type_geps_known fails to align.
    cell_types = pd.Series(cell_type_geps_known.columns, name="cell_type")
    c = len(cell_types)

cell_type_geps_known

cell_type,cell_00,cell_01,cell_02,cell_03,cell_04,cell_05,cell_06,cell_07,cell_08,cell_09,cell_10,cell_11,cell_12,cell_13,cell_14
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
FAKE00000,134.653104,68.561608,27.375234,22.974974,166.388643,184.296004,129.194440,151.309381,117.852498,188.313036,166.853640,20.492930,174.332770,26.045404,151.337980
FAKE00001,51.618012,175.372206,117.463020,73.948140,96.083700,25.097541,42.370990,140.712395,136.494112,130.769320,89.061960,199.497788,196.550361,143.397557,137.082670
FAKE00002,143.920412,90.005856,44.317371,149.867901,114.563778,75.843538,107.450365,180.107810,188.127833,84.403135,122.875370,77.936490,126.974005,80.824021,90.491420
FAKE00003,180.249383,60.888367,132.173686,35.122762,169.875947,161.677695,63.086500,177.767162,30.542246,80.501071,47.050304,101.061086,163.338369,61.515598,29.363834
FAKE00004,92.819331,55.732348,36.335548,124.459829,73.765304,140.959078,55.912780,189.580360,85.719830,38.989150,133.239467,186.887820,99.267888,191.826289,109.981246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FAKE19995,97.516457,40.494301,158.594559,112.293218,107.437837,179.441670,95.908483,143.862697,142.727650,26.938403,60.928486,68.590896,90.327525,163.789251,63.072727
FAKE19996,56.182387,126.812460,161.922077,146.193581,161.256369,35.576508,101.780236,129.998264,129.086156,142.288038,83.808072,98.538341,170.325195,52.221026,96.789273
FAKE19997,101.684565,165.352061,130.613755,157.986221,109.543798,72.656536,138.737370,85.826953,77.396502,29.201255,181.392263,199.363218,91.116260,116.748674,71.470641
FAKE19998,114.358086,47.661403,104.620854,115.184721,99.535762,83.525361,159.696441,121.171307,77.086889,97.816747,109.410536,25.658682,175.805709,111.315127,93.923947


In [6]:
if c == 2:
    # Evenly spaced fractions p = 0, 1/s, ..., (s-1)/s for the first cell type.
    # linspace always yields exactly s values; arange with a float step is not
    # guaranteed to, which would break the DataFrame construction below.
    fraction_values = [(p, 1 - p) for p in np.linspace(0, 1, s, endpoint=False)]
else:
    # Random fractions: with all concentration parameters equal to 1, each row
    # is drawn uniformly from the simplex (non-negative, sums to 1).
    fraction_values = rng.dirichlet((1,) * c, size=(s,))

# Rows are samples, columns are cell types.
fractions = pd.DataFrame(fraction_values, index=samples, columns=cell_types)

# Same table plus zero-filled metric columns; presumably this mimics the layout
# of a CIBERSORTx fractions result file (confirm against the tool's docs).
fractions_and_empty_csx_metrics = fractions.assign(
    **{"P-value": 0, "Correlation": 0, "RMSE": 0}
)

fractions_and_empty_csx_metrics.head()

cell_type,cell_00,cell_01,cell_02,cell_03,cell_04,cell_05,cell_06,cell_07,cell_08,cell_09,cell_10,cell_11,cell_12,cell_13,cell_14,P-value,Correlation,RMSE
Mixture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
sample_000,0.106978,0.045821,0.015543,0.008793,0.058507,0.010925,0.047463,0.005687,0.29276,0.133841,0.02205528,0.060117,0.001567,0.050249,0.139692,0,0,0
sample_001,0.003211,0.074931,0.030299,0.104015,0.021104,0.36749,0.130251,0.009127,0.003269,0.030158,2.974689e-07,0.046795,0.100138,0.062947,0.016265,0,0,0
sample_002,0.014459,0.144278,0.076262,0.018306,0.034171,0.045731,0.085911,0.056957,0.195259,0.064227,0.008863847,0.029965,0.034748,0.112814,0.078051,0,0,0
sample_003,0.047392,0.004293,0.091425,0.205132,0.050327,0.012089,0.211999,0.023242,0.105343,0.007657,0.0344996,0.047304,0.064277,0.054325,0.040696,0,0,0
sample_004,0.014482,0.009112,0.002267,0.015641,0.027896,0.031466,0.058003,0.057248,0.06134,0.111015,0.05164205,0.276449,0.083729,0.125027,0.074684,0,0,0


In [7]:
# Additive per-gene, per-sample noise drawn from Uniform[0, 1).
mixture_noise = rng.uniform(0, 1.0, size=(g, s))

# Bulk expression = (genes x cell types) @ (cell types x samples) + noise.
mixtures = (cell_type_geps_known @ fractions.T).add(mixture_noise)

mixtures.head()

Mixture,sample_000,sample_001,sample_002,sample_003,sample_004,sample_005,sample_006,sample_007,sample_008,sample_009,...,sample_090,sample_091,sample_092,sample_093,sample_094,sample_095,sample_096,sample_097,sample_098,sample_099
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FAKE00000,125.09056,127.719812,106.799932,95.39305,101.167017,126.403077,141.402384,135.209969,152.903806,86.936602,...,129.336071,111.363294,92.455666,118.167529,104.232918,120.596198,121.089202,111.339787,111.281289,128.566886
FAKE00001,123.772727,87.742327,128.425999,100.921666,145.397545,117.454007,116.517232,117.916906,105.566914,115.125319,...,114.432057,125.780161,140.031818,131.063523,124.304057,104.865955,108.400463,120.368481,120.093499,115.753686
FAKE00002,125.868231,96.180098,114.255018,121.110642,104.187968,127.555091,114.806804,117.042114,102.634462,109.546295,...,113.41362,111.380532,108.839413,111.086862,110.365549,113.635048,106.497024,88.987383,107.348761,108.113878
FAKE00003,74.958461,114.503671,80.959477,82.372446,93.290867,110.847214,106.93458,111.325914,109.310501,85.420729,...,99.626752,104.697832,90.78889,112.955228,102.312937,81.679655,104.726203,96.084697,70.149633,101.178491
FAKE00004,93.43956,115.322864,99.162965,100.088154,133.879756,113.229674,110.527636,107.629563,79.281131,126.47959,...,125.24495,108.692354,110.823164,105.35188,123.605742,85.066466,100.764524,89.512393,101.763388,87.825259


In [8]:
# Sanity check: True would mean the signature matrix has a negative value.
(cell_type_geps_known < 0).values.any()

False

In [9]:
# Sanity check: True would mean some mixture has a negative expression value.
(mixtures < 0).values.any()

False

# Set up CIBERSORTx GEP imputation: "group" mode

In [10]:
# Absolute output directory for the group-mode run (resolved against the CWD).
base_path = os.path.abspath(os.path.join("5_results", "group_mode"))

In [11]:
!sudo chown -R jupyter:jupyter $base_path
!rm -r $base_path
os.makedirs(base_path, exist_ok=True)
os.makedirs(os.path.join(base_path, "in"), exist_ok=True)

path = os.path.join(base_path, "in", "mixtures_computed.txt")
mixtures.to_csv(path, sep="\t")
print(path)

path = os.path.join(base_path, "fractions_known_and_empty_csx_metrics.txt")
fractions_and_empty_csx_metrics.to_csv(path, sep="\t")
print(path)

path = os.path.join(base_path, "in", "cell_type_geps_known.txt")
cell_type_geps_known.to_csv(path, sep="\t")
print(path)

/home/jupyter/deconv/5_results/group_mode/in/mixtures_computed.txt
/home/jupyter/deconv/5_results/group_mode/fractions_known_and_empty_csx_metrics.txt
/home/jupyter/deconv/5_results/group_mode/in/cell_type_geps_known.txt
[01;34m/home/jupyter/deconv/5_results/group_mode[00m
├── [ 32K]  fractions_known_and_empty_csx_metrics.txt
└── [4.0K]  [01;34min[00m
    ├── [5.5M]  cell_type_geps_known.txt
    └── [ 36M]  mixtures_computed.txt

1 directory, 3 files


In [11]:
!tree -h $base_path

[01;34m/home/jupyter/deconv/5_results/group_mode[00m
├── [4.6M]  CIBERSORTxGEP_GEPs.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_CVs.txt
├── [4.6M]  CIBERSORTxGEP_GEPs_Filtered.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_Pvals.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_Qvals.txt
├── [5.3M]  CIBERSORTxGEP_GEPs_StdErrs.txt
├── [985K]  CIBERSORTxGEP_GEPs_ThresholdPlots.pdf
├── [4.6M]  CIBERSORTxGEP_SM_GEPs_Filtered.txt
├── [ 32K]  CIBERSORTxGEP_Weights.txt
├── [ 32K]  fractions_known_and_empty_csx_metrics.txt
└── [4.0K]  [01;34min[00m
    ├── [5.5M]  cell_type_geps_known.txt
    └── [ 36M]  mixtures_computed.txt

1 directory, 12 files


In [13]:
!docker run \
    --rm \
    -it \
    -v $base_path/in:/src/data \
    -v $base_path:/src/outdir \
    cibersortx/gep:latest \
    --username lyronctk@stanford.edu \
    --token dfeba2c8b9d61daebee5fa87026b8e56 \
    --mixture mixtures_computed.txt \
    --cibresults fractions_known_and_empty_csx_metrics.txt \
    --sigmatrix cell_type_geps_known.txt

!sudo chown -R jupyter:jupyter $base_path

>Running CIBERSORTx GEP imputation (representative profiles only)...
>[Options] username: lyronctk@stanford.edu
>[Options] token: dfeba2c8b9d61daebee5fa87026b8e56
>[Options] mixture: mixtures_computed.txt
>[Options] cibresults: fractions_known_and_empty_csx_metrics.txt
>[Options] sigmatrix: cell_type_geps_known.txt
>Previous estimates of cell proportions detected. To rerun, use redocibersort=TRUE.
>Loaded 100 mixture samples, 20000 genes, and 15 cell subsets...
>Imputing representative cell type GEPs...done.
>Writing output to disk...done.
>Running time (sec): 41


In [12]:
!tree -h $base_path

[01;34m/home/jupyter/deconv/5_results/group_mode[00m
├── [4.6M]  CIBERSORTxGEP_GEPs.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_CVs.txt
├── [4.6M]  CIBERSORTxGEP_GEPs_Filtered.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_Pvals.txt
├── [5.9M]  CIBERSORTxGEP_GEPs_Qvals.txt
├── [5.3M]  CIBERSORTxGEP_GEPs_StdErrs.txt
├── [985K]  CIBERSORTxGEP_GEPs_ThresholdPlots.pdf
├── [4.6M]  CIBERSORTxGEP_SM_GEPs_Filtered.txt
├── [ 32K]  CIBERSORTxGEP_Weights.txt
├── [ 32K]  fractions_known_and_empty_csx_metrics.txt
└── [4.0K]  [01;34min[00m
    ├── [5.5M]  cell_type_geps_known.txt
    └── [ 36M]  mixtures_computed.txt

1 directory, 12 files


# Set up CIBERSORTx GEP imputation: "hires" mode

In [14]:
# Absolute output directory for the hires run (resolved against the CWD).
base_path = os.path.abspath(os.path.join("5_results", "hires"))

In [15]:
!sudo chown -R jupyter:jupyter $base_path
!rm -r $base_path
os.makedirs(base_path, exist_ok=True)
os.makedirs(os.path.join(base_path, "in"), exist_ok=True)

path = os.path.join(base_path, "in", "mixtures_computed.txt")
mixtures.to_csv(path, sep="\t")
print(path)

path = os.path.join(base_path, "fractions_known_and_empty_csx_metrics.txt")
fractions_and_empty_csx_metrics.to_csv(path, sep="\t")
print(path)

path = os.path.join(base_path, "fractions_known.txt")
fractions.to_csv(path, sep="\t")
print(path)

path = os.path.join(base_path, "in", "cell_type_geps_known.txt")
cell_type_geps_known.to_csv(path, sep="\t")
print(path)

/home/jupyter/deconv/5_results/hires/in/mixtures_computed.txt
/home/jupyter/deconv/5_results/hires/fractions_known_and_empty_csx_metrics.txt
/home/jupyter/deconv/5_results/hires/fractions_known.txt
/home/jupyter/deconv/5_results/hires/in/cell_type_geps_known.txt
[01;34m/home/jupyter/deconv/5_results/hires[00m
├── [ 31K]  fractions_known.txt
├── [ 32K]  fractions_known_and_empty_csx_metrics.txt
└── [4.0K]  [01;34min[00m
    ├── [5.5M]  cell_type_geps_known.txt
    └── [ 36M]  mixtures_computed.txt

1 directory, 4 files


In [15]:
!tree -h $base_path

[01;34m/home/jupyter/deconv/5_results/hires[00m
├── [203K]  CIBERSORTxHiRes_NA_Heatmap_cell_00_Window50.png
├── [201K]  CIBERSORTxHiRes_NA_Heatmap_cell_01_Window50.png
├── [197K]  CIBERSORTxHiRes_NA_Heatmap_cell_02_Window50.png
├── [187K]  CIBERSORTxHiRes_NA_Heatmap_cell_04_Window50.png
├── [187K]  CIBERSORTxHiRes_NA_Heatmap_cell_05_Window50.png
├── [210K]  CIBERSORTxHiRes_NA_Heatmap_cell_06_Window50.png
├── [217K]  CIBERSORTxHiRes_NA_Heatmap_cell_07_Window50.png
├── [172K]  CIBERSORTxHiRes_NA_Heatmap_cell_08_Window50.png
├── [184K]  CIBERSORTxHiRes_NA_Heatmap_cell_09_Window50.png
├── [214K]  CIBERSORTxHiRes_NA_Heatmap_cell_10_Window50.png
├── [185K]  CIBERSORTxHiRes_NA_Heatmap_cell_11_Window50.png
├── [203K]  CIBERSORTxHiRes_NA_Heatmap_cell_12_Window50.png
├── [189K]  CIBERSORTxHiRes_NA_Heatmap_cell_13_Window50.png
├── [188K]  CIBERSORTxHiRes_NA_Heatmap_cell_14_Window50.png
├── [ 41M]  CIBERSORTxHiRes_NA_cell_00_Window50.txt
├── [ 41M]  CIBERSORTxHiRes_NA_cell_01_Window50.txt
├── [ 

In [16]:
!rm $base_path/CIBERSORT*

!docker run \
    --rm \
    -it \
    -v $base_path/in:/src/data \
    -v $base_path:/src/outdir \
    cibersortx/hires:latest \
    --username lyronctk@stanford.edu \
    --token dfeba2c8b9d61daebee5fa87026b8e56 \
    --mixture mixtures_computed.txt \
    --cibresults fractions_known.txt

'''
    --sigmatrix cell_type_geps_known.txt
'''

!sudo chown -R jupyter:jupyter $base_path

rm: cannot remove '/home/jupyter/deconv/5_results/hires/CIBERSORT*': No such file or directory
>Running CIBERSORTx high-resolution GEP imputation...
>[Options] username: lyronctk@stanford.edu
>[Options] token: dfeba2c8b9d61daebee5fa87026b8e56
>[Options] mixture: mixtures_computed.txt
>[Options] cibresults: fractions_known.txt
>Loaded 100 mixture samples, 20000 genes, and 15 cell subsets...
>Window size adaptively set to 50
>Imputing high-resolution cell type GEPs...done.
>Writing output to disk ...done.
>Running time (sec): 924


In [16]:
!tree -h $base_path

[01;34m/home/jupyter/deconv/5_results/hires[00m
├── [203K]  CIBERSORTxHiRes_NA_Heatmap_cell_00_Window50.png
├── [201K]  CIBERSORTxHiRes_NA_Heatmap_cell_01_Window50.png
├── [197K]  CIBERSORTxHiRes_NA_Heatmap_cell_02_Window50.png
├── [187K]  CIBERSORTxHiRes_NA_Heatmap_cell_04_Window50.png
├── [187K]  CIBERSORTxHiRes_NA_Heatmap_cell_05_Window50.png
├── [210K]  CIBERSORTxHiRes_NA_Heatmap_cell_06_Window50.png
├── [217K]  CIBERSORTxHiRes_NA_Heatmap_cell_07_Window50.png
├── [172K]  CIBERSORTxHiRes_NA_Heatmap_cell_08_Window50.png
├── [184K]  CIBERSORTxHiRes_NA_Heatmap_cell_09_Window50.png
├── [214K]  CIBERSORTxHiRes_NA_Heatmap_cell_10_Window50.png
├── [185K]  CIBERSORTxHiRes_NA_Heatmap_cell_11_Window50.png
├── [203K]  CIBERSORTxHiRes_NA_Heatmap_cell_12_Window50.png
├── [189K]  CIBERSORTxHiRes_NA_Heatmap_cell_13_Window50.png
├── [188K]  CIBERSORTxHiRes_NA_Heatmap_cell_14_Window50.png
├── [ 41M]  CIBERSORTxHiRes_NA_cell_00_Window50.txt
├── [ 41M]  CIBERSORTxHiRes_NA_cell_01_Window50.txt
├── [ 

In [None]:
# pd.set_option('display.precision', 3)

In [21]:
# Per-gene mean, taken across the columns of the hires output table for
# cell_00, for comparison with the known profile.
pd.read_csv(
    os.path.join(base_path, "CIBERSORTxHiRes_NA_cell_00_Window50.txt"),
    index_col=0,
    sep="\t",
).mean(axis="columns")

GeneSymbol
FAKE00000    132.182016
FAKE00001     52.729992
FAKE00002    142.841248
FAKE00003    181.682223
FAKE00004     92.631225
                ...    
FAKE19995     98.396343
FAKE19996     56.822467
FAKE19997    102.159662
FAKE19998    115.526668
FAKE19999     71.649398
Length: 20000, dtype: float64

In [22]:
# Ground-truth expression profile for cell_00.
cell_type_geps_known.loc[:, "cell_00"]

GeneSymbol
FAKE00000    134.653104
FAKE00001     51.618012
FAKE00002    143.920412
FAKE00003    180.249383
FAKE00004     92.819331
                ...    
FAKE19995     97.516457
FAKE19996     56.182387
FAKE19997    101.684565
FAKE19998    114.358086
FAKE19999     70.994265
Name: cell_00, Length: 20000, dtype: float64