# Load Derek's pseudobulk analyses (2sd perturbations)

In [1]:
import glob
import os
import re
import sys

import pandas as pd

## mount data

In [2]:
# Mount the `liulab` Google Cloud Storage bucket as a directory in the home
# folder with gcsfuse (mount point: /home/jupyter/mnt/liulab).
# NOTE(review): the recorded output of this cell ends with fusermount
# "Permission denied" on the mount point, yet the listing in the next cell
# succeeds -- presumably the bucket was already mounted; confirm before
# relying on this cell from a fresh machine.
!gcsfuse --implicit-dirs liulab /home/jupyter/mnt/liulab

2021/07/07 14:15:23.918169 Using mount point: /home/jupyter/mnt/liulab
2021/07/07 14:15:23.926223 Opening GCS connection...
2021/07/07 14:15:24.187707 Mounting file system "liulab"...
daemonize.Run: readFromProcess: sub-process: mountWithArgs: mountWithConn: Mount: mount: running fusermount: exit status 1

stderr:
/usr/bin/fusermount: failed to access mountpoint /home/jupyter/mnt/liulab: Permission denied



In [3]:
# Check that the mount worked: print the directory tree of the mount point,
# limited to two levels (-L 2) and with human-readable sizes (-h).
!tree -h -L 2 /home/jupyter/mnt/liulab

[01;34m/home/jupyter/mnt/liulab[00m
├── [   0]  [01;34mcsx_example_files[00m
│   ├── [   0]  [01;34mExpression_datasets[00m
│   ├── [ 835]  Fig2b_ground_truth_whole_blood.txt
│   ├── [143K]  LM22.txt
│   ├── [   0]  [01;34mSingle_Cell_RNA-Seq_Melanoma_SuppFig_3b-d[00m
│   ├── [1.9K]  groundtruth_HNSCC_Puram_et_al_Fig2cd.txt
│   └── [1.2K]  groundtruth_Melanoma_Tirosh_et_al_SuppFig3b-d.txt
├── [   0]  [01;34mdata[00m
│   ├── [   0]  [01;34mdownloaded_manually[00m
│   ├── [   0]  [01;34mftp[00m
│   └── [   0]  [01;34mgeoparse[00m
└── [   0]  [01;34mderek[00m
    ├── [   0]  [01;34mscRNAseq[00m
    └── [   0]  [01;34msimulations[00m

10 directories, 4 files


## find 2sd files

In [4]:
# Recursively walk everything under derek/ and print only the paths that
# mention "2sd". (To peek inside a hit, read it with
# pd.read_csv(filepath, sep="\t").head(3).)
for filepath in glob.iglob('/home/jupyter/mnt/liulab/derek/**/*', recursive=True):
    if '2sd' in filepath:
        print(filepath)


/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx_sim_2sd.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_Fractions-Adjusted.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs_CVs.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs_Filtered.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs_Pvals.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs_Qvals.txt
/home/jupyter/mnt/liulab/derek/simulations/experiments/cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_GEPs_StdErrs.txt

## load files

In [10]:
# Directory on the mounted bucket that the loading cells below join their
# relative file paths onto.
base_dir = "/home/jupyter/mnt/liulab/derek/simulations/experiments/"

helper function for standardizing column names

In [5]:
def process_column_name(column: str) -> str:
    """Standardize a raw column label into a lowercase, underscore-separated name.

    Steps, in order:
      1. lowercase the label and strip surrounding whitespace;
      2. drop a "cell"/"cells" token together with the separator in front of
         it (e.g. "T.cells.CD8" -> "t.cd8", "NK cells" -> "nk");
      3. replace every remaining "." or space with "_".

    Args:
        column: column label as it appears in the file header.

    Returns:
        The normalized label, e.g. "sim.tumor.1" -> "sim_tumor_1".
    """
    column = column.lower().strip()
    # Require a real separator (any non-alphanumeric character, or "_") in
    # front of "cell(s)". A bare "." would match ANY character and therefore
    # also eat letters (e.g. "excellent" -> "eent").
    column = re.sub(r"[\W_]cells?", "", column)
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    column = re.sub(r"[. ]", "_", column)
    return column

### groundtruth proportions

In [11]:
# Tab-separated ground-truth proportions table, displayed with standardized
# column names.
(
    pd.read_csv(
        os.path.join(base_dir, "generate_cohorts_for_emma/ctp_sim_2sd.txt"), sep="\t"
    )
    .rename(columns=process_column_name)
)

Unnamed: 0,malignant,endothelial,caf,t_cd8,nk,macrophage,t_cd4,b
1,0.274,0.080697,0.140275,0.037578,0.051757,0.066721,0.226917,0.122056
2,0.761,0.026566,0.046179,0.012371,0.017038,0.021965,0.074701,0.040181
3,0.843,0.017451,0.030335,0.008126,0.011193,0.014429,0.049072,0.026395
4,0.739,0.029011,0.050429,0.013509,0.018607,0.023986,0.081578,0.043880
5,0.710,0.032234,0.056033,0.015010,0.020674,0.026651,0.090642,0.048755
...,...,...,...,...,...,...,...,...
96,0.741,0.028789,0.050043,0.013406,0.018464,0.023803,0.080953,0.043543
97,0.717,0.031456,0.054680,0.014648,0.020175,0.026008,0.088454,0.047578
98,0.877,0.013672,0.023766,0.006366,0.008769,0.011304,0.038445,0.020679
99,0.838,0.018007,0.031301,0.008385,0.011549,0.014888,0.050634,0.027236


### generated (_in silico_) bulk samples

In [12]:
# Tab-separated in silico bulk table: first column becomes the index, rows are
# sorted by that index, and column names are standardized for display.
(
    pd.read_csv(
        os.path.join(base_dir, "generate_cohorts_for_emma/sim_2sd.txt"),
        sep="\t",
        index_col=0,
    )
    .sort_index()
    .rename(columns=process_column_name)
)

Unnamed: 0,sim_tumor_1,sim_tumor_2,sim_tumor_3,sim_tumor_4,sim_tumor_5,sim_tumor_6,sim_tumor_7,sim_tumor_8,sim_tumor_9,sim_tumor_10,...,sim_tumor_91,sim_tumor_92,sim_tumor_93,sim_tumor_94,sim_tumor_95,sim_tumor_96,sim_tumor_97,sim_tumor_98,sim_tumor_99,sim_tumor_100
A1BG,0.377547,8.628191,38.431846,5.934557,0.338356,1.102952,3.083051,2.002206,4.130644,34.079710,...,0.527516,0.139895,1.792838,0.047740,38.174442,12.897920,0.647267,0.252936,3.172672,1.694881
A1BG-AS1,0.209953,0.020849,0.135829,0.356260,2.533599,0.060231,10.318623,4.331257,5.118947,3.699168,...,9.609804,0.131246,3.991650,12.090708,0.017826,2.319751,0.029113,0.653743,4.750354,1.647821
A1CF,0.151359,0.056931,0.013277,0.054924,0.027325,0.264889,0.284251,0.037437,0.258140,0.552371,...,0.459364,0.035548,0.222898,0.082564,0.102891,0.046093,0.493716,0.084600,0.063418,0.153745
A2M,1077.666960,2339.561607,68.294249,997.914030,93.787339,651.816464,354.478062,1586.392914,350.044136,175.448801,...,89.718197,610.348155,433.905854,147.306104,83.023134,182.289944,1575.462935,1355.517361,162.118717,99.823493
A2M-AS1,11.252200,0.000000,0.000000,2.938910,0.020352,12.987792,2.625414,0.394104,3.855610,0.990894,...,5.209404,6.441148,0.200991,9.427507,0.066432,0.060262,1.682709,3.727766,0.000000,0.171641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,12.885265,1.941869,2.110330,5.789060,25.995065,6.144268,5.798888,7.603303,9.065823,13.647411,...,7.970289,5.018676,18.020256,7.953073,1.633666,4.017629,8.674140,8.715584,4.081291,7.897161
ZYG11B,8.804161,4.741164,2.315879,4.628113,17.756549,5.646317,10.409268,5.512391,9.516134,12.485028,...,18.654090,21.328501,10.867985,6.914683,3.858697,1.683303,12.333735,19.835904,3.917441,4.455430
ZYX,30.642090,57.732435,11.559291,135.836769,31.550229,46.172855,21.362768,75.589725,59.141445,84.244547,...,46.683227,61.904730,138.151159,45.697515,16.453333,16.651073,9.495373,41.734528,11.235196,6.618545
ZZEF1,6.020770,1.341900,0.727092,17.095655,1.572894,2.864163,5.388679,2.368271,6.452461,3.372158,...,2.853852,2.763041,3.466142,0.025354,0.202180,1.232799,18.178838,7.544118,10.035816,2.148776


#### same samples in another file (note: some values differ from the table above, e.g. `sim.tumor.6`)

In [13]:
# Tab-separated table read with its first column as the index and sorted by
# that index; column names are left exactly as they appear in the file.
(
    pd.read_csv(os.path.join(base_dir, "cibersortx_sim_2sd.txt"), sep="\t", index_col=0)
    .sort_index()
)

Unnamed: 0_level_0,sim.tumor.1,sim.tumor.2,sim.tumor.3,sim.tumor.4,sim.tumor.5,sim.tumor.6,sim.tumor.7,sim.tumor.8,sim.tumor.9,sim.tumor.10,...,sim.tumor.91,sim.tumor.92,sim.tumor.93,sim.tumor.94,sim.tumor.95,sim.tumor.96,sim.tumor.97,sim.tumor.98,sim.tumor.99,sim.tumor.100
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.377547,8.628191,38.431846,5.934557,0.338356,1.304081,3.083051,2.002206,1.588704,45.411955,...,0.605506,0.139895,1.792838,0.070797,38.174442,11.689945,0.647267,0.854497,3.172672,1.694881
A1BG-AS1,0.209953,0.020849,0.135829,0.356260,2.533599,0.042264,10.318623,4.331257,11.906094,1.910839,...,11.030535,0.131246,3.991650,8.804913,0.017826,3.016650,0.029113,2.208551,4.750354,1.647821
A1CF,0.151359,0.056931,0.013277,0.054924,0.027325,0.213459,0.284251,0.037437,0.098924,0.670424,...,0.455784,0.035548,0.222898,0.122439,0.102891,0.059940,0.493716,0.172510,0.063418,0.153745
A2M,1077.666960,2339.561607,68.294249,997.914030,93.787339,692.953907,354.478062,1586.392914,59.483907,119.212277,...,96.615861,610.348155,433.905854,214.815199,83.023134,237.053411,1575.462935,1032.671709,162.118717,99.823493
A2M-AS1,11.252200,0.000000,0.000000,2.938910,0.020352,9.309063,2.625414,0.394104,9.068281,0.511855,...,5.979572,6.441148,0.200991,13.980623,0.066432,0.078366,1.682709,6.271585,0.000000,0.171641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,12.885265,1.941869,2.110330,5.789060,25.995065,5.628393,5.798888,7.603303,4.853414,14.294516,...,8.199952,5.018676,18.020256,9.961530,1.633666,4.998457,8.674140,11.554351,4.081291,7.897161
ZYG11B,8.804161,4.741164,2.315879,4.628113,17.756549,4.812411,10.409268,5.512391,10.390681,15.099892,...,19.044051,21.328501,10.867985,10.050951,3.858697,2.088491,12.333735,14.068097,3.917441,4.455430
ZYX,30.642090,57.732435,11.559291,135.836769,31.550229,38.683274,21.362768,75.589725,19.389490,78.180151,...,53.584963,61.904730,138.151159,67.767627,16.453333,21.358889,9.495373,49.935370,11.235196,6.618545
ZZEF1,6.020770,1.341900,0.727092,17.095655,1.572894,2.352819,5.388679,2.368271,1.084141,2.769603,...,3.275771,2.763041,3.466142,0.037598,0.202180,1.603156,18.178838,6.849002,10.035816,2.148776


### CIBERSORTx inferred proportions

In [14]:
# Tab-separated CIBERSORTx fractions table (Job1), indexed by its first column
# and displayed with standardized column names.
# Alternative file kept for reference:
#   cibersortx/old/cibersortx_sim_2sd/CIBERSORTxGEP_Job5_Fractions-Adjusted.txt
(
    pd.read_csv(
        os.path.join(
            base_dir,
            "cibersortx/cibersortx_sim_2sd/CIBERSORTxGEP_Job1_Fractions-Adjusted.txt",
        ),
        sep="\t",
        index_col=0,
    )
    .rename(columns=process_column_name)
)

Unnamed: 0_level_0,malignant,endothelial,caf,t_cd8,nk,macrophages,t_cd4,b,p-value,correlation,rmse
Mixture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sim.tumor.1,0.467000,0.007153,0.104783,0.034400,0.041658,0.073266,0.201250,0.070491,9999,0.878855,0.698618
sim.tumor.2,0.772265,0.001136,0.033639,0.006319,0.023389,0.036233,0.101917,0.025101,9999,0.835026,0.698110
sim.tumor.3,0.276416,0.001790,0.195520,0.078654,0.035919,0.136419,0.219770,0.055512,9999,0.598221,0.817916
sim.tumor.4,0.467639,0.033508,0.142829,0.044773,0.030813,0.085399,0.167128,0.027909,9999,0.675735,0.801465
sim.tumor.5,0.499380,0.014246,0.053157,0.063432,0.022105,0.076230,0.250507,0.020943,9999,0.878866,0.708137
...,...,...,...,...,...,...,...,...,...,...,...
sim.tumor.96,0.340897,0.013736,0.070578,0.036123,0.033515,0.106269,0.336250,0.062633,9999,0.879483,0.679410
sim.tumor.97,0.655441,0.016507,0.046579,0.030203,0.020211,0.041989,0.134156,0.054914,9999,0.839518,0.714571
sim.tumor.98,0.439056,0.013107,0.076885,0.019736,0.070364,0.101915,0.245603,0.033334,9999,0.836707,0.712821
sim.tumor.99,0.661488,0.015796,0.082957,0.066967,0.010059,0.056062,0.089382,0.017288,9999,0.639644,0.811884
