In [1]:
import os
import tempfile
import urllib

import pandas as pd

from helpers.cell_type_naming import nice_to_weirds, weird_to_nice

In [2]:
truth = (
    pd.read_csv(
        "gs://liulab/csx_example_files/groundtruth_Melanoma_Tirosh_et_al_SuppFig3b-d.txt",
        sep="\t",
        index_col=0,
    )
    .rename_axis(index="sample_id")
    .rename(columns=weird_to_nice)
)

truth

Unnamed: 0_level_0,Malignant,Endothelial,CAF,T CD8,NK,Macrophage,T CD4,B
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
53,0.128,0.088,0.032,0.2,0.08,0.096,0.376,0.0
58,0.0,0.0,0.0,0.6508,0.0317,0.0159,0.2857,0.0159
59,0.871,0.0,0.1129,0.0,0.0,0.0161,0.0,0.0
60,0.0448,0.0,0.0,0.194,0.0498,0.0199,0.2139,0.4776
65,0.0755,0.0,0.0,0.4717,0.0,0.0189,0.3396,0.0943
67,0.0,0.0,0.0,0.3059,0.0118,0.0,0.4588,0.2235
71,0.6835,0.0,0.0,0.1899,0.0,0.0253,0.1013,0.0
72,0.0,0.0,0.0,0.1634,0.0065,0.0,0.6013,0.2288
74,0.0,0.0,0.0,0.6496,0.0073,0.0365,0.2117,0.0949
75,0.0087,0.0,0.0,0.9417,0.0,0.0029,0.0466,0.0


In [3]:
experiments_root = "liulab/csx_experiments/varying_parameters"

!tree -h -L 1 {os.path.join("/mnt/buckets", experiments_root)}
!tree -h -L 1 {os.path.join("/mnt/buckets", experiments_root, "defaults,single_cell_true")}

[01;34m/mnt/buckets/liulab/csx_experiments/varying_parameters[00m
├── [   0]  [01;34mdefaults[00m
├── [   0]  [01;34mrefsample_from_defaults[00m
├── [   0]  [01;34mrefsample_from_defaults,single_cell_true[00m
├── [   0]  [01;34mrefsample_from_defaults,with_bmode[00m
├── [   0]  [01;34mrefsample_from_webjob4[00m
├── [   0]  [01;34mrefsample_from_webjob4,with_bmode[00m
├── [   0]  [01;34msigmat_from_defaults[00m
├── [   0]  [01;34msigmat_from_defaults,with_bmode[00m
├── [   0]  [01;34msigmat_from_webjob4[00m
├── [   0]  [01;34msigmat_from_webjob4,with_bmode[00m
└── [   0]  [01;34mwith_bmode[00m

11 directories, 0 files
/mnt/buckets/liulab/csx_experiments/varying_parameters/defaults,single_cell_true [error opening dir]

0 directories, 0 files


In [4]:
experiments = os.listdir(os.path.join("/mnt/buckets", experiments_root))
experiments

['defaults',
 'refsample_from_defaults',
 'refsample_from_defaults,single_cell_true',
 'refsample_from_defaults,with_bmode',
 'refsample_from_webjob4',
 'refsample_from_webjob4,with_bmode',
 'sigmat_from_defaults',
 'sigmat_from_defaults,with_bmode',
 'sigmat_from_webjob4',
 'sigmat_from_webjob4,with_bmode',
 'with_bmode']

In [5]:
def load_csx_results(experiment_uri):
    try:
        uri = os.path.join(experiment_uri, "CIBERSORTx_Results.txt")
        df = pd.read_csv(uri, sep="\t", index_col=0)
    except:
        uri = os.path.join(experiment_uri, "CIBERSORTx_Adjusted.txt")
        df = pd.read_csv(uri, sep="\t", index_col=0)
    df = df.iloc[:, :-3]  # ignore last three columns, which are test statistics
    df.rename(columns=weird_to_nice, inplace=True)
    df.rename_axis(columns="cell_type", index="sample_id", inplace=True)
    return df.stack()

In [6]:
preds = dict()
# preds["derek"] = results_derek
# preds["me"] = results_docker
for experiment in experiments:
    try:
        experiment_uri = os.path.join("gs://", experiments_root, experiment)
        preds[experiment] = load_csx_results(experiment_uri)
    except FileNotFoundError:
        print("no results for", experiment, experiment_uri)

df_preds = pd.concat(preds, names=["experiment"], axis=0).rename("csx_fraction")

# df_preds = df_preds.unstack("experiment")

df_preds

experiment  sample_id  cell_type  
defaults    53         Malignant      0.046178
                       Endothelial    0.027261
                       CAF            0.058420
                       T CD8          0.286861
                       NK             0.164019
                                        ...   
with_bmode  94         T CD8          0.003151
                       NK             0.017730
                       Macrophage     0.037556
                       T CD4          0.468256
                       B              0.291373
Name: csx_fraction, Length: 1672, dtype: float64

In [7]:
truth = pd.read_csv(
    "gs://liulab/csx_example_files/groundtruth_Melanoma_Tirosh_et_al_SuppFig3b-d.txt",
    sep="\t",
    index_col=0,
)
truth.rename_axis(columns="cell_type", index="sample_id", inplace=True)
truth.rename(columns=weird_to_nice, inplace=True)
truth = truth.stack().to_frame(name="true_fraction")
truth

Unnamed: 0_level_0,Unnamed: 1_level_0,true_fraction
sample_id,cell_type,Unnamed: 2_level_1
53,Malignant,0.1280
53,Endothelial,0.0880
53,CAF,0.0320
53,T CD8,0.2000
53,NK,0.0800
...,...,...
94,T CD8,0.2025
94,NK,0.0041
94,Macrophage,0.0083
94,T CD4,0.3306


In [8]:
truth.join(df_preds.unstack("experiment"), how="left").corr().sort_values(
    by="true_fraction", ascending=False
)

Unnamed: 0,true_fraction,defaults,refsample_from_defaults,"refsample_from_defaults,single_cell_true","refsample_from_defaults,with_bmode",refsample_from_webjob4,"refsample_from_webjob4,with_bmode",sigmat_from_defaults,"sigmat_from_defaults,with_bmode",sigmat_from_webjob4,"sigmat_from_webjob4,with_bmode",with_bmode
true_fraction,1.0,0.940378,0.940378,0.940378,0.940901,0.941592,0.93837,0.940378,0.940901,0.941592,0.93837,0.940901
refsample_from_webjob4,0.941592,0.99075,0.99075,0.99075,0.991459,1.0,0.995349,0.99075,0.991459,1.0,0.995349,0.991459
sigmat_from_webjob4,0.941592,0.99075,0.99075,0.99075,0.991459,1.0,0.995349,0.99075,0.991459,1.0,0.995349,0.991459
"refsample_from_defaults,with_bmode",0.940901,0.995794,0.995794,0.995794,1.0,0.991459,0.993899,0.995794,1.0,0.991459,0.993899,1.0
"sigmat_from_defaults,with_bmode",0.940901,0.995794,0.995794,0.995794,1.0,0.991459,0.993899,0.995794,1.0,0.991459,0.993899,1.0
with_bmode,0.940901,0.995794,0.995794,0.995794,1.0,0.991459,0.993899,0.995794,1.0,0.991459,0.993899,1.0
defaults,0.940378,1.0,1.0,1.0,0.995794,0.99075,0.98688,1.0,0.995794,0.99075,0.98688,0.995794
refsample_from_defaults,0.940378,1.0,1.0,1.0,0.995794,0.99075,0.98688,1.0,0.995794,0.99075,0.98688,0.995794
"refsample_from_defaults,single_cell_true",0.940378,1.0,1.0,1.0,0.995794,0.99075,0.98688,1.0,0.995794,0.99075,0.98688,0.995794
sigmat_from_defaults,0.940378,1.0,1.0,1.0,0.995794,0.99075,0.98688,1.0,0.995794,0.99075,0.98688,0.995794


In [9]:
x = df_preds.loc[["defaults", "sigmat_from_webjob4"]]
_ = (
    truth.join(x.unstack("experiment"), how="left")
    .corr()
    .sort_values(by="true_fraction", ascending=False)
)
_[["true_fraction"]]

Unnamed: 0,true_fraction
true_fraction,1.0
sigmat_from_webjob4,0.941592
defaults,0.940378


In [10]:
validation_set = [
    x not in (80, 88) for x in df_preds.index.get_level_values("sample_id")
]

In [11]:
_ = truth.join(
    df_preds.loc[:, validation_set, :].unstack("experiment"), how="left"
).corr()
_ = _[["true_fraction", "sigmat_from_webjob4", "defaults"]].loc[["true_fraction"]]
_["diff"] = _["defaults"] - _["sigmat_from_webjob4"]
_.columns = ["true_fraction", "web app", "docker", "(docker minus web app)"]
_

Unnamed: 0,true_fraction,web app,docker,(docker minus web app)
true_fraction,1.0,0.939651,0.938371,-0.001281


In [12]:
_ = truth.join(
    df_preds.loc[:, validation_set, :].unstack("experiment"), how="left"
).corr()
_ = _[["true_fraction", "defaults", "with_bmode"]].loc[["true_fraction"]]
_["diff"] = _["with_bmode"] - _["defaults"]
_.columns = ["true_fraction", "normal", "w/ b-mode", "(normal minus w/ b-mode)"]
_

Unnamed: 0,true_fraction,normal,w/ b-mode,(normal minus w/ b-mode)
true_fraction,1.0,0.938371,0.938945,0.000574


In [13]:
_ = (
    truth.join(df_preds.loc[:, validation_set, :].unstack("experiment"), how="left")
    .groupby(level="cell_type")
    .corr()
    .stack()
    .loc[:, "true_fraction", ["true_fraction", "sigmat_from_webjob4", "defaults"]]
    .unstack()
    .sort_values(by="defaults", ascending=False)
)
_["diff"] = _["defaults"] - _["sigmat_from_webjob4"]
_.columns = ["true_fraction", "web app", "docker", "(docker minus web app)"]
_

Unnamed: 0_level_0,Unnamed: 1_level_0,true_fraction,web app,docker,(docker minus web app)
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B,true_fraction,1.0,0.979727,0.979578,-0.000149
Macrophage,true_fraction,1.0,0.947757,0.960535,0.012779
Malignant,true_fraction,1.0,0.956962,0.95586,-0.001102
T CD4,true_fraction,1.0,0.942202,0.936408,-0.005794
T CD8,true_fraction,1.0,0.897413,0.896265,-0.001148
NK,true_fraction,1.0,0.895197,0.886451,-0.008747
CAF,true_fraction,1.0,0.856806,0.845197,-0.011609
Endothelial,true_fraction,1.0,0.822015,0.813831,-0.008184


In [14]:
truth.join(
    df_preds.loc[:, validation_set, :].unstack("experiment")[["with_bmode"]], how="left"
).groupby(level="cell_type").corr().stack().loc[
    :, "true_fraction", "with_bmode"
].sort_values(
    ascending=False
)

cell_type
B              0.979903
Malignant      0.955684
Macrophage     0.948048
T CD4          0.937925
T CD8          0.890556
NK             0.883886
CAF            0.863670
Endothelial    0.861230
dtype: float64

In [15]:
import plotly.io

plotly.io.renderers.default = "jupyterlab+png"

import plotly.express as px
import plotly.graph_objects as go

In [16]:
fig = px.bar(
    truth,
    x=truth.index.astype(str),
    y=truth.columns,
    title="True fractions of each mixture",
)

fig.show()

TypeError: Setting a MultiIndex dtype to anything other than object is not supported