In [15]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc


#### prepare inputs for cibersortx

In [2]:
sct = sc.read_h5ad('../data/single_cell/checkpoints/non_eus_processed.h5ad')
sct

AnnData object with n_obs × n_vars = 113052 × 29227
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF | score

In [19]:
# from collections import Counter
# for dataset in set(sct.obs['dataset']):
#     print(dataset)
#     tups = Counter(sct[sct.obs['dataset']==dataset].obs['cell_type_specific_final']).most_common()
#     for ct, c in tups:
#         print(ct, c)

In [3]:
# # subsample to 5k cells per cell type
# cell_types = sorted(set(sct.obs['cell_type_specific_final']))
# pool = []
# for ct in cell_types:
#     if 'Exclude' not in ct:
#         f = sct[sct.obs['cell_type_specific_final']==ct]
#         ids = list(np.random.choice(f.obs.index.to_list(), size=min(500, f.shape[0]), replace=False))
#         pool += ids
# f = sct[pool]
# f

In [4]:
sct = sct[[True if 'Exclude' not in c else False
          for c in sct.obs['cell_type_specific_final']]]
sct

View of AnnData object with n_obs × n_vars = 104486 × 29227
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF

In [7]:
from mgitools.os_helpers import listfiles
bulk_fps = sorted(listfiles('../data/bulk_rna_seq/', regex=r'bulk_rna_seq/[^/]+_counts.txt$'))
ref_genes = set(sct.var.index.to_list())
bulk_genes = set()
for fp in bulk_fps:
    df = pd.read_csv(fp, sep='\t', index_col=0)
    bulk_genes.update(set(df.index.to_list()))
keep = bulk_genes.intersection(ref_genes)
len(keep)

20297

In [9]:
f = sct[:, sorted(keep)]
f

View of AnnData object with n_obs × n_vars = 104486 × 20297
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF

In [10]:
# save counts for cytotrace
df = pd.DataFrame(data=f.layers['counts'].toarray(), columns=f.var.index.to_list(),
                  index=f.obs.index.to_list(), dtype=int)
df = df.transpose()
df['gene'] = [g.split('.')[0] for g in df.index.to_list()]
df = df.groupby('gene').mean().astype(int)
df.index.name = 'Gene'
# make ids compatible with r
# df.columns = ['X' + c for c in df.columns]
df.columns = [f.obs.loc[c, 'cell_type_specific_final'] for c in df.columns]
df = df[np.sum(df.values, axis=1)>0]
df

Unnamed: 0_level_0,NK,Malignant - Classical,Treg,B cell,CD8 T cell,Malignant - Proliferating Classical,CD8 T cell - Exhausted,CD4 T cell,Stellate,B cell,...,Malignant - IC,Malignant - IC,iCAF,Malignant - IC,Malignant - IC,iCAF,myCAF,myCAF,myCAF,Malignant - IC
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7SK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,1,1,0
A1BG-AS1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,4,0,...,0,0,4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ZYX,1,4,0,0,0,1,0,0,4,0,...,2,0,0,0,2,2,1,0,0,2
ZZEF1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df.to_csv('../data/deconvolution/cibersortx/inputs/sc_ref.txt', sep='\t')

#### run cibersortx via docker

example command from docs

docker run -v absolute/path/to/input/dir:/src/data -v absolute/path/to/output/dir:/src/outdir cibersortx/fractions --username email_address_registered_on_CIBERSORTx_website --token token_obtained_from_CIBERSORTx_website --single_cell TRUE --refsample Fig2ab-NSCLC_PBMCs_scRNAseq_refsample.txt --mixture Fig2b-WholeBlood_RNAseq.txt --fraction 0 --rmbatchSmode TRUE 

In [23]:
bulk_fps = sorted(listfiles('../data/bulk_rna_seq/', regex=r'bulk_rna_seq/[^/]+_cpm.txt$'))
bulk_fps

['../data/bulk_rna_seq/bailey_cpm.txt',
 '../data/bulk_rna_seq/cptac_cpm.txt',
 '../data/bulk_rna_seq/kirby_cpm.txt',
 '../data/bulk_rna_seq/tcga_cpm.txt']

In [24]:
input_dir = '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/inputs'
output_dir = '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/outputs'
sc_ref = '/src/data/sc_ref.txt'
username = 'estorrs@wustl.edu'
token = '5f2162d7ed4084effa23b2b794c31def'

In [25]:
import subprocess

In [29]:
cmds = []
for fp in bulk_fps:
    dataset = fp.split('/')[-1].split('_cpm.txt')[0]
    print(dataset)
    Path(os.path.join(output_dir, dataset)).mkdir(parents=True, exist_ok=True)
    out = os.path.join(output_dir, dataset)
    mixture = f'/src/data/{dataset}_cpm.txt'
    
    cmd = f'docker run -v {input_dir}:/src/data -v {out}:/src/outdir cibersortx/fractions --username {username} --token {token} --single_cell TRUE --refsample {sc_ref} --mixture {mixture} --fraction 0 --rmbatchSmode TRUE'
    cmds.append(cmd)
cmds

bailey
cptac
kirby
tcga


['docker run -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/inputs:/src/data -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/outputs/bailey:/src/outdir cibersortx/fractions --username estorrs@wustl.edu --token 5f2162d7ed4084effa23b2b794c31def --single_cell TRUE --refsample /src/data/sc_ref.txt --mixture /src/data/bailey_cpm.txt --fraction 0 --rmbatchSmode TRUE',
 'docker run -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/inputs:/src/data -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/outputs/cptac:/src/outdir cibersortx/fractions --username estorrs@wustl.edu --token 5f2162d7ed4084effa23b2b794c31def --single_cell TRUE --refsample /src/data/sc_ref.txt --mixture /src/data/cptac_cpm.txt --fraction 0 --rmbatchSmode TRUE',
 'docker run -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/inputs:/src/data -v /diskmnt/Projects/U

In [30]:
for cmd in cmds:
    print(cmd)
    output = subprocess.check_output(cmd, shell=True)
    print(output)

docker run -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/inputs:/src/data -v /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/cibersortx/outputs/bailey:/src/outdir cibersortx/fractions --username estorrs@wustl.edu --token 5f2162d7ed4084effa23b2b794c31def --single_cell TRUE --refsample /src/data/sc_ref.txt --mixture /src/data/bailey_cpm.txt --fraction 0 --rmbatchSmode TRUE


KeyboardInterrupt: 

#### read and save results