In [7]:
import os
import anndata
import pandas as pd
import numpy as np
import scanpy as sc

from mgitools.os_helpers import listfiles

#### prepare inputs for cibersortx

In [8]:
sct = sc.read_h5ad('../data/single_cell/checkpoints/non_eus_processed.h5ad')
sct

AnnData object with n_obs × n_vars = 113052 × 29227
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF | score

In [10]:
set(sct.obs['cell_type_specific_final'])

{'ADM',
 'Acinar',
 'B cell',
 'CD4 T cell',
 'CD8 T cell',
 'CD8 T cell - Exhausted',
 'DC',
 'Endocrine',
 'Endothelial',
 'Exclude - Ambiguous',
 'Exclude - Singleton',
 'Immune - Proliferating',
 'Malignant - Basal',
 'Malignant - Classical',
 'Malignant - IC',
 'Malignant - Proliferating Basal',
 'Malignant - Proliferating Classical',
 'Malignant - Proliferating IC',
 'Mast',
 'NK',
 'Plasma',
 'Stellate',
 'TAM - C1QC',
 'TAM - FCN1',
 'TAM - Proliferating',
 'TAM - SPP1',
 'Treg',
 'iCAF',
 'myCAF'}

In [11]:
sct = sct[[True if 'Exclude' not in ct else False
          for ct in sct.obs['cell_type_specific_final']]]
sct.shape

(104486, 29227)

In [12]:
ref_adata = anndata.AnnData(X=sct.layers['counts'],
                            obs=sct.obs[['sample_id', 'cell_type_specific_final', 'cell_type_specific_final']],
                           var=sct.var)
ref_adata

  This is separate from the ipykernel package so we can avoid doing imports until


AnnData object with n_obs × n_vars = 104486 × 29227
    obs: 'sample_id', 'cell_type_specific_final', 'cell_type_specific_final'

In [13]:
ref_adata.obs.columns = ['patient.id', 'cell.type', 'cell.subtype']
ref_adata.obs

Unnamed: 0,patient.id,cell.type,cell.subtype
1555-tumor_AAACCTGAGACCTAGG-1,1555-tumor,NK,NK
1555-tumor_AAACCTGAGTGCGTGA-1,1555-tumor,Malignant - Classical,Malignant - Classical
1555-tumor_AAACCTGCATCCCACT-1,1555-tumor,Treg,Treg
1555-tumor_AAACCTGGTCATGCAT-1,1555-tumor,B cell,B cell
1555-tumor_AAACCTGTCCGGGTGT-1,1555-tumor,CD8 T cell,CD8 T cell
...,...,...,...
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA-1,G9903,iCAF,iCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC-1,G9903,myCAF,myCAF


In [14]:
# # subsample to 5k cells per cell type
# cell_types = sorted(set(sct.obs['cell_type_specific_final']))
# pool = []
# for ct in cell_types:
#     if 'Exclude' not in ct:
#         f = sct[sct.obs['cell_type_specific_final']==ct]
#         ids = list(np.random.choice(f.obs.index.to_list(), size=min(500, f.shape[0]), replace=False))
#         pool += ids
# f = ref_adata[pool]
# f = f[:, np.sum(f.X, axis=0)>10]
# f

In [15]:
f = ref_adata

In [16]:
bulk_fps = sorted(listfiles('../data/bulk_rna_seq/', regex=r'bulk_rna_seq/[^/]+_counts.txt$'))
bulk_fps

['../data/bulk_rna_seq/bailey_counts.txt',
 '../data/bulk_rna_seq/cptac_counts.txt',
 '../data/bulk_rna_seq/kirby_counts.txt',
 '../data/bulk_rna_seq/tcga_counts.txt']

In [17]:
genes = set(f.var.index.to_list())
for fp in bulk_fps:
    df = pd.read_csv(fp, sep='\t', index_col=0)
    genes.intersection_update(set(df.index.to_list()))
    print(fp, len(genes))
len(genes)

../data/bulk_rna_seq/bailey_counts.txt 18749
../data/bulk_rna_seq/cptac_counts.txt 18263
../data/bulk_rna_seq/kirby_counts.txt 15118
../data/bulk_rna_seq/tcga_counts.txt 15081


15081

In [18]:
f = f[:, sorted(genes)]
f

View of AnnData object with n_obs × n_vars = 104486 × 15081
    obs: 'patient.id', 'cell.type', 'cell.subtype'

In [19]:
f.obs

Unnamed: 0,patient.id,cell.type,cell.subtype
1555-tumor_AAACCTGAGACCTAGG-1,1555-tumor,NK,NK
1555-tumor_AAACCTGAGTGCGTGA-1,1555-tumor,Malignant - Classical,Malignant - Classical
1555-tumor_AAACCTGCATCCCACT-1,1555-tumor,Treg,Treg
1555-tumor_AAACCTGGTCATGCAT-1,1555-tumor,B cell,B cell
1555-tumor_AAACCTGTCCGGGTGT-1,1555-tumor,CD8 T cell,CD8 T cell
...,...,...,...
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA-1,G9903,iCAF,iCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC-1,G9903,myCAF,myCAF


In [20]:
# enter sample id for tumor subtype
f.obs['cell.subtype'] = [s if 'Malignant' in c else c for s, c in zip(f.obs['patient.id'], f.obs['cell.subtype'])]
f.obs

  


Unnamed: 0,patient.id,cell.type,cell.subtype
1555-tumor_AAACCTGAGACCTAGG-1,1555-tumor,NK,NK
1555-tumor_AAACCTGAGTGCGTGA-1,1555-tumor,Malignant - Classical,1555-tumor
1555-tumor_AAACCTGCATCCCACT-1,1555-tumor,Treg,Treg
1555-tumor_AAACCTGGTCATGCAT-1,1555-tumor,B cell,B cell
1555-tumor_AAACCTGTCCGGGTGT-1,1555-tumor,CD8 T cell,CD8 T cell
...,...,...,...
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA-1,G9903,iCAF,iCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT-1,G9903,myCAF,myCAF
G9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC-1,G9903,myCAF,myCAF


In [25]:
f.obs.index = ['X' + x for x in f.obs.index.to_list()]
f.obs.index.name = 'cell.id'

In [26]:
f.obs.to_csv('../data/deconvolution/bayesprism/inputs/sc_ref_annotations.txt', sep='\t')

In [27]:
# save counts for cytotrace
df = pd.DataFrame(data=f.X.toarray(), columns=f.var.index.to_list(),
                  index=f.obs.index.to_list(), dtype=int)
df = df.transpose()
df.index.name = 'Gene'

df

Unnamed: 0_level_0,X1555-tumor_AAACCTGAGACCTAGG-1,X1555-tumor_AAACCTGAGTGCGTGA-1,X1555-tumor_AAACCTGCATCCCACT-1,X1555-tumor_AAACCTGGTCATGCAT-1,X1555-tumor_AAACCTGTCCGGGTGT-1,X1555-tumor_AAACGGGAGACAATAC-1,X1555-tumor_AAACGGGAGGCGCTCT-1,X1555-tumor_AAACGGGAGGGTATCG-1,X1555-tumor_AAACGGGGTGAGTGAC-1,X1555-tumor_AAACGGGTCTCGCATC-1,...,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCCACAGGAGT-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCGTACCGTAT-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCGTGCAACTT-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCTCGTGGACC-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTGTTGTACAC-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC-1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCATCCGAACGC-1
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,1,1,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,4,0,...,0,0,4,0,0,0,0,0,0,0
A2ML1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ZYG11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ZYX,1,4,0,0,0,1,0,0,4,0,...,2,0,0,0,2,2,1,0,0,2


In [29]:
df.to_csv('../data/deconvolution/bayesprism/inputs/sc_ref_counts.txt', sep='\t')

In [30]:
# f.obs['cell_ID'] = f.obs.index.to_list()
# f.obs['tumor_flag'] = [1 if 'Malignant' in ct else 0 for ct in f.obs['cell.type']]
# f.obs['cell_type'] = f.obs['cell.type']
# f.obs = f.obs[['cell_ID', 'cell_type', 'tumor_flag']]
# f.obs

In [31]:
# f.write_h5ad('../data/deconvolution/bayesprism/inputs/sc_ref_500per.h5ad')

In [33]:
bulk_fps = sorted(listfiles('../data/bulk_rna_seq/', regex=r'bulk_rna_seq/[^/]+_counts.txt$'))
bulk_fps

['../data/bulk_rna_seq/bailey_counts.txt',
 '../data/bulk_rna_seq/cptac_counts.txt',
 '../data/bulk_rna_seq/kirby_counts.txt',
 '../data/bulk_rna_seq/tcga_counts.txt']

In [51]:
for fp in bulk_fps:
    dataset = fp.split('/')[-1].split('_')[0]
    df = pd.read_csv(fp, sep='\t', index_col=0)
    df = df.loc[sorted(genes), :]
    df.to_csv(f'../data/deconvolution/bayesprism/inputs/{dataset}_counts.txt', sep='\t')

In [5]:
d = pd.read_csv('../data/sandbox/gbm_meta.txt', sep='\t')
d

Unnamed: 0,patient.id,cell.type,cell.subtype
PJ016.V3,PJ016,tumor,PJ016-tumor-0
PJ016.V4,PJ016,tumor,PJ016-tumor-0
PJ016.V5,PJ016,tumor,PJ016-tumor-3
PJ016.V6,PJ016,tumor,PJ016-tumor-0
PJ016.V7,PJ016,tumor,PJ016-tumor-3
...,...,...,...
PJ048.V3082,PJ048,tumor,PJ048-tumor-0
PJ048.V3083,PJ048,tumor,PJ048-tumor-2
PJ048.V3084,PJ048,tumor,PJ048-tumor-1
PJ048.V3085,PJ048,tumor,PJ048-tumor-5


In [6]:
set(d['cell.type'])

{'endothelial', 'myeloid', 'oligo', 'pericyte', 'tcell', 'tumor'}

In [8]:
d[d['cell.type']=='endothelial']

Unnamed: 0,patient.id,cell.type,cell.subtype
PJ017.V160,PJ017,endothelial,endothelial
PJ017.V242,PJ017,endothelial,endothelial
PJ017.V418,PJ017,endothelial,endothelial
PJ017.V592,PJ017,endothelial,endothelial
PJ017.V937,PJ017,endothelial,endothelial
...,...,...,...
PJ048.V2620,PJ048,endothelial,endothelial
PJ048.V2830,PJ048,endothelial,endothelial
PJ048.V2916,PJ048,endothelial,endothelial
PJ048.V2928,PJ048,endothelial,endothelial
