In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import yaml


In [4]:
# Load the single-cell checkpoint, then keep only the cells whose
# 'cell_type_specific_final' label does not contain the substring 'Exclude'.
sct = sc.read_h5ad('../data/single_cell/checkpoints/non_eus_processed.h5ad')
not_excluded = ['Exclude' not in label
                for label in sct.obs['cell_type_specific_final']]
sct = sct[not_excluded]
sct

View of AnnData object with n_obs × n_vars = 104486 × 29227
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF

In [5]:
from mgitools.os_helpers import listfiles

# Bulk RNA-seq count tables, selected by path regex and processed in sorted order.
bulk_fps = sorted(listfiles('../data/bulk_rna_seq/', regex=r'bulk_rna_seq/[^/]+_counts.txt$'))

# Gene ids in the single-cell reference vs. the union of the row labels
# (first column, used as the index) over every bulk table.
ref_genes = set(sct.var.index.to_list())
bulk_genes = set()
for fp in bulk_fps:
    df = pd.read_csv(fp, sep='\t', index_col=0)
    bulk_genes |= set(df.index.to_list())

# Subset the reference to genes seen in both, in sorted gene order.
keep = ref_genes & bulk_genes
f = sct[:, sorted(keep)]
f

View of AnnData object with n_obs × n_vars = 104486 × 20297
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'CELL', 'CONDITION', 'Patient', 'Type', 'Cell_type', 'integrated_snn_res.0.75', 'seurat_clusters', 'sample_id', 'cell_type', 'pollock_cell_type', 'dataset', 'Bailey | ADEX | score', 'Bailey | Squamous-like | score', 'Bailey | Pancreatic-Progenitor | score', 'Bailey | Immunogenic | score', 'Collison | Exocrine-like | score', 'Collison | Quasi-Mesenchymal | score', 'Collison | Classical | score', 'Moffit | Basal | score', 'Moffit | Classical | score', 'subTME | deserted | score', 'subTME | reactive | score', 'raghaven | scBasal | score', 'raghaven | scClassical | score', 'raghaven | IC | score', 'raghaven | Pericyte-like | score', 'raghaven | Fibroblast-like | score', 'raghaven | Inflammatory | score', 'raghaven | TAM-FCN1 | score', 'raghaven | TAM-C1QC | score', 'raghaven | TAM-SPP1 | score', 'elyada | myCAF | score', 'elyada | iCAF

In [6]:
def rify_cell_id(cid):
    """Return an R-compatible version of a cell id.

    The id is prefixed with 'X' and every hyphen, space and tab is
    replaced with a '.'.
    """
    prefixed = 'X' + cid
    # One translation pass instead of three chained replace() calls.
    to_dot = str.maketrans({'-': '.', ' ': '.', '\t': '.'})
    return prefixed.translate(to_dot)
    

In [8]:
# save counts for cytotrace
df = pd.DataFrame(data=f.X.toarray(), columns=f.var.index.to_list(),
                  index=f.obs.index.to_list(), dtype=int)
df = df.transpose()
df['gene'] = [g.split('.')[0] for g in df.index.to_list()]
df = df.groupby('gene').mean()
df.index.name = 'Gene'
# make ids compatible with r
# df.columns = ['X' + c for c in df.columns]
df.columns = [rify_cell_id(c) for c in df.columns]
df = df[np.sum(df.values, axis=1)>0]
df

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,X1555.tumor_AAACCTGAGACCTAGG.1,X1555.tumor_AAACCTGAGTGCGTGA.1,X1555.tumor_AAACCTGCATCCCACT.1,X1555.tumor_AAACCTGGTCATGCAT.1,X1555.tumor_AAACCTGTCCGGGTGT.1,X1555.tumor_AAACGGGAGACAATAC.1,X1555.tumor_AAACGGGAGGCGCTCT.1,X1555.tumor_AAACGGGAGGGTATCG.1,X1555.tumor_AAACGGGGTGAGTGAC.1,X1555.tumor_AAACGGGTCTCGCATC.1,...,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCCACAGGAGT.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCGTACCGTAT.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCGTGCAACTT.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGCGCTCGTGGACC.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTGTTGTACAC.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC.1,XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCATCCGAACGC.1
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7SK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
ZZEF1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Distinct 'cell_type_specific_final' labels present in the gene-filtered reference.
set(f.obs['cell_type_specific_final'])

{'ADM',
 'Acinar',
 'B cell',
 'CD4 T cell',
 'CD8 T cell',
 'CD8 T cell - Exhausted',
 'DC',
 'Endocrine',
 'Endothelial',
 'Immune - Proliferating',
 'Malignant - Basal',
 'Malignant - Classical',
 'Malignant - IC',
 'Malignant - Proliferating Basal',
 'Malignant - Proliferating Classical',
 'Malignant - Proliferating IC',
 'Mast',
 'NK',
 'Plasma',
 'Stellate',
 'TAM - C1QC',
 'TAM - FCN1',
 'TAM - Proliferating',
 'TAM - SPP1',
 'Treg',
 'iCAF',
 'myCAF'}

In [14]:
# Per-cell annotation columns taken from the reference. The explicit .copy()
# makes this an independent frame, so the column assignment further down in
# this cell no longer triggers pandas' SettingWithCopyWarning.
annotation_df = f.obs[['sample_id', 'cell_type_specific_final', 'dataset']].copy()

def make_general_cell_types(cell_type):
    """Collapse a specific cell type label into a broader category.

    The label is checked for a series of substrings in a fixed order and
    the category of the first match is returned. A label that matches
    none of them is returned unchanged.
    """
    # (substring, general category) pairs; order matters, first match wins.
    substring_to_category = (
        ('Malignant', 'Malignant'),
        ('CD8', 'CD8 T cell'),
        ('TAM', 'Macrophage'),
        ('CAF', 'Fibroblast'),
    )
    for substring, category in substring_to_category:
        if substring in cell_type:
            return category
    return cell_type

# Attach the collapsed label, then reduce the table to the sample and
# collapsed-label columns under the headers 'Sample' and 'CellType'.
annotation_df['cell_type'] = list(map(make_general_cell_types,
                                      annotation_df['cell_type_specific_final']))
annotation_df = annotation_df[['sample_id', 'cell_type']].rename(
    columns={'sample_id': 'Sample', 'cell_type': 'CellType'})

# Index by R-compatible cell ids (the same conversion applied to the
# expression matrix columns) and label the index 'ID'.
annotation_df.index = pd.Index([rify_cell_id(cid) for cid in annotation_df.index],
                               name='ID')
annotation_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Sample,CellType
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
X1555.tumor_AAACCTGAGACCTAGG.1,1555-tumor,NK
X1555.tumor_AAACCTGAGTGCGTGA.1,1555-tumor,Malignant
X1555.tumor_AAACCTGCATCCCACT.1,1555-tumor,Treg
X1555.tumor_AAACCTGGTCATGCAT.1,1555-tumor,B cell
X1555.tumor_AAACCTGTCCGGGTGT.1,1555-tumor,CD8 T cell
...,...,...
XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCCTAGTGA.1,G9903,Fibroblast
XG9903_filtered_gene_bc_matrices_h5.h5_TTTGGTTTCTACCAGA.1,G9903,Fibroblast
XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCAAGTTGTCGT.1,G9903,Fibroblast
XG9903_filtered_gene_bc_matrices_h5.h5_TTTGTCACAACTTGAC.1,G9903,Fibroblast


In [16]:
# Distinct collapsed cell type labels in the annotation table.
set(annotation_df['CellType'])

{'ADM',
 'Acinar',
 'B cell',
 'CD4 T cell',
 'CD8 T cell',
 'DC',
 'Endocrine',
 'Endothelial',
 'Fibroblast',
 'Immune - Proliferating',
 'Macrophage',
 'Malignant',
 'Mast',
 'NK',
 'Plasma',
 'Stellate',
 'Treg'}

In [20]:
# Write the annotation table and the expression matrix as tab-separated
# text files into the ecotyper inputs directory.
ecotyper_input_dir = Path('../data/deconvolution/ecotyper/inputs')
# to_csv does not create missing parent directories, so make sure the
# target directory exists before writing.
ecotyper_input_dir.mkdir(parents=True, exist_ok=True)
annotation_df.to_csv(ecotyper_input_dir / 'sc_ref_annotation.txt', sep='\t')
df.to_csv(ecotyper_input_dir / 'sc_ref_expression.txt', sep='\t')

In [32]:
# Discovery configuration pointing at the sc_ref expression/annotation
# files, assembled section by section.
sc_ref_input = {
    'Discovery dataset name': 'discovery_sc_ref',
    'Expression matrix': '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/inputs/sc_ref_expression.txt',
    'Annotation file': '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/inputs/sc_ref_annotation.txt',
    'Annotation file column to scale by': None,
    'Annotation file column(s) to plot': [],
}
sc_ref_output = {
    'Output folder': '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/outputs/discovery_sc_ref',
}
sc_ref_pipeline_settings = {
    'Pipeline steps to skip': [],
    'Filter non cell type specific genes': True,
    'Number of threads': 20,
    'Number of NMF restarts': 5,
    'Maximum number of states per cell type': 10,
    'Cophenetic coefficient cutoff': 0.95,
    'Jaccard matrix p-value cutoff': 0.05,
}
config = {
    'default': {
        'Input': sc_ref_input,
        'Output': sc_ref_output,
        'Pipeline settings': sc_ref_pipeline_settings,
    }
}

In [4]:
# Discovery configuration pointing at the scRNA_CRC example data files,
# assembled section by section.
# NOTE(review): this rebinds `config`, replacing the sc_ref configuration
# defined in the previous cell, so only this example configuration is what
# the yaml.dump cell below writes out. Confirm that is intended.
example_input = {
    'Discovery dataset name': 'discovery_sc_ref',
    'Expression matrix': '/diskmnt/Projects/Users/estorrs/ecotyper/example_data/scRNA_CRC_data.txt',
    'Annotation file': '/diskmnt/Projects/Users/estorrs/ecotyper/example_data/scRNA_CRC_annotation.txt',
    'Annotation file column to scale by': None,
    'Annotation file column(s) to plot': [],
}
example_output = {
    'Output folder': '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/outputs/discovery_example',
}
example_pipeline_settings = {
    'Pipeline steps to skip': [],
    'Filter non cell type specific genes': True,
    'Number of threads': 20,
    'Number of NMF restarts': 5,
    'Maximum number of states per cell type': 10,
    'Cophenetic coefficient cutoff': 0.95,
    'Jaccard matrix p-value cutoff': 0.05,
}
config = {
    'default': {
        'Input': example_input,
        'Output': example_output,
        'Pipeline settings': example_pipeline_settings,
    }
}

In [5]:
# Serialize the current `config` dict to YAML at config_fp.
config_fp = '/diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/inputs/discovery_example.yml'
# Open the file in a with-block so the handle is flushed and closed as soon
# as the dump finishes, instead of being left open for the garbage collector.
with open(config_fp, 'w') as config_file:
    yaml.dump(config, config_file)

Run the command below from within the EcoTyper root directory.

In [34]:
# Command line for the EcoTyper_discovery_scRNA.R script, passing the config
# file path via -c. The string is only built and displayed here, not executed.
# NOTE(review): the saved output of this cell ends in .../inputs/discovery.yml,
# while config_fp as assigned above ends in discovery_example.yml; the saved
# output looks stale. Re-run to confirm which config is meant.
cmd = f'Rscript EcoTyper_discovery_scRNA.R -c {config_fp}'
cmd

'Rscript EcoTyper_discovery_scRNA.R -c /diskmnt/Projects/Users/estorrs/single-cell-pdac/data/deconvolution/ecotyper/inputs/discovery.yml'