In [2]:
from pathlib import Path
from collections import Counter
import os
import re
import random

import anndata
import scanpy as sc
import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

In [3]:
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/01272021/'
OUTPUT_DIR = '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/'
T1_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_1')
T2_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_2')
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

Path(T1_OUTPUT).mkdir(parents=True, exist_ok=True)
Path(T2_OUTPUT).mkdir(parents=True, exist_ok=True)

expected structure of input data is something like
- data
    - snATACseq
        - cesc.h5ad
        - pbmc.h5ad
        ....
    - scRNAseq
        - pbmc.h5ad
        - brca.h5ad
        .....
    - snRNAseq
        - brca.h5ad
        .....

In [17]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    dtype = fp.split('/')[-2]
    disease = fp.split('/')[-1].replace('.h5ad', '')
    adata_map[dtype][disease] = sc.read_h5ad(fp)

for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)

scRNAseq brca (98564, 27131)
scRNAseq cesc (31745, 22928)
scRNAseq hnscc (299761, 26929)
scRNAseq melanoma (52739, 23452)
scRNAseq myeloma (108187, 24020)
scRNAseq pbmc (2638, 32738)
scRNAseq pdac (182480, 28756)
snATACseq brca (70679, 19891)
snATACseq ccrcc (79853, 928628)
snATACseq gbm (65891, 19891)
snRNAseq brca (110443, 29175)
snRNAseq ccrcc (126034, 33538)
snRNAseq gbm (206642, 29748)


###### dataset processing if necessary

In [6]:
adata = sc.read_h5ad('/home/estorrs/pollock/benchmarking/data/01272021/scRNAseq/myeloma.h5ad')
adata

AnnData object with n_obs × n_vars = 108187 × 24020
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'cell_type', 'sample', 'predicted_cell_type', 'probability', 'new.ident', 'ident'
    layers: 'logcounts'

In [7]:
adata.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,percent.mito,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,cell_type,sample,predicted_cell_type,probability,new.ident,ident
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
25183_AAACCTGAGTTAACGA-1,25183,9680.0,1669,0.018492,3721.0,1140,0,0,Plasma,,,,,Plasma
25183_AAACCTGTCAACTCTT-1,25183,2311.0,559,0.047166,3282.0,554,5,5,Plasma,,,,,Plasma
25183_AAACGGGAGAGACGAA-1,25183,3567.0,229,0.019064,3507.0,229,2,2,Plasma,,,,,Plasma
25183_AAACGGGAGCTTCGCG-1,25183,5664.0,1389,0.019421,3874.0,1374,1,1,Plasma,,,,,Plasma
25183_AAACGGGCACAGCCCA-1,25183,4208.0,955,0.023527,3648.0,951,2,2,Plasma,,,,,Plasma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Normal_sorted_170607_GTGCAGCTCTCTAGGA-1,Normal_sorted_170607,1238.0,247,0.010501,2243.0,292,12,12,B,,,,B,B
Normal_sorted_170607_TACTTGTCAAGGACTG-1,Normal_sorted_170607,1216.0,248,0.003289,2171.0,294,12,12,Plasma_BM,,,,Plasma,Plasma_BM
Normal_sorted_170607_TCAGCTCTCAGTCAGT-1,Normal_sorted_170607,3315.0,376,0.002112,2741.0,361,12,12,Plasma_BM,,,,Plasma,Plasma_BM
Normal_sorted_170607_TGACAACAGGACATTA-1,Normal_sorted_170607,2051.0,609,0.010726,2338.0,601,12,12,B,,,,B,B


In [8]:
adata.obs['cell_type']= [x if x != 'Plasma' else 'Malignant' for x in adata.obs['cell_type']]

In [9]:
adata.write_h5ad('/home/estorrs/pollock/benchmarking/data/01272021/scRNAseq/myeloma.h5ad')

  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'cell_type' as categorical


In [4]:
adata = sc.read_h5ad('/home/estorrs/pollock/benchmarking/data/01272021/scRNAseq/melanoma.h5ad')
adata

AnnData object with n_obs × n_vars = 52739 × 23452
    obs: 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'res.0.6', 'sample', 'nCount_RNA', 'nFeature_RNA', 'predicted.id', 'prediction.score.Bcell', 'prediction.score.Tcell', 'prediction.score.MM', 'prediction.score.CAF', 'prediction.score.PDC', 'prediction.score.Melanoma', 'prediction.score.DC', 'prediction.score.Plasma', 'prediction.score.Erythrocyte', 'prediction.score.Jejunum', 'prediction.score.Endothelial', 'prediction.score.max', 'assigned.id', 'manual.id', 'final.id', 'bc', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'int.id', 'ident', 'cell_type'
    var: 'detection_rate', 'gmean', 'variance', 'residual_mean', 'residual_variance'
    obsm: 'X_pca', 'X_umap'
    layers: 'logcounts'

In [5]:
adata.var

Unnamed: 0,detection_rate,gmean,variance,residual_mean,residual_variance
ENSG00000238009,0.001024,0.000710,0.001023,0.005429,1.205686
ENSG00000237491,0.020497,0.014736,0.023200,0.007018,1.024149
ENSG00000177757,0.000967,0.000699,0.001231,-0.000056,0.995660
ENSG00000225880,0.011794,0.008482,0.014008,0.009830,1.207445
ENSG00000230368,0.021824,0.015749,0.025406,-0.000743,0.930020
...,...,...,...,...,...
ENSG00000278384,0.008153,0.005783,0.008936,-0.018245,0.514212
ENSG00000276345,0.061454,0.058716,0.245925,-0.043126,0.742454
ENSG00000277856,0.000152,0.000121,0.000265,0.004714,1.940715
ENSG00000275063,0.000114,0.000092,0.000265,-0.003213,0.303124


In [8]:
# !pip install mygene

In [9]:
import mygene
mg = mygene.MyGeneInfo()

In [10]:
gene_ids = sorted(set(adata.var.index))
len(gene_ids)

23452

In [11]:
result = mg.querymany(gene_ids, scopes='ensemblgene', fields='symbol', species='human')

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-23452...done.
Finished.
1 input query terms found dup hits:
	[('ENSG00000229425', 2)]
132 input query terms found no hit:
	['ENSG00000116883', 'ENSG00000154537', 'ENSG00000163009', 'ENSG00000168078', 'ENSG00000184258', 'ENS
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [12]:
ensembl_to_human_symbol = {x['query']:x['symbol'] for x in result if 'symbol' in x}
len(ensembl_to_human_symbol)

23320

In [13]:
adata.var.index = [ensembl_to_human_symbol.get(g, '') for g in adata.var.index]
adata

AnnData object with n_obs × n_vars = 52739 × 23452
    obs: 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'res.0.6', 'sample', 'nCount_RNA', 'nFeature_RNA', 'predicted.id', 'prediction.score.Bcell', 'prediction.score.Tcell', 'prediction.score.MM', 'prediction.score.CAF', 'prediction.score.PDC', 'prediction.score.Melanoma', 'prediction.score.DC', 'prediction.score.Plasma', 'prediction.score.Erythrocyte', 'prediction.score.Jejunum', 'prediction.score.Endothelial', 'prediction.score.max', 'assigned.id', 'manual.id', 'final.id', 'bc', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'int.id', 'ident', 'cell_type'
    var: 'detection_rate', 'gmean', 'variance', 'residual_mean', 'residual_variance'
    obsm: 'X_pca', 'X_umap'
    layers: 'logcounts'

In [14]:
adata.var_names_make_unique()

In [15]:
adata.var.index[:5]

Index(['AL627309.1', 'LINC01409', 'FAM87B', 'LINC00115', 'FAM41C'], dtype='object')

In [16]:
adata.write_h5ad('/home/estorrs/pollock/benchmarking/data/01272021/scRNAseq/melanoma.h5ad')

##### renaming

In [18]:
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease)
        print(sorted(set(m.obs['cell_type'])))

scRNAseq brca
['B', 'CD4_T', 'CD8_T', 'DC', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Macrophage', 'Mast', 'NK', 'Plasma', 'Treg', 'Tumor']
scRNAseq cesc
['CD4+T', 'CD8+T', 'CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Macrophage/Monocyte', 'Mast', 'Monocyte', 'NK', 'Plasma']
scRNAseq hnscc
['B', 'CD4+T', 'CD8+T', 'Endothelial', 'Endothelials', 'Erythrocyte', 'Macrophage/Monocyte', 'Malignant/Epithelial', 'Mast', 'NK', 'Plasma', 'Stromal', 'Treg', 'Unknown']
scRNAseq melanoma
['Bcell', 'CAF', 'CD4+, Naive T cells', 'CD4+, regulatory T cells', 'CD8+, cytotoxic T cells', 'CD8+, exhausted T cells', 'CD8+, follicular T cells', 'DC', 'M1 Macrophages', 'M2 Macrophages', 'MM', 'Melanoma', 'Monocytes', 'NK', 'PDC', 'Plasma', 'Tcell']
scRNAseq myeloma
['B', 'B(1)', 'B(2)', 'CD14+Mono', 'CD16+Mono', 'CD4+T', 'CD8+T', 'DC', 'Erythrocytes', 'Macrophages(1)', 'Macrophages(2)', 'Macrophages(3)', 'Malignant', 'Mast cells', 'Monocytes

In [19]:
T1_MAP = {
    'CD4 T cell': ['CD4 CTL', 'CD4 T', 'CD4 T-cells', 'CD4 T-cells activated', 'CD4 T-cells naive',
                  'CD4+ T-cells', 'CD4+, Naive T cells', 'CD4+T', 'CD4_T'],
    'CD8 T cell': ['CD4/CD8 proliferating', 'CD8 CTL', 'CD8 CTL exausted', 'CD8 T',
                 'CD8 T-cells preexhausted', 'CD8+, cytotoxic T cells', 'CD8+, exhausted T cells',
                 'CD8+, follicular T cells', 'CD8+T', 'CD8_T',],
    'T cells': ['T cells', 'T_NK', 'T_NK_cells'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs',],
    'Dendritic': ['DC', 'Dendritic', 'PDC', 'pDC', 'cDC', 'mDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell', 'B(1)', 'B(2)'],
    'Fibroblast': ['CAF', 'Fibroblast', 'Fibroblasts', 'Myofibroblasts', 'cCAFs', 'mCAFs', 'vCAFs'],
    'Acinar': ['Acinar'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials',],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte', 'Erythrocytes'],
    'Islet': ['Islet'],
    'Malignant': ['CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Malignant/Epithelial', 'Melanoma',
                 'Tumor', 'Tumor cells', 'Tumor-like epithelial cells', 'Malignant'],
    'Mast': ['Mast', 'Mast cells'],
    'Monocyte': ['CD14 Monocytes', 'FCGR3A Monocytes', 'M1 Macrophages', 'M2 Macrophages', 'MM',
                 'Macrophage', 'Macrophage/Monocyte', 'Macrophages', 'Macrophages proliferating',
                 'Monocyte', 'Monocytes', 'CD14+Mono', 'CD16+Mono',
                'Macrophages(1)', 'Macrophages(2)', 'Macrophages(3)', 'macrophages', 'monocytes'],
    'Megakaryocyte': ['Megakaryocytes', 'Megakaryocyte'],
    'Microglia': ['Microglia'],
    'NK': ['NK', 'NK cells strong', 'NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma', 'Plasma_BM'],
    'Platlete': ['Platelet'],
    'Tuft': ['Tuft'] 
}
R_T1_MAP = {v:k for k, vs in T1_MAP.items() for v in vs}

In [20]:
T2_MAP = {
    'CD4 T cell': ['CD4 T', 'CD4 T-cells', 'CD4+ T-cells', 'CD4+T', 'CD4_T'],
    'CD4 CTL': ['CD4 CTL'],
    'CD4 T cell activated': ['CD4 T-cells activated'],
    'CD4 T cell naive': ['CD4 T-cells naive', 'CD4+, Naive T cells', 'T_Naive'],
    'CD8 T cell': ['CD8 T', 'CD8+T', 'CD8_T',],
    'CD4/CD8 proliferating': ['CD4/CD8 proliferating'],
    'CD8 CTL': ['CD8 CTL', 'CD8+, cytotoxic T cells'],
    'CD8 CTL exausted': ['CD8 CTL exausted', 'CD8+, exhausted T cells'],
    'CD8 T cell preexhausted': ['CD8 T-cells preexhausted'],
    'CD8 follicular T-cell': ['CD8+, follicular T cells'],
    'T cells': ['T cells', 'T_NK', 'T_NK_cells'],
    'TRM': ['TRM'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs',],
    'Dendritic': ['DC', 'Dendritic'],
    'pDC': ['PDC', 'pDC'],
    'cDC': ['cDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell', 'B(1)', 'B(2)'],
    'Fibroblast': ['CAF', 'Fibroblast', 'Fibroblasts'],
    'mCAF': ['Myofibroblasts', 'mCAFs'],
    'cCAF': ['cCAFs'],
    'vCAF': ['vCAFs'],
    'Acinar': ['Acinar'],
    'ADM': ['ADM'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials',],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte', 'Erythrocytes'],
    'Islet': ['Islet'],
    'Malignant': ['CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Malignant/Epithelial', 'Melanoma',
                 'Tumor', 'Tumor cells', 'Tumor-like epithelial cells',],
    'Mast': ['Mast', 'Mast cells'],
    'Monocyte': ['Macrophage/Monocyte', 'Monocyte', 'Monocytes', 'MM', 'monocytes'],
    'Basophil': ['Basophils'],
    'Neutrophil': ['Neutrophils', 'neutrophil'],
    'CD14 Monocyte': ['CD14 Monocyte', 'CD14+Mono'],
    'FCGR3A Monocyte': ['FCGR3A Monocytes', 'CD16+Mono'],
    'M1 Macrophage': ['M1 Macrophages'],
    'M2 Macrophage': ['M2 Macrophages'],
    'Macrophage': ['Macrophage', 'Macrophages', 'Macrophages(1)', 'Macrophages(2)', 'Macrophages(3)',
                  'macrophages'],
    'Macrophage proliferating': ['Macrophages proliferating'],
    'Megakaryocyte': ['Megakaryocytes'],
    'Microglia': ['Microglia'],
    'NK': ['NK'],
    'NK cell strong': ['NK cells strong'],
    'NK cell weak': ['NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma', 'Plasma_BM'],
    'Tuft': ['Tuft'] ,
    'Platlete': ['Platelet'],
}
R_T2_MAP = {v:k for k, vs in T2_MAP.items() for v in vs}

In [21]:
cell_types = {s for dtype, d in adata_map.items()
             for disease, m in d.items()
             for s in m.obs['cell_type']}
cell_types

{'ADM',
 'Acinar',
 'Adipocyte',
 'B',
 'B(1)',
 'B(2)',
 'B-cells',
 'B-cells/Plasma',
 'Basophils',
 'Bcell',
 'CAF',
 'CD14 Monocytes',
 'CD14+Mono',
 'CD16+Mono',
 'CD4 CTL',
 'CD4 T',
 'CD4 T-cells',
 'CD4 T-cells activated',
 'CD4 T-cells naive',
 'CD4+ T-cells',
 'CD4+, Naive T cells',
 'CD4+, regulatory T cells',
 'CD4+T',
 'CD4/CD8 proliferating',
 'CD4_T',
 'CD8 CTL',
 'CD8 CTL exhausted',
 'CD8 T',
 'CD8 T-cells preexhausted',
 'CD8+, cytotoxic T cells',
 'CD8+, exhausted T cells',
 'CD8+, follicular T cells',
 'CD8+T',
 'CD8_T',
 'CESC_Malignant/Epithelial',
 'CESC_Malignant/pEMT',
 'DC',
 'Dendritic',
 'Doublets',
 'Endothelial',
 'Endothelial cells',
 'Endothelials',
 'Epithelial',
 'Erythrocyte',
 'Erythrocytes',
 'FCGR3A Monocytes',
 'Fibroblast',
 'Fibroblasts',
 'Islet',
 'M1 Macrophages',
 'M2 Macrophages',
 'MM',
 'Macrophage',
 'Macrophage/Monocyte',
 'Macrophages',
 'Macrophages proliferating',
 'Macrophages(1)',
 'Macrophages(2)',
 'Macrophages(3)',
 'Malignant',

In [22]:
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)
        mask = [True if c in R_T1_MAP.keys() else False
               for c in m.obs['cell_type']]
        adata = m.copy()[mask]
        adata.obs['cell_type'] = [R_T1_MAP[c] for c in adata.obs['cell_type']]
        out_fp = os.path.join(T1_OUTPUT, dtype)
        Path(out_fp).mkdir(parents=True, exist_ok=True)
        adata.write_h5ad(os.path.join(out_fp, f'{disease}.h5ad'))

        print(adata.shape)
        print(sorted(set(adata.obs['cell_type'])))
        
        mask = [True if c in R_T2_MAP.keys() else False
               for c in m.obs['cell_type']]
        adata = m.copy()[mask]
        adata.obs['cell_type'] = [R_T2_MAP[c] for c in adata.obs['cell_type']]
        out_fp = os.path.join(T2_OUTPUT, dtype)
        Path(out_fp).mkdir(parents=True, exist_ok=True)
        adata.write_h5ad(os.path.join(out_fp, f'{disease}.h5ad'))
        print(adata.shape)
        print(sorted(set(adata.obs['cell_type'])))
        

scRNAseq brca (98564, 27131)


  if not is_categorical(df_full[k]):
Trying to set attribute `.obs` of view, copying.
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'cell_type' as categorical


(98564, 27131)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(98564, 27131)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg']
scRNAseq cesc (31745, 22928)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(31745, 22928)
['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(31745, 22928)
['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
scRNAseq hnscc (299761, 26929)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(227195, 26929)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(227195, 26929)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
scRNAseq melanoma (52739, 23452)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(52367, 23452)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(52367, 23452)
['B cell', 'CD4 T cell naive', 'CD8 CTL', 'CD8 CTL exausted', 'CD8 follicular T-cell', 'Dendritic', 'Fibroblast', 'M1 Macrophage', 'M2 Macrophage', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg', 'pDC']
scRNAseq myeloma (108187, 24020)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(92446, 24020)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(79926, 24020)
['B cell', 'CD14 Monocyte', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'FCGR3A Monocyte', 'Macrophage', 'Mast', 'Monocyte', 'NK', 'Neutrophil', 'Plasma', 'Platlete', 'pDC']
scRNAseq pbmc (2638, 32738)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(2638, 32738)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Megakaryocyte', 'Monocyte', 'NK']
(2154, 32738)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'FCGR3A Monocyte', 'Megakaryocyte', 'NK']
scRNAseq pdac (182480, 28756)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(177089, 28756)
['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(181937, 28756)
['ADM', 'Acinar', 'B cell', 'CD4 T cell', 'CD4 T cell naive', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg', 'Tuft']
snATACseq brca (70679, 19891)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(70679, 19891)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(70679, 19891)
['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Treg', 'cCAF', 'cDC', 'mCAF', 'pDC', 'vCAF']
snATACseq ccrcc (79853, 928628)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(79851, 928628)
['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(79852, 928628)
['CD4 T cell', 'CD4/CD8 proliferating', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'Malignant', 'TRM', 'cDC', 'mCAF']
snATACseq gbm (65891, 19891)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(65782, 19891)
['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(65782, 19891)
['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
snRNAseq brca (110443, 29175)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(109002, 29175)
['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(109002, 29175)
['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg']
snRNAseq ccrcc (126034, 33538)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(123250, 33538)
['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(123794, 33538)
['Basophil', 'CD4 CTL', 'CD4 T cell', 'CD4 T cell activated', 'CD4 T cell naive', 'CD4/CD8 proliferating', 'CD8 CTL', 'CD8 T cell preexhausted', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'Macrophage proliferating', 'Malignant', 'NK cell strong', 'NK cell weak', 'Plasma', 'TRM', 'Treg', 'cDC', 'mCAF', 'pDC']
snRNAseq gbm (206642, 29748)


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(192699, 29748)
['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']


Trying to set attribute `.obs` of view, copying.
... storing 'cell_type' as categorical


(192699, 29748)
['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']


## fix the melanoma one off

In [None]:
fp = '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/scRNAseq/melanoma_val.h5ad'
adata = sc.read_h5ad(fp)
adata

In [None]:
adata.var.index[:5]

In [None]:
import mygene
mg = mygene.MyGeneInfo()

In [None]:
gene_ids = sorted(set(adata.var.index))
len(gene_ids)

In [None]:
result = mg.querymany(gene_ids, scopes='ensemblgene', fields='symbol', species='human')

In [None]:
ensembl_to_human_symbol = {x['query']:x['symbol'] for x in result if 'symbol' in x}
len(ensembl_to_human_symbol)

In [None]:
adata.var.index = [ensembl_to_human_symbol.get(g, '') for g in adata.var.index]
adata

In [None]:
adata.var_names_make_unique()

In [None]:
adata.var.index[:5]

In [None]:
len(set(adata.var.index))

In [None]:
adata.write_h5ad(fp)

## incorporate snATAC-seq gene activity, peaks, and chromvar

In [23]:
adata = sc.read_h5ad('/home/estorrs/rds_conversion/data/snATACseq/gbm/gbm_gene_activity_nadja.h5ad')
metadata = adata.obs.copy()
metadata.columns = metadata.columns.astype(str)
metadata

Unnamed: 0,orig.ident,nCount_peaks,nFeature_peaks,total,duplicate,chimeric,unmapped,lowmapq,mitochondrial,passed_filters,...,prediction.score.Maybe.OPC,prediction.score.Endothelial.cells,prediction.score.max,nCount_peaksinters,nFeature_peaksinters,dataset,prediction.score.Monocytes,cell_type_manual,cell_type_manual_2,cell_type
C3L-02705_CPT0189650015_2020-06-16_GCGAGAACATGCCCTT-1,ATAC,11593.0,4952,19264,5375,491,244,1706,65,11383,...,0.000000,0.000000,0.966328,7049.0,5941,C3L-02705_CPT0189650015_2020-06-16,,Tumor,Tumor,Tumor
C3L-02705_CPT0189650015_2020-06-16_CTCAGCTGTTATGCAC-1,ATAC,7530.0,3441,13929,3733,362,144,1324,15,8351,...,0.000000,0.000000,0.851200,4524.0,3989,C3L-02705_CPT0189650015_2020-06-16,,Microglia,Microglia_2,Microglia
C3L-02705_CPT0189650015_2020-06-16_ACCCAAAAGATCTAAG-1,ATAC,3645.0,1761,7842,1945,150,118,632,23,4974,...,0.000000,0.000000,1.000000,2472.0,2275,C3L-02705_CPT0189650015_2020-06-16,,Tumor,Tumor,Tumor
C3L-02705_CPT0189650015_2020-06-16_AACTGTGCACGCGCAT-1,ATAC,5921.0,2788,10175,2807,253,164,983,24,5944,...,0.000000,0.000000,0.997391,3488.0,3150,C3L-02705_CPT0189650015_2020-06-16,,Microglia,Microglia_1,Microglia
C3L-02705_CPT0189650015_2020-06-16_TTGCTTAGTCCGTGCA-1,ATAC,2092.0,1049,5986,1388,70,119,597,1,3811,...,0.000000,0.000000,0.932418,1527.0,1422,C3L-02705_CPT0189650015_2020-06-16,,Tumor,Tumor,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03186_CPT0206880004_2020-06-22_AAACTGCTCCTTCGAC-1,ATAC,9837.0,4306,34007,8711,390,330,3219,222,21135,...,0.000000,0.000000,1.000000,6454.0,5412,C3N-03186_CPT0206880004_2020-06-22,,Tumor,Tumor,Tumor
C3N-03186_CPT0206880004_2020-06-22_ATAGTCGAGACGTCAG-1,ATAC,2448.0,1166,6307,1694,71,75,731,22,3714,...,0.000000,0.000000,1.000000,1750.0,1597,C3N-03186_CPT0206880004_2020-06-22,,Microglia,Microglia_2,Microglia
C3N-03186_CPT0206880004_2020-06-22_CTCACCAGTAACAATG-1,ATAC,2383.0,1147,9788,2440,110,122,1079,754,5283,...,0.000197,0.004569,0.550301,1519.0,1416,C3N-03186_CPT0206880004_2020-06-22,,Tumor,Tumor,Tumor
C3N-03186_CPT0206880004_2020-06-22_CCCAGAGCAATGGTCT-1,ATAC,3730.0,1753,14409,3574,131,120,1234,306,9044,...,0.000000,0.000000,1.000000,2947.0,2643,C3N-03186_CPT0206880004_2020-06-22,,Microglia,Microglia_2,Microglia


In [24]:
gene_activity = sc.read_h5ad('/home/estorrs/rds_conversion/data/snATACseq/gbm/gbm_gene_activity_nadja.h5ad')
peaks = sc.read_h5ad('/home/estorrs/rds_conversion/data/snATACseq/gbm/gbm_peaks_nadja.h5ad')
motif = sc.read_h5ad('/home/estorrs/rds_conversion/data/snATACseq/gbm/gbm_motif_accessability_nadja.h5ad')
gene_activity.shape, peaks.shape, motif.shape

((65891, 19891), (65891, 149449), (65891, 633))

In [25]:
np.count_nonzero(gene_activity.obs.index == peaks.obs.index), np.count_nonzero(gene_activity.obs.index == motif.obs.index)

(65891, 65891)

In [26]:
set(metadata['cell_type'])

{'B-cells/Plasma',
 'Endothelial',
 'Fibroblast',
 'Maybe OPC',
 'Microglia',
 'Neurons',
 'Oligodendrocytes',
 'T cells',
 'Tumor'}

In [27]:
trash = ['Unknown', 'Maybe OPC']
mask = [True if c not in trash else False
       for c in metadata['cell_type']]
metadata = metadata[mask]
gene_activity = gene_activity[mask]
peaks = peaks[mask]
motif = motif[mask]

  if not is_categorical(df_full[k]):


In [None]:
## go up to T1 and T2 sections and add cells as necessary

In [None]:
t1_metadata = metadata.copy()
t1_metadata['cell_type'] = [R_T1_MAP[c] for c in metadata['cell_type']]
set(t1_metadata['cell_type'])

In [None]:
t2_metadata = metadata.copy()
t2_metadata['cell_type'] = [R_T2_MAP[c] for c in metadata['cell_type']]
set(t2_metadata['cell_type'])

In [None]:
disease = 'gbm'
gene_activity.obs = t1_metadata
gene_activity.write_h5ad(os.path.join(T1_OUTPUT, 'snATACseq', f'{disease}_gene_activity.h5ad'))
gene_activity.obs = t2_metadata
gene_activity.write_h5ad(os.path.join(T2_OUTPUT, 'snATACseq', f'{disease}_gene_activity.h5ad'))

In [None]:
peaks.obs = t1_metadata
peaks.write_h5ad(os.path.join(T1_OUTPUT, 'snATACseq', f'{disease}_peaks.h5ad'))
peaks.obs = t2_metadata
peaks.write_h5ad(os.path.join(T2_OUTPUT, 'snATACseq', f'{disease}_peaks.h5ad'))

In [None]:
motif.obs = t1_metadata
motif.write_h5ad(os.path.join(T1_OUTPUT, 'snATACseq', f'{disease}_motif.h5ad'))
motif.obs = t2_metadata
motif.write_h5ad(os.path.join(T2_OUTPUT, 'snATACseq', f'{disease}_motif.h5ad'))