In [45]:
from pathlib import Path
from collections import Counter
import os
import re
import random

import anndata
import scanpy as sc
import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

In [46]:
# Input data root and the root of the harmonized output tree.
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/10232020/'
OUTPUT_DIR = '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

# One output sub-tree per annotation tier. The on-disk directory names are
# spelled 'teir_*'; a later cell opens a file under this exact spelling.
T1_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_1')
T2_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_2')

# Create both tier directories (and any missing parents) up front.
os.makedirs(T1_OUTPUT, exist_ok=True)
os.makedirs(T2_OUTPUT, exist_ok=True)

The expected structure of the input data directory (`DATA_DIR`) is something like:
- data
    - snATACseq
        - cesc.h5ad
        - pbmc.h5ad
        ....
    - scRNAseq
        - pbmc.h5ad
        - brca.h5ad
        .....
    - snRNAseq
        - brca.h5ad
        .....

In [None]:
# Load every .h5ad file under DATA_DIR into a nested dict:
#   adata_map[<parent directory name>][<file name without extension>] -> AnnData
# The dot in the pattern is escaped so only a literal '.h5ad' suffix matches
# (an unescaped '.' matches any character).
fps = sorted(os_helpers.listfiles(DATA_DIR, regex=r'\.h5ad$'))
adata_map = {}
for fp in fps:
    # Parent directory is the data type (e.g. scRNAseq); file stem is the disease.
    dtype = Path(fp).parent.name
    # .stem strips only the final extension, unlike str.replace which would
    # also remove '.h5ad' appearing in the middle of a file name.
    disease = Path(fp).stem
    adata_map.setdefault(dtype, {})[disease] = sc.read_h5ad(fp)

# Summarise what was loaded.
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)

In [None]:
# List the distinct raw 'cell_type' labels of each loaded dataset.
for dtype, datasets in adata_map.items():
    for disease, loaded in datasets.items():
        raw_labels = sorted(set(loaded.obs['cell_type']))
        print(dtype, disease)
        print(raw_labels)

In [None]:
# Tier 1 (coarse) harmonization table: harmonized label -> raw labels folded into it.
# A raw label that appears in none of the lists has no entry in R_T1_MAP.
T1_MAP = {
    'CD4 T cell': [
        'CD4 CTL', 'CD4 T', 'CD4 T-cells', 'CD4 T-cells activated', 'CD4 T-cells naive',
        'CD4+ T-cells', 'CD4+, Naive T cells', 'CD4+T', 'CD4_T',
    ],
    'CD8 T cell': [
        'CD4/CD8 proliferating', 'CD8 CTL', 'CD8 CTL exausted', 'CD8 T',
        'CD8 T-cells preexhausted', 'CD8+, cytotoxic T cells', 'CD8+, exhausted T cells',
        'CD8+, follicular T cells', 'CD8+T', 'CD8_T',
    ],
    'T cells': ['T cells'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs'],
    'Dendritic': ['DC', 'Dendritic', 'PDC', 'pDC', 'cDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell'],
    'Fibroblast': [
        'CAF', 'Fibroblast', 'Fibroblasts', 'Myofibroblasts', 'cCAFs', 'mCAFs', 'vCAFs',
    ],
    'Acinar': ['Acinar'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials'],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte'],
    'Islet': ['Islet'],
    'Malignant': [
        'CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Malignant/Epithelial', 'Melanoma',
        'Tumor', 'Tumor cells', 'Tumor-like epithelial cells',
    ],
    'Mast': ['Mast'],
    'Monocyte': [
        'CD14 Monocytes', 'FCGR3A Monocytes', 'M1 Macrophages', 'M2 Macrophages', 'MM',
        'Macrophage', 'Macrophage/Monocyte', 'Macrophages', 'Macrophages proliferating',
        'Monocyte', 'Monocytes',
    ],
    'Megakaryocyte': ['Megakaryocytes'],
    'Microglia': ['Microglia'],
    'NK': ['NK', 'NK cells strong', 'NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma'],
    'Tuft': ['Tuft'],
}
# Reverse lookup: raw label -> harmonized tier-1 label.
R_T1_MAP = dict(
    (raw_label, harmonized)
    for harmonized, raw_labels in T1_MAP.items()
    for raw_label in raw_labels
)

In [None]:
# Tier 2 (fine) harmonization table: harmonized label -> raw labels folded into it.
# A raw label that appears in none of the lists has no entry in R_T2_MAP.
T2_MAP = {
    'CD4 T cell': ['CD4 T', 'CD4 T-cells', 'CD4+ T-cells', 'CD4+T', 'CD4_T'],
    'CD4 CTL': ['CD4 CTL'],
    'CD4 T cell activated': ['CD4 T-cells activated'],
    'CD4 T cell naive': ['CD4 T-cells naive', 'CD4+, Naive T cells', 'T_Naive'],
    'CD8 T cell': ['CD8 T', 'CD8+T', 'CD8_T',],
    'CD4/CD8 proliferating': ['CD4/CD8 proliferating'],
    'CD8 CTL': ['CD8 CTL', 'CD8+, cytotoxic T cells'],
    'CD8 CTL exausted': ['CD8 CTL exausted', 'CD8+, exhausted T cells'],
    'CD8 T cell preexhausted': ['CD8 T-cells preexhausted'],
    'CD8 follicular T-cell': ['CD8+, follicular T cells'],
    'T cells': ['T cells'],
    'TRM': ['TRM'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs',],
    'Dendritic': ['DC', 'Dendritic'],
    'pDC': ['PDC', 'pDC'],
    'cDC': ['cDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell'],
    'Fibroblast': ['CAF', 'Fibroblast', 'Fibroblasts'],
    'mCAF': ['Myofibroblasts', 'mCAFs'],
    'cCAF': ['cCAFs'],
    'vCAF': ['vCAFs'],
    'Acinar': ['Acinar'],
    'ADM': ['ADM'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials',],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte'],
    'Islet': ['Islet'],
    'Malignant': ['CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Malignant/Epithelial', 'Melanoma',
                 'Tumor', 'Tumor cells', 'Tumor-like epithelial cells',],
    'Mast': ['Mast'],
    'Monocyte': ['Macrophage/Monocyte', 'Monocyte', 'Monocytes', 'MM'],
    'Basophil': ['Basophils'],
    # The tier-1 table lists this raw label in the plural ('CD14 Monocytes');
    # accept both spellings here so those cells are not dropped from tier 2.
    'CD14 Monocyte': ['CD14 Monocyte', 'CD14 Monocytes'],
    'FCGR3A Monocyte': ['FCGR3A Monocytes'],
    'M1 Macrophage': ['M1 Macrophages'],
    'M2 Macrophage': ['M2 Macrophages'],
    'Macrophage': ['Macrophage', 'Macrophages'],
    'Macrophage proliferating': ['Macrophages proliferating'],
    'Megakaryocyte': ['Megakaryocytes'],
    'Microglia': ['Microglia'],
    'NK': ['NK'],
    'NK cell strong': ['NK cells strong'],
    'NK cell weak': ['NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma'],
    'Tuft': ['Tuft']
}
# Reverse lookup: raw label -> harmonized tier-2 label.
R_T2_MAP = {v:k for k, vs in T2_MAP.items() for v in vs}
# A raw label listed under two harmonized labels would silently keep only the
# last one in the reverse lookup; fail loudly instead.
assert len(R_T2_MAP) == sum(len(vs) for vs in T2_MAP.values()), \
    'a raw label is listed under more than one tier-2 label'

In [None]:
# Union of the raw 'cell_type' labels across every loaded dataset.
cell_types = set().union(
    *(m.obs['cell_type'] for d in adata_map.values() for m in d.values())
)
cell_types

In [None]:
def _write_harmonized(adata_raw, label_map, out_root, dtype, disease):
    """Relabel one dataset with a harmonization table and write it to disk.

    Cells whose raw 'cell_type' is not a key of ``label_map`` are dropped.

    Parameters
    ----------
    adata_raw : AnnData with a 'cell_type' column in ``.obs``; not modified.
    label_map : dict mapping raw label -> harmonized label.
    out_root : root output directory for this tier.
    dtype : sub-directory name under ``out_root``.
    disease : output file name without the '.h5ad' extension.

    Returns
    -------
    The filtered, relabelled AnnData that was written.
    """
    keep = adata_raw.obs['cell_type'].isin(list(label_map)).to_numpy()
    # Subset first, then copy: only the kept cells are duplicated, and the
    # relabelling below acts on a real object rather than on a view.
    harmonized = adata_raw[keep].copy()
    harmonized.obs['cell_type'] = [label_map[c] for c in harmonized.obs['cell_type']]

    out_dir = os.path.join(out_root, dtype)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    harmonized.write_h5ad(os.path.join(out_dir, f'{disease}.h5ad'))
    return harmonized


for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)
        # Same procedure for both tiers; only the table and output root differ.
        for label_map, out_root in ((R_T1_MAP, T1_OUTPUT), (R_T2_MAP, T2_OUTPUT)):
            adata = _write_harmonized(m, label_map, out_root, dtype, disease)
            print(adata.shape)
            print(sorted(set(adata.obs['cell_type'])))
        

## One-off fix for the melanoma dataset: convert Ensembl gene IDs to gene symbols

In [43]:
# Re-open the tier-1 melanoma validation file so its gene names can be patched.
# NOTE(review): this literal repeats the tier-1 output location from the config
# cell; only the tier-1 copy of this dataset is opened here.
fp = (
    '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/'
    'teir_1/scRNAseq/melanoma_val.h5ad'
)
adata = sc.read_h5ad(fp)
adata

AnnData object with n_obs × n_vars = 65782 × 19891
    obs: 'orig.ident', 'nCount_peaks', 'nFeature_peaks', 'total', 'duplicate', 'chimeric', 'unmapped', 'lowmapq', 'mitochondrial', 'passed_filters', 'cell_id', 'is__cell_barcode', 'TSS_fragments', 'DNase_sensitive_region_fragments', 'enhancer_region_fragments', 'promoter_region_fragments', 'on_target_fragments', 'blacklist_region_fragments', 'peak_region_fragments', 'peak_region_cutsites', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'pct_reads_in_peaks', 'blacklist_ratio', 'high.tss', 'nucleosome_group', 'peaks_snn_res.0.8', 'seurat_clusters', 'nCount_RNA', 'nFeature_RNA', 'predicted.id', 'prediction.score.Tumor', 'prediction.score.Neurons', 'prediction.score.Microglia', 'prediction.score.Doublets', 'prediction.score.Oligodendrocytes', 'prediction.score.T.cells', 'prediction.score.Fibroblasts', 'prediction.score.B.cells.Plasma', 'prediction.score.Maybe.OPC', 'prediction.score.Endothelial.cells', 'p

In [44]:
# Peek at the first few gene names before renaming.
adata.var_names[:5]

Index(['PLCXD1', 'GTPBP6', 'PPP2R3B', 'SHOX', 'CRLF2'], dtype='object')

In [32]:
# MyGene.info client; the cells below use it to look up gene symbols for
# Ensembl gene IDs.
import mygene
mg = mygene.MyGeneInfo()

In [33]:
# Distinct gene identifiers currently used as var names, in sorted order.
gene_ids = list(set(adata.var.index))
gene_ids.sort()
len(gene_ids)

23452

In [34]:
# Batch query: treat each input as an Ensembl gene ID and request its symbol.
result = mg.querymany(
    gene_ids,
    scopes='ensemblgene',
    fields='symbol',
    species='human',
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-23452...done.
Finished.
1 input query terms found dup hits:
	[('ENSG00000229425', 2)]
132 input query terms found no hit:
	['ENSG00000116883', 'ENSG00000154537', 'ENSG00000163009', 'ENSG00000168078', 'ENSG00000184258', 'ENS
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [35]:
# Ensembl ID -> symbol, built from the hits that carry a 'symbol' field.
# If a query appears in several hits, the last such hit in `result` wins.
ensembl_to_human_symbol = {
    hit['query']: hit['symbol']
    for hit in result
    if 'symbol' in hit
}
len(ensembl_to_human_symbol)

23320

In [36]:
# Rename genes from Ensembl IDs to symbols. An ID without a symbol keeps its
# original identifier instead of becoming '' (which the later de-duplication
# step would turn into meaningless names). Falling back to the current name
# also makes this cell safe to re-run: already-renamed genes are left as they are.
adata.var.index = [ensembl_to_human_symbol.get(g, g) for g in adata.var.index]
adata

AnnData object with n_obs × n_vars = 6735 × 23452
    obs: 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'res.0.6', 'sample', 'nCount_RNA', 'nFeature_RNA', 'predicted.id', 'prediction.score.Bcell', 'prediction.score.Tcell', 'prediction.score.MM', 'prediction.score.CAF', 'prediction.score.PDC', 'prediction.score.Melanoma', 'prediction.score.DC', 'prediction.score.Plasma', 'prediction.score.Erythrocyte', 'prediction.score.Jejunum', 'prediction.score.Endothelial', 'prediction.score.max', 'assigned.id', 'manual.id', 'final.id', 'bc', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'int.id', 'ident', 'cell_type'
    var: 'detection_rate', 'gmean', 'variance', 'residual_mean', 'residual_variance'
    obsm: 'X_pca', 'X_umap'
    layers: 'logcounts'

In [37]:
# De-duplicate gene names in place; the uniqueness count two cells below
# confirms every var name is distinct afterwards.
adata.var_names_make_unique()

In [38]:
# Peek at the first few gene names after renaming.
adata.var_names[:5]

Index(['AL627309.1', 'LINC01409', 'FAM87B', 'LINC00115', 'FAM41C'], dtype='object')

In [39]:
# Number of distinct gene names; should equal the number of vars.
len(adata.var_names.unique())

23452

In [40]:
# Write the renamed object back to `fp`, overwriting the file it was loaded from.
adata.write_h5ad(fp)