In [None]:
from pathlib import Path
from collections import Counter
import os
import re
import random

import anndata
import scanpy as sc
import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

In [None]:
# Input, output and scratch locations used by this notebook.
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/10232020/'
OUTPUT_DIR = '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

# One output subdirectory per label tier.
# NOTE(review): 'teir' looks like a misspelling of 'tier'. The spelling is kept
# because it determines where output files are written -- confirm before renaming.
T1_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_1')
T2_OUTPUT = os.path.join(OUTPUT_DIR, 'teir_2')

# Create both output directories (and any missing parents); existing ones are fine.
os.makedirs(T1_OUTPUT, exist_ok=True)
os.makedirs(T2_OUTPUT, exist_ok=True)

The expected structure of the input data directory is something like:
- data
    - snATACseq
        - cesc.h5ad
        - pbmc.h5ad
        ....
    - scRNAseq
        - pbmc.h5ad
        - brca.h5ad
        .....
    - snRNAseq
        - brca.h5ad
        .....

In [None]:
# Discover every .h5ad file under DATA_DIR and load it into a nested mapping:
#   adata_map[<name of the containing directory>][<file name without .h5ad>] -> AnnData
# The dot in the pattern is escaped so only names that really end in ".h5ad"
# match (an unescaped "." would also accept e.g. "foo_h5ad").
fps = sorted(os_helpers.listfiles(DATA_DIR, regex=r'\.h5ad$'))
adata_map = {}
for fp in fps:
    dtype = Path(fp).parent.name  # directory that directly contains the file
    disease = Path(fp).stem       # file name with only the final ".h5ad" removed
    adata_map.setdefault(dtype, {})[disease] = sc.read_h5ad(fp)

# Print the shape of every loaded dataset as a quick sanity check.
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)

In [None]:
# Show the distinct raw cell-type labels found in each loaded dataset.
for dtype, by_disease in adata_map.items():
    for disease, ad in by_disease.items():
        raw_labels = set(ad.obs['cell_type'])
        print(dtype, disease)
        print(sorted(raw_labels))

In [None]:
# Tier 1 harmonization: coarse cell-type label -> raw labels that collapse into it.
# Raw labels that do not appear in any list here have no tier 1 label.
T1_MAP = {
    'CD4 T cell': [
        'CD4 CTL', 'CD4 T', 'CD4 T-cells', 'CD4 T-cells activated',
        'CD4 T-cells naive', 'CD4+ T-cells', 'CD4+, Naive T cells',
        'CD4+T', 'CD4_T',
    ],
    'CD8 T cell': [
        'CD4/CD8 proliferating', 'CD8 CTL', 'CD8 CTL exausted', 'CD8 T',
        'CD8 T-cells preexhausted', 'CD8+, cytotoxic T cells',
        'CD8+, exhausted T cells', 'CD8+, follicular T cells',
        'CD8+T', 'CD8_T',
    ],
    'T cells': ['T cells'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs'],
    'Dendritic': ['DC', 'Dendritic', 'PDC', 'pDC', 'cDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell'],
    'Fibroblast': [
        'CAF', 'Fibroblast', 'Fibroblasts', 'Myofibroblasts',
        'cCAFs', 'mCAFs', 'vCAFs',
    ],
    'Acinar': ['Acinar'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials'],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte'],
    'Islet': ['Islet'],
    'Malignant': [
        'CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT',
        'Malignant/Epithelial', 'Melanoma', 'Tumor', 'Tumor cells',
        'Tumor-like epithelial cells',
    ],
    'Mast': ['Mast'],
    'Monocyte': [
        'CD14 Monocytes', 'FCGR3A Monocytes', 'M1 Macrophages',
        'M2 Macrophages', 'MM', 'Macrophage', 'Macrophage/Monocyte',
        'Macrophages', 'Macrophages proliferating', 'Monocyte', 'Monocytes',
    ],
    'Megakaryocyte': ['Megakaryocytes'],
    'Microglia': ['Microglia'],
    'NK': ['NK', 'NK cells strong', 'NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma'],
    'Tuft': ['Tuft'],
}

# Reverse lookup: raw label -> tier 1 label.
R_T1_MAP = dict(
    (raw_label, tier_label)
    for tier_label, raw_labels in T1_MAP.items()
    for raw_label in raw_labels
)

In [None]:
# Tier 2 harmonization: fine-grained cell-type label -> raw labels mapped to it.
# Raw labels that do not appear in any list here have no tier 2 label.
T2_MAP = {
    'CD4 T cell': ['CD4 T', 'CD4 T-cells', 'CD4+ T-cells', 'CD4+T', 'CD4_T'],
    'CD4 CTL': ['CD4 CTL'],
    'CD4 T cell activated': ['CD4 T-cells activated'],
    'CD4 T cell naive': ['CD4 T-cells naive', 'CD4+, Naive T cells', 'T_Naive'],
    'CD8 T cell': ['CD8 T', 'CD8+T', 'CD8_T',],
    'CD4/CD8 proliferating': ['CD4/CD8 proliferating'],
    'CD8 CTL': ['CD8 CTL', 'CD8+, cytotoxic T cells'],
    'CD8 CTL exausted': ['CD8 CTL exausted', 'CD8+, exhausted T cells'],
    'CD8 T cell preexhausted': ['CD8 T-cells preexhausted'],
    'CD8 follicular T-cell': ['CD8+, follicular T cells'],
    'T cells': ['T cells'],
    'TRM': ['TRM'],
    'Treg': ['CD4+, regulatory T cells', 'Treg', 'Tregs',],
    'Dendritic': ['DC', 'Dendritic'],
    'pDC': ['PDC', 'pDC'],
    'cDC': ['cDC'],
    'B cell': ['B', 'B-cells/Plasma', 'Bcell'],
    'Fibroblast': ['CAF', 'Fibroblast', 'Fibroblasts'],
    'mCAF': ['Myofibroblasts', 'mCAFs'],
    'cCAF': ['cCAFs'],
    'vCAF': ['vCAFs'],
    'Acinar': ['Acinar'],
    'ADM': ['ADM'],
    'Endothelial': ['Endothelial', 'Endothelial cells', 'Endothelials',],
    'Adipocyte': ['Adipocyte'],
    'Epithelial': ['Epithelial', 'Normal epithelial cells'],
    'Erythrocyte': ['Erythrocyte'],
    'Islet': ['Islet'],
    'Malignant': ['CESC_Malignant/Epithelial', 'CESC_Malignant/pEMT', 'Malignant/Epithelial', 'Melanoma',
                 'Tumor', 'Tumor cells', 'Tumor-like epithelial cells',],
    'Mast': ['Mast'],
    'Monocyte': ['Macrophage/Monocyte', 'Monocyte', 'Monocytes', 'MM'],
    'Basophil': ['Basophils'],
    # Both spellings are accepted: T1_MAP lists the raw label as
    # 'CD14 Monocytes', and only the singular form was mapped here before,
    # so plural-labelled cells had no tier 2 label.
    'CD14 Monocyte': ['CD14 Monocyte', 'CD14 Monocytes'],
    'FCGR3A Monocyte': ['FCGR3A Monocytes'],
    'M1 Macrophage': ['M1 Macrophages'],
    'M2 Macrophage': ['M2 Macrophages'],
    'Macrophage': ['Macrophage', 'Macrophages'],
    'Macrophage proliferating': ['Macrophages proliferating'],
    'Megakaryocyte': ['Megakaryocytes'],
    'Microglia': ['Microglia'],
    'NK': ['NK'],
    'NK cell strong': ['NK cells strong'],
    'NK cell weak': ['NK cells weak'],
    'Neuron': ['Neurons'],
    'Oligodendrocytes': ['Oligodendrocytes'],
    'Plasma': ['Plasma'],
    'Tuft': ['Tuft']
}

# Reverse lookup: raw label -> tier 2 label.
R_T2_MAP = {v:k for k, vs in T2_MAP.items() for v in vs}

In [None]:
# Union of every raw cell-type label seen across all loaded datasets.
cell_types = set().union(*(
    ad.obs['cell_type']
    for by_disease in adata_map.values()
    for ad in by_disease.values()
))
cell_types

In [None]:
def _harmonize_and_write(source, label_map, tier_dir, dtype, disease):
    """Relabel one dataset with ``label_map`` and write it below ``tier_dir``.

    Cells whose ``obs['cell_type']`` value is not a key of ``label_map`` are
    removed; the remaining cells get their mapped label. The result is written
    to ``<tier_dir>/<dtype>/<disease>.h5ad`` (the directory is created if
    needed), and its shape and sorted label set are printed.

    Parameters
    ----------
    source : dataset exposing ``obs['cell_type']``, boolean-mask indexing,
        ``copy()`` and ``write_h5ad`` (the objects loaded by ``sc.read_h5ad``).
        It is not modified.
    label_map : dict mapping raw label -> harmonized label.
    tier_dir : str, root output directory for this tier.
    dtype : str, subdirectory name for the dataset.
    disease : str, output file name without the ``.h5ad`` extension.

    Returns
    -------
    The filtered, relabeled copy that was written.
    """
    keep = [label in label_map for label in source.obs['cell_type']]

    # Subset first, then copy: this materializes only the kept cells instead of
    # duplicating the whole dataset, and the .obs assignment below is made on a
    # real object rather than on a view.
    harmonized = source[keep].copy()
    harmonized.obs['cell_type'] = [label_map[label] for label in harmonized.obs['cell_type']]

    # Surface the labels that were filtered out so dropped cells are not silent.
    dropped = {label for label, kept in zip(source.obs['cell_type'], keep) if not kept}
    if dropped:
        print('dropped unmapped labels:', sorted(map(str, dropped)))

    out_dir = os.path.join(tier_dir, dtype)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    harmonized.write_h5ad(os.path.join(out_dir, f'{disease}.h5ad'))

    print(harmonized.shape)
    print(sorted(set(harmonized.obs['cell_type'])))
    return harmonized


# Write a tier 1 and a tier 2 harmonized copy of every loaded dataset.
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease, m.shape)
        for tier_dir, label_map in ((T1_OUTPUT, R_T1_MAP), (T2_OUTPUT, R_T2_MAP)):
            adata = _harmonize_and_write(m, label_map, tier_dir, dtype, disease)
        