In [1]:
import os
import re
from collections import Counter

import anndata
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the annotated single-cell object from disk; the bare name on the last
# line renders its AnnData summary as the cell output.
annotated_path = '../data/single_cell/annotated/all_annotated.h5ad'
a = sc.read_h5ad(annotated_path)
a

AnnData object with n_obs × n_vars = 196868 × 3000
    obs: 'sample_id', 'dataset', 'tissue_type', 'doublet_score', 'is_doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'Bailey | ADEX score', 'Collison | Exocrine-like score', 'Moffit | Basal score', 'Bailey | Squamous-like score', 'Bailey | Pancreatic-Progenitor score', 'Bailey | Immunogenic score', 'Collison | Quasi-Mesenchymal score', 'Collison | Classical score', 'Moffit | Classical score', 'subTME | deserted score', 'subTME | reactive score', 'raghaven | scBasal score', 'raghaven | scClassical score', 'raghaven | IC score', 'raghaven | Pericyte-like score', 'raghaven | Fibroblast-like score', 'raghaven | Inflammatory score', 'raghaven | TAM-FCN1 score', 'raghaven | TAM-C1QC score', 'raghaven | TAM-SPP1 score', 'elyada | myCAF score', 'elyada | iCAF score', 'elyada | apCAF score', 'ben-porath | ES exp1 score', 'ben-porath | ES exp2 score', 'ben-porath | Nanog targets score', 'be

In [3]:
# Slim the annotated object: keep a fixed set of obs columns and uns entries,
# and drop every pairwise (obsp) array.
# The obs columns and the uns keys get their own names so that one list does
# not silently replace the other in the kernel namespace.
obs_columns = [
    'sample_id', 'dataset', 'tissue_type', 'cell_type', 'cell_type_specific',
    'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
]
a.obs = a.obs.loc[:, obs_columns]

# A set makes the membership test below independent of the number of keys;
# iteration order still follows a.uns, so the kept entries stay in place.
uns_keys = {'hvg', 'log1p', 'neighbors', 'pca', 'umap'}
a.uns = {key: value for key, value in a.uns.items() if key in uns_keys}

# NOTE(review): 'neighbors' is kept in uns above while obsp is emptied here —
# confirm nothing downstream expects the graphs that entry normally refers to.
a.obsp = None

# Render the AnnData summary to verify what is left.
a



AnnData object with n_obs × n_vars = 196868 × 3000
    obs: 'sample_id', 'dataset', 'tissue_type', 'cell_type', 'cell_type_specific', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'

In [6]:
# Check the dimensions of the 'counts' layer. Only the shape is displayed: the
# recorded output of this cell is a (rows, columns) tuple, which the bare layer
# expression would not produce, and showing the shape keeps the output small.
a.layers['counts'].shape

(196868, 3000)

In [7]:
# Read the count matrix stored under the original-data directory; the bare
# name on the last line renders its AnnData summary.
counts_path = '../data/single_cell/original/count_matrix/counts.h5ad'
counts = sc.read_h5ad(counts_path)
counts

AnnData object with n_obs × n_vars = 323120 × 34985
    obs: 'sample_id', 'dataset', 'tissue_type'

In [9]:
# Display the per-cell metadata table (obs) of the loaded count object.
counts.obs

Unnamed: 0_level_0,sample_id,dataset,tissue_type
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical,tumor_primary
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical,tumor_primary
...,...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue,tumor_primary
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue,tumor_primary


In [14]:
# Attach the cell-type annotations from the annotated object to the cells of
# the count matrix, matching rows on the obs index. The left join keeps every
# row of counts.obs; cells with no match in a.obs get missing values in both
# annotation columns.
annotation_columns = ['cell_type', 'cell_type_specific']
counts.obs = pd.merge(
    # Drop annotation columns left behind by an earlier run of this cell.
    # Without this, a re-run would merge onto the existing columns and emit
    # suffixed duplicates (cell_type_x / cell_type_y), so later lookups of
    # 'cell_type' would fail. errors='ignore' makes the first run a no-op.
    counts.obs.drop(columns=annotation_columns, errors='ignore'),
    a.obs[annotation_columns],
    left_index=True,
    right_index=True,
    how='left',
)

In [15]:
# Flag the cells whose 'cell_type' is missing, i.e. those that received no
# annotation, and store the boolean mask as the 'filtered' column.
missing_annotation = pd.isnull(counts.obs['cell_type'])
counts.obs['filtered'] = missing_annotation

In [16]:
# Display obs again to inspect the annotation columns and the 'filtered' flag
# next to the original metadata.
counts.obs

Unnamed: 0_level_0,sample_id,dataset,tissue_type,cell_type,cell_type_specific,filtered
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
htan01_AAACCTGAGACCTAGG-1,htan01,htan_surgical,tumor_primary,NK/T cell,NK,False
htan01_AAACCTGAGTGCGTGA-1,htan01,htan_surgical,tumor_primary,Malignant,Malignant - Classical,False
htan01_AAACCTGCAATCGGTT-1,htan01,htan_surgical,tumor_primary,,,True
htan01_AAACCTGCACCGTTGG-1,htan01,htan_surgical,tumor_primary,,,True
htan01_AAACCTGCATCCCACT-1,htan01,htan_surgical,tumor_primary,NK/T cell,Treg,False
...,...,...,...,...,...,...
G9903_TTTGTCAAGTTGTCGT-1,G9903,chan-seng-yue,tumor_primary,Fibroblast,iCAF,False
G9903_TTTGTCACAACTTGAC-1,G9903,chan-seng-yue,tumor_primary,Fibroblast,myCAF,False
G9903_TTTGTCACATATGGTC-1,G9903,chan-seng-yue,tumor_primary,Malignant,Malignant - Classical,False
G9903_TTTGTCATCCGAACGC-1,G9903,chan-seng-yue,tumor_primary,Malignant,Malignant - Classical,False


In [17]:
# Render the AnnData summary to confirm which obs columns the object now carries.
counts

AnnData object with n_obs × n_vars = 323120 × 34985
    obs: 'sample_id', 'dataset', 'tissue_type', 'cell_type', 'cell_type_specific', 'filtered'

In [19]:
# Write the count object, including the obs columns added above, to the
# upload directory as an h5ad file.
upload_path = '../data/single_cell/uploaded/counts.h5ad'
counts.write_h5ad(upload_path)

In [20]:
# Show the repr of the matrix stored in X (type, shape and storage format).
counts.X

<323120x34985 sparse matrix of type '<class 'numpy.float32'>'
	with 547692474 stored elements in Compressed Sparse Row format>