In [1]:
import os
import logging

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sns

import numpy as np
import scipy as sp
import pandas as pd

import scipy.io

import anndata
import scanpy as sc
# Raise scanpy's logging verbosity. Presumably this is what makes the
# progress/timing messages (e.g. "normalizing counts per cell ... finished")
# appear under later cells -- confirm against the scanpy settings docs.
sc.settings.verbosity = 3

In [10]:
# Every file this notebook reads or writes shares one stem inside the dataset
# directory; later cells only append a suffix to `prefix`.
prefix = os.path.join('..', 'data', 'scRNAseq_YangLi', 'bcg0712_complete.QC2')
# The dataset directory is simply the folder that contains the stem.
data_dir = os.path.dirname(prefix)

In [3]:
# Load the annotated dataset that all following cells modify in place.
adata = anndata.read_h5ad(prefix + '.h5ad')

In [6]:
# Drop the .obsm and .uns annotations that came with the input file
# (presumably so they are not carried into the derived files -- confirm).
del adata.obsm, adata.uns

In [4]:
# Headerless CSV: the first column becomes the index, and the variance is read
# from the column that pandas labels 1.
sct_variance = pd.read_csv(
    f'{prefix}.SCT_variance.csv.gz',
    index_col=0,
    header=None,
)
# One row per variable is required because the values are attached by position.
# NOTE(review): only the row count is checked; this assumes the CSV rows are in
# the same order as adata.var -- confirm against how the file was exported.
assert len(sct_variance) == adata.n_vars
adata.var['sct_variance'] = sct_variance[1].values

In [7]:
# Replace X with the matrix stored in the SCT_corr_counts Matrix Market file
# (presumably SCTransform-corrected counts -- confirm with the export script).
# The file is laid out transposed relative to adata (n_vars x n_obs), so it is
# transposed before the shape check and the assignment.
raw_X = scipy.io.mmread(f'{prefix}.SCT_corr_counts.mtx')
assert adata.X.shape == raw_X.T.shape
# mmread returns a COO matrix; convert to CSR for row-wise (per-cell) operations.
adata.X = raw_X.T.tocsr()
# Keep an untouched snapshot of the counts in .raw. An explicit copy is used
# because, depending on the anndata version, `adata.raw = adata` may keep a
# reference to the same X object, and the normalize_total/log1p calls that
# follow modify X in place -- which would silently alter the "raw" counts too.
adata.raw = adata.copy()

In [8]:
# Scale every cell to the same total count, then log-transform X.
# Both calls update `adata` in place; the order matters.
counts_per_cell_target = 1e4
sc.pp.normalize_total(adata, target_sum=counts_per_cell_target)
sc.pp.log1p(adata)

normalizing counts per cell
    finished (0:00:02)


In [11]:
# Save the log-normalised object next to the input, gzip-compressed.
log_counts_path = f'{prefix}.SCT_log_counts.h5ad'
adata.write(log_counts_path, compression='gzip')

In [12]:
# Swap X for the array stored in the SCT_residuals .npy file. Like the count
# matrix, it is stored transposed relative to adata, so the transposed view is
# what gets shape-checked and assigned.
X = np.load(f'{prefix}.SCT_residuals.npy')
residuals_by_obs = X.T
assert adata.X.shape == residuals_by_obs.shape
adata.X = residuals_by_obs

In [13]:
# Show the object summary to check its contents before the final save
# (the output below still lists a leftover 'log1p' entry in .uns).
adata

AnnData object with n_obs × n_vars = 181446 × 20856
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'status', 'assignment', 'cells', 'age', 'gender', 'time', 'stim', 'ts', 'batch', 'pool', 'bp', 'percent.mt', 'integrated_snn_res.0.5', 'seurat_clusters', 'clusters1', 'cell_ts', 'ids', 'datafile', 'groupby', 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'high_mt_frac'
    var: 'features', 'n_cells', 'sct_variance'
    uns: 'log1p'

In [14]:
# Clear .uns, which at this point only holds the 'log1p' entry shown in the
# summary above. X now contains the SCT residuals rather than log-transformed
# counts, so that entry no longer describes X.
del adata.uns

In [15]:
# Save the residual-based object next to the input, gzip-compressed.
residuals_path = f'{prefix}.SCT_residuals.h5ad'
adata.write(residuals_path, compression='gzip')