In [1]:
import os
# Make every relative path used below ('./h5ad/...', './data/...', './output/...')
# resolve against this project directory.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere unchanged.
os.chdir('/rd2/user/xiacr/sle/other_sc_data/')

In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import harmonypy as hm
import importlib

import glob
import anndata as ad
import collections

import scanpy.external as sce
from mycolorpy import colorlist as mcp
from IPython.display import display, HTML

# import utils
# importlib.reload(utils)

In [3]:
# Scanpy session configuration: log level, version banner, figure defaults, parallelism.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=160, facecolor='white', fontsize=10)
# Configure the worker count through the public settings instance instead of
# assigning an attribute on the private `sc._settings.ScanpyConfig` class.
sc.settings.n_jobs = 32

scanpy==1.8.2 anndata==0.7.8 umap==0.5.2 numpy==1.19.0 scipy==1.8.0 pandas==1.4.1 scikit-learn==1.0.2 statsmodels==0.13.1 python-igraph==0.9.9 pynndescent==0.5.5


#  Read in data

In [5]:
# Load the core PBMC dataset and harmonise its per-cell metadata.
core_data = sc.read_h5ad('./h5ad/04-pbmc_all_anno_modify_meta.h5ad')

# Hold `group` as plain strings before writing new labels into it
# (presumably the stored column is categorical — TODO confirm).
core_data.obs['group'] = core_data.obs['group'].astype(str)

# Recode `group` from `treatment`. Cells whose treatment is not listed here
# keep their existing `group` value.
treatment_to_group = {
    'untreated': 'sle_flare',
    'treated': 'sle_treated',
    'HC': 'hc',
}
for treatment, group_label in treatment_to_group.items():
    # Index with the boolean mask directly: no detour through a list of index
    # labels, so only the matching rows are written even if obs names repeat.
    core_data.obs.loc[core_data.obs['treatment'].isin([treatment]), 'group'] = group_label

# Sample identifier: "<dataset>_<orig.ident>".
core_data.obs['sample'] = core_data.obs['dataset'].astype(str) + '_' + core_data.obs['orig.ident'].astype(str)

In [7]:
# Load the GSE174188 CLUES1 "adjusted" AnnData file.
# NOTE(review): the file name says "adjusted" — confirm what .X contains
# before applying count-based filters to it.
science_22 = sc.read_h5ad('./data/22_science_sle/GSE174188_CLUES1_adjusted.h5ad')

In [25]:
# Derive the harmonised `group` label from the dataset's own `Status` column.
# `.tolist()` gives a plain (non-categorical) column that accepts new labels.
science_22.obs['group'] = science_22.obs['Status'].tolist()

# Status value -> harmonised group label. Values not listed stay as they are.
status_to_group = {
    'Flare': 'sle_flare',
    'Managed': 'sle',
    'Healthy': 'hc',
    'Treated': 'sle_treated',
}
for status, group_label in status_to_group.items():
    # Index with the boolean mask directly: no detour through a list of index
    # labels, so only the matching rows are written even if obs names repeat.
    science_22.obs.loc[science_22.obs['group'].isin([status]), 'group'] = group_label

In [6]:
# Load the four remaining datasets; the order of the paths matches the order
# of the names on the left-hand side.
meta_h5ad_paths = (
    './h5ad/nbt_17_meta.h5ad',
    './h5ad/pnas_19_meta.h5ad',
    './h5ad/ni_20_meta.h5ad',
    './h5ad/ebi_21_meta.h5ad',
)
nbt_17, pnas_19, ni_20, ebi_21 = map(sc.read_h5ad, meta_h5ad_paths)

In [8]:
# Tag every cell with the name of the dataset it came from.
nbt_17.obs['dataset'] = 'nbt_17'
pnas_19.obs['dataset'] = 'pnas_19'
ni_20.obs['dataset'] = 'ni_20'
ebi_21.obs['dataset'] = 'ebi_21'
# Label spelling corrected (was 'sciencd_22'); update any downstream code that
# matched the old spelling.
science_22.obs['dataset'] = 'science_22'
# This replaces the `dataset` values core_data was loaded with; those values
# remain encoded in core_data.obs['sample'], which was built from them earlier.
core_data.obs['dataset'] = 'core_dataset'

In [27]:
# Merge all datasets into one AnnData object, with core_data first.
# join="outer" is meant to keep genes present in any dataset rather than
# only the shared ones.
adata = core_data.concatenate(
    science_22,
    ebi_21,
    ni_20,
    pnas_19,
    nbt_17,
    join="outer",
)
adata

  warn(


AnnData object with n_obs × n_vars = 1933294 × 27584
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'group', 'treatment', 'pair', 'percent_mito', 'percent_ribo', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'RNA_snn_res.0.8', 'RNA_snn_res.1', 'seurat_clusters', 'main_type', 'scrublet_doublet', 'subtype', 'RNA_snn_res.0.6', 'RNA_snn_res.1.2', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'RNA_snn_res.0.4', 'RNA_snn_res.0.5', 'barcode', 'dataset', 'sample', 'batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov', 'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status', 'SLE_status', 'Run', 'BioSample', 'biospecimen_repository_sample_id', 'Bytes', 'Center.Name', 'Consent_Code', 'Consent', 'DATASTORE.filetype', 'DATASTORE.provider', 'DATASTORE.region', 'Experiment', 'Library.Name', 'Sample.Name', 'sex', 'submitted_subject_id', 'subject_is_affected', 'AvgSpotLen', 'Bases', 'study_disease', 'cell_type', 'batch'
    var: 'vst.mean-0', 'vst.variance-0', 'vst.variance.exp

In [28]:
# Basic QC on the merged object; both calls modify `adata` in place.
# NOTE(review): the recorded output of this cell reports 1217299 of 1933294
# cells removed by the min_genes filter — confirm .X holds comparable count
# data for every input dataset before trusting this threshold.
min_genes_per_cell = 200  # cells with fewer expressed genes are dropped
min_cells_per_gene = 20   # genes detected in fewer cells are dropped
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

filtered out 1217299 cells that have less than 200 genes expressed
filtered out 2927 genes that are detected in less than 20 cells


In [31]:
# Display the AnnData summary (dimensions and obs/var columns) for inspection.
adata

AnnData object with n_obs × n_vars = 715995 × 24657
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'group', 'treatment', 'pair', 'percent_mito', 'percent_ribo', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'RNA_snn_res.0.8', 'RNA_snn_res.1', 'seurat_clusters', 'main_type', 'scrublet_doublet', 'subtype', 'RNA_snn_res.0.6', 'RNA_snn_res.1.2', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'RNA_snn_res.0.4', 'RNA_snn_res.0.5', 'barcode', 'dataset', 'sample', 'batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov', 'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status', 'SLE_status', 'Run', 'BioSample', 'biospecimen_repository_sample_id', 'Bytes', 'Center.Name', 'Consent_Code', 'Consent', 'DATASTORE.filetype', 'DATASTORE.provider', 'DATASTORE.region', 'Experiment', 'Library.Name', 'Sample.Name', 'sex', 'submitted_subject_id', 'subject_is_affected', 'AvgSpotLen', 'Bases', 'study_disease', 'cell_type', 'batch', 'n_genes'
    var: 'vst.mean-0', 'vst.variance-0', 'vst.va

In [None]:
# Drop the names of the per-dataset objects so their memory can be reclaimed;
# only the merged `adata` is used from here on.
del science_22, core_data, ebi_21, ni_20, pnas_19, nbt_17

In [None]:
# Save the merged object to disk.
merged_h5ad_path = './output/01-pbmc_concat_outer_science.h5ad'
adata.write(merged_h5ad_path)