Date: 9/29/2022
Author: Hoang Van Phan

Here I extract pseudobulk counts per cell type. I will not do any gene filtering. I will only export the counts of lupus patients.

**NOTE:** The object is too big to analyze on a personal laptop.

In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import scanpy as sc
sc.settings.verbosity = 3
sc.logging.print_version_and_date()
import scanpy.external as sce

Running Scanpy 1.9.1, on 2022-09-29 12:25.


In [2]:
# Figure settings
mpl.rcdefaults()
# Set font to be arial
# mpl.rc('font', **{'sans-serif':'Arial', 'size':12})
mpl.rc('font', **{'size':12})
mpl.rcParams['mathtext.rm'] = 'sans' # to have non-italic greek letter, use r'$\mathrm{\alpha}$', does NOT work with f-string
mpl.rcParams['axes.titlesize'] = 12
# Set default tick size
mpl.rcParams['xtick.major.size'] = 5.5
mpl.rcParams['ytick.major.size'] = 5.5
mpl.rcParams['xtick.minor.size'] = 2.5
mpl.rcParams['ytick.minor.size'] = 2.5
# Default legend settings
mpl.rcParams['legend.fancybox'] = False
mpl.rcParams['legend.edgecolor'] = 'k'

# sc.settings.set_figure_params(dpi=120)

In [3]:
# Load data
adata = sc.read_h5ad(
    "/home/lab/Van/20220715_CLUES_lupus/objects/"
    + "Ye_lab_annotated_cellxgene_lupus_only.h5ad")
adata

AnnData object with n_obs × n_vars = 597112 × 30933
    obs: 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'cell_state', 'author_cluster', 'sample_uuid', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'disease_state', 'suspension_enriched_cell_types', 'suspension_uuid', 'suspension_type', 'donor_uuid', 'ethnicity_ontology_term_id', 'organism_ontology_term_id', 'disease_ontology_term_id', 'sex_ontology_term_id', 'Processing_Cohort', 'ct_cov', 'ind_cov', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'subjectid'
    var: 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', 'author_cell_type_colors', 'ct_cov_colors', 'default_embedding', 'layer_descriptions', 'schema_version', 'title'
    obsm: 'X_umap'

In [4]:
adata.obs.head()

Unnamed: 0_level_0,library_uuid,assay_ontology_term_id,mapped_reference_annotation,is_primary_data,cell_type_ontology_term_id,author_cell_type,cell_state,author_cluster,sample_uuid,tissue_ontology_term_id,...,ind_cov,cell_type,assay,disease,organism,sex,tissue,ethnicity,development_stage,subjectid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CTAACTTCAATGAATG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,70a004b7-4a17-4702-8910-4557aa0c4279,EFO:0009899,GENCODE 19,True,CL:0000860,cM,na,7,577bce84-8d37-4851-9fb4-53f9467699ba,UBERON:0000178,...,1132_1132,classical monocyte,10x 3' v2,systemic lupus erythematosus,Homo sapiens,female,blood,European,45-year-old human stage,1132
GGCTCGATCGTTGACA-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0,c2641f62-eb23-4dad-9c22-b52e72b79df2,EFO:0009899,GENCODE 19,True,CL:0000236,B,na,3,5e0ed28c-a75a-4ecd-a0c7-49e71264690b,UBERON:0000178,...,1110_1110,B cell,10x 3' v2,systemic lupus erythematosus,Homo sapiens,female,blood,European,71-year-old human stage,1110
ACACCGGCACACAGAG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,222b358b-71e7-4b0f-9f9b-47b4c67aaa27,EFO:0009899,GENCODE 19,True,CL:0000624,T4,na,2,982d60b4-5677-4e2d-8b58-79503863710d,UBERON:0000178,...,1479_1479,"CD4-positive, alpha-beta T cell",10x 3' v2,systemic lupus erythematosus,Homo sapiens,female,blood,Asian,28-year-old human stage,1479
TCGTAGATCCTTGGTC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0,0c7c125d-46d6-40ba-8088-31fb2b526a78,EFO:0009899,GENCODE 19,True,CL:0000625,T8,na,4,fc11ebf7-2767-4d92-96f9-a57af8f1be30,UBERON:0000178,...,1334_1334,"CD8-positive, alpha-beta T cell",10x 3' v2,systemic lupus erythematosus,Homo sapiens,female,blood,Asian,52-year-old human stage,1334
CCACCTAAGGGCTTCC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0,0955850b-157b-4396-a3da-83b8bcbed172,EFO:0009899,GENCODE 19,True,CL:0000624,T4,na,15,d1272c4d-f68a-4d3d-bae3-715e337bd80b,UBERON:0000178,...,1333_1333,"CD4-positive, alpha-beta T cell",10x 3' v2,systemic lupus erythematosus,Homo sapiens,female,blood,Asian,64-year-old human stage,1333


In [5]:
# Use gene name instead of gene symbols
adata.var = adata.var.reset_index().set_index("feature_name")
adata.var_names = adata.var.index

adata.var.head()

AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)
AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")


Unnamed: 0_level_0,index,feature_biotype,feature_is_filtered,feature_reference
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,ENSG00000243485,gene,True,NCBITaxon:9606
FAM138A,ENSG00000237613,gene,True,NCBITaxon:9606
OR4F5,ENSG00000186092,gene,True,NCBITaxon:9606
RP11-34P13.7,ENSG00000238009,gene,True,NCBITaxon:9606
RP11-34P13.8,ENSG00000239945,gene,True,NCBITaxon:9606


# Export pseudobulk count

In [8]:
for i in ["T4","T8","B","cDC","cM","ncM","NK","pDC","Prolif","PB","Progen"]:
    
    # Isolate one cell type
    temp = adata[adata.obs["author_cell_type"]==i,:].copy()
    
    # Pseudocount data frame
    pseudobulk = pd.DataFrame(0, 
                              index=temp.var.index,
                              columns=temp.obs["ind_cov"].cat.categories,
                              dtype="float")
    for donor in pseudobulk.columns:
        pseudobulk.loc[:,donor] = np.ravel(np.sum(temp.raw.X[temp.obs["ind_cov"]==donor,:], axis=0))
    
    # Export
    pseudobulk.to_csv(
        "/home/lab/Van/20220715_CLUES_lupus/pseudobulk_counts/"
        + f"Ye_lab_annotated_cellxgene_lupus_only_physact_{i}_pseudobulk_count.csv"
    )

