In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.sparse as sp
from scipy.io import mmwrite
import os
import scipy.sparse as sparse


## Example data of Cite-Seq for ~160,000 PBMCs from the Satija lab (Hao et al., 2020) 

 - Data: https://atlas.fredhutch.org/nygc/multimodal-pbmc/
 - Pre-print: https://www.biorxiv.org/content/10.1101/2020.10.12.335331v1.full
 - Cell publication:
https://www.sciencedirect.com/science/article/pii/S0092867421005833#undfig1
 
Data converted from h5seurat to h5ad by [this notebook](./100k_pbmcs_labeled_figure.ipynb)

In [2]:
pbmc_rna = sc.read('../../../sciviewer_data/pbmc_citeseq100K/multi.h5ad')
pbmc_protein = sc.read('../../../sciviewer_data/pbmc_citeseq100K/multi_ADT.h5ad')


This is where adjacency matrices should go now.
  warn(


In [3]:
pbmc_rna


AnnData object with n_obs × n_vars = 161764 × 20729
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT'
    var: 'features'
    uns: 'neighbors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'PCs', 'SPCA'
    obsp: 'distances'

In [4]:
pbmc_protein

AnnData object with n_obs × n_vars = 161764 × 228
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT'
    var: 'features'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'APCA'

In [5]:
pbmc_rna.X = pbmc_rna.raw.X
pbmc_rna.var = pbmc_rna.raw.var

In [6]:
pbmc_protein.X = pbmc_protein.raw.X
pbmc_protein.var = pbmc_protein.raw.var

In [7]:
sc.pp.normalize_per_cell(pbmc_rna, counts_per_cell_after=10000)
sc.pp.log1p(pbmc_rna)

In [8]:
sc.pp.normalize_per_cell(pbmc_protein, counts_per_cell_after=10000)
sc.pp.log1p(pbmc_protein)

In [9]:
merged_X = sparse.hstack([pbmc_protein.X, pbmc_rna.X], format='csc')

In [10]:
merged_X

<161764x20957 sparse matrix of type '<class 'numpy.float64'>'
	with 370535157 stored elements in Compressed Sparse Column format>

In [11]:
merged_var = pbmc_protein.raw.var.copy()
merged_var['_index'] = merged_var['_index'].apply(lambda x: 'Prot_' + x)
merged_var.index = merged_var['_index'].values
z = pbmc_rna.raw.var
z.index = z['_index'].values
merged_var = pd.concat([merged_var, z], axis=0)
merged_var.columns = ['features']
merged_var.head()

Unnamed: 0,features
Prot_CD39,Prot_CD39
Prot_Rat-IgG1-1,Prot_Rat-IgG1-1
Prot_CD107a,Prot_CD107a
Prot_CD62P,Prot_CD62P
Prot_TCR-2,Prot_TCR-2


In [12]:
pbmc_merged = sc.AnnData(X=merged_X, var=merged_var, obs=pbmc_rna.obs, obsm=pbmc_rna.obsm)

In [13]:
detected = np.array((pbmc_merged.X>0).sum(axis=0)).reshape(-1)

In [14]:
(detected<100).sum(), detected.shape[0], pbmc_merged.shape[0]

(3441, 20957, 161764)

In [15]:
pbmc_merged = pbmc_merged[:, detected>=100]

In [16]:
pbmc_merged

View of AnnData object with n_obs × n_vars = 161764 × 17516
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'n_counts'
    var: 'features'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'

In [17]:
sc.write('../../../sciviewer_data/pbmc_citeseq100K/CiteSeqPBMC160K_ProteinAndRNA_merged.h5ad', pbmc_merged, )

Trying to set attribute `.obs` of view, copying.
... storing 'orig.ident' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'lane' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'donor' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'time' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'celltype.l1' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'celltype.l2' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'celltype.l3' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'Phase' as categorical


In [18]:
pbmc_merged.X = sparse.csr.csr_matrix(pbmc_merged.X)

In [19]:
sample_ind = np.random.choice(pbmc_merged.X.shape[0], 50000)

In [20]:
pbmc_merged_sub50k = pbmc_merged[sample_ind, :]

In [21]:
pbmc_merged_sub50k

View of AnnData object with n_obs × n_vars = 50000 × 17516
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'n_counts'
    var: 'features'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'

In [22]:
pbmc_merged_sub50k.X = sparse.csc.csc_matrix(pbmc_merged_sub50k.X)

In [23]:
sc.write('../../../sciviewer_data/pbmc_citeseq100K/CiteSeqPBMC160K_ProteinAndRNA_merged_sub50k.h5ad', pbmc_merged_sub50k)