In [1]:
import pandas as pd
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import cell2cell as c2c
import os
from cell2cell.preprocessing import aggregate_single_cells 

In [2]:
# Scanpy verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.settings.verbosity = 3
# Print the versions of scanpy and its main dependencies
sc.logging.print_header()
# Figure defaults: 80 dpi on a white background
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.2 pynndescent==0.5.7


## 1. scRNA-seq Data Preprocessing

### 1.1 Download Data
```bash
# Download the data (GSE174188) with the following bash commands.
raw_data_dir=/home/qdai8/projects/Projects/STDCC/Data/RDA/SLE/raw
# Quote the expansions so the commands also work for paths containing spaces.
mkdir -p "${raw_data_dir}"
cd "${raw_data_dir}"
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174188/suppl/GSE174188_CLUES1_adjusted.h5ad.gz
# Decompress; gzip replaces the .gz archive with the .h5ad file.
gzip -d GSE174188_CLUES1_adjusted.h5ad.gz
```

### 1.2 Preprocessing

In [3]:
# Load the SLE dataset stored as an h5ad file.
# Expect roughly 25 GB of memory use and a loading time of several minutes.
data_dir = '/home/qdai8/projects/Projects/STDCC/Data/RDA/SLE/'
SLE_data_path = os.path.join(data_dir, 'raw/GSE174188_CLUES1_adjusted.h5ad')
data = sc.read_h5ad(SLE_data_path)

The data were cleaned, batch-corrected, clustered, and annotated as described in the dataset's original publication: https://www.science.org/doi/10.1126/science.abf1970. We extract the raw counts and the annotations for use in the following analysis.

#### Get raw counts

In [4]:
# Wrap the raw count matrix stored under data.raw in a new AnnData object
raw_data = ad.AnnData(X=data.raw.X)

The raw count data includes 1,263,676 cells and 32,738 genes

In [5]:
# Print the AnnData summary to check the dimensions of the raw count object.
print(raw_data)

AnnData object with n_obs × n_vars = 1263676 × 32738


#### Get annotations

Check the annotations available for each cell:

In [6]:
# List the per-cell annotation columns stored in data.obs.
data.obs.columns

Index(['batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov',
       'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status',
       'SLE_status'],
      dtype='object')

Keep the following annotations for each cell: sample ID (`ind_cov_batch_cov`), subject ID (`ind_cov`), processing batch (`Processing_Cohort`), age (`Age`), sex (`Sex`), ancestry (`pop_cov`), SLE status (`SLE_status`), and cell type (`cg_cov`):

In [7]:
# Copy the selected per-cell annotations onto the raw-count object
raw_data.obs = data.obs.loc[:, [
    'ind_cov_batch_cov', 'ind_cov', 'Processing_Cohort',
    'Age', 'Sex', 'pop_cov', 'SLE_status',
    'cg_cov',
]]

In [8]:
# Preview the first rows of the per-cell annotations attached to raw_data.
raw_data.obs.head()

Unnamed: 0,ind_cov_batch_cov,ind_cov,Processing_Cohort,Age,Sex,pop_cov,SLE_status,cg_cov
CAAGGCCAGTATCGAA-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,HC-546:dmx_YS-JY-22_pool6,HC-546,4.0,28.0,Female,Asian,Healthy,T4
CTAACTTCAATGAATG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,1132_1132:dmx_YS-JY-22_pool6,1132_1132,4.0,45.0,Female,European,SLE,cM
AAGTCTGGTCTACCTC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,FLARE006:dmx_AbFlare-3,FLARE006,3.0,34.0,Female,European,SLE,cM
GGCTCGATCGTTGACA-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0,1110_1110:dmx_YS-JY-20_pool3,1110_1110,4.0,71.0,Female,European,SLE,B
ACACCGGCACACAGAG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,1479_1479:dmx_YE110,1479_1479,2.0,28.0,Female,Asian,SLE,T4


Check the annotations available for each gene:

In [9]:
# Preview the first rows of the per-gene annotation table stored under data.raw.
data.raw.var.head()

Unnamed: 0,gene_ids,feature_types-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0
MIR1302-10,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
RP11-34P13.7,ENSG00000238009,Gene Expression
RP11-34P13.8,ENSG00000239945,Gene Expression


In [10]:
# Attach the per-gene annotations of data.raw to raw_data, whose count matrix
# was taken from the same data.raw object.
raw_data.var = data.raw.var

## 2. Aggregate single cell expression

### 2.1 Reduce the dataset to one sequencing sample per subject

The SLE scRNA-seq dataset comprises multiplexed scRNA-seq profiles of 264 PBMC samples and 91 technical replicates from 162 SLE patients and 99 healthy controls.

355 sequencing samples = 264 samples + 91 replicates:

In [11]:
# Count the distinct sequencing-sample identifiers (ind_cov_batch_cov).
len(raw_data.obs['ind_cov_batch_cov'].unique())

355

261 subjects = 162 SLE patients and 99 healthy controls:

In [12]:
# Count the distinct subject identifiers (ind_cov) within each SLE_status group.
raw_data.obs.groupby('SLE_status')['ind_cov'].nunique()

SLE_status
Healthy     99
SLE        162
Name: ind_cov, dtype: int64

We reduce the dataset to one sequencing sample per subject by keeping, for each subject, the sample with the largest number of cells.

In [13]:
# Collect the distinct subject (donor) identifiers
all_subjects = raw_data.obs['ind_cov'].unique()

In [14]:
def preprocess_gene_expression(subject, out_cols, h5_data=None, min_cells=3):
    """Build the gene-by-cell count matrix and metadata for one subject.

    If the subject has more than one sequencing sample
    (``ind_cov_batch_cov``), only the sample with the most cells is kept.

    Parameters
    ----------
    subject
        Subject identifier, matched against ``h5_data.obs['ind_cov']``.
    out_cols : list of str
        Columns of ``h5_data.obs`` reported as subject-level information.
    h5_data : AnnData-like, optional
        Object with a sparse count matrix ``.X`` (cells x genes), per-cell
        annotations ``.obs`` (columns ``ind_cov``, ``ind_cov_batch_cov``,
        ``cg_cov`` and ``out_cols``) and per-gene annotations
        ``.var['gene_ids']``. When omitted, the notebook-level ``raw_data``
        is used; it is looked up at call time so that a rebuilt ``raw_data``
        is picked up without re-running this definition.
    min_cells : int, default 3
        A gene is kept only if its count is > 0 in strictly more than
        ``min_cells`` cells of the selected sample.

    Returns
    -------
    filtered_df : pandas.DataFrame
        Sparse count matrix of the retained genes; rows are indexed by
        ``gene_ids`` and columns by cell barcode.
    subject_meta : pandas.DataFrame
        The ``cg_cov`` (cell type) column for the selected cells.
    subject_info : pandas.DataFrame
        De-duplicated ``out_cols`` rows of the selected cells.

    Raises
    ------
    ValueError
        If no cell in ``h5_data`` belongs to ``subject``.
    """
    if h5_data is None:
        # Resolve the default here rather than in the signature: a signature
        # default would be frozen to whatever object `raw_data` referred to
        # when this cell was executed.
        h5_data = raw_data

    # get all samples for one subject
    subject_mask = h5_data.obs['ind_cov'] == subject
    if not subject_mask.any():
        raise ValueError("No cells found for subject {!r}".format(subject))
    subject_adata = h5_data[subject_mask].copy()

    # For a subject with more than 1 sample, select the sample with most cells
    # (value_counts sorts by descending count, so the first label is the largest)
    sample_ids = subject_adata.obs['ind_cov_batch_cov']
    if sample_ids.nunique() > 1:
        select_sample = sample_ids.value_counts().index[0]
        subject_adata = subject_adata[sample_ids == select_sample]

    # extract expression matrix (count data); transpose to genes x cells
    df = pd.DataFrame.sparse.from_spmatrix(subject_adata.X).T
    ## columns: cell barcodes
    df.columns = subject_adata.obs.index.values
    ## rows: genes
    df = df.set_index(subject_adata.var['gene_ids'])
    ## filter genes by number of expressing cells (strictly more than min_cells)
    n_expressing_cells = (df > 0).sum(axis=1)
    filtered_df = df.loc[n_expressing_cells > min_cells]

    # extract subject-level information
    subject_info = subject_adata.obs[out_cols].drop_duplicates().reset_index(drop=True)

    # extract cell types
    subject_meta = subject_adata.obs[['cg_cov']]

    return filtered_df, subject_meta, subject_info

### 2.2 Aggregate single-cell expression into cell-type-based expression matrices

Create the output directory and the output file for subject-level information:

In [15]:
# Create the directory that receives the per-subject expression matrices:
# DataFrame.to_csv does not create missing parent directories, so writing the
# matrices would otherwise fail with FileNotFoundError.
os.makedirs(data_dir + 'CellnnFraction_Validation/', exist_ok=True)

# Columns of the subject-level information table
out_cols = ['ind_cov', 'Processing_Cohort',
            'Age', 'Sex', 'pop_cov', 'SLE_status']

# Write only the header row; mode='w' truncates any file left over from a
# previous run, and the per-subject loop appends its records to this file.
out_df = pd.DataFrame(columns=out_cols)
out_df.to_csv(data_dir + 'subject_info.txt',
              index=None,
              sep='\t',
              header=True,
              mode='w')

In [16]:
# For every subject: build the filtered count matrix, aggregate it per cell
# type, save the aggregated matrix, and append the subject-level record.
for subject in tqdm(all_subjects):
    filtered_df, celltype_meta, subject_info = preprocess_gene_expression(subject, out_cols)

    # Collapse single-cell expression into one profile per cell type
    # NOTE(review): 'nn_cell_fraction' is presumably the fraction of cells with
    # non-zero counts in each cell type - confirm against the cell2cell docs.
    avg_df = aggregate_single_cells(
        rnaseq_data=filtered_df,
        metadata=celltype_meta,
        barcode_col='index',
        celltype_col='cg_cov',
        method='nn_cell_fraction',
        transposed=False,
    )

    # One compressed CSV per subject
    matrix_path = data_dir + 'CellnnFraction_Validation/{}.CellnnFraction.csv.gz'.format(subject)
    avg_df.to_csv(matrix_path)

    # Append this subject's record (no header) to the shared table
    info_path = data_dir + 'subject_info.txt'
    subject_info.to_csv(info_path, index=None, sep='\t', header=False, mode='a')

  df.reset_index(inplace=True)
100%|███████████████████████████████████████| 261/261 [2:41:45<00:00, 37.19s/it]
