
Author: Erno Hänninen

Created: 12.12.2022

Title: Concat_Zhou_Herb.ipynb

Description:
- Notebook that merges the output of AnnotateHerbData.ipynb and AnnotateZhouData.ipynb scripts
- Additional processing is done where remaining low-quality cells and cell doublets are removed

Procedure
- Read the files to be merged (output from AnnotateHerbData.ipynb and AnnotateZhouData.ipynb scripts)
- Harmonize cell type names to be consistent between the two datasets
- Merge herb and zhou dataset
- Perform additional data filtering
    - Remove mito and ribo genes
    - Remove remaining low quality cells and cell doublets
- Save output

Python packages:
- scanpy, matplotlib, pandas, numpy, scrublet
    
Usage:
- This script is launched and parameterized from the pipeline (data_processing_wf.nf)


### Read data and delete unnecessary columns

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd

In [183]:
# Read the annotated Herb and Zhou datasets.
# NOTE(review): herb_path and zhou_path are not defined in this notebook; presumably
# they are injected when the pipeline parameterizes the run - confirm.
herb_adata = sc.read(herb_path)
zhou_adata = sc.read(zhou_path)

In [None]:
# Small adjustments to both data objects so that later scanpy functions do not error out.

# Give both objects an explicit log1p entry in .uns
zhou_adata.uns["log1p"] = {"base": None}
herb_adata.uns["log1p"] = {"base": None}

# Drop the per-cell metadata columns that are not needed from the Herb object
for column in ("orig.ident", "nCount_RNA", "nFeature_RNA", "percent.mt"):
    del herb_adata.obs[column]

# Index the Herb gene table by its "features" column
herb_adata.var = herb_adata.var.set_index("features")

### Harmonize the cell names

In [None]:
# Rename Zhou cell-type labels to the shared vocabulary; labels without an
# entry in the mapping are passed through unchanged.
zhou_label_map = {"OL": "Oligo"}
zhou_labels = zhou_adata.obs["Cell_types"].map(
    lambda label: zhou_label_map.get(label, label)
)
zhou_adata.obs['Cell_types_2'] = zhou_labels.astype("category")

In [None]:
# Rename Herb cell-type labels to the shared vocabulary; labels without an
# entry in the mapping are passed through unchanged.
# NOTE(review): "Pericytes_1" maps to "VLMC" while "Pericytes_2" maps to
# "Pericyte" - confirm this asymmetry is intentional.
herb_label_map = {
    "Oligodendrocyte Progenitors_1": "OPC",
    "Oligodendrocyte Progenitors_2": "OPC",
    "Oligodendrocytes [Immature]": "Oligo",
    "Oligodendrocytes [Maturing]": "Oligo",
    "Oligodendrocytes [Dividing]": "Oligo",
    "Oligodendrocytes [Mature]": "Oligo",
    "vSMC": "Mural",
    "Ependymal": "Ependy",
    "Neural Progenitors_1": "NP",
    "Neural Progenitors_2": "NP",
    "Neurons": "Neuron",
    "Astrocyte Progenitors": "Astrocyte",
    "Astrocytes": "Astrocyte",
    "Endothelial [Venous]": "Endoth",
    "Endothelial [Arterial_2]": "Endoth",
    "Endothelial [Arterial_1]": "Endoth",
    "Pericytes_1": "VLMC",
    "Pericytes_2": "Pericyte",
}
herb_labels = herb_adata.obs["Cell_types"].map(
    lambda label: herb_label_map.get(label, label)
)
herb_adata.obs['Cell_types_2'] = herb_labels.astype("category")

### Merge zhou and herb

In [None]:
# Append the Herb cells to the Zhou object using an outer join (join="outer")
merged_adata = zhou_adata.concatenate(herb_adata, batch_key=None, join="outer")
merged_adata.var_names_make_unique()
# Keep a copy of the matrix as it is right after merging; later cells refer to
# this layer as the raw counts
merged_adata.layers["counts"] = merged_adata.X.copy()
# Display a summary of the merged object as the cell output
merged_adata

### Process the data

In [None]:
# Compute QC metrics, annotating genes whose name starts with "MT-" as the
# mitochondrial group 'mt'
is_mito = merged_adata.var_names.str.startswith('MT-')
merged_adata.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(
    merged_adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Plot QC overview before filtering: three violin plots (genes per cell, total
# counts, mitochondrial percentage) and two scatter plots, side by side.
# The figure handle is named `fig`, matching the identical plot drawn after gene removal.
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5, figsize=(20, 4), gridspec_kw={'wspace': 0.9})
ax1_dict = sc.pl.violin(merged_adata, ['n_genes_by_counts'], jitter=0.4, show=False, ax=ax1)
ax2_dict = sc.pl.violin(merged_adata, ['total_counts'], jitter=0.4, show=False, ax=ax2)
ax3_dict = sc.pl.violin(merged_adata, ['pct_counts_mt'], jitter=0.4, show=False, ax=ax3)
ax4_dict = sc.pl.scatter(merged_adata, x='total_counts', y='pct_counts_mt', show=False, ax=ax4)
ax5_dict = sc.pl.scatter(merged_adata, x='total_counts', y='n_genes_by_counts', show=False, ax=ax5)

In [None]:
# Filter the outliers away: apply a minimum of 500 genes and a minimum of
# 1100 counts per cell. Both filters must act on merged_adata; the second call
# previously referenced an undefined name `adata`.
sc.pp.filter_cells(merged_adata, min_genes=500)
sc.pp.filter_cells(merged_adata, min_counts=1100)

In [None]:
# Check whether mito / ribo genes are highly expressed by plotting the
# top 20 genes (n_top=20) of the merged data
sc.pl.highest_expr_genes(merged_adata, n_top=20)

In [None]:
# Since mito / ribo genes are removed from the Zhou data, it is good to remove
# them from the Herb data as well
import numpy as np

# Boolean masks over the genes: names starting with RPL / RPS (ribo) or MT- (mito)
ribo = merged_adata.var_names.str.startswith(('RPL', "RPS"))
mito = merged_adata.var_names.str.startswith('MT-')
remove = np.logical_or(mito, ribo)
keep = np.logical_not(remove)

# Subsetting returns a view; copy it so that the following cells, which modify
# merged_adata in place, operate on a standalone object
merged_adata = merged_adata[:, keep].copy()

In [None]:
# Recompute the QC metrics on the gene-filtered data and redraw the same
# five-panel overview (three violin plots, two scatter plots)
sc.pp.calculate_qc_metrics(
    merged_adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(20, 4), gridspec_kw={'wspace': 0.9})
ax1, ax2, ax3, ax4, ax5 = axes

# Per-cell distributions
ax1_dict = sc.pl.violin(merged_adata, keys=['n_genes_by_counts'], jitter=0.4, ax=ax1, show=False)
ax2_dict = sc.pl.violin(merged_adata, keys=['total_counts'], jitter=0.4, ax=ax2, show=False)
ax3_dict = sc.pl.violin(merged_adata, keys=['pct_counts_mt'], jitter=0.4, ax=ax3, show=False)

# Pairwise relations against the total counts
ax4_dict = sc.pl.scatter(merged_adata, x='total_counts', y='pct_counts_mt', ax=ax4, show=False)
ax5_dict = sc.pl.scatter(merged_adata, x='total_counts', y='n_genes_by_counts', ax=ax5, show=False)

In [None]:
# Normalize the data: scale with target_sum=1e4, then apply log1p.
# The raw counts are stored in merged_adata.layers["counts"].
sc.pp.normalize_total(merged_adata, target_sum=1e4)
sc.pp.log1p(merged_adata)

In [None]:
# Find and remove cell doublets with Scrublet, run separately for each sample

# Work on a copy whose X is replaced by the matrix stored in layers["counts"],
# so that merged_adata.X itself is left untouched
doublet_adata = merged_adata.copy()
doublet_adata.X = doublet_adata.layers["counts"].copy()

import scrublet as scr

# Split per batch (the categories of obs['sample']) and score each batch on its own
batches = doublet_adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
    tmp = doublet_adata[doublet_adata.obs['sample'] == batch,]
    print(batch, ":", tmp.shape[0], " cells")
    scrub = scr.Scrublet(tmp.X)
    out = scrub.scrub_doublets(verbose=False, n_prin_comps=20)
    # out[0] holds the doublet scores, out[1] the doublet calls; index by cell name
    alldata[batch] = pd.DataFrame(
        {'doublet_score': out[0], 'predicted_doublets': out[1]},
        index=tmp.obs.index,
    )
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")

# Add predictions to the merged_adata object; the assignment aligns on the cell index
scrub_pred = pd.concat(alldata.values())
merged_adata.obs['doublet_scores'] = scrub_pred['doublet_score']
merged_adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets']
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(merged_adata.obs["predicted_doublets"].value_counts())

# Remove doublets. The comparison `!= True` is kept on purpose so that any cell
# without a prediction is retained rather than dropped.
# Copy the subset so the column deletion below acts on a real object, not a view.
merged_adata = merged_adata[merged_adata.obs['predicted_doublets'] != True, :].copy()  # noqa: E712
del merged_adata.obs["predicted_doublets"]
print(merged_adata.shape)

In [None]:
# Store a copy of the processed object in .raw, then save the result
merged_adata.raw = merged_adata.copy()
merged_adata.write("merged_zhou_herb.h5ad") # Write the data to a fixed file name