### Zebrafish example

This notebook prepares a massive zebrafish sci-RNA-seq dataset with a densely sampled time series and 20 knockout experiments, from [Saunders et al. 2022](https://www.biorxiv.org/content/10.1101/2022.08.04.502764v1). Here we preprocess the dataset and carry out a simple exploration in scanpy. We will:

- add specific metadata needed by our benchmarking framework
- check to see if KO'd genes have lower transcript levels
- check for low quality cells
- make metacells or otherwise aggregate
- normalize and log-transform 
- exclude low-expressed genes and select HVG
- visualize


In [1]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import gc
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML

import os, sys
import itertools as it
from scipy.stats import spearmanr, pearsonr, rankdata, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# local
import importlib
import sys
sys.path.append("setup")
import ingestion
importlib.reload(ingestion)

# Visualization settings
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# Specify the working directory explicitly; every relative path below is resolved against it.
# Set the PERTURBATION_DATA_DIR environment variable to run this notebook on another machine.
# The default is the location used when the notebook was originally written.
os.chdir(
    os.environ.get(
        "PERTURBATION_DATA_DIR",
        "/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data",
    )
)

# Universal
# NOTE(review): these accessory files are human resources and are not used in the visible cells
# of this zebrafish notebook -- confirm whether they are still needed here.
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath = "../accessory_data/humanTFs.csv"                               # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 

# Destination folder for the finished dataset.
finalDataFileFolder = "perturbations/saunders"

### Combine into anndata to keep everything together

In [2]:
# Load the reference ("train") data. A cached .h5ad is used when available; otherwise the
# AnnData object is rebuilt from the GEO matrix + metadata files and cached for next time.
expression_quantified = {}
train_cache_path = os.path.join("not_ready/saunders", "train_working.h5ad")
try:
    expression_quantified["train"] = sc.read_h5ad(train_cache_path)
except OSError:
    # Only a missing/unreadable cache triggers the rebuild. A bare `except:` would also swallow
    # KeyboardInterrupt and genuine bugs, silently starting a very expensive re-import.
    # The matrix is transposed on load -- presumably it is stored genes x cells; TODO confirm.
    expression_quantified["train"] = anndata.AnnData(sc.read_mtx("not_ready/saunders/GSE202639_reference_raw_counts.mtx.gz").T, dtype=np.float32)
    # NOTE(review): read_csv assigns a default integer index, so .var.index is not gene names here,
    # yet a later cell looks up perturbed gene names in .var.index -- confirm how var_names get set.
    expression_quantified["train"].var = pd.read_csv("not_ready/saunders/GSE202639_reference_gene_metadata.csv.gz")
    expression_quantified["train"].obs = pd.read_csv("not_ready/saunders/GSE202639_reference_cell_metadata.csv.gz")
    # This creates the final output folder; the cache itself goes to not_ready/saunders,
    # which must already exist because the raw files were just read from it.
    os.makedirs(finalDataFileFolder, exist_ok = True)
    expression_quantified["train"].write_h5ad(train_cache_path)

In [3]:

# Load the perturbation ("test") data. A cached .h5ad is used when available; otherwise the
# AnnData object is rebuilt from the GEO matrix + metadata files and cached for next time.
test_cache_path = os.path.join("not_ready/saunders", "test_working.h5ad")
try:
    expression_quantified["test"] = sc.read_h5ad(test_cache_path)
except OSError:
    # Only a missing/unreadable cache triggers the rebuild. A bare `except:` would also swallow
    # KeyboardInterrupt and genuine bugs, silently starting a very expensive re-import.
    # The matrix is transposed on load -- presumably it is stored genes x cells; TODO confirm.
    expression_quantified["test"] = anndata.AnnData(sc.read_mtx("not_ready/saunders/GSE202639_zperturb_full_raw_counts.mtx.gz").T, dtype=np.float32)
    # NOTE(review): read_csv assigns a default integer index, so .var.index is not gene names here,
    # yet a later cell looks up perturbed gene names in .var.index -- confirm how var_names get set.
    expression_quantified["test"].var = pd.read_csv("not_ready/saunders/GSE202639_zperturb_full_gene_metadata.csv.gz")
    expression_quantified["test"].obs = pd.read_csv("not_ready/saunders/GSE202639_zperturb_full_cell_metadata.csv.gz")
    # This creates the final output folder; the cache itself goes to not_ready/saunders,
    # which must already exist because the raw files were just read from it.
    os.makedirs(finalDataFileFolder, exist_ok = True)
    expression_quantified["test"].write_h5ad(test_cache_path)

In [None]:
# Every cell in the "train" set is labelled as a control.
train_obs = expression_quantified["train"].obs
train_obs["perturbation"] = "control"
train_obs["is_control"] = True

# Derive perturbation labels for the "test" set from `gene_target`:
#   "a-b"      -> "a,b"      (hyphens become commas)
#   "ctrl,"    -> ""         (the control prefix is dropped)
#   "inj"      -> "control"  (so "ctrl-inj" ends up as "control")
# NOTE(review): the last substitution hits any label containing "inj", not only "ctrl-inj" --
# confirm no other gene_target value contains that substring.
gene_targets = expression_quantified["test"].obs["gene_target"]
perturbation_labels = gene_targets.str.replace("-", ",")
perturbation_labels = perturbation_labels.str.replace("ctrl,", "")
perturbation_labels = perturbation_labels.str.replace("inj", "control")
expression_quantified["test"].obs["perturbation"] = perturbation_labels
expression_quantified["test"].obs["is_control"] = [target == "ctrl-inj" for target in gene_targets]

for t in ("train", "test"):
    # Numeric copy of the control flag, plus string-typed cell names.
    control_flags = expression_quantified[t].obs["is_control"]
    expression_quantified[t].obs["is_control_int"] = [float(flag) for flag in control_flags]
    expression_quantified[t].obs_names = list(map(str, expression_quantified[t].obs_names))


### Remove low-quality cells

We remove unhashed cells where the timepoint or embryo is unknown, as well as cells with no `cell_type_sub` annotation. 
The data are already filtered on depth and mitochondrial content.

In [None]:
# Keep only cells for which all of these annotations are present.
required_annotations = ["timepoint", "embryo", "cell_type_sub"]
for t in ("train", "test"):
    has_all_annotations = pd.notnull(expression_quantified[t].obs[required_annotations]).all(axis=1)
    expression_quantified[t] = expression_quantified[t][has_all_annotations, :]
    # Optional QC plot of depth versus mitochondrial content (disabled):
    # sns.jointplot(
    #     x = np.log10(expression_quantified[t].obs["n.umi"]),
    #     y = expression_quantified[t].obs["perc_mitochondrial_umis"], 
    #     kind = "hex", 
    #     cmap = "Blues"
    # )


### Aggregate and normalize

In [None]:
# Sum cells into pseudo-bulk profiles (one per perturbation x cell type x embryo), normalize,
# and save. Previously this cell aggregated only a debugging slice (`[1:100,:]`, which also skips
# the first cell), discarded the result, and wrote the *un-aggregated* data to `*_aggregated.h5ad`.
for t in ("train", "test"):
    aggregated = ingestion.aggregate_by_perturbation(
        adata = expression_quantified[t],
        group_by=["perturbation", "cell_type_sub", "embryo"],
        use_raw = False,
    )
    print(f"done {t}")
    # Keep the un-normalized aggregate in .raw before overwriting .X with normalized values.
    aggregated.raw = aggregated.copy()
    # The helper is fed the transposed matrix and its output is transposed back --
    # presumably it expects genes x samples; TODO confirm against ingestion.deseq2Normalization.
    aggregated.X = ingestion.deseq2Normalization(aggregated.X.T).T
    expression_quantified[t] = aggregated
    expression_quantified[t].write_h5ad(os.path.join("not_ready/saunders", f"{t}_aggregated.h5ad"))

grouping
mapping groups to cells
summing
done train
grouping
mapping groups to cells
summing


KeyboardInterrupt: 

: 

### Check on the effects of the perturbations

In [None]:
# Check whether each knockout's targeted transcript is lower than in the controls.
# If verbose is set to True, display discordant trials and their controls.
# Grouping uses "embryo", the replicate identifier used for aggregation and for the correlation
# check in this notebook; the previous value "donor" is not a column referenced anywhere else here.
# NOTE(review): confirm against ingestion.checkConsistency that `group` names an .obs column.
status, logFC = ingestion.checkConsistency(expression_quantified["test"], 
                                           perturbationType="knockout", 
                                           group="embryo",
                                           verbose=False) 
expression_quantified["test"].obs["consistentW/Perturbation"] = status
expression_quantified["test"].obs["logFC"] = logFC
Counter(status)

ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
# Correlations computed on the perturbation ("test") set, grouped by embryo.
# `expression_quantified` is a dict of AnnData objects, so the results must be stored on the
# "test" entry's .obs (the dict itself has no .obs attribute).
correlations = ingestion.computeCorrelation(expression_quantified["test"], verbose=True, group="embryo")
expression_quantified["test"].obs["spearmanCorr"] = correlations[0]
expression_quantified["test"].obs[ "pearsonCorr"] = correlations[1]

### Gene selection, per-gene dispersion ranking, and basic EDA

In [None]:
# Per-gene mean count in the perturbation ("test") set.
# `expression_quantified` is a dict, so index the "test" AnnData explicitly.
test_adata = expression_quantified["test"]
# Prefer the un-normalized values kept in .raw; fall back to .X when .raw was never set.
# NOTE(review): assumes .raw has the same genes as .var (no gene subsetting since .raw was set).
count_matrix = test_adata.raw.X if test_adata.raw is not None else test_adata.X
# Genes are columns, so average over axis 0 (axis 1 would give one value per cell).
# np.asarray(...).ravel() flattens the 1 x n_genes matrix that sparse `.mean` returns.
test_adata.var["mean_umi_count"] = np.asarray(count_matrix.mean(axis = 0)).ravel()
sns.histplot(test_adata.var["mean_umi_count"])


In [None]:
# Gene ranking, log-transform and low-dimensional embedding of the perturbation ("test") set.
# `expression_quantified` is a dict, so the scanpy calls must receive the "test" AnnData.
# NOTE(review): apply the same steps to expression_quantified["train"] if it is needed downstream.
test_adata = expression_quantified["test"]
# flavor="seurat_v3" expects count data rather than log-transformed data, so rank genes before log1p.
# n_top_genes equals the number of genes: nothing is filtered, every gene just receives a rank.
# NOTE(review): if .X was normalized upstream, consider passing the raw counts via `layer=`.
sc.pp.highly_variable_genes(test_adata, flavor = "seurat_v3", n_top_genes=test_adata.shape[1])
sc.pp.log1p(test_adata)
sc.pl.highly_variable_genes(test_adata)
with warnings.catch_warnings():
    # Without a filter, catch_warnings() alone suppresses nothing.
    warnings.simplefilter("ignore")
    sc.tl.pca(test_adata, n_comps=5)
sc.pp.neighbors(test_adata)
sc.tl.umap(test_adata)
clusterResolutions = []


In [None]:
# UMAP of the perturbation ("test") set, coloured by the available sample annotations.
# `expression_quantified` is a dict, so the "test" AnnData is passed explicitly.
test_adata = expression_quantified["test"]
umap_annotations = ["embryo", "timepoint", "perturbation", "cell_type", "cell_type_sub"]
# Only colour by annotations that actually exist in .obs, and report the ones that are skipped.
available_annotations = [a for a in umap_annotations if a in test_adata.obs.columns]
missing_annotations = [a for a in umap_annotations if a not in test_adata.obs.columns]
if missing_annotations:
    print(f"Skipping annotations not found in .obs: {missing_annotations}")
sc.pl.umap(test_adata, color = available_annotations or None)

In [None]:
# Work out which knocked-out genes are present among the measured genes.
# `expression_quantified` is a dict, so the "test" AnnData is indexed explicitly.
controls = ["control"]
test_adata = expression_quantified["test"]
# Perturbation labels join multiple targets with commas (hyphens in `gene_target` were replaced
# by commas earlier in this notebook), so each label is split into its individual genes.
perturbed_genes = set()
for perturbation_label in test_adata.obs['perturbation'].dropna().unique():
    perturbed_genes.update(str(perturbation_label).split(","))
perturbed_genes = perturbed_genes.difference(controls)
# NOTE(review): this relies on .var.index holding gene names -- confirm.
perturbed_and_measured_genes = perturbed_genes.intersection(test_adata.var.index)
perturbed_but_not_measured_genes = perturbed_genes.difference(test_adata.var.index)
print("These genes were perturbed but not measured:")
print(perturbed_but_not_measured_genes)

In [None]:
# Final form of the perturbation ("test") set.
# `expression_quantified` is a dict: the annotations belong on the "test" AnnData, and only that
# entry (not the whole dict) is passed to and replaced by the helper's return value.
# Sorted lists keep the stored order reproducible across runs (set order is not).
expression_quantified["test"].uns["perturbed_and_measured_genes"]     = sorted(perturbed_and_measured_genes)
expression_quantified["test"].uns["perturbed_but_not_measured_genes"] = sorted(perturbed_but_not_measured_genes)
expression_quantified["test"] = ingestion.describe_perturbation_effect(expression_quantified["test"], "knockout")
# NOTE(review): nothing is written to finalDataFileFolder in the visible cells -- confirm the save step.