### K562 overexpression example

This notebook prepares a dataset from a technology-development paper that reports a large number of Perturb-seq experiments, mostly in K562 cells ([Replogle et al 2020](https://www.nature.com/articles/s41587-020-0470-y)). We'll focus on just the CRISPRa multiplexing experiment. 

Here we tidy the dataset and carry out a simple exploration in scanpy.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import celloracle as co
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML
# local
import importlib
import sys
sys.path.append("setup")
import ingestion
importlib.reload(ingestion)

#      visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# Specify the working directory explicitly.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbations/")

### How many TFs are perturbed?

In [2]:
# Tally, per experiment, how many distinct genes were perturbed and how many
# of those appear in the human TF list.
dataset_name = "replogle"
human_tfs = pd.read_csv("../accessory_data/humanTFs.csv")
# Built as one chain (no in-place mutation) so re-running the cell always
# yields the same table.
replogle_perturbations = (
    pd.read_csv(f"not_ready/{dataset_name}/perturbed_genes.csv")
    .drop_duplicates()
    .assign(is_tf = lambda perturbations: perturbations["gene"].isin(human_tfs["HGNC symbol"]))
)
perturbations_by_experiment = replogle_perturbations.groupby("experiment")
display(perturbations_by_experiment.count()) #total
# Sum only the boolean flag. Summing the whole frame depends on pandas
# silently dropping the string `gene` column, which newer pandas no longer does.
display(perturbations_by_experiment[["is_tf"]].sum()) #tf only
# Every gene perturbed in the CRISPRa multiplex experiment (TFs and non-TFs alike).
replogle_perturbations.query("experiment=='CRISPRa multiplex'")["gene"].unique()

Unnamed: 0_level_0,gene,is_tf
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1
CRISPRa multiplex,63,63
CRISPRi multiplex,104,104
IPSC,19,19
Interaction,41,42
UPR validation,30,30


Unnamed: 0_level_0,is_tf
experiment,Unnamed: 1_level_1
CRISPRa multiplex,39
CRISPRi multiplex,16
IPSC,5
Interaction,4
UPR validation,2


array(['CDKN1A', 'CDKN1B', 'CEBPB', 'CERKL', 'COL2A1', 'EOMES', 'ETS2',
       'FOXC1', 'HHEX', 'HIC2', 'HOXD13', 'IGDCC3', 'IRF1', 'KLF1',
       'LHX4', 'MAP2K3', 'MAP2K6', 'MAPK1', 'MEIS2', 'NEUROD1', 'NPM1',
       'PRTG', 'PTPN1', 'SET', 'SPI1', 'TIPARP', 'TLE3', 'TTC6', 'ZNF296',
       'AIRE', 'ASCL1', 'ATF4', 'ATF5', 'BACH2', 'CCDC51', 'COX5A',
       'CREB1', 'DDIT3', 'DDIT4', 'EIF2S1', 'FOXP3', 'GATA3', 'HIF1A',
       'HSF1', 'HSPA9', 'HSPD1', 'MTOR', 'NEUROG2', 'NRF1', 'PISD',
       'PPARGC1A', 'RANBP1', 'RORA', 'RPL30', 'RPS18', 'SREBF1', 'TBX21',
       'TFAP4', 'TIMM50', 'TMA7', 'UPF1', 'ZNF236', 'ZNF574'],
      dtype=object)

### Load expression data & set up cell metadata

In [None]:
# Load the 10x-format matrix stored in the GSM4367986_exp8 directory.
tenx_directory = f"not_ready/{dataset_name}/GSM4367986_exp8/"
expression_quantified = sc.read_10x_mtx(tenx_directory)

In [None]:
# QC metrics are written into the AnnData object in place.
sc.pp.calculate_qc_metrics(expression_quantified, inplace = True)

# Guide-identity table, indexed by cell barcode (the barcode column is kept too).
cell_identities_path = f"not_ready/{dataset_name}/GSM4367986_exp8/cell_identities.csv.gz"
cell_metadata = pd.read_csv(cell_identities_path).set_index("cell_barcode", drop = False)

# Translate each guide target to a gene symbol and flag the ones in the TF list.
target_symbols = ingestion.convert_ens_to_symbol(
    cell_metadata["target"],
    gtf = "../accessory_data/gencode.v35.annotation.gtf.gz",
    strip_version = True)
cell_metadata["target_symbol"] = target_symbols
cell_metadata["target_is_tf"] = cell_metadata["target_symbol"].isin(human_tfs["HGNC symbol"]).astype("int")

# Left join on the barcode index: cells absent from the guide table keep
# their row and get missing values in the metadata columns.
expression_quantified.obs = expression_quantified.obs.merge(
    cell_metadata,
    how = "left",
    left_index = True,
    right_index = True)

# 1 when the cell matched a row of the guide table (non-null `coverage`), else 0.
expression_quantified.obs["has_guide_annotations"] = expression_quantified.obs["coverage"].notna().astype("int")
expression_quantified.obs["good_coverage"] = expression_quantified.obs["good_coverage"].astype("str")

### How many cells do we have for each target?

In [None]:
# Count annotated cells per target symbol, smallest group first.
n_cells_per_target = (
    expression_quantified.obs
    .groupby("target_symbol")[["cell_barcode"]]
    .count()
    .sort_values("cell_barcode")
    .rename(columns = {"cell_barcode": "n_cells"})
)
n_cells_per_target["is_tf"] = n_cells_per_target.index.isin(human_tfs["HGNC symbol"])
n_cells_per_target = n_cells_per_target.reset_index()
display(n_cells_per_target)

# Taller figure so each target gets its own readable bar.
plt.rcParams['figure.figsize'] = [6, 9]
cells_per_target_plot = sns.barplot(data = n_cells_per_target, x = "n_cells", y = "target_symbol", hue = "is_tf")
cells_per_target_plot.set_title("CRISPRa in K562")

### Convert Ensembl gene IDs to gene symbols

In [None]:
# Replace the gene identifiers used as var_names with gene symbols,
# then show the first few entries as a sanity check.
# NOTE(review): unlike the guide-target conversion, no `strip_version` is
# passed here -- confirm these identifiers carry no version suffix.
gene_symbols = ingestion.convert_ens_to_symbol(
    expression_quantified.var_names,
    gtf = "../accessory_data/gencode.v35.annotation.gtf.gz",
)
expression_quantified.var_names = gene_symbols
display(expression_quantified.var.head())
display(expression_quantified.var_names[:5])

### Fill `perturbation` & `is_control`

In [None]:
# Target labels that denote control cells.
controls = ["Non-Targeting"]
target_symbol = expression_quantified.obs["target_symbol"]
# `perturbation` mirrors the target symbol; `is_control` is 1 for control labels, 0 otherwise.
expression_quantified.obs["perturbation"] = target_symbol
expression_quantified.obs["is_control"] = target_symbol.isin(controls).astype(int)

### QC filters

In [None]:
# QC thresholds: keep cells with enough total counts whose counts are not
# dominated by their 50 most-expressed genes.
MIN_TOTAL_COUNTS = 2000
MAX_PCT_COUNTS_IN_TOP_50_GENES = 50
passes_qc = (
    (expression_quantified.obs["total_counts"] >= MIN_TOTAL_COUNTS)
    & (expression_quantified.obs["pct_counts_in_top_50_genes"] <= MAX_PCT_COUNTS_IN_TOP_50_GENES)
)
# Subsetting an AnnData returns a view. Copy it so the in-place scanpy steps
# that follow operate on a real object rather than on a view of a view.
expression_quantified = expression_quantified[passes_qc, :].copy()
expression_quantified

### Basic EDA

In [None]:
# Standard scanpy workflow; statement order matters because each step reads
# what the previous one wrote into the AnnData object.
# Normalize each cell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(expression_quantified, target_sum=1e4)
sc.pp.log1p(expression_quantified)
# Flag highly variable genes (stored in .var['highly_variable']) and plot the selection.
sc.pp.highly_variable_genes(expression_quantified, min_mean=0.2, max_mean=4, min_disp=0.2, n_bins=50)
sc.pl.highly_variable_genes(expression_quantified)
with warnings.catch_warnings():
    # catch_warnings() alone only saves and restores the filter state; the
    # filter has to be set inside the block for anything to be silenced.
    warnings.simplefilter("ignore")
    sc.tl.pca(expression_quantified, n_comps=100)
# Neighbor graph -> UMAP embedding -> Leiden clusters.
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
# NOTE(review): never read in the cells visible here -- confirm nothing else
# needs it before removing.
clusterResolutions = []
sc.tl.leiden(expression_quantified)
# Cell-cycle scoring.
# Assumes the first 43 entries of the gene list are S-phase genes and the
# rest are G2/M genes -- TODO confirm against the file.
cc_genes = pd.read_csv("../accessory_data/regev_lab_cell_cycle_genes.txt", header = None)[0]
sc.tl.score_genes_cell_cycle(expression_quantified, s_genes=cc_genes[:43], g2m_genes=cc_genes[43:])

In [None]:
# Back to the default figure size for the UMAP panels.
plt.rcParams['figure.figsize'] = [6, 4.5]
# One UMAP panel per entry: a marker gene, cluster labels, perturbation
# labels, cell-cycle scores, and QC covariates.
umap_color_keys = [
    "PTPRC",
    "leiden",
    "is_control",
    "perturbation",
    'S_score',
    'G2M_score',
    'phase',
    'total_counts',
    'log1p_total_counts',
    'pct_counts_in_top_50_genes',
    'has_guide_annotations',
]
sc.pl.umap(expression_quantified, color = umap_color_keys)

# CellOracle (CO) will be asked to use a single cluster. That requires some
# otherwise undocumented object state: a categorical cluster column plus a
# matching `<column>_colors` entry in .uns.
expression_quantified.obs["fake_cluster"] = "all_one_cluster"
expression_quantified.obs["fake_cluster"] = expression_quantified.obs["fake_cluster"].astype("category")
expression_quantified.uns["fake_cluster_colors"] = ['#1f77b4']

### Data reduction

With only 64GB of RAM, I have been unable to make whole-transcriptome predictions with CellOracle. A data reduction step is necessary where only highly variable genes are included. We also keep all genes that are perturbed, whether or not they appear highly variable -- unless they are unavailable in the first place. 

In [None]:
# Choose the genes to retain: every highly variable gene plus every perturbed
# gene that is present in the expression matrix.
# Cells without a guide annotation have a missing `perturbation` (the metadata
# was left-joined); drop those first so NaN is not reported and stored as a gene.
perturbed_genes = set(expression_quantified.obs['perturbation'].dropna().unique()).difference(controls)
perturbed_and_measured_genes = perturbed_genes.intersection(expression_quantified.var.index)
perturbed_but_not_measured_genes = perturbed_genes.difference(expression_quantified.var.index)
genes_keep = expression_quantified.var.index[expression_quantified.var['highly_variable']]
genes_keep = set(genes_keep).union(perturbed_and_measured_genes)
# Keep the full-transcriptome object; the reduced object is derived from it.
expression_quantified_orig = expression_quantified.copy()
print("These genes were perturbed:")
print(perturbed_genes)
print("These genes were perturbed but not measured:")
print(perturbed_but_not_measured_genes)
print("This many variable genes will be kept and used by CO:")
print(len(genes_keep))

In [None]:
# final form, ready to save
# Select the kept genes with a boolean mask so columns stay in the original
# var order. Indexing with list(<set>) gave an arbitrary order that could
# differ from run to run.
keep_gene_mask = expression_quantified_orig.var_names.isin(genes_keep)
# Copy so the .uns entries below are written to a real object, not a view.
expression_quantified = expression_quantified_orig[:, keep_gene_mask].copy()
# Sorted (by string form, tolerating any non-string entry) so the saved
# lists are reproducible.
expression_quantified.uns["perturbed_and_measured_genes"]     = sorted(perturbed_and_measured_genes, key = str)
expression_quantified.uns["perturbed_but_not_measured_genes"] = sorted(perturbed_but_not_measured_genes, key = str)

In [None]:
# Write the reduced dataset, creating its parent directory if needed.
# NOTE(review): the working directory set earlier already ends in
# `perturbations/`, so this path nests a second `perturbations/` folder
# inside it -- confirm that is intended.
output_h5ad_path = f"perturbations/{dataset_name}/test.h5ad"
os.makedirs(os.path.dirname(output_h5ad_path), exist_ok = True)
expression_quantified.write_h5ad(output_h5ad_path)