### CD4 T cell KO example

This notebook prepares a dataset with 24 individual knockout experiments applied to CD4 T cells ([Freimer et al. 2022](https://www.nature.com/articles/s41588-022-01106-y)). Each knockout was profiled with both ATAC-seq and RNA-seq, but we use only the RNA data. The RNA counts are UMI-deduplicated. Controls are 8 guide RNAs targeting the "safe-harbor" AAVS1 locus, labeled `AAVS1_1` through `AAVS1_8`. The experiment was done separately on blood from 3 different donors.

Here we tidy the dataset and carry out a simple exploration in scanpy. (It's not single cell data but scanpy is still useful for data exploration.)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML

import os, sys
import itertools as it
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# local
import importlib
import sys
sys.path.append("setup")
import ingestion
import global_effects
importlib.reload(ingestion)
importlib.reload(global_effects)

# Visualization settings: default figure size (inches) and resolution of saved figures.
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# Specify the working directory explicitly; every relative path below is resolved against it.
# Set the PERTURBATION_DATA_DIR environment variable to run this notebook on another
# machine. When it is unset, the original author's location is used, as before.
os.chdir(os.environ.get(
    "PERTURBATION_DATA_DIR",
    "/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data",
))

# Universal (shared across datasets)
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath = "../accessory_data/humanTFs.csv"                               # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 

# Freimer specific
rawDataPath = "not_ready/freimer/GSE171677_RNA_UMI_dedup_counts.txt"
perturbEffectTFOnlyPath = "setup/freimerTFOnly.csv"                         # a path to store temp file
perturbEffectFullTranscriptomePath = "setup/freimerFullTranscriptome.csv"   # a path to store temp file

# Output location of the final AnnData file written at the end of the notebook.
finalDataFileFolder = "perturbations/freimer"
finalDataFilePath   = "perturbations/freimer/test.h5ad"

### Reshape the data

In [None]:
# Read the tab-separated count table; lines starting with '!' are treated as comments.
raw_table = pd.read_csv(
    rawDataPath,
    delimiter="\t",
    index_col=0,
    header=0,
    comment='!',
)

# The first five columns are kept as per-gene annotation; the remaining columns are
# the samples. Transposing puts samples on rows and genes on columns.
N_ANNOTATION_COLUMNS = 5
gene_metadata = raw_table.iloc[:, :N_ANNOTATION_COLUMNS]
expression_quantified = raw_table.iloc[:, N_ANNOTATION_COLUMNS:].T

# Sample names are split on the first two underscores, e.g. "Donor_4_AAVS1_6"
# -> donor "4", perturbation "AAVS1_6" (the leading token is discarded).
donor_and_perturbation = [
    sample_name.split("_", maxsplit=2)[1:3]
    for sample_name in expression_quantified.index
]
sample_metadata = pd.DataFrame(
    data=donor_and_perturbation,
    index=expression_quantified.index,
    columns=["donor", "perturbation"],
)

# Preview each of the three tables.
print("\n\ngene_metadata\n")
display(gene_metadata.head())

print("\n\nsample_metadata\n")
display(sample_metadata.head())

print("\n\n expression_quantified\n")
display(expression_quantified.head().T.head())

### Combine into anndata to keep everything together

In [None]:
# Bundle the expression table with its sample (obs) and gene (var) annotation.
expression_quantified = sc.AnnData(
    expression_quantified,
    obs=sample_metadata.copy(),
    var=gene_metadata.copy(),
)

# Keep a snapshot of the object as it is right now in `.raw`, before any
# filtering or normalization is applied.
expression_quantified.raw = expression_quantified.copy()

# The standalone metadata frames now live inside the AnnData object; drop the
# duplicates so there is a single source of truth.
del gene_metadata, sample_metadata

### Convert Ensembl gene IDs to gene symbols

In [None]:
# Map the current gene identifiers to gene symbols using the GENCODE annotation,
# then install them as the gene names of the AnnData object.
gene_symbols = ingestion.convert_ens_to_symbol(
    expression_quantified.var_names,
    gtf=geneAnnotationPath,
)
expression_quantified.var_names = gene_symbols

# Quick look at the relabelled gene table and the first few names.
display(expression_quantified.var.head())
display(expression_quantified.var_names[:5])

In [None]:
# The control guides are named AAVS1_1 ... AAVS1_8 rather than after a target gene,
# so list them explicitly and confirm each one is present in the data.
controls = ["AAVS1_" + str(guide_number) for guide_number in range(1, 9)]
assert all(
    label in expression_quantified.obs['perturbation'].unique()
    for label in controls
)

# Flag control samples, both as a boolean and as a 0/1 integer column.
control_mask = expression_quantified.obs['perturbation'].isin(controls)
expression_quantified.obs["is_control"] = control_mask
expression_quantified.obs["is_control_int"] = control_mask.astype(int)

### Retain genes whose count exceeds a minimum threshold in more than 3 samples

In [None]:
# Threshold: 10 divided by the smallest per-sample total, rescaled by 1e6 and rounded up.
# NOTE(review): this value is on a per-million scale but is compared against the
# entries of X directly — confirm that comparing it to unscaled counts is intended.
smallest_library = expression_quantified.X.sum(axis=1).min()
minCount = np.ceil(10 / smallest_library / 1e-6)

# Keep a gene when its value exceeds the threshold in strictly more than 3 samples
# (i.e. at least 4).
retainColumn = np.ravel(np.sum(expression_quantified.X > minCount, axis=0) > 3)

# Subset through AnnData's own indexing instead of overwriting the private
# `_n_vars` attribute: X and var are sliced together and stay consistent.
expression_quantified = expression_quantified[:, retainColumn].copy()
expression_quantified

In [None]:
# Drop the single sample 'Donor_4_AAVS1_6'.
# NOTE(review): the reason for excluding this sample is not recorded here — document it.
retainRows = expression_quantified.obs.index != 'Donor_4_AAVS1_6'

# Subset through AnnData's own indexing instead of overwriting the private
# `_n_obs` attribute: X, obs and the `.raw` snapshot are all sliced together, so
# `.raw` no longer keeps a row for the removed sample.
expression_quantified = expression_quantified[retainRows, :].copy()
expression_quantified

In [None]:
def RPKM(mat: np.ndarray, geneLength: np.ndarray):
    """Convert a count matrix to reads per kilobase per million (RPKM).

    Each entry is scaled by 1e9, divided by its row (sample) total and by the
    length of its column (gene).

    Args:
        mat: gene expression matrix with shape (n_obs, n_var)
        geneLength: gene length vector with shape (n_var,); any 1-D array-like
            is accepted (numpy array, list, pandas Series).

    Returns:
        Array of shape (n_obs, n_var) holding the RPKM values.

    Raises:
        AssertionError: if geneLength is not 1-D or its length differs from
            the number of columns of mat.
    """
    # Coerce to a plain ndarray first: positional `[np.newaxis, :]` indexing is not
    # supported on a pandas Series in recent pandas versions.
    lengths = np.asarray(geneLength, dtype=float)
    assert lengths.ndim == 1, "geneLength must be one-dimensional"
    assert lengths.shape[0] == mat.shape[1], "geneLength must have one entry per column of mat"
    return mat * 1e3 * 1e6 / mat.sum(axis=1, keepdims=True) / lengths[np.newaxis, :]

# Add a 0.5 pseudocount to every entry, then length- and depth-normalize with RPKM
# using the per-gene `Length` column stored in .var.
pseudocounted_counts = expression_quantified.X + 0.5
gene_lengths = expression_quantified.var.Length
expression_quantified.X = RPKM(pseudocounted_counts, gene_lengths)

In [None]:
# Sanity check: distribution of per-sample expression totals before and after
# DESeq2-style normalization. The normalized matrix is only computed for this plot;
# expression_quantified.X is not modified here.
normalized_preview = ingestion.deseq2Normalization(expression_quantified.X.T).T
histogram_panels = [
    (expression_quantified.X, "before DESeq2 norm"),
    (normalized_preview, "after DESeq2 norm"),
]

fig, axes = plt.subplots(1, 2, figsize=(12,3))
for panel_axis, (matrix, panel_label) in zip(axes, histogram_panels):
    panel_axis.hist(matrix.sum(axis=1), bins=100, log=False, label=panel_label)
    panel_axis.legend()
plt.show()

### Normalization on bulk

In [None]:
# Apply the project's DESeq2-style normalization. The helper is given the transposed
# matrix (genes on rows) and its result is transposed back to samples x genes.
# Note: this overwrites X in place, so re-running the cell normalizes twice.
genes_by_samples = expression_quantified.X.T
expression_quantified.X = ingestion.deseq2Normalization(genes_by_samples).T

In [None]:
# Check each sample against its controls (grouped by donor) and record the outcome.
# Setting verbose=True displays the discordant trials together with their controls.
# NOTE(review): the perturbation type is "knockdown" here but "knockout" when
# describing the perturbation effect later in the notebook — confirm which
# values the helper accepts.
consistency_options = dict(
    perturbationType="knockdown",
    group="donor",
    verbose=False,
)
status, logFC = ingestion.checkConsistency(expression_quantified, **consistency_options)

# Store both results as per-sample metadata, then tally the status values.
expression_quantified.obs["consistentW/Perturbation"] = status
expression_quantified.obs["logFC"] = logFC
Counter(status)

In [None]:
# Per-sample correlations computed within each donor group; the first element of
# the result is stored as the Spearman column and the second as the Pearson column.
correlations = ingestion.computeCorrelation(
    expression_quantified,
    verbose=True,
    group="donor",
)
spearman_values, pearson_values = correlations[0], correlations[1]
expression_quantified.obs["spearmanCorr"] = spearman_values
expression_quantified.obs["pearsonCorr"] = pearson_values

In [None]:
# Human transcription factor table.
# Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
# Two columns are selected by position; only rows whose second selected column
# equals 'Yes' are kept (presumably gene symbol -> "is a TF" flag; verify against the CSV).
TFList = pd.read_csv(humanTFPath, index_col=0).iloc[:, [1,3]]
TFDict = {
    key: flag
    for key, flag in TFList.to_numpy().tolist()
    if flag == 'Yes'
}

# Epigenetic factor table.
# Downloaded from https://epifactors.autosome.org/description
# Two columns are selected by position and every row is kept.
EpiList = pd.read_csv(humanEpiPath, index_col=0).iloc[:, [0,14]]
EpiDict = {key: value for key, value in EpiList.to_numpy().tolist()}

In [None]:
# Measure the size of each perturbation's effect twice: once restricted to genes
# found in the TF or epigenetic-factor dictionaries, and once on all genes.
TFVar = [
    position
    for position, gene in enumerate(expression_quantified.var.index)
    if gene in TFDict or gene in EpiDict
]
expression_quantifiedTFOnly = expression_quantified[:, TFVar].copy()

# (AnnData, output file, column prefix) for each run; the TF-only run goes first.
effect_runs = [
    (expression_quantifiedTFOnly, perturbEffectTFOnlyPath, "TFOnly"),
    (expression_quantified, perturbEffectFullTranscriptomePath, ""),
]
for effect_adata, effect_fname, effect_prefix in effect_runs:
    global_effects.quantifyEffect(
        adata=effect_adata,
        fname=effect_fname,
        group=None,
        diffExprFC=False,
        prefix=effect_prefix,
    )

# Copy the TF-only metrics onto the full object so both sets sit side by side in .obs.
listOfMetrics = ["DEG", "MI", "logFCMean", "logFCNorm2", "logFCMedian"]
for metric in listOfMetrics:
    tf_only_column = f"TFOnly{metric}"
    expression_quantified.obs[tf_only_column] = expression_quantifiedTFOnly.obs[tf_only_column]

In [None]:
# Effect-size metrics to compare: full-transcriptome and TF-only versions of
# the DEG metric and the norm of the log fold change.
metricOfInterest = [
    "DEG",
    "logFCNorm2",
    "TFOnlyDEG",
    "TFOnlylogFCNorm2",
]
ingestion.checkPerturbationEffectMetricCorrelation(
    expression_quantified,
    metrics=metricOfInterest,
)

In [None]:
# Plot the selected effect-size metrics, passing the TF and epigenetic-factor
# dictionaries to the plotting helper.
ingestion.visualizePerturbationEffect(
    expression_quantified,
    metrics=metricOfInterest,
    TFDict=TFDict,
    EpiDict=EpiDict,
)

In [None]:
# Work on a scratch copy so the transformed column below never touches the
# metadata of the real object.
temp = expression_quantified.copy()

# The log2 transform gives a smoother color gradient for the magnitude of the
# perturbation effect. Comment out the next statement to color by the
# untransformed norm instead.
temp.obs['logFCNorm2'] = np.log2(temp.obs['logFCNorm2'])

scatter_options = dict(
    x="spearmanCorr",
    y="logFC",
    style="consistentW/Perturbation",
    hue="logFCNorm2",
    markers=['o', '^'],
    xlim=[-0.2, 1],
)
ingestion.visualizePerturbationMetadata(temp, **scatter_options)

In [None]:
# One row per non-control perturbation: the first occurrence of each label.
sample_table = expression_quantified.obs
keepRow      = (~sample_table.perturbation.duplicated()) & (~sample_table.is_control)
# Read the columns straight from .obs; no need to build AnnData views for this.
perturbagens = sample_table.loc[keepRow, "perturbation"]
myDEG        = sample_table.loc[keepRow, "DEG"]
# Perturbations ordered from largest to smallest DEG value.
myOrder      = np.array(perturbagens)[np.argsort(myDEG)[::-1]]

# Taken from Extended Figure 3d of the paper https://www.nature.com/articles/s41588-022-01106-y/figures/10
theirOrder = ['MED12', 'ATXN7L3', 'CBFB', 'TNFAIP3', 'IRF4', 'IL2RA', 'YY1', 'MYB',
 'KLF2', 'ETS1', 'PTEN', 'STAT5B', 'GATA3', 'KMT2A', 'ZNF217', 'MBD2', 
 'JAK3', 'STAT5A', 'FOXK1', 'IRF2', 'IRF1', 'FOXP1', 'RELA', 'HIVEP2']

# Rank positions in the paper's ordering, derived from the list itself so the
# plot and the correlations stay in sync if the list is ever edited.
paperRank = range(len(theirOrder))

# Position of each of the paper's genes in our ordering. This raises IndexError
# if a gene from the paper's list is absent from our perturbations.
theirInMyOrderPos = [np.where(myOrder == gene)[0][0] for gene in theirOrder]

plt.scatter(theirInMyOrderPos, paperRank, s=50, marker='+', label="Actual Correspondance")
plt.scatter(paperRank        , paperRank, s=10, marker="*", label="Ideal Correspondance")
plt.xlabel("my DEG rank")
plt.ylabel("paper DEG rank")
plt.legend()
# Agreement between the two rankings (Pearson and Spearman), shown as the cell output.
(pearsonr (theirInMyOrderPos, paperRank), spearmanr(theirInMyOrderPos, paperRank))

### Basic EDA

In [None]:
# Exploratory pipeline: log-transform, variable-gene statistics, PCA, neighbor graph, UMAP.
# Each call modifies expression_quantified in place, so the order matters and
# re-running this cell applies log1p a second time.
# Total-count normalization is intentionally left disabled (X was already normalized above):
# sc.pp.normalize_total(expression_quantified, target_sum=1e4)
sc.pp.log1p(expression_quantified)
# n_top_genes is set to the total number of genes, presumably so no gene is excluded.
# NOTE(review): scanpy documents flavor="seurat_v3" as expecting count data, while X
# here has been normalized and log-transformed — confirm this is acceptable.
sc.pp.highly_variable_genes(expression_quantified, flavor = "seurat_v3", n_top_genes=expression_quantified.shape[1])
sc.pl.highly_variable_genes(expression_quantified)
# catch_warnings() only saves/restores the warning filters; no filter is set inside it.
with warnings.catch_warnings():
    sc.tl.pca(expression_quantified, n_comps=5)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
# Not used anywhere in the visible part of this notebook.
clusterResolutions = []


In [None]:
# Display the gene names (var_names) currently held by the dataset.
expression_quantified.var_names

In [None]:
# Is CTLA4 among the genes of the dataset?
"CTLA4" in expression_quantified.var.index

In [None]:
# UMAP colored by sample metadata, then by a few marker genes.
metadata_colors = ["donor", "is_control_int", "perturbation"]
marker_gene_colors = ["IL2RA", "IL2", "CTLA4"]

sc.pl.umap(expression_quantified, color=metadata_colors)
try:
    sc.pl.umap(expression_quantified, color=marker_gene_colors)
except KeyError:
    # Raised when scanpy cannot find one of the requested genes; report it and move on.
    print('Some plots are missing due to a weird scanpy KeyError on genes "IL2RA", "IL2", "CTLA4".')

In [None]:
# Split the perturbed (non-control) targets by whether they appear among the genes.
all_perturbation_labels = set(expression_quantified.obs['perturbation'].unique())
measured_genes = set(expression_quantified.var.index)

perturbed_genes = all_perturbation_labels - set(controls)
perturbed_and_measured_genes = perturbed_genes & measured_genes
perturbed_but_not_measured_genes = perturbed_genes - measured_genes

print("These genes were perturbed but not measured:")
print(perturbed_but_not_measured_genes)

In [None]:
# Final form, ready to save.
# Store the gene lists sorted: the inputs are sets, whose iteration order for strings
# changes between Python sessions, and the saved file should not depend on that.
expression_quantified.uns["perturbed_and_measured_genes"]     = sorted(perturbed_and_measured_genes)
expression_quantified.uns["perturbed_but_not_measured_genes"] = sorted(perturbed_but_not_measured_genes)
# Annotate the object with a description of the perturbation effect, tagged as "knockout".
expression_quantified = ingestion.describe_perturbation_effect(expression_quantified, "knockout")

### Final QC Conclusion

In [None]:
""" No Filtering """
# No samples or genes are removed at this stage; the name is simply rebound to an
# independent copy of the object before it is written to disk.
expression_quantified = expression_quantified.copy()

In [None]:
# Create the output folder if it does not exist yet, then write the AnnData object.
os.makedirs(
    finalDataFileFolder,
    exist_ok=True,
)
expression_quantified.write_h5ad(finalDataFilePath)