### Joung et al. TF overexpression atlas

This notebook is for exploratory analysis and reformatting of the Joung et al. [TF overexpression atlas](https://www.cell.com/cell/fulltext/S0092-8674(22)01470-2).

In [None]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML
from collections import Counter

# local
import importlib
import sys
sys.path.append("setup")
import ingestion
import global_effects
importlib.reload(ingestion)
importlib.reload(global_effects)

import anndata
import os, sys
import itertools as it
from scipy.stats import spearmanr, pearsonr, rankdata, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Visualization settings: inline figures, default figure size, and resolution of saved figures.
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# I prefer to specify the working directory explicitly.
# NOTE(review): this absolute path is machine-specific; every relative path below is resolved against it,
# so the notebook will not run elsewhere without editing this line.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data")

# Universal accessory files (paths are relative to the working directory set above).
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath =  "../accessory_data/humanTFs.csv"                              # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description
cellcycleGenePath = "../accessory_data/regev_lab_cell_cycle_genes.txt"

# Joung-specific settings (dataset_name is also used to build the output directory when saving).
dataset_name = "joung"
perturbEffectTFOnlyPath = "perturbations/TFOnly.csv"                         # a path to store temp file
perturbEffectFullTranscriptomePath = "perturbations/FullTranscriptome.csv"   # a path to store temp file

### Load expression data & set up cell metadata

In [None]:
# This is output from the script joung_reduce.py.
# Expression is totaled across all cells sharing a perturbation and a cell cycle phase assignment,
# so each observation is an aggregate of cells rather than a single cell.
expression_quantified = sc.read_h5ad("not_ready/joung/GSE217460_210322_TFAtlas_aggregated.h5ad")
# Round the aggregated values to whole numbers.
# NOTE(review): presumably this restores integer counts for the count-based normalization applied later — confirm.
expression_quantified.X = np.round(expression_quantified.X, decimals=0)

### Sequencing depth and cell-count metadata

In [None]:
# Per-sample QC and depth covariates.
# inplace=True stores the QC metrics in .obs/.var; with the default (inplace=False) scanpy
# returns two DataFrames, which this cell previously discarded.
sc.pp.calculate_qc_metrics(expression_quantified, inplace=True)
# Summing a scipy sparse matrix over axis 1 yields an (n, 1) np.matrix; flatten it so a
# plain 1-D column is stored in .obs (this also works when X is a dense array).
expression_quantified.obs["total_umi"] = np.asarray(expression_quantified.X.sum(axis=1)).ravel()
expression_quantified.obs["log10_total_umi"] = np.log10(expression_quantified.obs["total_umi"])
# "count" is read from the loaded metadata.
# NOTE(review): presumably the number of cells pooled into each aggregate — confirm against joung_reduce.py.
expression_quantified.obs["log10_cellcount"] = np.log10(expression_quantified.obs["count"])
# Replace the observation labels with consecutive integers 0..n-1.
expression_quantified.obs.index = range(len(expression_quantified.obs_names))
expression_quantified.obs

In [None]:
# Peek at a 9x9 block of the expression matrix (rows and columns 1-9; the 1:10 slice skips index 0).
expression_quantified.X[1:10,1:10].toarray()

In [None]:
# NOTE(review): same 9x9 peek as the preceding cell (only whitespace differs); one of the two could be dropped.
expression_quantified.X[1:10, 1:10].toarray()

### Normalization and filtering

In [None]:
# Keep a copy of the data as loaded (before filtering and normalization) in .raw.
expression_quantified.raw = expression_quantified.copy()
# Drop genes detected in fewer than 20 observations, then observations with fewer than 10,000 total counts.
sc.pp.filter_genes(expression_quantified, min_cells=20)
sc.pp.filter_cells(expression_quantified, min_counts=10000)
# Project helper; X is densified and transposed on the way in and transposed back on the way out.
# NOTE(review): the transposes suggest the helper expects genes x samples — confirm in ingestion.py.
expression_quantified.X = ingestion.deseq2Normalization(expression_quantified.X.toarray().T).T
# log(1 + x) transform of the normalized values, in place.
# NOTE(review): this cell is not idempotent — re-running it normalizes and logs already-transformed data.
sc.pp.log1p(expression_quantified)

In [None]:
# Rank every gene by variability (n_top_genes equals the number of genes, so nothing is excluded here).
# NOTE(review): scanpy documents flavor="seurat_v3" as expecting raw counts, but at this point X has
# already been normalized and log1p-transformed by the previous cell. Consider running this on the
# counts (e.g. via a counts layer) or switching flavor — confirm which is intended.
# NOTE(review): n_bins is documented as applying to the "seurat"/"cell_ranger" flavors — it may have no effect here.
sc.pp.highly_variable_genes(expression_quantified, n_bins=50, n_top_genes = expression_quantified.var.shape[0], flavor = "seurat_v3" )

### Perturbation effects

Check if OE'd genes go up; check how much changes in general; filter; save the results. 

In [None]:
# Control constructs: observations carrying these are flagged as controls rather than perturbations.
controls = ["mCherry", "GFP"]
# ORF labels in obs["TF"] are parsed as "<ORF id>-<gene symbol>".
# Split on the FIRST hyphen only: gene symbols can themselves contain hyphens (e.g. "NKX2-1"),
# and taking element [1] of an unrestricted split would truncate them (to "NKX2").
# NOTE(review): this assumes the ORF id itself never contains a hyphen — confirm against the metadata.
expression_quantified.obs["perturbation"]   = [orf.split("-", 1)[1] for orf in expression_quantified.obs["TF"]]
expression_quantified.obs["is_control"]     = [gene in controls for gene in expression_quantified.obs["perturbation"]]
expression_quantified.obs["is_control_int"] = [1 if is_control else 0 for is_control in expression_quantified.obs["is_control"]]
# Collect every non-control perturbed gene (comma-separated entries are split into individual genes),
# then partition them by whether the gene is present among the measured genes (var_names).
perturbed_genes = set.union(*[set(p.split(",")) for p in expression_quantified.obs["perturbation"]]).difference(set(controls))
perturbed_and_measured_genes     = perturbed_genes.intersection(expression_quantified.var_names)
perturbed_but_not_measured_genes = perturbed_genes.difference(expression_quantified.var_names)
print("These genes were perturbed and measured:")
print(perturbed_and_measured_genes)
print("These genes were perturbed but not measured:")
print(perturbed_but_not_measured_genes)
# Record the gene sets and the overlap flag in .uns (stored as lists rather than sets).
expression_quantified.uns["perturbed_and_measured_genes"]     = list(perturbed_and_measured_genes)
expression_quantified.uns["perturbed_but_not_measured_genes"] = list(perturbed_but_not_measured_genes)
expression_quantified.uns["perturbations_overlap"] = False
# Project helpers (ingestion.py); their exact outputs are not visible from this notebook.
expression_quantified = ingestion.describe_perturbation_effect( adata = expression_quantified, 
                                                                       perturbation_type="overexpression", 
                                                                       multiple_genes_hit = False)
# checkConsistency returns one status and one logFC per observation; both are stored in .obs below.
# NOTE(review): presumably the status says whether the overexpressed gene actually went up — confirm.
status, logFC = ingestion.checkConsistency(adata = expression_quantified, 
                                           perturbationType="overexpression", 
                                           verbose=False)
print(Counter(status))
expression_quantified.obs["consistentW/Perturbation"] = status
expression_quantified.obs["logFC"] = logFC

### Total effects genome-wide



In [None]:
# Quantify the genome-wide effect of each perturbation with the project helper, with the
# differential-expression and mutual-information options switched off.
# NOTE(review): presumably this adds the "logFCNorm2" column plotted below and writes to `fname` — confirm in global_effects.py.
global_effects.quantifyEffect(adata=expression_quantified, 
                         fname=perturbEffectTFOnlyPath, 
                         group=None, 
                         diffExprFC=False, 
                        #  prefix="TFOnly", 
                         withDEG=False, 
                         withMI=False)
# Scatter of global effect size (x) against the logFC stored by the consistency check (y),
# styled by consistency status and colored by log10 cell count.
# NOTE(review): xlim=[55, 130] is hand-tuned for this dataset and may clip points if the data change.
ingestion.visualizePerturbationMetadata(
    expression_quantified, 
    x="logFCNorm2", 
    y="logFC", 
    style="consistentW/Perturbation", 
    hue="log10_cellcount", 
    markers=['o', '^'],  
    s=25, 
    xlim=[55, 130],
)

### Filter and save

In [None]:
# Keep every observation whose consistency status is anything other than "No".
passes_consistency_check = expression_quantified.obs["consistentW/Perturbation"] != "No"
expression_quantified = expression_quantified[passes_consistency_check]

# Make sure the dataset's output folder exists, then write the filtered object there.
os.makedirs(f"perturbations/{dataset_name}", exist_ok = True)
expression_quantified.write_h5ad(f"perturbations/{dataset_name}/test.h5ad")

### Basic EDA

Aggregating cells within each perturbation makes factor analysis hard: there are huge sequencing-depth effects, and downsampling to a uniform UMI count does not help `:[`. Our initial EDA was uninformative, so it is omitted. 