### Endoderm time-course

This notebook is for exploratory analysis and reformatting of the Cuomo et al. [endoderm directed differentiation scRNA data](https://www.nature.com/articles/s41467-020-14457-z).

In [None]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy.stats import spearmanr as spearmanr
from scipy.sparse import csr_matrix
from IPython.display import display, HTML
from collections import Counter

# local
import load_perturbations
import importlib
import sys
sys.path.append("setup")
import ingestion
import global_effects
importlib.reload(ingestion)
importlib.reload(global_effects)

import anndata
import os, sys
import itertools as it
from scipy.stats import spearmanr, pearsonr, rankdata, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

#      visualization settings
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# I prefer to specify the working directory explicitly.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data")

load_perturbations.set_data_path('perturbations')

# Universal
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath =  "../accessory_data/humanTFs.csv"                              # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 
cellcycleGenePath = "../accessory_data/regev_lab_cell_cycle_genes.txt"

# dataset-specific
dataset_name = "definitive_endoderm"

In [None]:
# Raw counts are from https://zenodo.org/records/3625024#.Xil-0y2cZ0s
# sc.read_csv treats CSV rows as observations, so the result is transposed right away;
# presumably the file is laid out genes x cells (later cells treat var_names as genes).
raw_counts_path = "not_ready/cuomo_definitive_endoderm/raw_counts.csv"
expression_quantified = sc.read_csv(raw_counts_path).T

In [None]:
# Feature labels are assumed to look like "<prefix>_<symbol>" (presumably an Ensembl ID joined
# to a gene symbol -- TODO confirm against raw_counts.csv). Keep everything after the FIRST
# underscore only: splitting on every underscore would truncate symbols that themselves
# contain one (e.g. "Y_RNA" would be shortened to "Y").
# A label with no underscore at all still raises IndexError, as before.
expression_quantified.var_names = [gene.split("_", 1)[1] for gene in expression_quantified.var_names]

In [None]:
# Load the per-cell metadata table (tab-separated).
cell_metadata_path = "not_ready/cuomo_definitive_endoderm/cell_metadata_cols.tsv"
obs = pd.read_csv(cell_metadata_path, sep="\t")

# Only attach the table if its index matches the cell labels of the count matrix
# element by element; otherwise metadata would be paired with the wrong cells.
index_matches = expression_quantified.obs.index == obs.index
assert all(index_matches)
expression_quantified.obs = obs

### Required metadata

In [None]:
# This dataset contains no perturbations, so every cell is labelled as a control.
# "timepoint" is copied from the existing "day" column; the rest are constants.
required_obs_columns = {
    "is_control": True,
    "is_control_int": 1,
    "perturbation": "control",
    "timepoint": expression_quantified.obs["day"],
    "cell_type": "endoderm_differentiation",
}
for column_name, column_value in required_obs_columns.items():
    expression_quantified.obs[column_name] = column_value

# No genes were perturbed, so both bookkeeping lists start out empty (a fresh list for each key).
for uns_key in ("perturbed_and_measured_genes", "perturbed_but_not_measured_genes"):
    expression_quantified.uns[uns_key] = []


### Cell cycle annotation

The S-phase and G2/M gene lists and the scoring call below follow the Figure 1 and S1 notebook from [Petrus-Reurer et al.](https://github.com/lamanno-epfl/rpe_differentiation_profiling_code/blob/main/JupyterNotebooks/HTML/Figure1_S1.html).


In [None]:
# Marker genes passed as `s_genes` to the cell-cycle scoring call below
# (presumably human S-phase markers, going by the variable name).
S_genes_hum = ["MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2", 
            "MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1", "CENPU", "HELLS", "RFC2", 
            "RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76", "SLBP", "CCNE2", "UBR7", 
            "POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", 
            "DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"]
# Marker genes passed as `g2m_genes` to the cell-cycle scoring call below
# (presumably human G2/M-phase markers, going by the variable name).
G2M_genes_hum = ["HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
             "CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "PIMREG", 
             "SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E", 
             "TUBB4B", "GTSE1", "KIF20B", "HJURP", "CDCA3", "JPT1", "CDC20", "TTK",
             "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
             "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", 
             "CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA"]
# Score every cell against both marker sets, modifying expression_quantified in place.
# The "phase" annotation plotted in the EDA section is presumably produced here -- verify
# against the scanpy documentation for score_genes_cell_cycle.
# NOTE(review): this runs before the normalization / log1p cell, so scoring sees .X exactly
# as loaded from the counts file -- confirm that scoring un-normalized values is intended.
sc.tl.score_genes_cell_cycle(expression_quantified, s_genes=S_genes_hum, g2m_genes=G2M_genes_hum)

### Normalization and variable gene ranking

Downstream benchmarking analysis expects both of these things to be done already. We will pair this with CRISPR knockdown data, so we also add dummy columns for perturbation and expression level after perturbation. 

In [None]:
# Keep a snapshot of the un-normalized data in .raw before .X is overwritten.
expression_quantified.raw = expression_quantified.copy()

# Divide each cell's values by that cell's size factor. The division is carried out on the
# transposed matrix so the per-cell vector broadcasts along the last axis; the result is
# made sparse and transposed back to the original orientation.
size_factors = np.array(expression_quantified.obs["size_factor"])
scaled_transposed = expression_quantified.X.T / size_factors
expression_quantified.X = csr_matrix(scaled_transposed).T
sc.pp.log1p(expression_quantified)

# Rank genes by variability; n_top_genes equals the number of genes, so none are excluded.
# NOTE(review): scanpy documents flavor "seurat_v3" as expecting count data, whereas .X is
# size-factor-normalized and log1p-transformed at this point -- confirm this is intended.
n_genes_total = expression_quantified.var.shape[0]
sc.pp.highly_variable_genes(
    expression_quantified,
    flavor="seurat_v3",
    n_top_genes=n_genes_total,
    n_bins=50,
)

# Helper from the local `ingestion` module, called in "knockdown" mode. It is expected to add
# the placeholder perturbation / post-perturbation expression fields that the benchmarking
# code looks for -- see its definition for the exact fields.
expression_quantified = ingestion.describe_perturbation_effect(expression_quantified, "knockdown")


### Save the data

In [None]:
# Validate the AnnData object with the project's checker before anything is written to disk.
# The flags mark this dataset as a time series with no perturbations.
load_perturbations.check_perturbation_dataset(ad = expression_quantified, is_timeseries=True, is_perturbation=False)

# Make sure the dataset folder exists so that a fresh checkout can write the file;
# exist_ok=True leaves an already-present folder (and its contents) untouched.
output_dir = f"perturbations/{dataset_name}"
os.makedirs(output_dir, exist_ok=True)
expression_quantified.write_h5ad(os.path.join(output_dir, "train.h5ad"))

### Basic EDA

As in the original supplement, the UMAP shows strong batch effects, but PC1 looks acceptable as a pseudotime axis.

In [None]:
# Everything in this cell modifies expression_quantified in place and runs after the
# dataset has been written to disk, so it only affects the exploratory plots.

# Re-flag variable genes, this time keeping only the top 500.
sc.pp.highly_variable_genes(
    expression_quantified,
    flavor="seurat_v3",
    n_top_genes=500,
    n_bins=50,
)

# Two sequential regressions: first the experiment label, then the depth-related covariates.
batch_covariates = ["experiment"]
depth_covariates = ["total_counts", "total_counts_endogenous", "total_counts_ERCC", "size_factor"]
for covariates in (batch_covariates, depth_covariates):
    sc.pp.regress_out(expression_quantified, covariates)

# PCA -> neighbour graph -> UMAP, all with scanpy defaults apart from the 100 components.
with warnings.catch_warnings():
    sc.tl.pca(expression_quantified, n_comps=100)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)

# NOTE(review): clusterResolutions is assigned but not used anywhere in the visible cells.
clusterResolutions = []
sc.tl.louvain(expression_quantified)

In [None]:
plt.rcParams['figure.figsize'] = [6, 4.5]

# Annotations to colour the embeddings by (presumably all columns of .obs).
annotations_to_show = [
    'experiment', "day", "doublet_score",
    'log10_total_counts', 'log10_total_counts_endogenous',
    'log10_total_counts_ERCC', 'log10_total_counts_feature_control',
    'log10_total_counts_MT', 'log10_total_features',
    'log10_total_features_endogenous', 'log10_total_features_ERCC',
    'log10_total_features_feature_control', 'log10_total_features_MT',
    'pct_counts_top_50_features',
    'plate_id', 'post_prob',
    'well_type', 'pseudo', "phase", "louvain",
]
# Gene symbols to colour the embeddings by (presumably resolved against the gene names).
genes_to_show = ["SOX17", "FOXA2", "EOMES", "NANOG", "POU5F1", "T", "MIXL1"]
stuff_to_show = annotations_to_show + genes_to_show

# Draw the same set of panels on the UMAP first, then on the PCA embedding.
for draw_embedding in (sc.pl.umap, sc.pl.pca):
    draw_embedding(expression_quantified, color = stuff_to_show)

# List the available metadata columns for reference.
print(expression_quantified.obs.columns)