### Endoderm time-course

This notebook is for exploratory analysis and reformatting of the Maehr group's definitive endoderm time-course data.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import importlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.stats import spearmanr as spearmanr

# local
import pereggrn_perturbations
import importlib
import sys
sys.path.append("setup")
import ingestion
import global_effects
importlib.reload(ingestion)
importlib.reload(global_effects)

import os, sys
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

#      visualization settings
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# I prefer to specify the working directory explicitly.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data/setup")

pereggrn_perturbations.set_data_path('../perturbations')

# Universal
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath =  "../accessory_data/humanTFs.csv"                              # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 
cellcycleGenePath = "../accessory_data/regev_lab_cell_cycle_genes.txt"

# dataset-specific
dataset_name = "definitive_endoderm"

In [None]:
# Read the filtered 10x count matrix for the time-course.
expression_quantified = sc.read_10x_mtx("../not_ready/maehrlab_endoderm/DE_Timecourse/outs/filtered_gene_bc_matrices/hg19")

# Keep only the part of each barcode before the first "-".
# NOTE(review): presumably this makes barcodes match the tag table's identifiers -- confirm.
expression_quantified.obs_names = expression_quantified.obs_names.str.split("-").str[0]

# Replace .obs with the per-cell QC metrics (first element of the returned pair).
per_cell_qc, _ = sc.pp.calculate_qc_metrics(expression_quantified)
expression_quantified.obs = per_cell_qc

# The tag table is transposed after reading so that its rows can be joined to cells.
# This is a left join: cells absent from the tag table get NaN in the tag columns.
tags = pd.read_csv("../not_ready/maehrlab_endoderm/DE_Timecourse_tags.dge.txt", index_col=0).T
expression_quantified.obs = expression_quantified.obs.join(tags)
expression_quantified.obs

### Tagging

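These cells were fixed and multiplexed using the method of Gehring et al. There are 5 tags; each tag was applied to two of the five days, so each day carries two tags. A simple model is that a cell's tag counts are proportional to `M @ day`, where `day` is the one-hot encoding of the cell's day and `M` is the tag-by-day matrix below (rows are tags, columns are days 0–4). The code recovers the per-day signal by applying the inverse of `M` to each cell's tag counts; because cells are stored as rows there, the inverse appears transposed.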

```
1 0 0 0 1
1 1 0 0 0
0 1 1 0 0
0 0 1 1 0
0 0 0 1 1
```

> Gehring J, Hwee Park J, Chen S, Thomson M, Pachter L. Highly multiplexed single-cell RNA-seq by DNA oligonucleotide tagging of cellular proteins. Nat Biotechnol. 2020 Jan;38(1):35-38. doi: 10.1038/s41587-019-0372-z. Epub 2019 Dec 23. PMID: 31873215.

In [None]:
# Tag columns in .obs, in the row order of the tag-by-day matrix below.
tag_columns = [
    "day0_tag1__day4_tag1",
    "day0_tag2__day1_tag1",
    "day1_tag2__day2_tag1",
    "day2_tag2__day3_tag1",
    "day3_tag2__day4_tag2",
]
deconv_columns = [f"deconv_day{day}" for day in range(5)]

# Long-format lookup: which tags were applied on which day (two tags per day).
days_vs_tags = pd.DataFrame([
    (0, "day0_tag1__day4_tag1"),
    (4, "day0_tag1__day4_tag1"),
    (0, "day0_tag2__day1_tag1"),
    (1, "day0_tag2__day1_tag1"),
    (1, "day1_tag2__day2_tag1"),
    (2, "day1_tag2__day2_tag1"),
    (2, "day2_tag2__day3_tag1"),
    (3, "day2_tag2__day3_tag1"),
    (3, "day3_tag2__day4_tag2"),
    (4, "day3_tag2__day4_tag2"),
], columns =["day", "tag"])

# Rows are tags (same order as tag_columns), columns are days 0-4.
tags_by_days = np.array([
    [1, 0, 0, 0, 1],
    [1, 1, 0, 0, 0],
    [0, 1, 1, 0, 0],
    [0, 0, 1, 1, 0],
    [0, 0, 0, 1, 1]
])
deconvolution_matrix = np.linalg.inv(tags_by_days)
assert np.allclose(deconvolution_matrix @ tags_by_days, np.eye(5)), "tag-by-day matrix was not inverted cleanly"
print(deconvolution_matrix)

# Cells are rows here, so the model tags = M @ day becomes days = tags @ inv(M).T.
deconvolved = pd.DataFrame(
    expression_quantified.obs[tag_columns].to_numpy() @ deconvolution_matrix.T,
    columns = deconv_columns,
    index = expression_quantified.obs_names
)
# Drop results of a previous run first so that re-running this cell does not
# produce suffixed duplicate columns (deconv_day0_x, deconv_day0_y, ...).
expression_quantified.obs = (
    expression_quantified.obs
    .drop(columns=deconv_columns, errors="ignore")
    .merge(deconvolved, left_index=True, right_index=True)
)

for day in range(5):
    # Sum of the two tags applied on this day, then both signals as a fraction of total_reads.
    tags_for_day = days_vs_tags.loc[days_vs_tags["day"] == day, "tag"]
    expression_quantified.obs[f"day{day}_tag_total"] = expression_quantified.obs[tags_for_day].sum(axis=1)
    expression_quantified.obs[f"day{day}_tag_proportion"] = expression_quantified.obs[f"day{day}_tag_total"] / expression_quantified.obs["total_reads"]
    expression_quantified.obs[f"deconv_day{day}_proportion"] = expression_quantified.obs[f"deconv_day{day}"] / expression_quantified.obs["total_reads"]


### Required metadata

In [4]:
# Cells whose best deconvolved day explains less than this fraction of total_reads are dropped.
MIN_TIMEPOINT_CONFIDENCE = 0.45
deconv_columns = "deconv_day0 deconv_day1 deconv_day2 deconv_day3 deconv_day4".split()
tag_proportion_columns = [f"day{i}_tag_proportion" for i in range(5)]

# Every cell is labeled as an unperturbed control.
expression_quantified.obs["is_control"]     = True
expression_quantified.obs["is_control_int"] = 1
expression_quantified.obs["perturbation"]  = "control"

# The timepoint is the position (0-4) of the largest value among the day columns.
# argmax is taken on the underlying array so the result is a plain positional index
# and does not depend on how NumPy dispatches on a DataFrame.
deconvolved = expression_quantified.obs[deconv_columns]
tag_proportions = expression_quantified.obs[tag_proportion_columns]
expression_quantified.obs["timepoint"]                = deconvolved.to_numpy().argmax(axis=1)
expression_quantified.obs["timepoint_confidence"]     = deconvolved.max(axis=1) / expression_quantified.obs["total_reads"]
expression_quantified.obs["timepoint_alt"]            = tag_proportions.to_numpy().argmax(axis=1)
expression_quantified.obs["timepoint_alt_confidence"] = tag_proportions.max(axis=1)

# Subsetting an AnnData yields a view; copy it explicitly because .obs and .uns are written below.
is_confident = expression_quantified.obs["timepoint_confidence"] > MIN_TIMEPOINT_CONFIDENCE
expression_quantified = expression_quantified[is_confident, :].copy()

labels = {0: "pluripotent", 
          1: "mesendoderm", # This is before exposure to Activin A. 
          2: "mesendoderm",
          3: "endoderm", 
          4: "endoderm"}
expression_quantified.obs["cell_type"]     = [labels[t] for t in expression_quantified.obs["timepoint"]]
# No genes are perturbed in this dataset, so both lists are empty.
expression_quantified.uns["perturbed_and_measured_genes"]     = []
expression_quantified.uns["perturbed_but_not_measured_genes"] = []

### Cell cycle annotation

Following the Figure 1 and S1 notebook from [Petrus-Reurer et al.](https://github.com/lamanno-epfl/rpe_differentiation_profiling_code/blob/main/JupyterNotebooks/HTML/Figure1_S1.html). 


In [None]:
# Human S-phase marker genes.
S_genes_hum = (
    "MCM5 PCNA TYMS FEN1 MCM2 MCM4 RRM1 UNG GINS2 "
    "MCM6 CDCA7 DTL PRIM1 UHRF1 CENPU HELLS RFC2 "
    "RPA2 NASP RAD51AP1 GMNN WDR76 SLBP CCNE2 UBR7 "
    "POLD3 MSH2 ATAD2 RAD51 RRM2 CDC45 CDC6 EXO1 TIPIN "
    "DSCC1 BLM CASP8AP2 USP1 CLSPN POLA1 CHAF1B BRIP1 E2F8"
).split()

# Human G2/M-phase marker genes.
G2M_genes_hum = (
    "HMGB2 CDK1 NUSAP1 UBE2C BIRC5 TPX2 TOP2A NDC80 "
    "CKS2 NUF2 CKS1B MKI67 TMPO CENPF TACC3 PIMREG "
    "SMC4 CCNB2 CKAP2L CKAP2 AURKB BUB1 KIF11 ANP32E "
    "TUBB4B GTSE1 KIF20B HJURP CDCA3 JPT1 CDC20 TTK "
    "CDC25C KIF2C RANGAP1 NCAPD2 DLGAP5 CDCA2 CDCA8 ECT2 "
    "KIF23 HMMR AURKA PSRC1 ANLN LBR CKAP5 CENPE "
    "CTCF NEK2 G2E3 GAS2L3 CBX5 CENPA"
).split()

# NOTE(review): at this point in the notebook .X has not been normalized yet, so the
# scores are computed on the loaded counts -- confirm that this is intended.
sc.tl.score_genes_cell_cycle(
    expression_quantified,
    s_genes=S_genes_hum,
    g2m_genes=G2M_genes_hum,
)

### Normalization and variable gene ranking

Downstream benchmarking analysis expects both of these things to be done already. We will pair this with CRISPR knockdown data, so we also add dummy columns for perturbation and expression level after perturbation. 

In [6]:
# Keep a copy of the un-normalized data in .raw before .X is transformed.
expression_quantified.raw = expression_quantified.copy()

# Rank genes while .X still holds counts: flavor="seurat_v3" expects count data,
# so it must run before normalize_total / log1p rather than after them.
# n_top_genes equals the number of genes, so every gene receives a rank.
sc.pp.highly_variable_genes(expression_quantified, n_bins=50, n_top_genes = expression_quantified.var.shape[0], flavor = "seurat_v3" )

# Library-size normalization to 1e4 per cell, then log1p.
sc.pp.normalize_total(expression_quantified, target_sum=1e4)
sc.pp.log1p(expression_quantified)
sc.tl.pca(expression_quantified, n_comps=10)

# Project helper, called with the perturbation type "knockdown".
# NOTE(review): per the notebook text this adds placeholder perturbation columns -- confirm in `ingestion`.
expression_quantified = ingestion.describe_perturbation_effect(expression_quantified, "knockdown")


### Cleaning up labels

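In the visualizations below, a few cells seem to be mislabeled by the click-tags. We use nearest neighbors to exclude them: each cell's tag-derived timepoint is compared with the median timepoint of its neighbors in the nearest-neighbor graph, and cells where the two disagree are dropped.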

In [7]:
# Smooth each cell's timepoint as the median timepoint of its graph neighbors.
sc.pp.neighbors(expression_quantified, n_neighbors=20, n_pcs=10)

connectivities = expression_quantified.obsp["connectivities"].tocsr()
timepoints = expression_quantified.obs["timepoint"].to_numpy()

# Medians can be fractional, so collect them in a float array and assign the column once.
# This avoids writing floats cell by cell into an integer column and building an
# AnnData slice for every cell.
timepoint_smoothed = np.full(expression_quantified.n_obs, np.nan)
for i in range(expression_quantified.n_obs):
    start, stop = connectivities.indptr[i], connectivities.indptr[i + 1]
    # Neighbors are the columns with a nonzero connectivity in row i.
    neighbors = connectivities.indices[start:stop][connectivities.data[start:stop] != 0]
    if neighbors.size > 0:
        timepoint_smoothed[i] = np.median(timepoints[neighbors])
    # A cell with no neighbors keeps NaN and therefore never matches its own timepoint.
expression_quantified.obs["timepoint_smoothed"] = timepoint_smoothed

In [None]:
# Cross-tabulate tag-derived and neighbor-smoothed timepoints before filtering.
print(expression_quantified.obs[["timepoint", "timepoint_smoothed"]].value_counts())

# Keep only cells whose timepoint equals the median timepoint of their neighbors.
# Subsetting yields a view; copy it so the column removal below acts on a standalone object.
labels_agree = expression_quantified.obs["timepoint"]==expression_quantified.obs["timepoint_smoothed"]
expression_quantified = expression_quantified[labels_agree, :].copy()
del expression_quantified.obs["timepoint_smoothed"]

### Save the data

In [None]:
# Create the dataset directory first so that a fresh checkout does not fail on the write.
output_dir = os.path.join("../perturbations", dataset_name)
os.makedirs(output_dir, exist_ok=True)
expression_quantified.write_h5ad(os.path.join(output_dir, "train.h5ad"))

# Run the project's dataset checker, flagging this as a time series without perturbations.
pereggrn_perturbations.check_perturbation_dataset(ad = expression_quantified, is_timeseries=True, is_perturbation=False)

### Basic EDA

Similar to the original supplement, we see strong batch effects in UMAP but PC1 looks ok as pseudotime. 

In [10]:
# flavor="seurat_v3" expects count data, but .X is log-normalized by this point.
# Select the top 500 genes on the counts kept in .raw and copy the results into .var.
# NOTE(review): assumes .raw still holds the un-normalized counts stored in the normalization cell.
counts = expression_quantified.raw.to_adata()
sc.pp.highly_variable_genes(counts, n_bins=50, n_top_genes = 500, flavor = "seurat_v3" )
for hvg_column in ["highly_variable", "highly_variable_rank", "means", "variances", "variances_norm"]:
    expression_quantified.var[hvg_column] = counts.var[hvg_column]
del counts

# The former `with warnings.catch_warnings():` wrapper installed no filter, so it is dropped.
sc.tl.pca(expression_quantified, n_comps=10)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
# Not referenced by any cell shown in this notebook; kept in case other code relies on it.
clusterResolutions = []
sc.tl.louvain(expression_quantified)

In [None]:
plt.rcParams['figure.figsize'] = [6, 4.5]

# Per-cell annotations and marker genes to color the PCA by, followed by
# the per-day deconvolved and raw tag proportions.
vars_to_show = [
    "total_counts", "phase", "cell_type", "louvain", "timepoint", "timepoint_confidence", "timepoint_alt", "timepoint_alt_confidence",
    "SOX17", "FOXA2", "EOMES", "NANOG", "POU5F1", "T", "MIXL1",
]
vars_to_show += [f"deconv_day{day}_proportion" for day in range(5)]
vars_to_show += [f"day{day}_tag_proportion" for day in range(5)]

# One PDF per variable. A failure while saving is reported and the loop moves on.
for v in vars_to_show:
    fig = sc.pl.pca(expression_quantified, color=v, show=False, legend_loc='on data')
    try:
        os.makedirs(f"../perturbations/definitive_endoderm/train", exist_ok=True)
        fig.figure.savefig(f"../perturbations/definitive_endoderm/train/{v}.pdf", bbox_inches='tight')
    except Exception as err:
        print(f"Plots failed with error {repr(err)}")

# All variables again, drawn in a single call for inline viewing.
sc.pl.pca(expression_quantified, color=vars_to_show)
print(expression_quantified.obs.columns)


In [None]:
# Cast the timepoint to a string before grouping the dot plot by it.
# NOTE(review): presumably so the days are treated as discrete groups -- confirm.
# This overwrites obs["timepoint"] in place for every later cell.
expression_quantified.obs["timepoint"] = expression_quantified.obs["timepoint"].astype("string")

marker_genes = [
    "HNF1B", "FOXA2", "GATA6", "SOX17", "CER1", "EOMES", "HHEX",
    "NODAL", "GSC", "T", "MIXL1", "POU5F1", "NANOG", "SOX2",
]
fig = sc.pl.DotPlot(
    expression_quantified,
    marker_genes,
    groupby="timepoint",
    cmap="viridis",
)

# Save next to the per-variable PCA plots.
os.makedirs(f"../perturbations/definitive_endoderm/train", exist_ok=True)
fig.savefig("../perturbations/definitive_endoderm/train/dotplot.pdf", bbox_inches='tight')

In [None]:
# Echo `fig` as this cell's output.
fig