### RPE differentiation time-course

This notebook is for exploratory analysis and reformatting of the Joung et al. [TF overexpression atlas](https://www.cell.com/cell/fulltext/S0092-8674(22)01470-2).

In [1]:
import warnings
warnings.filterwarnings('ignore')
import regex as re
import os
import shutil
import importlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy.stats import spearmanr as spearmanr
from IPython.display import display, HTML
from collections import Counter

# local
import importlib
import sys
sys.path.append("setup")
import ingestion
import global_effects
importlib.reload(ingestion)
importlib.reload(global_effects)

import anndata
import os, sys
import itertools as it
from scipy.stats import spearmanr, pearsonr, rankdata, f_oneway
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

#      visualization settings
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# I prefer to specify the working directory explicitly.
os.chdir("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data")

# Universal
geneAnnotationPath = "../accessory_data/gencode.v35.annotation.gtf.gz"       # Downloaded from https://www.gencodegenes.org/human/release_35.html
humanTFPath =  "../accessory_data/humanTFs.csv"                              # Downloaded from http://humantfs.ccbr.utoronto.ca/download.php
humanEpiPath = "../accessory_data/epiList.csv"                               # Downloaded from https://epifactors.autosome.org/description 
cellcycleGenePath = "../accessory_data/regev_lab_cell_cycle_genes.txt"

# petrusreurer Specific
dataset_name = "joung"
perturbEffectTFOnlyPath = "perturbations/TFOnly.csv"                         # a path to store temp file
perturbEffectFullTranscriptomePath = "perturbations/FullTranscriptome.csv"   # a path to store temp file

### Load expression data & set up cell metadata

In [25]:
import gzip
import csv
from scipy.sparse import csr_matrix

def read_gzipped_csv_to_sparse_matrix(gzipped_csv_filepath, chunksize=1000):
    genes = []
    counts = []
    with gzip.open(gzipped_csv_filepath, mode='rt', newline='') as gzipped_file:
        csv_reader = csv.reader(gzipped_file, delimiter="\t")
        barcodes = next(csv_reader, None)[1:]
        while True:
            print(".", end = "", flush = True)
            chunk = [next(csv_reader, None) for _ in range(chunksize)]
            chunk = [row for row in chunk if row is not None]  # Filter out None values at the end of the file
            if not chunk:
                break  # Exit loop if there are no more rows to read
            genes.append([row[0] for row in chunk])
            counts.append(csr_matrix(np.array(
                [[int(i) for i in row[1:]] for row in chunk]
            )))
    return np.concatenate(counts), barcodes, genes


In [26]:
expression_quantified = read_gzipped_csv_to_sparse_matrix("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data/not_ready/joung/example.csv.gz")
print(expression_quantified[0].shape)
# expression_quantified = read_gzipped_csv_to_sparse_matrix("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data/not_ready/joung/GSM6719950_210322_TFAtlas_S05.csv.gz")
expression_quantified[0][0, 0:5]

..

ValueError: zero-dimensional arrays cannot be concatenated

### Cell cycle annotation

Following the Figure 1 and S1 notebook from [here](https://github.com/lamanno-epfl/rpe_differentiation_profiling_code/blob/main/JupyterNotebooks/HTML/Figure1_S1.html). 


In [None]:
S_genes_hum = ["MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2", 
            "MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1", "CENPU", "HELLS", "RFC2", 
            "RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76", "SLBP", "CCNE2", "UBR7", 
            "POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", 
            "DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"]
G2M_genes_hum = ["HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
             "CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "PIMREG", 
             "SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E", 
             "TUBB4B", "GTSE1", "KIF20B", "HJURP", "CDCA3", "JPT1", "CDC20", "TTK",
             "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
             "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", 
             "CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA"]
sc.tl.score_genes_cell_cycle(expression_quantified, s_genes=S_genes_hum, g2m_genes=G2M_genes_hum)

### Fill `perturbation` & `is_control`

In [None]:
expression_quantified.obs["is_control"]     = True
expression_quantified.obs["is_control_int"] = 1
expression_quantified.obs["perturbation"] = "control"
expression_quantified.obs

### Basic EDA

In [None]:
expression_quantified.raw = expression_quantified.copy()
sc.pp.normalize_total(expression_quantified)
sc.pp.log1p(expression_quantified)
sc.pp.filter_genes(expression_quantified, min_cells=20)
expression_quantified.var
sc.pp.highly_variable_genes(expression_quantified, n_bins=50, n_top_genes = expression_quantified.var.shape[0], flavor = "seurat_v3" )
sc.pp.regress_out(expression_quantified, ["CELL_LINE"])
with warnings.catch_warnings():
    sc.tl.pca(expression_quantified, n_comps=100)
sc.pp.neighbors(expression_quantified)
sc.tl.umap(expression_quantified)
clusterResolutions = []
sc.tl.louvain(expression_quantified)

In [None]:
plt.rcParams['figure.figsize'] = [6, 4.5]
sc.pl.umap(expression_quantified, color = [
    "CELLTYPE1", "CELLTYPE2", "DAY", "CELL_LINE", "batch"
])
sc.pl.pca(expression_quantified, color = [
    "CELLTYPE1", "CELLTYPE2", "DAY", "CELL_LINE", "batch"
])

In [None]:
expression_quantified.uns["perturbed_and_measured_genes"]     = []
expression_quantified.uns["perturbed_but_not_measured_genes"] = []

In [None]:
os.makedirs(f"perturbations/{dataset_name}", exist_ok = True)
expression_quantified.write_h5ad(f"perturbations/{dataset_name}/train.h5ad")