# Module loading

In [None]:
library(IRdisplay)
library(readr)
library(fgsea)
library(dplyr)
library(reticulate)

In [None]:
pd <- import("pandas")

# Settings

In [None]:
# Base directory for this notebook's outputs, taken from the environment.
OUTPUT_DIR <- Sys.getenv("PHENOPLIER_RESULTS_CRISPR_ANALYSES_BASE_DIR")

# Sys.getenv() returns "" for an unset variable. With an empty base directory
# the file.path() call used for the output file would resolve to a path at the
# filesystem root, so stop here instead.
if (!nzchar(OUTPUT_DIR)) {
    stop(
        "Environment variable PHENOPLIER_RESULTS_CRISPR_ANALYSES_BASE_DIR is not set",
        call. = FALSE
    )
}

In [None]:
# Echo the configured output directory in the notebook output.
OUTPUT_DIR

In [None]:
# Create the output directory (and any missing parents) only when needed, so
# re-running the notebook does not warn about an existing directory.
if (!dir.exists(OUTPUT_DIR)) {
    dir.create(OUTPUT_DIR, recursive = TRUE)
}

# dir.create() only warns on failure; make sure the directory is really there
# before anything is computed and written into it.
stopifnot(dir.exists(OUTPUT_DIR))

# Data loading

## Lipids gene sets

In [None]:
# Path of the lipids gene-sets file, taken from the environment.
input_file <- Sys.getenv("PHENOPLIER_CRISPR_LIPIDS_GENE_SETS_FILE")
# Show the resolved path in the notebook output.
display(input_file)

In [None]:
# Read the gene table; the cells below rely on its `rank` and `gene_name`
# columns.
all_genes_ranked <- read_csv(input_file)

In [None]:
# Build one gene set per non-zero rank value. Each set is stored under the
# name "gene_set_<rank>" and holds the gene names carrying that rank.
orig_deg_gene_sets <- list()

for (rank_value in unique(all_genes_ranked$rank)) {
    # rank 0 gets no gene set
    if (rank_value != 0) {
        in_rank <- all_genes_ranked$rank == rank_value
        set_name <- paste0("gene_set_", rank_value)

        orig_deg_gene_sets[[set_name]] <- all_genes_ranked$gene_name[in_rank]
    }
}

In [None]:
# Number of per-rank gene sets built (one per non-zero rank value).
length(orig_deg_gene_sets)

### Combine gene sets into "increase lipids" and "decrease lipids"

In [None]:
# Container for the combined gene sets; these are later passed to fgsea as
# its `pathways` argument.
deg_gene_sets <- list()

In [None]:
# Genes that increase lipids: the rank 2 and rank 3 sets concatenated, in
# that order.
deg_gene_sets[["gene_set_increase"]] <- unlist(
    orig_deg_gene_sets[c("gene_set_2", "gene_set_3")],
    use.names = FALSE
)

In [None]:
# Genes that decrease lipids: the rank -2 and rank -3 sets concatenated, in
# that order.
deg_gene_sets[["gene_set_decrease"]] <- unlist(
    orig_deg_gene_sets[c("gene_set_-2", "gene_set_-3")],
    use.names = FALSE
)

In [None]:
# Number of combined gene sets.
length(deg_gene_sets)

In [None]:
# Number of genes in the "increase lipids" set.
length(deg_gene_sets[["gene_set_increase"]])

In [None]:
# Number of genes in the "decrease lipids" set.
length(deg_gene_sets[["gene_set_decrease"]])

In [None]:
# test new increase set
new_set <- deg_gene_sets[["gene_set_increase"]]
expected_set <- union(
    orig_deg_gene_sets[["gene_set_2"]],
    orig_deg_gene_sets[["gene_set_3"]]
)

# A missing source rank leaves the combined set NULL/empty, in which case the
# remaining checks would pass vacuously.
stopifnot(length(new_set) > 0)

# No gene may be listed twice.
stopifnot(anyDuplicated(new_set) == 0)

# Membership must match in both directions, not only new_set within
# expected_set.
stopifnot(setequal(new_set, expected_set))

In [None]:
# test new decrease set
new_set <- deg_gene_sets[["gene_set_decrease"]]
expected_set <- union(
    orig_deg_gene_sets[["gene_set_-2"]],
    orig_deg_gene_sets[["gene_set_-3"]]
)

# A missing source rank leaves the combined set NULL/empty, in which case the
# remaining checks would pass vacuously.
stopifnot(length(new_set) > 0)

# No gene may be listed twice.
stopifnot(anyDuplicated(new_set) == 0)

# Membership must match in both directions, not only new_set within
# expected_set.
stopifnot(setequal(new_set, expected_set))

## MultiPLIER Z

In [None]:
# Load the MultiPLIER Z matrix from a pickle file through pandas.
# NOTE(review): unpickling can execute arbitrary code; only point this
# environment variable at a trusted file.
z_matrix_file <- Sys.getenv("PHENOPLIER_MULTIPLIER_MODEL_Z_MATRIX_FILE")
multiplier_z <- pd$read_pickle(z_matrix_file)

In [None]:
# Dimensions of the loaded matrix; further below its row names are used as
# gene names and each column becomes one LV.
dim(multiplier_z)

In [None]:
# Preview the first rows of the matrix.
head(multiplier_z)

# Prepare LVs list

In [None]:
# Build the per-LV gene statistics used as fgsea input: one named numeric
# vector per column of `multiplier_z`, stored under "LV<column index>".
# NOTE(review): naming by position assumes the columns are ordered LV1..LVn;
# confirm against the Z matrix file.
z_gene_names <- rownames(multiplier_z)

# seq_len() yields an empty sequence for a zero-column matrix, where
# 1:ncol() would iterate over c(1, 0).
lv_indices <- seq_len(ncol(multiplier_z))

lvs <- lapply(lv_indices, function(cidx) {
    lv_weights <- multiplier_z[, cidx]
    names(lv_weights) <- z_gene_names

    # keep only genes with a strictly positive weight in this LV
    min_weight <- 0.0
    lv_weights[lv_weights > min_weight]
})
names(lvs) <- paste0("LV", lv_indices)

# Compute enrichment on all LVs

In [None]:
# Number of times fgsea is run for each LV.
n_reps <- 10

In [None]:
# Fix R's random number generator state so that re-running the cells below in
# the same order starts from the same seed.
set.seed(0)

In [None]:
# Run fgsea for every LV against the combined lipid gene sets. Each LV is
# analysed `n_reps` times and every repetition is kept, tagged with the LV
# name (`lv`) and the repetition number (`rep_idx`).
# NOTE(review): the repetitions are presumably there because fgsea's p-value
# estimation is randomized - confirm against the fgsea documentation.
results <- vector("list", length(lvs))
names(results) <- names(lvs)

for (lv in names(lvs)) {
    repetitions <- vector("list", n_reps)

    # seq_len() is empty for n_reps == 0, where 1:n_reps would give c(1, 0)
    for (i in seq_len(n_reps)) {
        rep_res <- fgsea(
            pathways = deg_gene_sets,
            stats = lvs[[lv]],
            scoreType = "pos",
            eps = 0.0
        )[order(pval), ]

        # Collapse the list column into comma-separated strings so the table
        # can be written as TSV. vapply() always returns a character vector,
        # whereas sapply() returns a list for a zero-row result.
        rep_res[, "leadingEdge"] <- vapply(
            rep_res$leadingEdge,
            paste,
            character(1),
            collapse = ","
        )
        rep_res[, "lv"] <- lv
        rep_res[, "rep_idx"] <- i

        repetitions[[i]] <- rep_res
    }

    # one table per LV holding all of its repetitions
    results[[lv]] <- do.call(rbind, repetitions)
}

In [None]:
# Number of LVs with results.
length(results)

In [None]:
# Stack the per-LV tables into one table holding the rows of every LV and
# repetition.
df <- do.call(rbind, results)

In [None]:
# Dimensions of the combined results table.
dim(df)

In [None]:
# Preview the first rows of the combined results table.
head(df)

## Save

In [None]:
# Destination of the combined results inside the output directory.
output_file <- file.path(OUTPUT_DIR, "fgsea-all_lvs.tsv")
# Show the resolved path in the notebook output.
display(output_file)

In [None]:
# Write the results of all LVs and repetitions as a tab-separated file.
write_tsv(df, output_file)

# Quick analyses/tests

## See what one LV looks like

In [None]:
# All repetitions of LV100 for the "increase lipids" gene set, with the
# largest adjusted p-values first.
df %>%
    filter(lv == "LV100", pathway == "gene_set_increase") %>%
    arrange(desc(padj))

## Show significant LVs

In [None]:
# An LV / gene-set pair counts as significant only if its adjusted p-value is
# below 0.05 in every repetition, i.e. the maximum across repetitions is below
# the threshold. A pair with an NA adjusted p-value in any repetition gets an
# NA maximum and is dropped by filter().
df_signif <- df %>%
    group_by(lv, pathway) %>%
    summarize(max_padj = max(padj)) %>%
    # summarize() only peels off the last grouping level; drop the remaining
    # `lv` grouping so later verbs work on a plain, ungrouped table
    ungroup() %>%
    filter(max_padj < 0.05)

In [None]:
# Number of significant LV / gene-set pairs.
nrow(df_signif)

In [None]:
# Sanity check: stop unless more than 500 LV / gene-set pairs are significant.
# NOTE(review): the 500 threshold presumably reflects a previously observed
# run - confirm before relying on it with different inputs.
stopifnot(nrow(df_signif) > 500)

In [None]:
# Significant LV / gene-set pairs, smallest maximum adjusted p-value first.
arrange(df_signif, max_padj)