# Module loading

In [1]:
library(IRdisplay)
library(readr)
library(fgsea)
library(dplyr)
library(tidyverse)
library(reticulate)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.1     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
# Load the Python pandas module through reticulate.
pd <- reticulate::import("pandas")

# Settings

In [3]:
# Base directory for the results of this notebook, read from the environment.
OUTPUT_DIR <- Sys.getenv("PHENOPLIER_RESULTS_CRISPR_ANALYSES_BASE_DIR")

# Sys.getenv() returns "" for an unset variable; stop here instead of later
# building output paths from an empty directory name.
if (!nzchar(OUTPUT_DIR)) {
    stop(
        "Environment variable PHENOPLIER_RESULTS_CRISPR_ANALYSES_BASE_DIR is not set",
        call. = FALSE
    )
}

In [4]:
# Echo the resolved output directory.
OUTPUT_DIR

In [5]:
# Create the output directory together with any missing parents. An existing
# directory is normal on re-runs, so the "already exists" warning is silenced;
# the explicit check below still catches a creation that really failed.
dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)
stopifnot(dir.exists(OUTPUT_DIR))

“'/home/miltondp/projects/labs/greenelab/phenoplier/base/results/crispr_analyses' already exists”


# Data loading

## Lipids gene sets

In [6]:
# Path of the lipids gene sets file, read from the environment and echoed.
input_file <- Sys.getenv("PHENOPLIER_CRISPR_LIPIDS_GENE_SETS_FILE")
display(input_file)

In [7]:
# Read the gene table as CSV; column types are guessed by readr.
all_genes_ranked <- read_csv(input_file)


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  gene_name = [31mcol_character()[39m,
  GFPLow_vs_UnSorted.log2FC = [32mcol_double()[39m,
  GFPLow_vs_UnSorted.FDR = [32mcol_double()[39m,
  GFPLow_vs_UnSorted.DEG = [33mcol_logical()[39m,
  GFPHigh_vs_UnSorted.log2FC = [32mcol_double()[39m,
  GFPHigh_vs_UnSorted.FDR = [32mcol_double()[39m,
  GFPHigh_vs_UnSorted.DEG = [33mcol_logical()[39m,
  GFPHigh_vs_GFPLow.log2FC = [32mcol_double()[39m,
  GFPHigh_vs_GFPLow.FDR = [32mcol_double()[39m,
  GFPHigh_vs_GFPLow.DEG = [33mcol_logical()[39m,
  `lipid effect` = [31mcol_character()[39m,
  rank = [32mcol_double()[39m
)




In [8]:
# Group gene names by their `rank` value: one gene set per distinct non-zero
# rank, stored under the name "gene_set_<rank>".
orig_deg_gene_sets <- list()

for (r in unique(all_genes_ranked$rank)) {
    # Rank 0 is excluded. A missing rank is skipped too: `NA == 0` is NA and
    # would make the `if` condition fail with an error.
    if (is.na(r) || r == 0) {
        next
    }

    # which() discards NA comparisons, so rows with a missing rank cannot end
    # up in a gene set as NA entries.
    in_rank <- which(all_genes_ranked$rank == r)

    orig_deg_gene_sets[[paste0("gene_set_", r)]] <-
        all_genes_ranked$gene_name[in_rank]
}

In [9]:
# Number of gene sets built from the distinct non-zero ranks.
length(orig_deg_gene_sets)

### Combine gene sets into "increase lipids" and "decrease lipids"

In [10]:
# Container for the combined gene sets that are passed to fgsea.
deg_gene_sets <- list()

In [11]:
# Genes that increase lipids: only the rank 3 set is used.
# (Adding orig_deg_gene_sets[["gene_set_2"]] as well is currently disabled.)
increase_genes <- orig_deg_gene_sets[["gene_set_3"]]
deg_gene_sets[["gene_set_increase"]] <- increase_genes

In [12]:
# Genes that decrease lipids: only the rank -3 set is used.
# (Adding orig_deg_gene_sets[["gene_set_-2"]] as well is currently disabled.)
decrease_genes <- orig_deg_gene_sets[["gene_set_-3"]]
deg_gene_sets[["gene_set_decrease"]] <- decrease_genes

In [13]:
# Number of combined gene sets.
length(deg_gene_sets)

In [14]:
# Size of the "increase lipids" gene set.
length(deg_gene_sets[["gene_set_increase"]])

In [15]:
# Guard against input changes: the "increase" set is expected to hold 6 genes.
stopifnot(length(deg_gene_sets[["gene_set_increase"]]) == 6)

In [16]:
# Size of the "decrease lipids" gene set.
length(deg_gene_sets[["gene_set_decrease"]])

In [17]:
# Guard against input changes: the "decrease" set is expected to hold 8 genes.
stopifnot(length(deg_gene_sets[["gene_set_decrease"]]) == 8)

In [18]:
# Sanity-check the "increase" set: it must contain no duplicates, and all of
# its genes must also be present in the rank 3 set.
new_set <- deg_gene_sets[["gene_set_increase"]]
expected_set <- orig_deg_gene_sets[["gene_set_3"]]

n_new <- length(new_set)
n_shared <- length(intersect(new_set, expected_set))

stopifnot(n_new == length(unique(new_set)))
stopifnot(n_new == n_shared)

In [19]:
# Sanity-check the "decrease" set: it must contain no duplicates, and all of
# its genes must also be present in the rank -3 set.
new_set <- deg_gene_sets[["gene_set_decrease"]]
expected_set <- orig_deg_gene_sets[["gene_set_-3"]]

n_new <- length(new_set)
n_shared <- length(intersect(new_set, expected_set))

stopifnot(n_new == length(unique(new_set)))
stopifnot(n_new == n_shared)

## MultiPLIER Z

In [20]:
# Read the pickled Z matrix with pandas; the file path comes from the
# environment.
z_matrix_file <- Sys.getenv("PHENOPLIER_MULTIPLIER_MODEL_Z_MATRIX_FILE")
multiplier_z <- pd$read_pickle(z_matrix_file)

In [21]:
# Dimensions (rows, columns) of the Z matrix.
dim(multiplier_z)

In [22]:
# Preview the first rows of the Z matrix.
head(multiplier_z)

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,⋯,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.0,0,0.03943774,0,0.05047625,0.0,0.0,0.0,0.5909494,0.0,⋯,0.0501251,0.0,0.033407371,0.0,0.0,0.0059633917,0.34736209,0,0.0,0.0
MMP14,0.0,0,0.0,0,0.07007159,0.0,0.0,0.004904131,1.7201788,2.42359463,⋯,0.0,0.0,0.001007286,0.0,0.03574724,0.0,0.0,0,0.01497801,0.0
DSP,0.0,0,0.0,0,0.0,0.04169683,0.0,0.005718149,0.0,0.0,⋯,0.02085321,0.0,0.0,0.0,0.0,0.0057744399,0.0,0,0.0,0.41640455
MARCKSL1,0.3052117,0,0.0,0,0.0,0.0,0.0,0.0,0.1618435,0.14947148,⋯,0.02713418,0.05271997,0.0,0.03018947,0.06088351,0.0,0.0,0,0.0,0.44847996
SPARC,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.01401441,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.06777859,0,0.12241734,0.0626649
CTSD,0.2778532,0,0.06459781,0,0.0,0.0,0.4694809,0.099949232,0.9717295,0.43334911,⋯,0.0,0.01907577,0.008904815,0.0,0.0,0.0007222884,0.0,0,0.0,0.06193245


# Prepare LVs list

In [23]:
# Build a named list with one entry per column of the Z matrix, named by
# column position ("LV1", "LV2", ...). Each entry holds all values of that
# column (no filtering), named with the matrix row names.
z_gene_names <- rownames(multiplier_z)

# seq_len() gives an empty sequence for a zero-column input, whereas
# 1:ncol() would iterate over c(1, 0).
lv_idxs <- seq_len(ncol(multiplier_z))

lvs <- lapply(lv_idxs, function(cidx) {
    setNames(multiplier_z[, cidx], z_gene_names)
})
names(lvs) <- paste0("LV", lv_idxs)

In [24]:
# Show how many LVs were collected and require the expected count of 987.
display(length(lvs))
stopifnot(length(lvs) == 987)

# Compute enrichment on all LVs

In [25]:
# Number of repetitions.
n_reps <- 10

In [26]:
# Fix the RNG seed.
# NOTE(review): presumably meant to make the repeated fgsea runs reproducible;
# confirm this also holds if fgsea runs with parallel workers.
set.seed(0)

In [27]:
# Run fgsea for every LV against the gene sets in `deg_gene_sets`, repeating
# the run `n_reps` times per LV. Each result row is tagged with the LV name
# (`lv`) and the repetition index (`rep_idx`). `results` maps each LV name to
# the row-bound table of all its repetitions.
results <- list()

for (lv in names(lvs)) {
    # seq_len() keeps this empty when n_reps is 0; 1:n_reps would run for
    # i = 1 and i = 0.
    repetitions <- lapply(seq_len(n_reps), function(i) {
        rep_res <- fgsea(
            pathways = deg_gene_sets,
            stats = lvs[[lv]],
            scoreType = "pos",
            eps = 0.0
        )

        # Order the pathways of this repetition by p-value, smallest first.
        rep_res <- rep_res[order(pval), ]
        rep_res[, "lv"] <- lv
        rep_res[, "rep_idx"] <- i

        rep_res
    })

    results[[lv]] <- do.call(rbind, repetitions)
}

In [28]:
# Number of LVs that have results.
length(results)

In [29]:
# Stack the per-LV result tables into a single table.
df <- do.call(rbind, results)

In [30]:
# Collapse each leadingEdge entry into one comma-separated string
# (toString() joins the elements with ", ").
df <- mutate(
    df,
    leadingEdge = map_chr(leadingEdge, toString)
)

In [31]:
# Dimensions (rows, columns) of the combined results table.
dim(df)

In [32]:
# Preview the first rows of the combined results table.
head(df)

pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<int>
gene_set_decrease,0.0959041,0.1918082,0.14040624,0.9095092,1.257073,5,"PCYT2, UBE2J2, FBXW7",LV1,1
gene_set_increase,0.2287712,0.2287712,0.08383611,0.8402528,1.286063,3,"ACACA, MBTPS1",LV1,1
gene_set_decrease,0.1018981,0.2037962,0.13574094,0.9095092,1.25356,5,"PCYT2, UBE2J2, FBXW7",LV1,2
gene_set_increase,0.2857143,0.2857143,0.0721798,0.8402528,1.258282,3,"ACACA, MBTPS1",LV1,2
gene_set_decrease,0.0979021,0.1958042,0.13880511,0.9095092,1.260276,5,"PCYT2, UBE2J2, FBXW7",LV1,3
gene_set_increase,0.2667333,0.2667333,0.07569463,0.8402528,1.257656,3,"ACACA, MBTPS1",LV1,3


## Save

In [33]:
# Build the path of the results file inside OUTPUT_DIR and echo it.
output_file <- file.path(OUTPUT_DIR, "fgsea-hi_conf-all_lvs.tsv")
display(output_file)

In [34]:
# Write the combined results table as a tab-separated file.
write_tsv(df, output_file)

# Quick analyses/tests

## See what one LV looks like

In [35]:
# Show all rows for LV100 on the "increase" gene set, sorted by adjusted
# p-value in descending order.
df %>%
    filter(lv == "LV100", pathway == "gene_set_increase") %>%
    arrange(desc(padj))

pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<int>
gene_set_increase,0.8631369,0.9080919,0.01816749,0.5036312,0.7534076,3,"ACACA, MBTPS1, DGAT2",LV100,1
gene_set_increase,0.8671329,0.9070929,0.01785899,0.5036312,0.754264,3,"ACACA, MBTPS1, DGAT2",LV100,8
gene_set_increase,0.8571429,0.9030969,0.01862588,0.5036312,0.7604084,3,"ACACA, MBTPS1, DGAT2",LV100,7
gene_set_increase,0.8681319,0.9020979,0.01778148,0.5036312,0.75765,3,"ACACA, MBTPS1, DGAT2",LV100,9
gene_set_increase,0.8551449,0.9010989,0.01877759,0.5036312,0.7623073,3,"ACACA, MBTPS1, DGAT2",LV100,5
gene_set_increase,0.8501499,0.8961039,0.01915466,0.5036312,0.76201,3,"ACACA, MBTPS1, DGAT2",LV100,2
gene_set_increase,0.8551449,0.8961039,0.01877759,0.5036312,0.7678539,3,"ACACA, MBTPS1, DGAT2",LV100,3
gene_set_increase,0.8591409,0.8951049,0.01847364,0.5036312,0.7515552,3,"ACACA, MBTPS1, DGAT2",LV100,10
gene_set_increase,0.8701299,0.8931069,0.01762598,0.5036312,0.7535733,3,"ACACA, MBTPS1, DGAT2",LV100,4
gene_set_increase,0.8661339,0.8881119,0.01793635,0.5036312,0.7594586,3,"ACACA, MBTPS1, DGAT2",LV100,6


## Show significant LVs

In [36]:
# For each (lv, pathway) pair take the largest p-value over its rows, and keep
# only the pairs where that largest value is below 0.05.
df_signif <- df %>%
    group_by(lv, pathway) %>%
    # Drop the grouping explicitly: otherwise the result stays grouped by `lv`
    # and summarise() prints a regrouping message.
    summarize(max_pval = max(pval), .groups = "drop") %>%
    filter(max_pval < 0.05)

`summarise()` has grouped output by 'lv'. You can override using the `.groups` argument.



In [37]:
# Number of (lv, pathway) pairs that passed the 0.05 threshold.
nrow(df_signif)

In [38]:
# Sanity check: more than 50 significant (lv, pathway) pairs are expected.
stopifnot(nrow(df_signif) > 50)

In [39]:
# List the significant (lv, pathway) pairs, smallest max p-value first.
arrange(df_signif, max_pval)

lv,pathway,max_pval
<chr>,<chr>,<dbl>
LV520,gene_set_decrease,0.0005541142
LV801,gene_set_decrease,0.0022014036
LV512,gene_set_decrease,0.0024603865
LV246,gene_set_increase,0.0035333199
LV612,gene_set_decrease,0.0035703176
LV41,gene_set_decrease,0.0040527472
LV702,gene_set_increase,0.0045732698
LV607,gene_set_increase,0.0058374008
LV890,gene_set_increase,0.0067297311
LV838,gene_set_decrease,0.0069528139
