# Module loading

In [1]:
library(IRdisplay)
library(readr)
library(fgsea)
library(dplyr)
library(reticulate)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
pd <- import("pandas")

# Settings

In [3]:
CRISPR_DATA_DIR <- Sys.getenv("PHENOPLIER_CRISPR_BASE_DIR")

In [4]:
CRISPR_DATA_DIR

# Data loading

## Lipids gene sets

In [5]:
input_file <- Sys.getenv("PHENOPLIER_CRISPR_LIPIDS_GENE_SETS_FILE")
display(input_file)

In [6]:
all_genes_ranked <- read_csv(input_file)


[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  gene_name = [31mcol_character()[39m,
  GFPLow_vs_UnSorted.log2FC = [32mcol_double()[39m,
  GFPLow_vs_UnSorted.FDR = [32mcol_double()[39m,
  GFPLow_vs_UnSorted.DEG = [33mcol_logical()[39m,
  GFPHigh_vs_UnSorted.log2FC = [32mcol_double()[39m,
  GFPHigh_vs_UnSorted.FDR = [32mcol_double()[39m,
  GFPHigh_vs_UnSorted.DEG = [33mcol_logical()[39m,
  GFPHigh_vs_GFPLow.log2FC = [32mcol_double()[39m,
  GFPHigh_vs_GFPLow.FDR = [32mcol_double()[39m,
  GFPHigh_vs_GFPLow.DEG = [33mcol_logical()[39m,
  `lipid effect` = [31mcol_character()[39m,
  rank = [32mcol_double()[39m
)




In [7]:
orig_deg_gene_sets <- list()

for (r in unique(all_genes_ranked$rank)) {
    if (r == 0) {
        next
    }
    
    data <- all_genes_ranked[all_genes_ranked$rank == r,]
    #q <- quantile(data, 0.50, names=FALSE)
    
    orig_deg_gene_sets[[paste0("gene_set_", r)]] <- data$gene_name
}

In [8]:
length(orig_deg_gene_sets)

### Combine gene sets into "increase lipids" and "decrease lipids"

In [30]:
deg_gene_sets <- list()

In [31]:
# genes that increase lipids
deg_gene_sets[["gene_set_increase"]] <- c(
    orig_deg_gene_sets[["gene_set_2"]],
    orig_deg_gene_sets[["gene_set_3"]]
)

In [32]:
# genes that decrease lipids
deg_gene_sets[["gene_set_decrease"]] <- c(
    orig_deg_gene_sets[["gene_set_-2"]],
    orig_deg_gene_sets[["gene_set_-3"]]
)

In [33]:
length(deg_gene_sets)

In [37]:
length(deg_gene_sets[["gene_set_increase"]])

In [38]:
length(deg_gene_sets[["gene_set_decrease"]])

In [34]:
# test new increase set
new_set <- deg_gene_sets[["gene_set_increase"]]
expected_set <- union(
    orig_deg_gene_sets[["gene_set_2"]],
    orig_deg_gene_sets[["gene_set_3"]]
)

stopifnot(length(new_set) == length(unique(new_set)))

stopifnot(
    length(new_set) == 
    length(
        intersect(
            new_set,
            expected_set
        )
    )
)

In [39]:
# test new decrease set
new_set <- deg_gene_sets[["gene_set_decrease"]]
expected_set <- union(
    orig_deg_gene_sets[["gene_set_-2"]],
    orig_deg_gene_sets[["gene_set_-3"]]
)

stopifnot(length(new_set) == length(unique(new_set)))

stopifnot(
    length(new_set) == 
    length(
        intersect(
            new_set,
            expected_set
        )
    )
)

## MultiPLIER Z

In [10]:
multiplier_z = pd$read_pickle(
    Sys.getenv("PHENOPLIER_MULTIPLIER_MODEL_Z_MATRIX_FILE")
)

In [11]:
dim(multiplier_z)

In [12]:
head(multiplier_z)

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,⋯,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.0,0,0.03943774,0,0.05047625,0.0,0.0,0.0,0.5909494,0.0,⋯,0.0501251,0.0,0.033407371,0.0,0.0,0.0059633917,0.34736209,0,0.0,0.0
MMP14,0.0,0,0.0,0,0.07007159,0.0,0.0,0.004904131,1.7201788,2.42359463,⋯,0.0,0.0,0.001007286,0.0,0.03574724,0.0,0.0,0,0.01497801,0.0
DSP,0.0,0,0.0,0,0.0,0.04169683,0.0,0.005718149,0.0,0.0,⋯,0.02085321,0.0,0.0,0.0,0.0,0.0057744399,0.0,0,0.0,0.41640455
MARCKSL1,0.3052117,0,0.0,0,0.0,0.0,0.0,0.0,0.1618435,0.14947148,⋯,0.02713418,0.05271997,0.0,0.03018947,0.06088351,0.0,0.0,0,0.0,0.44847996
SPARC,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.01401441,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.06777859,0,0.12241734,0.0626649
CTSD,0.2778532,0,0.06459781,0,0.0,0.0,0.4694809,0.099949232,0.9717295,0.43334911,⋯,0.0,0.01907577,0.008904815,0.0,0.0,0.0007222884,0.0,0,0.0,0.06193245


# Prepare LVs list

In [13]:
lvs = list()
z_gene_names <- rownames(multiplier_z)

for (cidx in 1:ncol(multiplier_z)) {
    data <- multiplier_z[, cidx]
    names(data) <- z_gene_names
    # q <- quantile(data, 0.75, names=FALSE)
    q <- 0.0
    
    lvs[[paste0("LV", cidx)]] <- data[data > q]
}

# Compute enrichment on all LVs

In [14]:
n_reps = 10

In [15]:
set.seed(0)

In [16]:
results = list()

for (lv in names(lvs)) {
    repetitions = list()
    
    for (i in 1:n_reps) {
        rep_res <- fgsea(pathways = deg_gene_sets, stats = lvs[[lv]], scoreType = "pos", eps = 0.0)[order(pval), ]
        rep_res[, "leadingEdge"] <- sapply(rep_res$leadingEdge, paste, collapse=",")
        rep_res[, "lv"] <- lv
        rep_res[, "rep_idx"] <- i
        
        repetitions[[i]] <- rep_res
    }
    
    res <- do.call(rbind, repetitions)

    results[[lv]] <- res
}

In [17]:
length(results)

In [18]:
df <- do.call(rbind, results)

In [19]:
dim(df)

In [20]:
head(df)

pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<int>
gene_set_decrease_-2_and_-3,0.1848152,0.2357642,0.09592068,0.5784179,1.126666,35,"PTBP1,KEAP1,PEX14,DLST,PCYT2,MAD2L2,GLRX5,OGDH,UBE2J2,CSK",LV1,1
gene_set_increase_2_and_3,0.2357642,0.2357642,0.08220549,0.5391549,1.076511,63,"CHERP,RANGAP1,HNRNPL,RPS2,E4F1,TAF1C,GATAD2A,SAFB,TAF6,FBL,LSM4,SUPT5H,CHD4,PFDN6,SLC35B2,USP39,POLR3E,POLR2C,RPLP0,SREBF2,ACACA,RPL18",LV1,1
gene_set_decrease_-2_and_-3,0.1988012,0.2517483,0.09167952,0.5784179,1.123767,35,"PTBP1,KEAP1,PEX14,DLST,PCYT2,MAD2L2,GLRX5,OGDH,UBE2J2,CSK",LV1,2
gene_set_increase_2_and_3,0.2517483,0.2517483,0.07871138,0.5391549,1.078565,63,"CHERP,RANGAP1,HNRNPL,RPS2,E4F1,TAF1C,GATAD2A,SAFB,TAF6,FBL,LSM4,SUPT5H,CHD4,PFDN6,SLC35B2,USP39,POLR3E,POLR2C,RPLP0,SREBF2,ACACA,RPL18",LV1,2
gene_set_decrease_-2_and_-3,0.1628372,0.2157842,0.10357633,0.5784179,1.13718,35,"PTBP1,KEAP1,PEX14,DLST,PCYT2,MAD2L2,GLRX5,OGDH,UBE2J2,CSK",LV1,3
gene_set_increase_2_and_3,0.2157842,0.2157842,0.08705159,0.5391549,1.087494,63,"CHERP,RANGAP1,HNRNPL,RPS2,E4F1,TAF1C,GATAD2A,SAFB,TAF6,FBL,LSM4,SUPT5H,CHD4,PFDN6,SLC35B2,USP39,POLR3E,POLR2C,RPLP0,SREBF2,ACACA,RPL18",LV1,3


## Save

In [22]:
output_file <- file.path(CRISPR_DATA_DIR, "fsgea-all_lvs.tsv")
display(output_file)

In [23]:
write_tsv(df, output_file)

# Quick analyses

## See how one LV looks like

In [36]:
df %>% filter(lv == "LV100" & pathway == "gene_set_increase_2_and_3") %>% arrange(desc(padj))

pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<int>
gene_set_increase_2_and_3,0.1048951,0.1048951,0.133555,0.7069861,1.480363,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,3
gene_set_increase_2_and_3,0.1028971,0.1028971,0.135002,0.7069861,1.478466,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,5
gene_set_increase_2_and_3,0.1028971,0.1028971,0.135002,0.7069861,1.481544,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,7
gene_set_increase_2_and_3,0.1018981,0.1018981,0.1357409,0.7069861,1.461778,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,6
gene_set_increase_2_and_3,0.1018981,0.1018981,0.1357409,0.7069861,1.477454,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,9
gene_set_increase_2_and_3,0.0979021,0.0979021,0.1388051,0.7069861,1.494823,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,8
gene_set_increase_2_and_3,0.09490509,0.09490509,0.1412251,0.7069861,1.504016,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,4
gene_set_increase_2_and_3,0.08991009,0.08991009,0.1455161,0.7069861,1.482328,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,1
gene_set_increase_2_and_3,0.08891109,0.08891109,0.1464162,0.7069861,1.484432,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,2
gene_set_increase_2_and_3,0.08591409,0.08591409,0.1492075,0.7069861,1.490424,40,"PCBP1,IGF1R,UXT,SNRPD1,GTF2H1",LV100,10


## Show significant LVs

In [44]:
df_signif <- df %>% group_by(lv, pathway) %>% summarize(max_padj = max(padj)) %>% filter(max_padj < 0.05)

`summarise()` has grouped output by 'lv'. You can override using the `.groups` argument.



In [None]:
length(df_signif)

In [52]:
df_signif %>% arrange(max_padj)

lv,pathway,max_padj
<chr>,<chr>,<dbl>
LV707,gene_set_increase_2_and_3,3.477712e-07
LV678,gene_set_decrease_-2_and_-3,5.210582e-07
LV905,gene_set_increase_2_and_3,8.587291e-05
LV915,gene_set_increase_2_and_3,0.0001274721
LV750,gene_set_increase_2_and_3,0.001126546
LV341,gene_set_increase_2_and_3,0.001566165
LV64,gene_set_decrease_-2_and_-3,0.002411481
LV897,gene_set_decrease_-2_and_-3,0.00322143
LV575,gene_set_decrease_-2_and_-3,0.008551656
LV310,gene_set_increase_2_and_3,0.00914654
