## Setup para poder rodar as funções: declarações de variáveis etc.

In [1]:
# Names of the columns used in the differential-expression table
gene_col<-"Gene.symbol"
logFC_col<-"logFC"
pvalue_col<-"P.Value"
# Smallest and largest "top genes" cut
min_genes<-100
max_genes<-1500
# Input files: tab-separated differential-expression table and GMT gene sets
file_in <- "GSE49757_Septic_vs_Healthy.txt"
gmt_file <- "Reactome_2016_15and100Genes.gmt"
# stringsAsFactors is set explicitly: on R < 4.0 the default turns text
# columns into factors, and as.numeric() on a factor returns level codes
# instead of the parsed values.
deg_list <- read.csv(file_in, header = TRUE, sep = "\t",
                     stringsAsFactors = FALSE)
# Keep only rows that have a gene symbol
deg_list <- deg_list[which(deg_list[, gene_col]!=""), ]
    ## Make logFC and p-value columns numeric
deg_list[, logFC_col] <- as.numeric(deg_list[, logFC_col])
deg_list[, pvalue_col] <- as.numeric(deg_list[, pvalue_col])

# Declarando funções

## .get_pathway

In [1]:
# Run the enrichment over increasing "top gene" cuts for one direction and
# summarise, per pathway, how its adjusted p-value behaves across the cuts.
#
# Arguments:
#   merge_p     data frame with a "term" column; one column of p-values is
#               merged into it for every cut.
#   term2gene   term-to-gene table handed to .run_enrich().
#   all_genes   background genes handed to .run_enrich().
#   deg_list    differential-expression table (data frame).
#   gene_col, logFC_col, pvalue_col
#               names of the gene, log fold-change and p-value columns.
#   direction   "up", "down" or "any" (case-insensitive); anything else stops
#               with "Invalid direction argument".
#   min_genes, max_genes
#               first and last cut size; cuts advance in steps of 50.
#   p_cut       p-value threshold; a cut counts as significant for a pathway
#               when its -log10(p) is above -log10(p_cut).
#
# Returns a list of two data frames (both elements are named "data.frame",
# so access them by position): [[1]] the per-pathway summary, [[2]] the
# pathway x cut table of -log10 p-values.
.get_pathway <- function(merge_p, term2gene, all_genes, deg_list,
                        gene_col, logFC_col, pvalue_col, direction,
                        min_genes, max_genes, p_cut){

    fold_change <- deg_list[, logFC_col]
    # "any" ranks by absolute fold change, largest first, so that the most
    # changed genes are kept regardless of sign.
    gene_order <- switch(tolower(direction),
                         up   = order(fold_change, decreasing=TRUE),
                         down = order(fold_change, decreasing=FALSE),
                         any  = order(abs(fold_change), decreasing=TRUE),
                         stop("Invalid direction argument"))
    top <- deg_list[head(gene_order, n=max_genes), ]

    # add pi_value
    top$pi_value <- abs(top[, logFC_col]) * -log10(top[, pvalue_col])

    # order pi_value
    top <- top[order(top$pi_value, decreasing=TRUE), ]

    for (i in seq(from=min_genes, to=max_genes, by=50)) {
        # head() avoids NA gene names when fewer than i genes are available
        top_genes <- as.character(head(top[, gene_col], n=i))

        pathG <- .run_enrich(top_genes, all_genes, term2gene)

        # .run_enrich() returns the whole enrichment table; only the term ID
        # and its adjusted p-value are merged. Selecting by name keeps this
        # independent of the column order of the enrichment result.
        if (ncol(pathG) == 0) {
            # Nothing came back for this cut: merge an empty table so every
            # term receives the default p-value of 1 below.
            pathG <- data.frame(ID=character(0), p.adjust=numeric(0),
                                stringsAsFactors=FALSE)
        } else {
            pathG <- pathG[, c("ID", "p.adjust"), drop=FALSE]
        }
        colnames(pathG) <- c("term", as.character(i))

        merge_p <- merge(merge_p, pathG, by="term", all=TRUE)

        # Terms absent from this cut's result get p = 1
        merge_p[is.na(merge_p)] <- 1
    }

    rownames(merge_p) <- merge_p[, 1]
    merge_p           <- merge_p[, -1]

    merge_p2          <- log10(merge_p)*-1

    path_cut_p <- log10(p_cut)*-1

    df <- data.frame(matrix(0, nrow(merge_p2), ncol=0))
    rownames(df) <- rownames(merge_p2)
    # Cut (number of genes) at which -log10(p) is highest
    df$NG <- as.numeric(colnames(merge_p2)[apply(merge_p2, 1, which.max)])
    # Highest and summed -log10(p) over all cuts
    df$p  <- as.numeric(apply(merge_p2, 1, max))
    df$P  <- as.numeric(apply(merge_p2, 1, sum))

    # Fraction of cuts above path_cut_p
    df$times <- as.numeric(apply(merge_p2, 1,
                                 function(x) sum(x > path_cut_p, na.rm=TRUE)))/ncol(merge_p2)

    # First cut above path_cut_p, or 0 when no cut is above it
    df$first <- apply(merge_p2, 1, function(x) {
        hits <- which(x > path_cut_p)
        if (length(hits) > 0) {
            as.numeric(colnames(merge_p2)[min(hits)])
        } else {
            0
        }
    })

    #ES3
    df$ES3 <- (1 - exp(-df$p))/(1 + 0.1*sqrt(df$NG))

    # Only the -log10(p) table is reordered (by first significant cut);
    # the summary table keeps its row order.
    merge_p2 <- merge_p2[order(df[, "first"], decreasing=TRUE), ]

    colnames(df) <- paste0(c("TopCut_highestMinuslogP",
                             "maximum_MinuslogP",
                             "sum_MinuslogP",
                             "times_significant",
                             "FirstTopCut_significant",
                             "PEBBA_score"),
                           "_", direction)

    newList <- list("data.frame" = df, "data.frame" = merge_p2)
    return(newList)
}

## Funções auxiliares: .cutoff_path, .run_enrich, .get_cutoff e read_gmt_hier

In [2]:
# Summarise every column of a table of -log10 p-values.
#
# Arguments:
#   path_table  table with one column per item to summarise.
#   p_cut       p-value threshold; values above -log10(p_cut) are counted.
#   direction   label appended to the names of the returned columns.
#
# Returns a data frame with one row per column of path_table (row names are
# the column names of path_table) holding the column maximum, the column sum
# and the fraction of rows above the threshold.
.cutoff_path <- function(path_table, p_cut, direction){
    threshold <- -1 * log10(p_cut)
    n_rows <- nrow(path_table)

    col_max  <- as.numeric(apply(path_table, 2, max))
    col_sum  <- as.numeric(apply(path_table, 2, sum))
    # Number of entries per column that exceed the threshold
    col_hits <- as.numeric(apply(path_table, 2,
                                 function(v) sum(v > threshold, na.rm=TRUE)))

    summary_df <- data.frame(col_max, col_sum, col_hits / n_rows,
                             row.names=colnames(path_table))
    colnames(summary_df) <- paste0(c("maximum_MinuslogP_",
                                     "sum_MinuslogP_",
                                     "times_significant_"),
                                   direction)
    summary_df
}




In [3]:
# Run clusterProfiler::enricher() for a set of genes and return its result
# as a data frame.
#
# Arguments:
#   top_genes  genes to test.
#   all_genes  background ("universe") genes.
#   term2gene  term-to-gene table.
#
# Both the p-value and q-value cutoffs are 1, presumably so that no term is
# dropped on significance; gene-set sizes are limited to 1..100.
# The complete table is returned: the former `[, c(1, 6)]` column subset is
# disabled, so callers pick the columns they need.
.run_enrich <- function(top_genes, all_genes, term2gene){
    enrich_result <- clusterProfiler::enricher(
        gene = top_genes,
        universe = all_genes,
        TERM2GENE = term2gene,
        minGSSize = 1,
        maxGSSize = 100,
        pvalueCutoff = 1,
        qvalueCutoff = 1
    )
    as.data.frame(enrich_result)
}



# For every "top genes" cut, report the weakest log fold-change, the weakest
# -log10(p-value) and the weakest pi-value among the selected genes, for the
# down- and up-regulated sides and for both combined.
#
# Arguments:
#   deg_list    differential-expression table (data frame).
#   logFC_col, pvalue_col
#               names of the log fold-change and p-value columns.
#   min_genes, max_genes
#               first and last cut size; cuts advance in steps of 50.
#
# Returns a data frame with one row per cut (row names are the cut sizes)
# and the columns TopCut, minimum_{log2fc,MinuslogP,Pi}_{down,up,combined}.
# A cut larger than the number of available genes yields NA for that row.
.get_cutoff <- function(deg_list, logFC_col, pvalue_col, min_genes, max_genes){
    dirs <- c("down", "up")
    top_cut <- seq(from=min_genes, to=max_genes, by=50)

    res <- lapply(dirs, function(direction){
        gene_order <- order(deg_list[, logFC_col],
                            decreasing=(direction == "up"))
        top <- deg_list[head(gene_order, n=max_genes),
                        c(logFC_col, pvalue_col)]
        #Add pi_value
        top$pi_value <- abs(top[, logFC_col]) * -log10(top[, pvalue_col])
        #Order pi_value
        top <- top[order(top$pi_value, decreasing=TRUE), ]

        # One single-row data frame per cut, bound once at the end
        per_cut <- lapply(top_cut, function(i){
            top_genes <- top[seq_len(i), ]
            data.frame(minFC=min(abs(top_genes[, logFC_col])),
                       minP=-log10(max(top_genes[, pvalue_col])),
                       # rows are sorted by decreasing pi, so row i is the minimum
                       minPi=top_genes[i, "pi_value"])
        })
        do.call(rbind, per_cut)
    })
    names(res) <- dirs
    res <- do.call("cbind", res)
    res <- cbind(top_cut, res)

    names(res) <- c("TopCut", "minimum_log2fc_down", "minimum_MinuslogP_down",
                    "minimum_Pi_down", "minimum_log2fc_up", "minimum_MinuslogP_up",
                    "minimum_Pi_up")

    # Combine like with like. Columns are addressed by name: positional
    # indices previously paired log2fc with MinuslogP, MinuslogP with Pi and
    # Pi with the combined log2fc.
    res$minimum_log2fc_combined    <- pmin(res$minimum_log2fc_down,
                                           res$minimum_log2fc_up)
    res$minimum_MinuslogP_combined <- pmin(res$minimum_MinuslogP_down,
                                           res$minimum_MinuslogP_up)
    res$minimum_Pi_combined        <- pmin(res$minimum_Pi_down,
                                           res$minimum_Pi_up)

    rownames(res) <- res[, 1]
    return(res)
}

# Read a GMT file (one gene set per line: name, description, then genes,
# all tab-separated).
#
# Arguments:
#   fname  path of the GMT file.
#
# Returns an unnamed list of two elements:
#   [[1]] data frame with the columns term (factor), gene and hier (the
#         description field), one row per term/gene pair;
#   [[2]] data frame with the columns gmt_names and gmt_desc, one row per
#         gene set read from the file.
# Blank lines are ignored. Gene sets without any gene are kept in [[2]] but
# contribute no rows to [[1]]. Stops when no gene set with genes is found.
read_gmt_hier <- function(fname){
    gmt_lines <- readLines(fname)
    # Blank lines would otherwise become gene sets named NA
    gmt_lines <- gmt_lines[nzchar(trimws(gmt_lines))]

    gmt_list <- strsplit(gmt_lines, split="\t")
    gmt_names <- vapply(gmt_list, `[`, character(1), 1)
    gmt_desc <- lapply(gmt_list, `[`, 2)
    gmt_genes <- lapply(gmt_list, function(x){
        # Negative indexing returns an empty vector for lines with fewer than
        # three fields, whereas 3:length(x) would count down and pick up NA
        # and the description as "genes".
        genes <- x[-(1:2)]
        genes[!is.na(genes) & nzchar(genes)]
    })
    names(gmt_desc) <- names(gmt_genes) <- gmt_names

    # unname() keeps the automatic 1..N row names in the bound table
    with_genes <- unname(which(lengths(gmt_genes) > 0))
    if(length(with_genes) == 0){
        stop("No gene sets with genes found in ", fname)
    }
    res <- do.call(rbind, lapply(with_genes, function(k){
        cbind.data.frame(term=gmt_names[k], gene=gmt_genes[[k]],
                         hier=gmt_desc[[k]], stringsAsFactors=FALSE)
    }))
    res$term <- as.factor(res$term)
    path_desc <- as.data.frame(cbind(gmt_names, gmt_desc))
    return(list(res, path_desc))
}
                                   

## pebba

In [4]:

# Entry point of the analysis (work in progress): reads the gene sets and the
# differential-expression table, computes the cut-off table and runs the
# pathway scoring for the "down" direction.
#
# Arguments:
#   file_in        path of a tab-separated differential-expression table, or
#                  a data frame holding it.
#   gmt_file       path of the GMT gene-set file.
#   gene_col, logFC_col, pvalue_col
#                  names of the gene, log fold-change and p-value columns.
#   min_genes      first cut size (50..2900).
#   max_genes      last cut size (100..3000).
#   p_cut          p-value threshold (0.00001..1).
#   verbose        emit progress messages?
#   analysis_name  analysis label; when NULL it is taken from the input file
#                  name, or "PEBBA_analysis" for data-frame input.
#   results_dir    output directory, created with the sub-directories
#                  "Tables" and "Heatmaps".
#   force          continue when results_dir already exists?
#
# Returns a list of length one holding the value of .get_pathway() for the
# "down" direction.
# NOTE(review): the cut-off table and analysis_name are computed but not yet
# part of the result, and the per-pathway cut-off step (.cutoff_path) is not
# run; it used to sit behind an early return. The return value is kept as it
# was so existing calls see the same result.
pebba <- function(file_in, gmt_file, gene_col="Gene.symbol",
                  logFC_col="logFC", pvalue_col="P.Value",
                  min_genes=100, max_genes=1500,
                  p_cut=0.2, verbose=TRUE,
                  analysis_name=NULL, results_dir="Results",
                  force=FALSE){

    # Validating inputs
    if(min_genes < 50 || min_genes > 2900){
        stop("Variable min_genes must be between 50 and 2900 genes")
    }
    if(max_genes < 100 || max_genes > 3000){
        stop("Variable max_genes must be between 100 and 3000 genes")
    }
    if(p_cut < 0.00001 || p_cut > 1){
        stop("Variable p_cut must be between 0.00001 and 1")
    }
    if(min_genes > max_genes){
        stop("Variable min_genes must not be greater than max_genes")
    }
    # Checked before anything is created on disk
    if(!is.character(file_in) && !is.data.frame(file_in)){
        stop("Argument file_in must be a file path or a data.frame")
    }

    # Preparing files and workspace
    ## Disable scientific notation while the analysis runs; the caller's
    ## setting is restored on exit.
    old_opts <- options(scipen=999)
    on.exit(options(old_opts), add=TRUE)

    ## Create a results directory
    if(dir.exists(results_dir) && !force){
        stop("Stopping analysis: ", results_dir,
             " already exists! Use force=TRUE to overwrite.")
    }
    ## With force=TRUE the directory may exist without its sub-directories,
    ## so each one is created when missing.
    for(out_dir in c(results_dir,
                     file.path(results_dir, "Tables"),
                     file.path(results_dir, "Heatmaps"))){
        if(!dir.exists(out_dir)){
            dir.create(out_dir)
        }
    }

    ## Get information from all unique terms
    gmt_res <- read_gmt_hier(gmt_file)
    term2gene <- gmt_res[[1]]
    path_desc <- gmt_res[[2]]

    ## One row per unique term; p-value columns are merged into it later
    merge_p <- data.frame(unique(term2gene[1]))

    if(is.character(file_in)){
        deg_list <- read.csv(file_in, header = TRUE, sep = "\t",
                             stringsAsFactors = FALSE)
        if(is.null(analysis_name)){
            analysis_name <- tools::file_path_sans_ext(basename(file_in))
        }
    }else{
        deg_list <- file_in
        if(is.null(analysis_name)){
            analysis_name <- "PEBBA_analysis"
        }
    }

    ## Remove rows that do not have a valid gene symbol
    deg_list <- deg_list[which(deg_list[, gene_col]!=""), ]
    ## Make logFC and p-value columns numeric; factors go through character
    ## first, otherwise as.numeric() would return their level codes.
    to_numeric <- function(x){
        if(is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
    }
    deg_list[, logFC_col] <- to_numeric(deg_list[, logFC_col])
    deg_list[, pvalue_col] <- to_numeric(deg_list[, pvalue_col])

    ## Get background genes as a character vector
    ## Empty values (non-annotated genes) will be removed
    all_genes <- as.character(deg_list[, gene_col])

    # Get cutoff values -------------------------------------------------------

    if(verbose) message("Getting cutoff")
    ## Get info about p-value and log2fc cutoff used on each top segments
    table_cut <- .get_cutoff(deg_list, logFC_col, pvalue_col, min_genes, max_genes)

    ## Only the "down" direction is run at the moment
    dirs <- c("down")
    cut_path_list <- lapply(dirs, function(direction){
        if(verbose) message(direction)
        if(verbose) message("Getting pathways")
        .get_pathway(merge_p, term2gene, all_genes,
                     deg_list, gene_col, logFC_col,
                     pvalue_col, direction,
                     min_genes, max_genes, p_cut)
    })
    return(cut_path_list[1])
}




# Testes e chamando uma função de cada vez

## Retorno do enricher

In [120]:
# Show columns 1 and 6 of the enricher result (ID and p.adjust in the
# output below).
# NOTE(review): `retorno_do_enricher` is not defined in the visible code.
retorno_do_enricher[, c(1, 6)]

Unnamed: 0,ID,p.adjust
Glycolysis,Glycolysis,0.2230054
Metabolism of nucleotides,Metabolism of nucleotides,0.2230054
IL_6_type cytokine receptor ligand interactions,IL_6_type cytokine receptor ligand interactions,0.2230054
Diseases associated with O_glycosylation of proteins,Diseases associated with O_glycosylation of proteins,0.2230054
Gap junction trafficking and regulation,Gap junction trafficking and regulation,0.2230054
Pre_NOTCH Expression and Processing,Pre_NOTCH Expression and Processing,0.2230054
Pre_NOTCH Processing in Golgi,Pre_NOTCH Processing in Golgi,0.2230054
Collagen degradation,Collagen degradation,0.2230054
Interleukin_6 family signaling,Interleukin_6 family signaling,0.2230054
Synthesis and interconversion of nucleotide di_ and triphosphates,Synthesis and interconversion of nucleotide di_ and triphosphates,0.2230054


In [107]:
# Run pebba() on the configured inputs; force = TRUE lets it continue when
# the results directory already exists.
# NOTE(review): the term x cut table printed below is not what pebba()
# returns as defined in this file (a list); presumably it came from a
# temporary debugging version - confirm.
pebba(file_in, gmt_file,force = TRUE)

Getting cutoff
down
Getting pathways


term,100,150,200,250,300,350,400,450,500,⋯,1050,1100,1150,1200,1250,1300,1350,1400,1450,1500
ABC_family proteins mediated transport,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,0.8716482,0.9311812,0.9265998,⋯,0.9351672,0.9303596,0.9222777,0.9463349,0.9533222,0.9631968,0.9748761,0.9649513,0.9435197,0.9546258
ABC transporters in lipid homeostasis,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,⋯,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000
Abortive elongation of HIV_1 transcript in the absence of Tat,1.0000000,0.5644864,0.6689054,0.7733165,0.7758824,0.8103123,0.8716482,0.9311812,0.9265998,⋯,0.9351672,0.9303596,0.9222777,0.9463349,0.9533222,0.9631968,0.9748761,0.9649513,0.9435197,0.9546258
Acetylcholine Neurotransmitter Release Cycle,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,⋯,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000
Activated NOTCH1 Transmits Signal to the Nucleus,0.3693850,0.5644864,0.5425624,0.6154009,0.7087998,0.8103123,0.8716482,0.9311812,0.9265998,⋯,0.9351672,0.9303596,0.9360298,0.9497767,0.9533222,0.9631968,0.9748761,0.9649513,0.9700191,0.9839359
Activated PKN1 stimulates transcription of AR androgen receptor regulated genes KLK2 and KLK3,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,⋯,0.9351672,0.9303596,0.9360298,0.9497767,0.9533222,0.9631968,0.9789208,0.9828556,0.9850361,0.9839359
Activated point mutants of FGFR2,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,⋯,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000
activated TAK1 mediates p38 MAPK activation,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,⋯,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,1.0000000,0.9826549,0.9839359
Activation of anterior HOX genes in hindbrain development during early embryogenesis,1.0000000,0.5644864,0.6689054,0.7733165,0.7758824,0.8160899,0.8716482,0.9311812,0.9358161,⋯,0.9351672,0.9303596,0.9222777,0.9463349,0.9533222,0.9631968,0.9748761,0.9649513,0.9435197,0.9546258
Activation of APCslashC and APCslashC:Cdc20 mediated degradation of mitotic proteins,0.5556043,0.7178811,0.6689054,0.7733165,0.7880233,0.8103123,0.8716482,0.9311812,0.9265998,⋯,0.9351672,0.9303596,0.9222777,0.9463349,0.9533222,0.9631968,0.9748761,0.9649513,0.9435197,0.9546258


## path_desc

In [81]:
# NOTE(review): the output below has the columns gmt_names and gmt_desc, so
# it looks like `path_desc`; presumably pebba() was temporarily edited to
# return it - confirm.
pebba(file_in, gmt_file,force = TRUE)

Unnamed: 0,gmt_names,gmt_desc
Fertilization,Fertilization,R_HSA_1187000
Reproduction,Reproduction,R_HSA_1474165
Miscellaneous transport and binding events,Miscellaneous transport and binding events,R_HSA_5223345
Uptake and actions of bacterial toxins,Uptake and actions of bacterial toxins,R_HSA_5339562
Dopamine Neurotransmitter Release Cycle,Dopamine Neurotransmitter Release Cycle,R_HSA_212676
Serotonin Neurotransmitter Release Cycle,Serotonin Neurotransmitter Release Cycle,R_HSA_181429
Norepinephrine Neurotransmitter Release Cycle,Norepinephrine Neurotransmitter Release Cycle,R_HSA_181430
Acetylcholine Neurotransmitter Release Cycle,Acetylcholine Neurotransmitter Release Cycle,R_HSA_264642
Glutamate Neurotransmitter Release Cycle,Glutamate Neurotransmitter Release Cycle,R_HSA_210500
Neurotransmitter Release Cycle,Neurotransmitter Release Cycle,R_HSA_112310


## all_genes

In [79]:
# NOTE(review): no output was captured for this cell; given the heading,
# presumably pebba() was temporarily edited to return `all_genes` - confirm.
pebba(file_in, gmt_file,force = TRUE)

## term2gene

In [77]:
# NOTE(review): the output below has the columns term, gene and hier, so it
# looks like `term2gene`; presumably pebba() was temporarily edited to
# return it - confirm.
pebba(file_in, gmt_file,force = TRUE)

term,gene,hier
Fertilization,IZUMO4,R_HSA_1187000
Fertilization,OVGP1,R_HSA_1187000
Fertilization,IZUMO3,R_HSA_1187000
Fertilization,IZUMO2,R_HSA_1187000
Fertilization,IZUMO1,R_HSA_1187000
Fertilization,ADAM30,R_HSA_1187000
Fertilization,ACR,R_HSA_1187000
Fertilization,CATSPERB,R_HSA_1187000
Fertilization,SPAM1,R_HSA_1187000
Fertilization,KCNU1,R_HSA_1187000


## merge_p
Unique terms of term2gene (the initial value of `merge_p`)

In [17]:
# NOTE(review): the output below is a single `term` column of unique terms,
# so it looks like the initial `merge_p`; presumably pebba() was temporarily
# edited to return it - confirm.
pebba(file_in, gmt_file,force = TRUE)

Unnamed: 0,term
1,Fertilization
27,Reproduction
53,Miscellaneous transport and binding events
71,Uptake and actions of bacterial toxins
98,Dopamine Neurotransmitter Release Cycle
121,Serotonin Neurotransmitter Release Cycle
139,Norepinephrine Neurotransmitter Release Cycle
157,Acetylcholine Neurotransmitter Release Cycle
174,Glutamate Neurotransmitter Release Cycle
198,Neurotransmitter Release Cycle
