From 6cd10f2347f2e2e9365331a29793167a7cc1ce00 Mon Sep 17 00:00:00 2001
From: sreichl <reichl.stephan@gmail.com>
Date: Thu, 5 Jan 2023 14:33:23 +0100
Subject: [PATCH] add feature score calculation for downstream analyses e.g.,
 ranked GSEA

---
 README.md                    |  6 ++++--
 config/config.yaml           |  8 ++++++++
 workflow/rules/dea.smk       |  1 +
 workflow/scripts/aggregate.R | 22 +++++++++++++++++++---
 4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a69e9a1..58a91a7 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ This project wouldn't be possible without the following software and their depen
 # Methods
 This is a template for the Methods section of a scientific publication and is intended to serve as a starting point. Only retain paragraphs relevant to your analysis. References [ref] to the respective publications are curated in the software table above. Versions (ver) have to be read out from the respective conda environment specifications (.yaml file) or post execution. Parameters that have to be adapted depending on the data or workflow configurations are denoted in squared brackets e.g. [X].
 
-__Differential Expression Analysis (DEA).__ DEA was performed on the quality-controlled filtered [raw/normalized] counts using the LIMMA (ver) [ref] workflow for fitting a linear model [formula] to identify features (genes/regions) that statistically significantly change with [comparisons] compared to the control group [reference levels] (intercept). Briefly, we determined normalization factors with edgeR::calcNormFactors (optional) using method [X], then applied voom (optional) to estimate the mean-variance relationship of the log-counts. We used blocking on (optional) variable [X] to account for repeated measurements, lmFit to fit the model to the data, and finally eBayes (optional) with the robust (and trend flag – optional for normalized data) flag to compute (moderated/ordinary) t-statistics. For each comparison we extracted feature-wise average expression, effect sizes (log2 fold change) and their statistical significance as adjusted p-values, determined using the Benjamini-Hochberg method. Next, these results were filtered for relevant features based on the following criteria: statistical significance (adjusted p-value < [X]), absolute log2 fold change (> [X]), and average gene expression (> [X]). Finally, we performed hierarchical clustering on the effect sizes (log2 fold changes) of the union of all relevant features and comparison groups.
+__Differential Expression Analysis (DEA).__ DEA was performed on the quality-controlled filtered [raw/normalized] counts using the LIMMA (ver) [ref] workflow for fitting a linear model [formula] to identify features (genes/regions) that statistically significantly change with [comparisons] compared to the control group [reference levels] (intercept). Briefly, we determined normalization factors with edgeR::calcNormFactors (optional) using method [X], then applied voom (optional) to estimate the mean-variance relationship of the log-counts. We used blocking on (optional) variable [X] to account for repeated measurements, lmFit to fit the model to the data, and finally eBayes (optional) with the robust (and trend flag – optional for normalized data) flag to compute (moderated/ordinary) t-statistics. For each comparison we used topTable to extract feature-wise average expression, effect sizes (log2 fold change) and their statistical significance as adjusted p-values, determined using the Benjamini-Hochberg method. Furthermore, we calculated feature scores, for each feature in all comparisons, using the formula [score_formula] for downstream ranked enrichment analyses. Next, these results were filtered for relevant features based on the following criteria: statistical significance (adjusted p-value < [X]), absolute log2 fold change (> [X]), and average gene expression (> [X]). Finally, we performed hierarchical clustering on the effect sizes (log2 fold changes) of the union of all relevant features and comparison groups.
 
 __Visualization.__ The filtered result statistics, i.e., number of relevant features split by positive (up) and negative (down) effect sizes, were visualized with stacked bar plots using ggplot (ver) [ref].
 To visually summarize results of all performed comparisons, the filtered effect size (log2 fold change) values of all features that were found to be relevant in at least one comparison were plotted in a hierarchically clustered heatmap using pheatmap (ver) [ref]. 
@@ -46,7 +46,7 @@ Volcano plots were generated for each comparison using EnhancedVolcano (ver) [re
 Finally, quality control plots of the fitted mean-variance relationship and raw p-values of the features were generated.
 
 
-**The analysis and visualizations described here were performed using a publicly available Snakemake [ver] (ref) workflow [ref - cite this workflow here].**
+**The analysis and visualizations described here were performed using a publicly available Snakemake (ver) [ref] workflow (ver) [ref - cite this workflow here].**
 
 # Features
 The workflow performs the following steps that produce the outlined results:
@@ -63,6 +63,8 @@ The workflow performs the following steps that produce the outlined results:
   - extract all statistics for variables of interest (=configured comparisons) using __topTable__ (eg coefficients/effect size, statistical significance,...).
   - save a feature list per comparison group and direction of change (up/down) for downstream analyses (eg enrichment analysis) (TXT).
     - (optional) annotated feature list with suffix "_annot" (TXT).
+  - (optional) save feature score tables (with two columns: "feature" and "score") per comparison group using score_formula for downstream analyses (eg ranked enrichment analysis) (CSV).
+    - (optional) annotated feature scores tables (with two columns: "feature_name" and "score") with suffix "_annot" (CSV).
 - DEA result statistics: total number of statistically significant features and split by positive (up) and negative (down) change (CSV).
 - DEA result filtering of features (eg genes) by 
   - statistical significance (<= adjusted p-value: adj_pval)
diff --git a/config/config.yaml b/config/config.yaml
index 5f064e6..b3e3181 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -31,6 +31,14 @@ reference_levels:
     time: "0h"
 
 ##### AGGREGATION #####
+# score_formula is used to calculate a score for each gene and comparison (group) that can be used downstream e.g., for ranked GSEA
+# eval based score calculation -> eval(parse(text=" "))
+# DEA result dataframe is called: dea_results
+# available numerical columns: logFC; AveExpr; t; P.Value; adj.P.Val; B
+# common/popular example: "-log10(dea_results$P.Value)*sign(dea_results$logFC)"
+# if not used leave empty: ""
+score_formula: "-log10(dea_results$P.Value)*sign(dea_results$logFC)"
+
 # filters are applied to the DEA results, which contain all results, and used for downstream outputs (eg feature lists, LFC matrix).
 filters:
     adj_pval: 0.05
diff --git a/workflow/rules/dea.smk b/workflow/rules/dea.smk
index 3636f0d..dfd4090 100644
--- a/workflow/rules/dea.smk
+++ b/workflow/rules/dea.smk
@@ -50,6 +50,7 @@ rule aggregate:
         os.path.join("logs","rules","aggregate_{analysis}.log"),
     params:
         partition=config.get("partition"),
+        score_formula = config["score_formula"],
         adj_pval = config["filters"]["adj_pval"],
         lfc = config["filters"]["lfc"],
         ave_expr = config["filters"]["ave_expr"],
diff --git a/workflow/scripts/aggregate.R b/workflow/scripts/aggregate.R
index 73884cb..04f2342 100644
--- a/workflow/scripts/aggregate.R
+++ b/workflow/scripts/aggregate.R
@@ -19,6 +19,7 @@ dea_stats_plot_path <- snakemake@output[["dea_stats_plot"]]
 adj_pval <- snakemake@params[["adj_pval"]] # 0.05
 lfc <- snakemake@params[["lfc"]] # 0
 ave_expr <- snakemake@params[["ave_expr"]] # 0
+score_formula <- snakemake@params[["score_formula"]] # "-log10(dea_results$P.Value)*sign(dea_results$logFC)"
 
 # plot specifications
 width <- 0.5
@@ -32,7 +33,7 @@ if (!dir.exists(results_path)){
 
 ### load DEA results
 dea_results <- read.csv(file=file.path(dea_result_path))
-
+groups <- unique(dea_results$group)
 
 ### save list of all expressed features for downstream analysis (eg as background in enrichment analyses)
 all_features <- unique(dea_results$feature)
@@ -44,6 +45,22 @@ if("feature_name" %in% colnames(dea_results)){
     write(all_features_annot, file.path(results_path, "ALL_features_annot.txt"))
 }
 
+# determine and save feature scores for each gene and group for downstream analysis e.g., ranked GSEA
+if (score_formula!=""){
+    dea_results$score <- eval(parse(text=score_formula))
+    
+    for (group in groups){
+        tmp_features <- dea_results[(dea_results$group==group),c("feature","score")]
+        write.csv(tmp_features, file.path(results_path,paste0(group,"_featureScores.csv")), row.names=FALSE)
+
+        if("feature_name" %in% colnames(dea_results)){
+            tmp_features <- dea_results[(dea_results$group==group),c("feature_name","score")]
+            tmp_features <- tmp_features[tmp_features$feature_name != "",]
+            write.csv(tmp_features, file.path(results_path,paste0(group,"_featureScores_annot.csv")), row.names=FALSE)
+        }
+    }
+}
+
 # annotate differential direction (up or down)
 dea_results$direction <- as.character(lapply(dea_results$logFC, function(x) if(x>0){"up"}else{"down"}))
 
@@ -59,12 +76,11 @@ dea_filtered_stats_df$total <- rowSums(dea_filtered_stats_df)
 write.csv(dea_filtered_stats_df, file=file.path(dea_stats_path), row.names=TRUE)
 
 ### aggregate & save LFC matrix from filtered DEA result features
-groups <- unique(dea_results$group)
 features <- unique(dea_filtered_results$feature)
                                              
 lfc_df <- data.frame(matrix(nrow=length(features), ncol=length(groups), dimnames=list(features, groups)))
 
-for (group in unique(dea_results$group)){
+for (group in groups){
     tmp_dea_results <- dea_results[dea_results$group==group, ]
     rownames(tmp_dea_results) <- tmp_dea_results$feature
     lfc_df[features, group] <- tmp_dea_results[features, 'logFC']