fmicompbio · csoneson · Aug 25, 2023 · Aug 25, 2023 · Aug 25, 2023 · Aug 25, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: einprot
 Type: Package
 Title: A collection of proteomics analysis utilities and workflows
-Version: 0.7.6
+Version: 0.7.7
 Authors@R: c(
     person("Charlotte", "Soneson", email = "charlotte.soneson@fmi.ch", 
            role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3833-2169")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# einprot 0.7.7
+
+* Change default value of singleFit to TRUE everywhere, for consistency
+
 # einprot 0.7.6
 
 * Bug fix in filtering plot when only one criterion is available

diff --git a/R/runFragPipeAnalysis.R b/R/runFragPipeAnalysis.R
@@ -235,7 +235,7 @@ runFragPipeAnalysis <- function(
     includeOnlySamples = "", excludeSamples = "",
     minScore = 10, minPeptides = 2, imputeMethod = "MinProb",
     assaysForExport = NULL, mergeGroups = list(), comparisons = list(),
-    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = FALSE,
+    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = TRUE,
     subtractBaseline = FALSE, baselineGroup = "", normMethod = "none",
     spikeFeatures = NULL, stattest = "limma", minNbrValidValues = 2,
     minlFC = 0, samSignificance = TRUE, nperm = 250, volcanoAdjPvalThr = 0.05,

diff --git a/R/runMaxQuantAnalysis.R b/R/runMaxQuantAnalysis.R
@@ -246,7 +246,7 @@ runMaxQuantAnalysis <- function(
     minScore = 10, minPeptides = 2, imputeMethod = "MinProb",
     assaysForExport = c("iBAQ", "Top3"), mergeGroups = list(),
     comparisons = list(),
-    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = FALSE,
+    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = TRUE,
     subtractBaseline = FALSE, baselineGroup = "", normMethod = "none",
     spikeFeatures = NULL, stattest = "limma", minNbrValidValues = 2,
     minlFC = 0, samSignificance = TRUE, nperm = 250, volcanoAdjPvalThr = 0.05,

diff --git a/R/runPDTMTAnalysis.R b/R/runPDTMTAnalysis.R
@@ -280,7 +280,7 @@ runPDTMTAnalysis <- function(
     minScore = 2, minDeltaScore = 0.2, minPeptides = 2, minPSMs = 2,
     masterProteinsOnly = FALSE, imputeMethod = "MinProb",
     assaysForExport = NULL, mergeGroups = list(), comparisons = list(),
-    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = FALSE,
+    ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = TRUE,
     subtractBaseline = FALSE, baselineGroup = "", normMethod = "none",
     spikeFeatures = NULL, stattest = "limma", minNbrValidValues = 2,
     minlFC = 0, samSignificance = FALSE, nperm = 250, volcanoAdjPvalThr = 0.05,

diff --git a/R/runPDTMTptmAnalysis.R b/R/runPDTMTptmAnalysis.R
@@ -133,7 +133,7 @@ runPDTMTptmAnalysis <- function(
         proteinIdColProteins = function(df) einprot::getFirstId(df, "einprotProtein", ";"),
         proteinIdColPeptides = function(df) einprot::getFirstId(df, "einprotProtein", ";"),
         comparisons = list(),
-        ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = FALSE,
+        ctrlGroup = "", allPairwiseComparisons = TRUE, singleFit = TRUE,
         subtractBaseline = FALSE, baselineGroup = "",
         testType = "interaction", minNbrValidValues = 2,
         minlFC = 0, volcanoAdjPvalThr = 0.05,

diff --git a/man/runFragPipeAnalysis.Rd b/man/runFragPipeAnalysis.Rd
diff --git a/man/runMaxQuantAnalysis.Rd b/man/runMaxQuantAnalysis.Rd
diff --git a/man/runPDTMTAnalysis.Rd b/man/runPDTMTAnalysis.Rd
diff --git a/man/runPDTMTptmAnalysis.Rd b/man/runPDTMTptmAnalysis.Rd
diff --git a/vignettes/einprot.Rmd b/vignettes/einprot.Rmd
@@ -393,7 +393,12 @@ imp <- importExperiment(mqFile, iColPattern = "unknown_pattern")
 A sample annotation table must be provided when running each of the `einprot` 
 workflows. This `data.frame` must have at least two columns, named `sample` and
 `group`, but any additional columns are also supported and will be included 
-in the final `SingleCellExperiment` object. The values in the `sample` column 
+in the final `SingleCellExperiment` object. However, except for the special 
+case of a column named `batch` (see the section on "Batch adjustment" below), 
+additional columns will not be automatically included when fitting the linear
+model. 
+
+The values in the `sample` column 
 must correspond to the column names of the imported `SingleCellExperiment` 
 object, which are generated by removing the specified column pattern from 
 the raw file column names. For example, in the data we imported in the 
@@ -446,14 +451,31 @@ merged group as replicates, a model is first fit with all the original groups,
 and a contrast is designed to compare the averages of the fitted values across
 the merged groups in the comparison. 
 
-## Batch adjustment
+## Batch adjustment and complex designs
 
 `einprot` will adjust for batch effects by including an extra term in the 
 linear model (if the chosen statistical test is either `limma` or `proDA`), 
 if the sample annotation table contains a column named `batch`. In this case, 
 it will also calculate a matrix of batch corrected abundance values, and use 
 this for visualization. 
 
+In the current version, all statistical tests in `einprot` are based on 
+pairwise comparisons of groups (possibly after including a batch covariate in 
+the model as described here). While this is likely to cover a large fraction of 
+practical use cases, there are setups that cannot be directly cast into this 
+framework. A multi-factorial design (e.g., samples from strain A and strain B, 
+which are either treated or untreated) can be accommodated by combining all 
+predictors into a single one (with values strainA-treated, strainA-untreated, 
+strainB-treated and strainB-untreated) and using this as the `group` column, 
+after which any pairwise comparisons between subgroups can be performed. 
+For more complex experimental designs, or to access the full capabilities of 
+statistical analysis packages like `limma`, we recommend that users start with 
+the exported `SingleCellExperiment` object (`einprot` can be run with `stattest` 
+set to `"none"` to just perform the processing, skipping the testing part), 
+extract the appropriate assay and the sample annotations (included in the 
+`colData` of the `SingleCellExperiment` object), and set up the required test 
+manually using a suitable package (such as `limma`, `proDA`, or `prolfqua`). 
+
 ## Valid values for `linkTableColumns` and `interactiveDisplayColumns`
 
 The report generated by `einprot` contains several interactive elements. If 
@@ -544,6 +566,37 @@ PCA.
 Furthermore, using a standard container makes it possible to directly use the 
 `einprot` output as input to functions from a variety of Bioconductor packages.
 
+### Which statistical test frameworks are supported by `einprot`?
+
+`einprot` provides access to three different statistical frameworks. The 
+default is to use the `limma` package [@Ritchie2015limma], which has been shown 
+to perform well for analysis of proteomics data [@Peng2023optim]. 
+`limma` is a general-purpose inference package that fits a linear model and 
+performs inference based on a moderated t-statistic. It can accommodate batch 
+effects as well as individual sample weights, if required. We also provide the 
+option of performing a t-test, which more closely mimics the default setup of 
+`Perseus` [@Tyanova2016perseus]. Note that in this case, no batch effect 
+adjustment can be made, and some plots will not be generated. Finally, we 
+provide the option to use `proDA` [@AhlmannEltze2020proda], which is a 
+statistical analysis package developed specifically for proteomics data. The 
+main feature here is that no imputation of missing values is required; missing 
+values are instead accounted for internally by a probabilistic dropout model. 
+
+For `limma` and `proDA`, `einprot` offers the possibility of either fitting a 
+single model to all samples and extracting comparisons of interest using linear
+contrasts, or subsetting the data to only the samples used for each comparison. 
+The main advantage of fitting a single model (`singleFit = TRUE`) is that a 
+larger number of samples are used to estimate parameters, which usually gives 
+more precise estimates. For this reason, this is typically the recommended 
+approach for `limma` and other inference pipelines. However, it also involves 
+making assumptions about similarities of variances between groups, and if 
+there are large differences, either fitting separate models or potentially 
+using a weighting approach may be more suitable. For more discussions about 
+this topic, see e.g. the following posts from the Bioconductor support forum: 
+https://support.bioconductor.org/p/60556/, 
+https://support.bioconductor.org/p/88032/, 
+https://support.bioconductor.org/p/61556/.
+
 ## Session info
 
 <details>

diff --git a/vignettes/einprot.bib b/vignettes/einprot.bib
@@ -45,3 +45,50 @@ @ARTICLE{Rue-Albrecht2018isee
   pages    =  741,
   year     =  2018
 }
+
+@ARTICLE{Ritchie2015limma,
+  title    = "limma powers differential expression analyses for
+              {RNA-sequencing} and microarray studies",
+  author   = "Ritchie, ME and Phipson, B and Wu, D and Hu, Y and Law, CW and
+              Shi, W and Smyth, GK",
+  journal  = "Nucleic Acids Research",
+  volume   =  43,
+  number   =  7,
+  pages    = "e47",
+  year     =  2015,
+  url      = "https://academic.oup.com/nar/article/43/7/e47/2414268"
+}
+
+@ARTICLE{AhlmannEltze2020proda,
+  title    = "{proDA}: Probabilistic Dropout Analysis for Identifying
+              Differentially Abundant Proteins in {Label-Free} Mass
+              Spectrometry",
+  author   = "Ahlmann-Eltze, Constantin and Anders, Simon",
+  journal  = "bioRxiv doi:https://doi.org/10.1101/661496",
+  year     =  2020
+}
+
+@ARTICLE{Tyanova2016perseus,
+  title    = "The Perseus computational platform for comprehensive analysis of
+              (prote)omics data",
+  author   = "Tyanova, S and Temu, T and Sinitcyn, P and Carlson,
+              A and Hein, MY and Geiger, T and Mann, M and
+              Cox, J",
+  journal  = "Nature Methods",
+  volume   =  13,
+  number   =  9,
+  pages    = "731--740",
+  year     =  2016,
+  url      = "https://www.nature.com/articles/nmeth.3901"
+}
+
+@ARTICLE{Peng2023optim,
+  title    = "Optimizing Proteomics Data Differential Expression Analysis via
+              {High-Performing} Rules and Ensemble Inference",
+  author   = "Peng, Hui and Wang, He and Kong, Weijia and Li, Jinyan and Goh,
+              Wilson Wen Bin",
+  journal  = "bioRxiv doi:10.1101/2023.06.26.546625",
+  year     =  2023,
+  doi      = "10.1101/2023.06.26.546625"
+}
+