Merge pull request #237 from zieglerSe/SAM-Rework

fslaborg · Jan 25, 2023 · f73b05f · f73b05f
2 parents b0b766b + a0cd3ec
commit f73b05f
Show file tree

Hide file tree

Showing 9 changed files with 1,141 additions and 15,252 deletions.
diff --git a/docs/Testing.fsx b/docs/Testing.fsx
@@ -12,18 +12,24 @@ categoryindex: 0
 (*** condition: prepare ***)
 #I "../src/FSharp.Stats/bin/Release/netstandard2.0/"
 #r "FSharp.Stats.dll"
-#r "nuget: Plotly.NET, 2.0.0-preview.16"
+#r "nuget: Plotly.NET, 3.0.1"
+#r "nuget: FSharpAux, 1.1.0"
+#r "nuget: Deedle, 3.0.0"
 
 (*** condition: ipynb ***)
 #if IPYNB
-#r "nuget: Plotly.NET, 2.0.0-preview.16"
-#r "nuget: Plotly.NET.Interactive, 2.0.0-preview.16"
+#r "nuget: Plotly.NET, 3.0.1"
+#r "nuget: Plotly.NET.Interactive, 3.0.2"
 #r "nuget: FSharp.Stats"
+#r "nuget: FSharpAux, 1.1.0"
+#r "nuget: Deedle, 3.0.0"
 #endif // IPYNB
 
 open Plotly.NET
 open Plotly.NET.StyleParam
 open Plotly.NET.LayoutObjects
+open FSharpAux
+open Deedle
 
 //some axis styling
 module Chart = 
@@ -34,6 +40,8 @@ module Chart =
         |> Chart.withTemplate ChartTemplates.lightMirrored
         |> Chart.withXAxis (myAxis x) 
         |> Chart.withYAxis (myAxis y)
+
+
 
 (**
 
@@ -49,7 +57,7 @@ _Summary:_ this tutorial explains how to perform various statistical tests with
     - [T-Test](#T-Test)
     - [Anova](#Anova)
     - [F-Test](#F-Test)
-    - [H Test](#H-Test)
+    - [H-Test](#H-Test)
     - [Friedman-Test](#Friedman-Test)
     - [Wilcoxon signed-rank Test](#Wilcoxon-Test)
     - [Chi-Squared Test](#Chi-Squared-Test)
@@ -63,6 +71,7 @@ _Summary:_ this tutorial explains how to perform various statistical tests with
 - [Multiple testing](#Multiple-testing)
     - [Benjamini-Hochberg](#Benjamini-Hochberg)
     - [Q Value](#Q-Value)
+    - [SAM](#SAM)
 
 FSharp.Stats provides hypothesis tests for different applications.
 A hypothesis test is a statistical test that is used to determine whether there is enough evidence 
@@ -368,16 +377,16 @@ let fTestFromParameters = FTest.testVariancesFromVarAndDof sampleF1 sampleF2
 (**
 Using a significance level of 0.05 the sample variances do differ significantly.
 
-### H Test
+### H-Test
 
-The H test is also known as Kruskal-Wallis one-way analysis-of-variance-by-ranks and is the non-parametric equivalent of one-way ANOVA. 
+The H-test is also known as Kruskal-Wallis one-way analysis-of-variance-by-ranks and is the non-parametric equivalent of one-way ANOVA. 
 It is a non-parametric test for comparing the means of more than two independent samples (equal or different sample size), and therefore is an extension of Wilcoxon-Mann-Whitney two sample test.
 Testing with H test gives information whether the samples are from the same distribution.
 
-A benefit of the H test is that it does not require normal distribution of the samples.
+A benefit of the H-test is that it does not require normal distribution of the samples.
 The downside is that there is no information which samples are different from each other, or how many differences occur. For further investigation a post hoc test is recommended. 
 
-The distribution of the H test statistic is approximated by chi-square distribution with degrees of freedom = sample count - 1. 
+The distribution of the H-test statistic is approximated by chi-square distribution with degrees of freedom = sample count - 1. 
 
 The implemented H-test is testing for duplicate values in the data. 
 Duplicates lead to ties in the ranking, and are corrected by using a correction factor. 
@@ -403,15 +412,15 @@ let groupB = [70.;77.;48.;64.;71.;75.]
 let groupC = [80.;76.;34.;80.;73.;80.] 
 let samples = [groupA;groupB;groupC]
 
-// calculation of the H test 
+// calculation of the H-test 
 let hResult = HTest.createHTest samples 
 
 (*** include-value:hResult ***)
 
 (**
 _PValueRight is significant at a alpha level of 0.05_
 
-A suitable post hoc test for H tests is Dunn's test.
+A suitable post hoc test for H-tests is Dunn's test.
 *)
 
 (** 
@@ -458,6 +467,7 @@ ID   |  pre   | month 1| month 2| month 3| month 4
 
 
 </pre>
+
 Ranking the results - average if values appear multiple times in one ID 
 
 
@@ -824,8 +834,8 @@ let aErrorAcc =
     [1. .. 100.]
     |> List.map (fun x -> x,(1. - 0.95**x))
     |> Chart.Line
-    |> Chart.withAxisTitles "number of tests (k)" "probability of at least one false positive test"
-
+    |> Chart.withXAxisStyle ( "number of tests (k)" )
+    |> Chart.withYAxisStyle ( "probability of at least one false positive test")
 (*** condition: ipynb ***)
 #if IPYNB
 aErrorAcc
@@ -865,7 +875,8 @@ let bhValues =
         Chart.Line(pValsAdj,Name="adj")
     ]
     |> Chart.combine
-    |> Chart.withAxisTitles "pValue" "BH corrected pValue"
+    |> Chart.withXAxisStyle "pValue" 
+    |> Chart.withYAxisStyle "BH corrected pValue"
 
 (*** condition: ipynb ***)
 #if IPYNB
@@ -929,8 +940,8 @@ let qHisto =
         Chart.Line([(0.,pi0);(1.,pi0)],Name="pi<sub>0</sub>",LineDash=StyleParam.DrawingStyle.Dash)
     ]
     |> Chart.combine
-    |> Chart.withAxisTitles "p value" "density"
-
+    |> Chart.withXAxisStyle "p value" 
+    |> Chart.withYAxisStyle"density"
 (*** condition: ipynb ***)
 #if IPYNB
 qChart
@@ -947,4 +958,175 @@ qHisto
 
 (***hide***)
 qHisto |> GenericChart.toChartHTML
-(***include-it-raw***)
+(***include-it-raw***)
+
+
+(**
+###SAM
+SAM (Significance Analysis of Microarrays) is a method developed to overcome multiple testing problems, for example in microarray experiments.
+SAM, short for Significance Analysis of Microarrays, is a statistical analysis created, but not restricted for, microarray analysis. 
+It serves as a blue print for any permutation test.
+Therefore, high throughput experiments can be analysed using a combined permutation-bootstrap-method. For more in depth information see the [SAM BlogPost](https://csbiology.github.io/CSBlog/posts/7_SAM.html).
+*)
+
+(** 
+Data: 
+To use SAM, data need to be in the format (string*float[])[]), with string being the name and float array being the replicates. One way of achieving this is the following data preparation:
+<center><img style="max-width:40%" src="../img/DataStructureSAM.jpeg"></img></center>
+*)
+(**
+Columns are samples, here 1 and 2, representing control and factor. Rows are transcript counts (here indicated with gene identifier).
+Rows are indicated by the sample name, here 1 and 2, representing control and factor. Columns are indicated with the rowkey(A1), here the gene id. This can be saved as .txt. 
+The next step is to read in the data, e.g. via deedle, and to create a dataframe. The rows are indexed by the sample name and the rowkeys are extracted.
+*)
+
+let df:Frame<string,string> = 
+    Frame.ReadCsv(@"data/TestDataSAM.txt",hasHeaders=true,separators = "\t")
+    // here, the name of A1 is needed. 
+    |> Frame.indexRows "gene"
+
+// get Rowkeys as Array
+let rowheader :string[] = df.RowKeys |> Seq.toArray
+
+// to separate control and factor sets (sample1 and sample2, respectively), 
+// the datasets are chunked by the number of replicates (here: triplicates -> chunkBySize 3).
+let (preData1,preData2) :float[][] * float [][]=  
+    df
+    |> Frame.getRows
+    |> Series.values
+    |> Seq.map (Series.values >> Seq.toArray >> Array.chunkBySize 3 >> fun x -> x.[0],x.[1])
+    |> Array.ofSeq
+    |> Array.unzip
+
+// After chunking, the datasets are separated by factor and the rowkey is added to the data for later identification.
+let data1 = Array.zip rowheader preData1 
+let data2 = Array.zip  rowheader preData2 
+(**
+Optional: Data can be normalised by median centering using the following function:
+*)
+open SAM
+let corrected1 = 
+    let medCorrect = 
+        medianCentering data1 
+    Array.zip rowheader medCorrect
+let corrected2 = 
+    let medCorrect = 
+        medianCentering data2
+    Array.zip rowheader medCorrect
+
+(**
+This calculates the median of a column and subtracts it from each value in this column. 
+The median centering is performed for each column (all samples) separately.
+With data1 and data2 , or corrected1 and corrected2, SAM can be performed.
+*)
+let res = FSharp.Stats.Testing.SAM.twoClassUnpaired 100 0.05 data1 data2 (System.Random(1234))
+(**
+The parameters are number of permutations, desired FDR, the datasets and a seed. For balanced permutations as in SAM, it seems to be sufficient to use ~ 100 permutations (also default in samR). The seed is used for randomization of the permutations (System.Random()), or can be fixed to achieve the same results multiple times (e.g. System.Random(1234)).
+Result contains now the following information: s0, pi0, delta, upper Cut, lower Cut, positive significant bioitems, negative significant bioitems, non significant bioitems, False Discovery Rate (FDR), and median False Positives.
+The following code will help to achieve the typical SAM plot.
+*)
+
+let SAMChart = 
+
+    let observed = [| res.NegSigBioitem; res.NonSigBioitem; res.PosSigBioitem|] |> Array.concat 
+    let obs = observed |> Array.map (fun x -> x.Statistics) 
+    let expected = res.AveragePermutations |> Array.map (fun x -> x.Statistics)
+    let minDi = Seq.min obs
+    let maxDi = Seq.max obs
+
+
+    // positive significant changes 
+    let posExpected = expected.[res.NegSigBioitem.Length + res.NonSigBioitem.Length .. res.NegSigBioitem.Length + res.NonSigBioitem.Length + res.PosSigBioitem.Length-1]
+    let posChart = 
+        Chart.Point(posExpected,res.PosSigBioitem |> Array.map (fun x -> x.Statistics))
+        |> Chart.withLineStyle(Color=Color.fromKeyword Green)
+        |> Chart.withTraceInfo("positive change",Visible = Visible.True )
+
+
+    // no significant changes
+    let nonex = expected.[res.NegSigBioitem.Length .. res.NegSigBioitem.Length + res.NonSigBioitem.Length-1]
+    let nonchart = 
+        Chart.Point(nonex,res.NonSigBioitem |> Array.map (fun x -> x.Statistics))
+        |> Chart.withLineStyle(Color=Color.fromKeyword Gray)
+        |> Chart.withTraceInfo("no change",Visible = Visible.True)
+
+    // negative significant changes 
+    let negex = expected.[0 .. res.NegSigBioitem.Length-1]
+    let negchart = 
+        Chart.Point(negex,res.NegSigBioitem |> Array.map (fun x -> x.Statistics))
+        |> Chart.withLineStyle(Color=Color.fromKeyword Red)
+        |> Chart.withTraceInfo("negative change",Visible = Visible.True)
+
+    let samValues = 
+        [
+            negchart
+            nonchart
+            posChart
+        ]
+        |> Chart.combine
+
+    let chartConfig =
+        let svdConfig =
+            ConfigObjects.ToImageButtonOptions.init(
+                Format = StyleParam.ImageFormat.SVG)
+        Config.init (
+            ToImageButtonOptions = svdConfig,
+            ModeBarButtonsToAdd=[ModeBarButton.HoverCompareCartesian]
+
+        )
+
+    let cutLineUp = [(minDi + res.Delta) ; (maxDi + res.Delta)]
+    let cutsUp = 
+        Chart.Line(cutLineUp,[minDi;maxDi])
+        |> Chart.withLineStyle(Color=Color.fromKeyword Purple,Dash = StyleParam.DrawingStyle.Dash, Width = 0.5)
+        |> Chart.withTraceInfo("delta",Visible = Visible.True)
+    let cutLineLow = [(minDi - res.Delta) ; (maxDi - res.Delta)]
+    let cutsLow = 
+        Chart.Line(cutLineLow,[minDi;maxDi])
+        |> Chart.withLineStyle(Color=Color.fromKeyword Purple,Dash = StyleParam.DrawingStyle.Dash, Width = 0.5)
+        |> Chart.withTraceInfo("delta",Visible = Visible.True)
+    let linechart = 
+        Chart.Line([minDi;maxDi], [minDi;maxDi])
+        |> Chart.withTraceInfo("bisecting angle",Visible = Visible.True)
+        |> Chart.withLineStyle(Color=Color.fromKeyword Black, Width = 1)
+
+    let uppercut = 
+        let xAnchorUppercut = [minDi .. 5. .. maxDi]
+        Chart.Line (xAnchorUppercut, List.init xAnchorUppercut.Length (fun x -> res.UpperCut))
+
+        |> Chart.withTraceInfo("upper cut",Visible = Visible.True)
+        |> Chart.withLineStyle(Color=Color.fromKeyword Black,Dash = StyleParam.DrawingStyle.Dash, Width = 0.3)
+
+    let lowercut = 
+        Chart.Line([minDi;maxDi],[res.LowerCut;res.LowerCut])
+        |> Chart.withTraceInfo("lower cut",Visible = Visible.True)
+        |> Chart.withLineStyle(Color=Color.fromKeyword Black,Dash = StyleParam.DrawingStyle.Dash,Width = 0.3)
+        //|> Chart.withXAxisStyle(MinMax = (-15,20))
+
+        |> Chart.withTraceInfo("lower cut",Visible = Visible.True)
+    let plot = 
+        [linechart;
+        samValues;
+        cutsUp;
+        cutsLow;
+        uppercut;
+        lowercut]
+        |> Chart.combine
+        |> Chart.withTitle(title = "SAM results")
+        |> Chart.withXAxisStyle("expected Score")
+        |> Chart.withYAxisStyle ("observed Score")
+        |> Chart.withConfig(chartConfig)
+        |> Chart.withTemplate(ChartTemplates.lightMirrored)
+    plot 
+
+
+
+
+(*** condition: ipynb ***)
+#if IPYNB
+SAMChart
+#endif // IPYNB
+
+(***hide***)
+System.IO.File.ReadAllText (__SOURCE_DIRECTORY__ + "/img/SAM005.html")
+(*** include-it-raw ***)
diff --git a/docs/img/DataStructureSAM.jpeg b/docs/img/DataStructureSAM.jpeg