signtest: fully Matlab compatible, added tests, correct output, bug #…

…49961
gnu-octave · Aug 1, 2022 · 299c639 · 299c639
1 parent 213f80e
commit 299c639
Show file tree

Hide file tree

Showing 2 changed files with 169 additions and 122 deletions.
diff --git a/NEWS b/NEWS
@@ -2,63 +2,70 @@ Summary of important user-visible changes for statistics 1.5.0:
 -------------------------------------------------------------------
 
  Important Notice: dependency change to Octave>=6.1.0
+                   mean shadows core Octave's respective function
 
  New functions:
  ==============
-
- ** friedman (fully Matlab compatible)
 
- ** ranksum (fully Matlab compatible. bug #42079)
-    tiedrank
-
  ** anova2 (fully Matlab compatible)
 
- ** mean (fully Matlab compatible, it shadows mean from core Octave)
-
+ ** chi2gof (bug #46764)
+
+ ** friedman (fully Matlab compatible)
+
  ** fillmissing (patch #10102)
-
+
+ ** mean (fully Matlab compatible, it shadows mean from core Octave)
+
+ ** ranksum (fully Matlab compatible. bug #42079)
+
  ** standardizeMissing (patch #10102)
+
+ ** tiedrank (complementary to ranksum)
 
  Improvements:
  =============
+
+ ** binopdf.m: implement high accuracy Loader algorithm for m>=10 (bug #34362)
+
+ ** cdf.m: extended to include all available distributions
+
+ ** geomean.m: fully Matlab compatible. (patch #59410)
+
+ ** harmmean.m: fully Matlab compatible.
+
+ ** ismissing.m: corrects handling of n-D arrays, NaN indicators, and improves
+    matlab compatibility for different data types. (patch #10102)
 
  ** laplace_cdf.m: allow for parameters mu and scale (bug #58688)
     laplace_inv.m
-    loplace.pdf.m
+    laplace_pdf.m
     logistic_cdf.m
     logistic_inv.m
     logistic_pdf.m
-
- ** ttest2.m: can hale NaN values as missing data (bug #58697)
-
- ** binopdf.m: implement high accuracy Loader algorithm for m>=10 (bug #34362)
-
- ** violin.m: fix parsing color vector affecting Octave>=6.1.0 (bug #62805)
 
- ** cdf.m: extended to include all available distributions
-    pdf.m
+ ** pdf.m: extended to include all available distributions
 
  ** pdist.m: updated the 'cosine' metric to be more efficient (bug #62495)
 
- ** harmmean.m: fully Matlab compatible.
+ ** rmmissing.m: corrects cellstr array handling and
+    improves matlab compatibility for different data types. (patch #10102)
 
- ** geomean.m: fully Matlab compatible. (patch #59410)
+ ** ttest2.m: can hale NaN values as missing data (bug #58697)
 
- ** wblplot.m: fixed coding style and help texinfo. (patch #8579)
+ ** violin.m: fix parsing color vector affecting Octave>=6.1.0 (bug #62805)
 
- ** ismissing.m: corrects handling of n-D arrays, NaN indicators, and improves
-    matlab compatibility for different data types. (patch #10102)
-
- ** rmmissing.m: corrects cellstr array handling and
-    improves matlab compatibility for different data types. (patch #10102)
-
+ ** wblplot.m: fixed coding style and help texinfo. (patch #8579)
+
  Removed Functions:
  ==================
 
  ** wilcoxon_test (replaced by ranksum)
 
  ** kruskal_wallis_test (replaced by kruskalwallis)
 
+ ** sign_test (replaced by updated signtest)
+
  Available Data Sets:
  ====================
 

diff --git a/inst/signtest.m b/inst/signtest.m
@@ -1,3 +1,4 @@
+## Copyright (C) 2022 Andreas Bertsatos <abertsatos@biol.uoa.gr>
 ## Copyright (C) 2014 Tony Richardson
 ##
 ## This program is free software; you can redistribute it and/or modify it under
@@ -18,10 +19,11 @@
 ## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{m})
 ## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{y})
 ## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{y}, @var{Name}, @var{Value})
+##
 ## Test for median.
 ##
 ## Perform a signtest of the null hypothesis that @var{x} is from a distribution
-## that has a zero median.
+## that has a zero median.  @var{x} must be a vector.
 ##
 ## If the second argument @var{m} is a scalar, the null hypothesis is that
 ## X has median m.
@@ -32,15 +34,19 @@
 ## The argument @qcode{"alpha"} can be used to specify the significance level
 ## of the test (the default value is 0.05).  The string
 ## argument @qcode{"tail"}, can be used to select the desired alternative
-## hypotheses.  If @qcode{"alt"} is @qcode{"both"} (default) the null is 
+## hypotheses.  If @qcode{"tail"} is @qcode{"both"} (default) the null is 
 ## tested against the two-sided alternative @code{median (@var{x}) != @var{m}}.
-## If @qcode{"alt"} is @qcode{"right"} the one-sided 
+## If @qcode{"tail"} is @qcode{"right"} the one-sided 
 ## alternative @code{median (@var{x}) > @var{m}} is considered.
 ## Similarly for @qcode{"left"}, the one-sided alternative @code{median
-## (@var{x}) < @var{m}} is considered.  When @qcode{"method"} is @qcode{"exact"}
-## the p-value is computed using an exact method (this is the default).  When
-## @qcode{"method"} is @qcode{"approximate"} a normal approximation is used for the
-## test statistic.
+## (@var{x}) < @var{m}} is considered.
+##
+## When @qcode{"method"} is @qcode{"exact"} the p-value is computed using an
+## exact method.  When @qcode{"method"} is @qcode{"approximate"} a normal
+## approximation is used for the test statistic. When @qcode{"method"} is not
+## defined as an optional input argument, then for @code{length (@var{x}) < 100}
+## the @qcode{"exact"} method is computed, otherwise the @qcode{"approximate"}
+## method is used.
 ##
 ## The p-value of the test is returned in @var{pval}. If @var{h} is 0 the 
 ## null hypothesis is accepted, if it is 1 the null hypothesis is rejected. 
@@ -50,114 +56,148 @@
 ##
 ## @end deftypefn
 
-## Author: Tony Richardson <richardson.tony@gmail.com>
-
-function [p, h, stats] = signtest(x, my, varargin)
+function [p, h, stats] = signtest (x, my, varargin)
 
+  ## Check X being a vector
+  if ! isvector (x)
+    error ("signtest: X must be a vector.");
+  endif
+  ## Add defaults
   my_default = 0;
   alpha = 0.05;
-  tail  = 'both';
-  method = 'exact';
-
-  % Find the first non-singleton dimension of x
-  dim = min(find(size(x)~=1));
-  if isempty(dim), dim = 1; end
-
-  if (nargin == 1)
-    my = my_default;
-  end 
-
+  tail  = "both";
+  ## Matlab compliant default method selection
+  method_present = false;
+  if length (x) < 100
+    method = "exact";
+  else
+    method = "approximate";
+  endif
+  ## When called with a single input argument of second argument is empty
+  if nargin == 1 || isempty (my)
+    my = zeros (size (x));
+  endif
+  ## If second argument is a scalar convert to vector or check for Y being a
+  ## vector and that X and Y have equal lengths
+  if isscalar (my)
+    my = repmat (my, size (x));
+  elseif ! isvector (my)
+    error ("signtest: Y must be either a scalar of a vector.");
+  elseif numel (x) != numel (my)
+    error ("signtest: X and Y vectors have different lengths.");
+  endif
+  ## Get optional input arguments
   i = 1;
-  while ( i <= length(varargin) )
-    switch lower(varargin{i})
-      case 'alpha'
+  while (i <= length (varargin))
+    switch lower (varargin{i})
+      case "alpha"
         i = i + 1;
         alpha = varargin{i};
-      case 'tail'
+      case "tail"
         i = i + 1;
         tail = varargin{i};
-      case 'method'
+      case "method"
         i = i + 1;
         method = varargin{i};
-      case 'dim'
-        i = i + 1;
-        dim = varargin{i};
+        method_present = true;
       otherwise
-        error('Invalid Name argument.',[]);
-    end
+        error ("signtest: Invalid Name argument.");
+    endswitch
     i = i + 1;
+  endwhile
+  ## Check values for optional input arguments
+  if (! isnumeric (alpha) || isnan (alpha) || ! isscalar (alpha) ...
+                          || alpha <= 0 || alpha >= 1)
+    error ("signtest: alpha must be a numeric scalar in the range (0,1).");
+  endif
+  if ! isa (tail, 'char')
+    error ("signtest: tail argument must be a string");
+  elseif sum (strcmpi (tail, {"both", "right", "left"})) < 1
+    error ("signtest: tail value must be either 'both', right' or 'left'.");
+  endif
+  if ! isa (method, 'char')
+    error("signtest: method argument must be a string");
+  elseif sum (strcmpi (method, {"exact", "approximate"})) < 1
+    error ("signtest: method value must be either 'exact' or 'approximate'.");
   end
-
-  if ~isa(tail, 'char')
-    error('tail argument to signtest must be a string\n',[]);
-  end
-
-  if ~isa(method, 'char')
-    error('method argument to signtest must be a string\n',[]);
-  end
-
-  % Set default values if arguments are present but empty
-  if isempty(my)
-    my = my_default;
-  end
-
-  % This adjustment allows everything else to remain the
-  % same for both the one-sample t test and paired tests.
-  % If second argument is a vector
-  if ~isscalar(my)
-    x = x - my;
-    my = my_default;
-  end  
-
-  n = size(x, dim);
-
+  ## Calculate differences between X and Y vectors: remove equal values of NaNs
+  XY_diff = x(:) - my(:);
+  NO_diff = (XY_diff == 0);
+  XY_diff(NO_diff | isnan (NO_diff)) = [];
+  ## Recalculate remaining length of X vector (after equal or NaNs removal)
+  n = length (XY_diff);
+  ## Check for identical X and Y input arguments
+  if n == 0
+    p = 1;
+    h = 0;
+    stats.sign = 0;
+    stats.zval = NaN;
+    return;
+  endif
+  ## Re-evaluate method selection
+  if ! method_present && n < 100
+    method = "exact";
+  else
+    method = "approximate";
+  endif
+  ## Get the number of positive and negative elements from X-Y differences
+  pos_n = length (find (XY_diff > 0));
+  neg_n = n - pos_n;
+  ## Calculate stats according to selected method and tail
   switch lower(method)
-    case 'exact'
+    case "exact"
       stats.zval = nan;
       switch lower(tail)
-        case 'both'
-          w = min(sum(x<my),sum(x>my));
-          pl = binocdf(w, n, 0.5);
-          p = 2*min(pl,1-pl);
-        case 'left'
-          w = sum(x<my);
-          p = binocdf(w, n, 0.5);
-        case 'right'
-          w = sum(x>my);
-          p = 1 - binocdf(w, n, 0.5);
-        otherwise
-          error('Invalid tail argument to signtest\n',[]);
-      end
+        case "both"
+          p = 2 * binocdf (min (neg_n, pos_n), n, 0.5);
+          p = min (1, p);
+        case "left"
+          p = binocdf (pos_n, n, 0.5);
+        case "right"
+          p = binocdf (neg_n, n, 0.5);
+      endswitch
+      stats.zval = NaN;
     case 'approximate'
       switch lower(tail)
         case 'both'
-          npos = sum(x>my);
-          nneg = sum(x<my);
-          w = min(npos,nneg);
-          stats.zval = (w - 0.5*n - 0.5*sign(npos-nneg))/sqrt(0.25*n);
-          pl = normcdf(stats.zval);
-          p = 2*min(pl,1-pl);
+          z_value = (pos_n - neg_n - sign (pos_n - neg_n)) / sqrt (n);
+          p = 2 * normcdf (- abs (z_value));
         case 'left'
-          npos = sum(x>my);
-          nneg = sum(x<my);
-          w = sum(x<my);
-          stats.zval = (w - 0.5*n - 0.5*sign(npos-nneg))/sqrt(0.25*n);
-          p = normcdf(stats.zval);
+          z_value = (pos_n - neg_n + 1) / sqrt (n);
+          p = normcdf (z_value);
         case 'right'
-          npos = sum(x>my);
-          nneg = sum(x<my);
-          w = sum(x>my);
-          stats.zval = (w - 0.5*n - 0.5*sign(npos-nneg))/sqrt(0.25*n);
-          p = 1-normcdf(stats.zval);
-        otherwise
-          error('Invalid tail argument to signtest\n',[]);
-      end
-    otherwise
-      error('Invalid method argument to signtest\n',[]);
-  end
+          z_value = (pos_n - neg_n - 1) / sqrt (n);
+          p = normcdf (- z_value);
+      endswitch
+      stats.zval = z_value;
+  endswitch
+  stats.sign = pos_n;
+  h = double (p < alpha);
+endfunction
 
-  stats.sign = w;
-
-  h = double(p < alpha);  
-
-end
+## Test suite
+%!error signtest ();
+%!error signtest ([]);
+%!error signtest (ones(1,10), ones(1,8));
+%!error signtest (ones(1,10), ones(2,10));
+%!error signtest (ones(2,10), 0);
+%!error signtest (ones(1,10), zeros(1,10), "alpha", 1.4)
+%!error signtest (ones(1,10), zeros(1,10), "tail", "<")
+%!error signtest (ones(1,10), zeros(1,10), "method", "some")
+%!test
+%! [pval, h, stats] = signtest ([-ones(1, 1000) 1], 0, "tail", "left");
+%! assert (pval, 1.091701889420221e-218, 1e-14);
+%! assert (h, 1);
+%! assert (stats.zval, -31.5437631079266, 1e-14);
+%!test
+%! [pval, h, stats] = signtest ([-2 -1 0 2 1 3 1], 0);
+%! assert (pval, 0.6875000000000006, 1e-14);
+%! assert (h, 0);
+%! assert (stats.zval, NaN);
+%! assert (stats.sign, 4);
+%!test
+%! [pval, h, stats] = signtest ([-2 -1 0 2 1 3 1], 0, "method", "approximate");
+%! assert (pval, 0.6830913983096086, 1e-14);
+%! assert (h, 0);
+%! assert (stats.zval, 0.4082482904638631, 1e-14);
+%! assert (stats.sign, 4);