version 0.0.6

cran · Nov 15, 2017 · 7fbf0bc · 7fbf0bc
commit 7fbf0bc
Show file tree

Hide file tree

Showing 17 changed files with 581 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,17 @@
+Package: IndepTest
+Type: Package
+Title: Nonparametric Independence Tests Based on Entropy Estimation
+Version: 0.0.6
+Date: 2017-11-15
+Author: Thomas B. Berrett <t.berrett@statslab.cam.ac.uk> [aut], Daniel J. Grose <dan.grose@lancaster.ac.uk> [cre,ctb], Richard J. Samworth <r.samworth@statslab.cam.ac.uk> [aut]  
+Maintainer: Daniel Grose <dan.grose@lancaster.ac.uk>
+Description: Implementations of the weighted Kozachenko-Leonenko entropy estimator and independence tests based on this estimator, (Kozachenko and Leonenko (1987) <http://mi.mathnet.ru/eng/ppi797>). Also includes a goodness-of-fit test for a linear model which is an independence test between covariates and errors.
+Depends: FNN,mvtnorm
+Imports: Rdpack
+RdMacros: Rdpack
+License: GPL
+RoxygenNote: 6.0.1
+NeedsCompilation: no
+Packaged: 2017-11-15 15:42:45 UTC; grosedj1
+Repository: CRAN
+Date/Publication: 2017-11-15 18:56:58 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,16 @@
+02a49c0761c870579115f01c9d10219d *DESCRIPTION
+6e1ea53b207b1a480a478e7ae1fb7375 *NAMESPACE
+43af04761dcbf7efeda7a534d6002002 *R/KLentropy.R
+221e586a552442d3de658e351e355d6c *R/L2OptW.R
+e3db14e51d93e45949b41cbf8dca93d9 *R/MINTauto.R
+7ff1d651749533783efdc6804632b9a2 *R/MINTknown.R
+2851aad67fb1f59e57670698e7166461 *R/MINTperm.R
+ed31600cbfe9583732119759154c41fa *R/MINTregression.R
+6f9389ed38d819047e32f50cd9aa919e *build/partial.rdb
+7fd3a0bcd8a50f6b75297bf350849092 *inst/REFERENCES.bib
+02aa69b2fd20f395c3705eab26eadfc6 *man/KLentropy.Rd
+6ff776ed5b3651cf9c1809ffe87a67b1 *man/L2OptW.Rd
+8a38149ca0cf9e717815828518ea38a3 *man/MINTauto.Rd
+01ef37a8ab5d3cf063e942139594e430 *man/MINTknown.Rd
+afa0cde653a70687f2fcb488f6dd2aa8 *man/MINTperm.Rd
+f551fba2f91a58d53db42844420696fb *man/MINTregression.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,4 @@
+exportPattern("^[[:alpha:]]+")
+import(FNN)
+import(mvtnorm)
+importFrom(Rdpack,reprompt)
diff --git a/R/KLentropy.R b/R/KLentropy.R
@@ -0,0 +1,44 @@
+
+#' KLentropy
+#'
+#' Calculates the (weighted) Kozachenko--Leonenko entropy estimator studied in Berrett, Samworth and Yuan (2017), which is based on the \eqn{k}-nearest neighbour distances of the sample.
+#'
+#' @param x The \eqn{n \times d} data matrix.
+#' @param k The tuning parameter that gives the maximum number of neighbours that will be considered by the estimator.
+#' @param weights Specifies whether a weighted or unweighted estimator is used. If a weighted estimator is to be used then the default (\code{weights=TRUE}) results in the weights being calculated by \code{\link{L2OptW}}, otherwise the user may specify their own weights.
+#'
+#' @return The first element of the list is the unweighted estimator for the value of 1 up to the user-specified \eqn{k}. The second element of the list is the weighted estimator, obtained by taking the inner product between the first element of the list and the weight vector.
+#'
+#' @references \insertRef{BSY2017}{IndepTest}
+#' 
+#' @examples
+#' n=1000; x=rnorm(n); KLentropy(x,30)  # The true value is 0.5*log(2*pi*exp(1)) = 1.42.
+#' n=5000; x=matrix(rnorm(4*n),ncol=4)  # The true value is 2*log(2*pi*exp(1)) = 5.68
+#' KLentropy(x,30,weights=FALSE)            # Unweighted estimator
+#' KLentropy(x,30,weights=TRUE)                           # Weights chosen by L2OptW
+#' w=runif(30); w=w/sum(w); KLentropy(x,30,weights=w)  # User-specified weights 
+#' 
+#' @export
+KLentropy=function(x,k,weights=FALSE){
+  dim=dim(as.matrix(x)); n=dim[1]; d=dim[2]
+  V=pi^(d/2)/gamma(1+d/2)
+  if(length(weights)>1){
+    w=weights
+  }else if(weights==TRUE){
+      w=L2OptW(k,d)
+  }else{
+    w=c(rep(0,k-1),1)
+  }
+  rho=knn.dist(x,k=k)
+  H=(1/n)*colSums(t(log(t(rho)^d*V*(n-1))-digamma(1:k)))
+  value=list(); value[[1]]=H; value[[2]]=sum(H*w)
+  return(value)
+}
+
+
+
+
+
+
+
+
diff --git a/R/L2OptW.R b/R/L2OptW.R
@@ -0,0 +1,39 @@
+
+#' L2OptW
+#'
+#' Calculates a weight vector to be used for the weighted Kozachenko--Leonenko estimator. The weight vector has minimum \eqn{L_2} norm subject to the linear and sum-to-one constraints of (2) in Berrett, Samworth and Yuan (2017). 
+#'
+#' @param k The tuning parameter that gives the number of neighbours that will be considered by the weighted Kozachenko--Leonenko estimator.
+#' @param d The dimension of the data.
+#'
+#' @return The weight vector that is the solution of the optimisation problem.
+#'
+#' @examples
+#' # When d < 4 there are no linear constraints and the returned vector is (0,0,...,0,1).
+#' L2OptW(100,3)    
+#' w=L2OptW(100,4)
+#' plot(w,type="l")
+#' w=L2OptW(100,8);
+#' # For each multiple of 4 that d increases an extra constraint is added.
+#' plot(w,type="l")  
+#' w=L2OptW(100,12)
+#' plot(w, type="l") # This can be seen in the shape of the plot
+#'
+#' @references \insertRef{BSY2017}{IndepTest}
+#' 
+#' @export
+L2OptW=function(k,d){
+  dprime=floor(d/4)
+  if(dprime==0){
+    return(c(rep(0,k-1),1))
+    }else{
+    G=matrix(rep(0,(dprime+1)*k),ncol=k)
+    G[1,]=rep(1,k)
+    for(l in 1:dprime){
+      G[l+1,]=exp(lgamma(1:k+2*l/d)-lgamma(1:k)) # exp(lgamma-lgamma) better than gamma/gamma for large k
+    }
+    A=G%*%t(G)
+    Lambda=solve(A,c(1,rep(0,dprime)))
+    return(as.vector(t(G)%*%Lambda))
+  }
+}
diff --git a/R/MINTauto.R b/R/MINTauto.R
@@ -0,0 +1,43 @@
+
+#' MINTauto
+#'
+#'  Performs an independence test without knowledge of either marginal distribution using permutations and using a data-driven choice of \eqn{k}.
+#'
+#' @param x The \eqn{n \times d_{X}} data matrix of the \eqn{X} values.
+#' @param y The response vector of length \eqn{n \times d_{Y}} data matrix of the \eqn{Y} values.
+#' @param kmax The maximum value of \eqn{k} to be considered for estimation of the joint entropy \eqn{H(X,Y)}.
+#' @param B1 The number of repetitions used when choosing \eqn{k}, set to 1000 by default.
+#' @param B2 The  number of permutations to use for the final test, set at 1000 by default.
+#' 
+#' @return The \eqn{p}-value corresponding the independence test carried out and the value of \eqn{k} used.
+#'
+#' @examples
+#' \donttest{
+#' # Independent univariate normal data
+#' x=rnorm(1000); y=rnorm(1000);
+#' MINTauto(x,y,kmax=200,B1=100,B2=100)
+#' # Dependent univariate normal data
+#' library(mvtnorm)
+#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))  
+#' MINTauto(data[,1],data[,2],kmax=200,B1=100,B2=100)
+#' # Dependent multivariate normal data
+#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
+#' data=rmvnorm(1000,sigma=Sigma)
+#' MINTauto(data[,1:3],data[,4],kmax=50,B1=100,B2=100)
+#'}
+#'
+#' @references \insertRef{BS2017}{IndepTest}
+#' 
+#' @export
+MINTauto=function(x,y,kmax,B1=1000,B2=1000){
+  data=cbind(x,y); d=dim(data)[2]; n=dim(data)[1]
+  Hp=matrix(rep(0,B1*kmax),ncol=kmax)
+  for(b in 1:B1){
+    y1=as.matrix(y)[sample(n),]; data1=cbind(x,y1)
+    y2=as.matrix(y)[sample(n),]; data2=cbind(x,y2)
+    Hp[b,]=KLentropy(data1,k=kmax)[[1]]-KLentropy(data2,k=kmax)[[1]]
+  }
+  MSE=(1/B1)*colSums(Hp^2); kopt=which.min(MSE)
+  return(c(MINTperm(x,y,kopt,B=B2),kopt))
+}
+
diff --git a/R/MINTknown.R b/R/MINTknown.R
@@ -0,0 +1,47 @@
+#' MINTknown
+#'
+#' Performs an independence test when it is assumed that the marginal distribution of \eqn{Y} is known and can be simulated from.
+#'
+#' @param x The \eqn{n \times d_X} data matrix of \eqn{X} values.
+#' @param y The \eqn{n \times d_Y} data matrix of \eqn{Y} values.
+#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,Y)}.
+#' @param ky The value of \eqn{k} to be used for estimation of the marginal entropy \eqn{H(Y)}.
+#' @param w The weight vector to used for estimation of the joint entropy \eqn{H(X,Y)}, with the same options as for the \code{\link{KLentropy}} function.
+#' @param wy The weight vector to used for estimation of the marginal entropy \eqn{H(Y)}, with the same options as for the \code{\link{KLentropy}} function.
+#' @param y0 The data matrix of simulated \eqn{Y} values.
+#' 
+#' @return The \eqn{p}-value corresponding the independence test carried out.
+#'
+#' @examples
+#' library(mvtnorm)
+#' x=rnorm(1000); y=rnorm(1000);
+#' # Independent univariate normal data
+#' MINTknown(x,y,k=20,ky=30,y0=rnorm(100000))  
+#' library(mvtnorm)
+#' # Dependent univariate normal data
+#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))
+#' # Dependent multivariate normal data
+#' MINTknown(data[,1],data[,2],k=20,ky=30,y0=rnorm(100000))   
+#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
+#' data=rmvnorm(1000,sigma=Sigma)
+#' MINTknown(data[,1:3],data[,4],k=20,ky=30,w=TRUE,wy=FALSE,y0=rnorm(100000))
+#'
+#' @references \insertRef{BS2017}{IndepTest}
+#' 
+#' @export
+MINTknown=function(x,y,k,ky,w=FALSE,wy=FALSE,y0){
+  n=dim(cbind(x,y))[1]
+  B=dim(as.matrix(y0))[1]%/%n   # The number of null statistics we can calculate
+  nullstat=rep(0,B)
+  for(b in 1:B){
+    yb=as.matrix(y0)[((b-1)*n+1):(b*n),] # Extracting the bth null sample
+    nullstat[b]=KLentropy(yb,k=ky,weights=wy)[[2]]-KLentropy(cbind(x,yb),k=k,weights=w)[[2]] # Calculating null stat
+  }
+  data=cbind(x,y)          # Real data
+  stat=KLentropy(y,k=ky,weights=wy)[[2]]-KLentropy(data,k=k,weights=w)[[2]] # Calculating the real test stat
+  p=(1+sum(nullstat>=stat))/(B+1) # Comparing the real test stat to the null test stats
+  return(p)
+}
+
+
+
diff --git a/R/MINTperm.R b/R/MINTperm.R
@@ -0,0 +1,47 @@
+
+
+#' MINTknown
+#'
+#' Performs an independence test without knowledge of either marginal distribution using permutations.
+#'
+#' @param x The \eqn{n \times d_X} data matrix of \eqn{X} values.
+#' @param y The \eqn{n \times d_Y} data matrix of \eqn{Y} values.
+#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,Y)}.
+#' @param w The weight vector to used for estimation of the joint entropy \eqn{H(X,Y)}, with the same options as for the \code{\link{KLentropy}} function.
+#' @param B The number of permutations to use, set at 1000 by default.
+#' 
+#' @return The \eqn{p}-value corresponding the independence test carried out.
+#'
+#' @examples
+#' # Independent univariate normal data
+#' x=rnorm(1000); y=rnorm(1000)
+#' MINTperm(x,y,k=20,B=100)
+#' # Dependent univariate normal data
+#' library(mvtnorm)
+#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))  
+#' MINTperm(data[,1],data[,2],k=20,B=100)
+#' # Dependent multivariate normal data
+#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
+#' data=rmvnorm(1000,sigma=Sigma)
+#' MINTperm(data[,1:3],data[,4],k=20,w=TRUE,B=100)
+#'
+#' @references \insertRef{BS2017}{IndepTest}
+#' 
+#' @export
+MINTperm=function(x,y,k,w=FALSE,B=1000){
+  data=cbind(x,y); n=dim(data)[1]
+  Hp=rep(0,B)
+  for(b in 1:B){
+    yp=as.matrix(y)[sample(n),]
+    Hp[b]=KLentropy(cbind(x,yp),k,weights=w)[[2]]
+  }
+  H=KLentropy(data,k,weights=w)[[2]]
+  p=(1+sum(Hp<=H))/(B+1)
+  return(p)
+}
+
+
+
+
+
+
diff --git a/R/MINTregression.R b/R/MINTregression.R
@@ -0,0 +1,60 @@
+
+
+#' MINTregression
+#'
+#' Performs a goodness-of-fit test of a linear model by testing whether the errors are independent of the covariates.
+#'
+#' @param x The \eqn{n \times p} design matrix.
+#' @param y The response vector of length \eqn{n}.
+#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,\epsilon)}.
+#' @param keps The value of \eqn{k} to be used for estimation of the marginal entropy \eqn{H(\epsilon)}.
+#' @param w The weight vector to be used for estimation of the joint entropy \eqn{H(X,\epsilon)}, with the same options as for the \code{\link{KLentropy}} function.
+#'@param eps A vector of null errors which should have the same distribution as the errors are assumed to have in the linear model.
+#' 
+#' @return The \eqn{p}-value corresponding the independence test carried out.
+#'
+#' @examples
+#' # Correctly specified linear model
+#' x=runif(100,min=-1.5,max=1.5); y=x+rnorm(100)
+#' plot(lm(y~x),which=1) 
+#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
+#' # Misspecified mean linear model
+#' x=runif(100,min=-1.5,max=1.5); y=x^3+rnorm(100)
+#' plot(lm(y~x),which=1)
+#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
+#' # Heteroscedastic linear model
+#' x=runif(100,min=-1.5,max=1.5); y=x+x*rnorm(100);
+#' plot(lm(y~x),which=1) 
+#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
+#' # Multivariate misspecified mean linear model
+#' x=matrix(runif(1500,min=-1.5,max=1.5),ncol=3)
+#' y=x[,1]^3+0.3*x[,2]-0.3*x[,3]+rnorm(500)
+#' plot(lm(y~x),which=1)
+#' MINTregression(x,y,30,50,w=TRUE,rnorm(50000))  
+#'
+#' @references \insertRef{BS2017}{IndepTest}
+#' 
+#' @export
+MINTregression=function(x,y,k,keps,w=FALSE,eps){
+  dim=dim(cbind(x,y)); d=dim[2]; n=dim[1]
+  P=x%*%solve(t(x)%*%x)%*%t(x) # Projection matrix  
+  B=length(eps)%/%n         # The number of null statistics we can calculate
+  nullstat=rep(0,B)
+  for(b in 1:B){
+    data=eps[((b-1)*n+1):(b*n)]
+    res=data-P%*%data; sigma2=sum(res^2)/n
+    stdres=res/sqrt(sigma2)
+    nullstat[b]=KLentropy(stdres,keps,weights=FALSE)[[2]]-KLentropy(cbind(x,stdres),k,w)[[2]]
+  }
+  res=y-P%*%y; sigma2=sum(res^2)/n # Residuals and \hat{\sigma}^2
+  stdres=res/sqrt(sigma2)
+  teststat=KLentropy(stdres,keps,weights=FALSE)[[2]]-KLentropy(cbind(x,stdres),k,w)[[2]]
+  p=(1+sum(nullstat>=teststat))/(B+1)
+  return(p)
+}
+
+
+
+
+
+
diff --git a/build/partial.rdb b/build/partial.rdb
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
@@ -0,0 +1,15 @@
+
+
+@article{BSY2017,
+  author  = {{Berrett, T. B.} and {Samworth, R. J.} and {Yuan, M.}}, 
+  title   = {Efficient multivariate entropy estimation via k-nearest neighbour distances.},
+  journal = {Submitted},
+  year = {2017},
+}
+
+@article{BS2017,
+  author  = {{Berrett, T. B.} and {Samworth, R. J.}}, 
+  title   = {Nonparametric independence testing via mutual information.},
+  journal = {In preparation},
+  year = {2017},
+}
diff --git a/man/KLentropy.Rd b/man/KLentropy.Rd