Skip to content

Commit

Permalink
version 0.0.6
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Grose authored and cran-robot committed Nov 15, 2017
0 parents commit 7fbf0bc
Show file tree
Hide file tree
Showing 17 changed files with 581 additions and 0 deletions.
17 changes: 17 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,17 @@
Package: IndepTest
Type: Package
Title: Nonparametric Independence Tests Based on Entropy Estimation
Version: 0.0.6
Date: 2017-11-15
Author: Thomas B. Berrett <t.berrett@statslab.cam.ac.uk> [aut], Daniel J. Grose <dan.grose@lancaster.ac.uk> [cre,ctb], Richard J. Samworth <r.samworth@statslab.cam.ac.uk> [aut]
Maintainer: Daniel Grose <dan.grose@lancaster.ac.uk>
Description: Implementations of the weighted Kozachenko-Leonenko entropy estimator and independence tests based on this estimator, (Kozachenko and Leonenko (1987) <http://mi.mathnet.ru/eng/ppi797>). Also includes a goodness-of-fit test for a linear model which is an independence test between covariates and errors.
Depends: FNN,mvtnorm
Imports: Rdpack
RdMacros: Rdpack
License: GPL
RoxygenNote: 6.0.1
NeedsCompilation: no
Packaged: 2017-11-15 15:42:45 UTC; grosedj1
Repository: CRAN
Date/Publication: 2017-11-15 18:56:58 UTC
16 changes: 16 additions & 0 deletions MD5
@@ -0,0 +1,16 @@
02a49c0761c870579115f01c9d10219d *DESCRIPTION
6e1ea53b207b1a480a478e7ae1fb7375 *NAMESPACE
43af04761dcbf7efeda7a534d6002002 *R/KLentropy.R
221e586a552442d3de658e351e355d6c *R/L2OptW.R
e3db14e51d93e45949b41cbf8dca93d9 *R/MINTauto.R
7ff1d651749533783efdc6804632b9a2 *R/MINTknown.R
2851aad67fb1f59e57670698e7166461 *R/MINTperm.R
ed31600cbfe9583732119759154c41fa *R/MINTregression.R
6f9389ed38d819047e32f50cd9aa919e *build/partial.rdb
7fd3a0bcd8a50f6b75297bf350849092 *inst/REFERENCES.bib
02aa69b2fd20f395c3705eab26eadfc6 *man/KLentropy.Rd
6ff776ed5b3651cf9c1809ffe87a67b1 *man/L2OptW.Rd
8a38149ca0cf9e717815828518ea38a3 *man/MINTauto.Rd
01ef37a8ab5d3cf063e942139594e430 *man/MINTknown.Rd
afa0cde653a70687f2fcb488f6dd2aa8 *man/MINTperm.Rd
f551fba2f91a58d53db42844420696fb *man/MINTregression.Rd
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -0,0 +1,4 @@
exportPattern("^[[:alpha:]]+")
import(FNN)
import(mvtnorm)
importFrom(Rdpack,reprompt)
44 changes: 44 additions & 0 deletions R/KLentropy.R
@@ -0,0 +1,44 @@

#' KLentropy
#'
#' Calculates the (weighted) Kozachenko--Leonenko entropy estimator studied in Berrett, Samworth and Yuan (2017), which is based on the \eqn{k}-nearest neighbour distances of the sample.
#'
#' @param x The \eqn{n \times d} data matrix.
#' @param k The tuning parameter that gives the maximum number of neighbours that will be considered by the estimator.
#' @param weights Specifies whether a weighted or unweighted estimator is used. If a weighted estimator is to be used then the default (\code{weights=TRUE}) results in the weights being calculated by \code{\link{L2OptW}}, otherwise the user may specify their own weights.
#'
#' @return The first element of the list is the unweighted estimator for the value of 1 up to the user-specified \eqn{k}. The second element of the list is the weighted estimator, obtained by taking the inner product between the first element of the list and the weight vector.
#'
#' @references \insertRef{BSY2017}{IndepTest}
#'
#' @examples
#' n=1000; x=rnorm(n); KLentropy(x,30) # The true value is 0.5*log(2*pi*exp(1)) = 1.42.
#' n=5000; x=matrix(rnorm(4*n),ncol=4) # The true value is 2*log(2*pi*exp(1)) = 5.68
#' KLentropy(x,30,weights=FALSE) # Unweighted estimator
#' KLentropy(x,30,weights=TRUE) # Weights chosen by L2OptW
#' w=runif(30); w=w/sum(w); KLentropy(x,30,weights=w) # User-specified weights
#'
#' @export
KLentropy=function(x,k,weights=FALSE){
dim=dim(as.matrix(x)); n=dim[1]; d=dim[2]
V=pi^(d/2)/gamma(1+d/2)
if(length(weights)>1){
w=weights
}else if(weights==TRUE){
w=L2OptW(k,d)
}else{
w=c(rep(0,k-1),1)
}
rho=knn.dist(x,k=k)
H=(1/n)*colSums(t(log(t(rho)^d*V*(n-1))-digamma(1:k)))
value=list(); value[[1]]=H; value[[2]]=sum(H*w)
return(value)
}








39 changes: 39 additions & 0 deletions R/L2OptW.R
@@ -0,0 +1,39 @@

#' L2OptW
#'
#' Calculates a weight vector to be used for the weighted Kozachenko--Leonenko estimator. The weight vector has minimum \eqn{L_2} norm subject to the linear and sum-to-one constraints of (2) in Berrett, Samworth and Yuan (2017).
#'
#' @param k The tuning parameter that gives the number of neighbours that will be considered by the weighted Kozachenko--Leonenko estimator.
#' @param d The dimension of the data.
#'
#' @return The weight vector that is the solution of the optimisation problem.
#'
#' @examples
#' # When d < 4 there are no linear constraints and the returned vector is (0,0,...,0,1).
#' L2OptW(100,3)
#' w=L2OptW(100,4)
#' plot(w,type="l")
#' w=L2OptW(100,8);
#' # For each multiple of 4 that d increases an extra constraint is added.
#' plot(w,type="l")
#' w=L2OptW(100,12)
#' plot(w, type="l") # This can be seen in the shape of the plot
#'
#' @references \insertRef{BSY2017}{IndepTest}
#'
#' @export
L2OptW=function(k,d){
dprime=floor(d/4)
if(dprime==0){
return(c(rep(0,k-1),1))
}else{
G=matrix(rep(0,(dprime+1)*k),ncol=k)
G[1,]=rep(1,k)
for(l in 1:dprime){
G[l+1,]=exp(lgamma(1:k+2*l/d)-lgamma(1:k)) # exp(lgamma-lgamma) better than gamma/gamma for large k
}
A=G%*%t(G)
Lambda=solve(A,c(1,rep(0,dprime)))
return(as.vector(t(G)%*%Lambda))
}
}
43 changes: 43 additions & 0 deletions R/MINTauto.R
@@ -0,0 +1,43 @@

#' MINTauto
#'
#' Performs an independence test without knowledge of either marginal distribution using permutations and using a data-driven choice of \eqn{k}.
#'
#' @param x The \eqn{n \times d_{X}} data matrix of the \eqn{X} values.
#' @param y The response vector of length \eqn{n \times d_{Y}} data matrix of the \eqn{Y} values.
#' @param kmax The maximum value of \eqn{k} to be considered for estimation of the joint entropy \eqn{H(X,Y)}.
#' @param B1 The number of repetitions used when choosing \eqn{k}, set to 1000 by default.
#' @param B2 The number of permutations to use for the final test, set at 1000 by default.
#'
#' @return The \eqn{p}-value corresponding the independence test carried out and the value of \eqn{k} used.
#'
#' @examples
#' \donttest{
#' # Independent univariate normal data
#' x=rnorm(1000); y=rnorm(1000);
#' MINTauto(x,y,kmax=200,B1=100,B2=100)
#' # Dependent univariate normal data
#' library(mvtnorm)
#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))
#' MINTauto(data[,1],data[,2],kmax=200,B1=100,B2=100)
#' # Dependent multivariate normal data
#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
#' data=rmvnorm(1000,sigma=Sigma)
#' MINTauto(data[,1:3],data[,4],kmax=50,B1=100,B2=100)
#'}
#'
#' @references \insertRef{BS2017}{IndepTest}
#'
#' @export
MINTauto=function(x,y,kmax,B1=1000,B2=1000){
data=cbind(x,y); d=dim(data)[2]; n=dim(data)[1]
Hp=matrix(rep(0,B1*kmax),ncol=kmax)
for(b in 1:B1){
y1=as.matrix(y)[sample(n),]; data1=cbind(x,y1)
y2=as.matrix(y)[sample(n),]; data2=cbind(x,y2)
Hp[b,]=KLentropy(data1,k=kmax)[[1]]-KLentropy(data2,k=kmax)[[1]]
}
MSE=(1/B1)*colSums(Hp^2); kopt=which.min(MSE)
return(c(MINTperm(x,y,kopt,B=B2),kopt))
}

47 changes: 47 additions & 0 deletions R/MINTknown.R
@@ -0,0 +1,47 @@
#' MINTknown
#'
#' Performs an independence test when it is assumed that the marginal distribution of \eqn{Y} is known and can be simulated from.
#'
#' @param x The \eqn{n \times d_X} data matrix of \eqn{X} values.
#' @param y The \eqn{n \times d_Y} data matrix of \eqn{Y} values.
#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,Y)}.
#' @param ky The value of \eqn{k} to be used for estimation of the marginal entropy \eqn{H(Y)}.
#' @param w The weight vector to used for estimation of the joint entropy \eqn{H(X,Y)}, with the same options as for the \code{\link{KLentropy}} function.
#' @param wy The weight vector to used for estimation of the marginal entropy \eqn{H(Y)}, with the same options as for the \code{\link{KLentropy}} function.
#' @param y0 The data matrix of simulated \eqn{Y} values.
#'
#' @return The \eqn{p}-value corresponding the independence test carried out.
#'
#' @examples
#' library(mvtnorm)
#' x=rnorm(1000); y=rnorm(1000);
#' # Independent univariate normal data
#' MINTknown(x,y,k=20,ky=30,y0=rnorm(100000))
#' library(mvtnorm)
#' # Dependent univariate normal data
#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))
#' # Dependent multivariate normal data
#' MINTknown(data[,1],data[,2],k=20,ky=30,y0=rnorm(100000))
#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
#' data=rmvnorm(1000,sigma=Sigma)
#' MINTknown(data[,1:3],data[,4],k=20,ky=30,w=TRUE,wy=FALSE,y0=rnorm(100000))
#'
#' @references \insertRef{BS2017}{IndepTest}
#'
#' @export
MINTknown=function(x,y,k,ky,w=FALSE,wy=FALSE,y0){
n=dim(cbind(x,y))[1]
B=dim(as.matrix(y0))[1]%/%n # The number of null statistics we can calculate
nullstat=rep(0,B)
for(b in 1:B){
yb=as.matrix(y0)[((b-1)*n+1):(b*n),] # Extracting the bth null sample
nullstat[b]=KLentropy(yb,k=ky,weights=wy)[[2]]-KLentropy(cbind(x,yb),k=k,weights=w)[[2]] # Calculating null stat
}
data=cbind(x,y) # Real data
stat=KLentropy(y,k=ky,weights=wy)[[2]]-KLentropy(data,k=k,weights=w)[[2]] # Calculating the real test stat
p=(1+sum(nullstat>=stat))/(B+1) # Comparing the real test stat to the null test stats
return(p)
}



47 changes: 47 additions & 0 deletions R/MINTperm.R
@@ -0,0 +1,47 @@


#' MINTknown
#'
#' Performs an independence test without knowledge of either marginal distribution using permutations.
#'
#' @param x The \eqn{n \times d_X} data matrix of \eqn{X} values.
#' @param y The \eqn{n \times d_Y} data matrix of \eqn{Y} values.
#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,Y)}.
#' @param w The weight vector to used for estimation of the joint entropy \eqn{H(X,Y)}, with the same options as for the \code{\link{KLentropy}} function.
#' @param B The number of permutations to use, set at 1000 by default.
#'
#' @return The \eqn{p}-value corresponding the independence test carried out.
#'
#' @examples
#' # Independent univariate normal data
#' x=rnorm(1000); y=rnorm(1000)
#' MINTperm(x,y,k=20,B=100)
#' # Dependent univariate normal data
#' library(mvtnorm)
#' data=rmvnorm(1000,sigma=matrix(c(1,0.5,0.5,1),ncol=2))
#' MINTperm(data[,1],data[,2],k=20,B=100)
#' # Dependent multivariate normal data
#' Sigma=matrix(c(1,0,0,0,0,1,0,0,0,0,1,0.5,0,0,0.5,1),ncol=4)
#' data=rmvnorm(1000,sigma=Sigma)
#' MINTperm(data[,1:3],data[,4],k=20,w=TRUE,B=100)
#'
#' @references \insertRef{BS2017}{IndepTest}
#'
#' @export
MINTperm=function(x,y,k,w=FALSE,B=1000){
data=cbind(x,y); n=dim(data)[1]
Hp=rep(0,B)
for(b in 1:B){
yp=as.matrix(y)[sample(n),]
Hp[b]=KLentropy(cbind(x,yp),k,weights=w)[[2]]
}
H=KLentropy(data,k,weights=w)[[2]]
p=(1+sum(Hp<=H))/(B+1)
return(p)
}






60 changes: 60 additions & 0 deletions R/MINTregression.R
@@ -0,0 +1,60 @@


#' MINTregression
#'
#' Performs a goodness-of-fit test of a linear model by testing whether the errors are independent of the covariates.
#'
#' @param x The \eqn{n \times p} design matrix.
#' @param y The response vector of length \eqn{n}.
#' @param k The value of \eqn{k} to be used for estimation of the joint entropy \eqn{H(X,\epsilon)}.
#' @param keps The value of \eqn{k} to be used for estimation of the marginal entropy \eqn{H(\epsilon)}.
#' @param w The weight vector to be used for estimation of the joint entropy \eqn{H(X,\epsilon)}, with the same options as for the \code{\link{KLentropy}} function.
#'@param eps A vector of null errors which should have the same distribution as the errors are assumed to have in the linear model.
#'
#' @return The \eqn{p}-value corresponding the independence test carried out.
#'
#' @examples
#' # Correctly specified linear model
#' x=runif(100,min=-1.5,max=1.5); y=x+rnorm(100)
#' plot(lm(y~x),which=1)
#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
#' # Misspecified mean linear model
#' x=runif(100,min=-1.5,max=1.5); y=x^3+rnorm(100)
#' plot(lm(y~x),which=1)
#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
#' # Heteroscedastic linear model
#' x=runif(100,min=-1.5,max=1.5); y=x+x*rnorm(100);
#' plot(lm(y~x),which=1)
#' MINTregression(x,y,5,10,w=FALSE,rnorm(10000))
#' # Multivariate misspecified mean linear model
#' x=matrix(runif(1500,min=-1.5,max=1.5),ncol=3)
#' y=x[,1]^3+0.3*x[,2]-0.3*x[,3]+rnorm(500)
#' plot(lm(y~x),which=1)
#' MINTregression(x,y,30,50,w=TRUE,rnorm(50000))
#'
#' @references \insertRef{BS2017}{IndepTest}
#'
#' @export
MINTregression=function(x,y,k,keps,w=FALSE,eps){
dim=dim(cbind(x,y)); d=dim[2]; n=dim[1]
P=x%*%solve(t(x)%*%x)%*%t(x) # Projection matrix
B=length(eps)%/%n # The number of null statistics we can calculate
nullstat=rep(0,B)
for(b in 1:B){
data=eps[((b-1)*n+1):(b*n)]
res=data-P%*%data; sigma2=sum(res^2)/n
stdres=res/sqrt(sigma2)
nullstat[b]=KLentropy(stdres,keps,weights=FALSE)[[2]]-KLentropy(cbind(x,stdres),k,w)[[2]]
}
res=y-P%*%y; sigma2=sum(res^2)/n # Residuals and \hat{\sigma}^2
stdres=res/sqrt(sigma2)
teststat=KLentropy(stdres,keps,weights=FALSE)[[2]]-KLentropy(cbind(x,stdres),k,w)[[2]]
p=(1+sum(nullstat>=teststat))/(B+1)
return(p)
}






Binary file added build/partial.rdb
Binary file not shown.
15 changes: 15 additions & 0 deletions inst/REFERENCES.bib
@@ -0,0 +1,15 @@


@article{BSY2017,
author = {{Berrett, T. B.} and {Samworth, R. J.} and {Yuan, M.}},
title = {Efficient multivariate entropy estimation via k-nearest neighbour distances.},
journal = {Submitted},
year = {2017},
}

@article{BS2017,
author = {{Berrett, T. B.} and {Samworth, R. J.}},
title = {Nonparametric independence testing via mutual information.},
journal = {In preparation},
year = {2017},
}
32 changes: 32 additions & 0 deletions man/KLentropy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7fbf0bc

Please sign in to comment.