/
correlate.R
383 lines (319 loc) · 13.7 KB
/
correlate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# compute correlation matrix:
# - automatically removes non-numeric variables from the data
# - automatically does "pairwise.complete.obs" for handling missing data
# - can report the result of correlation tests if requested
# - can do pearson, spearman and kendall
# - defaults to no test, and to pearson correlation
#' Correlation matrices
#'
#' @description Computes a correlation matrix and runs hypothesis tests with corrections for multiple comparisons
#'
#' @param x Matrix or data frame containing variables to be correlated
#' @param y Optionally, a second set of variables to be correlated with those in \code{x}
#' @param test Should hypothesis tests be displayed? (Default=\code{FALSE})
#' @param corr.method What kind of correlations should be computed? Default is \code{"pearson"}, but \code{"spearman"} and \code{"kendall"} are also supported
#' @param p.adjust.method What method should be used to correct for multiple comparisons. Default value is \code{"holm"}, and the allowable values are the same as for \code{\link{p.adjust}}
#'
#' @details The \code{correlate} function calculates a correlation matrix
#' between all pairs of variables. Much like the \code{cor} function, if the
#' user inputs only one set of variables (\code{x}) then it computes all
#' pairwise correlations between the variables in \code{x}. If the user
#' specifies both \code{x} and \code{y} it correlates the variables in
#' \code{x} with the variables in \code{y}.
#'
#' Unlike the \code{cor} function, \code{correlate} does not generate an
#' error if some of the variables are categorical (i.e., factors). Variables
#' that are not numeric (or integer) class are simply ignored. They appear in
#' the output, but no correlations are reported for those variables. The
#' decision to have the \code{correlate} function allow the user a little
#' leniency when the input contains non-numeric variables should be explained.
#' The motivation is pedagogical rather than statistical. It is sometimes the
#' case in psychology that students need to work with correlation matrices
#' before they are comfortable subsetting a data frame, so it is convenient
#' to allow them to type commands like \code{correlate(data)} even when
#' \code{data} contains variables for which Pearson/Spearman correlations
#' are not appropriate. (It is also useful to use the output of
#' \code{correlate} to illustrate the fact that Pearson correlations should
#' not be used for categorical variables).
#'
#' A second difference between \code{cor} and \code{correlate} is that
#' \code{correlate} runs hypothesis tests for all correlations in the
#' correlation matrix (using the \code{cor.test} function to do the work).
#' The results of the tests are only displayed to the user if
#' \code{test=TRUE}. This is a pragmatic choice, given the (perhaps
#' unfortunate) fact that psychologists often want to see the results
#' of these tests: it is probably not coincidental that the \code{corr.test}
#' function in the \pkg{psych} package already provides this functionality
#' (though the output is difficult for novices to read).
#'
#' The concern with running hypothesis tests for all elements of a correlation
#' matrix is inflated Type I error rates. To minimise this risk, reported
#' p-values are adjusted using the Holm method. The user can change this
#' setting by specifying \code{p.adjust.method}. See \code{\link{p.adjust}}
#' for details.
#'
#' Missing data are handled using pairwise complete cases.
#'
#' @return The printed output shows the correlation matrix, and if tests are
#' requested it also reports a matrix of p-values and sample sizes associated
#' with each correlation (these can vary if there are missing data). The
#' underlying data structure is an object of class \code{correlate} (an S3
#' class). It is effectively a list containing four elements:
#' \code{correlation} is the correlation matrix, \code{p.value} is the matrix
#' of p-values, \code{sample.size} is the matrix of sample sizes, and
#' \code{args} is a vector that stores information about what the user
#' requested.
#'
#' @export
#'
#' @examples
#' # data frame with factors and missing values
#' data <- data.frame(
#' anxiety = c(1.31,2.72,3.18,4.21,5.55,NA),
#' stress = c(2.01,3.45,1.99,3.25,4.27,6.80),
#' depression = c(2.51,1.77,3.34,5.83,9.01,7.74),
#' happiness = c(4.02,3.66,5.23,6.37,7.83,1.18),
#' gender = factor( c("male","female","female","male","female","female") ),
#' ssri = factor( c("no","no","no",NA,"yes","yes") )
#' )
#'
#' # default output is just the (Pearson) correlation matrix
#' correlate( data )
#'
#' # other types of correlation:
#' correlate( data, corr.method="spearman" )
#'
#' # two meaningful subsets to be correlated:
#' nervous <- data[,c("anxiety","stress")]
#' happy <- data[,c("happiness","depression","ssri")]
#'
#' # default output for two matrix input
#' correlate( nervous, happy )
#'
#' # the same examples, with Holm-corrected p-values
#' correlate( data, test=TRUE )
#' correlate( nervous, happy, test=TRUE )
#'
correlate <- function( x, y=NULL, test=FALSE, corr.method="pearson", p.adjust.method="holm" ) {
# did the user specify two input matrices?
two.inputs <- !is.null(y)
# allow numeric vectors
if( is.vector(x) & is.numeric(x) ) {
x <- data.frame(x)
call <- match.call()
n <- call[[2]]
names(x) <- "x.var"
if(class(n) == "name") names(x) <- as.character(n)
}
if( two.inputs & is.vector(y) & is.numeric(y) ) {
y <- data.frame(y)
call <- match.call()
n <- call[[3]]
names(y) <- "y.var"
if(class(n) == "name") names(y) <- as.character(n)
}
# check input matrices are data frames
if( !(class(x) %in% c("matrix","data.frame") )) {
stop( 'x must be a matrix or data frame' )
}
if( two.inputs & !(class(y) %in% c("matrix","data.frame") )) {
stop( 'y must be a matrix or data frame' )
}
# warn if x and y appear to have the same variables!
# coerce to data frame
if( class(x) =="matrix" ) x <- as.data.frame(x)
if( two.inputs & class(y) == "matrix") y <- as.data.frame(y)
# store other args
args <- c( two.inputs=two.inputs, test=test,
corr.method=corr.method, p.adjust.method=p.adjust.method )
# define a function to run correlation test and trap warning message
getCT <- function( x,y, method ) {
# get the correlation test, trapping the ties problem warning as needed...
old.warn <- options(warn=2) # convert warnings to errors
ct <- try( stats::cor.test( x, y, method=method), silent=TRUE ) # try the correlation
tp <- FALSE # assume no ties problem unless...
# check for failures:
if( class(ct) == "try-error" ) {
# handle the case when the "error" was a ties warning
tp <- length( grep("exact p-value with ties",ct )) > 0
if( tp ) { # if it was a ties problem...
options(warn=-1) # suppress warnings completely for the next run...
} else { # if not...
options( old.warn ) # reset warning state and let R throw what it likes...
}
# now run the test again...
ct <- stats::cor.test( x, y, method=method )
}
options( old.warn ) # reset warnings to original state
return( list( ct=ct, tp=tp) )
}
if( !two.inputs ) {
#### one matrix case ###
# drop categorical variables
classes <- sapply(x,"class") # get the classes
inds <- which( classes %in% c("integer","numeric")) # retain only int and num
n.vars <- dim(x)[2] # number of variables
n.cont <- length(inds) # number of continuous variables
# initialise the output list with empty matrices
R <- list( correlation = matrix( NA, n.vars, n.vars) )
rownames( R$correlation ) <- colnames( R$correlation ) <- colnames(x)
R$sample.size <- R$p.value <- R$correlation
R$args <- args
R$tiesProblem <- FALSE
# run pairwise tests (inefficient looping!)
for( a in 1:(n.cont-1) ){
for( b in (a+1):n.cont) {
i <- inds[a] # the a-th continuous variable
j <- inds[b] # the b-th continuous variable
# ct <- cor.test( x[,i], x[,j], method=corr.method )
cttp <- getCT( x[,i], x[,j], corr.method )
# store the output
R$tiesProblem <- R$tiesProblem | cttp$tp
R$correlation[j,i] <- R$correlation[i,j] <- cttp$ct$estimate
R$p.value[j,i] <- R$p.value[i,j] <- cttp$ct$p.value
}
}
# adjust p (inefficient duplication here)
upper.inds <- upper.tri(R$p.value) & !is.na(R$p.value) # cells containing p-values, upper
R$p.value[upper.inds] <- stats::p.adjust( R$p.value[upper.inds], method=p.adjust.method )
lower.inds <- lower.tri(R$p.value) & !is.na(R$p.value) # cells containing p-values, lower
R$p.value[lower.inds] <- stats::p.adjust( R$p.value[lower.inds], method=p.adjust.method )
# fill in sample sizes for individual variables
for( i in 1:n.vars ) {
R$sample.size[i,i] <- sum(!is.na(x[,i]))
}
# and pairwise sample sizes for all variables
for( i in 1:(n.vars-1) ) {
for( j in (i+1):n.vars) {
R$sample.size[j,i] <- R$sample.size[i,j] <- sum(!(is.na(x[,i]) | is.na(x[,j])))
}
}
} else {
#### two matrix case ###
# track only the categorical variables
inds.x <- which(sapply(x,"class") %in% c("integer","numeric"))
inds.y <- which(sapply(y,"class") %in% c("integer","numeric"))
# variable numbers
n.vars.x <- dim(x)[2] # number of variables in x
n.vars.y <- dim(y)[2] # number of variables in y
n.cont.x <- length(inds.x) # number of continuous variables in x
n.cont.y <- length(inds.y) # number of continuous variables in y
# initialise the output list
R <- list( correlation = matrix( NA, n.vars.x, n.vars.y) )
rownames( R$correlation ) <- colnames(x)
colnames( R$correlation ) <- colnames(y)
R$sample.size <- R$p.value <- R$correlation
R$args <- args
R$tiesProblem <- FALSE
# run pairwise tests (inefficient looping!)
for( a in 1:n.cont.x ){
for( b in 1:n.cont.y) {
i <- inds.x[a] # the a-th continuous variable in x
j <- inds.y[b] # the b-th continuous variable in y
#ct <- cor.test( x[,i], y[,j], method=corr.method ) # run the test
cttp <- getCT( x[,i], y[,j], corr.method )
# store the output
R$tiesProblem <- R$tiesProblem | cttp$tp
R$correlation[i,j] <- cttp$ct$estimate
R$p.value[i,j] <- cttp$ct$p.value
# store sample size
R$sample.size[i,j] <- sum(!(is.na(x[,i]) | is.na(y[,j])))
}
}
# adjust p
R$p.value <- matrix( stats::p.adjust( R$p.value, method=p.adjust.method ),
n.vars.x, n.vars.y,
dimnames=dimnames(R$p.value) )
}
# define an S3 class
class(R) <- c("correlate","list")
return(R)
}
# print method
#' Print method for correlate objects
#'
#' @param x An object of class 'correlate'
#' @param ... For consistency with the generic (unused)
#'
#' @return Invisibly returns the original object
#' @export
print.correlate <- function( x, ... ){
# fixed properties that should probably be converted to
# input arguments?
nDigits <- 3
naPrint <- "."
# function to force equal digit printing
makeTxt <- function(x, nDigits, naPrint ) {
n <- dim(x)
format <- paste0("%.",nDigits,"f")
txt <- sprintf( format, x)
txt <- gsub("NA", naPrint, txt, fixed=TRUE)
txt <- matrix(txt,n[1],n[2], dimnames=dimnames(x))
return(txt)
}
# function to print the text matrix
printTxt <- function( txt ) {
print.default( txt, quote=FALSE, right=TRUE)
}
# print the correlations
cat("\n")
cat("CORRELATIONS\n")
cat("============\n")
cat("- correlation type: ", x$arg["corr.method"], "\n")
cat("- correlations shown only when both variables are numeric\n")
cat("\n")
if( options()$show.signif.stars & x$arg["test"]==TRUE ) { # if significance stars needed...
# function to return the significance stars string
getSigString <- function(p) {
if( is.na(p) | p > .1 ) return( " " )
if( p > .05 ) return( ". " )
if( p > .01 ) return( "* " )
if( p > .001 ) return( "**" )
return( "***" )
}
# function to generate interleaved indices
interleave <- function(n){
ord <- vector()
ord[ seq(1,2*n-1,2) ] <- 1:n
ord[ seq(2,2*n,2)] <- (n+1):(2*n)
return(ord)
}
# print correlation matrix with sig stars
n <- dim( x$correlation )[2] # number of columns
txt <- makeTxt( x$correlation, nDigits, naPrint ) # text form of the correlation matrix
for( i in 1:n ) txt[,i] <- paste0(txt[,i], sapply(x$p.value[,i], getSigString)) # append stars
colnames(txt) <- paste0( colnames(x$correlation), rep.int(" ",n)) # column names
rownames(txt) <- rownames( x$correlation ) # row names
printTxt(txt)
# print the key
cat("\n---\n")
cat("Signif. codes: . = p < .1, * = p<.05, ** = p<.01, *** = p<.001\n")
} else { # if no significance stars needed...
txt <- makeTxt( x$correlation, nDigits, naPrint )
printTxt( txt )
}
if( x$arg["test"]==TRUE ) {
# print the p.values
cat("\n\np-VALUES\n")
cat("========\n")
if( x$args["two.inputs"] ) { nTests <- sum( !is.na( x$correlation))
} else { nTests <- sum( !is.na( x$correlation)) / 2 }
cat("- total number of tests run: ", nTests, "\n")
cat("- correction for multiple testing: ", x$arg["p.adjust.method"],"\n")
if( x$tiesProblem ) {
cat("- WARNING: cannot compute exact p-values with ties\n" )
}
cat("\n")
# print p-values, forcing nDigits
txt <- makeTxt( x$p.value, nDigits, naPrint )
printTxt( txt )
# print the sample sizes
cat("\n\nSAMPLE SIZES\n")
cat("============\n")
cat("\n")
txt <- makeTxt( x$sample.size, nDigits=0, naPrint )
printTxt( txt )
}
# invisbly return the input
return( invisible(x) )
}