forked from neurodebian/afni_removeme_eventually
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1dCorrelate.c
486 lines (399 loc) · 19.2 KB
/
1dCorrelate.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
#include "mrilib.h"
#include "zgaussian.c"
#define NCOR 6
static char *cor_name[NCOR] = { "Pearson" , "Spearman" , "Quadrant" , "K-Tau-b" , "TicTacToe" , "Quantile:9" } ;
/* list of functions to compute correlations [cf. thd_correlate.c] */
typedef float (*ccfun)(int,float *,float *) ;
static ccfun cor_func[NCOR] =
{ THD_pearson_corr, THD_spearman_corr, THD_quadrant_corr, THD_ktaub_corr, THD_tictactoe_corr , THD_quantile_corr } ;
/* prototype for function to bootstrap the correlations */
static float_quad Corrboot( int n , float *x , float *y , float xsig , float ysig ,
float (*cfun)(int,float *,float *) ) ;
/* prototype for function to get normal theory confidence intervals */
static float_pair PCorrCI( int npt , float cor , float alpha ) ;
/* default number of bootstrap repetitions */
#undef NBOOT
#define NBOOT 4000
#undef NBMIN
#define NBMIN 400
static int nboot = NBOOT ; /* changed by the -nboot option */
static float alpha = 0.05f ; /* changed by the -alpha option */
static int doblk = 0 ; /* changed by the -block option */
/*----------------------------------------------------------------------------*/
void usage_1dCorrelate( int detail ){
printf(
"Usage: 1dCorrelate [options] 1Dfile 1Dfile ...\n"
"------\n"
" * Each input 1D column is a collection of data points.\n"
" * The correlation coefficient between each column pair is computed, along\n"
" with its confidence interval (via a bias-corrected bootstrap procedure).\n"
" * The minimum sensible column length is 7.\n"
" * At least 2 columns are needed [in 1 or more .1D files].\n"
" * If there are N input columns, there will be N*(N-1)/2 output rows.\n"
" * Output appears on stdout; redirect ('>' or '>>') as needed.\n"
" * Only one correlation method can be used in one run of this program.\n"
" * This program is basically the basterd offspring of program 1ddot.\n"
" * Also see http://en.wikipedia.org/wiki/Confidence_interval\n"
"\n"
"-------\n"
"Methods [actually, only the first letter is needed to choose a method]\n"
"------- [and the case doesn't matter: '-P' and '-p' both = '-Pearson']\n"
"\n"
" -Pearson = Pearson correlation [the default method]\n"
" -Spearman = Spearman (rank) correlation [more robust vs. outliers]\n"
" -Quadrant = Quadrant (binarized) correlation [most robust, but weaker]\n"
" -Ktaub = Kendall's tau_b 'correlation' [popular somewhere, maybe]\n"
"\n"
"-------------\n"
"Other Options [these options cannot be abbreviated!]\n"
"-------------\n"
"\n"
" -nboot B = Set the number of bootstrap replicates to 'B'.\n"
" * The default value of B is %d.\n"
" * A larger number will give somewhat more accurate\n"
" confidence intervals, at the cost of more CPU time.\n"
"\n"
" -alpha A = Set the 2-sided confidence interval width to '100-A' percent.\n"
" * The default value of A is 5, giving the 2.5..97.5%% interval.\n"
" * The smallest allowed A is 1 (0.5%%..99.5%%) and the largest\n"
" allowed value of A is 20 (10%%..90%%).\n"
" * If you are interested assessing if the 'p-value' of a\n"
" correlation is smaller than 5%% (say), then you should use\n"
" '-alpha 10' and see if the confidence interval includes 0.\n"
"\n"
" -block = Attempt to allow for serial correlation in the data by doing\n"
" *OR* variable-length block resampling, rather than completely\n"
" -blk random resampling as in the usual bootstrap.\n"
" * You should NOT do this unless you believe that serial\n"
" correlation (along each column) is present and significant.\n"
" * Block resampling requires at least 20 data points in each\n"
" input column. Fewer than 20 will turn off this option.\n"
#if 0
"\n"
" -vsig ss = With this option, you specify that each individual measurement\n"
" variable has a nonzero standard deviation ('sig') -- that data\n"
" might come from some calibration studies, for example. In the\n"
" bootstrap analysis, the random replicates are perturbed by\n"
" independent zero-mean normal deviates with standard deviations\n"
" given in the file 'ss' -- this file should have 1 row, and that\n"
" row should have 1 non-negative number for every column entered\n"
" into the correlation analysis.\n"
" * In the case of the Pearson coefficient, the 'N:' normal theory\n"
" results do NOT include the effect of this extra randomness.\n"
#endif
"-----\n"
"Notes\n"
"-----\n"
"* For each pair of columns, the output include the correlation value\n"
" as directly calculated, plus the bias-corrected bootstrap value, and\n"
" the desired (100-A)%% confidence interval [also via bootstrap].\n"
"\n"
"* The primary purpose of this program is to provide an easy way to get\n"
" the bootstrap confidence intervals, since people almost always seem to use\n"
" the asymptotic normal theory to decide if a correlation is 'significant',\n"
" and this often seems misleading to me [especially for short columns].\n"
"\n"
"* Bootstrapping confidence intervals for the inverse correlations matrix\n"
" (i.e., partial correlations) would be interesting -- anyone out there\n"
" need this ability?\n"
"\n"
"-------------\n"
"Sample output [command was '1dCorrelate -alpha 10 A2.1D B2.1D']\n"
"-------------\n"
"# Pearson correlation [n=12 #col=2]\n"
"# Name Name Value BiasCorr 5.00%% 95.00%% N: 5.00%% N:95.00%%\n"
"# -------- -------- -------- -------- -------- -------- -------- --------\n"
" A2.1D[0] B2.1D[0] +0.57254 +0.57225 -0.03826 +0.86306 +0.10265 +0.83353\n"
"\n"
"* Bias correction of the correlation had little effect; this is very common.\n"
" ++ To be clear, the bootstrap bias correction is to allow for potential bias\n"
" in the statistical estimate of correlation when the sample size is small.\n"
" ++ It cannot correct for biases that result from faulty data (or faulty\n"
" assumptions about the data).\n"
"\n"
"* The correlation is NOT significant at this level, since the CI (confidence\n"
" interval) includes 0 in its range.\n"
"\n"
"* For the Pearson method ONLY, the last two columns ('N:', as above) also\n"
" show the widely used asymptotic normal theory confidence interval. As in\n"
" the example, the bootstrap interval is often (but not always) wider than\n"
" the theoretical interval.\n"
"\n"
"* In the example, the normal theory might indicate that the correlation is\n"
" significant (less than a 5%% chance that the CI includes 0), but the\n"
" bootstrap CI shows that is not a reasonable statistical conclusion.\n"
" ++ The principal reason that I wrote this program was to make it easy\n"
" to check if the normal (Gaussian) theory for correlation significance\n"
" testing is reasonable in any given case -- for small samples, it often\n"
" is NOT reasonable!\n"
"\n"
"* Using the same data with the '-S' option gives the table below, again\n"
" indicating that there is no significant correlation between the columns\n"
" (note also the lack of the 'N:' results for Spearman correlation):\n"
"\n"
"# Spearman correlation [n=12 #col=2]\n"
"# Name Name Value BiasCorr 5.00%% 95.00%%\n"
"# -------- -------- -------- -------- -------- --------\n"
" A2.1D[0] B2.1D[0] +0.46154 +0.42756 -0.23063 +0.86078\n"
"\n"
"*** Written by RWCox (AKA Zhark the Mad Correlator) -- 19 May 2011 ***\n"
, NBOOT ) ;
PRINT_COMPILE_DATE ;
}
/*============================================================================*/
int main( int argc , char *argv[] )
{
int iarg , ii,jj,kk,mm , nvec , nx=0,ny , ff , vlen=4 ;
MRI_IMAGE *tim , *vsim=NULL ;
MRI_IMARR *tar ;
char **vecnam , *tnam ;
float *far , **tvec , *vsig=NULL , xsig,ysig ;
float_quad qcor ; float_pair pci ; float corst, cor025, cor500, cor975 ;
char fmt[256] ;
int cormeth=0 ; /* 0=Pearson, 1=Spearman, 2=Quadrant, 3=Kendall tau_b */
float (*corfun)(int,float *,float *) ;
/*-- start the AFNI machinery --*/
mainENTRY("1dCorrelate main") ; machdep() ;
/* check for options */
iarg = 1 ; nvec = 0 ;
while( iarg < argc && argv[iarg][0] == '-' ){
/* I get by with a little help from my friends? */
if( strcmp(argv[iarg],"-help") == 0 || strcmp(argv[iarg],"-h") == 0){
usage_1dCorrelate(strlen(argv[iarg])>3 ? 2:1);
exit(0) ;
}
/*--- methods ---*/
if( toupper(argv[iarg][1]) == 'P' ){ cormeth = 0 ; iarg++ ; continue ; }
if( toupper(argv[iarg][1]) == 'S' ){ cormeth = 1 ; iarg++ ; continue ; }
if( toupper(argv[iarg][1]) == 'Q' ){ cormeth = 2 ; iarg++ ; continue ; }
if( toupper(argv[iarg][1]) == 'K' ){ cormeth = 3 ; iarg++ ; continue ; }
if( toupper(argv[iarg][1]) == 'T' ){ cormeth = 4 ; iarg++ ; continue ; }
if( toupper(argv[iarg][1]) == 'U' ){ cormeth = 5 ; iarg++ ; continue ; }
/*--- set nboot ---*/
if( strcasecmp(argv[iarg],"-nboot") == 0 || strcasecmp(argv[iarg],"-num") == 0 ){
iarg++ ; if( iarg >= argc ) ERROR_exit("Need argument after '-nboot'") ;
nboot = (int)strtod(argv[iarg],NULL) ;
if( nboot < NBMIN ){
WARNING_message("Replacing -nboot %d with %d",nboot,NBMIN) ;
nboot = NBMIN ;
}
iarg++ ; continue ;
}
/*--- set alpha ---*/
if( strcasecmp(argv[iarg],"-alpha") == 0 ){
iarg++ ; if( iarg >= argc ) ERROR_exit("Need argument after '-alpha'") ;
alpha = (float)strtod(argv[iarg],NULL) ;
if( alpha < 1.0f ){
WARNING_message("Replacing -alpha %.1f with 1",alpha) ;
alpha = 0.01f ;
} else if( alpha > 20.0f ){
WARNING_message("Replacing -alpha %.1f with 20",alpha) ;
alpha = 0.20f ;
} else {
alpha *= 0.01f ; /* convert from percent to fraction */
}
iarg++ ; continue ;
}
/*--- block resampling ---*/
if( strcasecmp(argv[iarg],"-blk") == 0 || strcasecmp(argv[iarg],"-block") == 0 ){
doblk = 1 ; iarg++ ; continue ;
}
if( strcasecmp(argv[iarg],"-vsig") == 0 ){
if( vsim != NULL ) ERROR_exit("Can't use -vsig twice!") ;
if( ++iarg >= argc ) ERROR_exit("Need argument after -vsig") ;
vsim = mri_read_1D(argv[iarg]) ;
if( vsim == NULL ) ERROR_exit("Can't read -vsig file '%s'",argv[iarg]) ;
iarg++ ; continue ;
}
/*--- user should be flogged ---*/
ERROR_message("Monstrously illegal option '%s'",argv[iarg]) ;
suggest_best_prog_option(argv[0], argv[iarg]);
exit(1);
}
/*--- user should be flogged twice ---*/
if( argc < 2 ){
usage_1dCorrelate(1) ; exit(0) ;
}
if( iarg == argc )
ERROR_exit("No 1D files on command line!?\n") ;
/* the function to compute the correlation */
corfun = cor_func[cormeth] ;
/* check and assemble list of input 1D files */
ff = iarg ;
INIT_IMARR(tar) ;
for( ; iarg < argc ; iarg++ ){
tim = mri_read_1D( argv[iarg] ) ;
if( tim == NULL ) ERROR_exit("Can't read 1D file '%s'",argv[iarg]) ;
if( nx == 0 ){
nx = tim->nx ;
if( nx < 3 )
ERROR_exit("1D file '%.77s' length=%d is less than 3",argv[iarg],nx) ;
else if( nx < 7 )
WARNING_message("1D file '%.77s' length=%d is less than 7",argv[iarg],nx) ;
} else if( tim->nx != nx ){
ERROR_exit("Length of 1D file '%.77s' [%d] doesn't match first file [%d]",
argv[iarg] , tim->nx , nx );
}
nvec += tim->ny ;
ADDTO_IMARR(tar,tim) ;
}
/* user is really an idiot -- flogging's too good for him */
if( nvec < 2 ) ERROR_exit("Must have at least 2 input columns!") ;
if( nx < 20 && doblk ){
doblk = 0 ;
WARNING_message("Column length %d < 20 ==> cannot use block resampling",nx) ;
}
if( vsim != NULL ){
if( vsim->nvox < nvec )
ERROR_exit("-vsig file only has %d entries, but needs at least %d",vsim->nvox,nvec) ;
vsig = MRI_FLOAT_PTR(vsim) ;
}
/* create vectors from 1D files */
tvec = (float **)malloc( sizeof(float *)*nvec ) ;
vecnam = (char **)malloc( sizeof(char *)*nvec ) ;
for( jj=0 ; jj < nvec ; jj++ ){
tvec[jj] = (float *)malloc( sizeof(float)*nx ) ;
vecnam[jj] = (char *)malloc(sizeof(char)*THD_MAX_NAME) ;
}
/* copy data into new space, create output labels, check for stoopiditees */
for( kk=mm=0 ; mm < IMARR_COUNT(tar) ; mm++ ){
tim = IMARR_SUBIM(tar,mm) ;
far = MRI_FLOAT_PTR(tim) ;
tnam = tim->name ; if( tnam == NULL ) tnam = "File" ;
for( jj=0 ; jj < tim->ny ; jj++,kk++ ){
for( ii=0 ; ii < nx ; ii++ ) tvec[kk][ii] = far[ii+jj*nx] ;
sprintf(vecnam[kk],"%s[%d]",THD_trailname(tnam,0),jj) ; /* vector name */
iarg = strlen(vecnam[kk]) ; vlen = MAX(vlen,iarg) ;
if( THD_is_constant(nx,tvec[kk]) )
ERROR_exit("Column %s is constant!",vecnam[kk]) ;
}
}
DESTROY_IMARR(tar) ;
/*--- Print a beeyootiful header ---*/
printf("# %s correlation [n=%d #col=%d]\n",cor_name[cormeth],nx,nvec) ;
sprintf(fmt,"# %%-%ds %%-%ds",vlen,vlen) ;
printf(fmt,"Name","Name") ;
printf(" Value BiasCorr %5.2f%% %5.2f%%",50.0f*alpha,100.0f-50.0f*alpha) ;
if( cormeth == 0 ) /* Pearson */
printf(" N:%5.2f%% N:%5.2f%%",50.0f*alpha,100.0f-50.0f*alpha) ;
printf("\n") ;
printf("# ") ;
for( ii=0 ; ii < vlen ; ii++ ) printf("-") ;
printf(" ") ;
for( ii=0 ; ii < vlen ; ii++ ) printf("-") ;
printf(" ") ;
printf(" --------") ;
printf(" --------") ;
printf(" --------") ;
printf(" --------") ;
if( cormeth == 0 ){ printf(" --------") ; printf(" --------") ; }
printf("\n") ;
if( cormeth != 0 ) /* non-Pearson */
sprintf(fmt," %%-%ds %%-%ds %%+8.5f %%+8.5f %%+8.5f %%+8.5f\n",vlen,vlen) ;
else /* Pearson */
sprintf(fmt," %%-%ds %%-%ds %%+8.5f %%+8.5f %%+8.5f %%+8.5f %%+8.5f %%+8.5f\n",vlen,vlen) ;
/*--- Do some actual work for a suprising change ---*/
for( jj=0 ; jj < nvec ; jj++ ){ /* loops over column pairs */
for( kk=jj+1 ; kk < nvec ; kk++ ){
if( vsig != NULL ){ xsig = vsig[jj]; ysig = vsig[kk]; } else { xsig = ysig = 0.0f; }
qcor = Corrboot( nx, tvec[jj], tvec[kk], xsig, ysig, corfun ) ; /* outsourced */
corst = qcor.a ; cor025 = qcor.b ; cor500 = qcor.c ; cor975 = qcor.d ;
if( cormeth == 0 ){ /* Pearson */
pci = PCorrCI( nx , corst , alpha ) ;
printf(fmt, vecnam[jj], vecnam[kk], corst, cor500, cor025, cor975, pci.a,pci.b ) ;
} else { /* all other methods */
printf(fmt, vecnam[jj], vecnam[kk], corst, cor500, cor025, cor975 ) ;
}
}
}
/* Finished -- go back to watching Star Trek reruns -- Tribbles ahoy, Cap'n! */
exit(0) ;
}
/*----------------------------------------------------------------------------*/
/* Correlate 2 vectors of length n, using the function 'cfun'.
Return values are in a float_quad with four elements:
.a = standard correlation estimate
.b = lower edge (alpha/2) of bias-corrected bootstrap confidence interval
.c = middle (50%) of bootstrap confidence interval
.d = upper edge (1-alpha/2) of bootstrap confidence interval
*//*--------------------------------------------------------------------------*/
static float_quad Corrboot( int n , float *x , float *y ,
float xsig , float ysig ,
float (*cfun)(int,float *,float *) )
{
float *xar , *yar , *cbb , corst ;
float_quad res = {0.0f,-1.0f,0.0f,1.0f} ;
float_triple bci ;
int ii,jj,kk , nn=n ;
ENTRY("Corrboot") ;
if( nn < 3 || x == NULL || y == NULL || cfun == NULL ) RETURN(res) ;
if( xsig < 0.0f ) xsig = 0.0f ;
if( ysig < 0.0f ) ysig = 0.0f ;
/* workspaces */
xar = (float *)malloc(sizeof(float)*nn) ; /* resampled x vector */
yar = (float *)malloc(sizeof(float)*nn) ; /* resampled y vector */
cbb = (float *)malloc(sizeof(float)*nboot) ; /* saved correlations */
/* compute the un-resampled result */
for( ii=0 ; ii < nn ; ii++ ){ xar[ii] = x[ii] ; yar[ii] = y[ii] ; }
corst = cfun(nn,xar,yar) ; /* standard result */
/* compute bootstrap results */
for( kk=0 ; kk < nboot ; kk++ ){
if( !doblk ){ /* simple resampling */
for( ii=0 ; ii < nn ; ii++ ){
jj = lrand48() % nn ; xar[ii] = x[jj] ; yar[ii] = y[jj] ;
}
} else { /* block resampling */
int jold = lrand48() % nn ;
xar[0] = x[jold] ; yar[0] = y[jold] ;
for( ii=1 ; ii < nn ; ii++ ){
if( lrand48()%8 == 0 ){ /* 12.5% chance of random jump */
jj = lrand48() % nn ;
} else {
jj = jold+1 ; if( jj == nn ) jj = 0 ;
}
xar[ii] = x[jj] ; yar[ii] = y[jj] ; jold = jj ;
}
}
if( xsig > 0.0f || ysig > 0.0f ){
for( ii=0; ii < nn; ii++ ){
xar[ii] += zgaussian()*xsig; yar[ii] += zgaussian()*ysig;
}
}
cbb[kk] = cfun(nn,xar,yar) ; /* bootstrap result */
}
/* get the actual bias-corrected results [cf. thd_correlate.c] */
bci = THD_bootstrap_confinv( corst , alpha , nboot , cbb ) ;
/* empty the dustbin */
free(cbb) ; free(yar) ; free(xar) ;
/* this is bad */
if( bci.a == 0.0f && bci.b == 0.0f && bci.c == 0.0f ) RETURN(res) ;
/* this is good */
res.a = corst ; res.b = bci.a ; res.c = bci.b ; res.d = bci.c ;
RETURN(res) ;
}
/*----------------------------------------------------------------------------*/
/* Utility functions for PCorrCI (infra). */
#undef MYatanh
#define MYatanh(x) ( ((x)<-0.999329f) ? -4.0f \
:((x)>+0.999329f) ? +4.0f : atanhf(x) )
static float MYtanh( float x )
{
register float ex , exi ;
if( x > 7.0f ) return 1.0f ; /* check for stupid inputs */
else if( x < -7.0f ) return -1.0f ;
ex = exp(x) ; exi = 1.0f/ex ;
return (ex-exi)/(ex+exi) ;
}
/*----------------------------------------------------------------------------*/
/* Get the bivariate normal theory confidence interval for Pearson */
static float_pair PCorrCI( int npt , float cor , float alph )
{
float_pair ci = {-1.0f,1.0f} ;
float zc , zb,zt , dz ;
if( npt < 4 || cor <= -1.0f || cor >= 1.0f ) return ci ; /* bad inputs */
zc = MYatanh(cor) ; /* Piscatorial transformation */
dz = qginv(0.5*alph) / sqrt(npt-3.0) ; /* cf. mri_stats.c for qginv() */
ci.a = MYtanh(zc-dz) ; /* lower bound */
ci.b = MYtanh(zc+dz) ; /* upper bound */
return ci ;
}