-
Notifications
You must be signed in to change notification settings - Fork 96
/
bivariate.rb
389 lines (365 loc) · 12.5 KB
/
bivariate.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
require 'statsample/bivariate/pearson'
module Statsample
# Diverse methods and classes to calculate bivariate relations
# Specific classes:
# * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r)
# * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
# * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series)
module Bivariate
autoload(:Polychoric, 'statsample/bivariate/polychoric')
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
class << self
# Covariance between two vectors
def covariance(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
return nil if v1a.size==0
if Statsample.has_gsl?
GSL::Stats::covariance(v1a.gsl, v2a.gsl)
else
covariance_slow(v1a,v2a)
end
end
# Estimate the ML between two dichotomic vectors
def maximum_likehood_dichotomic(pred,real)
preda,reala=Statsample.only_valid_clone(pred,real)
sum=0
preda.each_index{|i|
sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
}
sum
end
def covariance_slow(v1,v2) # :nodoc:
v1a,v2a=Statsample.only_valid(v1,v2)
sum_of_squares(v1a,v2a) / (v1a.size-1)
end
def sum_of_squares(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
m1=v1a.mean
m2=v2a.mean
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
end
# Calculate Pearson correlation coefficient (r) between 2 vectors
def pearson(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
return nil if v1a.size ==0
if Statsample.has_gsl?
GSL::Stats::correlation(v1a.gsl, v2a.gsl)
else
pearson_slow(v1a,v2a)
end
end
def pearson_slow(v1,v2) # :nodoc:
v1a,v2a=Statsample.only_valid_clone(v1,v2)
# Calculate sum of squares
ss=sum_of_squares(v1a,v2a)
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
end
alias :correlation :pearson
# Retrieves the value for t test for a pearson correlation
# between two vectors to test the null hipothesis of r=0
def t_pearson(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
r=pearson(v1a,v2a)
if(r==1.0)
0
else
t_r(r,v1a.size)
end
end
# Retrieves the value for t test for a pearson correlation
# giving r and vector size
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
def t_r(r,size)
r * Math::sqrt(((size)-2).to_f / (1 - r**2))
end
# Retrieves the probability value (a la SPSS)
# for a given t, size and number of tails.
# Uses a second parameter
# * :both or 2 : for r!=0 (default)
# * :right, :positive or 1 : for r > 0
# * :left, :negative : for r < 0
def prop_pearson(t, size, tails=:both)
tails=:both if tails==2
tails=:right if tails==1 or tails==:positive
tails=:left if tails==:negative
n_tails=case tails
when :both then 2
else 1
end
t=-t if t>0 and (tails==:both)
cdf=Distribution::T.cdf(t, size-2)
if(tails==:right)
1.0-(cdf*n_tails)
else
cdf*n_tails
end
end
# Predicted time for pairwise correlation matrix, in miliseconds
# See benchmarks/correlation_matrix.rb to see mode of calculation
def prediction_pairwise(vars,cases)
((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100
end
# Predicted time for optimized correlation matrix, in miliseconds
# See benchmarks/correlation_matrix.rb to see mode of calculation
def prediction_optimized(vars,cases)
((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100
end
# Returns residual score after delete variance
# from another variable
#
def residuals(from,del)
r=Statsample::Bivariate.pearson(from,del)
froms, dels = from.vector_standarized, del.vector_standarized
nv=[]
froms.data_with_nils.each_index do |i|
if froms[i].nil? or dels[i].nil?
nv.push(nil)
else
nv.push(froms[i]-r*dels[i])
end
end
nv.to_vector(:scale)
end
# Correlation between v1 and v2, controling the effect of
# control on both.
def partial_correlation(v1,v2,control)
v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control)
rv1v2=pearson(v1a,v2a)
rv1con=pearson(v1a,cona)
rv2con=pearson(v2a,cona)
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
end
def covariance_matrix_optimized(ds)
x=ds.to_gsl
n=x.row_size
m=x.column_size
means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
ss=centered.transpose*centered
s=((1/(n-1).to_f))*ss
s
end
# Covariance matrix.
# Order of rows and columns depends on Dataset#fields order
def covariance_matrix(ds)
vars,cases=ds.fields.size,ds.cases
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=covariance_matrix_optimized(ds)
else
cm=covariance_matrix_pairwise(ds)
end
cm.extend(Statsample::CovariateMatrix)
cm.fields=ds.fields
cm
end
def covariance_matrix_pairwise(ds)
cache={}
matrix=ds.collect_matrix do |row,col|
if (ds[row].type!=:scale or ds[col].type!=:scale)
nil
elsif row==col
ds[row].variance
else
if cache[[col,row]].nil?
cov=covariance(ds[row],ds[col])
cache[[row,col]]=cov
cov
else
cache[[col,row]]
end
end
end
matrix
end
# Correlation matrix.
# Order of rows and columns depends on Dataset#fields order
def correlation_matrix(ds)
vars,cases=ds.fields.size,ds.cases
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=correlation_matrix_optimized(ds)
else
cm=correlation_matrix_pairwise(ds)
end
cm.extend(Statsample::CovariateMatrix)
cm.fields=ds.fields
cm
end
def correlation_matrix_optimized(ds)
s=covariance_matrix_optimized(ds)
sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
cm=sds*s*sds
# Fix diagonal
s.row_size.times {|i|
cm[i,i]=1.0
}
cm
end
def correlation_matrix_pairwise(ds)
cache={}
cm=ds.collect_matrix do |row,col|
if row==col
1.0
elsif (ds[row].type!=:scale or ds[col].type!=:scale)
nil
else
if cache[[col,row]].nil?
r=pearson(ds[row],ds[col])
cache[[row,col]]=r
r
else
cache[[col,row]]
end
end
end
end
# Retrieves the n valid pairwise.
def n_valid_matrix(ds)
ds.collect_matrix do |row,col|
if row==col
ds[row].valid_data.size
else
rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
rowa.size
end
end
end
# Matrix of correlation probabilities.
# Order of rows and columns depends on Dataset#fields order
def correlation_probability_matrix(ds, tails=:both)
rows=ds.fields.collect do |row|
ds.fields.collect do |col|
v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
(row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
end
end
Matrix.rows(rows)
end
# Spearman ranked correlation coefficient (rho) between 2 vectors
def spearman(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
pearson(v1r,v2r)
end
# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1"
def point_biserial(dichotomous,continous)
ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
f0=ds['d'].factors.sort[0]
m0=ds.filter_field('c') {|c| c['d']==f0}
m1=ds.filter_field('c') {|c| c['d']!=f0}
((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
end
# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Adbi article
def tau_a(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
n=v1.size
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
o1=ordered_pairs(v1r)
o2=ordered_pairs(v2r)
delta= o1.size*2-(o2 & o1).size*2
1-(delta * 2 / (n*(n-1)).to_f)
end
# Calculates Goodman and Kruskal’s Tau b correlation.
# Tb is an asymmetric P-R-E measure of association for nominal scales
# (Mielke, X)
#
# Tau-b defines perfect association as strict monotonicity. Although it
# requires strict monotonicity to reach 1.0, it does not penalize ties as
# much as some other measures.
# == Reference
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def tau_b(matrix)
v=pairs(matrix)
((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
end
# Calculates Goodman and Kruskal's gamma.
#
# Gamma is the surplus of concordant pairs over discordant pairs, as a
# percentage of all pairs ignoring ties.
#
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def gamma(matrix)
v=pairs(matrix)
(v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
end
# Calculate indexes for a matrix the rows and cols has to be ordered
def pairs(matrix)
# calculate concordant #p matrix
rs=matrix.row_size
cs=matrix.column_size
conc=disc=ties_x=ties_y=0
(0...(rs-1)).each do |x|
(0...(cs-1)).each do |y|
((x+1)...rs).each do |x2|
((y+1)...cs).each do |y2|
# #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
conc+=matrix[x,y]*matrix[x2,y2]
end
end
end
end
(0...(rs-1)).each {|x|
(1...(cs)).each{|y|
((x+1)...rs).each{|x2|
(0...y).each{|y2|
# #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
disc+=matrix[x,y]*matrix[x2,y2]
}
}
}
}
(0...(rs-1)).each {|x|
(0...(cs)).each{|y|
((x+1)...(rs)).each{|x2|
ties_x+=matrix[x,y]*matrix[x2,y]
}
}
}
(0...rs).each {|x|
(0...(cs-1)).each{|y|
((y+1)...(cs)).each{|y2|
ties_y+=matrix[x,y]*matrix[x,y2]
}
}
}
{'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
end
def ordered_pairs(vector)
d=vector.data
a=[]
(0...(d.size-1)).each{|i|
((i+1)...(d.size)).each {|j|
a.push([d[i],d[j]])
}
}
a
end
=begin
def sum_of_codeviated(v1,v2)
v1a,v2a=Statsample.only_valid(v1,v2)
sum=0
(0...v1a.size).each{|i|
sum+=v1a[i]*v2a[i]
}
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
end
=end
# Report the minimum number of cases valid of a covariate matrix
# based on a dataset
def min_n_valid(ds)
min=ds.cases
m=n_valid_matrix(ds)
for x in 0...m.row_size
for y in 0...m.column_size
min=m[x,y] if m[x,y] < min
end
end
min
end
end
end
end