Permalink
Browse files

Correlation matrix optimized. Better specs

* Created benchmarks directory
* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
* Moved tests fixtures from data to test/fixtures
  • Loading branch information...
1 parent 6c17e08 commit bccd7be14d181cb0ffed9be65b87f3a0c4138291 @clbustos committed Jan 18, 2011
View
@@ -42,7 +42,7 @@ h=Hoe.spec('statsample') do
self.developer('Claudio Bustos', 'clbustos@gmail.com')
self.extra_deps << ["spreadsheet","~>0.6.5"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client", "~>0.2.5"] << ["rubyvis", "~>0.4.0"]
- self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"] << ["rserve-client", "~>0"]
+ self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"] << ["rserve-client", "~>0"] << ["gettext", "~>0"]
self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
self.post_install_message = <<-EOF
***************************************************
Binary file not shown.
@@ -0,0 +1,91 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
+<title>Correlation matrix analysis</title>
+ <style>
+ body {
+ margin:0;
+ padding:1em;
+ }
+ table {
+ border-collapse: collapse;
+
+ }
+ table td {
+ border: 1px solid black;
+ }
+ .section {
+ margin:0.5em;
+ }
+ </style>
+
+</head><body>
+<h1>Correlation matrix analysis</h1><div id='toc'><div class='title'>List of contents</div>
+<ul>
+<li><a href='#toc_1'>Multiple Regression: cases,vars,time_optimized over time_optimized</a></li>
+<ul>
+<li><a href='#toc_2'>ANOVA</a></li>
+</ul>
+<li><a href='#toc_3'>Multiple Regression: cases,vars,time_pairwise over time_pairwise</a></li>
+<ul>
+<li><a href='#toc_4'>ANOVA</a></li>
+</ul>
+</ul>
+</div>
+<div class='tot'><div class='title'>List of tables</div><ul><li><a href='#table_1'>ANOVA Table</a></li><li><a href='#table_2'>Beta coefficients</a></li><li><a href='#table_3'>ANOVA Table</a></li><li><a href='#table_4'>Beta coefficients</a></li></ul></div>
+ <div class='section'><h2>Multiple Regression: cases,vars,time_optimized over time_optimized</h2><a name='toc_1'></a>
+ <p>Engine: Statsample::Regression::Multiple::GslEngine</p>
+ <p>Cases(listwise)=63(63)</p>
+ <p>R=0.917</p>
+ <p>R^2=0.840</p>
+ <p>R^2 Adj=0.835</p>
+ <p>Std.Error R=4.889</p>
+ <p>Equation=0.770 + 0.031cases + 0.555vars</p>
+ <div class='section'><h3>ANOVA</h3><a name='toc_2'></a>
+ <a name='table_1'></a><table><caption>ANOVA Table</caption><thead><th>source</th><th>ss</th><th>df</th><th>ms</th><th>f</th><th>p</th></thead>
+<tbody>
+<tr><td>Regression</td><td>7534.647</td><td>2</td><td>3767.324</td><td>157.636</td><td>0.000</td></tr>
+<tr><td>Error</td><td>1433.932</td><td>60</td><td>23.899</td><td></td><td></td></tr>
+<tr><td>Total</td><td>8968.579</td><td>62</td><td>3791.222</td><td></td><td></td></tr>
+</tbody>
+</table>
+
+ </div>
+ <a name='table_2'></a><table><caption>Beta coefficients</caption><thead><th>coeff</th><th>b</th><th>beta</th><th>se</th><th>t</th></thead>
+<tbody>
+<tr><td>Constant</td><td>0.770</td><td>-</td><td>1.013</td><td>0.760</td></tr>
+<tr><td>cases</td><td>0.031</td><td>0.795</td><td>0.002</td><td>15.410</td></tr>
+<tr><td>vars</td><td>0.555</td><td>0.455</td><td>0.063</td><td>8.820</td></tr>
+</tbody>
+</table>
+
+ </div>
+ <div class='section'><h2>Multiple Regression: cases,vars,time_pairwise over time_pairwise</h2><a name='toc_3'></a>
+ <p>Engine: Statsample::Regression::Multiple::GslEngine</p>
+ <p>Cases(listwise)=63(63)</p>
+ <p>R=0.987</p>
+ <p>R^2=0.974</p>
+ <p>R^2 Adj=0.973</p>
+ <p>Std.Error R=2.308</p>
+ <p>Equation=-2.221 + 0.007cases + 1.390vars</p>
+ <div class='section'><h3>ANOVA</h3><a name='toc_4'></a>
+ <a name='table_3'></a><table><caption>ANOVA Table</caption><thead><th>source</th><th>ss</th><th>df</th><th>ms</th><th>f</th><th>p</th></thead>
+<tbody>
+<tr><td>Regression</td><td>11990.075</td><td>2</td><td>5995.038</td><td>1125.103</td><td>0.000</td></tr>
+<tr><td>Error</td><td>319.706</td><td>60</td><td>5.328</td><td></td><td></td></tr>
+<tr><td>Total</td><td>12309.781</td><td>62</td><td>6000.366</td><td></td><td></td></tr>
+</tbody>
+</table>
+
+ </div>
+ <a name='table_4'></a><table><caption>Beta coefficients</caption><thead><th>coeff</th><th>b</th><th>beta</th><th>se</th><th>t</th></thead>
+<tbody>
+<tr><td>Constant</td><td>-2.221</td><td>-</td><td>0.478</td><td>-4.643</td></tr>
+<tr><td>cases</td><td>0.007</td><td>0.158</td><td>0.001</td><td>7.572</td></tr>
+<tr><td>vars</td><td>1.390</td><td>0.974</td><td>0.030</td><td>46.828</td></tr>
+</tbody>
+</table>
+
+ </div>
+</body></html>
@@ -0,0 +1,70 @@
+# This test create a database to adjust the best algorithm
+# to use on correlation matrix
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+require 'statsample'
+require 'benchmark'
+
+def create_dataset(vars,cases)
+ ran=Distribution::Normal.rng_ugaussian
+ ds=vars.times.inject({}) {|ac,v|
+ ac["x#{v}"]=Statsample::Vector.new_scale(cases) {ran.call}
+ ac
+ }.to_dataset
+end
+
+def prediction_pairwise(vars,cases)
+ ((-2.192+0.007*cases+1.392*vars)**2) / 1000.0
+end
+def prediction_optimized(vars,cases)
+ ((0.897+0.030*cases+0.515*vars)**2) / 1000.0
+end
+
+
+if File.mtime(__FILE__)>File.mtime("results.ds")
+
+ reps=100 #number of repetitions
+
+
+ ds_sizes=[5,10,30,50,100,150,200,500,1000]
+ ds_vars=[2,3,4,5,10,20,30]
+ rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
+ ds_sizes.each do |cases|
+ ds_vars.each do |vars|
+ ds=create_dataset(vars,cases)
+ time_optimized= Benchmark.realtime do
+ reps.times {
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+ ds.clear_gsl
+ }
+ end
+
+ time_pairwise= Benchmark.realtime do
+ reps.times {
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+ }
+ end
+
+ puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
+
+ rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
+ end
+ end
+
+ rs.update_valid_data
+ rs.save("results.ds")
+ Statsample::Excel.write(rs,"correlation_matrix.xls")
+else
+ rs=Statsample.load("results.ds")
+end
+
+
+rs.fields.each {|f| rs[f].type=:scale}
+
+
+rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
+
+rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized']],'time_optimized'))
+rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise']],'time_pairwise'))
+
+
+rb.save_html("correlation_matrix.html")
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,31 @@
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+
+extend BenchPress
+cases=250
+vars=20
+
+
+name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
+author 'Clbustos'
+date '2011-01-18'
+summary "
+A correlation matrix could be constructed using matrix algebra or
+mannualy, calculating covariances, means and sd for each pair of vectors.
+In this test, we test the calculation using #{vars} variables with
+#{cases} cases on each vector
+"
+
+reps 200 #number of repetitions
+
+ds=vars.times.inject({}) {|ac,v|
+ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
+ac
+}.to_dataset
+
+measure "Statsample::Bivariate.correlation_matrix_optimized" do
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+end
+
+measure "Statsample::Bivariate.correlation_matrix_pairwise" do
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+end
@@ -0,0 +1,32 @@
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+
+extend BenchPress
+cases=500
+vars=5
+
+
+name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
+author 'Clbustos'
+date '2011-01-18'
+summary "
+A correlation matrix could be constructed using matrix algebra or
+mannualy, calculating covariances, means and sd for each pair of vectors.
+In this test, we test the calculation using #{vars} variables with
+#{cases} cases on each vector
+"
+
+reps 200 #number of repetitions
+
+
+ds=vars.times.inject({}) {|ac,v|
+ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
+ac
+}.to_dataset
+
+measure "Statsample::Bivariate.correlation_matrix_optimized" do
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+end
+
+measure "Statsample::Bivariate.correlation_matrix_pairwise" do
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+end
Binary file not shown.
@@ -0,0 +1,5 @@
+$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
+$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
+
+require 'statsample'
+require 'bench_press'
View
Binary file not shown.
@@ -101,6 +101,20 @@ def prop_pearson(t, size, tails=:both)
cdf*n_tails
end
end
+
+
+ # Predicted time for pairwise correlation matrix, in miliseconds
+ # See benchmarks/correlation_matrix.rb to see mode of calculation
+
+ def prediction_pairwise(vars,cases)
+ (-2.192+0.007*cases+1.392*vars)**2
+ end
+ # Predicted time for optimized correlation matrix, in miliseconds
+ # See benchmarks/correlation_matrix.rb to see mode of calculation
+
+ def prediction_optimized(vars,cases)
+ (0.897+0.030*cases+0.515*vars)**2
+ end
# Returns residual score after delete variance
# from another variable
#
@@ -128,10 +142,36 @@ def partial_correlation(v1,v2,control)
end
+ def covariance_matrix_optimized(ds)
+ x=ds.to_gsl
+ n=x.row_size
+ m=x.column_size
+ means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
+ centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
+ ss=centered.transpose*centered
+ s=((1/(n-1).to_f))*ss
+ s
+ end
+
# Covariance matrix.
# Order of rows and columns depends on Dataset#fields order
def covariance_matrix(ds)
+ vars,cases=ds.fields.size,ds.cases
+
+ if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ cm=covariance_matrix_optimized(ds)
+ else
+ cm=covariance_matrix_pairwise(ds)
+
+ end
+ cm.extend(Statsample::CovariateMatrix)
+ cm.fields=ds.fields
+ cm
+ end
+
+
+ def covariance_matrix_pairwise(ds)
cache={}
matrix=ds.collect_matrix do |row,col|
if (ds[row].type!=:scale or ds[col].type!=:scale)
@@ -148,15 +188,35 @@ def covariance_matrix(ds)
end
end
end
- matrix.extend CovariateMatrix
- matrix.fields=ds.fields
matrix
end
# Correlation matrix.
# Order of rows and columns depends on Dataset#fields order
-
def correlation_matrix(ds)
+ vars,cases=ds.fields.size,ds.cases
+ if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ cm=correlation_matrix_optimized(ds)
+ else
+ cm=correlation_matrix_pairwise(ds)
+
+ end
+ cm.extend(Statsample::CovariateMatrix)
+ cm.fields=ds.fields
+ cm
+ end
+
+ def correlation_matrix_optimized(ds)
+ s=covariance_matrix_optimized(ds)
+ sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
+ cm=sds*s*sds
+ # Fix diagonal
+ s.row_size.times {|i|
+ cm[i,i]=1.0
+ }
+ cm
+ end
+ def correlation_matrix_pairwise(ds)
cache={}
cm=ds.collect_matrix do |row,col|
if row==col
@@ -173,9 +233,6 @@ def correlation_matrix(ds)
end
end
end
- cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
- cm
end
# Retrieves the n valid pairwise.
Oops, something went wrong.

0 comments on commit bccd7be

Please sign in to comment.