Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Correlation matrix optimized. Better specs

* Created benchmarks directory
* Optimized correlation matrix. Uses gsl matrix algebra or pairwise correlations, depending on empirically calculated equations. See benchmarks/correlation_matrix.rb for the implementation of the calculation
* Moved tests fixtures from data to test/fixtures
  • Loading branch information...
commit bccd7be14d181cb0ffed9be65b87f3a0c4138291 1 parent 6c17e08
@clbustos authored
Showing with 453 additions and 47 deletions.
  1. +1 −1  Rakefile
  2. BIN  benchmarks/correlation_matrix.ds
  3. +91 −0 benchmarks/correlation_matrix.html
  4. +70 −0 benchmarks/correlation_matrix.rb
  5. BIN  benchmarks/correlation_matrix.xls
  6. BIN  benchmarks/correlation_matrix_1.xls
  7. +31 −0 benchmarks/correlation_matrix_15_variables.rb
  8. +32 −0 benchmarks/correlation_matrix_5_variables.rb
  9. BIN  benchmarks/correlation_matrix_gsl_ruby.ods
  10. +5 −0 benchmarks/helpers_benchmark.rb
  11. BIN  benchmarks/results.ds
  12. +63 −6 lib/statsample/bivariate.rb
  13. +22 −18 lib/statsample/dataset.rb
  14. +2 −0  lib/statsample/factor.rb
  15. +3 −0  lib/statsample/factor/parallelanalysis.rb
  16. +5 −4 lib/statsample/factor/pca.rb
  17. +3 −3 lib/statsample/graph/boxplot.rb
  18. +2 −1  lib/statsample/graph/histogram.rb
  19. +33 −0 lib/statsample/matrix.rb
  20. +1 −1  lib/statsample/vector.rb
  21. +2 −2 references.txt
  22. 0  {data → test/fixtures}/crime.txt
  23. 0  {data → test/fixtures}/hartman_23.matrix
  24. 0  {data → test/fixtures}/repeated_fields.csv
  25. 0  {data → test/fixtures}/test_binomial.csv
  26. 0  test/{ → fixtures}/test_csv.csv
  27. 0  test/{ → fixtures}/test_xls.xls
  28. 0  {data → test/fixtures}/tetmat_matrix.txt
  29. 0  {data → test/fixtures}/tetmat_test.txt
  30. +55 −1 test/test_bivariate.rb
  31. +2 −2 test/test_csv.rb
  32. +23 −1 test/test_dataset.rb
  33. +1 −1  test/test_logit.rb
  34. +2 −2 test/test_mle.rb
  35. +1 −1  test/test_regression.rb
  36. +2 −2 test/test_reliability.rb
  37. +1 −1  test/test_xls.rb
View
2  Rakefile
@@ -42,7 +42,7 @@ h=Hoe.spec('statsample') do
self.developer('Claudio Bustos', 'clbustos@gmail.com')
self.extra_deps << ["spreadsheet","~>0.6.5"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client", "~>0.2.5"] << ["rubyvis", "~>0.4.0"]
- self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"] << ["rserve-client", "~>0"]
+ self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"] << ["rserve-client", "~>0"] << ["gettext", "~>0"]
self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
self.post_install_message = <<-EOF
***************************************************
View
BIN  benchmarks/correlation_matrix.ds
Binary file not shown
View
91 benchmarks/correlation_matrix.html
@@ -0,0 +1,91 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
+<title>Correlation matrix analysis</title>
+ <style>
+ body {
+ margin:0;
+ padding:1em;
+ }
+ table {
+ border-collapse: collapse;
+
+ }
+ table td {
+ border: 1px solid black;
+ }
+ .section {
+ margin:0.5em;
+ }
+ </style>
+
+</head><body>
+<h1>Correlation matrix analysis</h1><div id='toc'><div class='title'>List of contents</div>
+<ul>
+<li><a href='#toc_1'>Multiple Regression: cases,vars,time_optimized over time_optimized</a></li>
+<ul>
+<li><a href='#toc_2'>ANOVA</a></li>
+</ul>
+<li><a href='#toc_3'>Multiple Regression: cases,vars,time_pairwise over time_pairwise</a></li>
+<ul>
+<li><a href='#toc_4'>ANOVA</a></li>
+</ul>
+</ul>
+</div>
+<div class='tot'><div class='title'>List of tables</div><ul><li><a href='#table_1'>ANOVA Table</a></li><li><a href='#table_2'>Beta coefficients</a></li><li><a href='#table_3'>ANOVA Table</a></li><li><a href='#table_4'>Beta coefficients</a></li></ul></div>
+ <div class='section'><h2>Multiple Regression: cases,vars,time_optimized over time_optimized</h2><a name='toc_1'></a>
+ <p>Engine: Statsample::Regression::Multiple::GslEngine</p>
+ <p>Cases(listwise)=63(63)</p>
+ <p>R=0.917</p>
+ <p>R^2=0.840</p>
+ <p>R^2 Adj=0.835</p>
+ <p>Std.Error R=4.889</p>
+ <p>Equation=0.770 + 0.031cases + 0.555vars</p>
+ <div class='section'><h3>ANOVA</h3><a name='toc_2'></a>
+ <a name='table_1'></a><table><caption>ANOVA Table</caption><thead><th>source</th><th>ss</th><th>df</th><th>ms</th><th>f</th><th>p</th></thead>
+<tbody>
+<tr><td>Regression</td><td>7534.647</td><td>2</td><td>3767.324</td><td>157.636</td><td>0.000</td></tr>
+<tr><td>Error</td><td>1433.932</td><td>60</td><td>23.899</td><td></td><td></td></tr>
+<tr><td>Total</td><td>8968.579</td><td>62</td><td>3791.222</td><td></td><td></td></tr>
+</tbody>
+</table>
+
+ </div>
+ <a name='table_2'></a><table><caption>Beta coefficients</caption><thead><th>coeff</th><th>b</th><th>beta</th><th>se</th><th>t</th></thead>
+<tbody>
+<tr><td>Constant</td><td>0.770</td><td>-</td><td>1.013</td><td>0.760</td></tr>
+<tr><td>cases</td><td>0.031</td><td>0.795</td><td>0.002</td><td>15.410</td></tr>
+<tr><td>vars</td><td>0.555</td><td>0.455</td><td>0.063</td><td>8.820</td></tr>
+</tbody>
+</table>
+
+ </div>
+ <div class='section'><h2>Multiple Regression: cases,vars,time_pairwise over time_pairwise</h2><a name='toc_3'></a>
+ <p>Engine: Statsample::Regression::Multiple::GslEngine</p>
+ <p>Cases(listwise)=63(63)</p>
+ <p>R=0.987</p>
+ <p>R^2=0.974</p>
+ <p>R^2 Adj=0.973</p>
+ <p>Std.Error R=2.308</p>
+ <p>Equation=-2.221 + 0.007cases + 1.390vars</p>
+ <div class='section'><h3>ANOVA</h3><a name='toc_4'></a>
+ <a name='table_3'></a><table><caption>ANOVA Table</caption><thead><th>source</th><th>ss</th><th>df</th><th>ms</th><th>f</th><th>p</th></thead>
+<tbody>
+<tr><td>Regression</td><td>11990.075</td><td>2</td><td>5995.038</td><td>1125.103</td><td>0.000</td></tr>
+<tr><td>Error</td><td>319.706</td><td>60</td><td>5.328</td><td></td><td></td></tr>
+<tr><td>Total</td><td>12309.781</td><td>62</td><td>6000.366</td><td></td><td></td></tr>
+</tbody>
+</table>
+
+ </div>
+ <a name='table_4'></a><table><caption>Beta coefficients</caption><thead><th>coeff</th><th>b</th><th>beta</th><th>se</th><th>t</th></thead>
+<tbody>
+<tr><td>Constant</td><td>-2.221</td><td>-</td><td>0.478</td><td>-4.643</td></tr>
+<tr><td>cases</td><td>0.007</td><td>0.158</td><td>0.001</td><td>7.572</td></tr>
+<tr><td>vars</td><td>1.390</td><td>0.974</td><td>0.030</td><td>46.828</td></tr>
+</tbody>
+</table>
+
+ </div>
+</body></html>
View
70 benchmarks/correlation_matrix.rb
@@ -0,0 +1,70 @@
+# This benchmark creates a dataset of timings used to choose the best
+# algorithm for computing a correlation matrix
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+require 'statsample'
+require 'benchmark'
+
+def create_dataset(vars,cases)
+ ran=Distribution::Normal.rng_ugaussian
+ ds=vars.times.inject({}) {|ac,v|
+ ac["x#{v}"]=Statsample::Vector.new_scale(cases) {ran.call}
+ ac
+ }.to_dataset
+end
+
+def prediction_pairwise(vars,cases)
+ ((-2.192+0.007*cases+1.392*vars)**2) / 1000.0
+end
+def prediction_optimized(vars,cases)
+ ((0.897+0.030*cases+0.515*vars)**2) / 1000.0
+end
+
+
+if File.mtime(__FILE__)>File.mtime("results.ds")
+
+ reps=100 #number of repetitions
+
+
+ ds_sizes=[5,10,30,50,100,150,200,500,1000]
+ ds_vars=[2,3,4,5,10,20,30]
+ rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
+ ds_sizes.each do |cases|
+ ds_vars.each do |vars|
+ ds=create_dataset(vars,cases)
+ time_optimized= Benchmark.realtime do
+ reps.times {
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+ ds.clear_gsl
+ }
+ end
+
+ time_pairwise= Benchmark.realtime do
+ reps.times {
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+ }
+ end
+
+ puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
+
+ rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
+ end
+ end
+
+ rs.update_valid_data
+ rs.save("results.ds")
+ Statsample::Excel.write(rs,"correlation_matrix.xls")
+else
+ rs=Statsample.load("results.ds")
+end
+
+
+rs.fields.each {|f| rs[f].type=:scale}
+
+
+rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
+
+rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized']],'time_optimized'))
+rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise']],'time_pairwise'))
+
+
+rb.save_html("correlation_matrix.html")
View
BIN  benchmarks/correlation_matrix.xls
Binary file not shown
View
BIN  benchmarks/correlation_matrix_1.xls
Binary file not shown
View
31 benchmarks/correlation_matrix_15_variables.rb
@@ -0,0 +1,31 @@
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+
+extend BenchPress
+cases=250
+vars=20
+
+
+name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
+author 'Clbustos'
+date '2011-01-18'
+summary "
+A correlation matrix could be constructed using matrix algebra or
+mannualy, calculating covariances, means and sd for each pair of vectors.
+In this test, we test the calculation using #{vars} variables with
+#{cases} cases on each vector
+"
+
+reps 200 #number of repetitions
+
+ds=vars.times.inject({}) {|ac,v|
+ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
+ac
+}.to_dataset
+
+measure "Statsample::Bivariate.correlation_matrix_optimized" do
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+end
+
+measure "Statsample::Bivariate.correlation_matrix_pairwise" do
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+end
View
32 benchmarks/correlation_matrix_5_variables.rb
@@ -0,0 +1,32 @@
+require(File.expand_path(File.dirname(__FILE__)+'/helpers_benchmark.rb'))
+
+extend BenchPress
+cases=500
+vars=5
+
+
+name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
+author 'Clbustos'
+date '2011-01-18'
+summary "
+A correlation matrix could be constructed using matrix algebra or
+mannualy, calculating covariances, means and sd for each pair of vectors.
+In this test, we test the calculation using #{vars} variables with
+#{cases} cases on each vector
+"
+
+reps 200 #number of repetitions
+
+
+ds=vars.times.inject({}) {|ac,v|
+ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
+ac
+}.to_dataset
+
+measure "Statsample::Bivariate.correlation_matrix_optimized" do
+ Statsample::Bivariate.correlation_matrix_optimized(ds)
+end
+
+measure "Statsample::Bivariate.correlation_matrix_pairwise" do
+ Statsample::Bivariate.correlation_matrix_pairwise(ds)
+end
View
BIN  benchmarks/correlation_matrix_gsl_ruby.ods
Binary file not shown
View
5 benchmarks/helpers_benchmark.rb
@@ -0,0 +1,5 @@
+$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
+$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
+
+require 'statsample'
+require 'bench_press'
View
BIN  benchmarks/results.ds
Binary file not shown
View
69 lib/statsample/bivariate.rb
@@ -101,6 +101,20 @@ def prop_pearson(t, size, tails=:both)
cdf*n_tails
end
end
+
+
+ # Predicted time for pairwise correlation matrix, in milliseconds
+ # See benchmarks/correlation_matrix.rb to see mode of calculation
+
+ def prediction_pairwise(vars,cases)
+ (-2.192+0.007*cases+1.392*vars)**2
+ end
+ # Predicted time for optimized correlation matrix, in milliseconds
+ # See benchmarks/correlation_matrix.rb to see mode of calculation
+
+ def prediction_optimized(vars,cases)
+ (0.897+0.030*cases+0.515*vars)**2
+ end
# Returns residual score after delete variance
# from another variable
#
@@ -128,10 +142,36 @@ def partial_correlation(v1,v2,control)
end
+ def covariance_matrix_optimized(ds)
+ x=ds.to_gsl
+ n=x.row_size
+ m=x.column_size
+ means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
+ centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
+ ss=centered.transpose*centered
+ s=((1/(n-1).to_f))*ss
+ s
+ end
+
# Covariance matrix.
# Order of rows and columns depends on Dataset#fields order
def covariance_matrix(ds)
+ vars,cases=ds.fields.size,ds.cases
+
+ if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ cm=covariance_matrix_optimized(ds)
+ else
+ cm=covariance_matrix_pairwise(ds)
+
+ end
+ cm.extend(Statsample::CovariateMatrix)
+ cm.fields=ds.fields
+ cm
+ end
+
+
+ def covariance_matrix_pairwise(ds)
cache={}
matrix=ds.collect_matrix do |row,col|
if (ds[row].type!=:scale or ds[col].type!=:scale)
@@ -148,15 +188,35 @@ def covariance_matrix(ds)
end
end
end
- matrix.extend CovariateMatrix
- matrix.fields=ds.fields
matrix
end
# Correlation matrix.
# Order of rows and columns depends on Dataset#fields order
-
def correlation_matrix(ds)
+ vars,cases=ds.fields.size,ds.cases
+ if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ cm=correlation_matrix_optimized(ds)
+ else
+ cm=correlation_matrix_pairwise(ds)
+
+ end
+ cm.extend(Statsample::CovariateMatrix)
+ cm.fields=ds.fields
+ cm
+ end
+
+ def correlation_matrix_optimized(ds)
+ s=covariance_matrix_optimized(ds)
+ sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
+ cm=sds*s*sds
+ # Fix diagonal
+ s.row_size.times {|i|
+ cm[i,i]=1.0
+ }
+ cm
+ end
+ def correlation_matrix_pairwise(ds)
cache={}
cm=ds.collect_matrix do |row,col|
if row==col
@@ -173,9 +233,6 @@ def correlation_matrix(ds)
end
end
end
- cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
- cm
end
# Retrieves the n valid pairwise.
View
40 lib/statsample/dataset.rb
@@ -115,6 +115,10 @@ def self.crosstab_by_asignation(rows,columns,values)
ds.update_valid_data
ds
end
+ # Return true if any vector has missing data
+ def has_missing_data?
+ @vectors.any? {|k,v| v.has_missing_data?}
+ end
# Creates a new dataset. A dataset is a set of ordered named vectors
# of the same size.
#
@@ -138,18 +142,9 @@ def initialize(vectors={}, fields=[])
check_order
check_length
end
+ @gsl=nil
@i=nil
end
- #
- # Returns a GSL::matrix
- #
- def to_gsl_matrix
- matrix=GSL::Matrix.alloc(cases,@vectors.size)
- each_array do |row|
- row.each_index{|y| matrix.set(@i,y,row[y]) }
- end
- matrix
- end
#
# Creates a copy of the given dataset, deleting all the cases with
# missing data on one of the vectors.
@@ -375,6 +370,7 @@ def add_case(v,uvd=true)
# Check vectors and fields after inserting data. Use only
# after #add_case_array or #add_case with second parameter to false
def update_valid_data
+ @gsl=nil
@fields.each{|f| @vectors[f].set_valid_data}
check_length
end
@@ -491,7 +487,6 @@ def check_length # :nodoc:
size=v.size
else
if v.size!=size
- p v.to_a.size
raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
end
end
@@ -629,7 +624,6 @@ def collect_with_index(type=:scale)
end
# Recode a vector based on a block
def recode!(vector_name)
-
0.upto(@cases-1) {|i|
@vectors[vector_name].data[i]=yield case_as_hash(i)
}
@@ -658,13 +652,23 @@ def to_matrix
end
if Statsample.has_gsl?
- def to_matrix_gsl
- rows=[]
- self.each_array{|c|
- rows.push(c)
- }
- GSL::Matrix.alloc(*rows)
+ def clear_gsl
+ @gsl=nil
+ end
+
+ def to_gsl
+ if @gsl.nil?
+ if cases.nil?
+ update_valid_data
+ end
+ @gsl=GSL::Matrix.alloc(cases,fields.size)
+ self.each_array{|c|
+ @gsl.set_row(@i,c)
+ }
+ end
+ @gsl
end
+
end
# Return a correlation matrix for fields included as parameters.
View
2  lib/statsample/factor.rb
@@ -41,8 +41,10 @@ def self.anti_image_covariance_matrix(matrix)
aicm
end
def self.anti_image_correlation_matrix(matrix)
+ matrix=matrix.to_matrix
s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse
aicm=s*matrix.inverse*s
+
aicm.extend(Statsample::CovariateMatrix)
aicm.fields=matrix.fields if matrix.respond_to? :fields
aicm
View
3  lib/statsample/factor/parallelanalysis.rb
@@ -132,6 +132,7 @@ def compute
puts "#{@name}: Iteration #{i}" if $DEBUG or debug
# Create a dataset of dummy values
ds_bootstrap=Statsample::Dataset.new(@ds.fields)
+
@fields.each do |f|
if bootstrap_method==:random
ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale
@@ -141,6 +142,8 @@ def compute
raise "bootstrap_method doesn't recogniced"
end
end
+ ds_bootstrap.update_valid_data
+
matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
if smc
smc_v=matrix.inverse.diagonal.map{|ii| 1-(1.quo(ii))}
View
9 lib/statsample/factor/pca.rb
@@ -111,10 +111,11 @@ def feature_matrix(m=nil)
end
# Returns Principal Components for +input+ matrix or dataset
# The number of PC to return is equal to parameter +m+.
- # If +m+ isn't set, m set to number of PCs selected at object creation.
+ # If +m+ isn't set, m set to number of PCs selected at object creation.
+ # Use covariance matrix
+
def principal_components(input, m=nil)
- data_matrix=input.to_matrix
- var_names=(data_matrix.respond_to? :fields_y) ? data_matrix.fields_y : data_matrix.column_size.times.map {|i| "VAR_%d" % (i+1)}
+ data_matrix=input.to_matrix
m||=@m
raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
@@ -141,7 +142,7 @@ def component_matrix_covariance(m=nil)
cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
}
}
- cm.extend CovariateMatrix
+ cm.extend NamedMatrix
cm.name=_("Component matrix (from covariance)")
cm.fields_x = @variables_names
cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
View
6 lib/statsample/graph/boxplot.rb
@@ -223,11 +223,11 @@ def rubyvis_panel # :nodoc:
dot.bottom {|v| y_scale.scale(v)}
dot.title {|v| v}
end
-
-
- end
+ end
end
+ vis
end
+
# Returns SVG with scatterplot
def to_svg
rp=rubyvis_panel
View
3  lib/statsample/graph/histogram.rb
@@ -120,7 +120,7 @@ def rubyvis_panel # :nodoc:
y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
y_scale.nice
- max_range=@hist.max
+
bins=@hist.bins.times.map {|i|
{
:low =>@hist.get_range(i)[0],
@@ -170,6 +170,7 @@ def rubyvis_panel # :nodoc:
end
rubyvis_normal_distribution(pan) if @line_normal_distribution
end
+ vis
end
# Returns SVG with scatterplot
def to_svg
View
33 lib/statsample/matrix.rb
@@ -61,12 +61,45 @@ class Matrix
def to_gsl
self
end
+ def row_size
+ size1
+ end
+ def column_size
+ size2
+ end
+ def determinant
+ det
+ end
+ def inverse
+ GSL::Linalg::LU.invert(self)
+ end
+ def eigenpairs
+ self.to_matrix.eigenpairs
+ end
+ def eigenvalues
+ self.to_matrix.eigenvalues
+ end
+ def eigenpairs_ruby
+ self.to_matrix.eigenpairs_ruby
+ end
+ def square?
+ size1==size2
+ end
def to_matrix
rows=self.size1
cols=self.size2
out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} }
::Matrix.rows(out)
end
+ def total_sum
+ sum=0
+ size1.times {|i|
+ size2.times {|j|
+ sum+=self[i,j]
+ }
+ }
+ sum
+ end
end
end
View
2  lib/statsample/vector.rb
@@ -848,7 +848,7 @@ def mean # :nodoc:
def variance_sample(m=nil) # :nodoc:
check_type :scale
m||=mean
- @gsl.variance_m
+ @gsl.nil? ? nil : @gsl.variance_m
end
def standard_deviation_sample(m=nil) # :nodoc:
check_type :scale
View
4 references.txt
@@ -7,6 +7,7 @@ References
* Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann- Whitney U Statistic. <em>Journal of the Royal Statistical Society, 22</em>(2), 269-273
* Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361.
* Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. <i>Organizational Research Methods, 7</i> (2), 191-205.
+* Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
* Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
* Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562.
* McGraw, K. & Wong, S.P. (1996). Forming Inferences About Some Intraclass Correlation Coefficients. Psychological methods, 1(1), 30-46.
@@ -16,8 +17,7 @@ References
* Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
* http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation
* http://europe.isixsigma.com/library/content/c080806a.asp
-* http://snippets.dzone.com/posts/show/4666
* http://stattrek.com/Lesson6/SRS.aspx
-* http://www.cut-the-knot.org/do_you_know/AllPerm.shtml
+* http://talkstats.com/showthread.php?t=5056
* http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
* http://www.taygeta.com/random/gaussian.html
View
0  data/crime.txt → test/fixtures/crime.txt
File renamed without changes
View
0  data/hartman_23.matrix → test/fixtures/hartman_23.matrix
File renamed without changes
View
0  data/repeated_fields.csv → test/fixtures/repeated_fields.csv
File renamed without changes
View
0  data/test_binomial.csv → test/fixtures/test_binomial.csv
File renamed without changes
View
0  test/test_csv.csv → test/fixtures/test_csv.csv
File renamed without changes
View
0  test/test_xls.xls → test/fixtures/test_xls.xls
File renamed without changes
View
0  data/tetmat_matrix.txt → test/fixtures/tetmat_matrix.txt
File renamed without changes
View
0  data/tetmat_test.txt → test/fixtures/tetmat_test.txt
File renamed without changes
View
56 test/test_bivariate.rb
@@ -48,7 +48,7 @@ class StatsampleBivariateTestCase < MiniTest::Unit::TestCase
assert_in_delta(Statsample::Bivariate.prop_pearson(r.t,8,:both), r.probability, 0.001)
assert(r.summary.size>0)
end
- should "return correct correlation_matrix" do
+ should "return correct correlation_matrix with nils values" do
v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
@@ -68,6 +68,60 @@ class StatsampleBivariateTestCase < MiniTest::Unit::TestCase
end
#assert_equal(expected,obt)
end
+ should "return same values for optimized and pairwise covariance matrix" do
+ cases=100
+ v1=Statsample::Vector.new_scale(cases) {rand()}
+ v2=Statsample::Vector.new_scale(cases) {rand()}
+ v3=Statsample::Vector.new_scale(cases) {rand()}
+ v4=Statsample::Vector.new_scale(cases) {rand()}
+ v5=Statsample::Vector.new_scale(cases) {rand()}
+
+ ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset
+
+ cor_opt=Statsample::Bivariate.covariance_matrix_optimized(ds)
+
+ cor_pw =Statsample::Bivariate.covariance_matrix_pairwise(ds)
+ assert_equal_matrix(cor_opt,cor_pw,1e-15)
+
+ end
+ should "return same values for optimized and pairwise correlation matrix" do
+ cases=100
+ v1=Statsample::Vector.new_scale(cases) {rand()}
+ v2=Statsample::Vector.new_scale(cases) {rand()}
+ v3=Statsample::Vector.new_scale(cases) {rand()}
+ v4=Statsample::Vector.new_scale(cases) {rand()}
+ v5=Statsample::Vector.new_scale(cases) {rand()}
+
+ ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset
+
+ cor_opt=Statsample::Bivariate.correlation_matrix_optimized(ds)
+
+ cor_pw =Statsample::Bivariate.correlation_matrix_pairwise(ds)
+ assert_equal_matrix(cor_opt,cor_pw,1e-15)
+
+ end
+ should "return correct correlation_matrix without nils values" do
+ v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
+ v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
+ v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
+ v4=[2,4,6,7, 3,7,8,6].to_vector(:scale)
+ ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset
+ c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)}
+ expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)],
+ [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)]
+ ]
+ obt=Statsample::Bivariate.correlation_matrix(ds)
+ for i in 0...expected.row_size
+ for j in 0...expected.column_size
+ #puts expected[i,j].inspect
+ #puts obt[i,j].inspect
+ assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ")
+ end
+ end
+ #assert_equal(expected,obt)
+ end
+
+
should "return correct value for prop pearson" do
assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084,94), 94),0.01)
assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046,95), 95),0.01)
View
4 test/test_csv.rb
@@ -1,7 +1,7 @@
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleCSVTestCase < MiniTest::Unit::TestCase
def setup
- @ds=Statsample::CSV.read(File.dirname(__FILE__)+"/test_csv.csv")
+ @ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/test_csv.csv")
end
def test_read
assert_equal(6,@ds.cases)
@@ -21,7 +21,7 @@ def test_nil
assert_equal(nil,@ds['age'][5])
end
def test_repeated
- ds=Statsample::CSV.read(File.dirname(__FILE__)+"/../data/repeated_fields.csv")
+ ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv")
assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields)
age=[3,4,5,6,nil,8].to_vector(:scale)
assert_equal(age,ds['age_2'])
View
24 test/test_dataset.rb
@@ -18,7 +18,15 @@ def test_saveload
a=Statsample.load(outfile.path)
assert_equal(@ds,a)
end
-
+ def test_gsl
+ if Statsample.has_gsl?
+ matrix=GSL::Matrix[[1,2],[3,4],[5,6]]
+ ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
+ assert_equal(matrix,ds.to_gsl)
+ else
+ skip("Gsl needed")
+ end
+ end
def test_matrix
matrix=Matrix[[1,2],[3,4],[5,6]]
ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
@@ -123,6 +131,20 @@ def test_vector_missing_values
mva=[2,3,0,1,0,1].to_vector(:scale)
assert_equal(mva,ds.vector_missing_values)
end
+
+ def test_has_missing_values
+ a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale)
+ a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
+ b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale)
+ b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
+ c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale)
+ ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
+ assert(ds.has_missing_data?)
+ clean=ds.dup_only_valid
+ assert(!clean.has_missing_data?)
+ end
+
+
def test_vector_count_characters
a1=[1 ,"abcde" ,3 ,4 , 5,nil].to_vector(:scale)
a2=[10 ,20.3 ,20 ,20 ,20,30].to_vector(:scale)
View
2  test/test_logit.rb
@@ -3,7 +3,7 @@
class StatsampleLogitTestCase < MiniTest::Unit::TestCase
context Statsample::Regression::Binomial::Logit do
should "return correct values for example" do
- crime=File.dirname(__FILE__)+'/../data/test_binomial.csv'
+ crime=File.dirname(__FILE__)+'/fixtures/test_binomial.csv'
ds=Statsample::CSV.read(crime)
lr=Statsample::Regression::Binomial::Logit.new(ds,'y')
assert_in_delta(-38.8669,lr.log_likehood,0.001)
View
4 test/test_mle.rb
@@ -1,8 +1,8 @@
require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
class StatsampleMLETestCase < MiniTest::Unit::TestCase
def setup
- @file_binomial=File.dirname(__FILE__)+'/../data/test_binomial.csv'
- @crime=File.dirname(__FILE__)+'/../data/crime.txt'
+ @file_binomial=File.dirname(__FILE__)+'/fixtures/test_binomial.csv'
+ @crime=File.dirname(__FILE__)+'/fixtures/crime.txt'
@cases=100
a=Array.new()
b=Array.new()
View
2  test/test_regression.rb
@@ -182,8 +182,8 @@ def test_regression_matrix
@c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
@y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
-
cor=Statsample::Bivariate.correlation_matrix(ds)
+
lr=Statsample::Regression::Multiple::MatrixEngine.new(cor,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size, :y_sd=>@y.sd , :x_sd=>{'a' => @a.sd, 'b' => @b.sd, 'c' => @c.sd})
assert_nil(lr.constant_se)
assert_nil(lr.constant_t)
View
4 test/test_reliability.rb
@@ -197,8 +197,8 @@ class StatsampleReliabilityTestCase < MiniTest::Unit::TestCase
assert_in_delta(var_mean, @ia.variances_mean)
assert_equal(@x1.mean, @ia.item_statistics['x1'][:mean])
assert_equal(@x4.mean, @ia.item_statistics['x4'][:mean])
- assert_equal(@x1.sds, @ia.item_statistics['x1'][:sds])
- assert_equal(@x4.sds, @ia.item_statistics['x4'][:sds])
+ assert_in_delta(@x1.sds, @ia.item_statistics['x1'][:sds],1e-14)
+ assert_in_delta(@x4.sds, @ia.item_statistics['x4'][:sds],1e-14)
ds2=@ds.clone
ds2.delete_vector('x1')
vector_sum=ds2.vector_sum
View
2  test/test_xls.rb
@@ -2,7 +2,7 @@
class StatsampleExcelTestCase < MiniTest::Unit::TestCase
context "Excel reader" do
setup do
- @ds=Statsample::Excel.read(File.dirname(__FILE__)+"/test_xls.xls")
+ @ds=Statsample::Excel.read(File.dirname(__FILE__)+"/fixtures/test_xls.xls")
end
should "set the number of cases" do
assert_equal(6,@ds.cases)
Please sign in to comment.
Something went wrong with that request. Please try again.