Permalink
Browse files

Update PCA to use SVD (pull #1527)

  • Loading branch information...
1 parent 6a7a951 commit 53bff36ea9fbd72089afdb9b5b41337dc891a44a @rcorbish rcorbish committed with saudet Dec 19, 2016
View
@@ -10,6 +10,7 @@ README.md~
*.jpg
*.png
*.bin
+*.swp
*.releaseBackup
*.jpg
*.png
@@ -36,57 +36,100 @@
private PCA() {
}
+
/**
- * Reduce the dimension of x
- * to the specified number of dimensions.
- * <p/>
- * Happily based on the great work done in the tsne paper here:
- * http://homepage.tudelft.nl/19j49/t-SNE.html
+ * Calculates the PCA projection of a matrix onto a fixed number of principal components and
+ * returns the reduced feature set
+ * The return is a projection of A onto principal nDims components
+ *
+ * To use the PCA: assume A is the original feature set
+ * then project A onto a reduced set of features. It is possible to
+ * reconstruct the original data ( losing information, but having the same
+ * dimensionality )
+ *
+ * <pre>
+ * {@code
+ *
+ * INDArray Areduced = A.mmul( factor ) ;
+ * INDArray Aoriginal = Areduced.mmul( factor.transpose() ) ;
+ *
+ * }
+ * </pre>
*
- * @param X the x to reduce
- * @param nDims the number of dimensions to reduce to
- * @param normalize normalize
- * @return the reduced dimension
+ * @param A the array of features, rows are results, columns are features - will be changed
+ * @param nDims the number of components on which to project the features
+ * @param normalize whether to normalize (adjust each feature to have zero mean)
+ * @return the reduced parameters of A
*/
- public static INDArray pca(INDArray X, int nDims, boolean normalize) {
- if (normalize) {
- INDArray mean = X.mean(0);
- X = X.subiRowVector(mean);
- }
-
- INDArray C;
- if (X.size(1) < X.size(0))
- C = X.transpose().mmul(X);
-
- else
- C = X.mmul(X.transpose()).muli(1 / X.size(0));
+ public static INDArray pca(INDArray A, int nDims, boolean normalize) {
+ INDArray factor = pca_factor( A, nDims, normalize ) ;
+ return A.mmul( factor ) ;
+ }
- IComplexNDArray[] eigen = Eigen.eigenvectors(C);
- IComplexNDArray M = eigen[1];
- IComplexNDArray lambda = eigen[0];
- IComplexNDArray diagLambda = Nd4j.diag(lambda);
- INDArray[] sorted = Nd4j.sortWithIndices(diagLambda, 0, false);
- //change lambda to be the indexes
+ /**
+ * Calculates pca factors of a matrix, for a fixed number of reduced features
+ * returns the factors to scale observations
+ *
+ * The return is a factor matrix to reduce (normalized) feature sets
+ *
+ * @see #pca(INDArray, int, boolean)
+ *
+ * @param A the array of features, rows are results, columns are features - will be changed
+ * @param nDims the number of components on which to project the features
+ * @param normalize whether to normalize (adjust each feature to have zero mean)
+ * @return the factor matrix used to project (normalized) features onto the principal components
+ */
+ public static INDArray pca_factor(INDArray A, int nDims, boolean normalize) {
- INDArray indices = sorted[0];
+ if( normalize ) {
+ // Normalize to mean 0 for each feature ( each column has 0 mean )
+ INDArray mean = A.mean(0) ;
+ A.subiRowVector( mean ) ;
+ }
- INDArrayIndex[] indices2 = NDArrayIndex.create(indices.get(
- NDArrayIndex.interval(0, nDims)));
+ int m = A.rows() ;
+ int n = A.columns() ;
- INDArrayIndex[] rowsAndColumnIndices = new INDArrayIndex[]{
- NDArrayIndex.interval(0, M.rows()), indices2[0]
- };
+ // Prepare the SVD result holders; we'll decompose A as U x S x V'
+ INDArray s = Nd4j.create( m<n?m:n ) ;
+ INDArray VT = Nd4j.create( n, n, 'f' ) ;
- M = M.get(rowsAndColumnIndices);
+ // Note - we don't care about U
+ Nd4j.getBlasWrapper().lapack().sgesvd( A, s, null, VT );
+
+ // for comparison k & nDims are the equivalent values in both methods implementing PCA
- X = Nd4j.createComplex(X.subRowVector(X.mean(0))).mmul(M);
+ // So now let's rip out the appropriate number of right singular vectors from
+ // the V output (note we pull columns of V, which are the rows of its transpose VT)
+ INDArray V = VT.transpose() ;
+ INDArray factor = Nd4j.create( n, nDims, 'f' ) ;
+ for( int i=0 ; i<nDims ; i++ ) {
+ factor.putColumn( i, V.getColumn(i) ) ;
+ }
+ return factor ;
+ }
- return X;
+ /**
+ * Calculates pca reduced value of a matrix, for a given variance. A larger variance (99%)
+ * will result in a higher order feature set.
+ *
+ * The returned matrix is a projection of A onto principal components
+ *
+ * @see #pca(INDArray, int, boolean)
+ *
+ * @param A the array of features, rows are results, columns are features - will be changed
+ * @param variance the amount of variance to preserve as a float 0 - 1
+ * @param normalize whether to normalize (set features to have zero mean)
+ * @return the matrix representing a reduced feature set
+ */
+ public static INDArray pca(INDArray A, double variance, boolean normalize) {
+ INDArray factor = pca_factor( A, variance, normalize ) ;
+ return A.mmul( factor ) ;
}
@@ -100,13 +143,14 @@ public static INDArray pca(INDArray X, int nDims, boolean normalize) {
*
* The array Areduced is a projection of A onto principal components
*
+ * @see #pca(INDArray, double, boolean)
+ *
* @param A the array of features, rows are results, columns are features - will be changed
* @param variance the amount of variance to preserve as a float 0 - 1
- * @param whether to normalize (set features to have zero mean)
+ * @param normalize whether to normalize (set features to have zero mean)
 * @return the matrix to multiply a feature by to get a reduced feature set
*/
- public static INDArray pca(INDArray A, double variance, boolean normalize) {
-
+ public static INDArray pca_factor(INDArray A, double variance, boolean normalize) {
if( normalize ) {
// Normalize to mean 0 for each feature ( each column has 0 mean )
INDArray mean = A.mean(0) ;
@@ -141,7 +185,7 @@ public static INDArray pca(INDArray A, double variance, boolean normalize) {
}
}
if( k == -1 ) { // if we need everything
- throw new RuntimeException( "No reduction possible for reqd. variance - use smaller variance" ) ;
+ throw new RuntimeException( "No reduction possible for reqd. variance - use smaller variance" ) ;
}
 // So now let's rip out the appropriate number of right singular vectors from
 // the V output (note we pull rows since VT is a transpose of V)
@@ -0,0 +1,86 @@
+
+package org.nd4j.linalg.dimensionalityreduction ;
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.nd4j.linalg.BaseNd4jTest;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.checkutil.CheckUtil;
+import org.nd4j.linalg.checkutil.NDArrayCreationUtil;
+import org.nd4j.linalg.factory.Nd4j;
+import org.nd4j.linalg.factory.Nd4jBackend;
+
+
+import java.util.List;
+
+/**
+ * Created by rcorbish
+ */
+@RunWith(Parameterized.class)
+public class TestPCA extends BaseNd4jTest {
+
+
+ public TestPCA(Nd4jBackend backend) {
+ super(backend);
+ }
+
+ @Test
+ public void testFactorDims() {
+ int m = 13 ;
+ int n = 4 ;
+
+ double f[] = new double[] {
+ 7,1,11,11,7,11,3,1,2,21,1,11,10,26,29,56,31,52,55,71,31,54,47,40,66,68,
+ 6,15,8,8,6,9,17,22,18,4,23,9,8,60,52,20,47,33,22,6,44,22,26,34,12,12
+ } ;
+
+ INDArray A = Nd4j.create(f, new int[] { m, n }, 'f' ) ;
+
+ INDArray A1 = A.dup('f') ;
+ INDArray Factor = org.nd4j.linalg.dimensionalityreduction.PCA.pca_factor(A1, 3, true ) ;
+ A1 = A.subiRowVector( A.mean(0) ) ;
+
+ INDArray Reduced = A1.mmul(Factor) ;
+ INDArray Reconstructed = Reduced.mmul( Factor.transpose() ) ;
+ INDArray Diff = Reconstructed.sub( A1 ) ;
+ for( int i=0 ; i<m*n ; i++ ) {
+ assertEquals( "Reconstructed matrix is very different from the original.", 0.0, Diff.getDouble(i), 1.0 ) ;
+ }
+ }
+
+
+ @Test
+ public void testFactorVariance() {
+ int m = 13 ;
+ int n = 4 ;
+
+ double f[] = new double[] {
+ 7,1,11,11,7,11,3,1,2,21,1,11,10,26,29,56,31,52,55,71,31,54,47,40,66,68,
+ 6,15,8,8,6,9,17,22,18,4,23,9,8,60,52,20,47,33,22,6,44,22,26,34,12,12
+ } ;
+
+ INDArray A = Nd4j.create(f, new int[] { m, n }, 'f' ) ;
+
+ INDArray A1 = A.dup('f') ;
+ INDArray Factor1 = org.nd4j.linalg.dimensionalityreduction.PCA.pca_factor(A1, 0.95, true ) ;
+ A1 = A.subiRowVector( A.mean(0) ) ;
+ INDArray Reduced1 = A1.mmul(Factor1) ;
+ INDArray Reconstructed1 = Reduced1.mmul( Factor1.transpose() ) ;
+ INDArray Diff1 = Reconstructed1.sub( A1 ) ;
+ for( int i=0 ; i<m*n ; i++ ) {
+ assertEquals( "Reconstructed matrix is very different from the original.", 0.0, Diff1.getDouble(i), 0.1 ) ;
+ }
+ INDArray A2 = A.dup('f') ;
+ INDArray Factor2 = org.nd4j.linalg.dimensionalityreduction.PCA.pca_factor(A2, 0.50, true ) ;
+ assertTrue( "Variance differences should change factor sizes.", Factor1.columns()>Factor2.columns() ) ;
+ }
+
+
+ @Override
+ public char ordering() {
+ return 'f';
+ }
+}
+

0 comments on commit 53bff36

Please sign in to comment.