Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sgemm gives incorrect results on 2.4 when transA != transB and N is 7, 11, 15, ... #246

Open
hughperkins opened this issue Mar 27, 2016 · 4 comments

Comments

@hughperkins
Copy link
Contributor

Example test program:

#include <iostream>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <clBLAS.h>
#include <stdlib.h>
using namespace std;

  cl_int err;
  cl_platform_id platform = 0;
  cl_device_id device = 0;
  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
  cl_context ctx = 0;
  cl_command_queue queue = 0;
  cl_mem bufA, bufB, bufC;
  cl_event event = NULL;
  int ret = 0;

void clgemm(int colmaj, char transAchar, char transBchar, int M, int N, int K, float alpha, float *A, int lda,
     float *B, int ldb, float beta, float *C, int ldc, float *result) {
clblasTranspose transA = transAchar == 'n' ? clblasNoTrans : clblasTrans;
clblasTranspose transB = transBchar == 'n' ? clblasNoTrans : clblasTrans;

size_t off = 0;
size_t offA = 0;
size_t offB = 0;
size_t offC = 0;

clblasOrder order;
if(colmaj == 1 ) {
  order = clblasColumnMajor;
} else {
  order = clblasRowMajor;
}

  bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                        NULL, &err);
  bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                        NULL, &err);
  bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                        NULL, &err);

  err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
      M * K * sizeof(*A), A, 0, NULL, NULL);
  err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
      K * N * sizeof(*B), B, 0, NULL, NULL);
  err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
      M * N * sizeof(*C), C, 0, NULL, NULL);

  err = clblasSgemm(order, transA, transB, M - off, N - off, K - off,
                       alpha, bufA, offA, lda,
                       bufB, offB, ldb, beta,
                       bufC, offC, ldc,
                       1, &queue, 0, NULL, &event);
  if (err != CL_SUCCESS) {
      printf("clblasSgemmEx() failed with %d\n", err);
      ret = 1;
      exit(1);
  }
  else {
      err = clWaitForEvents(1, &event);
      err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                                M * N * sizeof(*result),
                                result, 0, NULL, NULL);
  }

  clReleaseMemObject(bufC);
  clReleaseMemObject(bufB);
  clReleaseMemObject(bufA);

}

void copy(float *target, float *source, int numels ) {
  for(int i = 0; i < numels; i++) {
    target[i] = source[i];
  }
}

// assumes row major
void transpose(float *A, int rows, int cols) {
  float *A_ = new float[rows * cols];
  for(int i=0; i < rows; i++ ) {
    for(int j = 0; j< cols; j++) {
      A_[j * rows + i] = A[i * cols + j];
    }
  }
  copy(A, A_, rows * cols);
  delete[] A_;
}

// assumes row major
void mult(float *C, float *A, float *B, int M, int K, int N) {
  for(int m = 0; m < M; m++ ) {
    for(int n = 0; n < N; n++ ) {
      float sum = 0;
      for(int k = 0; k < K; k++ ) {
        sum = sum + A[m * K + k] * B[k * N + n];
      }
      C[m * N + n] = sum;
    }
  }
}

bool test1(int colmaj, int M, int N, int K, int transAint, int transBint) {
  char transa = transAint == 1 ? 't' : 'n';
  char transb = transBint == 1 ? 't' : 'n';
//  cout << "colmaj=" << colmaj << " " << transa << " " << transb << " M=" << M << " K=" << K << " N=" << N << endl;

  float alpha = 1;
  int lda, ldb, ldc;

  if(colmaj == 1) {
    if(transAint == 1) {
       lda = K;
    } else {
       lda = M;
    }
    if(transBint == 1) {
       ldb = N;
    } else {
       ldb = K;
    }
  } else {
    if(transAint == 1) {
       lda = M;    
    } else {
       lda = K;
    }
    if(transBint == 1) {
       ldb = K;    
    } else {
       ldb = N;
    }
  }

  if(colmaj == 1) {
    ldc = M;
  } else {
    ldc = N;
  }

  float beta = 0;
  // assume these are row major, untransposed
  float *A = new float[M * K];
  float *B = new float[K * N];
  float *C = new float[M * N];
  for(int i = 0; i < M * K; i++) {
     A[i] = rand() / (float)RAND_MAX - 0.5f;
  }
  for(int i = 0; i < N * K; i++) {
    B[i] = rand() / (float)RAND_MAX - 0.5f;
  }
  for(int i = 0; i < M * N; i++) {
    C[i] = 0.0f;
  }

//  cout << "op(A):" << endl;
//    for(int m=0; m < M; m++) {
//      for(int k = 0; k < K; k++) {
//        cout << A[m * K + k] << " ";
//      }
//      cout << endl;
//    }

//  cout << "op(B):" << endl;
//    for(int k = 0; k < K; k++) {
//      for(int n=0; n < N; n++) {
//        cout << B[k * N + n] << " ";
//      }
//      cout << endl;
//    }

   float *Aforblas = new float[M*K];
   float *Bforblas = new float[K * N];
   float *Cforblas = new float[M*N];
   copy(Aforblas, A, M*K);
   copy(Bforblas, B, N*K);

  float *Cours = new float[M * N];

  float *Aour = new float[M * K];
  float *Bour = new float[K * N];
  copy(Aour, A, M * K);
  copy(Bour, B, N * K);
  bool flipAforblas = !(colmaj == 1) != !(transAint == 1);
  bool flipBforblas = !(colmaj == 1) != !(transBint == 1);
  if(flipAforblas) {
    transpose(Aforblas, M, K);
   }
  if(flipBforblas) {
    transpose(Bforblas, K, N);
   }
  mult(Cours, A, B, M, K, N);

//  cout<< "result from CPU: " << endl;
//  for(int m = 0; m < M; m++) {
//    for(int n = 0; n < N; n++) {
//      int i = m + n * M;
//      cout << Cours[i] << " ";
//    }
//    cout << endl;
//  }

  float *clout = new float[M * N];
  clgemm(colmaj, transa, transb, M, N, K, alpha, Aforblas, lda,
     Bforblas, ldb, beta, C, ldc, clout);
  if(colmaj == 1 ) {
    transpose(clout, N, M);
  }
  bool ok = true;
  for(int m = 0; m < M; m++) {
    for(int n = 0; n < N; n++) {
      int i = m + n * M;
//      cout << "  " << i << " " << Cours[i] << " " << clout[i] << endl;
      float diff = clout[i] - Cours[i];
      diff = diff < 0 ? - diff : diff;
      if(diff > 0.0001) {
//         cout << "ERROR " << M << " " << N << " " << K << " " << transa << " " << transb << endl;
         ok = false;
//         exit(1);
      }
    }
  }
  if(!ok) {
   cout << "ERROR colmaj=" << colmaj << " M=" << M << " N=" << N << " K=" << K << " transa=" << transa << " transb=" << transb << endl;
  }
  delete[] A;
  delete[] B;
  delete[] C;
  delete[] Cours;
  delete[] clout;
  return ok;
}

int main(int argc, char *argv[]) {
  clewInit();

  err = clGetPlatformIDs(1, &platform, NULL);
  if (err != CL_SUCCESS) {
      printf( "clGetPlatformIDs() failed with %d\n", err );
      return 1;
  }
  cout << "got platforms" << endl;

  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  if (err != CL_SUCCESS) {
      printf( "clGetDeviceIDs() failed with %d\n", err );
      return 1;
  }

  props[1] = (cl_context_properties)platform;
  ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS) {
      printf( "clCreateContext() failed with %d\n", err );
      return 1;
  }

  queue = clCreateCommandQueue(ctx, device, 0, &err);
  if (err != CL_SUCCESS) {
      printf( "clCreateCommandQueue() failed with %d\n", err );
      clReleaseContext(ctx);
      return 1;
  }

  err = clblasSetup();
  if (err != CL_SUCCESS) {
      printf("clblasSetup() failed with %d\n", err);
      clReleaseCommandQueue(queue);
      clReleaseContext(ctx);
      return 1;
  }

  test1(1, 1, 7, 1, 0, 0);
  test1(1, 1, 7, 1, 1, 1);
//  for(int colmaj = 0; colmaj <= 1; colmaj++ ) {
  int colmaj = 1; {
    for(int m=1; m <= 16; m++) {
      for(int n=1; n <= 16; n++) {
        for(int k=1; k <= 16; k++) {
  //        for(int transA =0; transA <= 1; transA++) {
          int transA = 1; {
//            for(int transB =0; transB <= 1; transB++) {
            int transB = 0; {
  //            test1();
              test1(colmaj, m, n, k, transA, transB);
            }
          }
        }
      }
    }
  }
  // check:
  // colmaj transa transb res
  // 0 0 0 ok
  // 0 1 0 ok
  // 0 0 1 FAIL 7 1 1 n t
  // 0 1 1 ok
  // 1 0 0 ok
  // 1 1 0 FAIL 1 7 1 t n
  // 1 0 1 ok
  // 1 1 1 ok

  /* Finalize work with clblas. */
  clblasTeardown();

  /* Release OpenCL working objects. */
  clReleaseCommandQueue(queue);
  clReleaseContext(ctx);

  return 0;
}

@hughperkins
Copy link
Contributor Author

(As far as "why dont I use 2.8, 2.10, eg travis build failures for 2.8 and 2.10 on mac:

@pavanky
Copy link
Contributor

pavanky commented Mar 27, 2016

Isn't the OSX issue fixed in develop?

@hughperkins
Copy link
Contributor Author

Apparently so https://travis-ci.org/hughperkins/cltorch/builds/118785864 . But it's not the only reason I prefer 2.4 actually. I'm not sure if the 'path of least resistance' is 2.10, or 2.4, but 2.4 in its favor:

  • I've used it for a year, already fixed all the bugs I've noticed (except the one in this thread.... :-( )
  • only needs OpenCL 1.1. Some mobile devices dont support 1.2
  • doesnt need python for build process
  • just seems generally simpler, more bug free (the bug in this thread not-withstanding)

@hughperkins
Copy link
Contributor Author

(well, I've migrated cltorch and clnn to use clblas 2.11/develop for now https://github.com/hughperkins/cltorch/#recent-changes Will see how that goes. If I get a zillion complaints about builds/bugs/opencl 1.2 support, I might try to fix 2.4; otherwise I note that 2.11/develop does seem to:

  • build and run ok on ubuntu 15.10
  • build ok on mac os x (including rpath works ok) https://travis-ci.org/hughperkins/cltorch/builds/118788901
  • gives correct results for all possible 2d matrices with sides from 1 to 16, and all possible combination of colmajor, transa, and transb (using above test case script)
    )

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants