Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Excessive Caching in Neural Network #257

Closed
unrealwill opened this issue Jul 28, 2021 · 6 comments · Fixed by #258
Closed

Excessive Caching in Neural Network #257

unrealwill opened this issue Jul 28, 2021 · 6 comments · Fixed by #258

Comments

@unrealwill
Copy link

Hello,

I'm trying to build a proto-neural-network with enzyme, aka two successive Matrix-vector product.
I tried to keep the code as simple and minimalist as possible.

The code runs fine but when I pass -Rpass=enzyme it indicates that it's caching and recomputing whereas it shouldn't need any memory allocation, as I'm preallocating the intermediate buffers, nor recomputation as I'm preserving the intermediate layers.

I have put restrict everywhere I can, but what am I doing wrong ?

Thanks

bugDense.cpp

#include <iostream>
using namespace std;

extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

void __enzyme_autodiff(...);

inline void zero( double*__restrict__ v, int n)
{
  for( int i = 0 ; i < n ; i++) v[i] = 0.0;
}

void dense( double*__restrict__ A, double* __restrict__  x, double* __restrict__ out, int n, int m)
{
    zero( out, n );
    for( int i = 0 ; i < n ; i++ )
      for( int j = 0 ; j < m ; j++)
          out[i] += A[i*m+j] *x[j];
}


inline void rangep1( double*__restrict__ v, int n)
{
  for( int i = 0 ; i < n ; i++) v[i] = i+1;
}

template<typename T>
T sq( T x)
{
  return x*x;
}

inline void printVector( double*__restrict__ x, int n )
{
    for( int i = 0 ; i < n ; i++)
    {
      cout << x[i] << endl;
    }
    cout << endl;
}

inline void printMatrix( double*__restrict__ A, int n1, int n2 )
{
  for( int i = 0 ; i < n1 ; i++ )
  {
    for( int j = 0 ; j < n2 ; j++)
    {
      cout << A[i*n2+j] << " ";
    }
    cout << endl;
  }
}

class Fun2Params
{
public:
    Fun2Params(int featDim, int d)
    {
       A = new double[featDim*d];
       B = new double[featDim*featDim];
       rangep1(A,featDim*d);
       rangep1(B,featDim*featDim);
    }
    double* __restrict__ A;
    double* __restrict__ B;
};

class Fun2Memory
{
public:
    Fun2Memory(int featDim)
    {
      y0 = new double[featDim];
      y1 = new double[featDim];
      zero(y0,featDim);
      zero(y1,featDim);
    }
    double* __restrict__ y0;
    double* __restrict__ y1;
};

class Fun2
{
public:
  Fun2(int featDim, int d):featDim(featDim),d(d)
  {
      p = new double[d];
      rangep1(p,d);
  }
 double*  __restrict__ p;
 int featDim;
 int d;
};

void structuredFun2 (Fun2Params* __restrict__  x, Fun2Memory*  __restrict__ y,  double*  __restrict__  out ,Fun2* __restrict__ parameters )
{
    int d = parameters->d;
    int featDim = parameters->featDim;
    printf("featDim %d\n", featDim);

    dense( x->A, parameters->p, y->y0,featDim,d);
    dense( x->B, y->y0, y->y1,featDim,featDim);

    double temp = 0.0;
    for( int i= 0; i < featDim ; i++)
    {
      temp += sq(y->y0[i]) ;
      temp += sq(y->y1[i]);
    }
    *out = temp;
}

void testFun2()
{
  int d = 2;
  int featDim = 6;

  Fun2Params fp(d,featDim);
  Fun2Params dfp(d,featDim);

  Fun2Memory fm(featDim);
  Fun2Memory dfm(featDim);

  Fun2 fun2(featDim,d);

  double dout = 1.0;
  double out=0.0;
  __enzyme_autodiff(structuredFun2,
                                      enzyme_dup, &fp,&dfp,
                                      enzyme_dup, &fm ,&dfm,
                                      enzyme_dup,&out,&dout,
                                      enzyme_const, fun2);
  cout << "out " << endl;
  cout << out << endl;
  cout << "dfp.A " << endl;
  printMatrix( dfp.A,featDim,d);
  cout << "dfp.B" << endl;
  printMatrix( dfp.B,featDim,featDim);
  cout << endl;
}

int main(int argc, char** argv )
{
  cout<<"bugDense "<<endl;
  testFun2();
  return 0;
}

Compilation with :
clang bugDense.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugDense

Output :

remark: Load may need caching   %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !44, !alias.scope !46, !noalias !38 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %9 = load double, double* %arrayidx.i, align 8, !dbg !53, !tbaa !44, !alias.scope !54, !noalias !55 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
remark: Load may need caching   %arrayidx9.promoted.i45 = load double, double* %arrayidx9.i44, align 8, !tbaa !44, !alias.scope !80, !noalias !75 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
bugDense.cpp:21:31: remark: Load must be recomputed   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 in reverse_invertfor.body4.i due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %13 = load double, double* %arrayidx6.i, align 8, !dbg !55, !tbaa !44, !alias.scope !56, !noalias !57 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %31 = load double, double* %arrayidx6.i54, align 8, !dbg !93, !tbaa !45, !alias.scope !94, !noalias !95 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load must be recomputed   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:21: remark: Caching instruction   %35 = load double, double* %arrayidx.i53, align 8, !dbg !91, !tbaa !45, !alias.scope !92, !noalias !93 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense 
featDim 6
out 
5.72725e+08
dfp.A 
7.83671e+06 1.56734e+07 
9.69974e+06 1.93995e+07 
1.76338e+07 3.52676e+07 
2.8622e+07 5.7244e+07 
4.65207e+07 9.30413e+07 
7.4193e+07 1.48386e+08 
dfp.B
1.19345e+06 2.62559e+06 4.05773e+06 5.48987e+06 6.92201e+06 8.35415e+06 
2.6271e+06 5.77963e+06 8.93216e+06 1.20847e+07 1.52372e+07 1.83897e+07 
4.0644e+06 8.94168e+06 1.38191e+07 1.86968e+07 2.35748e+07 2.84511e+07 
5.48659e+06 1.20907e+07 1.86488e+07 2.52308e+07 7.83671e+06 9.69973e+06 
1.76338e+07 2.8622e+07 4.65206e+07 7.4193e+07 4.01854e+07 4.84996e+07 
238690 525420 812880 1.09699e+06 1.3857e+06 1.71612e+06
@wsmoses
Copy link
Member

wsmoses commented Jul 29, 2021

I've reproduced this and deduced that we weren't enabling scoped alias analysis (and thus Enzyme, without that improved alias info, had to assume extra things cached). Happily turning that on eliminates the caches completely (which is undergoing a PR now).

@unrealwill
Copy link
Author

Thanks for the quick fix ! 👍

@unrealwill
Copy link
Author

FYI, I cloned a fresh enzyme revision 6117bbd which should contain the current fix, then rebuild, and reinstall Enzyme.
But I still observe the bug with clang version 11.1.0
I'll try a more recent clang (hopefully it will have a compatible Cuda version).

@unrealwill
Copy link
Author

The bug is also present with
clang version 12.0.1 freshly installed from https://github.com/llvm/llvm-project/tree/release/12.x

I'll try clang's mainline

@wsmoses
Copy link
Member

wsmoses commented Jul 29, 2021

Just to confirm, can you past the output of the analysis?

Specifically, the thing to look for is that there's no more lines like bugDense.cpp:21:31: remark: Caching instruction

There will still exist some lines like bugDense.cpp:21:21: remark: Load may need caching which effectively say that you're overwriting out and if you need the old value of out, it needs to be cached because of this store. A different analysis (differential use analysis) will determine that Enzyme won't need the old value of out in the reverse, and thus there's no need to cache it (and thus no Caching instruction...)

@unrealwill
Copy link
Author

OK, it still spits plenty of lines but looking more carefully there is no longer "remark: Caching instruction".
👍 so I guess it should be OK now.

There is still a "remark: Load must be recomputed" though (but I care less about this provided it's not quadratic)

Here is the new output :

clang bugDense.cpp  -lstdc++ -lm  -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-12.so  -O2  -o bin/bugDense
remark: Load may need caching   %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !47, !alias.scope !32, !noalias !44 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !49, !tbaa !47, !alias.scope !32, !noalias !44 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %10 = load double, double* %arrayidx.i, align 8, !dbg !56, !tbaa !47, !alias.scope !26, !noalias !57 due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %11 = load double, double* %arrayidx6.i, align 8, !dbg !58, !tbaa !47, !alias.scope !30, !noalias !59 due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
remark: Load may need caching   %arrayidx9.promoted.i43 = load double, double* %arrayidx9.i42, align 8, !tbaa !47, !alias.scope !74, !noalias !83 due to   store double %add10.i54, double* %arrayidx9.i42, align 8, !dbg !86, !tbaa !47, !alias.scope !74, !noalias !83 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed   %11 = load double, double* %arrayidx6.i, align 8, !dbg !58, !tbaa !47, !alias.scope !30, !noalias !59 in reverse_invertfor.body4.i due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants