Excessive Caching in Neural Network #257

unrealwill · 2021-07-28T17:13:07Z

Hello,

I'm trying to build a proto-neural-network with enzyme, aka two successive Matrix-vector product.
I tried to keep the code as simple and minimalist as possible.

The code runs fine but when I pass -Rpass=enzyme it indicates that it's caching and recomputing whereas it shouldn't need any memory allocation, as I'm preallocating the intermediate buffers, nor recomputation as I'm preserving the intermediate layers.

I have put restrict everywhere I can, but what am I doing wrong ?

Thanks

bugDense.cpp

#include <iostream>
using namespace std;

extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

void __enzyme_autodiff(...);

inline void zero( double*__restrict__ v, int n)
{
  for( int i = 0 ; i < n ; i++) v[i] = 0.0;
}

void dense( double*__restrict__ A, double* __restrict__  x, double* __restrict__ out, int n, int m)
{
    zero( out, n );
    for( int i = 0 ; i < n ; i++ )
      for( int j = 0 ; j < m ; j++)
          out[i] += A[i*m+j] *x[j];
}


inline void rangep1( double*__restrict__ v, int n)
{
  for( int i = 0 ; i < n ; i++) v[i] = i+1;
}

template<typename T>
T sq( T x)
{
  return x*x;
}

inline void printVector( double*__restrict__ x, int n )
{
    for( int i = 0 ; i < n ; i++)
    {
      cout << x[i] << endl;
    }
    cout << endl;
}

inline void printMatrix( double*__restrict__ A, int n1, int n2 )
{
  for( int i = 0 ; i < n1 ; i++ )
  {
    for( int j = 0 ; j < n2 ; j++)
    {
      cout << A[i*n2+j] << " ";
    }
    cout << endl;
  }
}

class Fun2Params
{
public:
    Fun2Params(int featDim, int d)
    {
       A = new double[featDim*d];
       B = new double[featDim*featDim];
       rangep1(A,featDim*d);
       rangep1(B,featDim*featDim);
    }
    double* __restrict__ A;
    double* __restrict__ B;
};

class Fun2Memory
{
public:
    Fun2Memory(int featDim)
    {
      y0 = new double[featDim];
      y1 = new double[featDim];
      zero(y0,featDim);
      zero(y1,featDim);
    }
    double* __restrict__ y0;
    double* __restrict__ y1;
};

class Fun2
{
public:
  Fun2(int featDim, int d):featDim(featDim),d(d)
  {
      p = new double[d];
      rangep1(p,d);
  }
 double*  __restrict__ p;
 int featDim;
 int d;
};

void structuredFun2 (Fun2Params* __restrict__  x, Fun2Memory*  __restrict__ y,  double*  __restrict__  out ,Fun2* __restrict__ parameters )
{
    int d = parameters->d;
    int featDim = parameters->featDim;
    printf("featDim %d\n", featDim);

    dense( x->A, parameters->p, y->y0,featDim,d);
    dense( x->B, y->y0, y->y1,featDim,featDim);

    double temp = 0.0;
    for( int i= 0; i < featDim ; i++)
    {
      temp += sq(y->y0[i]) ;
      temp += sq(y->y1[i]);
    }
    *out = temp;
}

void testFun2()
{
  int d = 2;
  int featDim = 6;

  Fun2Params fp(d,featDim);
  Fun2Params dfp(d,featDim);

  Fun2Memory fm(featDim);
  Fun2Memory dfm(featDim);

  Fun2 fun2(featDim,d);

  double dout = 1.0;
  double out=0.0;
  __enzyme_autodiff(structuredFun2,
                                      enzyme_dup, &fp,&dfp,
                                      enzyme_dup, &fm ,&dfm,
                                      enzyme_dup,&out,&dout,
                                      enzyme_const, fun2);
  cout << "out " << endl;
  cout << out << endl;
  cout << "dfp.A " << endl;
  printMatrix( dfp.A,featDim,d);
  cout << "dfp.B" << endl;
  printMatrix( dfp.B,featDim,featDim);
  cout << endl;
}

int main(int argc, char** argv )
{
  cout<<"bugDense "<<endl;
  testFun2();
  return 0;
}

Compilation with :
clang bugDense.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugDense

Output :

remark: Load may need caching   %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !44, !alias.scope !46, !noalias !38 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %9 = load double, double* %arrayidx.i, align 8, !dbg !53, !tbaa !44, !alias.scope !54, !noalias !55 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
remark: Load may need caching   %arrayidx9.promoted.i45 = load double, double* %arrayidx9.i44, align 8, !tbaa !44, !alias.scope !80, !noalias !75 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
bugDense.cpp:21:31: remark: Load must be recomputed   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 in reverse_invertfor.body4.i due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %13 = load double, double* %arrayidx6.i, align 8, !dbg !55, !tbaa !44, !alias.scope !56, !noalias !57 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %31 = load double, double* %arrayidx6.i54, align 8, !dbg !93, !tbaa !45, !alias.scope !94, !noalias !95 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load must be recomputed   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:21: remark: Caching instruction   %35 = load double, double* %arrayidx.i53, align 8, !dbg !91, !tbaa !45, !alias.scope !92, !noalias !93 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense 
featDim 6
out 
5.72725e+08
dfp.A 
7.83671e+06 1.56734e+07 
9.69974e+06 1.93995e+07 
1.76338e+07 3.52676e+07 
2.8622e+07 5.7244e+07 
4.65207e+07 9.30413e+07 
7.4193e+07 1.48386e+08 
dfp.B
1.19345e+06 2.62559e+06 4.05773e+06 5.48987e+06 6.92201e+06 8.35415e+06 
2.6271e+06 5.77963e+06 8.93216e+06 1.20847e+07 1.52372e+07 1.83897e+07 
4.0644e+06 8.94168e+06 1.38191e+07 1.86968e+07 2.35748e+07 2.84511e+07 
5.48659e+06 1.20907e+07 1.86488e+07 2.52308e+07 7.83671e+06 9.69973e+06 
1.76338e+07 2.8622e+07 4.65206e+07 7.4193e+07 4.01854e+07 4.84996e+07 
238690 525420 812880 1.09699e+06 1.3857e+06 1.71612e+06

The text was updated successfully, but these errors were encountered:

wsmoses · 2021-07-29T00:38:26Z

I've reproduced this and deduced that we weren't enabling scoped alias analysis (and thus Enzyme, without that improved alias info, had to assume extra things cached). Happily turning that on eliminates the caches completely (which is undergoing a PR now).

unrealwill · 2021-07-29T06:23:31Z

Thanks for the quick fix ! 👍

unrealwill · 2021-07-29T12:27:56Z

FYI, I cloned a fresh enzyme revision 6117bbd which should contain the current fix, then rebuild, and reinstall Enzyme.
But I still observe the bug with clang version 11.1.0
I'll try a more recent clang (hopefully it will have a compatible Cuda version).

unrealwill · 2021-07-29T15:18:36Z

The bug is also present with
clang version 12.0.1 freshly installed from https://github.com/llvm/llvm-project/tree/release/12.x

I'll try clang's mainline

wsmoses · 2021-07-29T15:23:53Z

Just to confirm, can you past the output of the analysis?

Specifically, the thing to look for is that there's no more lines like bugDense.cpp:21:31: remark: Caching instruction

There will still exist some lines like bugDense.cpp:21:21: remark: Load may need caching which effectively say that you're overwriting out and if you need the old value of out, it needs to be cached because of this store. A different analysis (differential use analysis) will determine that Enzyme won't need the old value of out in the reverse, and thus there's no need to cache it (and thus no Caching instruction...)

unrealwill · 2021-07-29T15:50:47Z

OK, it still spits plenty of lines but looking more carefully there is no longer "remark: Caching instruction".
👍 so I guess it should be OK now.

There is still a "remark: Load must be recomputed" though (but I care less about this provided it's not quadratic)

Here is the new output :

clang bugDense.cpp  -lstdc++ -lm  -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-12.so  -O2  -o bin/bugDense
remark: Load may need caching   %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !47, !alias.scope !32, !noalias !44 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !49, !tbaa !47, !alias.scope !32, !noalias !44 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %10 = load double, double* %arrayidx.i, align 8, !dbg !56, !tbaa !47, !alias.scope !26, !noalias !57 due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %11 = load double, double* %arrayidx6.i, align 8, !dbg !58, !tbaa !47, !alias.scope !30, !noalias !59 due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
remark: Load may need caching   %arrayidx9.promoted.i43 = load double, double* %arrayidx9.i42, align 8, !tbaa !47, !alias.scope !74, !noalias !83 due to   store double %add10.i54, double* %arrayidx9.i42, align 8, !dbg !86, !tbaa !47, !alias.scope !74, !noalias !83 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed   %11 = load double, double* %arrayidx6.i, align 8, !dbg !58, !tbaa !47, !alias.scope !30, !noalias !59 in reverse_invertfor.body4.i due to   tail call void @llvm.memset.p0i8.i64(i8* align 8 %v6.i.i34, i8 0, i64 %15, i1 false) #12, !dbg !79, !alias.scope !80, !noalias !83 [-Rpass=enzyme]

wsmoses mentioned this issue Jul 29, 2021

Enable scope AA #258

Merged

wsmoses closed this as completed in #258 Jul 29, 2021

unrealwill mentioned this issue Aug 9, 2021

Excessive Caching in neural network mk2 #278

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Excessive Caching in Neural Network #257

Excessive Caching in Neural Network #257

unrealwill commented Jul 28, 2021

wsmoses commented Jul 29, 2021

unrealwill commented Jul 29, 2021

unrealwill commented Jul 29, 2021

unrealwill commented Jul 29, 2021

wsmoses commented Jul 29, 2021 •

edited

Loading

unrealwill commented Jul 29, 2021

Excessive Caching in Neural Network #257

Excessive Caching in Neural Network #257

Comments

unrealwill commented Jul 28, 2021

wsmoses commented Jul 29, 2021

unrealwill commented Jul 29, 2021

unrealwill commented Jul 29, 2021

unrealwill commented Jul 29, 2021

wsmoses commented Jul 29, 2021 • edited Loading

unrealwill commented Jul 29, 2021

wsmoses commented Jul 29, 2021 •

edited

Loading