Merge pull request #312 from kostrzewa/benchmark.refactor

benchmark: estimate appropriate number of iterations and give structured output
etmc · Mar 17, 2016 · fc9a8ee
2 parents 9e46ed9 + 86da1a2
commit fc9a8ee
Showing 1 changed file with 102 additions and 57 deletions.
diff --git a/benchmark.c b/benchmark.c
@@ -254,56 +254,82 @@ int main(int argc,char *argv[])
 #endif
 
   if(even_odd_flag) {
+    sdt=0.; sqdt=0.0;
     /*initialize the pseudo-fermion fields*/
-    j_max=2048;
-    sdt=0.;
     for (k = 0; k < k_max; k++) {
       random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
     }
 
-    while(sdt < 30.) {
+    j_max=512;
+    antioptaway=0.0;
+    /* compute approximately how many applications we need to do to get a reliable measurement */
 #ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-      t1 = gettime();
-      antioptaway=0.0;
-      for (j=0;j<j_max;j++) {
-        for (k=0;k<k_max;k++) {
-          Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
-          Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
-          antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
-        }
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
+        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
+        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
       }
-      t2 = gettime();
-      dt = t2-t1;
+    }
+    dt = gettime()-t1;
+    // division by g_nproc because the MPI_SUM reduction below then yields the average over processes
+    j = (int)(ceil(j_max*31.0/dt/g_nproc));
 #ifdef MPI
-      MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&j,&j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
-      sdt = dt;
+    j_max = j;
 #endif
-      qdt=dt*dt;
+
+
+
+    /* perform the actual benchmark */
 #ifdef MPI
-      MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-      sqdt = qdt;
+    MPI_Barrier(MPI_COMM_WORLD);
 #endif
-      sdt=sdt/((double)g_nproc);
-      sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
-      j_max*=2;
+    t1 = gettime();
+    antioptaway=0.0;
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
+        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
+        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
+      }
     }
-    j_max=j_max/2;
+    dt = gettime()-t1;
+#ifdef MPI
+    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sdt = dt;
+#endif
+
+    qdt=dt*dt;
+#ifdef MPI
+    MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sqdt = qdt;
+#endif
+
+    sdt=sdt/((double)g_nproc);
+    sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
+
     dts=dt;
     sdt=1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
     sqdt=1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));
 
     if(g_proc_id==0) {
       printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
       printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max);
-      printf("# Communication switched on:\n# (%d Mflops [%d bit arithmetic])\n", (int)(1608.0f/sdt),(int)sizeof(spinor)/3);
+#ifdef MPI
+      printf("# Communication switched on: \n");
+#endif
+      printf("\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/sdt),(int)(1608.0f/sdt));
 #ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1608.0f/(omp_num_threads*sdt)));
+      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*sdt)));
 #endif
-      printf("\n");
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
 
@@ -329,11 +355,11 @@ int main(int argc,char *argv[])
     dt=1.0e6f*dt/((double)(k_max*j_max*(VOLUME)));
     if(g_proc_id==0) {
       printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n",antioptaway);
-      printf("# Communication switched off: \n# (%d Mflops [%d bit arithmetic])\n", (int)(1608.0f/dt),(int)sizeof(spinor)/3);
+      printf("# Communication switched off: \n\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/dt),(int)(1608.0f/dt));
 #ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1608.0f/(omp_num_threads*dt)));
+      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*dt)));
 #endif
-      printf("\n"); 
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
     sdt=sdt/((double)k_max);
@@ -353,56 +379,75 @@ int main(int argc,char *argv[])
   else {
     /* the non even/odd case now */
     /*initialize the pseudo-fermion fields*/
-    j_max=1;
+    j_max=128;
     sdt=0.;
     for (k=0;k<k_max;k++) {
       random_spinor_field_lexic(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
     }
 
-    while(sdt < 3.) {
+    /* estimate a reasonable number of applications to get a reliable measurement */
 #ifdef MPI
-      MPI_Barrier(MPI_COMM_WORLD);
-#endif
-      t1 = gettime();
-      for (j=0;j<j_max;j++) {
-        for (k=0;k<k_max;k++) {
-          D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
-          antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
-        }
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
+        antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
       }
-      t2 = gettime();
-      dt=t2-t1;
+    }
+    t2 = gettime();
+    dt=t2-t1;
+    // division by g_nproc because we will average over processes using MPI_SUM
+    j = (int)(ceil(j_max*31.0/dt/g_nproc));
 #ifdef MPI
-      MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&j,&j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
-      sdt = dt;
+    j_max = j;
 #endif
-      qdt=dt*dt;
+
+    /* perform the actual measurement */
 #ifdef MPI
-      MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-      sqdt = qdt;
+    MPI_Barrier(MPI_COMM_WORLD);
 #endif
-      sdt=sdt/((double)g_nproc);
-      sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
-      j_max*=2;
+    t1 = gettime();
+    for (j=0;j<j_max;j++) {
+      for (k=0;k<k_max;k++) {
+        D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
+        antioptaway+=creal(g_spinor_field[k+k_max][0].s0.c0);
+      }
     }
-    j_max=j_max/2;
+    t2 = gettime();
+    dt=t2-t1;
+#ifdef MPI
+    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sdt = dt;
+#endif
+    qdt=dt*dt;
+#ifdef MPI
+    MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+    sqdt = qdt;
+#endif
+    sdt=sdt/((double)g_nproc);
+    sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
     dts=dt;
     sdt=1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
     sqdt=1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));
 
     if(g_proc_id==0) {
       printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
-      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max);
-      printf("\n# (%d Mflops [%d bit arithmetic])\n", (int)(1680.0f/sdt),(int)sizeof(spinor)/3);
+      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n\n", sdt, sqdt, j_max);
+      printf(" %12d Mflops(total) %8d Mflops(process)", (int)(1680.0f*g_nproc/sdt),(int)(1680.0f/sdt));
 #ifdef OMP
-      printf("# Mflops per OpenMP thread ~ %d\n",(int)(1680.0f/(omp_num_threads*sdt)));
+      printf(" %8d Mflops(thread)",(int)(1680.0f/(omp_num_threads*sdt)));
 #endif
-      printf("\n"); 
+      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
       fflush(stdout);
     }
   }
+
 #ifdef HAVE_LIBLEMON
   if(g_proc_id==0) {
     printf("# Performing parallel IO test ...\n");