Merge pull request #2 from sifive/myeh/invscalv_scrub2

Restore changes from sifive-blis-private#28
flame · Oct 12, 2023 · 8663e95
2 parents efd65d3 + a0ec386
commit 8663e95
Show file tree

Hide file tree

Showing 6 changed files with 75 additions and 137 deletions.
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -85,8 +85,7 @@ void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -147,8 +146,7 @@ void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -214,8 +212,7 @@ void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -278,8 +275,7 @@ void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }

diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -50,6 +50,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const float* restrict x = x_;
     float* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -69,10 +71,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -93,8 +93,11 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
 void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
                      void * restrict y_, inc_t incy, const cntx_t *cntx) {
     (void)conjx;
+    (void)cntx;
     const double* restrict x = x_;
     double* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -114,10 +117,8 @@ void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -144,6 +145,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const scomplex* restrict x = x_;
     scomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -164,10 +167,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     } else {
@@ -189,50 +190,10 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
-        /*
-        // After some benchmarks, it looks like using vl(s)e and vs(s)e with
-        masked
-        // instructions for conjugation is faster than using segment loads and
-        stores.
-        // We'll use the segment load/store version for now, but I'd like to
-        leave this
-        // code here (but commented out) for possible future use.
-        size_t avl = n;
-        // 0xA = 0b1010
-        // this masks off the real parts, so only the imaginary parts are
-        negated
-        // this mask is large enough only for vl <= 64
-        uint64_t mask[1] = {0xAAAAAAAAAAAAAAAA};
-        __asm__("vsetivli zero, 1, e64, m1, ta, ma");
-        __asm__("vle64.v v0, (%0)" : : "r"(mask));
-        while (avl) {
-          size_t vl;
-          __asm__ volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) :
-        "r"(avl)); if (incx == 8)
-            __asm__("vle64.v v4, (%0)" : : "r"(x));
-          else
-            __asm__("vlse64.v v4, (%0), %1" : : "r"(x), "r"(incx));
-          // set vl = VLMAX
-          __asm__ volatile("vsetvli t0, zero, e32, m4, ta, ma");
-          __asm__("vfneg.v v4, v4, v0.t");
-          __asm__ volatile ("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(avl));
-          if (incy == 8)
-            __asm__("vse64.v v4, (%0)" : : "r"(y));
-          else
-            __asm__("vsse64.v v4, (%0), %1" : : "r"(y), "r"(incy));
-          inc_t tmp1 = vl * incx;
-          inc_t tmp2 = vl * incy;
-          __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-          __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
-          avl -= vl;
-        }
-        */
     }
     return;
 }
@@ -263,6 +224,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const dcomplex* restrict x = x_;
     dcomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -300,10 +263,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     }

diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -49,6 +49,8 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                            const cntx_t *cntx) {
     (void)cntx;
     float* restrict x = x_;
+    if (n <= 0)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -68,8 +70,7 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -93,6 +94,8 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                            const cntx_t *cntx) {
     (void)cntx;
     double* restrict x = x_;
+    if (n <= 0)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -112,8 +115,7 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -136,6 +138,8 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                            const cntx_t *cntx) {
     (void)cntx;
     scomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -161,8 +165,7 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -184,6 +187,8 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                            const cntx_t *cntx) {
     (void)cntx;
     dcomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -209,8 +214,7 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;

diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -53,6 +53,8 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const float* restrict alpha = alpha_;
     float* restrict x = x_;
+    if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -74,8 +76,7 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -104,6 +105,8 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const double* restrict alpha = alpha_;
     double* restrict x = x_;
+    if (n <= 0 || *alpha == 0. || *alpha == 1.)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -125,8 +128,7 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -157,6 +159,8 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const scomplex* restrict alpha = alpha_;
     scomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -188,8 +192,7 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -223,6 +226,8 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const dcomplex* restrict alpha = alpha_;
     dcomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -254,8 +259,7 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;