Hook up some SSE2

ZDoom · Dec 1, 2019 · 2c751d2 · alexey-lysiuk · Dec 2, 2019 · coelckers
1 parent 4a25c9f
commit 2c751d2
Showing 1 changed file with 63 additions and 0 deletions.
diff --git a/src/rendering/polyrenderer/drawers/screen_triangle.cpp b/src/rendering/polyrenderer/drawers/screen_triangle.cpp
@@ -39,6 +39,7 @@
 #include "x86.h"
 #include <cmath>
 
+#ifdef NO_SSE
 static void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
 {
 	float startX = x0 + (0.5f - args->v1->x);
@@ -53,6 +54,36 @@ static void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyT
 		posW += stepW;
 	}
 }
+#else
+// SSE implementation of WriteW.
+//
+// Fills thread->scanline.W[x0 .. x1) for scanline y. The interpolated value
+// starts at args->v1->w plus the X and Y gradients applied to the offsets
+// (x0 + 0.5 - v1->x) and (y + 0.5 - v1->y), and advances by args->gradientX.W
+// for every pixel. Each stored element is the reciprocal of that value.
+//
+// Pixels are processed four at a time with _mm_rcp_ps, which yields a
+// reduced-precision reciprocal approximation; the 0-3 pixels left over at the
+// end of the span use an exact scalar division.
+static void WriteW(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
+{
+	float startX = x0 + (0.5f - args->v1->x);
+	float startY = y + (0.5f - args->v1->y);
+
+	float posW = args->v1->w + args->gradientX.W * startX + args->gradientY.W * startY;
+	float stepW = args->gradientX.W;
+	float* w = thread->scanline.W;
+
+	// Round the span length DOWN to a multiple of four so the four-wide
+	// unaligned stores below can never touch w[x1] or anything beyond it.
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	// Lane i holds the value for pixel x + i; one iteration advances all lanes by four pixels.
+	__m128 mstepW = _mm_set1_ps(stepW * 4.0f);
+	__m128 mposW = _mm_setr_ps(posW, posW + stepW, posW + stepW + stepW, posW + stepW + stepW + stepW);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		_mm_storeu_ps(w + x, _mm_rcp_ps(mposW));
+		mposW = _mm_add_ps(mposW, mstepW);
+	}
+
+	// Catch the scalar accumulator up with the pixels handled by the vector loop.
+	posW += ssecount * stepW;
+	for (int x = sseend; x < x1; x++)
+	{
+		w[x] = 1.0f / posW;
+		posW += stepW;
+	}
+}
+#endif
 
 static void WriteLightArray(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
 {
@@ -117,6 +148,7 @@ static void WriteLightArray(int y, int x0, int x1, const TriDrawTriangleArgs* ar
 	}
 }
 
+#ifdef NO_SSE
 static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying)
 {
 	for (int x = x0; x < x1; x++)
@@ -125,6 +157,29 @@ static void WriteVarying(float pos, float step, int x0, int x1, const float* w,
 		pos += step;
 	}
 }
+#else
+// SSE implementation of WriteVarying.
+//
+// For every x in [x0, x1) stores varying[x] = p * w[x], where p starts at
+// `pos` for x0 and grows by `step` per pixel.
+//
+//   pos      value of the interpolant at x0
+//   step     per-pixel increment of the interpolant
+//   x0, x1   half-open pixel range to fill
+//   w        per-pixel factors, read at indices [x0, x1)
+//   varying  output array, written at indices [x0, x1)
+//
+// Four pixels are handled per vector iteration; the 0-3 remaining pixels are
+// finished by the scalar loop.
+static void WriteVarying(float pos, float step, int x0, int x1, const float* w, float* varying)
+{
+	// Round the span length DOWN to a multiple of four so the four-wide
+	// unaligned loads/stores below stay strictly inside [x0, x1).
+	int ssecount = ((x1 - x0) & ~3);
+	int sseend = x0 + ssecount;
+
+	// Lane i holds the interpolant for pixel x + i; one iteration advances all lanes by four pixels.
+	__m128 mstep = _mm_set1_ps(step * 4.0f);
+	__m128 mpos = _mm_setr_ps(pos, pos + step, pos + step + step, pos + step + step + step);
+
+	for (int x = x0; x < sseend; x += 4)
+	{
+		_mm_storeu_ps(varying + x, _mm_mul_ps(mpos, _mm_loadu_ps(w + x)));
+		mpos = _mm_add_ps(mpos, mstep);
+	}
+
+	// Catch the scalar accumulator up with the pixels handled by the vector loop.
+	pos += ssecount * step;
+	for (int x = sseend; x < x1; x++)
+	{
+		varying[x] = pos * w[x];
+		pos += step;
+	}
+}
+#endif
 
 static void WriteVaryings(int y, int x0, int x1, const TriDrawTriangleArgs* args, PolyTriangleThreadData* thread)
 {
@@ -283,10 +338,18 @@ static void WriteStencil(int y, int x0, int x1, PolyTriangleThreadData* thread)
 	}
 }
 
+#ifdef NO_SSE
 // Portable fallback: returns value minus floor(value), so negative inputs
 // wrap upward (for example -0.25 becomes 0.75).
 static float wrap(float value)
 {
 	const float whole = std::floor(value);
 	return value - whole;
 }
+#else
+// Returns value minus floor(value), so negative inputs wrap upward
+// (for example -0.25 becomes 0.75).
+//
+// This deliberately uses std::floor instead of the _mm_floor_ss intrinsic:
+// _mm_floor_ss belongs to SSE4.1, while this branch is only gated on NO_SSE
+// being undefined, so the intrinsic could fail to build or fault at runtime on
+// targets that provide nothing beyond SSE2. The result is identical.
+static float wrap(float value)
+{
+	return value - std::floor(value);
+}
+#endif
 
 static uint32_t sampleTexture(float u, float v, const uint32_t* texPixels, int texWidth, int texHeight)
 {