Skip to content

Commit

Permalink
sf.net #572: gas+x86+SSE faulty optimisations
Browse files Browse the repository at this point in the history
- with '-vec 2' (intra-expression vectorisation), the horizontal add optimisation was generating a bad AST, resulting in bad assembly
  • Loading branch information
jayrm committed Jan 11, 2021
1 parent 8af0326 commit be3817c
Show file tree
Hide file tree
Showing 4 changed files with 292 additions and 3 deletions.
1 change: 1 addition & 0 deletions changelog.txt
Expand Up @@ -112,6 +112,7 @@ Version 1.08.0
- gfxlib2: character data was incorrectly stored for values >= 128 causing incorrect values returned from SCREEN() function
- LLVM backend: escape procedure names on win32 targets to avoid having llvm mangle the names also
- sf.net #572: don't call vectorize operations for anything but gas+x86+sse backend / options
- sf.net #572: gas+x86+SSE faulty optimisations: with '-vec 2' (intra-expression vectorisation), the horizontal add optimisation was generating a bad AST, resulting in bad assembly


Version 1.07.0
Expand Down
32 changes: 29 additions & 3 deletions src/compiler/ast-vectorize.bas
Expand Up @@ -574,30 +574,56 @@ private function astIntraTreeVectorize _
if( n->class = AST_NODECLASS_BOP ) then
if( n->op.op = AST_OP_ADD ) then

'' test if nodes can be merged / vectorized
'' careful, maxVectorWidth & vectorWidth are shared
maxVectorWidth = 4
vectorWidth = 0

if( hMergeNode( n->l, n->r, FALSE ) ) then

'' go ahead and do the merge / vectorize
''
maxVectorWidth = 4
vectorWidth = 0
hMergeNode( n->l, n->r, TRUE )

'' check for multiple HADDs
'' n = AST_OP_ADD, can be removed
'' n->l = vectorized node or an existing AST_OP_HADD
'' n->r = can be discarded

assert( n->l )

l = n->l

'' check for multiple AST_OP_HADD's
if( l->class = AST_NODECLASS_UOP ) then
if( l->op.op = AST_OP_HADD ) then

'' replace the callers node
*n = *l

assert( n->l )
assert( n->l->vector <> 0 )

'' copy the new value of the vector to AST_OP_HADD node
n->vector = n->l->vector

'' remove this node
astDelNode( l )
n->vector = 0
return TRUE
end if
end if

'' n = AST_OP_ADD, can be replaced
'' n->l = vectorized node
'' n->r = can be discarded
assert( n->l )

astDelTree( n->r )
n->r = NULL
n->class = AST_NODECLASS_UOP
n->op.op = AST_OP_HADD
n->vector = 0
n->vector = n->l->vector

return TRUE
end if
Expand Down
6 changes: 6 additions & 0 deletions src/compiler/emit_SSE.bas
Expand Up @@ -1094,6 +1094,7 @@ private sub _emitADDF_SSE _
else
'' This should never happen due to IR_OPT_FPUCONV
outp " implement 'add integer to float'"
assert( 0 )
end if
end sub

Expand Down Expand Up @@ -1174,6 +1175,7 @@ private sub _emitSUBF_SSE _
else
'' This should never happen due to IR_OPT_FPUCONV
outp " implement 'subtract integer from float'"
assert( 0 )
end if
end sub

Expand Down Expand Up @@ -1253,6 +1255,7 @@ private sub _emitMULF_SSE _
else
'' This should never happen due to IR_OPT_FPUCONV
outp " implement 'multiply float by integer'"
assert( 0 )
end if

end sub
Expand Down Expand Up @@ -1333,6 +1336,7 @@ private sub _emitDIVF_SSE _
else
'' This should never happen due to IR_OPT_FPUCONV
outp " implement 'divide float by integer'"
assert( 0 )
end if

end sub
Expand Down Expand Up @@ -1764,6 +1768,8 @@ private sub _emitHADDF_SSE _
outp "addps " + dst + COMMA + "xmm7"
outp "pshufd xmm7" + COMMA + dst + COMMA + "0x01"
outp "addss " + dst + COMMA + "xmm7"
else
assert( 0 )
end if
end if

Expand Down
256 changes: 256 additions & 0 deletions tests/optimizations/vector.bas
Expand Up @@ -157,6 +157,262 @@ SUITE( fbc_tests.optimizations.vector )

END_TEST

TEST( fvec3_intra )

'' Sums of single-precision fields taken from one or two FVec3 variables,
'' written in every operand order, and assigned to a single field (c3.x).
'' NOTE(review): presumably meant to exercise intra-expression
'' vectorisation ('-vec 2') of additions - confirm against the build
'' options used for this suite.

dim as FVec3 a3, b3, c3

'' check( result, expr ):
''   assigns expr to c3.x and asserts that c3.x equals result.
''   Also asserts that c3.y and c3.z still hold 0, i.e. that the
''   assignment to c3.x did not modify the neighbouring fields.
#macro check( result, expr )
c3.x = expr

CU_ASSERT_EQUAL( c3.x, result )
CU_ASSERT_EQUAL( c3.y, 0f )
CU_ASSERT_EQUAL( c3.z, 0f )
#endmacro

'' every field holds a distinct power of two, so each combination of
'' fields adds up to a unique value
a3 = type(1f, 2f, 4f)
b3 = type(8f, 16f, 32f)

'' fields that must stay untouched by every check()
c3.y = 0.0f
c3.z = 0.0f

'' fields of a single source (a3): single fields, pairs in both
'' orders, and all six orderings of x + y + z
check( 1f, a3.x )
check( 2f, a3.y )
check( 3f, a3.x + a3.y )
check( 3f, a3.y + a3.x )
check( 4f, a3.z )
check( 5f, a3.x + a3.z )
check( 5f, a3.z + a3.x )
check( 6f, a3.y + a3.z )
check( 6f, a3.z + a3.y )
check( 7f, a3.x + a3.y + a3.z )
check( 7f, a3.y + a3.x + a3.z )
check( 7f, a3.x + a3.z + a3.y )
check( 7f, a3.z + a3.x + a3.y )
check( 7f, a3.y + a3.z + a3.x )
check( 7f, a3.z + a3.y + a3.x )

'' mix in another source
'' (b3.y = 16 is placed at each position among the a3 fields; the
''  last group of 24 lines covers all orderings of the four operands)

check( 17f, b3.y + a3.x )
check( 18f, a3.y + b3.y )
check( 19f, b3.y + a3.x + a3.y )
check( 19f, a3.x + b3.y + a3.y )
check( 19f, a3.x + a3.y + b3.y )
check( 19f, b3.y + a3.y + a3.x )
check( 19f, a3.y + b3.y + a3.x )
check( 19f, a3.y + a3.x + b3.y )
check( 20f, b3.y + a3.z )
check( 20f, a3.z + b3.y )
check( 21f, b3.y + a3.x + a3.z )
check( 21f, a3.x + b3.y + a3.z )
check( 21f, a3.x + a3.z + b3.y )
check( 21f, b3.y + a3.z + a3.x )
check( 21f, a3.z + b3.y + a3.x )
check( 21f, a3.z + a3.x + b3.y )
check( 22f, b3.y + a3.y + a3.z )
check( 22f, a3.y + b3.y + a3.z )
check( 22f, a3.y + a3.z + b3.y )
check( 22f, b3.y + a3.z + a3.y )
check( 22f, a3.z + b3.y + a3.y )
check( 22f, a3.z + a3.y + b3.y )
check( 23f, b3.y + a3.x + a3.y + a3.z )
check( 23f, a3.x + b3.y + a3.y + a3.z )
check( 23f, a3.x + a3.y + b3.y + a3.z )
check( 23f, a3.x + a3.y + a3.z + b3.y )
check( 23f, b3.y + a3.y + a3.x + a3.z )
check( 23f, a3.y + b3.y + a3.x + a3.z )
check( 23f, a3.y + a3.x + b3.y + a3.z )
check( 23f, a3.y + a3.x + a3.z + b3.y )
check( 23f, b3.y + a3.x + a3.z + a3.y )
check( 23f, a3.x + b3.y + a3.z + a3.y )
check( 23f, a3.x + a3.z + b3.y + a3.y )
check( 23f, a3.x + a3.z + a3.y + b3.y )
check( 23f, b3.y + a3.z + a3.x + a3.y )
check( 23f, a3.z + b3.y + a3.x + a3.y )
check( 23f, a3.z + a3.x + b3.y + a3.y )
check( 23f, a3.z + a3.x + a3.y + b3.y )
check( 23f, b3.y + a3.y + a3.z + a3.x )
check( 23f, a3.y + b3.y + a3.z + a3.x )
check( 23f, a3.y + a3.z + b3.y + a3.x )
check( 23f, a3.y + a3.z + a3.x + b3.y )
check( 23f, b3.y + a3.z + a3.y + a3.x )
check( 23f, a3.z + b3.y + a3.y + a3.x )
check( 23f, a3.z + a3.y + b3.y + a3.x )
check( 23f, a3.z + a3.y + a3.x + b3.y )

END_TEST

TEST( fvec4_intra )

'' Sums of single-precision fields taken from one or two FVec4 variables,
'' written in many operand orders, and assigned to a single field (c4.x).
'' NOTE(review): presumably meant to exercise intra-expression
'' vectorisation ('-vec 2') of additions - confirm against the build
'' options used for this suite.

dim as FVec4 a4, b4, c4

'' check( result, expr ):
''   assigns expr to c4.x and asserts that c4.x equals result.
''   Also asserts that c4.y, c4.z and c4.w still hold 0, i.e. that the
''   assignment to c4.x did not modify the neighbouring fields.
#macro check( result, expr )
c4.x = expr

CU_ASSERT_EQUAL( c4.x, result )
CU_ASSERT_EQUAL( c4.y, 0f )
CU_ASSERT_EQUAL( c4.z, 0f )
CU_ASSERT_EQUAL( c4.w, 0f )
#endmacro

'' check3( r, a, b, c ):
''   runs check() on all six orderings of a + b + c, each expecting r.
''   NOTE(review): the arguments are pasted into the sum as written, so
''   an argument such as 'b4.y + a4.x' contributes two terms - confirm
''   this is the intended expression shape.
#macro check3( r, a, b, c )
check( r, a + b + c )
check( r, b + a + c )
check( r, a + c + b )
check( r, c + a + b )
check( r, b + c + a )
check( r, c + b + a )
#endmacro

'' check4( r, p, a, b, c ):
''   keeps p as the first operand and runs check() on all six orderings
''   of a, b, c after it, each expecting r.
#macro check4( r, p, a, b, c )
check( r, p + a + b + c )
check( r, p + b + a + c )
check( r, p + a + c + b )
check( r, p + c + a + b )
check( r, p + b + c + a )
check( r, p + c + b + a )
#endmacro

'' every field holds a distinct power of two, so each combination of
'' fields adds up to a unique value
a4 = type( 1f, 2f, 4f, 8f)
b4 = type(16f, 32f, 64f, 128f)

'' fields that must stay untouched by every check()
c4.y = 0.0f
c4.z = 0.0f
c4.w = 0.0f

'' fields of a single source (a4): single fields, pairs in both orders,
'' triples in all orders, and the full x + y + z + w sum with each
'' field taking the leading position once
check ( 1f, a4.x )
check ( 2f, a4.y )
check ( 3f, a4.x + a4.y )
check ( 3f, a4.y + a4.x )
check ( 4f, a4.z )
check ( 5f, a4.x + a4.z )
check ( 5f, a4.z + a4.x )
check ( 6f, a4.y + a4.z )
check ( 6f, a4.z + a4.y )
check3( 7f, a4.x, a4.y, a4.z )
check ( 8f, a4.w )
check ( 9f, a4.x + a4.w )
check ( 9f, a4.w + a4.x )
check ( 10f, a4.w + a4.y )
check ( 10f, a4.y + a4.w )
check3( 11f, a4.x, a4.y, a4.w )
check ( 12f, a4.z + a4.w )
check ( 12f, a4.w + a4.z )
check3( 13f, a4.x, a4.z, a4.w )
check3( 14f, a4.y, a4.z, a4.w )
check4( 15f, a4.x, a4.y, a4.z, a4.w )
check4( 15f, a4.y, a4.x, a4.z, a4.w )
check4( 15f, a4.z, a4.x, a4.y, a4.w )
check4( 15f, a4.w, a4.x, a4.y, a4.z )

'' mix in another source
'' (b4.y = 32 is combined with the a4 fields, attached to a different
''  operand position in each line)

check ( 33f, b4.y + a4.x )
check ( 33f, a4.x + b4.y )

check ( 34f, b4.y + a4.y )
check ( 34f, a4.y + b4.y )

check ( 35f, b4.y + a4.x + a4.y )
check ( 35f, a4.x + b4.y + a4.y )
check ( 35f, a4.x + a4.y + b4.y )

check ( 35f, b4.y + a4.y + a4.x )
check ( 35f, a4.y + b4.y + a4.x )
check ( 35f, a4.y + a4.x + b4.y )

check ( 36f, b4.y + a4.z )
check ( 36f, a4.z + b4.y )

check ( 37f, b4.y + a4.x + a4.z )
check ( 37f, a4.x + b4.y + a4.z )
check ( 37f, a4.x + a4.z + b4.y )

check ( 37f, b4.y + a4.z + a4.x )
check ( 37f, a4.z + b4.y + a4.x )
check ( 37f, a4.z + a4.x + b4.y )

check ( 38f, b4.y + a4.y + a4.z )
check ( 38f, a4.y + b4.y + a4.z )
check ( 38f, a4.y + a4.z + b4.y )

check ( 38f, b4.y + a4.z + a4.y )
check ( 38f, a4.z + b4.y + a4.y )
check ( 38f, a4.z + a4.y + b4.y )

check3( 39f, b4.y + a4.x, a4.y, a4.z )
check3( 39f, a4.x, b4.y + a4.y, a4.z )
check3( 39f, a4.x, a4.y, b4.y + a4.z )
check3( 39f, a4.x, a4.y, a4.z + b4.y )

check ( 40f, b4.y + a4.w )
check ( 40f, a4.w + b4.y )

check ( 41f, b4.y + a4.x + a4.w )
check ( 41f, a4.x + b4.y + a4.w )
check ( 41f, a4.x + a4.w + b4.y )

check ( 41f, b4.y + a4.w + a4.x )
check ( 41f, a4.w + b4.y + a4.x )
check ( 41f, a4.w + a4.x + b4.y )

check ( 42f, b4.y + a4.w + a4.y )
check ( 42f, a4.w + b4.y + a4.y )
check ( 42f, a4.w + a4.y + b4.y )

check ( 42f, b4.y + a4.y + a4.w )
check ( 42f, a4.y + b4.y + a4.w )
check ( 42f, a4.y + a4.w + b4.y )

check3( 43f, b4.y + a4.x, a4.y, a4.w )
check3( 43f, a4.x, b4.y + a4.y, a4.w )
check3( 43f, a4.x, a4.y, b4.y + a4.w )
check3( 43f, a4.x, a4.y, a4.w + b4.y )

check ( 44f, b4.y + a4.z + a4.w )
check ( 44f, a4.z + b4.y + a4.w )
check ( 44f, a4.z + a4.w + b4.y )

check ( 44f, b4.y + a4.w + a4.z )
check ( 44f, a4.w + b4.y + a4.z )
check ( 44f, a4.w + a4.z + b4.y )

check3( 45f, b4.y + a4.x, a4.z, a4.w )
check3( 45f, a4.x, b4.y + a4.z, a4.w )
check3( 45f, a4.x, a4.z, b4.y + a4.w )
check3( 45f, a4.x, a4.z, a4.w + b4.y )

check3( 46f, b4.y + a4.y, a4.z, a4.w )
check3( 46f, a4.y, b4.y + a4.z, a4.w )
check3( 46f, a4.y, a4.z, b4.y + a4.w )
check3( 46f, a4.y, a4.z, a4.w + b4.y )

'' all four a4 fields plus b4.y: five groups per leading a4 field
check4( 47f, b4.y + a4.x, a4.y, a4.z, a4.w )
check4( 47f, a4.x, b4.y + a4.y, a4.z, a4.w )
check4( 47f, a4.x, a4.y, b4.y + a4.z, a4.w )
check4( 47f, a4.x, a4.y, a4.z, b4.y + a4.w )
check4( 47f, a4.x, a4.y, a4.z, a4.w + b4.y )

check4( 47f, b4.y + a4.y, a4.x, a4.z, a4.w )
check4( 47f, a4.y, b4.y + a4.x, a4.z, a4.w )
check4( 47f, a4.y, a4.x, b4.y + a4.z, a4.w )
check4( 47f, a4.y, a4.x, a4.z, b4.y + a4.w )
check4( 47f, a4.y, a4.x, a4.z, a4.w + b4.y )

check4( 47f, b4.y + a4.z, a4.x, a4.y, a4.w )
check4( 47f, a4.z, b4.y + a4.x, a4.y, a4.w )
check4( 47f, a4.z, a4.x, b4.y + a4.y, a4.w )
check4( 47f, a4.z, a4.x, a4.y, b4.y + a4.w )
check4( 47f, a4.z, a4.x, a4.y, a4.w + b4.y )

check4( 47f, b4.y + a4.w, a4.x, a4.y, a4.z )
check4( 47f, a4.w, b4.y + a4.x, a4.y, a4.z )
check4( 47f, a4.w, a4.x, b4.y + a4.y, a4.z )
check4( 47f, a4.w, a4.x, a4.y, b4.y + a4.z )
check4( 47f, a4.w, a4.x, a4.y, a4.z + b4.y )

END_TEST

''
'' simple arithmetic (double precision)
''
Expand Down

0 comments on commit be3817c

Please sign in to comment.