From 213ba49ad6e483e0ee00990af5d5072f16951f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Fri, 13 Oct 2023 14:48:26 +0200 Subject: [PATCH 01/12] Simplify instructions with known not-small operands --- erts/emulator/beam/jit/arm/instr_arith.cpp | 77 +++++++++------ erts/emulator/beam/jit/arm/instr_common.cpp | 38 ++++++-- erts/emulator/beam/jit/x86/instr_arith.cpp | 102 ++++++++++++-------- erts/emulator/beam/jit/x86/instr_common.cpp | 60 ++++++++---- 4 files changed, 183 insertions(+), 94 deletions(-) diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp index 0c74430f61c4..7e0ccf9d240c 100644 --- a/erts/emulator/beam/jit/arm/instr_arith.cpp +++ b/erts/emulator/beam/jit/arm/instr_arith.cpp @@ -35,6 +35,8 @@ void BeamModuleAssembler::emit_add_sub_types(bool is_small_result, if (is_small_result) { comment("skipped overflow test because the result is always small"); emit_are_both_small(LHS, lhs_reg, RHS, rhs_reg, next); + } else if (RHS.isLiteral()) { + /* Skipping test for small */ } else { if (always_small(RHS)) { a.and_(TMP1, lhs_reg, imm(_TAG_IMMED1_MASK)); @@ -60,8 +62,11 @@ void BeamModuleAssembler::emit_are_both_small(const ArgSource &LHS, const ArgSource &RHS, const a64::Gp rhs_reg, const Label next) { - if (always_small(RHS) && - always_one_of(LHS)) { + if (RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else if (always_small(RHS) && + always_one_of( + LHS)) { comment("simplified test for small operand since other types are " "boxed"); emit_is_boxed(next, lhs_reg); @@ -165,7 +170,9 @@ void BeamModuleAssembler::emit_i_plus(const ArgLabel &Fail, auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3); - if (rhs_is_arm_literal) { + if (RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else if (rhs_is_arm_literal) { Uint cleared_tag = RHS.as().get() & ~_TAG_IMMED1_MASK; a.adds(ARG1, lhs.reg, 
imm(cleared_tag)); } else { @@ -349,10 +356,11 @@ void BeamModuleAssembler::emit_i_minus(const ArgLabel &Fail, } Label next = a.newLabel(); - auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3); - if (rhs_is_arm_literal) { + if (RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else if (rhs_is_arm_literal) { Uint cleared_tag = RHS.as().get() & ~_TAG_IMMED1_MASK; a.subs(ARG1, lhs.reg, imm(cleared_tag)); } else { @@ -535,6 +543,7 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, const ArgRegister &Dst) { bool is_product_small = is_product_small_if_args_are_small(Src1, Src2); bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4); + bool sometimes_small = !(Src2.isLiteral() || Src4.isLiteral()); bool is_increment_zero = Src4.isSmall() && Src4.as().getSigned() == 0; Sint factor = 0; @@ -585,7 +594,7 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, emit_are_both_small(Src1, src1.reg, Src2, src2.reg, small); } else if (always_small(Src2)) { emit_are_both_small(Src1, src1.reg, Src4, src4.reg, small); - } else { + } else if (sometimes_small) { ASSERT(!is_increment_zero); ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); a.and_(TMP1, src1.reg, src2.reg); @@ -602,6 +611,9 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); a.b_eq(small); } + } else { + comment("skipped test for small because one operand is never " + "small"); } mov_var(ARG2, src1); @@ -625,14 +637,18 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, } } - a.b(store_result); + if (sometimes_small) { + a.b(store_result); + } } a.bind(small); - if (is_increment_zero) { - comment("multiply smalls"); - } else { - comment("multiply and add smalls"); + if (sometimes_small) { + if (is_increment_zero) { + comment("multiply smalls"); + } else { + comment("multiply and add smalls"); + } } if (is_product_small && is_sum_small) { @@ -657,7 +673,7 @@ void 
BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, } comment("skipped test for small result"); - } else { + } else if (sometimes_small) { auto min_increment = std::get<0>(getClampedRange(Src4)); a.and_(TMP3, src1.reg, imm(~_TAG_IMMED1_MASK)); @@ -1181,22 +1197,29 @@ void BeamModuleAssembler::emit_i_band(const ArgLabel &Fail, } else { Label next = a.newLabel(); - /* TAG & TAG = TAG, so we don't need to tag it again. */ - a.and_(ARG1, lhs.reg, rhs.reg); - - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - if (always_one_of(LHS) && - always_one_of(RHS)) { - comment("simplified test for small operands since other types are " - "boxed"); - emit_is_boxed(next, ARG1); + if (RHS.isLiteral()) { + comment("skipped test for small because one operand is never " + "small"); } else { - /* All other term types has at least one zero in the low 4 - * bits. Therefore, the result will be a small iff both - * operands are small. */ - a.and_(TMP1, ARG1, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); - a.b_eq(next); + /* TAG & TAG = TAG, so we don't need to tag it again. */ + a.and_(ARG1, lhs.reg, rhs.reg); + + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + if (always_one_of( + LHS) && + always_one_of( + RHS)) { + comment("simplified test for small operands since other types " + "are boxed"); + emit_is_boxed(next, ARG1); + } else { + /* All other term types has at least one zero in the low 4 + * bits. Therefore, the result will be a small iff both + * operands are small. 
*/ + a.and_(TMP1, ARG1, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.b_eq(next); + } } mov_var(ARG2, lhs); diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 2cb0992efbfb..4861928a0d52 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -1580,9 +1580,13 @@ void BeamModuleAssembler::emit_is_eq(const ArgLabel &Fail, a.cmp(x.reg, y.reg); a.b_eq(next); - /* We can skip deep comparisons when both args are immediates. */ - emit_are_both_immediate(x.reg, y.reg); - a.b_eq(resolve_beam_label(Fail, disp1MB)); + if (X.isLiteral() || Y.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else { + /* We can skip deep comparisons when both args are immediates. */ + emit_are_both_immediate(x.reg, y.reg); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + } mov_var(ARG1, x); mov_var(ARG2, y); @@ -1603,9 +1607,13 @@ void BeamModuleAssembler::emit_is_ne(const ArgLabel &Fail, a.cmp(x.reg, y.reg); a.b_eq(resolve_beam_label(Fail, disp1MB)); - /* We can skip deep comparisons when both args are immediates. */ - emit_are_both_immediate(x.reg, y.reg); - a.b_eq(next); + if (X.isLiteral() || Y.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else { + /* We can skip deep comparisons when both args are immediates. 
*/ + emit_are_both_immediate(x.reg, y.reg); + a.b_eq(next); + } mov_var(ARG1, x); mov_var(ARG2, y); @@ -1726,6 +1734,18 @@ void BeamModuleAssembler::emit_is_lt(const ArgLabel &Fail, a.cmp(lhs.reg, rhs.reg); a.b_ge(resolve_beam_label(Fail, disp1MB)); a.bind(next); + } else if (LHS.isLiteral() || RHS.isLiteral()) { + Label next = a.newLabel(); + comment("skipped test for small because one operand is never small"); + a.cmp(lhs.reg, rhs.reg); + a.b_eq(next); + + mov_var(ARG1, lhs); + mov_var(ARG2, rhs); + fragment_call(ga->get_arith_compare_shared()); + + a.bind(next); + a.b_ge(resolve_beam_label(Fail, disp1MB)); } else if (always_one_of( LHS) && always_one_of( @@ -1880,6 +1900,12 @@ void BeamModuleAssembler::emit_is_ge(const ArgLabel &Fail, a.tbz(TMP1, imm(bitNumber), resolve_beam_label(Fail, disp32K)); } a.bind(next); + } else if (LHS.isLiteral() || RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); + mov_var(ARG1, lhs); + mov_var(ARG2, rhs); + fragment_call(ga->get_arith_compare_shared()); + a.b_lt(resolve_beam_label(Fail, disp1MB)); } else if (always_one_of( LHS) && always_one_of( diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp index 35976cc048d7..9112f5dc220f 100644 --- a/erts/emulator/beam/jit/x86/instr_arith.cpp +++ b/erts/emulator/beam/jit/x86/instr_arith.cpp @@ -210,16 +210,21 @@ void BeamModuleAssembler::emit_i_plus(const ArgSource &LHS, mov_arg(ARG2, LHS); /* Used by erts_mixed_plus in this slot */ mov_arg(ARG3, RHS); /* Used by erts_mixed_plus in this slot */ - emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); - a.mov(RET, ARG2); - a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.add(RET, ARG3); - if (small_result) { - comment("skipped overflow test because the result is always small"); - a.short_().jmp(next); + if (LHS.isLiteral() || RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); } else { - a.short_().jno(next); + 
emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); + + a.mov(RET, ARG2); + a.and_(RET, imm(~_TAG_IMMED1_MASK)); + a.add(RET, ARG3); + if (small_result) { + comment("skipped overflow test because the result is always small"); + a.short_().jmp(next); + } else { + a.short_().jno(next); + } } /* Call mixed addition. */ @@ -324,20 +329,24 @@ void BeamModuleAssembler::emit_i_minus(const ArgSource &LHS, mov_arg(ARG2, LHS); /* Used by erts_mixed_plus in this slot */ mov_arg(ARG3, RHS); /* Used by erts_mixed_plus in this slot */ - emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); - - if (small_result) { - comment("skipped overflow test because the result is always small"); - a.mov(RET, ARG2); - a.and_(ARG3, imm(~_TAG_IMMED1_MASK)); - a.sub(RET, ARG3); - a.short_().jmp(next); + if (LHS.isLiteral() || RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); } else { - a.mov(RET, ARG2); - a.mov(ARG4, ARG3); - a.and_(ARG4, imm(~_TAG_IMMED1_MASK)); - a.sub(RET, ARG4); - a.short_().jno(next); + emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); + + if (small_result) { + comment("skipped overflow test because the result is always small"); + a.mov(RET, ARG2); + a.and_(ARG3, imm(~_TAG_IMMED1_MASK)); + a.sub(RET, ARG3); + a.short_().jmp(next); + } else { + a.mov(RET, ARG2); + a.mov(ARG4, ARG3); + a.and_(ARG4, imm(~_TAG_IMMED1_MASK)); + a.sub(RET, ARG4); + a.short_().jno(next); + } } a.bind(mixed); @@ -995,6 +1004,7 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, const ArgRegister &Dst) { bool is_product_small = is_product_small_if_args_are_small(Src1, Src2); bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4); + bool sometimes_small = !(Src2.isLiteral() || Src4.isLiteral()); bool is_increment_zero = Src4.isSmall() && Src4.as().getSigned() == 0; Sint factor = 0; @@ -1088,7 +1098,9 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, mov_arg(ARG4, Src4); } - if (Src2.isSmall()) { + if (!sometimes_small) { + 
comment("skipped test for small because one operand is never small"); + } else if (Src2.isSmall()) { Sint val = Src2.as().getSigned(); emit_are_both_small(mixed, Src1, ARG2, Src4, ARG4); a.mov(RET, ARG2); @@ -1120,26 +1132,28 @@ void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, a.sar(ARG5, imm(_TAG_IMMED1_SIZE)); } - a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.imul(RET, ARG5); - if (is_product_small) { - comment("skipped overflow check because product is always small"); - } else { - a.short_().jo(mixed); - } - - if (is_increment_zero) { - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - } else { - a.add(RET, ARG4); - if (is_sum_small) { - comment("skipped overflow check because sum is always small"); + if (sometimes_small) { + a.and_(RET, imm(~_TAG_IMMED1_MASK)); + a.imul(RET, ARG5); + if (is_product_small) { + comment("skipped overflow check because product is always small"); } else { a.short_().jo(mixed); } - } - a.short_().jmp(next); + if (is_increment_zero) { + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } else { + a.add(RET, ARG4); + if (is_sum_small) { + comment("skipped overflow check because sum is always small"); + } else { + a.short_().jo(mixed); + } + } + + a.short_().jmp(next); + } /* Call mixed multiplication. */ a.bind(mixed); @@ -1258,11 +1272,15 @@ void BeamModuleAssembler::emit_i_band(const ArgSource &LHS, Label generic = a.newLabel(), next = a.newLabel(); - emit_are_both_small(generic, LHS, ARG2, RHS, RET); + if (RHS.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else { + emit_are_both_small(generic, LHS, ARG2, RHS, RET); - /* TAG & TAG = TAG, so we don't need to tag it again. */ - a.and_(RET, ARG2); - a.short_().jmp(next); + /* TAG & TAG = TAG, so we don't need to tag it again. 
*/ + a.and_(RET, ARG2); + a.short_().jmp(next); + } a.bind(generic); { diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp index bc19949492c2..223e42ab7b98 100644 --- a/erts/emulator/beam/jit/x86/instr_common.cpp +++ b/erts/emulator/beam/jit/x86/instr_common.cpp @@ -1592,12 +1592,16 @@ void BeamModuleAssembler::emit_is_eq(const ArgLabel &Fail, a.cmp(ARG1, ARG2); a.short_().je(next); - /* We can skip deep comparisons when both args are immediates. */ - a.mov(RETd, ARG1d); - a.and_(RETd, ARG2d); - a.and_(RETb, imm(_TAG_PRIMARY_MASK)); - a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); - a.je(fail); + if (A.isLiteral() || B.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else { + /* We can skip deep comparisons when both args are immediates. */ + a.mov(RETd, ARG1d); + a.and_(RETd, ARG2d); + a.and_(RETb, imm(_TAG_PRIMARY_MASK)); + a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + a.je(fail); + } safe_fragment_call(ga->get_arith_eq_shared()); a.jne(fail); @@ -1615,12 +1619,16 @@ void BeamModuleAssembler::emit_is_ne(const ArgLabel &Fail, a.cmp(ARG1, ARG2); a.je(fail); - /* We can skip deep comparisons when both args are immediates. */ - a.mov(RETd, ARG1d); - a.and_(RETd, ARG2d); - a.and_(RETb, imm(_TAG_PRIMARY_MASK)); - a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); - a.short_().je(next); + if (A.isLiteral() || B.isLiteral()) { + comment("skipped test for small because one operand is never small"); + } else { + /* We can skip deep comparisons when both args are immediates. 
*/ + a.mov(RETd, ARG1d); + a.and_(RETd, ARG2d); + a.and_(RETb, imm(_TAG_PRIMARY_MASK)); + a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + a.short_().je(next); + } safe_fragment_call(ga->get_arith_eq_shared()); a.je(fail); @@ -1723,6 +1731,7 @@ void BeamModuleAssembler::emit_is_lt(const ArgLabel &Fail, Label generic = a.newLabel(), do_jge = a.newLabel(), next = a.newLabel(); bool both_small = always_small(LHS) && always_small(RHS); bool need_generic = !both_small; + bool never_small = LHS.isLiteral() || RHS.isLiteral(); mov_arg(ARG2, RHS); /* May clobber ARG1 */ mov_arg(ARG1, LHS); @@ -1750,6 +1759,10 @@ void BeamModuleAssembler::emit_is_lt(const ArgLabel &Fail, comment("simplified test because it always succeeds when LHS is a " "bignum"); emit_is_not_boxed(next, ARG1, dShort); + } else if (never_small) { + comment("skipped test for small because one operand is never small"); + a.cmp(ARG1, ARG2); + a.short_().je(do_jge); } else if (always_one_of( LHS) && always_one_of( @@ -1789,9 +1802,11 @@ void BeamModuleAssembler::emit_is_lt(const ArgLabel &Fail, } /* Both arguments are smalls. */ - a.cmp(ARG1, ARG2); - if (need_generic) { - a.short_().jmp(do_jge); + if (!never_small) { + a.cmp(ARG1, ARG2); + if (need_generic) { + a.short_().jmp(do_jge); + } } a.bind(generic); @@ -1829,6 +1844,7 @@ void BeamModuleAssembler::emit_is_ge(const ArgLabel &Fail, Label generic = a.newLabel(), small = a.newLabel(), do_jl = a.newLabel(), next = a.newLabel(); bool need_generic = !both_small; + bool never_small = LHS.isLiteral() || RHS.isLiteral(); mov_arg(ARG2, RHS); /* May clobber ARG1 */ mov_arg(ARG1, LHS); @@ -1886,6 +1902,10 @@ void BeamModuleAssembler::emit_is_ge(const ArgLabel &Fail, a.cmp(RETb, imm(_TAG_HEADER_NEG_BIG)); /* Fail if bignum is positive. 
*/ a.short_().jmp(do_jl); + } else if (never_small) { + comment("skipped test for small because one operand is never small"); + a.cmp(ARG1, ARG2); + a.short_().je(next); } else if (always_one_of( LHS) && always_one_of( @@ -1913,7 +1933,7 @@ void BeamModuleAssembler::emit_is_ge(const ArgLabel &Fail, } else { /* Avoid the expensive generic comparison for equal terms. */ a.cmp(ARG1, ARG2); - a.short_().je(do_jl); + a.short_().je(next); a.mov(RETd, ARG1d); a.and_(RETd, ARG2d); @@ -1926,9 +1946,11 @@ void BeamModuleAssembler::emit_is_ge(const ArgLabel &Fail, /* Both arguments are smalls. */ a.bind(small); - a.cmp(ARG1, ARG2); - if (need_generic) { - a.short_().jmp(do_jl); + if (!never_small) { + a.cmp(ARG1, ARG2); + if (need_generic) { + a.short_().jmp(do_jl); + } } a.bind(generic); From ae8fe47e6887ef95e301db16391429fbbba83c63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sat, 14 Oct 2023 09:32:51 +0200 Subject: [PATCH 02/12] AArch64: Optimize binary matching Leverage the bitfield manipulation instructions to optimize matching. Consider this example: bm(<>) -> {X, Y, Z}. 
In OTP 26, the code for extracting the `X`, `Y`, and `Z` variables looks like this: # extract integer 16 ror x7, x7, 48 mov x27, 15 bfi x27, x7, 4, 16 # extract integer 16 ror x7, x7, 48 mov x28, 15 bfi x28, x7, 4, 16 # extract integer 16 ror x7, x7, 48 mov x15, 15 bfi x15, x7, 4, 16 With this commit, the code is simplified to: # extract integer 16 mov x4, 15 orr x27, x4, x7, 44 # extract integer 16 ubfx x9, x7, 32, 16 orr x28, x4, x9, 4 # extract integer 16 ubfx x9, x7, 16, 16 orr x15, x4, x9, 4 --- erts/emulator/beam/jit/arm/beam_asm.hpp | 5 +- erts/emulator/beam/jit/arm/instr_bs.cpp | 172 +++++++++++++++--------- 2 files changed, 111 insertions(+), 66 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index d2a219cd0359..e80ad8aadc0f 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -1227,12 +1227,15 @@ class BeamModuleAssembler : public BeamAssembler, const arm::Gp bin_base, const arm::Gp bitdata); - void emit_extract_integer(const arm::Gp bitdata, + void emit_extract_integer(const arm::Gp &bitdata, + const arm::Gp &small_tag, Uint flags, + Uint position, Uint bits, const ArgRegister &Dst); void emit_extract_bitstring(const arm::Gp bitdata, + Uint position, Uint bits, const ArgRegister &Dst); diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index 3fa07e9ef7f2..642b5e926a21 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -3512,49 +3512,81 @@ void BeamModuleAssembler::emit_read_bits(Uint bits, a.bind(read_done); } -void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, +void BeamModuleAssembler::emit_extract_integer(const arm::Gp &bitdata, + const arm::Gp &small_tag, Uint flags, + Uint position, Uint bits, const ArgRegister &Dst) { - Label big = a.newLabel(); - Label done = a.newLabel(); - arm::Gp data_reg; + arm::Gp data_reg = bitdata; auto 
dst = init_destination(Dst, TMP1); - Uint num_partial = bits % 8; - Uint num_complete = 8 * (bits / 8); if (bits <= 8) { /* Endian does not matter for values that fit in a byte. */ flags &= ~BSF_LITTLE; } + /* Optimize extraction of the first segment after a read. Saves + * one instruction. */ + if (bits > 0 && bits < SMALL_BITS && position + bits == 64 && + (flags & (BSF_LITTLE | BSF_SIGNED)) == 0) { + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.orr(dst.reg, + small_tag, + data_reg, + arm::lsr(position - _TAG_IMMED1_SIZE)); + flush_var(dst); + return; + } + + Label big = a.newLabel(); + Label done = a.newLabel(); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + switch (bits) { + case 0: + data_reg = ZERO; + break; + case 64: + data_reg = bitdata; + break; + default: + data_reg = TMP2; + switch (flags & (BSF_SIGNED | BSF_LITTLE)) { + case BSF_SIGNED: /* Signed and big-endian */ + a.sbfx(TMP2, bitdata, position, bits); + break; + default: + a.ubfx(TMP2, bitdata, position, bits); + break; + } + } + /* If this segment is little-endian, reverse endianness. 
*/ if ((flags & BSF_LITTLE) != 0) { comment("reverse endian for a little-endian segment"); - } - data_reg = TMP2; - if ((flags & BSF_LITTLE) == 0) { - data_reg = bitdata; - } else if (bits == 16) { - a.rev16(TMP2, bitdata); - } else if (bits == 32) { - a.rev32(TMP2, bitdata); - } else if (num_partial == 0) { - a.rev64(TMP2, bitdata); - a.lsr(TMP2, TMP2, arm::lsr(64 - bits)); - } else { - a.ubfiz(TMP3, bitdata, imm(num_complete), imm(num_partial)); - a.ubfx(TMP2, bitdata, imm(num_partial), imm(num_complete)); - a.rev64(TMP2, TMP2); - a.orr(TMP2, TMP3, TMP2, arm::lsr(64 - num_complete)); + if (bits == 16) { + a.rev16(TMP2, data_reg); + } else if (bits == 32) { + a.rev32(TMP2, data_reg); + } else if (num_partial == 0) { + a.rev64(TMP2, data_reg); + a.lsr(TMP2, TMP2, arm::lsr(64 - bits)); + } else { + a.ubfiz(TMP3, data_reg, imm(num_complete), imm(num_partial)); + a.ubfx(TMP2, data_reg, imm(num_partial), imm(num_complete)); + a.rev64(TMP2, TMP2); + a.orr(TMP2, TMP3, TMP2, arm::lsr(64 - num_complete)); + } + data_reg = TMP2; } - /* Sign-extend the number if the segment is signed. */ - if ((flags & BSF_SIGNED) != 0) { + /* Sign-extend the number if the segment is signed and little-endian. */ + if ((flags & (BSF_SIGNED | BSF_LITTLE)) == (BSF_SIGNED | BSF_LITTLE)) { if (0 < bits && bits < 64) { comment("sign extend extracted value"); - a.lsl(TMP2, data_reg, imm(64 - bits)); - a.asr(TMP2, TMP2, imm(64 - bits)); + a.sbfx(TMP2, data_reg, 0, bits); data_reg = TMP2; } } @@ -3562,10 +3594,6 @@ void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, /* Handle segments whose values might not fit in a small integer. */ if (bits >= SMALL_BITS) { comment("test whether it fits in a small"); - if (bits < 64 && (flags & BSF_SIGNED) == 0) { - a.and_(TMP2, data_reg, imm((1ull << bits) - 1)); - data_reg = TMP2; - } if ((flags & BSF_SIGNED) != 0) { /* Signed segment. 
*/ a.adds(TMP3, ZERO, data_reg, arm::lsr(SMALL_BITS - 1)); @@ -3582,20 +3610,7 @@ void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, } /* Tag and store the extracted small integer. */ - comment("store extracted integer as a small"); - mov_imm(dst.reg, _TAG_IMMED1_SMALL); - if ((flags & BSF_SIGNED) != 0) { - a.orr(dst.reg, dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE)); - } else { - if (bits >= SMALL_BITS) { - a.bfi(dst.reg, - data_reg, - arm::lsl(_TAG_IMMED1_SIZE), - imm(SMALL_BITS)); - } else if (bits != 0) { - a.bfi(dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE), imm(bits)); - } - } + a.orr(dst.reg, small_tag, data_reg, arm::lsl(_TAG_IMMED1_SIZE)); if (bits >= SMALL_BITS) { a.b(done); @@ -3629,16 +3644,28 @@ void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, } void BeamModuleAssembler::emit_extract_bitstring(const arm::Gp bitdata, + Uint position, Uint bits, const ArgRegister &Dst) { auto dst = init_destination(Dst, TMP1); + switch (position) { + case 0: + mov_imm(TMP4, 0); + break; + case 64: + a.mov(TMP4, bitdata); + break; + default: + a.ror(TMP4, bitdata, imm(position)); + break; + } a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED)); mov_imm(TMP2, header_heap_bits(bits)); mov_imm(TMP3, bits); a.stp(TMP2, TMP3, arm::Mem(HTOP).post(sizeof(Eterm[2]))); if (bits > 0) { - a.rev64(TMP4, bitdata); + a.rev64(TMP4, TMP4); a.str(TMP4, arm::Mem(HTOP).post(sizeof(Eterm[1]))); } @@ -3912,8 +3939,11 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, const arm::Gp bin_base = ARG2; const arm::Gp bin_position = ARG3; const arm::Gp bin_size = ARG4; + const arm::Gp small_tag = ARG5; const arm::Gp bitdata = ARG8; bool position_is_valid = false; + bool small_tag_valid = false; + Uint offset_in_bitdata = 0; for (auto seg : segments) { switch (seg.action) { @@ -3970,19 +4000,23 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, break; } case BsmSegment::action::EQ: { + arm::Gp cmp_reg = TMP1; 
comment("=:= %ld %ld", seg.size, seg.unit); - if (seg.size != 0 && seg.size != 64) { - a.ror(bitdata, bitdata, imm(64 - seg.size)); + + offset_in_bitdata -= seg.size; + + if (seg.size == 0) { + cmp_reg = ZERO; + } else if (seg.size == 64) { + cmp_reg = bitdata; + } else { + a.ubfx(cmp_reg, bitdata, offset_in_bitdata, seg.size); } - if (seg.size == 64) { - cmp(bitdata, seg.unit); - } else if (seg.size == 32) { - cmp(bitdata.w(), seg.unit); - } else if (seg.unit == 0) { - a.tst(bitdata, imm((1ull << seg.size) - 1)); + + if (seg.size == 32) { + cmp(cmp_reg.w(), seg.unit); } else { - a.and_(TMP1, bitdata, imm((1ull << seg.size) - 1)); - cmp(TMP1, seg.unit); + cmp(cmp_reg, seg.unit); } a.b_ne(resolve_beam_label(Fail, disp1MB)); break; @@ -3991,10 +4025,12 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, comment("test_heap %ld", seg.size); emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); position_is_valid = false; + small_tag_valid = false; break; } case BsmSegment::action::READ: { comment("read %ld", seg.size); + offset_in_bitdata = 64; if (seg.size == 0) { comment("(nothing to do)"); } else { @@ -4020,10 +4056,8 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, auto Dst = seg.dst; comment("extract binary %ld", bits); - emit_extract_bitstring(bitdata, bits, Dst); - if (bits != 0 && bits != 64) { - a.ror(bitdata, bitdata, imm(64 - bits)); - } + emit_extract_bitstring(bitdata, offset_in_bitdata, bits, Dst); + offset_in_bitdata -= bits; break; } case BsmSegment::action::EXTRACT_INTEGER: { @@ -4032,10 +4066,17 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, auto Dst = seg.dst; comment("extract integer %ld", bits); - if (bits != 0 && bits != 64) { - a.ror(bitdata, bitdata, imm(64 - bits)); + if (!small_tag_valid) { + small_tag_valid = true; + mov_imm(small_tag, _TAG_IMMED1_SMALL); } - emit_extract_integer(bitdata, flags, bits, Dst); + offset_in_bitdata -= bits; + 
emit_extract_integer(bitdata, + small_tag, + flags, + offset_in_bitdata, + bits, + Dst); break; } case BsmSegment::action::GET_INTEGER: { @@ -4069,6 +4110,7 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, mov_arg(Dst, ARG1); position_is_valid = false; + small_tag_valid = false; break; } case BsmSegment::action::GET_BITSTRING: { @@ -4097,6 +4139,7 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, mov_arg(seg.dst, ARG1); position_is_valid = false; + small_tag_valid = false; break; } case BsmSegment::action::GET_TAIL: { @@ -4106,6 +4149,7 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, fragment_call(ga->get_bs_get_tail_shared()); mov_arg(seg.dst, ARG1); position_is_valid = false; + small_tag_valid = false; break; } case BsmSegment::action::SKIP: { @@ -4122,9 +4166,7 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, case BsmSegment::action::DROP: auto bits = seg.size; comment("drop %ld", bits); - if (bits != 0 && bits != 64) { - a.ror(bitdata, bitdata, imm(64 - bits)); - } + offset_in_bitdata -= bits; break; } } From 0a3a1f3e8fdb24252f23839299eaf573664510d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sun, 15 Oct 2023 05:13:15 +0200 Subject: [PATCH 03/12] AArch64: Optimize binary construction Consider this example: bb(<>) -> <>. 
In OTP 26, the code for constructing the binary from the values of `X`, `Y`, and `Z` looks like this: # accumulate value for integer segment bfxil x7, x27, 4, 16 # accumulate value for integer segment lsl x7, x7, 16 bfxil x7, x28, 4, 16 # accumulate value for integer segment lsl x7, x7, 16 bfxil x7, x15, 4, 16 # construct integer segment from accumulator rev64 x7, x7 lsr x7, x7, 16 With this commit, the code is simplified to: # accumulate value for integer segment at offset 48 bfi x7, x27, 44, 20 # accumulate value for integer segment at offset 32 bfi x7, x28, 28, 20 # accumulate value for integer segment at offset 16 bfi x7, x15, 12, 20 # construct integer segment from accumulator rev64 x7, x7 --- erts/emulator/beam/jit/arm/instr_bs.cpp | 105 +++++++++++++----------- 1 file changed, 56 insertions(+), 49 deletions(-) diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index 642b5e926a21..f49188280811 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -1739,7 +1739,8 @@ void BeamGlobalAssembler::emit_get_sint64_shared() { struct BscSegment { BscSegment() : type(am_false), unit(1), flags(0), src(ArgNil()), size(ArgNil()), - error_info(0), effectiveSize(-1), action(action::DIRECT) { + error_info(0), offsetInAccumulator(0), effectiveSize(-1), + action(action::DIRECT) { } Eterm type; @@ -1749,13 +1750,14 @@ struct BscSegment { ArgVal size; Uint error_info; + Uint offsetInAccumulator; Sint effectiveSize; /* Here are sub actions for storing integer segments. * - * We use the ACCUMULATE_FIRST and ACCUMULATE actions to shift the - * values of segments with known, small sizes (no more than 64 bits) - * into an accumulator register. + * We use the ACCUMULATE action to accumulate values of segments + * with known, small sizes (no more than 64 bits) into an + * accumulator register.
* * When no more segments can be accumulated, the STORE action is * used to store the value of the accumulator into the binary. @@ -1763,7 +1765,7 @@ struct BscSegment { * The DIRECT action is used when it is not possible to use the * accumulator (for unknown or too large sizes). */ - enum class action { DIRECT, ACCUMULATE_FIRST, ACCUMULATE, STORE } action; + enum class action { DIRECT, ACCUMULATE, STORE } action; }; static std::vector bs_combine_segments( @@ -1784,7 +1786,7 @@ static std::vector bs_combine_segments( segs.back().action == BscSegment::action::DIRECT) { /* There are no previous compatible ACCUMULATE / STORE * actions. Create the first ones. */ - seg.action = BscSegment::action::ACCUMULATE_FIRST; + seg.action = BscSegment::action::ACCUMULATE; segs.push_back(seg); seg.action = BscSegment::action::STORE; segs.push_back(seg); @@ -1794,8 +1796,8 @@ static std::vector bs_combine_segments( auto prev = segs.back(); if (prev.flags & BSF_LITTLE) { /* Little-endian segments cannot be combined with other - * segments. Create new ACCUMULATE_FIRST / STORE actions. */ - seg.action = BscSegment::action::ACCUMULATE_FIRST; + * segments. Create new ACCUMULATE / STORE actions. */ + seg.action = BscSegment::action::ACCUMULATE; segs.push_back(seg); seg.action = BscSegment::action::STORE; segs.push_back(seg); @@ -1816,7 +1818,7 @@ static std::vector bs_combine_segments( segs.push_back(prev); } else { /* The size exceeds 64 bits. Can't combine. */ - seg.action = BscSegment::action::ACCUMULATE_FIRST; + seg.action = BscSegment::action::ACCUMULATE; segs.push_back(seg); seg.action = BscSegment::action::STORE; segs.push_back(seg); @@ -1828,6 +1830,24 @@ static std::vector bs_combine_segments( break; } } + + /* Calculate bit offsets for each ACCUMULATE segment. 
*/ + + Uint offset = 0; + for (int i = segs.size() - 1; i >= 0; i--) { + switch (segs[i].action) { + case BscSegment::action::STORE: + offset = 64 - segs[i].effectiveSize; + break; + case BscSegment::action::ACCUMULATE: + segs[i].offsetInAccumulator = offset; + offset += segs[i].effectiveSize; + break; + default: + break; + } + } + return segs; } @@ -2864,19 +2884,17 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, break; case am_integer: switch (seg.action) { - case BscSegment::action::ACCUMULATE_FIRST: case BscSegment::action::ACCUMULATE: { /* Shift an integer of known size (no more than 64 bits) * into a word-size accumulator. */ Label value_is_small = a.newLabel(); Label done = a.newLabel(); + auto offset = seg.offsetInAccumulator; + + comment("accumulate value for integer segment at offset %ld", + offset); - comment("accumulate value for integer segment"); auto src = load_source(seg.src, ARG1); - if (seg.effectiveSize < 64 && - seg.action == BscSegment::action::ACCUMULATE) { - a.lsl(ARG8, ARG8, imm(seg.effectiveSize)); - } if (!always_small(seg.src)) { if (always_one_of(seg.src)) { @@ -2924,14 +2942,19 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.bind(value_is_small); if (seg.effectiveSize == 64) { a.asr(ARG8, src.reg, imm(_TAG_IMMED1_SIZE)); - } else if (seg.effectiveSize + _TAG_IMMED1_SIZE > 64) { - a.asr(TMP1, src.reg, imm(_TAG_IMMED1_SIZE)); - a.bfxil(ARG8, TMP1, arm::lsr(0), imm(seg.effectiveSize)); - } else { + } else if (offset >= _TAG_IMMED1_SIZE) { + a.bfi(ARG8, + src.reg, + arm::lsr(offset - _TAG_IMMED1_SIZE), + imm(seg.effectiveSize + _TAG_IMMED1_SIZE)); + } else if (offset == 0 && seg.effectiveSize <= SMALL_BITS) { a.bfxil(ARG8, src.reg, - arm::lsr(_TAG_IMMED1_SIZE), + imm(_TAG_IMMED1_SIZE), imm(seg.effectiveSize)); + } else { + a.asr(TMP1, src.reg, imm(_TAG_IMMED1_SIZE)); + a.bfi(ARG8, TMP1, imm(offset), imm(seg.effectiveSize)); } a.bind(done); @@ -2950,41 +2973,25 @@ void 
BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, /* First we'll need to ensure that the value in the * accumulator is in little endian format. */ ASSERT(seg.effectiveSize >= 0); - if (seg.effectiveSize % 8) { + if ((seg.flags & BSF_LITTLE) == 0) { + a.rev64(ARG8, ARG8); + } else { Uint complete_bytes = 8 * (seg.effectiveSize / 8); Uint num_partial = seg.effectiveSize % 8; - if (seg.flags & BSF_LITTLE) { + + if (seg.effectiveSize < 64) { + a.lsr(ARG8, ARG8, imm(64 - seg.effectiveSize)); + } + + if ((seg.effectiveSize % 8) != 0) { a.ubfx(TMP1, ARG8, imm(complete_bytes), imm(num_partial)); - a.bfc(ARG8, - arm::lsr(complete_bytes), - imm(64 - complete_bytes)); a.bfi(ARG8, TMP1, imm(complete_bytes + 8 - num_partial), imm(num_partial)); - } else { - a.lsl(ARG8, ARG8, imm(64 - seg.effectiveSize)); - a.rev64(ARG8, ARG8); - } - } else if ((seg.flags & BSF_LITTLE) == 0) { - switch (seg.effectiveSize) { - case 8: - break; - case 16: - a.rev16(ARG8, ARG8); - break; - case 32: - a.rev32(ARG8, ARG8); - break; - case 64: - a.rev64(ARG8, ARG8); - break; - default: - a.rev64(ARG8, ARG8); - a.lsr(ARG8, ARG8, imm(64 - seg.effectiveSize)); } } From c0dcdd4bd94e95221281195eb5fcbf80ab0708ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Thu, 19 Oct 2023 16:51:02 +0200 Subject: [PATCH 04/12] AArch64: Combine move + move + trim --- erts/emulator/beam/jit/arm/instr_common.cpp | 63 +++++++++++++++++---- erts/emulator/beam/jit/arm/ops.tab | 8 +++ erts/emulator/beam/jit/beam_jit_args.hpp | 8 +++ 3 files changed, 69 insertions(+), 10 deletions(-) diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 4861928a0d52..a42fb552d99b 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -515,6 +515,56 @@ void BeamModuleAssembler::emit_i_move(const ArgSource &Src, mov_arg(Dst, Src); } +void BeamModuleAssembler::emit_move_two_trim(const ArgYRegister 
&Src1, + const ArgRegister &Dst1, + const ArgYRegister &Src2, + const ArgRegister &Dst2, + const ArgWord &Words) { + auto dst1 = init_destination(Dst1, TMP1); + auto dst2 = init_destination(Dst2, TMP2); + arm::Mem mem = getArgRef(Src1); + Sint trim = Words.get() * sizeof(Eterm); + auto src_index = Src1.as().get(); + + ASSERT(ArgVal::memory_relation(Src1, Src2) == + ArgVal::Relation::consecutive); + + if (src_index == 0 && Support::isInt9(trim)) { + /* Combine fetching of y0 and y1 with trimming. */ + mem = arm::Mem(E).post(trim); + a.ldp(dst1.reg, dst2.reg, mem); + dst1 = init_destination(Dst1.trimmed(Words.get()), TMP1); + dst2 = init_destination(Dst2.trimmed(Words.get()), TMP2); + flush_vars(dst1, dst2); + } else { + safe_ldp(dst1.reg, dst2.reg, Src1, Src2); + + /* Try to combine trimming with storing to one of destination + * registers. */ + + if (Dst1.isYRegister() && + Dst1.as().get() == Words.get() && + Support::isInt9(trim)) { + const arm::Mem dst_ref = arm::Mem(E, trim).pre(); + flush_var(dst2); + a.str(dst1.reg, dst_ref); + } else if (Dst2.isYRegister() && + Dst2.as().get() == Words.get() && + Support::isInt9(trim)) { + const arm::Mem dst_ref = arm::Mem(E, trim).pre(); + flush_var(dst1); + a.str(dst2.reg, dst_ref); + } else { + flush_vars(dst1, dst2); + + ASSERT(Words.get() <= 1023); + if (Words.get() > 0) { + add(E, E, Words.get() * sizeof(Eterm)); + } + } + } +} + void BeamModuleAssembler::emit_move_trim(const ArgSource &Src, const ArgRegister &Dst, const ArgWord &Words) { @@ -525,16 +575,9 @@ void BeamModuleAssembler::emit_move_trim(const ArgSource &Src, auto src_index = Src.as().get(); if (src_index == 0 && Support::isInt9(trim)) { const arm::Mem src_ref = arm::Mem(E).post(trim); - if (Dst.isXRegister()) { - auto dst = init_destination(Dst, TMP1); - a.ldr(dst.reg, src_ref); - flush_var(dst); - } else { - auto dst_index = Dst.as().get() - Words.get(); - auto dst = init_destination(ArgYRegister(dst_index), TMP1); - a.ldr(dst.reg, src_ref); - 
flush_var(dst); - } + auto dst = init_destination(Dst.trimmed(Words.get()), TMP1); + a.ldr(dst.reg, src_ref); + flush_var(dst); return; } diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index ef0c1c840be3..b4b1c54cef35 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -353,8 +353,16 @@ move S1 D1=y | move S2 D2=y | consecutive_words(D1, D2) => move S1 D1=y | move S2 D2=y | consecutive_words(D2, D1) => store_two_values S2 D2 S1 D1 +move S1=y D1 | move S2=y D2 | consecutive_words(S1, S2) | trim N u => + move_two_trim S1 D1 S2 D2 N + +move S2=y D2 | move S1=y D1 | consecutive_words(S1, S2) | trim N u => + move_two_trim S1 D1 S2 D2 N + move Src Dst | trim N u => move_trim Src Dst N +move_two_trim y d y d t + move_trim s d t move Src Dst => i_move Src Dst diff --git a/erts/emulator/beam/jit/beam_jit_args.hpp b/erts/emulator/beam/jit/beam_jit_args.hpp index 4dba1b3f4fd7..9705019ee2b9 100644 --- a/erts/emulator/beam/jit/beam_jit_args.hpp +++ b/erts/emulator/beam/jit/beam_jit_args.hpp @@ -247,6 +247,14 @@ struct ArgRegister : public ArgSource { return (int)(val >> 10); } + constexpr ArgVal trimmed(int n) const { + if (isYRegister()) { + return ArgVal(TYPE::YReg, UWord((val & REG_MASK) - n)); + } else { + return *this; + } + } + template constexpr T copy(int n) const { return T(n | (val & ~REG_MASK)); From c6539613ed498b2774181227d29752d9962acd93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Fri, 20 Oct 2023 08:00:29 +0200 Subject: [PATCH 05/12] AArch64: Enhance move_call_last --- erts/emulator/beam/jit/arm/instr_call.cpp | 16 ++++++++++++++++ erts/emulator/beam/jit/arm/ops.tab | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/erts/emulator/beam/jit/arm/instr_call.cpp b/erts/emulator/beam/jit/arm/instr_call.cpp index c7a150b6bff0..e1fc81787439 100644 --- a/erts/emulator/beam/jit/arm/instr_call.cpp +++ b/erts/emulator/beam/jit/arm/instr_call.cpp @@ 
-94,6 +94,13 @@ void BeamModuleAssembler::emit_move_call_last(const ArgYRegister &Src, a.ldr(dst.reg, src_ref); flush_var(dst); emit_i_call_only(CallTarget); + } else if (src_index + 1 == Deallocate.get()) { + auto dst = init_destination(Dst, TMP1); + safe_ldp(dst.reg, a64::x30, getArgRef(Src)); + flush_var(dst); + add(E, E, (Deallocate.get() + 1) * sizeof(Eterm)); + a.b(resolve_beam_label(CallTarget, disp128MB)); + mark_unreachable(); } else { mov_arg(Dst, Src); emit_deallocate(Deallocate); @@ -178,6 +185,15 @@ void BeamModuleAssembler::emit_move_call_ext_last(const ArgYRegister &Src, a.ldr(dst.reg, src_ref); flush_var(dst); emit_i_call_ext_only(Exp); + } else if (src_index + 1 == Deallocate.get()) { + auto dst = init_destination(Dst, TMP1); + mov_arg(ARG1, Exp); + arm::Mem target = emit_setup_dispatchable_call(ARG1); + safe_ldp(dst.reg, a64::x30, getArgRef(Src)); + flush_var(dst); + add(E, E, (Deallocate.get() + 1) * sizeof(Eterm)); + branch(target); + mark_unreachable(); } else { mov_arg(Dst, Src); emit_deallocate(Deallocate); diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index b4b1c54cef35..a0e54b841b00 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -809,8 +809,8 @@ bif_is_lt s s d # Internal calls. # -i_move S=y==0 Dst | call_last Ar P D => move_call_last S Dst P D -i_move S=y==0 Dst | call_ext_last Ar P=u$is_not_bif D => move_call_ext_last S Dst P D +i_move S=y Dst | call_last Ar P D => move_call_last S Dst P D +i_move S=y Dst | call_ext_last Ar P=u$is_not_bif D => move_call_ext_last S Dst P D move_call_last y d f t move_call_ext_last y d e t From 7ca660b82c0d11bc36e432c8cc7ebf2df1094169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Thu, 16 Nov 2023 20:12:32 +0100 Subject: [PATCH 06/12] x86_64: Optimize arithmetic instructions Simplify the code when right-hand side operand is constant. 
Also eliminate some register shuffling by using the LEA instruction. --- erts/emulator/beam/jit/x86/instr_arith.cpp | 83 ++++++++++++---------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp index 9112f5dc220f..17fccdf53ab5 100644 --- a/erts/emulator/beam/jit/x86/instr_arith.cpp +++ b/erts/emulator/beam/jit/x86/instr_arith.cpp @@ -174,31 +174,30 @@ void BeamModuleAssembler::emit_i_plus(const ArgSource &LHS, const ArgSource &RHS, const ArgLabel &Fail, const ArgRegister &Dst) { + bool is_rhs_literal = false; + Sint rhs_untagged; + bool small_result = is_sum_small_if_args_are_small(LHS, RHS); - if (always_small(LHS) && always_small(RHS) && small_result) { - /* Since we don't need the order on this path (no exceptions), we'll - * simplify the code below by shuffling constants to the right-hand - * side. */ - const ArgSource A = LHS.isSmall() ? RHS : LHS, - B = LHS.isSmall() ? LHS : RHS; + if (RHS.isSmall()) { + /* Must be signed for the template magic in isInt32 to work for + * negative numbers. */ + rhs_untagged = RHS.as().getSigned() << _TAG_IMMED1_SIZE; + is_rhs_literal = Support::isInt32(rhs_untagged); + } + if (always_small(LHS) && always_small(RHS) && small_result) { comment("add without overflow check"); - mov_arg(RET, A); - if (B.isSmall()) { - /* Must be signed for the template magic in isInt32 to work for - * negative numbers. 
*/ - Sint untagged = B.as().getSigned() << _TAG_IMMED1_SIZE; + mov_arg(RET, LHS); - if (Support::isInt32(untagged)) { - a.add(RET, imm(untagged)); - } else { - mov_imm(ARG2, B.as().get() & ~_TAG_IMMED1_MASK); - a.add(RET, ARG2); - } + if (is_rhs_literal) { + a.add(RET, imm(rhs_untagged)); + } else if (RHS.isSmall()) { + mov_imm(ARG2, RHS.as().get() & ~_TAG_IMMED1_MASK); + a.add(RET, ARG2); } else { - mov_arg(ARG2, B); + mov_arg(ARG2, RHS); a.lea(RET, x86::qword_ptr(RET, ARG2, 0, -_TAG_IMMED1_SMALL)); } @@ -216,13 +215,13 @@ void BeamModuleAssembler::emit_i_plus(const ArgSource &LHS, } else { emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); - a.mov(RET, ARG2); - a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.add(RET, ARG3); if (small_result) { comment("skipped overflow test because the result is always small"); + a.lea(RET, x86::qword_ptr(ARG2, ARG3, 0, -_TAG_IMMED1_SMALL)); a.short_().jmp(next); } else { + a.lea(RET, x86::qword_ptr(ARG2, -_TAG_IMMED1_SMALL)); + a.add(RET, ARG3); a.short_().jno(next); } } @@ -297,23 +296,26 @@ void BeamModuleAssembler::emit_i_minus(const ArgSource &LHS, const ArgSource &RHS, const ArgLabel &Fail, const ArgRegister &Dst) { + bool is_rhs_literal = false; + Sint rhs_untagged = 0; bool small_result = is_diff_small_if_args_are_small(LHS, RHS); + if (RHS.isSmall()) { + /* Must be signed for the template magic in isInt32 to work for + * negative numbers. */ + rhs_untagged = RHS.as().getSigned() << _TAG_IMMED1_SIZE; + is_rhs_literal = Support::isInt32(rhs_untagged); + } + if (always_small(LHS) && always_small(RHS) && small_result) { comment("subtract without overflow check"); mov_arg(RET, LHS); - if (RHS.isSmall()) { - /* Must be signed for the template magic in isInt32 to work for - * negative numbers. 
*/ - Sint untagged = RHS.as().getSigned() << _TAG_IMMED1_SIZE; - - if (Support::isInt32(untagged)) { - a.sub(RET, imm(untagged)); - } else { - mov_imm(ARG2, RHS.as().get() & ~_TAG_IMMED1_MASK); - a.sub(RET, ARG2); - } + if (is_rhs_literal) { + a.sub(RET, imm(rhs_untagged)); + } else if (RHS.isSmall()) { + mov_imm(ARG2, RHS.as().get() & ~_TAG_IMMED1_MASK); + a.sub(RET, ARG2); } else { mov_arg(ARG2, RHS); a.and_(ARG2, imm(~_TAG_IMMED1_MASK)); @@ -337,14 +339,21 @@ void BeamModuleAssembler::emit_i_minus(const ArgSource &LHS, if (small_result) { comment("skipped overflow test because the result is always small"); a.mov(RET, ARG2); - a.and_(ARG3, imm(~_TAG_IMMED1_MASK)); - a.sub(RET, ARG3); + if (is_rhs_literal) { + a.sub(RET, imm(rhs_untagged)); + } else { + a.and_(ARG3, imm(~_TAG_IMMED1_MASK)); + a.sub(RET, ARG3); + } a.short_().jmp(next); } else { a.mov(RET, ARG2); - a.mov(ARG4, ARG3); - a.and_(ARG4, imm(~_TAG_IMMED1_MASK)); - a.sub(RET, ARG4); + if (is_rhs_literal) { + a.sub(RET, imm(rhs_untagged)); + } else { + a.lea(ARG4, x86::qword_ptr(ARG3, -_TAG_IMMED1_SMALL)); + a.sub(RET, ARG4); + } a.short_().jno(next); } } From 5cf90cfd45c63fc91ba561a3d0e5c71725455ec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sat, 21 Oct 2023 09:11:35 +0200 Subject: [PATCH 07/12] Add helpers is_cons() and is_not_cons() --- erts/emulator/beam/jit/arm/beam_asm.hpp | 14 +++++++++ erts/emulator/beam/jit/arm/instr_common.cpp | 11 ++----- erts/emulator/beam/jit/x86/beam_asm.hpp | 32 +++++++++++++++++++-- erts/emulator/beam/jit/x86/instr_common.cpp | 17 +++++------ 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index e80ad8aadc0f..9987eb9aaf73 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -596,6 +596,20 @@ struct BeamAssembler : public BeamAssemblerCommon { } } + void emit_is_cons(Label Fail, arm::Gp Src) { + const 
int bitNumber = 1; + ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == + (1 << bitNumber)); + a.tbnz(Src, imm(bitNumber), Fail); + } + + void emit_is_not_cons(Label Fail, arm::Gp Src) { + const int bitNumber = 1; + ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == + (1 << bitNumber)); + a.tbz(Src, imm(bitNumber), Fail); + } + void emit_is_boxed(Label Fail, arm::Gp Src) { const int bitNumber = 0; ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED == diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index a42fb552d99b..bb19016cbb3d 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -950,10 +950,7 @@ void BeamModuleAssembler::emit_set_tuple_element(const ArgSource &Element, void BeamModuleAssembler::emit_is_nonempty_list(const ArgLabel &Fail, const ArgRegister &Src) { auto list_ptr = load_source(Src, TMP1); - const int bitNumber = 1; - - ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber)); - a.tbnz(list_ptr.reg, imm(bitNumber), resolve_beam_label(Fail, disp32K)); + emit_is_cons(resolve_beam_label(Fail, dispUnknown), list_ptr.reg); } void BeamModuleAssembler::emit_jump(const ArgLabel &Fail) { @@ -1215,11 +1212,7 @@ void BeamModuleAssembler::emit_is_nil(const ArgLabel &Fail, auto src = load_source(Src, TMP1); if (always_one_of(Src)) { - const int bitNumber = 1; - ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == - (1 << bitNumber)); - comment("simplified is_nil test because its argument is always a list"); - a.tbz(src.reg, imm(bitNumber), resolve_beam_label(Fail, disp32K)); + emit_is_not_cons(resolve_beam_label(Fail, dispUnknown), src.reg); } else { a.cmp(src.reg, imm(NIL)); a.b_ne(resolve_beam_label(Fail, disp1MB)); diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp index fa36e5ee9032..c25aa9304d6f 100644 --- a/erts/emulator/beam/jit/x86/beam_asm.hpp +++ 
b/erts/emulator/beam/jit/x86/beam_asm.hpp @@ -766,17 +766,43 @@ struct BeamAssembler : public BeamAssemblerCommon { #endif } - void emit_test_boxed(x86::Gp Src) { + void emit_test(x86::Gp Src, byte mask) { /* Use the shortest possible instruction depending on the source * register. */ if (Src == x86::rax || Src == x86::rdi || Src == x86::rsi || Src == x86::rcx || Src == x86::rdx) { - a.test(Src.r8(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED)); + a.test(Src.r8(), imm(mask)); + } else { + a.test(Src.r32(), imm(mask)); + } + } + + void emit_test_cons(x86::Gp Src) { + emit_test(Src, _TAG_PRIMARY_MASK - TAG_PRIMARY_LIST); + } + + void emit_is_cons(Label Fail, x86::Gp Src, Distance dist = dLong) { + emit_test_cons(Src); + if (dist == dShort) { + a.short_().jne(Fail); + } else { + a.jne(Fail); + } + } + + void emit_is_not_cons(Label Fail, x86::Gp Src, Distance dist = dLong) { + emit_test_cons(Src); + if (dist == dShort) { + a.short_().je(Fail); } else { - a.test(Src.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED)); + a.je(Fail); } } + void emit_test_boxed(x86::Gp Src) { + emit_test(Src, _TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED); + } + void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) { emit_test_boxed(Src); if (dist == dShort) { diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp index 223e42ab7b98..97a7de2f094e 100644 --- a/erts/emulator/beam/jit/x86/instr_common.cpp +++ b/erts/emulator/beam/jit/x86/instr_common.cpp @@ -368,8 +368,7 @@ void BeamModuleAssembler::emit_is_nonempty_list_get_list( const ArgRegister &Hd, const ArgRegister &Tl) { mov_arg(RET, Src); - a.test(RETb, imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); - a.jne(resolve_beam_label(Fail)); + emit_is_cons(resolve_beam_label(Fail), RET); emit_get_list(RET, Hd, Tl); } @@ -377,12 +376,11 @@ void BeamModuleAssembler::emit_is_nonempty_list_get_hd(const ArgLabel &Fail, const ArgRegister &Src, const ArgRegister &Hd) { mov_arg(RET, Src); - a.test(RETb, 
imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); - a.jne(resolve_beam_label(Fail)); + emit_is_cons(resolve_beam_label(Fail), RET); - x86::Gp boxed_ptr = emit_ptr_val(RET, RET); + x86::Gp ptr = emit_ptr_val(RET, RET); - a.mov(ARG2, getCARRef(boxed_ptr)); + a.mov(ARG2, getCARRef(ptr)); mov_arg(Hd, ARG2); } @@ -391,12 +389,11 @@ void BeamModuleAssembler::emit_is_nonempty_list_get_tl(const ArgLabel &Fail, const ArgRegister &Src, const ArgRegister &Tl) { mov_arg(RET, Src); - a.test(RETb, imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); - a.jne(resolve_beam_label(Fail)); + emit_is_cons(resolve_beam_label(Fail), RET); - x86::Gp boxed_ptr = emit_ptr_val(RET, RET); + x86::Gp ptr = emit_ptr_val(RET, RET); - a.mov(ARG2, getCDRRef(boxed_ptr)); + a.mov(ARG2, getCDRRef(ptr)); mov_arg(Tl, ARG2); } From be78def1a6012ec593c77572ba534ee19e497135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sat, 21 Oct 2023 07:34:51 +0200 Subject: [PATCH 08/12] Optimize tag tests in is_eq_exact and is_ne_exact --- erts/emulator/beam/jit/arm/beam_asm.hpp | 43 ++++++++++++ erts/emulator/beam/jit/arm/instr_common.cpp | 26 ++------ erts/emulator/beam/jit/x86/beam_asm.hpp | 74 ++++++++++++++------- erts/emulator/beam/jit/x86/instr_common.cpp | 33 ++------- 4 files changed, 106 insertions(+), 70 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index 9987eb9aaf73..f40bd6d3221a 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -1775,6 +1775,49 @@ class BeamModuleAssembler : public BeamAssembler, a.ldp(gp1, gp2, arm::Mem(SUPER_TMP)); } } + + /* Set the Z flag if Reg1 and Reg2 are definitely not equal based + * on their tags alone. (They may still be equal if both are + * immediates and all other bits are equal too.) 
*/ + void emit_is_unequal_based_on_tags(Label Unequal, + const ArgVal &Src1, + arm::Gp Reg1, + const ArgVal &Src2, + arm::Gp Reg2) { + ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); + ERTS_CT_ASSERT((TAG_PRIMARY_LIST | TAG_PRIMARY_BOXED) == + TAG_PRIMARY_IMMED1); + + if (always_one_of(Src1)) { + emit_is_boxed(Unequal, Reg2); + } else if (always_one_of(Src2)) { + emit_is_boxed(Unequal, Reg1); + } else if (exact_type(Src1)) { + emit_is_cons(Unequal, Reg2); + } else if (exact_type(Src2)) { + emit_is_cons(Unequal, Reg1); + } else { + a.orr(SUPER_TMP, Reg1, Reg2); + + if (never_one_of(Src1) || + never_one_of(Src2)) { + emit_is_boxed(Unequal, SUPER_TMP); + } else if (never_one_of(Src1) || + never_one_of(Src2)) { + emit_is_cons(Unequal, SUPER_TMP); + } else { + a.and_(SUPER_TMP, SUPER_TMP, imm(_TAG_PRIMARY_MASK)); + a.cmp(SUPER_TMP, imm(TAG_PRIMARY_IMMED1)); + + /* + * SUPER_TMP will now be TAG_PRIMARY_IMMED1 if either + * one or both registers are immediates, or if one + * register is a list and the other a boxed. + */ + a.b_eq(Unequal); + } + } + } }; void beamasm_metadata_update(std::string module_name, diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index bb19016cbb3d..2ae8918b8c87 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -1514,18 +1514,13 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, emit_is_boxed(resolve_beam_label(Fail, dispUnknown), TMP1); } else if (always_same_types(X, Y)) { comment("skipped tag test since they are always equal"); - } else if (Y.isLiteral()) { - /* Fail immediately unless X is the same type of pointer as - * the literal Y. 
- */ - Eterm literal = beamfile_get_literal(beam, Y.as().get()); - Uint tag_test = _TAG_PRIMARY_MASK - (literal & _TAG_PRIMARY_MASK); - int bitNumber = Support::ctz(tag_test); - a.tbnz(x.reg, imm(bitNumber), resolve_beam_label(Fail, disp32K)); } else { /* Fail immediately if the pointer tags are not equal. */ - emit_is_unequal_based_on_tags(x.reg, y.reg); - a.b_eq(resolve_beam_label(Fail, disp1MB)); + emit_is_unequal_based_on_tags(resolve_beam_label(Fail, dispUnknown), + X, + x.reg, + Y, + y.reg); } /* Both operands are pointers having the same tag. Must do a @@ -1574,19 +1569,10 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, emit_is_boxed(next, TMP1); } else if (always_same_types(X, Y)) { comment("skipped tag test since they are always equal"); - } else if (Y.isLiteral()) { - /* Succeed immediately if X is not the same type of pointer as - * the literal Y. - */ - Eterm literal = beamfile_get_literal(beam, Y.as().get()); - Uint tag_test = _TAG_PRIMARY_MASK - (literal & _TAG_PRIMARY_MASK); - int bitNumber = Support::ctz(tag_test); - a.tbnz(x.reg, imm(bitNumber), next); } else { /* Test whether the terms are definitely unequal based on the tags * alone. */ - emit_is_unequal_based_on_tags(x.reg, y.reg); - a.b_eq(next); + emit_is_unequal_based_on_tags(next, X, x.reg, Y, y.reg); } /* Both operands are pointers having the same tag. Must do a diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp index c25aa9304d6f..42a86ed7fb60 100644 --- a/erts/emulator/beam/jit/x86/beam_asm.hpp +++ b/erts/emulator/beam/jit/x86/beam_asm.hpp @@ -855,30 +855,6 @@ struct BeamAssembler : public BeamAssemblerCommon { } } - /* Set the Z flag if Reg1 and Reg2 are definitely not equal based on their - * tags alone. (They may still be equal if both are immediates and all other - * bits are equal too.) 
*/ - void emit_is_unequal_based_on_tags(x86::Gp Reg1, x86::Gp Reg2) { - ASSERT(Reg1 != RET && Reg2 != RET); - emit_is_unequal_based_on_tags(Reg1, Reg2, RET); - } - - void emit_is_unequal_based_on_tags(x86::Gp Reg1, - x86::Gp Reg2, - const x86::Gp &spill) { - ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); - ERTS_CT_ASSERT((TAG_PRIMARY_LIST | TAG_PRIMARY_BOXED) == - TAG_PRIMARY_IMMED1); - a.mov(RETd, Reg1.r32()); - a.or_(RETd, Reg2.r32()); - a.and_(RETb, imm(_TAG_PRIMARY_MASK)); - - /* RET will be now be TAG_PRIMARY_IMMED1 if either one or both - * registers are immediates, or if one register is a list and the other - * a boxed. */ - a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); - } - /* * Generate the shortest instruction for setting a register to an immediate * value. May clear flags. @@ -1617,6 +1593,56 @@ class BeamModuleAssembler : public BeamAssembler, mov_arg(getArgRef(to), from); } } + + /* Set the Z flag if Reg1 and Reg2 are definitely not equal based + * on their tags alone. (They may still be equal if both are + * immediates and all other bits are equal too.) + * + * Clobbers RET. 
+ */ + void emit_is_unequal_based_on_tags(Label Unequal, + const ArgVal &Src1, + x86::Gp Reg1, + const ArgVal &Src2, + x86::Gp Reg2, + Distance dist = dLong) { + ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); + ERTS_CT_ASSERT((TAG_PRIMARY_LIST | TAG_PRIMARY_BOXED) == + TAG_PRIMARY_IMMED1); + + if (always_one_of(Src1)) { + emit_is_boxed(Unequal, Reg2, dist); + } else if (always_one_of(Src2)) { + emit_is_boxed(Unequal, Reg1, dist); + } else if (exact_type(Src1)) { + emit_is_cons(Unequal, Reg2, dist); + } else if (exact_type(Src2)) { + emit_is_cons(Unequal, Reg1, dist); + } else { + a.mov(RETd, Reg1.r32()); + a.or_(RETd, Reg2.r32()); + + if (never_one_of(Src1) || + never_one_of(Src2)) { + emit_is_boxed(Unequal, RET, dist); + } else if (never_one_of(Src1) || + never_one_of(Src2)) { + emit_is_cons(Unequal, RET, dist); + } else { + a.and_(RETb, imm(_TAG_PRIMARY_MASK)); + + /* RET will now be TAG_PRIMARY_IMMED1 if either one or + * both registers are immediates, or if one register + * is a list and the other a boxed. */ + a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + if (dist == dShort) { + a.short_().je(Unequal); + } else { + a.je(Unequal); + } + } + } + } }; void beamasm_metadata_update(std::string module_name, diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp index 97a7de2f094e..c773e98f58bd 100644 --- a/erts/emulator/beam/jit/x86/instr_common.cpp +++ b/erts/emulator/beam/jit/x86/instr_common.cpp @@ -1433,18 +1433,13 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, a.jne(resolve_beam_label(Fail)); } else if (always_same_types(X, Y)) { comment("skipped tag test since they are always equal"); - } else if (Y.isLiteral()) { - /* Fail immediately unless X is the same type of pointer as - * the literal Y. 
- */ - Eterm literal = beamfile_get_literal(beam, Y.as().get()); - Uint tag_test = _TAG_PRIMARY_MASK - (literal & _TAG_PRIMARY_MASK); - a.test(ARG1.r8(), imm(tag_test)); - a.jne(resolve_beam_label(Fail)); } else { /* Fail immediately if the pointer tags are not equal. */ - emit_is_unequal_based_on_tags(ARG1, ARG2); - a.je(resolve_beam_label(Fail)); + emit_is_unequal_based_on_tags(resolve_beam_label(Fail), + X, + ARG1, + Y, + ARG2); } /* Both operands are pointers having the same tag. Must do a @@ -1493,27 +1488,13 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, a.short_().jne(next); } else if (always_same_types(X, Y)) { comment("skipped tag test since they are always equal"); - } else if (Y.isLiteral()) { - /* Succeed immediately if X is not the same type of pointer as - * the literal Y. - */ - Eterm literal = beamfile_get_literal(beam, Y.as().get()); - Uint tag_test = _TAG_PRIMARY_MASK - (literal & _TAG_PRIMARY_MASK); - a.test(ARG1.r8(), imm(tag_test)); -#ifdef JIT_HARD_DEBUG - a.jne(next); -#else - a.short_().jne(next); -#endif } else { /* Test whether the terms are definitely unequal based on the tags * alone. */ - emit_is_unequal_based_on_tags(ARG1, ARG2); - #ifdef JIT_HARD_DEBUG - a.je(next); + emit_is_unequal_based_on_tags(next, X, ARG1, Y, ARG2); #else - a.short_().je(next); + emit_is_unequal_based_on_tags(next, X, ARG1, Y, ARG2, dShort); #endif } From c76f8c217933568d5a44aa6f9d1d2f9fd04a944a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sun, 22 Oct 2023 17:02:59 +0200 Subject: [PATCH 09/12] Optimize test for arithmetic equality Take better advantage of operand types. * If one operand is known to never be an immediate term, don't emit any test for immediate operands. (Just go ahead and call the helper fragment.) * Don't do any immediate test for known immediates. 
--- erts/emulator/beam/jit/arm/beam_asm.hpp | 25 ++++++++++++++------- erts/emulator/beam/jit/arm/instr_common.cpp | 22 +++++++++--------- erts/emulator/beam/jit/x86/beam_asm.hpp | 18 +++++++++++++++ erts/emulator/beam/jit/x86/instr_common.cpp | 20 +++++++---------- 4 files changed, 53 insertions(+), 32 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index f40bd6d3221a..415f114e9e8b 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -674,14 +674,6 @@ struct BeamAssembler : public BeamAssemblerCommon { } } - /* Set the Z flag if Reg1 and Reg2 are both immediates. */ - void emit_are_both_immediate(arm::Gp Reg1, arm::Gp Reg2) { - ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); - a.and_(SUPER_TMP, Reg1, Reg2); - a.and_(SUPER_TMP, SUPER_TMP, imm(_TAG_PRIMARY_MASK)); - a.cmp(SUPER_TMP, imm(TAG_PRIMARY_IMMED1)); - } - /* Set the Z flag if Reg1 and Reg2 are definitely not equal based * on their tags alone. (They may still be equal if both are * immediates and all other bits are equal too.) */ @@ -1818,6 +1810,23 @@ class BeamModuleAssembler : public BeamAssembler, } } } + + /* Set the Z flag if Reg1 and Reg2 are both immediates. 
*/ + void emit_are_both_immediate(const ArgVal &Src1, + arm::Gp Reg1, + const ArgVal &Src2, + arm::Gp Reg2) { + ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); + if (always_immediate(Src1)) { + a.and_(SUPER_TMP, Reg2, imm(_TAG_PRIMARY_MASK)); + } else if (always_immediate(Src2)) { + a.and_(SUPER_TMP, Reg1, imm(_TAG_PRIMARY_MASK)); + } else { + a.and_(SUPER_TMP, Reg1, Reg2); + a.and_(SUPER_TMP, SUPER_TMP, imm(_TAG_PRIMARY_MASK)); + } + a.cmp(SUPER_TMP, imm(TAG_PRIMARY_IMMED1)); + } }; void beamasm_metadata_update(std::string module_name, diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 2ae8918b8c87..e3b29a9ec6ee 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -1595,18 +1595,17 @@ void BeamModuleAssembler::emit_is_eq(const ArgLabel &Fail, const ArgSource &X, const ArgSource &Y) { Label next = a.newLabel(); - - auto x = load_source(X, ARG1); - auto y = load_source(Y, ARG2); + auto [x, y] = load_sources(X, ARG1, Y, ARG2); a.cmp(x.reg, y.reg); a.b_eq(next); - if (X.isLiteral() || Y.isLiteral()) { - comment("skipped test for small because one operand is never small"); + if (always_one_of(X) || + always_one_of(X)) { + comment("skipped test for immediate because one operand never is"); } else { /* We can skip deep comparisons when both args are immediates. 
*/ - emit_are_both_immediate(x.reg, y.reg); + emit_are_both_immediate(X, x.reg, Y, y.reg); a.b_eq(resolve_beam_label(Fail, disp1MB)); } @@ -1622,18 +1621,17 @@ void BeamModuleAssembler::emit_is_ne(const ArgLabel &Fail, const ArgSource &X, const ArgSource &Y) { Label next = a.newLabel(); - - auto x = load_source(X, ARG1); - auto y = load_source(Y, ARG2); + auto [x, y] = load_sources(X, ARG1, Y, ARG2); a.cmp(x.reg, y.reg); a.b_eq(resolve_beam_label(Fail, disp1MB)); - if (X.isLiteral() || Y.isLiteral()) { - comment("skipped test for small because one operand is never small"); + if (always_one_of(X) || + always_one_of(X)) { + comment("skipped test for immediate because one operand never is"); } else { /* We can skip deep comparisons when both args are immediates. */ - emit_are_both_immediate(x.reg, y.reg); + emit_are_both_immediate(X, x.reg, Y, y.reg); a.b_eq(next); } diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp index 42a86ed7fb60..eed39447a115 100644 --- a/erts/emulator/beam/jit/x86/beam_asm.hpp +++ b/erts/emulator/beam/jit/x86/beam_asm.hpp @@ -1643,6 +1643,24 @@ class BeamModuleAssembler : public BeamAssembler, } } } + + /* Set the Z flag if Reg1 and Reg2 are both immediates. 
*/ + void emit_are_both_immediate(const ArgVal &Src1, + x86::Gp Reg1, + const ArgVal &Src2, + x86::Gp Reg2) { + ERTS_CT_ASSERT(TAG_PRIMARY_IMMED1 == _TAG_PRIMARY_MASK); + if (always_immediate(Src1)) { + a.mov(RETd, Reg2.r32()); + } else if (always_immediate(Src2)) { + a.mov(RETd, Reg1.r32()); + } else { + a.mov(RETd, Reg1.r32()); + a.and_(RETd, Reg2.r32()); + } + a.and_(RETb, imm(_TAG_PRIMARY_MASK)); + a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + } }; void beamasm_metadata_update(std::string module_name, diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp index c773e98f58bd..78435f3513e7 100644 --- a/erts/emulator/beam/jit/x86/instr_common.cpp +++ b/erts/emulator/beam/jit/x86/instr_common.cpp @@ -1570,14 +1570,12 @@ void BeamModuleAssembler::emit_is_eq(const ArgLabel &Fail, a.cmp(ARG1, ARG2); a.short_().je(next); - if (A.isLiteral() || B.isLiteral()) { - comment("skipped test for small because one operand is never small"); + if (always_one_of(A) || + always_one_of(B)) { + comment("skipped test for immediate because one operand never is"); } else { /* We can skip deep comparisons when both args are immediates. */ - a.mov(RETd, ARG1d); - a.and_(RETd, ARG2d); - a.and_(RETb, imm(_TAG_PRIMARY_MASK)); - a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + emit_are_both_immediate(A, ARG1, B, ARG2); a.je(fail); } @@ -1597,14 +1595,12 @@ void BeamModuleAssembler::emit_is_ne(const ArgLabel &Fail, a.cmp(ARG1, ARG2); a.je(fail); - if (A.isLiteral() || B.isLiteral()) { - comment("skipped test for small because one operand is never small"); + if (always_one_of(A) || + always_one_of(B)) { + comment("skipped test for immediate because one operand never is"); } else { /* We can skip deep comparisons when both args are immediates. 
*/ - a.mov(RETd, ARG1d); - a.and_(RETd, ARG2d); - a.and_(RETb, imm(_TAG_PRIMARY_MASK)); - a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + emit_are_both_immediate(A, ARG1, B, ARG2); a.short_().je(next); } From 8cd93d05c32a7fb5d4551d9b0202571a585afac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sat, 25 Nov 2023 07:48:54 +0100 Subject: [PATCH 10/12] Optimize equality tests with known literals * Inline equality test with lists of a single immediate element (such as `[42]` or `[a]`). Call a specialized helper fragment for equality test with lists of two or more immediates. * Call a specialized helper fragment for matching tuples containing only immediates. Use the same helper fragment for matching bignums and floats. * Inline comparisons with empty binaries and empty maps. --- .../beam/jit/arm/beam_asm_global.hpp.pl | 2 + erts/emulator/beam/jit/arm/instr_common.cpp | 297 +++++++++++++++- erts/emulator/beam/jit/beam_jit_common.cpp | 29 ++ erts/emulator/beam/jit/beam_jit_common.hpp | 3 + .../beam/jit/x86/beam_asm_global.hpp.pl | 2 + erts/emulator/beam/jit/x86/instr_common.cpp | 327 +++++++++++++++++- erts/emulator/test/op_SUITE.erl | 7 +- 7 files changed, 637 insertions(+), 30 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl index 0f5f8446888e..1f1739973248 100644 --- a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl +++ b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl @@ -92,6 +92,8 @@ int128_to_big_shared int_div_rem_body_shared int_div_rem_guard_shared + is_eq_exact_list_shared + is_eq_exact_shallow_boxed_shared is_in_range_shared is_ge_lt_shared minus_body_shared diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index e3b29a9ec6ee..0d398d7af77f 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -63,6 +63,7 @@ extern "C" #include "beam_common.h" #include 
"code_ix.h" #include "erl_binary.h" +#include "erl_map.h" } using namespace asmjit; @@ -1482,11 +1483,183 @@ void BeamModuleAssembler::emit_i_test_arity(const ArgLabel &Fail, a.b_ne(resolve_beam_label(Fail, disp1MB)); } +/* + * ARG1 = First operand + * ARG2 = Literal list + * + * The result is returned in the Z flag. + */ +void BeamGlobalAssembler::emit_is_eq_exact_list_shared() { + Label loop = a.newLabel(), mid = a.newLabel(), done = a.newLabel(); + + a.b(mid); + + a.bind(loop); + emit_untag_ptr(ARG1, ARG1); + emit_untag_ptr(ARG2, ARG2); + a.ldp(TMP1, ARG1, arm::Mem(ARG1)); + a.ldp(TMP2, ARG2, arm::Mem(ARG2)); + a.cmp(TMP1, TMP2); + a.b_ne(done); + + a.bind(mid); + a.cmp(ARG1, ARG2); + a.b_eq(done); + + /* If not equal, both terms must be CONSes. */ +#if !defined(DEBUG) + ERTS_CT_ASSERT(!is_list(make_small(0) | make_list(0))); + ERTS_CT_ASSERT(!is_list(make_boxed(0) | make_list(0))); +#endif + a.orr(TMP1, ARG1, ARG2); + emit_is_not_cons(loop, TMP1); + + /* Not equal. Clear Z flag. */ + a.cmp(TMP1, imm(0)); + + a.bind(done); + a.ret(a64::x30); +} + +/* + * ARG1 = LHS + * ARG2 = RHS + * + * The result is returned in the Z flag. 
+ */ +void BeamGlobalAssembler::emit_is_eq_exact_shallow_boxed_shared() { + Label loop = a.newLabel(); + Label done = a.newLabel(); + Label not_equal = a.newLabel(); + + a.orr(TMP1, ARG1, ARG2); + emit_is_boxed(not_equal, TMP1); + + emit_untag_ptr(TMP1, ARG1); + a.ldr(TMP3, arm::Mem(TMP1)); + emit_untag_ptr(TMP2, ARG2); + a.lsr(ARG3, TMP3, imm(_HEADER_ARITY_OFFS)); + a.sub(ARG3, ARG3, imm(1)); + + a.bind(loop); + { + a.ldp(TMP3, TMP4, arm::Mem(TMP1).post(16)); + a.ldp(TMP5, TMP6, arm::Mem(TMP2).post(16)); + a.cmp(TMP3, TMP5); + a.ccmp(TMP4, TMP6, imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_ne(done); + + a.subs(ARG3, ARG3, imm(2)); + a.b_pl(loop); + } + + a.cmn(ARG3, imm(2)); + a.b_eq(done); + + a.ldr(TMP3, arm::Mem(TMP1)); + a.ldr(TMP5, arm::Mem(TMP2)); + a.cmp(TMP3, TMP5); + + a.bind(done); + a.ret(a64::x30); + + a.bind(not_equal); + a.cmp(TMP1, 0); + a.ret(a64::x30); +} + void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, const ArgSource &X, const ArgSource &Y) { auto x = load_source(X, ARG1); + if (Y.isLiteral()) { + Eterm literal = beamfile_get_literal(beam, Y.as().get()); + + if (is_list(literal) && is_immed(CAR(list_val(literal))) && + is_nil(CDR(list_val(literal)))) { + /* Inline the equality test if the RHS argument is a list + * of one immediate value such as `[42]` or `[a]`. 
*/ + arm::Gp cons_ptr; + + comment("inlined equality test with %T", literal); + if (!exact_type(X)) { + emit_is_cons(resolve_beam_label(Fail, dispUnknown), x.reg); + } + cons_ptr = emit_ptr_val(TMP1, x.reg); + a.sub(TMP1, cons_ptr, imm(TAG_PRIMARY_LIST)); + a.ldp(TMP2, TMP3, arm::Mem(TMP1)); + cmp(TMP2, CAR(list_val(literal))); + mov_imm(TMP4, NIL); + a.ccmp(TMP3, TMP4, imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + + return; + } else if (beam_jit_is_list_of_immediates(literal)) { + comment("optimized equality test with %T", literal); + mov_var(ARG1, x); + mov_arg(ARG2, Y); + fragment_call(ga->get_is_eq_exact_list_shared()); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + + return; + } else if (beam_jit_is_shallow_boxed(literal)) { + comment("optimized equality test with %T", literal); + mov_var(ARG1, x); + mov_arg(ARG2, Y); + fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + + return; + } else if (is_bitstring(literal) && bitstring_size(literal) == 0) { + Label not_sub_bits = a.newLabel(); + + comment("simplified equality test with empty binary"); + emit_is_boxed(resolve_beam_label(Fail, dispUnknown), X, x.reg); + emit_untag_ptr(ARG1, x.reg); + + ERTS_CT_ASSERT_FIELD_PAIR(ErlHeapBits, thing_word, size); + a.ldp(TMP1, TMP2, arm::Mem(ARG1)); + + a.cmp(TMP1, imm(HEADER_SUB_BITS)); + a.b_ne(not_sub_bits); + { + ERTS_CT_ASSERT_FIELD_PAIR(ErlSubBits, start, end); + a.ldp(TMP2, TMP3, arm::Mem(ARG1, offsetof(ErlSubBits, start))); + a.sub(TMP2, TMP3, TMP2); + } + a.bind(not_sub_bits); + + if (masked_types(X) == + BeamTypeId::Bitstring) { + comment("skipped header test since we know it's a bitstring " + "when boxed"); + a.cbnz(TMP2, resolve_beam_label(Fail, disp1MB)); + } else { + const auto mask = _BITSTRING_TAG_MASK & ~_TAG_PRIMARY_MASK; + ERTS_CT_ASSERT(TAG_PRIMARY_HEADER == 0); + ERTS_CT_ASSERT(_TAG_HEADER_HEAP_BITS == + (_TAG_HEADER_HEAP_BITS & mask)); + a.and_(TMP1, 
TMP1, imm(mask)); + a.cmp(TMP1, imm(_TAG_HEADER_HEAP_BITS)); + a.ccmp(TMP2, imm(0), imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } + + return; + } else if (is_map(literal) && erts_map_size(literal) == 0) { + comment("optimized equality test with empty map", literal); + emit_is_boxed(resolve_beam_label(Fail, dispUnknown), X, x.reg); + emit_untag_ptr(ARG1, x.reg); + a.ldp(TMP1, TMP2, arm::Mem(ARG1)); + cmp(TMP1, MAP_HEADER_FLATMAP); + a.ccmp(TMP2, imm(0), imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + + return; + } + } + /* If either argument is known to be an immediate, we can fail immediately * if they're not equal. */ if (always_immediate(X) || always_immediate(Y)) { @@ -1528,11 +1701,16 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, mov_var(ARG1, x); mov_var(ARG2, y); - emit_enter_runtime(); - runtime_call<2>(eq); - emit_leave_runtime(); - - a.cbz(ARG1.w(), resolve_beam_label(Fail, disp1MB)); + if (always_one_of(X) || + always_one_of(Y)) { + fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } else { + emit_enter_runtime(); + runtime_call<2>(eq); + emit_leave_runtime(); + a.cbz(ARG1.w(), resolve_beam_label(Fail, disp1MB)); + } a.bind(next); } @@ -1542,6 +1720,98 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, const ArgSource &Y) { auto x = load_source(X, ARG1); + if (Y.isLiteral()) { + Eterm literal = beamfile_get_literal(beam, Y.as().get()); + bool imm_list = beam_jit_is_list_of_immediates(literal); + + if (imm_list && erts_list_length(literal) == 1) { + arm::Gp cons_ptr; + Label next = a.newLabel(); + + /* Inline the equality test if the RHS argument is a list + * of one immediate value such as `[42]` or `[a]`. 
*/ + comment("inlined non-equality test with %T", literal); + if (!exact_type(X)) { + emit_is_cons(next, x.reg); + } + cons_ptr = emit_ptr_val(TMP1, x.reg); + a.sub(TMP1, cons_ptr, imm(TAG_PRIMARY_LIST)); + a.ldp(TMP2, TMP3, arm::Mem(TMP1)); + cmp(TMP2, CAR(list_val(literal))); + mov_imm(TMP4, NIL); + a.ccmp(TMP3, TMP4, imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + + a.bind(next); + + return; + } else if (imm_list) { + comment("optimized non-equality test with %T", literal); + mov_var(ARG1, x); + mov_arg(ARG2, Y); + fragment_call(ga->get_is_eq_exact_list_shared()); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + + return; + } else if (beam_jit_is_shallow_boxed(literal)) { + comment("optimized non-equality test with %T", literal); + mov_var(ARG1, x); + mov_arg(ARG2, Y); + fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + + return; + } else if (is_bitstring(literal) && bitstring_size(literal) == 0) { + Label next = a.newLabel(), not_sub_bits = a.newLabel(); + + comment("simplified non-equality test with empty binary"); + emit_is_boxed(next, X, x.reg); + emit_untag_ptr(ARG1, x.reg); + + ERTS_CT_ASSERT_FIELD_PAIR(ErlHeapBits, thing_word, size); + a.ldp(TMP1, TMP2, arm::Mem(ARG1)); + + a.cmp(TMP1, imm(HEADER_SUB_BITS)); + a.b_ne(not_sub_bits); + { + ERTS_CT_ASSERT_FIELD_PAIR(ErlSubBits, start, end); + a.ldp(TMP2, TMP3, arm::Mem(ARG1, offsetof(ErlSubBits, start))); + a.sub(TMP2, TMP3, TMP2); + } + a.bind(not_sub_bits); + + if (masked_types(X) == + BeamTypeId::Bitstring) { + comment("skipped header test since we know it's a bitstring " + "when boxed"); + a.cbz(TMP2, resolve_beam_label(Fail, disp1MB)); + } else { + const auto mask = _BITSTRING_TAG_MASK & ~_TAG_PRIMARY_MASK; + ERTS_CT_ASSERT(TAG_PRIMARY_HEADER == 0); + ERTS_CT_ASSERT(_TAG_HEADER_HEAP_BITS == + (_TAG_HEADER_HEAP_BITS & mask)); + a.and_(TMP1, TMP1, imm(mask)); + a.cmp(TMP1, imm(_TAG_HEADER_HEAP_BITS)); + 
a.ccmp(TMP2, imm(0), imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + } + + a.bind(next); + + return; + } else if (is_map(literal) && erts_map_size(literal) == 0) { + Label next = a.newLabel(); + + comment("optimized non-equality test with empty map", literal); + /* A non-boxed term can never be equal to the empty map. */ + emit_is_boxed(next, X, x.reg); + emit_untag_ptr(ARG1, x.reg); + a.ldp(TMP1, TMP2, arm::Mem(ARG1)); + cmp(TMP1, MAP_HEADER_FLATMAP); + a.ccmp(TMP2, imm(0), imm(NZCV::kNone), imm(arm::CondCode::kEQ)); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + + a.bind(next); + + return; + } + } + /* If either argument is known to be an immediate, we can fail immediately * if they're equal. */ if (always_immediate(X) || always_immediate(Y)) { @@ -1580,13 +1850,16 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, mov_var(ARG1, x); mov_var(ARG2, y); - emit_enter_runtime(); - - runtime_call<2>(eq); - - emit_leave_runtime(); - - a.cbnz(ARG1.w(), resolve_beam_label(Fail, disp1MB)); + if (always_one_of(X) || + always_one_of(Y)) { + fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + } else { + emit_enter_runtime(); + runtime_call<2>(eq); + emit_leave_runtime(); + a.cbnz(ARG1.w(), resolve_beam_label(Fail, disp1MB)); + } a.bind(next); } diff --git a/erts/emulator/beam/jit/beam_jit_common.cpp b/erts/emulator/beam/jit/beam_jit_common.cpp index a65fa547de02..7013f25c8d87 100644 --- a/erts/emulator/beam/jit/beam_jit_common.cpp +++ b/erts/emulator/beam/jit/beam_jit_common.cpp @@ -1357,3 +1357,32 @@ Export *beam_jit_handle_unloaded_fun(Process *c_p, return ep; } + +bool beam_jit_is_list_of_immediates(Eterm term) { + while (is_list(term)) { + Eterm *cons = list_val(term); + if (!is_immed(CAR(cons))) { + return false; + } + term = CDR(cons); + } + return is_nil(term); +} + +bool beam_jit_is_shallow_boxed(Eterm term) { + if (is_tuple(term)) { + Eterm *tuple_ptr = tuple_val(term); + for (unsigned i = 1; i <= arityval(*tuple_ptr); i++) { + if 
(!is_immed(tuple_ptr[i])) { + return false; + } + } + return true; + } else if (is_big(term)) { + return true; + } else if (is_float(term)) { + return true; + } else { + return false; + } +} diff --git a/erts/emulator/beam/jit/beam_jit_common.hpp b/erts/emulator/beam/jit/beam_jit_common.hpp index 95396a3b2c8e..94bb8aacd70f 100644 --- a/erts/emulator/beam/jit/beam_jit_common.hpp +++ b/erts/emulator/beam/jit/beam_jit_common.hpp @@ -653,4 +653,7 @@ Export *beam_jit_handle_unloaded_fun(Process *c_p, int arity, Eterm fun_thing); +bool beam_jit_is_list_of_immediates(Eterm term); +bool beam_jit_is_shallow_boxed(Eterm term); + #endif diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl index a77c05ea46fd..da86397e296c 100755 --- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl +++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl @@ -86,6 +86,8 @@ i_test_yield_shared int_div_rem_body_shared int_div_rem_guard_shared + is_eq_exact_list_shared + is_eq_exact_shallow_boxed_shared is_in_range_shared is_ge_lt_shared minus_body_shared diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp index 78435f3513e7..2fe6e17d3215 100644 --- a/erts/emulator/beam/jit/x86/instr_common.cpp +++ b/erts/emulator/beam/jit/x86/instr_common.cpp @@ -92,6 +92,7 @@ extern "C" #include "beam_common.h" #include "code_ix.h" #include "erl_binary.h" +#include "erl_map.h" } using namespace asmjit; @@ -1398,9 +1399,197 @@ void BeamModuleAssembler::emit_i_test_arity(const ArgLabel &Fail, a.jne(resolve_beam_label(Fail)); } +/* + * ARG1 = First operand + * ARG2 = Literal list + * + * The result is returned in the Z flag. 
+ */ +void BeamGlobalAssembler::emit_is_eq_exact_list_shared() { + Label loop = a.newLabel(), mid = a.newLabel(), done = a.newLabel(); + + a.short_().jmp(mid); + + a.bind(loop); + (void)emit_ptr_val(ARG1, ARG1); + (void)emit_ptr_val(ARG2, ARG2); + a.mov(RET, getCARRef(ARG1)); + a.mov(ARG1, getCDRRef(ARG1)); + a.cmp(getCARRef(ARG2), RET); + a.short_().jne(done); + + a.mov(ARG2, getCDRRef(ARG2)); + + a.bind(mid); + a.cmp(ARG1, ARG2); + a.short_().je(done); + + /* If not equal, both terms must be CONSes. */ +#if !defined(DEBUG) + ERTS_CT_ASSERT(!is_list(TAG_PRIMARY_LIST | TAG_PRIMARY_BOXED)); + ERTS_CT_ASSERT(!is_list(TAG_PRIMARY_LIST | TAG_PRIMARY_IMMED1)); +#endif + a.mov(RETd, ARG1d); + a.or_(RETd, ARG2d); + emit_is_not_cons(loop, RET); + + /* Not equal. Clear Z flag. */ + ERTS_CT_ASSERT(TAG_PRIMARY_HEADER == 0); + a.cmp(RETb, imm(0)); + + a.bind(done); + a.ret(); +} + +/* + * ARG1 = LHS + * ARG2 = RHS + * + * The result is returned in the Z flag. + */ +void BeamGlobalAssembler::emit_is_eq_exact_shallow_boxed_shared() { + Label loop = a.newLabel(); + Label done = a.newLabel(); + Label not_equal = a.newLabel(); + + a.mov(RETd, ARG1d); + a.or_(RETd, ARG2d); + emit_is_boxed(not_equal, RET); + + a.and_(ARG1, imm(~TAG_PTR_MASK__)); + a.and_(ARG2, imm(~TAG_PTR_MASK__)); + a.mov(ARG3, x86::qword_ptr(ARG1, 0)); + a.shr(ARG3, imm(_HEADER_ARITY_OFFS)); + a.dec(ARG3); + mov_imm(ARG4, 0); + + a.bind(loop); + { + if (hasCpuFeature(CpuFeatures::X86::kAVX)) { + a.vmovdqu(x86::xmm0, x86::xmmword_ptr(ARG1, ARG4)); + a.vpxor(x86::xmm0, x86::xmm0, x86::xmmword_ptr(ARG2, ARG4)); + a.vptest(x86::xmm0, x86::xmm0); + } else { + a.mov(RET, x86::qword_ptr(ARG1, ARG4)); + a.cmp(RET, x86::qword_ptr(ARG2, ARG4)); + a.short_().jne(done); + a.mov(RET, x86::qword_ptr(ARG1, ARG4, 0, sizeof(Eterm))); + a.cmp(RET, x86::qword_ptr(ARG2, ARG4, 0, sizeof(Eterm))); + } + a.short_().jne(done); + + a.add(ARG4, imm(2 * sizeof(Eterm))); + a.sub(ARG3, imm(2)); + a.jge(loop); + } + + a.cmp(ARG3.r8(), 
imm(-2)); + a.short_().je(done); + + a.mov(RET, x86::qword_ptr(ARG1, ARG4, 0)); + a.cmp(RET, x86::qword_ptr(ARG2, ARG4, 0)); + + a.bind(done); + a.ret(); + + a.bind(not_equal); + a.cmp(RETb, 0); + a.ret(); +} + void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, const ArgSource &X, const ArgSource &Y) { + if (Y.isLiteral()) { + Eterm literal = beamfile_get_literal(beam, Y.as().get()); + bool imm_list = beam_jit_is_list_of_immediates(literal); + + if (imm_list && erts_list_length(literal) == 1) { + Sint head = (Sint)CAR(list_val(literal)); + comment("optimized equality test with %T", literal); + + mov_arg(RET, X); + if (!exact_type(X)) { + emit_is_cons(resolve_beam_label(Fail), RET); + } + (void)emit_ptr_val(RET, RET); + if (Support::isInt32(head)) { + a.cmp(getCARRef(RET), imm(head)); + } else { + mov_imm(ARG1, head); + a.cmp(getCARRef(RET), ARG1); + } + a.jne(resolve_beam_label(Fail)); + a.cmp(getCDRRef(RET), imm(NIL)); + a.jne(resolve_beam_label(Fail)); + + return; + } else if (imm_list) { + comment("optimized equality test with %T", literal); + mov_arg(ARG2, Y); + mov_arg(ARG1, X); + safe_fragment_call(ga->get_is_eq_exact_list_shared()); + a.jne(resolve_beam_label(Fail)); + + return; + } else if (beam_jit_is_shallow_boxed(literal)) { + comment("optimized equality test with %T", literal); + mov_arg(ARG2, Y); + mov_arg(ARG1, X); + safe_fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.jne(resolve_beam_label(Fail)); + + return; + } else if (is_bitstring(literal) && bitstring_size(literal) == 0) { + comment("simplified equality test with empty bitstring"); + mov_arg(ARG2, X); + emit_is_boxed(resolve_beam_label(Fail), X, ARG2); + x86::Gp boxed_ptr = emit_ptr_val(ARG2, ARG2); + + ERTS_CT_ASSERT(offsetof(ErlHeapBits, size) == sizeof(Eterm)); + a.mov(ARG1, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + + Label not_sub_bits = a.newLabel(); + if (masked_types(X) == + BeamTypeId::Bitstring) { + a.cmp(emit_boxed_val(boxed_ptr), imm(HEADER_SUB_BITS)); 
+ } else { + a.mov(RETd, emit_boxed_val(boxed_ptr, 0, sizeof(Uint32))); + a.cmp(RETb, imm(HEADER_SUB_BITS)); + } + a.short_().jne(not_sub_bits); + + a.mov(ARG1, emit_boxed_val(boxed_ptr, offsetof(ErlSubBits, end))); + a.sub(ARG1, emit_boxed_val(boxed_ptr, offsetof(ErlSubBits, start))); + + a.bind(not_sub_bits); + if (masked_types(X) == + BeamTypeId::Bitstring) { + comment("skipped header test since we know it's a bitstring " + "when boxed"); + a.test(ARG1, ARG1); + } else { + a.and_(RETd, imm(_BITSTRING_TAG_MASK)); + a.sub(RETd, imm(_TAG_HEADER_HEAP_BITS)); + a.or_(RETd, ARG1d); + } + a.jne(resolve_beam_label(Fail)); + + return; + } else if (is_map(literal) && erts_map_size(literal) == 0) { + comment("optimized equality test with empty map", literal); + mov_arg(ARG1, X); + emit_is_boxed(resolve_beam_label(Fail), X, ARG1); + (void)emit_ptr_val(ARG1, ARG1); + a.cmp(emit_boxed_val(ARG1, 0, sizeof(Uint32)), MAP_HEADER_FLATMAP); + a.jne(resolve_beam_label(Fail)); + a.cmp(emit_boxed_val(ARG1, sizeof(Eterm), sizeof(Uint32)), imm(0)); + a.jne(resolve_beam_label(Fail)); + + return; + } + } + /* If one argument is known to be an immediate, we can fail * immediately if they're not equal. */ if (X.isRegister() && always_immediate(Y)) { @@ -1445,14 +1634,17 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, /* Both operands are pointers having the same tag. Must do a * deeper comparison. 
*/ - emit_enter_runtime(); - - runtime_call<2>(eq); - - emit_leave_runtime(); - - a.test(RETd, RETd); - a.je(resolve_beam_label(Fail)); + if (always_one_of(X) || + always_one_of(Y)) { + safe_fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.jne(resolve_beam_label(Fail)); + } else { + emit_enter_runtime(); + runtime_call<2>(eq); + emit_leave_runtime(); + a.test(RETd, RETd); + a.je(resolve_beam_label(Fail)); + } a.bind(next); } @@ -1460,6 +1652,106 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, const ArgSource &X, const ArgSource &Y) { + if (Y.isLiteral()) { + Eterm literal = beamfile_get_literal(beam, Y.as().get()); + bool imm_list = beam_jit_is_list_of_immediates(literal); + + if (imm_list && erts_list_length(literal) == 1) { + Sint head = (Sint)CAR(list_val(literal)); + Label next = a.newLabel(); + + comment("optimized non-equality test with %T", literal); + mov_arg(RET, X); + if (!exact_type(X)) { + emit_is_cons(next, RET, dShort); + } + (void)emit_ptr_val(RET, RET); + if (Support::isInt32(head)) { + a.cmp(getCARRef(RET), imm(head)); + } else { + mov_imm(ARG1, head); + a.cmp(getCARRef(RET), ARG1); + } + a.short_().jne(next); + a.cmp(getCDRRef(RET), imm(NIL)); + a.je(resolve_beam_label(Fail)); + + a.bind(next); + + return; + } else if (imm_list) { + comment("optimized non-equality test with %T", literal); + mov_arg(ARG2, Y); + mov_arg(ARG1, X); + safe_fragment_call(ga->get_is_eq_exact_list_shared()); + a.je(resolve_beam_label(Fail)); + + return; + } else if (beam_jit_is_shallow_boxed(literal)) { + comment("optimized non-equality test with %T", literal); + mov_arg(ARG2, Y); + mov_arg(ARG1, X); + safe_fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.je(resolve_beam_label(Fail)); + + return; + } else if (is_bitstring(literal) && bitstring_size(literal) == 0) { + Label next = a.newLabel(); + + comment("simplified non-equality test with empty bitstring"); + 
mov_arg(ARG2, X); + emit_is_boxed(next, X, ARG2, dShort); + x86::Gp boxed_ptr = emit_ptr_val(ARG2, ARG2); + + ERTS_CT_ASSERT(offsetof(ErlHeapBits, size) == sizeof(Eterm)); + a.mov(ARG1, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + + Label not_sub_bits = a.newLabel(); + if (masked_types(X) == + BeamTypeId::Bitstring) { + a.cmp(emit_boxed_val(boxed_ptr), imm(HEADER_SUB_BITS)); + } else { + a.mov(RETd, emit_boxed_val(boxed_ptr, 0, sizeof(Uint32))); + a.cmp(RETb, imm(HEADER_SUB_BITS)); + } + a.short_().jne(not_sub_bits); + + a.mov(ARG1, emit_boxed_val(boxed_ptr, offsetof(ErlSubBits, end))); + a.sub(ARG1, emit_boxed_val(boxed_ptr, offsetof(ErlSubBits, start))); + + a.bind(not_sub_bits); + if (masked_types(X) == + BeamTypeId::Bitstring) { + comment("skipped header test since we know it's a bitstring " + "when boxed"); + a.test(ARG1, ARG1); + } else { + a.and_(RETd, imm(_BITSTRING_TAG_MASK)); + a.sub(RETd, imm(_TAG_HEADER_HEAP_BITS)); + a.or_(RETd, ARG1d); + } + a.je(resolve_beam_label(Fail)); + + a.bind(next); + + return; + } else if (is_map(literal) && erts_map_size(literal) == 0) { + Label next = a.newLabel(); + + comment("optimized non-equality test with empty map", literal); + mov_arg(ARG1, X); + emit_is_boxed(next, X, ARG1, dShort); + (void)emit_ptr_val(ARG1, ARG1); + a.cmp(emit_boxed_val(ARG1, 0, sizeof(Uint32)), MAP_HEADER_FLATMAP); + a.short_().jne(next); + a.cmp(emit_boxed_val(ARG1, sizeof(Eterm), sizeof(Uint32)), imm(0)); + a.je(resolve_beam_label(Fail)); + + a.bind(next); + return; + } + } + /* If one argument is known to be an immediate, we can fail * immediately if they're equal. 
*/ if (X.isRegister() && always_immediate(Y)) { @@ -1498,14 +1790,17 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, #endif } - emit_enter_runtime(); - - runtime_call<2>(eq); - - emit_leave_runtime(); - - a.test(RETd, RETd); - a.jnz(resolve_beam_label(Fail)); + if (always_one_of(X) || + always_one_of(Y)) { + safe_fragment_call(ga->get_is_eq_exact_shallow_boxed_shared()); + a.jz(resolve_beam_label(Fail)); + } else { + emit_enter_runtime(); + runtime_call<2>(eq); + emit_leave_runtime(); + a.test(RETd, RETd); + a.jnz(resolve_beam_label(Fail)); + } a.bind(next); } diff --git a/erts/emulator/test/op_SUITE.erl b/erts/emulator/test/op_SUITE.erl index ef5bbccc27d0..5e10bbe26369 100644 --- a/erts/emulator/test/op_SUITE.erl +++ b/erts/emulator/test/op_SUITE.erl @@ -304,7 +304,7 @@ relop(Config) when is_list(Config) -> BitString = <<0:7>>, Map = #{a => b}, Vs0 = [a,b,-33,-33.0,0,0.0,-0,0,42,42.0,Big1,Big2,F1,F2, - Bin,BitString,Map], + Bin,BitString,Map,[16#1234_5678_abcd]], Vs = [unvalue(V) || V <- Vs0], Ops = ['==', '/=', '=:=', '=/=', '<', '=<', '>', '>='], binop(Ops, Vs). @@ -315,8 +315,11 @@ complex_relop(Config) when is_list(Config) -> Float = float(Big), Bin = <<"abc">>, BitString = <<0:7>>, + EmptyBitString = <<>>, Map = #{a => b}, - Vs0 = [an_atom,42.0,42,0.0,-0.0,Big,Float,Bin,BitString,Map], + EmptyMap = #{}, + Vs0 = [an_atom,42.0,42,0.0,-0.0,Big,Float,Bin,BitString, + EmptyBitString,Map,EmptyMap], Vs = flatmap(fun(X) -> [unvalue({X}),unvalue([X])] end, Vs0), Ops = ['==', '/=', '=:=', '=/=', '<', '=<', '>', '>='], binop(Ops, Vs). 
From 9699a0d3eb6d2c4fa07434c6a96aa2c06a562251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Fri, 15 Dec 2023 08:00:50 +0100 Subject: [PATCH 11/12] Optimize matching out a bitstring --- erts/emulator/beam/jit/arm/instr_bs.cpp | 20 ++++++--- erts/emulator/beam/jit/x86/instr_bs.cpp | 22 ++++++---- erts/emulator/test/bs_match_bin_SUITE.erl | 52 +++++++++++++++++++++-- 3 files changed, 76 insertions(+), 18 deletions(-) diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index f49188280811..4c714ef4aa2f 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -3730,9 +3730,6 @@ static std::vector opt_bsm_segments( case BsmSegment::action::GET_BITSTRING: if (seg.size > 64) { read_action_pos = -1; - } else if (seg.action == BsmSegment::action::GET_BITSTRING && - seg.size % 8 != 0) { - read_action_pos = -1; } else { if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || seg.size + segs.at(read_action_pos).size > 64) { @@ -4122,16 +4119,25 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, } case BsmSegment::action::GET_BITSTRING: { auto Live = seg.live; + ERTS_ASSERT(seg.size > 64); comment("get binary %ld", seg.size); auto ctx = load_source(Ctx, TMP1); + if (position_is_valid) { + a.mov(ARG5, bin_position); + } else { + a.ldur(ARG5, emit_boxed_val(ctx.reg, start_offset)); + } lea(ARG1, arm::Mem(c_p, offsetof(Process, htop))); - a.ldur(ARG2, emit_boxed_val(ctx.reg, orig_offset)); - a.and_(ARG3, ARG2, imm(~TAG_PTR_MASK__)); - a.and_(ARG2, ARG2, imm(TAG_PTR_MASK__)); + if (seg.size <= ERL_ONHEAP_BITS_LIMIT) { + comment("skipped setting registers not used for heap binary"); + } else { + a.ldur(ARG2, emit_boxed_val(ctx.reg, orig_offset)); + a.and_(ARG3, ARG2, imm(~TAG_PTR_MASK__)); + a.and_(ARG2, ARG2, imm(TAG_PTR_MASK__)); + } a.ldur(ARG4, emit_boxed_val(ctx.reg, base_offset)); a.and_(ARG4, ARG4, imm(~ERL_SUB_BITS_FLAG_MASK)); - 
a.ldur(ARG5, emit_boxed_val(ctx.reg, start_offset)); mov_imm(ARG6, seg.size); a.add(TMP2, ARG5, ARG6); a.stur(TMP2, emit_boxed_val(ctx.reg, start_offset)); diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp index fd3f32ba7e42..1fc2af02b5f0 100644 --- a/erts/emulator/beam/jit/x86/instr_bs.cpp +++ b/erts/emulator/beam/jit/x86/instr_bs.cpp @@ -3894,9 +3894,6 @@ static std::vector opt_bsm_segments( if (seg.size > 64) { read_action_pos = -1; - } else if (seg.action == BsmSegment::action::GET_BITSTRING && - seg.size % 8 != 0) { - read_action_pos = -1; } else if ((seg.flags & BSF_LITTLE) != 0 && is_common_size) { seg.action = BsmSegment::action::READ_INTEGER; read_action_pos = -1; @@ -4367,6 +4364,7 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, break; } case BsmSegment::action::GET_BITSTRING: { + ERTS_ASSERT(seg.size > 64); comment("get binary %ld", seg.size); if (is_ctx_valid) { a.mov(RET, ctx); @@ -4374,14 +4372,22 @@ void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, mov_arg(RET, Ctx); } emit_enter_runtime(); + if (is_position_valid) { + a.mov(ARG5, bin_position); + } else { + a.mov(ARG5, emit_boxed_val(RET, start_offset)); + } a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); - a.mov(ARG2, emit_boxed_val(RET, orig_offset)); - a.mov(ARG3, ARG2); - a.and_(ARG2, imm(TAG_PTR_MASK__)); - a.and_(ARG3, imm(~TAG_PTR_MASK__)); + if (seg.size <= ERL_ONHEAP_BITS_LIMIT) { + comment("skipped setting registers not used for heap binary"); + } else { + a.mov(ARG2, emit_boxed_val(RET, orig_offset)); + a.mov(ARG3, ARG2); + a.and_(ARG2, imm(TAG_PTR_MASK__)); + a.and_(ARG3, imm(~TAG_PTR_MASK__)); + } a.mov(ARG4, emit_boxed_val(RET, base_offset)); a.and_(ARG4, imm(~ERL_SUB_BITS_FLAG_MASK)); - a.mov(ARG5, emit_boxed_val(RET, start_offset)); mov_imm(ARG6, seg.size); a.add(emit_boxed_val(RET, start_offset), ARG6); diff --git a/erts/emulator/test/bs_match_bin_SUITE.erl 
b/erts/emulator/test/bs_match_bin_SUITE.erl index 0ea354b01ee1..a80592f1df24 100644 --- a/erts/emulator/test/bs_match_bin_SUITE.erl +++ b/erts/emulator/test/bs_match_bin_SUITE.erl @@ -24,7 +24,8 @@ init_per_group/2,end_per_group/2, byte_split_binary/1,bit_split_binary/1,match_huge_bin/1, bs_match_string_edge_case/1,contexts/1, - empty_binary/1,small_bitstring/1]). + empty_binary/1,small_bitstring/1, + known_position/1]). -include_lib("common_test/include/ct.hrl"). @@ -33,7 +34,7 @@ suite() -> [{ct_hooks,[ts_install_cth]}]. all() -> [byte_split_binary, bit_split_binary, match_huge_bin, bs_match_string_edge_case, contexts, empty_binary, - small_bitstring]. + small_bitstring,known_position]. groups() -> []. @@ -281,7 +282,9 @@ small_bitstring(_Config) -> %% heap space for small bitstrings. rand_seed(), Bin = rand:bytes(10_000), - ok = small_bitstring_1(id(Bin), id(Bin)). + ok = small_bitstring_1(id(Bin), id(Bin)), + ok = small_bitstring_2(id(Bin), id(7)), + ok = small_bitstring_3(id(Bin), id(64)). small_bitstring_1(<>, @@ -291,6 +294,49 @@ small_bitstring_1(<>, <<>>) -> ok. +small_bitstring_2(<<>>, _) -> + ok; +small_bitstring_2(Bin, N7) -> + %% Ensure that matching fixed sizes gives the same result as + %% matching dynamic sizes. + + <> = Bin, + <> = Bin, + + <> = Bin, + <> = Bin, + + small_bitstring_2(As, N7). + +small_bitstring_3(<<>>, _) -> + ok; +small_bitstring_3(Bin, N64) -> + %% Ensure that matching fixed sizes gives the same result as + %% matching dynamic sizes for larger sizes. + + <> = Bin, + <> = Bin, + + <> = Bin, + <> = Bin, + + <> = Bin, + <> = Bin, + + small_bitstring_3(As, N64). + +known_position(_Config) -> + %% Cover the case of an extracted bitstring having a known position. + <> = id(<<42:8,"abcdefghij">>), + 42 = Int, + <<"abcdefghi">> = BitString, + + ok. + %%% %%% Common utilities. 
%%% From 1e3099e0b936354d0a5d755e17b26d7952bedcce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Mon, 18 Dec 2023 05:46:20 +0100 Subject: [PATCH 12/12] Aarch64: Combine put_list + deallocate --- erts/emulator/beam/jit/arm/instr_common.cpp | 50 +++++++++++++++++++++ erts/emulator/beam/jit/arm/ops.tab | 4 ++ 2 files changed, 54 insertions(+) diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 0d398d7af77f..1ad65f19d7fa 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -713,6 +713,56 @@ void BeamModuleAssembler::emit_put_list(const ArgSource &Hd, flush_var(dst); } +void BeamModuleAssembler::emit_put_list_deallocate(const ArgSource &Hd, + const ArgSource &Tl, + const ArgRegister &Dst, + const ArgWord &Deallocate) { + Sint dealloc = Deallocate.get() * sizeof(Eterm); + arm::Gp hd_reg, tl_reg; + auto dst = init_destination(Dst, TMP3); + + ASSERT(dealloc <= 1023); + + if (Hd.isYRegister() && !Tl.isYRegister() && dealloc > 0) { + auto hd_index = Hd.as().get(); + + if (hd_index == 0) { + arm::Mem mem = getArgRef(Hd); + mem = arm::Mem(E).post(dealloc); + hd_reg = TMP1; + a.ldr(hd_reg, mem); + tl_reg = load_source(Tl, TMP2).reg; + dealloc = 0; + } + } else if (!Hd.isYRegister() && Tl.isYRegister() && dealloc > 0) { + auto tl_index = Tl.as().get(); + + if (tl_index == 0) { + arm::Mem mem = getArgRef(Tl); + mem = arm::Mem(E).post(dealloc); + tl_reg = TMP2; + a.ldr(tl_reg, mem); + hd_reg = load_source(Hd, TMP1).reg; + dealloc = 0; + } + } + + if (!hd_reg.isValid()) { + auto [hd, tl] = load_sources(Hd, TMP1, Tl, TMP2); + hd_reg = hd.reg; + tl_reg = tl.reg; + } + + a.stp(hd_reg, tl_reg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + a.sub(dst.reg, HTOP, imm(sizeof(Eterm[2]) - TAG_PRIMARY_LIST)); + + flush_var(dst); + + if (dealloc > 0) { + add(E, E, Deallocate.get() * sizeof(Eterm)); + } +} + void BeamModuleAssembler::emit_put_list2(const 
ArgSource &Hd1, const ArgSource &Hd2, const ArgSource &Tl, diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index a0e54b841b00..8411bc665e51 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -522,6 +522,10 @@ put_list Hd1=y Tl Dst | put_list Hd2=y Dst2 Dst3 | consecutive_words(Hd1, Hd2) => put_list2 Hd1 Hd2 Tl Dst +put_list Hd Tl Dst | deallocate N => + put_list_deallocate Hd Tl Dst N + +put_list_deallocate s s d t put_list s s d put_list2 s s s d