AArch64: Enhance the effectiveness of the register cache

365224b introduced a cache to keep track of BEAM registers
that had already been loaded into CPU registers.

This commit further improves the effectiveness of the cache. Now
21,193 loads of BEAM registers are avoided, compared to 15,098 loads
before this commit. (Measured when loading all modules in OTP and the
standard library of Elixir 1.16.0.)

Of those load instructions, 8,975 were no longer needed because the
contents of the BEAM register were already present in the desired CPU
register. Before this commit, only 1,861 load instructions were
eliminated.

The remaining load instructions are replaced with `mov` instructions
(from register to register), which are more efficient than loads but
do not reduce the code size.

As an example, the following BEAM code:

    {test,is_nonempty_list,{f,3},[{y,0}]}.
    {get_hd,{y,0},{x,0}}.

would be translated to native code like so:

    # is_nonempty_list_fS
        ldr x8, [x20]
        tbnz x8, 1, @label_3-1
    # get_hd_Sd
        ldr x8, [x20]
        ldur x25, [x8, -1]

That is, `{y,0}` would be loaded into a CPU register twice.

The improved caching avoids reloading `{y,0}` in the `get_hd`
instruction:

    # is_nonempty_list_fS
        ldr x8, [x20]
        tbnz x8, 1, @label_3-1
    # get_hd_Sd
    # skipped fetching of BEAM register
        ldur x25, [x8, -1]
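
The idea behind the cache can be modeled in a few lines of C++: each
BEAM register slot (such as `{y,0}`) is remembered together with the
CPU register that currently holds its value, and the mapping is only
trusted as long as no code has been emitted since it was last updated.
The following is a toy sketch of that idea only, not the real
RegisterCache class used in the diff below (whose interface and
implementation differ):

    #include <cstdio>
    #include <map>
    #include <string>

    // Toy model: maps a BEAM register slot to the CPU register that
    // holds its value. The mapping is only trusted while `offset`
    // matches the code offset recorded by the last update(); emitting
    // any unrelated instruction makes the cache stale.
    struct ToyRegisterCache {
        std::map<std::string, std::string> slots;
        size_t valid_offset = 0;

        void put(const std::string &slot, const std::string &reg) {
            slots[slot] = reg;
        }
        std::string find(size_t offset, const std::string &slot) {
            if (offset != valid_offset) {
                return ""; // stale: code was emitted since update()
            }
            auto it = slots.find(slot);
            return it == slots.end() ? "" : it->second;
        }
        void update(size_t offset) { valid_offset = offset; }
        void invalidate() { slots.clear(); } // e.g. at every label
    };

    int main() {
        ToyRegisterCache cache;
        size_t offset = 0;

        // is_nonempty_list_fS: ldr x8, [x20] loads {y,0} into x8.
        offset += 4;
        cache.put("y0", "x8");
        cache.update(offset);

        // get_hd_Sd also wants {y,0} in x8.
        std::string hit = cache.find(offset, "y0");
        if (hit == "x8") {
            std::puts("# skipped fetching of BEAM register");
        } else if (!hit.empty()) {
            std::printf("mov x8, %s\n", hit.c_str()); // simplified fetch
        } else {
            std::puts("ldr x8, [x20]"); // not cached: full load
        }
    }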
bjorng committed Jan 26, 2024
Parent: f3d80ab. Commit: 06d402a.
Showing 10 changed files with 304 additions and 207 deletions.
erts/emulator/beam/jit/arm/beam_asm.hpp: 169 additions, 104 deletions
@@ -958,98 +958,95 @@ class BeamModuleAssembler : public BeamAssembler,
      * fragments as if they were local. */
     std::unordered_map<void (*)(), Label> _dispatchTable;

-    /* Skip unnecessary moves in load_source(), load_sources(), and
-     * mov_arg(). Don't use these variables directly. */
-    size_t last_destination_offset = 0;
-    a64::Gp last_destination_from1, last_destination_from2;
-    arm::Mem last_destination_to1, last_destination_to2;
+    RegisterCache<16, arm::Mem, a64::Gp> reg_cache =
+            RegisterCache<16, arm::Mem, a64::Gp>(scheduler_registers, E, {});

     /* Private helper. */
-    void preserve__cache(a64::Gp dst) {
-        last_destination_offset = a.offset();
-        invalidate_cache(dst);
+    void reg_cache_put(arm::Mem mem, a64::Gp src) {
+        if (src != SUPER_TMP) {
+            reg_cache.put(mem, src);
+        }
     }

-    bool is_cache_valid() {
-        return a.offset() == last_destination_offset;
+    a64::Gp find_cache(arm::Mem mem) {
+        return reg_cache.find(a.offset(), mem);
     }

     /* Works as the STR instruction, but also updates the cache. */
-    void str_cache(a64::Gp src, arm::Mem dst) {
-        if (a.offset() == last_destination_offset &&
-            dst != last_destination_to1) {
-            /* Something is already cached in the first slot. Use the
-             * second slot. */
-            a.str(src, dst);
-            last_destination_offset = a.offset();
-            last_destination_to2 = dst;
-            last_destination_from2 = src;
-        } else {
-            /* Nothing cached yet, or the first slot has the same
-             * memory address as we will store into. Use the first
-             * slot and invalidate the second slot. */
-            a.str(src, dst);
-            last_destination_offset = a.offset();
-            last_destination_to1 = dst;
-            last_destination_from1 = src;
-            last_destination_to2 = arm::Mem();
-        }
+    void str_cache(a64::Gp src, arm::Mem mem_dst) {
+        reg_cache.consolidate(a.offset());
+        reg_cache.invalidate(src);
+
+        a.str(src, mem_dst);
+
+        reg_cache_put(mem_dst, src);
+        reg_cache.update(a.offset());
     }

     /* Works as the STP instruction, but also updates the cache. */
-    void stp_cache(a64::Gp src1, a64::Gp src2, arm::Mem dst) {
-        safe_stp(src1, src2, dst);
-        last_destination_offset = a.offset();
-        last_destination_to1 = dst;
-        last_destination_from1 = src1;
-        last_destination_to2 =
-                arm::Mem(a64::GpX(dst.baseId()), dst.offset() + 8);
-        last_destination_from2 = src2;
-    }
-
-    void invalidate_cache(a64::Gp dst) {
-        if (dst == last_destination_from1) {
-            last_destination_to1 = arm::Mem();
-            last_destination_from1 = a64::Gp();
-        }
-        if (dst == last_destination_from2) {
-            last_destination_to2 = arm::Mem();
-            last_destination_from2 = a64::Gp();
-        }
+    void stp_cache(a64::Gp src1, a64::Gp src2, arm::Mem mem_dst) {
+        arm::Mem next_dst =
+                arm::Mem(a64::GpX(mem_dst.baseId()), mem_dst.offset() + 8);
+
+        reg_cache.consolidate(a.offset());
+
+        reg_cache.invalidate(src1);
+        reg_cache.invalidate(src2);
+
+        safe_stp(src1, src2, mem_dst);
+
+        reg_cache_put(mem_dst, src1);
+        reg_cache_put(next_dst, src2);
+
+        reg_cache.update(a.offset());
     }

     /* Works like LDR, but looks in the cache first. */
     void ldr_cached(a64::Gp dst, arm::Mem mem) {
-        if (a.offset() == last_destination_offset) {
-            a64::Gp cached_reg;
-            if (mem == last_destination_to1) {
-                cached_reg = last_destination_from1;
-            } else if (mem == last_destination_to2) {
-                cached_reg = last_destination_from2;
-            }
+        a64::Gp cached_reg = find_cache(mem);

-            if (cached_reg.isValid()) {
-                /* This memory location is cached. */
-                if (cached_reg != dst) {
-                    comment("simplified fetching of BEAM register");
-                    a.mov(dst, cached_reg);
-                    preserve__cache(dst);
-                } else {
-                    comment("skipped fetching of BEAM register");
-                    invalidate_cache(dst);
-                }
+        if (cached_reg.isValid()) {
+            /* This memory location is cached. */
+            if (cached_reg == dst) {
+                comment("skipped fetching of BEAM register");
             } else {
-                /* Not cached. Load and preserve the cache. */
-                a.ldr(dst, mem);
-                preserve__cache(dst);
+                comment("simplified fetching of BEAM register");
+                a.mov(dst, cached_reg);
+                reg_cache.invalidate(dst);
+                reg_cache.update(a.offset());
             }
         } else {
-            /* The cache is invalid. */
+            /* Not cached. Load and update cache. */
             a.ldr(dst, mem);
-            last_destination_offset = a.offset();
-            last_destination_to1 = mem;
-            last_destination_from1 = dst;
-            last_destination_to2 = arm::Mem();
+            reg_cache.invalidate(dst);
+            reg_cache_put(mem, dst);
+            reg_cache.update(a.offset());
         }
     }

+    template<typename L, typename... Any>
+    void preserve_cache(L generate, Any... clobber) {
+        bool valid = reg_cache.validAt(a.offset());
+
+        generate();
+
+        if (valid) {
+            if (sizeof...(clobber) > 0) {
+                reg_cache.invalidate(clobber...);
+            }
+
+            reg_cache.update(a.offset());
+        }
+    }
+
+    void trim_preserve_cache(const ArgWord &Words) {
+        if (Words.get() > 0) {
+            ASSERT(Words.get() <= 1023);
+
+            preserve_cache([&]() {
+                auto offset = Words.get() * sizeof(Eterm);
+                add(E, E, offset);
+                reg_cache.trim_yregs(-offset);
+            });
+        }
+    }

@@ -1058,21 +1055,19 @@
     }

     void mov_preserve_cache(a64::Gp dst, a64::Gp src) {
-        if (a.offset() == last_destination_offset) {
-            a.mov(dst, src);
-            preserve__cache(dst);
-        } else {
-            a.mov(dst, src);
-        }
+        preserve_cache(
+                [&]() {
+                    a.mov(dst, src);
+                },
+                dst);
     }

-    void mov_imm_preserve_cache(a64::Gp dst, UWord value) {
-        if (a.offset() == last_destination_offset) {
-            mov_imm(dst, value);
-            preserve__cache(dst);
-        } else {
-            mov_imm(dst, value);
-        }
+    void untag_ptr_preserve_cache(a64::Gp dst, a64::Gp src) {
+        preserve_cache(
+                [&]() {
+                    emit_untag_ptr(dst, src);
+                },
+                dst);
     }

     arm::Mem embed_label(const Label &label, enum Displacement disp);
@@ -1150,8 +1145,31 @@
     a64::Gp emit_call_fun(bool skip_box_test = false,
                           bool skip_header_test = false);

+    void emit_is_cons(Label Fail, a64::Gp Src) {
+        preserve_cache([&]() {
+            BeamAssembler::emit_is_cons(Fail, Src);
+        });
+    }
+
+    void emit_is_not_cons(Label Fail, a64::Gp Src) {
+        preserve_cache([&]() {
+            BeamAssembler::emit_is_not_cons(Fail, Src);
+        });
+    }
+
+    void emit_is_list(Label Fail, a64::Gp Src) {
+        preserve_cache([&]() {
+            a.tst(Src, imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST));
+            a.mov(SUPER_TMP, NIL);
+            a.ccmp(Src, SUPER_TMP, imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+            a.b_ne(Fail);
+        });
+    }
+
     void emit_is_boxed(Label Fail, a64::Gp Src) {
-        BeamAssembler::emit_is_boxed(Fail, Src);
+        preserve_cache([&]() {
+            BeamAssembler::emit_is_boxed(Fail, Src);
+        });
     }

     void emit_is_boxed(Label Fail, const ArgVal &Arg, a64::Gp Src) {
@@ -1160,7 +1178,9 @@
             return;
         }

-        BeamAssembler::emit_is_boxed(Fail, Src);
+        preserve_cache([&]() {
+            BeamAssembler::emit_is_boxed(Fail, Src);
+        });
     }

     /* Copies `count` words from the address at `from`, to the address at `to`.
@@ -1446,13 +1466,17 @@
         ASSERT(tmp.isGpX());

         if (arg.isLiteral()) {
-            a.ldr(tmp, embed_constant(arg, disp32K));
+            preserve_cache(
+                    [&]() {
+                        a.ldr(tmp, embed_constant(arg, disp32K));
+                    },
+                    tmp);
             return Variable(tmp);
         } else if (arg.isRegister()) {
             if (isRegisterBacked(arg)) {
                 auto index = arg.as<ArgXRegister>().get();
                 a64::Gp reg = register_backed_xregs[index];
-                invalidate_cache(reg);
+                reg_cache.invalidate(reg);
                 return Variable(reg);
             }

@@ -1465,16 +1489,55 @@
                                          : arg.as<ArgWord>().get();

             if (Support::isIntOrUInt32(val)) {
-                mov_imm_preserve_cache(tmp, val);
+                preserve_cache(
+                        [&]() {
+                            mov_imm(tmp, val);
+                        },
+                        tmp);
                 return Variable(tmp);
             }
         }

-        a.ldr(tmp, embed_constant(arg, disp32K));
+        preserve_cache(
+                [&]() {
+                    a.ldr(tmp, embed_constant(arg, disp32K));
+                },
+                tmp);
         return Variable(tmp);
     }
 }

+    /*
+     * Load the argument into ANY register, using the
+     * cache to avoid reloading the value.
+     *
+     * Because it is not possible to predict into which register
+     * the value will end up, the following code is UNSAFE:
+     *
+     *     auto src = load_source(Src);
+     *     a.tst(src.reg, ...);
+     *     a.mov(TMP2, NIL);
+     *     a.ccmp(src.reg, TMP2, ..., ...);
+     *
+     * If the value of Src happens to end up in TMP2, it will be
+     * overwritten before its second use.
+     *
+     * Basically, the only safe way to use this function is when the
+     * register is used immediately and only once. For example:
+     *
+     *     a.and_(TMP1, load_source(Src), imm(...));
+     *     a.cmp(TMP1, imm(...));
+     */
+    Variable<a64::Gp> load_source(const ArgVal &arg) {
+        a64::Gp cached_reg = find_cache(getArgRef(arg));
+
+        if (cached_reg.isValid()) {
+            return load_source(arg, cached_reg);
+        } else {
+            return load_source(arg, TMP1);
+        }
+    }
+
     auto load_sources(const ArgVal &Src1,
                       a64::Gp tmp1,
                       const ArgVal &Src2,
@@ -1559,6 +1622,8 @@
     void flush_var(const Variable<a64::Gp> &to) {
         if (to.mem.hasBase()) {
             str_cache(to.reg, to.mem);
+        } else {
+            reg_cache.invalidate(to.reg);
         }
     }

@@ -1707,11 +1772,11 @@
         ASSERT(gp.isGpX());

         if (abs_offset <= sizeof(Eterm) * MAX_LDR_STR_DISPLACEMENT) {
-            bool valid_cache = is_cache_valid();
-            a.ldr(gp, mem);
-            if (valid_cache) {
-                preserve__cache(gp);
-            }
+            preserve_cache(
+                    [&]() {
+                        a.ldr(gp, mem);
+                    },
+                    gp);
         } else {
             add(SUPER_TMP, a64::GpX(mem.baseId()), offset);
             a.ldr(gp, arm::Mem(SUPER_TMP));
@@ -1751,12 +1816,12 @@
         ASSERT(gp1 != gp2);

         if (abs_offset <= sizeof(Eterm) * MAX_LDP_STP_DISPLACEMENT) {
-            bool valid_cache = is_cache_valid();
-            a.ldp(gp1, gp2, mem);
-            if (valid_cache) {
-                preserve__cache(gp1);
-                preserve__cache(gp2);
-            }
+            preserve_cache(
+                    [&]() {
+                        a.ldp(gp1, gp2, mem);
+                    },
+                    gp1,
+                    gp2);
         } else if (abs_offset < sizeof(Eterm) * MAX_LDR_STR_DISPLACEMENT) {
             /* Note that we used `<` instead of `<=`, as we're loading two
              * elements rather than one. */
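Most of the mechanical changes in beam_asm.hpp above follow a single
pattern: the code emission is wrapped in a lambda passed to
preserve_cache(), together with the CPU registers that the emitted
code clobbers. A hypothetical emitter written in the same style (this
helper is not part of the commit; it only illustrates the pattern):

    /* Hypothetical example in the style of mov_preserve_cache() above.
     * The lambda performs the emission; the trailing argument names the
     * register the emission clobbers, so only that cache entry is
     * invalidated and the rest of the cache stays valid. */
    void add_preserve_cache(a64::Gp dst, a64::Gp src1, a64::Gp src2) {
        preserve_cache(
                [&]() {
                    a.add(dst, src1, src2);
                },
                dst);
    }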
erts/emulator/beam/jit/arm/beam_asm_module.cpp: 1 addition, 1 deletion
@@ -367,7 +367,7 @@ void BeamModuleAssembler::emit_label(const ArgLabel &Label) {
     current_label = rawLabels[Label.get()];
     bind_veneer_target(current_label);

-    last_destination_offset = ~0;
+    reg_cache.invalidate();
 }

 void BeamModuleAssembler::emit_aligned_label(const ArgLabel &Label,
