Skip to content

Commit

Permalink
Add an HIR function inliner
Browse files Browse the repository at this point in the history
Summary:
With this diff, we add a function inliner as an HIR pass. The structure of the
inliner pass is as follows: for each `VectorCall` instruction, if it can be
inlined, inline it. There is no interesting policy yet.

Inlining is similar to HIR building, except that inlined functions do not get
their own `Function`s and are constructed inside their caller's CFG. In
addition, `LoadArg` gets translated to `Assign` and `Return` to
`Assign`+`Branch`. For example, the following HIR:

```
# Final HIR (without inlining)
fun __main__:callee {
  bb 0 {
    v3:Object = LoadArg<0; "x">
    v5:MortalLongExact[1] = LoadConst<MortalLongExact[1]>
    v6:Object = BinaryOp<Add> v3 v5
    Return v6
  }
}

fun __main__:caller {
  bb 0 {
    v3:OptObject = LoadGlobalCached<0; "callee">
    v4:MortalFunc[function:0x7f379b5584b0] = GuardIs<0x7f379b5584b0> v3
    v5:MortalLongExact[3] = LoadConst<MortalLongExact[3]>
    v6:Object = VectorCall<1> v4 v5
    Return v6
  }
}
```

gets inlined to:

```
# Final HIR
fun __main__:caller {
  bb 0 {
    v3:OptObject = LoadGlobalCached<0; "callee">
    v4:MortalFunc[function:0x7f62660574b0] = GuardIs<0x7f62660574b0> v3
    v5:MortalLongExact[3] = LoadConst<MortalLongExact[3]>
    BeginInlinedFunction<__main__:callee>
    v13:MortalLongExact[1] = LoadConst<MortalLongExact[1]>
    v14:Object = BinaryOp<Add> v5 v13
    EndInlinedFunction
    Return v14
  }
}
```

The inliner does not inline functions that:

* Are not preloaded, if running in a multithreaded context
* Cannot be compiled by the JIT (`exec`, `locals`, etc)
* Have mismatched argument/parameter counts
* Have varargs
* Have varkeywords
* Have cellvars
* Have freevars
* Are generators or coroutines
* Have default arguments
* Have `*args`
* Have `**kwargs`

The inliner is currently disabled by default. This diff adds `-X
jit-enable-hir-inliner` and `PYTHONJITENABLEHIRINLINER` to enable it.

Reviewed By: mpage

Differential Revision: D30819582

fbshipit-source-id: 238980b
  • Loading branch information
tekknolagi authored and facebook-github-bot committed Mar 10, 2022
1 parent 6586370 commit f3c50b3
Show file tree
Hide file tree
Showing 23 changed files with 1,018 additions and 31 deletions.
1 change: 1 addition & 0 deletions Include/internal/pycore_shadow_frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ _PyShadowFrame_GetPyFrame(_PyShadowFrame *shadow_frame) {

int _PyShadowFrame_HasGen(_PyShadowFrame *shadow_frame);
PyGenObject *_PyShadowFrame_GetGen(_PyShadowFrame *shadow_frame);
void _PyShadowFrame_DumpStack(PyThreadState* state);

static inline uintptr_t _PyShadowFrame_MakeData(void *ptr,
_PyShadowFrame_PtrKind ptr_kind,
Expand Down
3 changes: 3 additions & 0 deletions Jit/codegen/environ.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ namespace codegen {
void Environ::addIPToBCMapping(
asmjit::Label label,
const jit::lir::Instruction* instr) {
// TODO(emacs): Support fetching code object and line number for inlined
// frames in the JIT.
return;
const jit::hir::Instr* origin = instr->origin();
if (origin == nullptr) {
// Origin might be null if we've parsed the LIR.
Expand Down
2 changes: 2 additions & 0 deletions Jit/codegen/environ.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ struct Environ {
int initial_yield_spill_size_{-1};

int max_arg_buffer_size{0};

bool has_inlined_functions{false};
};

} // namespace codegen
Expand Down
8 changes: 6 additions & 2 deletions Jit/codegen/gen_asm.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ class NativeGenerator {
deopt_trampoline_generators_(generateDeoptTrampoline(true)),
jit_trampoline_(generateJitTrampoline()),
frame_header_size_(calcFrameHeaderSize(func)),
max_inline_depth_(calcMaxInlineDepth(func)) {}
max_inline_depth_(calcMaxInlineDepth(func)) {
env_.has_inlined_functions = max_inline_depth_ > 0;
}

NativeGenerator(
const hir::Function* func,
Expand All @@ -51,7 +53,9 @@ class NativeGenerator {
deopt_trampoline_generators_(deopt_trampoline_generators),
jit_trampoline_(jit_trampoline),
frame_header_size_(calcFrameHeaderSize(func)),
max_inline_depth_(calcMaxInlineDepth(func)) {}
max_inline_depth_(calcMaxInlineDepth(func)) {
env_.has_inlined_functions = max_inline_depth_ > 0;
}

~NativeGenerator() {
if (as_ != nullptr) {
Expand Down
4 changes: 4 additions & 0 deletions Jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ void Compiler::runPasses(jit::hir::Function& irfunc) {
runPass<jit::hir::GuardTypeRemoval>(irfunc);
runPass<jit::hir::CallOptimization>(irfunc);
runPass<jit::hir::PhiElimination>(irfunc);
if (_PyJIT_IsHIRInlinerEnabled()) {
runPass<jit::hir::InlineFunctionCalls>(irfunc);
runPass<jit::hir::Simplify>(irfunc);
}
runPass<jit::hir::DeadCodeElimination>(irfunc);
runPass<jit::hir::RefcountInsertion>(irfunc);

Expand Down
2 changes: 1 addition & 1 deletion Jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class CompiledFunction {
DISALLOW_COPY_AND_ASSIGN(CompiledFunction);

vectorcallfunc entry_point_;
CodeRuntime* code_runtime_;
CodeRuntime* code_runtime_{nullptr};
int code_size_;
int stack_size_;
int spill_stack_size_;
Expand Down
11 changes: 10 additions & 1 deletion Jit/frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ void updatePyFrame(
// Interpreter is executing this frame; don't touch the PyFrameObject.
return;
}
// TODO(emacs): Support fetching code object and line number for inlined
// frames in the JIT.
return;
CodeRuntime* code_rt = getCodeRuntime(shadow_frame);
uintptr_t ip = getIP(tstate, shadow_frame, code_rt->frame_size());
std::optional<int> bc_off = code_rt->getBCOffForIP(ip);
Expand Down Expand Up @@ -233,9 +236,15 @@ Ref<PyFrameObject> createPyFrame(
gen->gi_frame = py_frame.get();
Py_INCREF(py_frame);
}
bool is_inlined_function =
_PyShadowFrame_GetPtrKind(shadow_frame) == PYSF_RTFS;
shadow_frame->data =
_PyShadowFrame_MakeData(py_frame, PYSF_PYFRAME, PYSF_JIT);
updatePyFrame(tstate, py_frame, shadow_frame);
if (!is_inlined_function) {
// TODO(emacs): Support fetching code object and line number for inlined
// frames in the JIT.
updatePyFrame(tstate, py_frame, shadow_frame);
}
return py_frame;
}

Expand Down
101 changes: 90 additions & 11 deletions Jit/hir/builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "Jit/hir/hir.h"
#include "Jit/hir/optimization.h"
#include "Jit/hir/preload.h"
#include "Jit/hir/ssa.h"
#include "Jit/hir/type.h"
#include "Jit/pyjit.h"
#include "Jit/ref.h"
Expand Down Expand Up @@ -251,7 +252,6 @@ void HIRBuilder::AllocateRegistersForCells(

// Holds the current state of translation for a given basic block
struct HIRBuilder::TranslationContext {
TranslationContext() = default;
TranslationContext(BasicBlock* b, const FrameState& fs)
: block(b), frame(fs) {}

Expand Down Expand Up @@ -536,7 +536,17 @@ std::unique_ptr<Function> HIRBuilder::buildHIR() {
}

std::unique_ptr<Function> irfunc = preloader_.makeFunction();
buildHIRImpl(irfunc.get(), /*frame_state=*/nullptr);
// Use RemoveTrampolineBlocks and RemoveUnreachableBlocks directly instead of
// Run because the rest of CleanCFG requires SSA.
CleanCFG::RemoveTrampolineBlocks(&irfunc->cfg);
CleanCFG::RemoveUnreachableBlocks(&irfunc->cfg);
return irfunc;
}

BasicBlock* HIRBuilder::buildHIRImpl(
Function* irfunc,
FrameState* frame_state) {
temps_ = TempAllocator(&irfunc->env);

BytecodeInstructionBlock bc_instrs{code_};
Expand All @@ -550,19 +560,30 @@ std::unique_ptr<Function> HIRBuilder::buildHIR() {
break;
}
}
irfunc->cfg.entry_block = entry_block;
if (frame_state == nullptr) {
// Function is not being inlined (irfunc matches code) so set the whole
// CFG's entry block.
irfunc->cfg.entry_block = entry_block;
}

// Insert LoadArg, LoadClosureCell, and MakeCell/MakeNullCell instructions
// for the entry block
TranslationContext entry_tc{
entry_block,
FrameState{code_, preloader_.globals(), preloader_.builtins()}};
FrameState{
code_,
preloader_.globals(),
preloader_.builtins(),
/*parent=*/frame_state}};
AllocateRegistersForLocals(&irfunc->env, entry_tc.frame);
AllocateRegistersForCells(&irfunc->env, entry_tc.frame);

addLoadArgs(entry_tc, irfunc->numArgs());
addLoadArgs(entry_tc, preloader_.numArgs());
Register* cur_func = nullptr;
if (irfunc->uses_runtime_func) {
// TODO(emacs): Check if the code object or preloader uses runtime func and
// drop the frame_state == nullptr check. Inlined functions should load a
// const instead of using LoadCurrentFunc.
if (frame_state == nullptr && irfunc->uses_runtime_func) {
cur_func = temps_.AllocateNonStack();
entry_tc.emit<LoadCurrentFunc>(cur_func);
}
Expand All @@ -584,12 +605,7 @@ std::unique_ptr<Function> HIRBuilder::buildHIR() {
entry_tc.block = first_block;
translate(*irfunc, bc_instrs, entry_tc);

// Use RemoveTrampolineBlocks and RemoveUnreachableBlocks directly instead of
// Run because the rest of CleanCFG requires SSA.
CleanCFG::RemoveTrampolineBlocks(&irfunc->cfg);
CleanCFG::RemoveUnreachableBlocks(&irfunc->cfg);

return irfunc;
return entry_block;
}

void HIRBuilder::emitProfiledTypes(
Expand Down Expand Up @@ -671,6 +687,69 @@ void HIRBuilder::emitProfiledTypes(
}
}

// Build the HIR for the callee (the function described by preloader_) inside
// `caller`'s CFG rather than in a Function of its own.
//
// caller_frame_state is handed to buildHIRImpl as the callee's starting
// FrameState. On success the result holds the callee's entry block and a
// single exit block ending in a Return of the merged return value. If the
// callee contains opcodes that cannot be translated, both blocks in the
// result are nullptr.
InlineResult HIRBuilder::inlineHIR(
    Function* caller,
    FrameState* caller_frame_state) {
  if (!can_translate(code_)) {
    JIT_DLOG("Can't translate all opcodes in %s", preloader_.fullname());
    return InlineResult{nullptr, nullptr};
  }
  BasicBlock* callee_entry = buildHIRImpl(caller, caller_frame_state);

  // Funnel every Return in the callee into one shared block: each Return
  // becomes "copy the value into merged_result, then branch to callee_exit".
  // Once in SSA form the copies collapse into a single (possibly large) Phi,
  // and the caller can locate the lone Return to use as the call's output.
  Register* merged_result = caller->env.AllocateRegister();
  BasicBlock* callee_exit = caller->cfg.AllocateBlock();
  callee_exit->append<Return>(merged_result);
  for (auto* bb : caller->cfg.GetRPOTraversal(callee_entry)) {
    auto* terminator = bb->GetTerminator();
    if (!terminator->IsReturn()) {
      continue;
    }
    auto* copy_result =
        Assign::create(merged_result, terminator->GetOperand(0));
    auto* jump_to_exit = Branch::create(callee_exit);
    terminator->ExpandInto({copy_result, jump_to_exit});
    delete terminator;
  }

  // Remember each FrameState's parent and then clear it. The inlined CFG has
  // to be fully detached from the caller while SSAify runs; otherwise SSAify
  // sees uses (through the parent FrameState) ahead of their defs and inserts
  // LoadConst<Nullptr>.
  std::unordered_map<FrameState*, FrameState*> detached_parents;
  for (BasicBlock* bb : caller->cfg.GetRPOTraversal(callee_entry)) {
    for (Instr& instr : *bb) {
      JIT_CHECK(
          !instr.IsBeginInlinedFunction(),
          "there should be no BeginInlinedFunction in inlined functions");
      JIT_CHECK(
          !instr.IsEndInlinedFunction(),
          "there should be no EndInlinedFunction in inlined functions");
      FrameState* fs = nullptr;
      if (auto deopt = dynamic_cast<DeoptBase*>(&instr)) {
        fs = deopt->frameState();
      } else if (auto snapshot = dynamic_cast<Snapshot*>(&instr)) {
        fs = snapshot->frameState();
      }
      if (fs != nullptr && fs->parent != nullptr) {
        bool inserted = detached_parents.emplace(fs, fs->parent).second;
        JIT_CHECK(
            inserted, "there should not be duplicate FrameState pointers");
        fs->parent = nullptr;
      }
    }
  }

  // The caller is already in SSA form and every HIR pass expects SSA input,
  // so convert the freshly built callee blocks as well.
  SSAify{}.Run(callee_entry, &caller->env);

  // Restore the parent pointers that were cleared above.
  for (const auto& saved : detached_parents) {
    saved.first->parent = saved.second;
  }

  return InlineResult{callee_entry, callee_exit};
}

void HIRBuilder::translate(
Function& irfunc,
const jit::BytecodeInstructionBlock& bc_instrs,
Expand Down
30 changes: 30 additions & 0 deletions Jit/hir/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,21 @@ std::unique_ptr<Function> buildHIR(BorrowedRef<PyFunctionObject> func);
// analysis.
std::unique_ptr<Function> buildHIR(const Preloader& preloader);

// Outcome of inlining a callee into a caller's CFG.
//
// Each Return in the callee terminates its block, which would leave a bunch
// of distinct exit blocks. Inlining turns those Returns into Branches to one
// Return block (`exit`), which the caller can transform into an Assign to the
// output register of the original call instruction. `entry` is the first
// block of the inlined callee.
//
// Check succeeded() before using either block.
struct InlineResult {
  BasicBlock* entry;
  BasicBlock* exit;

  // True only when both the entry and the exit block are set.
  bool succeeded() const {
    return !(entry == nullptr || exit == nullptr);
  }
};

class HIRBuilder {
public:
HIRBuilder(const Preloader& preloader)
Expand All @@ -112,9 +127,24 @@ class HIRBuilder {
// for failure.
std::unique_ptr<Function> buildHIR();

// Given the preloader for the callee (passed into the constructor),
// construct the CFG for the callee in the caller's CFG. Does not link the
// two CFGs, except for FrameState parent pointers. Use caller_frame_state
// as the starting FrameState for the callee.
//
// Use InlineResult::succeeded to check if inlining succeeded.
InlineResult inlineHIR(Function* caller, FrameState* caller_frame_state);

private:
DISALLOW_COPY_AND_ASSIGN(HIRBuilder);

// Used by buildHIR and inlineHIR.
// irfunc is the function being compiled or the caller function.
// frame_state should be nullptr if irfunc matches the preloader (not
// inlining) and non-nullptr otherwise (inlining).
// Returns the entry block.
BasicBlock* buildHIRImpl(Function* irfunc, FrameState* frame_state);

struct TranslationContext;
// Completes compilation of a finally block
using FinallyCompleter =
Expand Down
28 changes: 17 additions & 11 deletions Jit/hir/hir.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,6 @@ struct FrameState {
builtins = other.builtins;
return *this;
}
FrameState(
BorrowedRef<PyCodeObject> code,
BorrowedRef<PyDictObject> globals,
BorrowedRef<PyDictObject> builtins)
: code(code), globals(globals), builtins(builtins) {}
FrameState(
BorrowedRef<PyCodeObject> code,
BorrowedRef<PyDictObject> globals,
Expand All @@ -202,7 +197,7 @@ struct FrameState {
// If the function is inlined into another function, the depth at which it
// is inlined (nested function calls may be inlined). Starts at 1. If the
// function is not inlined, 0.
int inlineDepth() {
int inlineDepth() const {
int inline_depth = -1;
const FrameState* frame = this;
while (frame != nullptr) {
Expand Down Expand Up @@ -667,13 +662,15 @@ class Instr {

// Put `instr` in this instruction's place: `instr` is inserted before this
// instruction, is given this instruction's bytecode offset, and then this
// instruction is unlinked.
// NOTE(review): only unlink() is called here, so this instruction is
// presumably not destroyed and the caller still has to dispose of it -- confirm.
void ReplaceWith(Instr& instr) {
instr.InsertBefore(*this);
// Carry the bytecode offset over to the replacement.
instr.setBytecodeOffset(bytecodeOffset());
unlink();
}

void ExpandInto(const std::vector<Instr*>& expansion) {
Instr* last = this;
for (Instr* instr : expansion) {
instr->InsertAfter(*last);
instr->setBytecodeOffset(bytecodeOffset());
last = instr;
}
unlink();
Expand Down Expand Up @@ -1707,6 +1704,7 @@ class INSTR_CLASS(

class CheckBase : public DeoptBase {
protected:
// Used only for tests.
CheckBase(Opcode op) : DeoptBase(op) {
auto new_frame = std::make_unique<FrameState>();
setFrameState(std::move(new_frame));
Expand Down Expand Up @@ -1735,6 +1733,7 @@ DEFINE_SIMPLE_INSTR(CheckNeg, (TCInt), HasOutput, Operands<1>, CheckBase);

class CheckBaseWithName : public CheckBase {
protected:
// Used only for tests.
CheckBaseWithName(Opcode op, BorrowedRef<> name)
: CheckBase(op), name_(name) {}

Expand Down Expand Up @@ -1984,19 +1983,24 @@ class INSTR_CLASS(BeginInlinedFunction, (), Operands<0>), public InlineBase {
BeginInlinedFunction(
BorrowedRef<PyCodeObject> code,
BorrowedRef<PyObject> globals,
const FrameState& caller_state)
: InstrT(), code_(code), globals_(globals) {
caller_state_ = std::make_unique<FrameState>(caller_state);
std::unique_ptr<FrameState> caller_state,
const std::string& fullname)
: InstrT(), code_(code), globals_(globals), fullname_(fullname) {
caller_state_ = std::move(caller_state);
}

FrameState* callerFrameState() const {
const FrameState* callerFrameState() const {
return caller_state_.get();
}

// Returns the code object this instruction was constructed with, as a
// borrowed reference.
BorrowedRef<PyCodeObject> code() const {
return code_.get();
}

// Returns a copy of the `fullname` string this instruction was constructed
// with.
// NOTE(review): presumably the qualified name of the inlined function, as in
// `BeginInlinedFunction<__main__:callee>` -- confirm against the printer.
std::string fullname() const {
return fullname_;
}

// Returns the globals object this instruction was constructed with, as a
// borrowed reference.
BorrowedRef<PyObject> globals() const {
return globals_.get();
}
Expand All @@ -2009,9 +2013,11 @@ class INSTR_CLASS(BeginInlinedFunction, (), Operands<0>), public InlineBase {
// BeginInlinedFunction must own the FrameState that is used for building the
// linked list of FrameStates as well as its parent FrameState. The parent is
// originally owned by the Call instruction, but that gets destroyed.
std::unique_ptr<FrameState> caller_state_{nullptr};
// Used for printing.
BorrowedRef<PyCodeObject> code_;
BorrowedRef<PyObject> globals_;
std::unique_ptr<FrameState> caller_state_{nullptr};
std::string fullname_;
};

class INSTR_CLASS(EndInlinedFunction, (), Operands<0>), public InlineBase {
Expand Down
Loading

0 comments on commit f3c50b3

Please sign in to comment.