209 changes: 209 additions & 0 deletions Jit/hir/optimization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
#include "code.h"
#include "pycore_pystate.h"

#include "Jit/compiler.h"
#include "Jit/hir/analysis.h"
#include "Jit/hir/builder.h"
#include "Jit/hir/hir.h"
#include "Jit/hir/memory_effects.h"
#include "Jit/hir/printer.h"
#include "Jit/hir/ssa.h"
#include "Jit/jit_rt.h"
#include "Jit/pyjit.h"
#include "Jit/util.h"

#include <fmt/format.h>
Expand All @@ -34,6 +37,7 @@ PassRegistry::PassRegistry() {
addPass(CleanCFG::Factory);
addPass(DynamicComparisonElimination::Factory);
addPass(PhiElimination::Factory);
addPass(InlineFunctionCalls::Factory);
addPass(Simplify::Factory);
addPass(DeadCodeElimination::Factory);
addPass(GuardTypeRemoval::Factory);
Expand Down Expand Up @@ -714,5 +718,210 @@ void CleanCFG::Run(Function& irfunc) {
}
}

// Most of these checks are only temporary and do not in perpetuity prohibit
// inlining. They are here to simplify bringup of the inliner and can be
// treated as TODOs.
// Most of these checks are only temporary and do not in perpetuity prohibit
// inlining. They are here to simplify bringup of the inliner and can be
// treated as TODOs.
//
// Returns true if `func`, as invoked by `call_instr`, is currently eligible
// for inlining. `fullname` is the callee's qualified name, used only for
// debug logging.
static bool canInline(
    VectorCall* call_instr,
    PyFunctionObject* func,
    const std::string& fullname) {
  // Defaults (positional or keyword) would require argument-binding logic at
  // the call site that the inliner does not yet emit.
  if (func->func_defaults != nullptr) {
    JIT_DLOG("Can't inline %s because it has defaults", fullname);
    return false;
  }
  if (func->func_kwdefaults != nullptr) {
    JIT_DLOG("Can't inline %s because it has kwdefaults", fullname);
    return false;
  }
  PyCodeObject* code = reinterpret_cast<PyCodeObject*>(func->func_code);
  if (code->co_kwonlyargcount > 0) {
    JIT_DLOG("Can't inline %s because it has keyword-only args", fullname);
    return false;
  }
  if (code->co_flags & CO_VARARGS) {
    JIT_DLOG("Can't inline %s because it has varargs", fullname);
    return false;
  }
  if (code->co_flags & CO_VARKEYWORDS) {
    JIT_DLOG("Can't inline %s because it has varkwargs", fullname);
    return false;
  }
  // co_argcount is signed; the cast below is only safe when it is >= 0.
  JIT_DCHECK(code->co_argcount >= 0, "argcount must be non-negative");
  // Only exact positional-arity matches are supported for now.
  if (call_instr->numArgs() != static_cast<size_t>(code->co_argcount)) {
    JIT_DLOG(
        "Can't inline %s because it is called with mismatched arguments",
        fullname);
    return false;
  }
  if (code->co_flags & kCoFlagsAnyGenerator) {
    JIT_DLOG("Can't inline %s because it is a generator", fullname);
    return false;
  }
  // Cell and free variables would require closure state the inliner does not
  // materialize yet.
  Py_ssize_t ncellvars = PyTuple_GET_SIZE(code->co_cellvars);
  if (ncellvars > 0) {
    JIT_DLOG("Can't inline %s because it has cellvars", fullname);
    return false;
  }
  Py_ssize_t nfreevars = PyTuple_GET_SIZE(code->co_freevars);
  if (nfreevars > 0) {
    JIT_DLOG("Can't inline %s because it has freevars", fullname);
    return false;
  }
  // The inlined body has no PyFunctionObject of its own at runtime.
  if (usesRuntimeFunc(code)) {
    JIT_DLOG(
        "Can't inline %s because it needs runtime access to its "
        "PyFunctionObject",
        fullname);
    return false;
  }
  // During a multithreaded compile, Preloaders cannot be created on the fly;
  // the callee must have been preloaded up front (see inlineFunctionCall).
  if (g_threaded_compile_context.compileRunning() && !isPreloaded(func)) {
    JIT_DLOG(
        "Can't inline %s because multithreaded compile is enabled and the "
        "function is not preloaded",
        fullname);
    return false;
  }
  return true;
}

// Splice the HIR body of the function called by `call_instr` into `caller`,
// replacing the VectorCall. On any ineligibility this logs via JIT_DLOG and
// returns with the caller unchanged. On success, `call_instr` (and the
// callee's Return) are deleted and replaced with
// BeginInlinedFunction/EndInlinedFunction bracketing the callee's CFG.
void inlineFunctionCall(Function& caller, VectorCall* call_instr) {
  Register* target = call_instr->func();
  // Inlining requires the call target to be a compile-time constant function
  // object (a type with an exact value spec).
  if (!target->type().hasValueSpec(TFunc)) {
    JIT_DLOG(
        "Cannot inline non-function type %s (%s) into %s",
        target->type(),
        *target,
        caller.fullname);
    return;
  }
  PyObject* func_obj = target->type().objectSpec();
  JIT_CHECK(PyFunction_Check(func_obj), "Expected PyFunctionObject");
  PyFunctionObject* func = reinterpret_cast<PyFunctionObject*>(func_obj);
  PyCodeObject* code = reinterpret_cast<PyCodeObject*>(func->func_code);
  JIT_CHECK(PyCode_Check(code), "Expected PyCodeObject");
  PyObject* globals = func->func_globals;
  std::string fullname = funcFullname(func);
  // NOTE(review): globals is checked with PyDict_Check but builtins with
  // PyDict_CheckExact — presumably dict subclasses are acceptable only for
  // globals; confirm this asymmetry is intentional.
  if (!PyDict_Check(globals)) {
    JIT_DLOG(
        "Refusing to inline %s: globals is a %.200s, not a dict",
        fullname,
        Py_TYPE(globals)->tp_name);
    return;
  }
  PyObject* builtins = PyEval_GetBuiltins();
  if (!PyDict_CheckExact(builtins)) {
    JIT_DLOG(
        "Refusing to inline %s: builtins is a %.200s, not a dict",
        fullname,
        Py_TYPE(builtins)->tp_name);
    return;
  }
  if (!canInline(call_instr, func, fullname)) {
    JIT_DLOG("Cannot inline %s into %s", fullname, caller.fullname);
    return;
  }

  // Snapshot the caller's frame state at the call; BeginInlinedFunction takes
  // ownership of it below.
  auto caller_frame_state =
      std::make_unique<FrameState>(*call_instr->frameState());
  // Multi-threaded compilation must use an existing Preloader, whereas
  // single-threaded compilation can make Preloaders on the fly.
  InlineResult result;
  if (g_threaded_compile_context.compileRunning()) {
    HIRBuilder hir_builder(getPreloader(func));
    result = hir_builder.inlineHIR(&caller, caller_frame_state.get());
  } else {
    // This explicit temporary is necessary because HIRBuilder takes a const
    // reference and stores it and we need to make sure the target doesn't go
    // away.
    Preloader preloader(func);
    HIRBuilder hir_builder(preloader);
    result = hir_builder.inlineHIR(&caller, caller_frame_state.get());
  }
  // A null entry means the builder declined to (or could not) inline.
  if (result.entry == nullptr) {
    JIT_DLOG("Cannot inline %s into %s", fullname, caller.fullname);
    return;
  }
  auto begin_inlined_function = BeginInlinedFunction::create(
      code, globals, std::move(caller_frame_state), fullname);

  // Split the caller's block after the call: `head` ends with the call and
  // `tail` receives everything that followed it.
  BasicBlock* head = call_instr->block();
  BasicBlock* tail = head->splitAfter(*call_instr);
  // TODO(emacs): Emit a DeoptPatchpoint here to catch the case where someone
  // swaps out function.__code__.
  // VectorCall -> {BeginInlinedFunction, Branch to callee CFG}
  auto callee_branch = Branch::create(result.entry);
  call_instr->ExpandInto({begin_inlined_function, callee_branch});
  // Close the inlined frame as soon as control returns to the caller.
  tail->push_front(
      EndInlinedFunction::create(begin_inlined_function->inlineDepth()));

  // Transform LoadArg into Assign
  // The callee's argument loads become direct assignments from the call's
  // operands. The iterator is advanced before ReplaceWith/delete so removal
  // does not invalidate it.
  for (auto it = result.entry->begin(); it != result.entry->end();) {
    auto& instr = *it;
    ++it;

    if (instr.IsLoadArg()) {
      auto load_arg = static_cast<LoadArg*>(&instr);
      auto assign = Assign::create(
          instr.GetOutput(), call_instr->arg(load_arg->arg_idx()));
      instr.ReplaceWith(*assign);
      delete &instr;
    }
  }

  // Transform Return into Assign+Branch
  // The callee's single exit feeds the call's output register and branches
  // back to the split-off tail of the caller's block.
  auto return_instr = result.exit->GetTerminator();
  JIT_CHECK(
      return_instr->IsReturn(),
      "terminator from inlined function should be Return");
  auto assign =
      Assign::create(call_instr->GetOutput(), return_instr->GetOperand(0));
  auto return_branch = Branch::create(tail);
  return_instr->ExpandInto({assign, return_branch});
  delete return_instr;

  // The call has been fully expanded; it no longer belongs to any block.
  delete call_instr;
}

// Inline eligible VectorCall sites in `irfunc`, then clean up the CFG.
void InlineFunctionCalls::Run(Function& irfunc) {
  // Functions built from textual HIR in tests have no backing bytecode.
  if (irfunc.code == nullptr) {
    return;
  }
  // TODO(T109706798): Support inlining into generators
  if (irfunc.code->co_flags & kCoFlagsAnyGenerator) {
    JIT_DLOG(
        "Refusing to inline functions into %s: function is a generator",
        irfunc.fullname);
    return;
  }
  // Collect every candidate call site before rewriting anything: inlining
  // mutates the CFG, so discovery and transformation must not interleave.
  std::vector<Instr*> call_sites;
  for (auto& block : irfunc.cfg.blocks) {
    for (auto& instr : block) {
      // TODO(emacs): Support InvokeMethod, InvokeStaticFunction,
      // VectorCallStatic
      if (instr.IsVectorCall()) {
        call_sites.push_back(&instr);
      }
    }
  }
  if (call_sites.empty()) {
    return;
  }
  for (Instr* call_site : call_sites) {
    inlineFunctionCall(irfunc, static_cast<VectorCall*>(call_site));
    // Reflow after each inline so type information from the callee has
    // propagated before the next call site is processed.
    reflowTypes(irfunc);
  }
  // Inlining leaves some blocks unreachable, and the CFG is only valid again
  // once they are removed. That removal must wait until after the loop above:
  // deleting blocks mid-loop could free instructions still referenced by
  // `call_sites`.
  CopyPropagation{}.Run(irfunc);
  CleanCFG{}.Run(irfunc);
}

} // namespace hir
} // namespace jit
11 changes: 11 additions & 0 deletions Jit/hir/optimization.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,17 @@ class CleanCFG : public Pass {
}
};

// Pass that replaces eligible VectorCall instructions with the callee's HIR
// body, bracketed by BeginInlinedFunction/EndInlinedFunction.
class InlineFunctionCalls : public Pass {
 public:
  InlineFunctionCalls() : Pass("InlineFunctionCalls") {}

  void Run(Function& irfunc) override;

  // Used by PassRegistry to construct the pass.
  static std::unique_ptr<InlineFunctionCalls> Factory() {
    return std::make_unique<InlineFunctionCalls>();
  }
};

class PassRegistry {
public:
PassRegistry();
Expand Down
1 change: 1 addition & 0 deletions Jit/hir/preload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ void Preloader::preload() {
// invalidated and freed; we just do this here for the side effect to
// make sure the cached value has been loaded and any side effects of
// loading it have been exercised.
JIT_CHECK(name != nullptr, "name cannot be null");
getGlobalCache(name);
global_names_.emplace(name_idx, name);
}
Expand Down
10 changes: 10 additions & 0 deletions Jit/hir/preload.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,16 @@ class Preloader {
return return_type_;
}

int numArgs() const {
if (code_ == nullptr) {
// code_ might be null if we parsed from textual ir
return 0;
}
return code_->co_argcount + code_->co_kwonlyargcount +
bool(code_->co_flags & CO_VARARGS) +
bool(code_->co_flags & CO_VARKEYWORDS);
}

private:
BorrowedRef<> constArg(BytecodeInstruction& bc_instr) const;
GlobalCache getGlobalCache(BorrowedRef<> name) const;
Expand Down
3 changes: 2 additions & 1 deletion Jit/hir/printer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,6 @@ static std::string format_immediates(const Instr& instr) {
switch (instr.opcode()) {
case Opcode::kAssign:
case Opcode::kBatchDecref:
case Opcode::kBeginInlinedFunction:
case Opcode::kBuildString:
case Opcode::kCheckExc:
case Opcode::kCheckNeg:
Expand Down Expand Up @@ -286,6 +285,8 @@ static std::string format_immediates(const Instr& instr) {
case Opcode::kXIncref: {
return "";
}
case Opcode::kBeginInlinedFunction:
return static_cast<const BeginInlinedFunction&>(instr).fullname();
case Opcode::kLoadArrayItem: {
const auto& load = static_cast<const LoadArrayItem&>(instr);
return load.offset() == 0 ? "" : fmt::format("Offset[{}]", load.offset());
Expand Down
134 changes: 131 additions & 3 deletions Jit/lir/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
#include "Python.h"
#include "internal/pycore_pyerrors.h"
#include "internal/pycore_pystate.h"
#include "internal/pycore_shadow_frame.h"
#include "listobject.h"

#include "Jit/codegen/x86_64.h"
#include "Jit/deopt.h"
#include "Jit/frame.h"
#include "Jit/hir/analysis.h"
#include "Jit/jit_rt.h"
#include "Jit/lir/block_builder.h"
Expand Down Expand Up @@ -635,6 +637,14 @@ static void emitSubclassCheck(

#undef FOREACH_FAST_BUILTIN

// Byte offset (relative to the native frame base, growing downward) of the
// shadow-frame slot belonging to the caller of `instr`'s inlined function.
static ssize_t shadowFrameOffsetBefore(const InlineBase* instr) {
  ssize_t depth = instr->inlineDepth();
  return depth * -ssize_t{kShadowFrameSize};
}

// Byte offset of `instr`'s own shadow-frame slot: one full shadow frame
// below the caller's slot.
static ssize_t shadowFrameOffsetOf(const InlineBase* instr) {
  ssize_t caller_offset = shadowFrameOffsetBefore(instr);
  return caller_offset - ssize_t{kShadowFrameSize};
}

LIRGenerator::TranslatedBlock LIRGenerator::TranslateOneBasicBlock(
const hir::BasicBlock* hir_bb) {
BasicBlockBuilder bbb(env_, lir_func_);
Expand Down Expand Up @@ -2471,9 +2481,127 @@ LIRGenerator::TranslatedBlock LIRGenerator::TranslateOneBasicBlock(
// HintTypes are purely informative
break;
}
case Opcode::kBeginInlinedFunction:
case Opcode::kEndInlinedFunction:
JIT_CHECK(false, "not emitted yet");
case Opcode::kBeginInlinedFunction: {
// TODO(T109706798): Support calling from generators and inlining
// generators.
// TODO(emacs): Link all shadow frame prev pointers in function
// prologue, since they need not happen with every call -- just the
// data pointers need to be reset with every call.
// TODO(emacs): If we manage to optimize leaf calls to a series of
// non-deopting instructions, remove BeginInlinedFunction and
// EndInlinedFunction completely.
if (py_debug) {
bbb.AppendCode(
"Call {}, {:#x}, __asm_tstate",
GetSafeTempName(),
reinterpret_cast<uint64_t>(assertShadowCallStackConsistent));
}
auto instr = static_cast<const BeginInlinedFunction*>(&i);
auto caller_shadow_frame = GetSafeTempName();
bbb.AppendCode(
"Lea {}, __native_frame_base, {}",
caller_shadow_frame,
shadowFrameOffsetBefore(instr));
// There is already a shadow frame for the caller function.
auto callee_shadow_frame = GetSafeTempName();
bbb.AppendCode(
"Lea {}, __native_frame_base, {}",
callee_shadow_frame,
shadowFrameOffsetOf(instr));
bbb.AppendCode(
"Store {}, {}, {}",
caller_shadow_frame,
callee_shadow_frame,
SHADOW_FRAME_FIELD_OFF(prev));
// Set code object data
PyCodeObject* code = instr->code();
env_->code_rt->addReference(reinterpret_cast<PyObject*>(code));
PyObject* globals = instr->globals();
env_->code_rt->addReference(reinterpret_cast<PyObject*>(globals));
RuntimeFrameState* rtfs =
env_->code_rt->allocateRuntimeFrameState(code, globals);
uintptr_t data = _PyShadowFrame_MakeData(rtfs, PYSF_RTFS, PYSF_JIT);
auto data_reg = GetSafeTempName();
bbb.AppendCode("Move {}, {:#x}", data_reg, data);
bbb.AppendCode(
"Store {}, {}, {}",
data_reg,
callee_shadow_frame,
SHADOW_FRAME_FIELD_OFF(data));
// Set our shadow frame as top of shadow stack
bbb.AppendCode(
"Store {}, __asm_tstate, {}",
callee_shadow_frame,
offsetof(PyThreadState, shadow_frame));
if (py_debug) {
bbb.AppendCode(
"Call {}, {:#x}, __asm_tstate",
GetSafeTempName(),
reinterpret_cast<uint64_t>(assertShadowCallStackConsistent));
}
break;
}
case Opcode::kEndInlinedFunction: {
// TODO(T109706798): Support calling from generators and inlining
// generators.
if (py_debug) {
bbb.AppendCode(
"Call {}, {:#x}, __asm_tstate",
GetSafeTempName(),
reinterpret_cast<uint64_t>(assertShadowCallStackConsistent));
}
// callee_shadow_frame <- tstate.shadow_frame
auto callee_shadow_frame = GetSafeTempName();
bbb.AppendCode(
"Load {}, __asm_tstate, {}",
callee_shadow_frame,
offsetof(PyThreadState, shadow_frame));

// Check if the callee has been materialized into a PyFrame. Use the
// flags below.
static_assert(
PYSF_PYFRAME == 1 && _PyShadowFrame_NumPtrKindBits == 2,
"Unexpected constants");
auto shadow_frame_data = GetSafeTempName();
bbb.AppendCode(
"Load {}, {}, {}",
shadow_frame_data,
callee_shadow_frame,
SHADOW_FRAME_FIELD_OFF(data));
bbb.AppendCode("BitTest {}, 0", shadow_frame_data);

// caller_shadow_frame <- callee_shadow_frame.prev
auto caller_shadow_frame = GetSafeTempName();
bbb.AppendCode(
"Load {}, {}, {}",
caller_shadow_frame,
callee_shadow_frame,
SHADOW_FRAME_FIELD_OFF(prev));
// caller_shadow_frame -> tstate.shadow_frame
bbb.AppendCode(
"Store {}, __asm_tstate, {}",
caller_shadow_frame,
offsetof(PyThreadState, shadow_frame));

// Unlink PyFrame if needed. Someone might have materialized all of the
// PyFrames via PyEval_GetFrame or similar.
auto done = GetSafeLabelName();
bbb.AppendCode("BranchNC {}", done);
// TODO(T109445584): Remove this unused label.
bbb.AppendCode("{}:", GetSafeLabelName());
bbb.AppendCode(
"Call {}, {}, __asm_tstate",
GetSafeTempName(),
reinterpret_cast<uint64_t>(JITRT_UnlinkFrame));
bbb.AppendCode("{}:", done);
if (py_debug) {
bbb.AppendCode(
"Call {}, {:#x}, __asm_tstate",
GetSafeTempName(),
reinterpret_cast<uint64_t>(assertShadowCallStackConsistent));
}
break;
}
case Opcode::kIsTruthy: {
auto func = reinterpret_cast<uint64_t>(&PyObject_IsTrue);
bbb.AppendCode(
Expand Down
34 changes: 34 additions & 0 deletions Jit/pyjit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct JitConfig {
size_t batch_compile_workers{0};
int multithreaded_compile_test{0};
bool use_huge_pages{true};
int hir_inliner_enabled{0};
};
static JitConfig jit_config;

Expand Down Expand Up @@ -103,6 +104,20 @@ static std::unordered_map<BorrowedRef<PyCodeObject>, CodeData> jit_code_data;
// Every unit has an entry in preloaders if we are doing multithreaded compile.
static std::unordered_map<BorrowedRef<>, hir::Preloader> jit_preloaders;

namespace jit {
// Whether `func` itself has an entry in the preloader table.
bool isPreloaded(BorrowedRef<PyFunctionObject> func) {
  return jit_preloaders.count(func) != 0;
}

// Fetch the Preloader for `func`, looking up the function object first and
// falling back to its code object. map_get_strict presumably aborts if the
// code-object entry is also missing — a preloader must exist.
const jit::hir::Preloader& getPreloader(BorrowedRef<PyFunctionObject> func) {
  auto it = jit_preloaders.find(func);
  if (it == jit_preloaders.end()) {
    return map_get_strict(jit_preloaders, func->func_code);
  }
  return it->second;
}
} // namespace jit

// Strong references to every function and code object that were ever
// registered, to keep them alive for batch testing.
static std::vector<Ref<>> test_multithreaded_units;
Expand Down Expand Up @@ -461,6 +476,15 @@ void initFlagProcessor() {
},
"JIT list match line numbers");

xarg_flag_processor.addOption(
"jit-enable-hir-inliner",
"PYTHONJITENABLEHIRINLINER",
[](string) {
JIT_DLOG("Enabling the HIR inliner");
_PyJIT_EnableHIRInliner();
},
"Enable the JIT's HIR inliner");

xarg_flag_processor.addOption(
"jit-help", "", jit_help, "print all available JIT flags and exits");
}
Expand Down Expand Up @@ -1186,6 +1210,8 @@ int _PyJIT_Initialize() {
jit_config.init_state = JIT_INITIALIZED;
jit_config.is_enabled = 1;
g_jit_list = jit_list.release();
// Unconditionally set this, since we might have shadow frames from
// CO_SHADOW_FRAME or inlined functions.
_PyThreadState_GetFrame =
reinterpret_cast<PyThreadFrameGetter>(materializeShadowCallStack);

Expand All @@ -1211,6 +1237,14 @@ int _PyJIT_AreTypeSlotsEnabled() {
jit_config.are_type_slots_enabled;
}

// Enable the HIR inliner for subsequent compilations. Process-global and
// sticky: there is no corresponding disable call. Reached via the
// jit-enable-hir-inliner flag (see initFlagProcessor).
void _PyJIT_EnableHIRInliner() {
  jit_config.hir_inliner_enabled = 1;
}

// Returns 1 if the HIR inliner has been enabled, 0 otherwise.
int _PyJIT_IsHIRInlinerEnabled() {
  return jit_config.hir_inliner_enabled;
}

int _PyJIT_Enable() {
if (jit_config.init_state != JIT_INITIALIZED) {
return 0;
Expand Down
22 changes: 22 additions & 0 deletions Jit/pyjit.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
#include "Jit/pyjit_result.h"
#include "Jit/pyjit_typeslots.h"

#ifdef __cplusplus
#include "Jit/hir/preload.h"
#endif

// Offset of the state field in jit::GenFooterData for fast access from C code.
// This value is verified by static_assert in runtime.h.
#define _PY_GEN_JIT_DATA_STATE_OFFSET 24
Expand Down Expand Up @@ -83,6 +87,16 @@ PyAPI_FUNC(int) _PyJIT_EnableTypeSlots(void);
*/
PyAPI_FUNC(int) _PyJIT_AreTypeSlotsEnabled(void);

/*
Enable the HIR inliner.
*/
PyAPI_FUNC(void) _PyJIT_EnableHIRInliner(void);

/*
* Returns 1 if the HIR inliner is enabled and 0 otherwise.
*/
PyAPI_FUNC(int) _PyJIT_IsHIRInlinerEnabled(void);

/*
* JITs slot functions for the type object, and handles setting up
* deoptimization support for the type. The caller provides the type object and
Expand Down Expand Up @@ -321,4 +335,12 @@ PyAPI_FUNC(void) _PyJIT_TypeModified(PyTypeObject* type);
bool _PyJIT_UseHugePages();
} /* extern "C" */
#endif

#ifdef __cplusplus
namespace jit {
bool isPreloaded(BorrowedRef<PyFunctionObject> func);
const hir::Preloader& getPreloader(BorrowedRef<PyFunctionObject> func);
} // namespace jit
#endif

#endif /* Py_LIMITED_API */
2 changes: 1 addition & 1 deletion Jit/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ typedef struct _GenDataFooter {
PyGenObject* gen;

// JIT metadata for associated code object
CodeRuntime* code_rt;
CodeRuntime* code_rt{nullptr};
} GenDataFooter;

// The state field needs to be at a fixed offset so it can be quickly accessed
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_cinderjit.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def jit_suppress(func):
return func


@unittest.skip("Temporarily disabled until line numbers work with the inliner")
class GetFrameLineNumberTests(unittest.TestCase):
def assert_code_and_lineno(self, frame, func, lineno):
self.assertEqual(frame.f_code, func.__code__)
Expand Down
17 changes: 17 additions & 0 deletions RuntimeTests/deopt_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,23 @@ def test(n):
runTest(src, args, 1, result);
}

// Stress deopt with the HIR inliner enabled: `bar` should be inlined into
// `test`, and test(10) must still produce 11 under forced deopt. The Python
// bodies below must be indented — as previously written, the source rendered
// without indentation would be a SyntaxError.
// NOTE(review): _PyJIT_EnableHIRInliner flips a process-global flag that
// remains set for subsequently-run tests.
TEST_F(DeoptStressTest, Inliner) {
  const char* src = R"(
def bar(n):
    return n + 1
def test(n):
    res = 0
    res += bar(n)
    return res
)";
  auto arg1 = Ref<>::steal(PyLong_FromLong(10));
  PyObject* args[] = {arg1};
  // Expected: test(10) == bar(10) == 11.
  auto result = Ref<>::steal(PyLong_FromLong(11));
  _PyJIT_EnableHIRInliner();
  runTest(src, args, 1, result);
}

using DeoptTest = RuntimeTest;

TEST_F(DeoptTest, ValueKind) {
Expand Down
414 changes: 414 additions & 0 deletions RuntimeTests/hir_tests/inliner_test.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions RuntimeTests/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ int main(int argc, char* argv[]) {
"RuntimeTests/hir_tests/hir_builder_static_test.txt",
HIRTest::kCompileStatic);
register_test("RuntimeTests/hir_tests/guard_type_removal_test.txt");
register_test("RuntimeTests/hir_tests/inliner_test.txt");
register_test("RuntimeTests/hir_tests/phi_elimination_test.txt");
register_test("RuntimeTests/hir_tests/refcount_insertion_test.txt");
register_test(
Expand Down