Skip to content

Commit

Permalink
Python snapshots: Only load dynamic libraries that are needed
Browse files Browse the repository at this point in the history
Rather than actually preloading all libraries, we just preallocate space for
them. There is a function called `getMemory` that determines the location of the
dynamic libraries and nothing else so we can patch this to ensure that the
libraries get allocated in their dedicated location if they are loaded at all.
This allows us to ensure that their metadata always lands in the right spot.

We still have to make sure to load all the libraries in the correct order. There
is also the possibility that someone could use ctypes and mess everything up.
Though ctypes doesn't work with our snapshots before this PR. I guess it could
be fixed by patching libffi to record the trampoline address and function
table slot into the DSO_METADATA and then recreate them the same way when
restoring the snapshot. We'd also need to record the function table base for
each loaded library. Once we do all this, we should be safe again...
  • Loading branch information
hoodmane committed Mar 27, 2024
1 parent ffcadab commit 9b639a0
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 24 deletions.
6 changes: 5 additions & 1 deletion src/pyodide/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ copy_file(
# TODO: all of these should be fixed by linking our own Pyodide or by upstreaming.

PRELUDE = """
import { newWasmModule, monotonicDateNow, wasmInstantiate, getRandomValues } from "pyodide-internal:builtin_wrappers";
import { newWasmModule, monotonicDateNow, wasmInstantiate, getRandomValues, patchedGetMemory } from "pyodide-internal:builtin_wrappers";
// Pyodide uses `new URL(some_url, location)` to resolve the path in `loadPackage`. Setting
// `location = undefined` makes this throw an error if some_url is not an absolute url. Which is what
Expand Down Expand Up @@ -124,6 +124,10 @@ REPLACEMENTS = [
[
"reportUndefinedSymbols()",
"reportUndefinedSymbolsNoOp()"
],
[
"getMemory(",
"patchedGetMemory(libName,"
]
]

Expand Down
8 changes: 8 additions & 0 deletions src/pyodide/internal/builtin_wrappers.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { default as UnsafeEval } from "internal:unsafe-eval";
import { DSO_METADATA } from "pyodide-internal:metadata";

let lastTime;
let lastDelta = 0;
Expand Down Expand Up @@ -129,3 +130,10 @@ export async function wasmInstantiate(module, imports) {
const instance = new WebAssembly.Instance(module, imports);
return { module, instance };
}

/**
 * Replacement for Emscripten's `getMemory` used while loading a dynamic
 * library. Instead of allocating fresh memory it returns the address that was
 * preallocated for this library's dso metadata, so the metadata lands at the
 * same location whether the library is loaded before or after a snapshot.
 * As a side effect, records `path` in `DSO_METADATA.settings.loadedLibs` so a
 * later snapshot restore knows which libraries were loaded, and in what order.
 *
 * @param {string} path - Absolute path of the dynamic library being loaded.
 * @returns {number} Pointer to the preallocated metadata region.
 * @throws {Error} If no metadata region was preallocated for `path`.
 */
export function patchedGetMemory(path) {
  if (DSO_METADATA?.settings?.loadedLibs) {
    DSO_METADATA.settings.loadedLibs.push(path);
  }
  const entry = DSO_METADATA[path];
  if (!entry) {
    // Previously this fell through to an opaque "cannot read properties of
    // undefined" TypeError; fail with a message that names the library.
    throw new Error(`No preallocated dso metadata for dynamic library: ${path}`);
  }
  return entry.metadataPtr;
}
9 changes: 9 additions & 0 deletions src/pyodide/internal/metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,12 @@ export const MEMORY_SNAPSHOT_READER = MetadataReader.hasMemorySnapshot()
: ArtifactBundler.hasMemorySnapshot()
? ArtifactBundler
: undefined;
/**
 * Shared mutable record of dynamic-library state that gets serialized into
 * the memory snapshot and restored from it. Per-library entries (keyed by
 * library path) hold the library's dlopen `handles` and the preallocated
 * `metadataPtr`. `settings.loadedLibs` lists, in load order, the libraries
 * that were actually loaded; `settings.baselineSnapshot` is set when the
 * snapshot being created is a baseline snapshot.
 */
export const DSO_METADATA = {
  settings: {
    loadedLibs: [],
  },
};
103 changes: 80 additions & 23 deletions src/pyodide/internal/python.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import processScriptImports from "pyodide-internal:process_script_imports.py";
import {
MEMORY_SNAPSHOT_READER,
IS_CREATING_BASELINE_SNAPSHOT,
DSO_METADATA,
} from "pyodide-internal:metadata";
import { reportError, simpleRunPython } from "pyodide-internal:util";

Expand Down Expand Up @@ -54,11 +55,6 @@ const SHOULD_UPLOAD_SNAPSHOT =
let READ_MEMORY = undefined;
let SNAPSHOT_SIZE = undefined;

/**
* Record the dlopen handles that are needed by the MEMORY.
*/
let DSO_METADATA = {};

/**
* Used to defer artifact upload. This is set during initialisation, but is executed during a
* request because an IO context is needed for the upload.
Expand Down Expand Up @@ -174,12 +170,41 @@ function loadDynlib(Module, path, wasmModuleData) {
// "handles" are dlopen handles. There will be one entry in the `handles` list
// for each dlopen handle that has not been dlclosed. We need to keep track of
// these across
const { handles } = DSO_METADATA[path] || { handles: [] };
const handles = DSO_METADATA[path]?.handles || [];
for (const handle of handles) {
Module.LDSO.loadedLibsByHandle[handle] = dso;
}
}

/**
 * Decode an unsigned LEB128 integer from the start of `binary`.
 *
 * Each byte contributes its low 7 bits; the high bit is the continuation
 * flag. Stops after the first byte whose high bit is clear.
 *
 * @param {Uint8Array} binary - Bytes beginning with an unsigned LEB128 value.
 * @returns {[number, number]} The decoded value and the number of bytes read.
 */
function getLEB(binary) {
  let value = 0;
  let placeValue = 1;
  let idx = 0;
  let byte;
  do {
    byte = binary[idx];
    idx += 1;
    value += (byte & 0x7f) * placeValue;
    placeValue *= 0x80;
  } while (byte & 0x80);
  return [value, idx];
}

/**
 * Read just enough of a .so file out of the tar to extract its dylink
 * metadata via Emscripten's `Module.getDylinkMetadata`.
 *
 * Peeks at the first 12 bytes: the 4-byte wasm magic, the 4-byte version,
 * the first section's id byte, and the start of its LEB128-encoded size.
 * The first section must be a custom section (id 0) — the dylink section.
 *
 * @param {object} Module - Emscripten module exposing `getDylinkMetadata`.
 * @param {number} contentsOffset - Offset of the .so file within the tar.
 * @returns {*} Whatever `Module.getDylinkMetadata` returns for the prefix.
 * @throws {Error} If the magic is wrong or the first section is not id 0.
 */
function getDylinkMetadata(Module, contentsOffset) {
  const header = new Uint8Array(12);
  TarReader.read(contentsOffset, header);
  const magic = new Uint32Array(header.buffer)[0];
  // 0x6d736100 is "\0asm" read little-endian.
  if (magic !== 0x6d736100) {
    throw new Error("Invalid wasm magic number");
  }
  if (header[8] !== 0) {
    throw new Error("First section should be the dylink section");
  }
  const [sectionLength, lebByteCount] = getLEB(header.subarray(9));
  // 8-byte preamble + 1 section-id byte + size field + section payload.
  const totalSize = 9 + lebByteCount + sectionLength;
  const metadataBuffer = new Uint8Array(totalSize);
  TarReader.read(contentsOffset, metadataBuffer);
  return Module.getDylinkMetadata(metadataBuffer);
}

/**
* This loads all dynamic libraries visible in the site-packages directory. They
* are loaded before the runtime is initialized outside of the heap, using the
Expand All @@ -192,21 +217,47 @@ function loadDynlib(Module, path, wasmModuleData) {
* there.
*/
function preloadDynamicLibs(Module) {
let SO_FILES_TO_LOAD = SITE_PACKAGES_SO_FILES;
if (DSO_METADATA?.settings?.baselineSnapshot) {
SO_FILES_TO_LOAD = [["_lzma.so"]];
}
try {
let SO_FILES_TO_LOAD = SITE_PACKAGES_SO_FILES;
if (DSO_METADATA?.settings?.baselineSnapshot) {
SO_FILES_TO_LOAD = [["_lzma.so"]];
}
const sitePackages = getSitePackagesPath(Module);
const moduleNodes = {};
for (const soFile of SO_FILES_TO_LOAD) {
let node = SITE_PACKAGES_INFO;
for (const part of soFile) {
node = node.children.get(part);
}
const { contentsOffset, size } = node;
const path = sitePackages + "/" + soFile.join("/");
moduleNodes[path] = node;
const { contentsOffset } = node;
const metadata = getDylinkMetadata(Module, contentsOffset);
const memAlign = Math.pow(2, metadata.memoryAlign);
const ptr = Module.alignMemory(
Module.getMemory(metadata.memorySize + memAlign),
memAlign,
);
if (!(path in DSO_METADATA)) {
DSO_METADATA[path] = {};
}
DSO_METADATA[path].metadataPtr = ptr;
}
let toLoad = [];
if (DSO_METADATA?.settings?.loadedLibs) {
// Make sure to copy list first b/c loadDynlib sticks another copy of the
// lib into the list causing an infinite loop if we're not careful.
toLoad = Array.from(DSO_METADATA.settings.loadedLibs);
} else {
// Snapshot is from prior to the load-as-needed change. Just load everything.
toLoad = SO_FILES_TO_LOAD.map(
(soFile) => sitePackages + "/" + soFile.join("/"),
);
}
for (const path of toLoad) {
const { contentsOffset, size } = moduleNodes[path];
const wasmModuleData = new Uint8Array(size);
TarReader.read(contentsOffset, wasmModuleData);
const path = sitePackages + "/" + soFile.join("/");
loadDynlib(Module, path, wasmModuleData);
}
} catch (e) {
Expand Down Expand Up @@ -309,23 +360,23 @@ async function prepareWasmLinearMemory(Module) {
* crash if we dlsym the handle after restoring from the snapshot
*/
function recordDsoHandles(Module) {
const dylinkInfo = {};
for (const [handle, { name }] of Object.entries(
Module.LDSO.loadedLibsByHandle,
)) {
if (handle === 0) {
continue;
}
if (!(name in dylinkInfo)) {
dylinkInfo[name] = { handles: [] };
if (!(name in DSO_METADATA)) {
DSO_METADATA[name] = {};
}
if (!("handles" in DSO_METADATA[name])) {
DSO_METADATA[name].handles = [];
}
dylinkInfo[name].handles.push(handle);
DSO_METADATA[name].handles.push(handle);
}
dylinkInfo.settings = {};
if (IS_CREATING_BASELINE_SNAPSHOT) {
dylinkInfo.settings.baselineSnapshot = true;
DSO_METADATA.settings.baselineSnapshot = true;
}
return dylinkInfo;
}

// This is the list of all packages imported by the Python bootstrap. We don't
Expand Down Expand Up @@ -400,8 +451,8 @@ function memorySnapshotDoImports(Module) {
*/
function makeLinearMemorySnapshot(Module) {
memorySnapshotDoImports(Module);
const dsoJSON = recordDsoHandles(Module);
return encodeSnapshot(Module.HEAP8, dsoJSON);
recordDsoHandles(Module);
return encodeSnapshot(Module.HEAP8, DSO_METADATA);
}

function setUploadFunction(toUpload) {
Expand Down Expand Up @@ -475,7 +526,10 @@ function decodeSnapshot() {
const jsonBuf = new Uint8Array(jsonLength);
MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, jsonBuf);
const jsonTxt = new TextDecoder().decode(jsonBuf);
DSO_METADATA = JSON.parse(jsonTxt);
for (const key in DSO_METADATA) {
delete DSO_METADATA[key];
}
Object.assign(DSO_METADATA, JSON.parse(jsonTxt));
READ_MEMORY = function (Module) {
// restore memory from snapshot
MEMORY_SNAPSHOT_READER.readMemorySnapshot(snapshotOffset, Module.HEAP8);
Expand Down Expand Up @@ -524,7 +578,10 @@ export async function loadPyodide(lockfile, indexURL) {
if (DSO_METADATA?.settings?.baselineSnapshot) {
// Invalidate caches if we have a baseline snapshot because the contents of site-packages may
// have changed.
simpleRunPython(Module, "from importlib import invalidate_caches as f; f(); del f");
simpleRunPython(
Module,
"from importlib import invalidate_caches as f; f(); del f",
);
}

// This is just here for our test suite. Ugly but just about the only way to test this.
Expand Down

0 comments on commit 9b639a0

Please sign in to comment.