diff --git a/.github/workflows/scripts-ios.yml b/.github/workflows/scripts-ios.yml index 6a3697f9a7..b0aaeef8cb 100644 --- a/.github/workflows/scripts-ios.yml +++ b/.github/workflows/scripts-ios.yml @@ -207,18 +207,7 @@ jobs: build-ios-metal: # Mirrors build-ios but enables the Metal rendering backend via the # codename1.arg.ios.metal=true build hint. Keeps the GL path untouched - # so a regression on either path is isolated in its own job. Part of - # the iOS Metal port migration -- see Ports/iOSPort/METAL_PORT_STATUS.md. - # - # continue-on-error while the Metal port is still in progress: - # screenshot comparisons will differ from golden images until DrawString - # is ported and ClipRect is re-enabled (Phase 2 follow-ups tracked in - # METAL_PORT_STATUS.md). The Metal job still runs on every PR that - # touches the paths below so we can watch for new regressions and - # download artifacts for comparison, but a failure here won't block - # the PR. Flip this to false (or remove the line) once the Metal - # variant matches the GL variant's screenshot set. - continue-on-error: true + # so a regression on either path is isolated in its own job. needs: build-port permissions: contents: read diff --git a/Ports/iOSPort/METAL_PORT_STATUS.md b/Ports/iOSPort/METAL_PORT_STATUS.md deleted file mode 100644 index 2ce26ad977..0000000000 --- a/Ports/iOSPort/METAL_PORT_STATUS.md +++ /dev/null @@ -1,55 +0,0 @@ -# iOS Metal Rendering Port — Status - -Branch: `metal-ios-backend`. Build flag: `-Dcodename1.arg.ios.metal=true` (uncomments `#define CN1_USE_METAL` in `CN1ES2compat.h`). OpenGL ES 2 remains the default. - -## Architectural choices - -- **Two backends, one Java surface.** The `ExecutableOp` queue, `CADisplayLink → drawFrame` drain loop, peer-component layering, and JNI surface in `IOSImplementation.java` are unchanged from the GL build. Metal pipeline state lives in `CN1Metalcompat.{h,m}`, shaders in `CN1MetalShaders.metal`, glyph atlas in `CN1MetalGlyphAtlas.{h,m}`, pipeline cache in `CN1MetalPipelineCache.{h,m}`. All gated by `#ifdef CN1_USE_METAL`. - -- **Mutable-image rendering goes through the alpha-mask path on both GL and Metal.** `MutableGraphics.nativeFillShape` / `nativeDrawShape` / `nativeFillRoundRect` / `nativeFillArc` / etc. all build a `GeneralPath` and call `renderShapeViaAlphaMask` which routes through `Renderer.c` → R8 alpha mask → `DrawTextureAlphaMask` op tagged with the current mutable image. The op's `execute` picks the Metal-MSL or GL-shader implementation by build flag. The Java side has zero `if (metalRendering)` runtime checks; the C side uses `#ifdef CN1_USE_METAL` exclusively. - -- **Deferred commit on mutable images.** `startDrawingOnImage` allocates an `MTLTexture` for the mutable; subsequent ops queue tagged with that target. `drawFrame`'s drain switches encoder per target and commits per-target command buffers without `waitUntilCompleted`. Pixel-reading paths (`getRGB`, encode-as-PNG/JPEG, `gausianBlurImage`) call `flushBuffer` to force a drain, then `CN1MetalFlushMutableImageSync` to wait, then read. - -- **Text rendering: per-(font, pointSize) R8 atlas via CoreText.** `CN1MetalGlyphAtlas` lazily rasterises glyphs into a 1024² (grows to 2048²) R8 texture using `CTFontDrawGlyphs`. LRU eviction with 64-entry cache cap. `CN1MetalDrawString` shapes via `CTLineCreateWithAttributedString` and emits one alpha-mask quad per glyph through the same `cn1_fs_alpha_mask` Metal shader the shape path uses. - -- **Gradient rendering: pure-GPU MSL fragment shaders.** `cn1_fs_linear_gradient` and `cn1_fs_radial_gradient` interpolate `mix(startColor, endColor, t)` per-fragment. No CG-bitmap upload; no offscreen rasterisation. Linear gradients use vertex texcoords (0..1) along the chosen axis; radial uses `length((uv - center) / radii)`. - -- **Premultiplied alpha throughout.** Pipeline blend mode: `src=One, dst=OneMinusSourceAlpha`. Mutable-texture clear colour is stored premultiplied so subsequent sampling (with `cn1_fs_textured`) composites correctly when the mutable was created via `Image.createImage(w, h, argb)` with non-opaque argb. - -- **Metal Y-down ortho with z-range remap.** `mutableProjection(w, h)` maps `(0,0) → (-1, +1)` (top-left in NDC) and `(w, h) → (+1, -1)` (bottom-right). Z is `0.5 * input_z + 0.5 * w` so GL-style clip-z `[-w, w]` maps to Metal's `[0, w]`. - -- **Render targets persistent across frames.** A persistent offscreen `screenTexture` with `MTLLoadActionLoad` accumulates per-frame ops the way the GL renderbuffer does; the drawable is acquired only at present time to minimise `nextDrawable` stalls. `setFramebuffer` is idempotent; `updateFrameBufferSize:h:` tears down any live encoder before rebuilding the texture so dimension changes mid-frame don't leak state. - -- **Phase 5 hardening landed.** sRGB colorspace, `maximumDrawableCount = 3` with skip-frame on nil drawable, memory-warning eviction of glyph atlases, lifecycle pause on backgrounding, drawable recreation on rotation. - -## Missing features / open issues - -- **Switch component triangular tear.** A small (~3% of pill area) triangular sub-pixel artefact remains where the white thumb meets the green pill on `SwitchTheme_dark` / `SwitchTheme_light`. The pill renders as a single solid shape (was four pacman wedges before the path-construction fixes) and the thumb circle is clean. Likely cause: `gausianBlurImage`'s blurred shadow halo isn't propagating into the new mutable's `MTLTexture` after `Image.getGraphics()` re-attaches. Several seed-the-texture-from-the-existing-UIImage attempts (commits `9f03c11a8` → `b8db2d74e` reverted) did not fix it. Needs device-level shader/Metal-debugger inspection to narrow further. Goldens for `kotlin` (which contains a Switch) reverted to the pre-bug capture so the test surfaces the regression. - -- **Perspective / camera transforms render empty on mutable targets.** `graphics-transform-perspective` and `graphics-transform-camera` mutable panels render blank on Metal; GL renders the perspective-transformed rectangles. Vertex shader chain (`projection × modelView × transform × pos4`) and z-range remap look correct on paper but the rendered output is empty. Goldens captured the empty state, so the test passes self-referentially. Real bug, hidden. - -- **Stencil clipping for non-rectangular clips.** Currently falls back to a bounding-box scissor — Form layout handles that OK but paths-as-masks and textures-as-masks clip incorrectly. - -- **`graphics-draw-line` rasterisation diffs.** Test draws thousands of 1-pixel lines; Metal's `MTLPrimitiveTypeLine` rasterisation rule differs from GL's at integer pixel boundaries, producing 1-pixel-wide diff stripes. Not a bug; rasterisation rule mismatch. - -- **Image scaling quality.** `CGContextDrawImage` round-trips at 1× scale produce blurry edges when stretched to 3× retina (affects `graphics-draw-image-rect`, `graphics-fill-round-rect` via the round-rect-as-image path). Both backends share this; not Metal-specific. - -- **`graphics-fill-polygon` dropped from compare pipeline.** When the JPEG preview exceeds 20 KB (test produces 72 KB), the runner drops the test from the comparison even though the full PNG was captured. Pre-existing tooling issue, not rendering. - -## Verification - -```bash -# GL baseline -cd scripts/hellocodenameone -./build.sh ios_source -scripts/run-ios-ui-tests.sh - -# Metal variant -./build.sh ios_source -Dios.metal=true -scripts/run-ios-ui-tests.sh - -# Compare -diff /screenshot-compare.json /screenshot-compare.json -``` - -Metal goldens live in `scripts/ios/screenshots-metal/`; the `build-ios-metal` CI job overrides `SCREENSHOT_REF_DIR` to point at it. GL goldens (`scripts/ios/screenshots/`) remain untouched by Metal port work. diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m index 4b3e883111..f3ebb0ffe6 100644 --- a/vm/ByteCodeTranslator/src/cn1_globals.m +++ b/vm/ByteCodeTranslator/src/cn1_globals.m @@ -576,6 +576,8 @@ void collectThreadResources(struct ThreadLocalData *current) } } } +static void gcMarkDrain(CODENAME_ONE_THREAD_STATE); + /** * A simple concurrent mark algorithm that traverses the currently running threads */ @@ -686,6 +688,17 @@ void codenameOneGCMark() { } } markStatics(d); + // Drain the worklist before unblocking the thread so that every object + // transitively reachable from this thread's roots is fully marked while the + // thread is still paused -- matching the snapshot-at-the-beginning property + // the recursive implementation had. Without this drain, an unblocked mutator + // can read a still-grey field reference into a new local and null the field; + // the captured object would never be visited by the final drain, sweep would + // reclaim it, and a later monitorEnter on its freed pthread_mutex_t would + // silently deadlock. Earlier attempts at this drain hung at app startup + // because the overflow rescan path had a cursor-reset bug; with that fixed + // below, the drain runs to completion in O(reachable) time. + gcMarkDrain(d); if(!agressiveAllocator) { t->threadBlockedByGC = JAVA_FALSE; } else { @@ -701,6 +714,10 @@ void codenameOneGCMark() { for(int iter = 0 ; iter < CN1_CONSTANT_POOL_SIZE ; iter++) { gcMarkObject(d, (JAVA_OBJECT)constantPoolObjects[iter], JAVA_TRUE); } + + // Drain the worklist that the calls above populated. gcMarkObject no longer recurses + // through reference fields, so we need an explicit drain pass before sweep runs. + gcMarkDrain(d); } #ifdef DEBUG_GC_OBJECTS_IN_HEAP @@ -1161,6 +1178,56 @@ void codenameOneGcFree(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj) { //JAVA_OBJECT* recursionBlocker = 0; //int recursionBlockerPosition = 0; int recursionKey = 1; + +// Iterative mark using an explicit worklist. The previous implementation recursed +// through reference fields, building one C stack frame per Java reference traversed. +// iOS gives secondary pthreads a ~512KB stack, so a chain of a few thousand references +// (linked list, parse tree, deeply nested container) would SIGBUS the GC thread. +// Issue #3136. +// +// gcMarkObject now sets the mark bit and pushes onto a fixed worklist. gcMarkDrain +// pops entries and invokes their per-class mark function, which calls gcMarkObject on +// each reference field -- push, not recurse. If the worklist fills, the offending +// object is still marked (so sweep preserves it) but its field scan is deferred to +// the heap-rescan pass: walk the live-object table, re-invoke mark functions on +// already-marked objects to pick up children that were skipped on overflow. Idempotent +// because already-marked children are no-ops in gcMarkObject. +// +// CN1_GC_MARK_WORKLIST_SIZE is overridable at compile time (e.g. via -D in the Xcode +// build settings or the maven plugin). 65536 entries is ~1MB on 64-bit. Sized so the +// constant pool alone fits comfortably (HelloCodenameOne has ~15K entries, real apps +// can have more). Smaller sizes still work via the heap-rescan slow path, but the +// rescan adds non-trivial cost and the path is harder to test, so the default errs +// on the side of avoiding overflow for any normal app. +#ifndef CN1_GC_MARK_WORKLIST_SIZE +#define CN1_GC_MARK_WORKLIST_SIZE 65536 +#endif + +struct gcMarkWorklistEntry { + JAVA_OBJECT obj; + JAVA_BOOLEAN force; +}; + +static struct gcMarkWorklistEntry gcMarkWorklist[CN1_GC_MARK_WORKLIST_SIZE]; +static int gcMarkWorklistTop = 0; +static JAVA_BOOLEAN gcMarkWorklistOverflow = JAVA_FALSE; +// Set whenever gcMarkObject transitions an object from unmarked to marked. Used by +// the overflow-rescan loop to detect a fixed point: if a rescan+drain pass marks +// nothing new, the reachable set is fully closed under "marked" and we're done -- +// otherwise we'd spin forever re-pushing the same marked-and-already-scanned +// objects when the marked set is larger than the worklist. +static JAVA_BOOLEAN gcMarkFoundUnmarkedChildInPass = JAVA_FALSE; + +static inline void gcMarkWorklistPush(JAVA_OBJECT obj, JAVA_BOOLEAN force) { + if(gcMarkWorklistTop >= CN1_GC_MARK_WORKLIST_SIZE) { + gcMarkWorklistOverflow = JAVA_TRUE; + return; + } + gcMarkWorklist[gcMarkWorklistTop].obj = obj; + gcMarkWorklist[gcMarkWorklistTop].force = force; + gcMarkWorklistTop++; +} + void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force) { if(obj == JAVA_NULL || obj->__codenameOneParentClsReference == 0 || obj->__codenameOneParentClsReference == (&class__java_lang_Class)) { return; @@ -1173,19 +1240,16 @@ void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force return; } obj->__codenameOneReferenceCount = recursionKey; - obj->__codenameOneGcMark = currentGcMarkValue; - gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction; - if(fp != 0) { - fp(threadStateData, obj, force); + if(obj->__codenameOneParentClsReference->markFunction != 0) { + gcMarkWorklistPush(obj, force); } } return; - } obj->__codenameOneGcMark = currentGcMarkValue; - gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction; - if(fp != 0) { - fp(threadStateData, obj, force); + gcMarkFoundUnmarkedChildInPass = JAVA_TRUE; + if(obj->__codenameOneParentClsReference->markFunction != 0) { + gcMarkWorklistPush(obj, force); } } @@ -1205,6 +1269,57 @@ void gcMarkArrayObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN } } +// Pops worklist entries and runs their mark functions. On overflow, rescans the live +// heap to push every marked-but-unscanned object so its children get visited (the +// children's pushes are what overflowed in the first place). The rescan uses a cursor +// that resumes across batches -- restarting from iter=0 on every batch would just +// re-push the same first WORKLIST_SIZE marked objects forever while later indices got +// starved, leaving their children unmarked and freeing reachable memory at sweep. +static void gcMarkDrain(CODENAME_ONE_THREAD_STATE) { + int rescanCursor = 0; + while(JAVA_TRUE) { + while(gcMarkWorklistTop > 0) { + gcMarkWorklistTop--; + JAVA_OBJECT obj = gcMarkWorklist[gcMarkWorklistTop].obj; + JAVA_BOOLEAN force = gcMarkWorklist[gcMarkWorklistTop].force; + gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction; + if(fp != 0) { + fp(threadStateData, obj, force); + } + } + int total = currentSizeOfAllObjectsInHeap; + // Done when the worklist drained without re-overflow AND we've finished a full + // sweep of the heap (cursor at end) AND nothing new got marked during the most + // recent sweep. Without the cursor==total check, we'd return while there are + // still marked objects past `cursor` whose mark functions haven't been called. + if(!gcMarkWorklistOverflow && rescanCursor >= total) { + return; + } + gcMarkWorklistOverflow = JAVA_FALSE; + if(rescanCursor >= total) { + if(!gcMarkFoundUnmarkedChildInPass) { + // We finished a full heap sweep, drained the resulting pushes, and the + // drain marked nothing new. Fixed point. + return; + } + // Pushes from the previous sweep's drain may have marked new objects past + // indices we already visited this round; restart the sweep so they get + // their mark functions called too. + rescanCursor = 0; + gcMarkFoundUnmarkedChildInPass = JAVA_FALSE; + } + while(rescanCursor < total && gcMarkWorklistTop < CN1_GC_MARK_WORKLIST_SIZE) { + JAVA_OBJECT o = allObjectsInHeap[rescanCursor]; + rescanCursor++; + if(o != JAVA_NULL && o->__codenameOneGcMark == currentGcMarkValue) { + if(o->__codenameOneParentClsReference->markFunction != 0) { + gcMarkWorklistPush(o, JAVA_FALSE); + } + } + } + } +} + JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim) { int actualSize = length * primitiveSize; JAVA_ARRAY array = (JAVA_ARRAY)codenameOneGcMalloc(threadStateData, sizeof(struct JavaArrayPrototype) + actualSize + sizeof(void*), type);