Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CodenameOne/src/com/codename1/annotations/DisableDebugInfo.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.codename1.annotations;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

/**
 * Method-level code generation hint for ParparVM: line/debug information is
 * not emitted for the annotated method.
 *
 * <p>The annotation uses {@link RetentionPolicy#CLASS}, so it is recorded in
 * the compiled class file but is not required to be visible through
 * reflection at runtime. It can only be applied to methods.</p>
 */
@Target({ElementType.METHOD})
@Retention(RetentionPolicy.CLASS)
public @interface DisableDebugInfo {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.codename1.annotations;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

/**
 * Method-level code generation hint for ParparVM: null checks and array
 * bounds checks are not emitted for the annotated method.
 *
 * <p>The annotation uses {@link RetentionPolicy#CLASS}, so it is recorded in
 * the compiled class file but is not required to be visible through
 * reflection at runtime. It can only be applied to methods.</p>
 */
@Target({ElementType.METHOD})
@Retention(RetentionPolicy.CLASS)
public @interface DisableNullChecksAndArrayBoundsChecks {
}
25 changes: 21 additions & 4 deletions CodenameOne/src/com/codename1/util/Base64.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

package com.codename1.util;

import com.codename1.annotations.DisableDebugInfo;
import com.codename1.annotations.DisableNullChecksAndArrayBoundsChecks;


/// This class implements Base64 encoding/decoding functionality
/// as specified in RFC 2045 (http://www.ietf.org/rfc/rfc2045.txt).
public abstract class Base64 {
Expand All @@ -34,18 +38,25 @@ public abstract class Base64 {
'4', '5', '6', '7', '8', '9', '+', '/'};

private static final byte[] decodeMap = new byte[256];
private static final int[] decodeMapInt = new int[256];

static {
for (int i = 0; i < decodeMap.length; i++) {
decodeMap[i] = (byte) DECODE_INVALID;
decodeMapInt[i] = DECODE_INVALID;
}
for (int i = 0; i < map.length; i++) {
decodeMap[map[i] & 0xff] = (byte) i;
decodeMapInt[map[i] & 0xff] = i;
}
decodeMap['\n'] = (byte) DECODE_WHITESPACE;
decodeMap['\r'] = (byte) DECODE_WHITESPACE;
decodeMap[' '] = (byte) DECODE_WHITESPACE;
decodeMap['\t'] = (byte) DECODE_WHITESPACE;
decodeMapInt['\n'] = DECODE_WHITESPACE;
decodeMapInt['\r'] = DECODE_WHITESPACE;
decodeMapInt[' '] = DECODE_WHITESPACE;
decodeMapInt['\t'] = DECODE_WHITESPACE;
}

public static byte[] decode(byte[] in) {
Expand Down Expand Up @@ -89,6 +100,8 @@ public static byte[] decode(byte[] in, int len) {
* @param out destination buffer
* @return decoded length, or {@code -1} for invalid Base64
*/
@DisableDebugInfo
@DisableNullChecksAndArrayBoundsChecks
public static int decode(byte[] in, int len, byte[] out) {
if (len == 0) {
return 0;
Expand All @@ -103,7 +116,7 @@ public static int decode(byte[] in, int len, byte[] out) {
int end = len;
while (end > 0) {
int chr = in[end - 1] & 0xff;
if (decodeMap[chr] == DECODE_WHITESPACE) {
if (decodeMapInt[chr] == DECODE_WHITESPACE) {
end--;
continue;
}
Expand All @@ -121,7 +134,7 @@ public static int decode(byte[] in, int len, byte[] out) {
if (chr == '=') {
break;
}
int value = decodeMap[chr];
int value = decodeMapInt[chr];
if (value == DECODE_WHITESPACE) {
continue;
}
Expand All @@ -148,7 +161,7 @@ public static int decode(byte[] in, int len, byte[] out) {
if (chr == '=') {
break;
}
int bits = decodeMap[chr];
int bits = decodeMapInt[chr];
if (bits == DECODE_WHITESPACE) {
continue;
}
Expand Down Expand Up @@ -184,6 +197,8 @@ public static int decode(byte[] in, byte[] out) {
return decode(in, in.length, out);
}

@DisableDebugInfo
@DisableNullChecksAndArrayBoundsChecks
private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
if ((len & 0x3) != 0) {
return -1;
Expand All @@ -207,8 +222,8 @@ private static int decodeNoWhitespace(byte[] in, int len, byte[] out) {
throw new IllegalArgumentException("Output buffer too small for decoded data");
}
int outIndex = 0;
byte[] decodeMapLocal = decodeMap;
int fullLen = len - (pad > 0 ? 4 : 0);
int[] decodeMapLocal = decodeMapInt;

for (int i = 0; i < fullLen; i += 4) {
int c0 = in[i] & 0xff;
Expand Down Expand Up @@ -334,6 +349,8 @@ public static String encodeNoNewline(byte[] in) {
* @param out destination buffer
* @return number of bytes written to {@code out}
*/
@DisableDebugInfo
@DisableNullChecksAndArrayBoundsChecks
public static int encodeNoNewline(byte[] in, byte[] out) {
int inputLength = in.length;
int outputLength = ((inputLength + 2) / 3) * 4;
Expand Down
54 changes: 54 additions & 0 deletions docs/developer-guide/performance.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,60 @@ The simulator contains some tools to measure performance overhead of a specific
* *On some platforms mutable images are slow* - mutable images are images you can draw on (using `getGraphics()`). On some platforms they perform quite badly (e.g. iOS) and should generally be avoided. You can check if mutable images are fast on a platform using `Display.areMutableImagesFast()`.
* *Make components either transparent or opaque* - a translucent component must paint its parent every time. This can be expensive. An opaque component might have margins that would require that we paint the parent so there is often overdraw in such cases (overdraw means the same pixel being painted twice).

==== ParparVM Native Translation Performance Hints

For ParparVM-generated native code, we now support method-level optimization hints via annotations. These can provide very good wins in hot code paths, but they come with tradeoffs and should be applied surgically.

===== Method-level codegen hints

* `@DisableDebugInfo` +
Suppresses generated line/debug metadata for the annotated method.
This can reduce generated C size and remove some per-instruction debug overhead.

* `@DisableNullChecksAndArrayBoundsChecks` +
Suppresses generated null and array-bounds checks for the annotated method.
This can significantly reduce branch-heavy code in tight loops.

TIP: Use these only on methods that are both performance-critical and well-covered by tests. These annotations intentionally trade runtime safety diagnostics for speed.

===== Fast method-stack path

The translator can emit a fast method-stack prologue/epilogue (`DEFINE_METHOD_STACK_FAST_*` and `CN1_FAST_RETURN_RELEASE`) for methods that meet strict safety criteria.

In practice, this tends to help for:

* Small, hot methods.
* Methods without monitor usage / exception-heavy flow.
* Methods with straightforward control flow and low instruction complexity.

Tradeoffs:

* Overly broad fast-path eligibility can regress performance if extra branches or memory writes are introduced.
* Primitive-only fast-frame variants may not always outperform a straightforward full clear on all targets/compilers.

TIP: Benchmark representative workloads after enabling fast-stack behavior. Keep eligibility conservative and expand only where measurement shows consistent gains.

===== Base64-style hot-loop guidelines

For low-level loops (e.g. Base64 encode/decode):

* Prefer simple loop bodies with predictable branches.
* Cache decode/lookup tables in primitive arrays (`int[]` lookup tables can reduce per-iteration conversion overhead).
* Avoid adding “defensive” branches in the inner-most loop unless they are required for correctness in production inputs.

===== Build configuration matters

When benchmarking translator output, ensure native projects are compiled with optimization enabled (e.g. CMake `Release` builds). Debug/default builds can hide improvements or produce misleading regressions.

If you are using the integration test harness, make sure CMake is configured with:

[source]
----
-DCMAKE_BUILD_TYPE=Release
----

Without this setting, comparison between Java and ParparVM native output is often noisy and can lead to incorrect optimization conclusions.

=== Performance Monitor

The Performance Monitor tool is accessible via the #Simulator# -> #Performance Monitor# menu option in the simulator. This launches the following UI that can help you improve application performance:
Expand Down
67 changes: 67 additions & 0 deletions vm/ByteCodeTranslator/src/cn1_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,8 @@ extern void throwException(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
extern JAVA_INT throwException_R_int(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
extern JAVA_BOOLEAN throwException_R_boolean(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT exceptionArg);
extern JAVA_OBJECT __NEW_java_lang_NullPointerException(CODENAME_ONE_THREAD_STATE);
extern JAVA_OBJECT __NEW_INSTANCE_java_lang_NullPointerException(CODENAME_ONE_THREAD_STATE);
extern JAVA_OBJECT __NEW_INSTANCE_java_lang_StackOverflowError(CODENAME_ONE_THREAD_STATE);
extern JAVA_OBJECT __NEW_java_lang_ArrayIndexOutOfBoundsException(CODENAME_ONE_THREAD_STATE);
extern JAVA_VOID java_lang_ArrayIndexOutOfBoundsException___INIT_____int(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, JAVA_INT __cn1Arg1);
extern void throwArrayIndexOutOfBoundsException(CODENAME_ONE_THREAD_STATE, int index);
Expand Down Expand Up @@ -1129,6 +1131,31 @@ extern JAVA_OBJECT newStringFromCString(CODENAME_ONE_THREAD_STATE, const char *s
extern void initConstantPool();

extern void initMethodStack(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, int stackSize, int localsStackSize, int classNameId, int methodNameId);
/*
 * Inline prologue helper used by the DEFINE_*_METHOD_STACK_FAST_* macros.
 *
 * When CN1_INCLUDE_NPE_CHECKS is defined, a JAVA_NULL __cn1ThisObject triggers
 * THROW_NULL_POINTER_EXCEPTION(). If the call depth has reached
 * CN1_STACK_OVERFLOW_CALL_DEPTH_LIMIT - 1, a StackOverflowError is thrown and
 * the function returns without touching the frame. Otherwise the
 * (localsStackSize + stackSize) slots starting at the current
 * threadObjectStackOffset are zeroed, threadObjectStackOffset is advanced past
 * them and callStackOffset is incremented.
 */
static inline void cn1_init_method_stack_fast(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObject, int stackSize, int localsStackSize, JAVA_BOOLEAN fullClear) {
    /*
     * fullClear distinguishes the REF and PRIMITIVE prologue variants, but
     * primitive-only fast frames intentionally use the same memset strategy:
     * a per-slot type-only loop was measurably slower in benchmarks and did
     * not improve generated-code performance. Both variants therefore share
     * the single clear below.
     */
    (void)fullClear;
#ifdef CN1_INCLUDE_NPE_CHECKS
    if(__cn1ThisObject == JAVA_NULL) {
        THROW_NULL_POINTER_EXCEPTION();
    }
#endif
    if (threadStateData->callStackOffset >= CN1_STACK_OVERFLOW_CALL_DEPTH_LIMIT - 1) {
        throwException(threadStateData, __NEW_INSTANCE_java_lang_StackOverflowError(threadStateData));
        return;
    }
    const int frameSlots = localsStackSize + stackSize;
    memset(&threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset], 0,
           sizeof(struct elementStruct) * frameSlots);
    threadStateData->threadObjectStackOffset += frameSlots;
    threadStateData->callStackOffset++;
}

// we need to zero out the values with memset otherwise we will run into a problem
// when invoking release on pre-existing object which might be garbage
Expand All @@ -1150,6 +1177,46 @@ extern void initMethodStack(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT __cn1ThisObje
const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
int methodBlockOffset = threadStateData->tryBlockOffset;

/*
 * Shared expansion behind the four fast method-stack prologue macros below.
 * It declares cn1LocalsBeginInThread, locals, stack and SP in the enclosing
 * scope, calls cn1_init_method_stack_fast() with the given receiver and
 * fullClear flag, then captures callStackOffset and tryBlockOffset.
 */
#define CN1_DEFINE_METHOD_STACK_FAST_IMPL(thisObject, stackSize, localsStackSize, spPosition, fullClear) \
    const int cn1LocalsBeginInThread = threadStateData->threadObjectStackOffset; \
    struct elementStruct* locals = &threadStateData->threadObjectStack[cn1LocalsBeginInThread]; \
    struct elementStruct* stack = &threadStateData->threadObjectStack[threadStateData->threadObjectStackOffset + localsStackSize]; \
    struct elementStruct* SP = &stack[spPosition]; \
    cn1_init_method_stack_fast(threadStateData, thisObject, stackSize, localsStackSize, fullClear); \
    const int currentCodenameOneCallStackOffset = threadStateData->callStackOffset;\
    int methodBlockOffset = threadStateData->tryBlockOffset;

/* Variant without a receiver: passes a dummy non-null object, fullClear = JAVA_TRUE. */
#define DEFINE_METHOD_STACK_FAST_REF(stackSize, localsStackSize, spPosition) \
    CN1_DEFINE_METHOD_STACK_FAST_IMPL((JAVA_OBJECT)1, stackSize, localsStackSize, spPosition, JAVA_TRUE)

/* Instance variant: passes __cn1ThisObject, fullClear = JAVA_TRUE. */
#define DEFINE_INSTANCE_METHOD_STACK_FAST_REF(stackSize, localsStackSize, spPosition) \
    CN1_DEFINE_METHOD_STACK_FAST_IMPL(__cn1ThisObject, stackSize, localsStackSize, spPosition, JAVA_TRUE)

/* Variant without a receiver: passes a dummy non-null object, fullClear = JAVA_FALSE. */
#define DEFINE_METHOD_STACK_FAST_PRIMITIVE(stackSize, localsStackSize, spPosition) \
    CN1_DEFINE_METHOD_STACK_FAST_IMPL((JAVA_OBJECT)1, stackSize, localsStackSize, spPosition, JAVA_FALSE)

/* Instance variant: passes __cn1ThisObject, fullClear = JAVA_FALSE. */
#define DEFINE_INSTANCE_METHOD_STACK_FAST_PRIMITIVE(stackSize, localsStackSize, spPosition) \
    CN1_DEFINE_METHOD_STACK_FAST_IMPL(__cn1ThisObject, stackSize, localsStackSize, spPosition, JAVA_FALSE)

/*
 * Epilogue counterpart of the DEFINE_*_METHOD_STACK_FAST_* prologues: resets
 * threadObjectStackOffset to the cn1LocalsBeginInThread value declared by the
 * prologue and decrements callStackOffset.
 *
 * NOTE(review): the expansion is two statements and already ends with ';', so
 * it is presumably meant to be emitted as standalone statements inside a
 * braced block - confirm against the translator before using it as the body
 * of an unbraced if/else.
 */
#define CN1_FAST_RETURN_RELEASE() \
threadStateData->threadObjectStackOffset = cn1LocalsBeginInThread; \
threadStateData->callStackOffset--;


#if defined(__APPLE__) && defined(__OBJC__)
@class NSString;
Expand Down
Loading
Loading