From eb09fcebc43972f67d386b740cbc3ec0060382d7 Mon Sep 17 00:00:00 2001 From: Tobi Ajila Date: Fri, 19 May 2023 11:19:28 -0400 Subject: [PATCH 1/3] Add checkpoint delay when clinit is occurring This PR addresses cases where a class is being initialized on a non-checkpoint thread while a checkpoint occurs. This scenario may lead to cases where a deadlock occurs when attempting to initialize a class during a pre/post checkpoint hook. This PR addresses the issue by waiting up to 1ms for threads currently initializing a class to complete before taking a checkpoint. After the timeout, an exception will be thrown (unless -XX:-ThrowOnDelayedCheckpointOperation is specified). Also, this PR adds more detailed messages to checkpoint dir exceptions. Fixes: https://github.com/eclipse-openj9/openj9/issues/15806 Signed-off-by: Tobi Ajila --- .../org/eclipse/openj9/criu/CRIUSupport.java | 2 +- runtime/criusupport/criusupport.cpp | 84 ++++++++++------ runtime/criusupport/j9criu.tdf | 4 +- .../gc_modron_startup/mmparseXgcpolicy.cpp | 4 +- runtime/oti/j9nonbuilder.h | 11 ++- runtime/oti/jvminit.h | 2 + runtime/vm/CRIUHelpers.cpp | 16 +-- runtime/vm/jvminit.c | 17 +++- .../cmdLineTests/criu/criuScript.sh | 2 +- .../cmdLineTests/criu/criu_nonPortable.xml | 43 +++++++- .../src/org/openj9/criu/DeadlockTest.java | 97 +++++++++++++++++++ 11 files changed, 229 insertions(+), 53 deletions(-) diff --git a/jcl/src/openj9.criu/share/classes/org/eclipse/openj9/criu/CRIUSupport.java b/jcl/src/openj9.criu/share/classes/org/eclipse/openj9/criu/CRIUSupport.java index ff818679f40..1f5e85b3967 100644 --- a/jcl/src/openj9.criu/share/classes/org/eclipse/openj9/criu/CRIUSupport.java +++ b/jcl/src/openj9.criu/share/classes/org/eclipse/openj9/criu/CRIUSupport.java @@ -257,7 +257,7 @@ public static String getErrorMessage() { public CRIUSupport setImageDir(Path imageDir) { Objects.requireNonNull(imageDir, "Image directory cannot be null"); //$NON-NLS-1$ if (!Files.isDirectory(imageDir)) { - 
throw new IllegalArgumentException("imageDir is not a valid directory"); //$NON-NLS-1$ + throw new IllegalArgumentException(imageDir.toAbsolutePath() + " is not a valid directory"); //$NON-NLS-1$ } String dir = imageDir.toAbsolutePath().toString(); diff --git a/runtime/criusupport/criusupport.cpp b/runtime/criusupport/criusupport.cpp index 7c204d7ba09..1363b867446 100644 --- a/runtime/criusupport/criusupport.cpp +++ b/runtime/criusupport/criusupport.cpp @@ -51,6 +51,9 @@ extern "C" { #define RESTORE_ARGS_RETURN_OPTIONS_FILE_FAILED 2 #define RESTORE_ARGS_RETURN_ENV_VAR_FILE_FAILED 3 +#define J9VM_DELAYCHECKPOINT_NOTCHECKPOINTSAFE 0x1 +#define J9VM_DELAYCHECKPOINT_CLINIT 0x2 + #define OPENJ9_RESTORE_OPTS_VAR "OPENJ9_RESTORE_JAVA_OPTIONS=" static bool @@ -149,8 +152,11 @@ setupJNIFieldIDsAndCRIUAPI(JNIEnv *env, jclass *currentExceptionClass, IDATA *sy goto done; } - vm->checkpointState.isJdwpEnabled = (FIND_ARG_IN_VMARGS(STARTSWITH_MATCH, MAPOPT_AGENTLIB_JDWP_EQUALS, NULL) >= 0) - || (FIND_ARG_IN_VMARGS(STARTSWITH_MATCH, MAPOPT_XRUNJDWP, NULL) >= 0); + if ((FIND_ARG_IN_VMARGS(STARTSWITH_MATCH, MAPOPT_AGENTLIB_JDWP_EQUALS, NULL) >= 0) + || (FIND_ARG_IN_VMARGS(STARTSWITH_MATCH, MAPOPT_XRUNJDWP, NULL) >= 0) + ) { + vm->checkpointState.flags |= J9VM_CRIU_IS_JDWP_ENABLED; + } done: return returnCode; @@ -269,7 +275,7 @@ shouldToggleJavaThread(J9VMThread *currentThread, BOOLEAN toggleDebugThreads) { J9JavaVM *vm = currentThread->javaVM; bool result = true; - if (vm->checkpointState.isJdwpEnabled) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_JDWP_ENABLED)) { char *threadName = getOMRVMThreadName(currentThread->omrVMThread); releaseOMRVMThreadName(currentThread->omrVMThread); /* all threads started by JDWP begin with "JDWP" in their name */ @@ -313,33 +319,47 @@ toggleSuspendOnJavaThreads(J9VMThread *currentThread, BOOLEAN suspend, BOOLEAN t } static UDATA -notCheckpointSafeFrameWalkFunction(J9VMThread *vmThread, J9StackWalkState *walkState) 
+notCheckpointSafeOrClinitFrameWalkFunction(J9VMThread *vmThread, J9StackWalkState *walkState) { J9Method *method = walkState->method; + UDATA returnCode = J9_STACKWALK_KEEP_ITERATING; + if (NULL != method) { + J9ROMMethod *romMethod = J9_ROM_METHOD_FROM_RAM_METHOD(method); J9ClassLoader *methodLoader = J9_CLASS_FROM_METHOD(method)->classLoader; + J9UTF8 *romMethodName = J9ROMMETHOD_NAME(romMethod); + /* only method names that start with '<' are <init>, <vnew> and <clinit> */ + if (0 == strncmp((char*)J9UTF8_DATA(romMethodName), "<c", 2)) { + *(UDATA*)walkState->userData1 = J9VM_DELAYCHECKPOINT_CLINIT; + goto fail; + } + /* we only enforce this in methods loaded by the bootloader */ if (methodLoader == vmThread->javaVM->systemClassLoader) { - J9ROMMethod *romMethod = J9_ROM_METHOD_FROM_RAM_METHOD(method); if (J9ROMMETHOD_HAS_EXTENDED_MODIFIERS(romMethod)) { U_32 extraModifiers = getExtendedModifiersDataFromROMMethod(romMethod); if (J9ROMMETHOD_HAS_NOT_CHECKPOINT_SAFE_ANNOTATION(extraModifiers)) { - *(bool *)walkState->userData1 = false; - walkState->userData2 = (void *)vmThread; - walkState->userData3 = (void *)method; - return J9_STACKWALK_STOP_ITERATING; + *(UDATA*)walkState->userData1 = J9VM_DELAYCHECKPOINT_NOTCHECKPOINTSAFE; + goto fail; } } } } - return J9_STACKWALK_KEEP_ITERATING; +done: + return returnCode; + +fail: + walkState->userData2 = (void *)vmThread; + walkState->userData3 = (void *)method; + returnCode = J9_STACKWALK_STOP_ITERATING; + goto done; } static bool checkIfSafeToCheckpoint(J9VMThread *currentThread) { - bool isSafe = true; + UDATA notSafeToCheckpoint = 0; J9JavaVM *vm = currentThread->javaVM; Assert_CRIU_true((J9_XACCESS_EXCLUSIVE == vm->exclusiveAccessState) || (J9_XACCESS_EXCLUSIVE == vm->safePointState)); @@ -353,19 +373,19 @@ checkIfSafeToCheckpoint(J9VMThread *currentThread) walkState.walkThread = walkThread; walkState.flags = J9_STACKWALK_ITERATE_FRAMES | J9_STACKWALK_INCLUDE_NATIVES; walkState.skipCount = 0; - walkState.userData1 = (void *)&isSafe; - walkState.frameWalkFunction = 
notCheckpointSafeFrameWalkFunction; + walkState.userData1 = (void *)&notSafeToCheckpoint; + walkState.frameWalkFunction = notCheckpointSafeOrClinitFrameWalkFunction; vm->walkStackFrames(walkThread, &walkState); - if (!isSafe) { - Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlocked(currentThread, walkState.userData2, walkState.userData3); + if (0 != notSafeToCheckpoint) { + Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlockedVer2(currentThread, walkState.userData2, walkState.userData3, walkState.userData1); break; } } walkThread = J9_LINKED_LIST_NEXT_DO(vm->mainThread, walkThread); } - return isSafe; + return notSafeToCheckpoint; } static VMINLINE void @@ -654,11 +674,12 @@ Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl(JNIEnv *env, U_64 restoreNanoUTCTime = 0; UDATA success = 0; bool safePoint = J9_ARE_ANY_BITS_SET(vm->extendedRuntimeFlags, J9_EXTENDED_RUNTIME_OSR_SAFE_POINT); - bool retryPermitted = vm->checkpointState.maxRetryForNotCheckpointSafe > 0; + UDATA maxRetries = vm->checkpointState.maxRetryForNotCheckpointSafe; BOOLEAN syslogFlagNone = TRUE; char *syslogOptions = NULL; I_32 syslogBufferSize = 0; UDATA oldVMState = VM_VMHelpers::setVMState(currentThread, J9VMSTATE_CRIU_SUPPORT_CHECKPOINT_PHASE_START); + UDATA notSafeToCheckpoint = 0; vmFuncs->internalEnterVMFromJNI(currentThread); @@ -803,18 +824,27 @@ Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl(JNIEnv *env, acquireSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint); - for (UDATA i = 0; !checkIfSafeToCheckpoint(currentThread) && retryPermitted; i++) { + notSafeToCheckpoint = checkIfSafeToCheckpoint(currentThread); + + for (UDATA i = 0; (0 != notSafeToCheckpoint) && (i <= maxRetries); i++) { releaseSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint); vmFuncs->internalExitVMToJNI(currentThread); - omrthread_nanosleep(1000); + omrthread_nanosleep(10000); vmFuncs->internalEnterVMFromJNI(currentThread); - if (i == vm->checkpointState.maxRetryForNotCheckpointSafe) 
{ - currentExceptionClass = vm->checkpointState.criuJVMCheckpointExceptionClass; - systemReturnCode = vm->checkpointState.maxRetryForNotCheckpointSafe; - nlsMsgFormat = j9nls_lookup_message(J9NLS_DO_NOT_PRINT_MESSAGE_TAG | J9NLS_DO_NOT_APPEND_NEWLINE, J9NLS_JCL_CRIU_MAX_RETRY_FOR_NOTCHECKPOINTSAFE_REACHED, NULL); - goto closeWorkDirFD; - } acquireSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint); + notSafeToCheckpoint = checkIfSafeToCheckpoint(currentThread); + } + + if ((J9VM_DELAYCHECKPOINT_NOTCHECKPOINTSAFE == notSafeToCheckpoint) + || ((J9VM_DELAYCHECKPOINT_CLINIT == notSafeToCheckpoint) && J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_THROW_ON_DELAYED_CHECKPOINT_ENABLED)) + ) { + releaseSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint); + currentExceptionClass = vm->checkpointState.criuJVMCheckpointExceptionClass; + systemReturnCode = vm->checkpointState.maxRetryForNotCheckpointSafe; + nlsMsgFormat = j9nls_lookup_message(J9NLS_DO_NOT_PRINT_MESSAGE_TAG | J9NLS_DO_NOT_APPEND_NEWLINE, J9NLS_JCL_CRIU_MAX_RETRY_FOR_NOTCHECKPOINTSAFE_REACHED, NULL); + goto closeWorkDirFD; + } else { + Trc_CRIU_checkpointJVMImpl_checkpointWithActiveCLinit(currentThread); } toggleSuspendOnJavaThreads(currentThread, TRUE, FALSE); @@ -856,7 +886,7 @@ Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl(JNIEnv *env, goto wakeJavaThreadsWithExclusiveVMAccess; } - if (vm->checkpointState.isJdwpEnabled) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_JDWP_ENABLED)) { toggleSuspendOnJavaThreads(currentThread, TRUE, TRUE); } @@ -954,7 +984,7 @@ Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl(JNIEnv *env, goto wakeJavaThreadsWithExclusiveVMAccess; } - if (vm->checkpointState.isJdwpEnabled) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_JDWP_ENABLED)) { toggleSuspendOnJavaThreads(currentThread, FALSE, TRUE); } diff --git a/runtime/criusupport/j9criu.tdf b/runtime/criusupport/j9criu.tdf index 
1493da2974b..2696d7e5fa9 100644 --- a/runtime/criusupport/j9criu.tdf +++ b/runtime/criusupport/j9criu.tdf @@ -37,9 +37,11 @@ TraceEvent=Trc_CRIU_before_checkpoint Overhead=1 Level=2 Template="Before checkp TraceEvent=Trc_CRIU_after_checkpoint Obsolete Overhead=1 Level=2 Template="After checkpoint criu_dump(), restoreNanoUTCTime = %llu, checkpointNanoUTCTime = %llu, checkpointRestoreTimeDelta = %lld, restoreNanoTimeMonotonic = %lld, checkpointNanoTimeMonotonic = %lld, nanoTimeMonotonicClockDelta = %lld" TraceEntry=Trc_CRIU_checkpointJVMImpl_Entry Overhead=1 Level=2 Template="Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl" TraceExit=Trc_CRIU_checkpointJVMImpl_Exit Overhead=1 Level=2 Template="Java_org_eclipse_openj9_criu_CRIUSupport_checkpointJVMImpl" -TraceEvent=Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlocked Overhead=1 Level=2 Template="Checkpoint blocked because thread=%p is in method=%p marked as not safe to checkpoint" +TraceEvent=Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlocked Obsolete Overhead=1 Level=2 Template="Checkpoint blocked because thread=%p is in method=%p marked as not safe to checkpoint" TraceEvent=Trc_CRIU_checkpointJVMImpl_syslogOptions Overhead=1 Level=3 Template="Current syslogOptions: %s" TraceEvent=Trc_CRIU_checkpoint_nano_times Overhead=1 Level=2 Template="Before checkpoint, checkpointNanoTimeMonotonic = %lld, checkpointNanoUTCTime = %llu" TraceEvent=Trc_CRIU_restore_nano_times Overhead=1 Level=2 Template="After restore, restoreNanoUTCTime = %llu, checkpointNanoUTCTime = %llu, checkpointRestoreTimeDelta = %lld, restoreNanoTimeMonotonic = %lld, checkpointNanoTimeMonotonic = %lld, nanoTimeMonotonicClockDelta = %lld" TraceEvent=Trc_CRIU_after_checkpoint Overhead=1 Level=2 Template="After checkpoint criu_dump(), j9time_nano_time() returns %lld, j9time_current_time_nanos() returns %llu" TraceEvent=Trc_CRIU_restoreArg Overhead=1 Level=5 Test Template="Restore arg: %s" 
+TraceEvent=Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlockedVer2 Overhead=1 Level=2 Template="Checkpoint blocked because thread=%p is in method=%p due to delay code %zu" +TraceEvent=Trc_CRIU_checkpointJVMImpl_checkpointWithActiveCLinit Overhead=1 Level=2 Template="Taking a checkpoint with active clinit" diff --git a/runtime/gc_modron_startup/mmparseXgcpolicy.cpp b/runtime/gc_modron_startup/mmparseXgcpolicy.cpp index 1236e9a4121..dd0fb76913e 100644 --- a/runtime/gc_modron_startup/mmparseXgcpolicy.cpp +++ b/runtime/gc_modron_startup/mmparseXgcpolicy.cpp @@ -85,7 +85,7 @@ isMetronomeGCPolicySupported(MM_GCExtensions *extensions) { #if defined(J9VM_OPT_CRIU_SUPPORT) J9JavaVM *vm = extensions->getJavaVM(); - if (TRUE == vm->checkpointState.isCheckPointEnabled) { + if (vm->internalVMFunctions->isCRIUSupportEnabled_VM(vm)) { PORT_ACCESS_FROM_JAVAVM(vm); j9nls_printf(PORTLIB, J9NLS_ERROR, J9NLS_GC_POLICY_NOT_SUPPOURTED_CRIU, "metronome"); return false; @@ -107,7 +107,7 @@ isBalancedGCPolicySupported(MM_GCExtensions *extensions) { #if defined(J9VM_OPT_CRIU_SUPPORT) J9JavaVM *vm = extensions->getJavaVM(); - if (TRUE == vm->checkpointState.isCheckPointEnabled) { + if (vm->internalVMFunctions->isCRIUSupportEnabled_VM(vm)) { PORT_ACCESS_FROM_JAVAVM(vm); j9nls_printf(PORTLIB, J9NLS_ERROR, J9NLS_GC_POLICY_NOT_SUPPOURTED_CRIU, "balanced"); return false; diff --git a/runtime/oti/j9nonbuilder.h b/runtime/oti/j9nonbuilder.h index 0aa1b670580..d5470b40875 100644 --- a/runtime/oti/j9nonbuilder.h +++ b/runtime/oti/j9nonbuilder.h @@ -4178,11 +4178,14 @@ typedef struct J9DelayedLockingOpertionsRecord { #define J9_SINGLE_THREAD_MODE_OP_NOTIFY_ALL 0x2 #define J9_SINGLE_THREAD_MODE_OP_INTERRUPT 0x3 +#define J9VM_CRIU_IS_CHECKPOINT_ENABLED 0x1 +#define J9VM_CRIU_IS_CHECKPOINT_ALLOWED 0x2 +#define J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE 0x4 +#define J9VM_CRIU_IS_JDWP_ENABLED 0x8 +#define J9VM_CRIU_IS_THROW_ON_DELAYED_CHECKPOINT_ENABLED 0x10 + typedef struct J9CRIUCheckpointState { - 
BOOLEAN isCheckPointEnabled; - BOOLEAN isCheckPointAllowed; - BOOLEAN isNonPortableRestoreMode; - BOOLEAN isJdwpEnabled; + U_32 flags; struct J9DelayedLockingOpertionsRecord *delayedLockingOperationsRoot; struct J9Pool *hookRecords; struct J9Pool *classIterationRestoreHookRecords; diff --git a/runtime/oti/jvminit.h b/runtime/oti/jvminit.h index 17fd9fc689b..98f1e8690df 100644 --- a/runtime/oti/jvminit.h +++ b/runtime/oti/jvminit.h @@ -425,6 +425,8 @@ enum INIT_STAGE { #define VMOPT_XXENABLECRIUNONPORTABLEMODE "-XX:+CRIURestoreNonPortableMode" #define VMOPT_XXDISABLECRIUNONPORTABLEMODE "-XX:-CRIURestoreNonPortableMode" #define VMOPT_XSHARECLASSES_DISABLEONRESTORE "-Xshareclasses:disableOnRestore" +#define VMOPT_XXENABLETHROWONDELAYECHECKPOINTOPERATION "-XX:+ThrowOnDelayedCheckpointOperation" +#define VMOPT_XXDISABLETHROWONDELAYECHECKPOINTOPERATION "-XX:-ThrowOnDelayedCheckpointOperation" #endif /* defined(J9VM_OPT_CRIU_SUPPORT) */ /* diff --git a/runtime/vm/CRIUHelpers.cpp b/runtime/vm/CRIUHelpers.cpp index e01383467fc..d9bab03ca37 100644 --- a/runtime/vm/CRIUHelpers.cpp +++ b/runtime/vm/CRIUHelpers.cpp @@ -84,12 +84,12 @@ jvmRestoreHooks(J9VMThread *currentThread) nas.name = (J9UTF8 *)&runPostRestoreHooks_name; nas.signature = (J9UTF8 *)&runPostRestoreHooks_sig; - Assert_VM_true(vm->checkpointState.isCheckPointEnabled); + Assert_VM_true(isCRIUSupportEnabled_VM(vm)); - if (vm->checkpointState.isNonPortableRestoreMode) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE)) { PORT_ACCESS_FROM_JAVAVM(vm); - vm->checkpointState.isCheckPointAllowed = FALSE; vm->portLibrary->isCheckPointAllowed = FALSE; + vm->checkpointState.flags &= ~J9VM_CRIU_IS_CHECKPOINT_ALLOWED; j9port_control(J9PORT_CTLDATA_CRIU_SUPPORT_FLAGS, OMRPORT_CRIU_SUPPORT_ENABLED | J9OMRPORT_CRIU_SUPPORT_FINAL_RESTORE); } @@ -114,7 +114,7 @@ isCRIUSupportEnabled(J9VMThread *currentThread) BOOLEAN isCRIUSupportEnabled_VM(J9JavaVM *vm) { - return 
vm->checkpointState.isCheckPointEnabled; + return J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_CHECKPOINT_ENABLED); } BOOLEAN @@ -123,7 +123,7 @@ isCheckpointAllowed(J9VMThread *currentThread) BOOLEAN result = FALSE; if (isCRIUSupportEnabled(currentThread)) { - result = currentThread->javaVM->checkpointState.isCheckPointAllowed; + result = J9_ARE_ALL_BITS_SET(currentThread->javaVM->checkpointState.flags, J9VM_CRIU_IS_CHECKPOINT_ALLOWED); } return result; @@ -132,7 +132,7 @@ isCheckpointAllowed(J9VMThread *currentThread) BOOLEAN isNonPortableRestoreMode(J9VMThread *currentThread) { - return currentThread->javaVM->checkpointState.isNonPortableRestoreMode; + return J9_ARE_ALL_BITS_SET(currentThread->javaVM->checkpointState.flags, J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE); } /** @@ -402,7 +402,7 @@ cleanupCriuHooks(J9VMThread *currentThread) hookRecord = (J9InternalHookRecord*)pool_nextDo(&walkState); } - if (vm->checkpointState.isNonPortableRestoreMode) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE)) { /* No more checkpoint, cleanup hook records. */ pool_kill(vm->checkpointState.hookRecords); vm->checkpointState.hookRecords = NULL; @@ -410,7 +410,7 @@ cleanupCriuHooks(J9VMThread *currentThread) } J9Pool *classIterationRestoreHookRecords = vm->checkpointState.classIterationRestoreHookRecords; - if ((NULL != classIterationRestoreHookRecords) && (vm->checkpointState.isNonPortableRestoreMode)) { + if ((NULL != classIterationRestoreHookRecords) && J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE)) { /* No more checkpoint, cleanup hook records. 
*/ pool_kill(vm->checkpointState.classIterationRestoreHookRecords); vm->checkpointState.classIterationRestoreHookRecords = NULL; diff --git a/runtime/vm/jvminit.c b/runtime/vm/jvminit.c index 830458a7687..77209326b92 100644 --- a/runtime/vm/jvminit.c +++ b/runtime/vm/jvminit.c @@ -2875,7 +2875,7 @@ VMInitStages(J9JavaVM *vm, IDATA stage, void* reserved) } } else #if defined(J9VM_OPT_CRIU_SUPPORT) - if (vm->checkpointState.isCheckPointAllowed) { + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_CHECKPOINT_ALLOWED)) { if (J9JAVAVM_COMPRESS_OBJECT_REFERENCES(vm)) { #if defined(OMR_GC_COMPRESSED_POINTERS) vm->bytecodeLoop = criuBytecodeLoopCompressed; @@ -3819,8 +3819,7 @@ processVMArgsFromFirstToLast(J9JavaVM * vm) IDATA disableCRIU = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_XXDISABLECRIU, NULL); if (enableCRIU > disableCRIU) { PORT_ACCESS_FROM_JAVAVM(vm); - vm->checkpointState.isCheckPointEnabled = TRUE; - vm->checkpointState.isCheckPointAllowed = TRUE; + vm->checkpointState.flags |= J9VM_CRIU_IS_CHECKPOINT_ENABLED | J9VM_CRIU_IS_CHECKPOINT_ALLOWED; vm->portLibrary->isCheckPointAllowed = TRUE; j9port_control(J9PORT_CTLDATA_CRIU_SUPPORT_FLAGS, OMRPORT_CRIU_SUPPORT_ENABLED); } @@ -3830,12 +3829,20 @@ processVMArgsFromFirstToLast(J9JavaVM * vm) IDATA enableCRIUNonPortableMode = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_XXENABLECRIUNONPORTABLEMODE, NULL); IDATA disableCRIUNonPortableMode = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_XXDISABLECRIUNONPORTABLEMODE, NULL); if (enableCRIUNonPortableMode >= disableCRIUNonPortableMode) { - if (vm->checkpointState.isCheckPointEnabled) { - vm->checkpointState.isNonPortableRestoreMode = TRUE; + if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRIU_IS_CHECKPOINT_ENABLED)) { + vm->checkpointState.flags |= J9VM_CRIU_IS_NON_PORTABLE_RESTORE_MODE; } } } + { + IDATA enableThrowOnDelayedCheckpointOperation = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_XXENABLETHROWONDELAYECHECKPOINTOPERATION, NULL); + IDATA 
disableThrowOnDelayedCheckpointOperation = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_XXDISABLETHROWONDELAYECHECKPOINTOPERATION, NULL); + if (disableThrowOnDelayedCheckpointOperation >= enableThrowOnDelayedCheckpointOperation) { + vm->checkpointState.flags |= J9VM_CRIU_IS_THROW_ON_DELAYED_CHECKPOINT_ENABLED; + } + } + /* Its unclear if we need an option for this, so we can keep the init here for the time being */ vm->checkpointState.maxRetryForNotCheckpointSafe = 100; #endif /* defined(J9VM_OPT_CRIU_SUPPORT) */ diff --git a/test/functional/cmdLineTests/criu/criuScript.sh b/test/functional/cmdLineTests/criu/criuScript.sh index 92e7c276918..41b6d2d06b2 100644 --- a/test/functional/cmdLineTests/criu/criuScript.sh +++ b/test/functional/cmdLineTests/criu/criuScript.sh @@ -36,7 +36,7 @@ echo "export GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC,-XSAVE,-AVX2,-ERMS,-AVX,-AV export GLIBC_TUNABLES=glibc.pthread.rseq=0:glibc.cpu.hwcaps=-XSAVEC,-XSAVE,-AVX2,-ERMS,-AVX,-AVX_Fast_Unaligned_Load echo "export LD_BIND_NOT=on"; export LD_BIND_NOT=on - +echo "$2 -XX:+EnableCRIUSupport $3 -cp "$1/criu.jar" $4 $5 $6" $2 -XX:+EnableCRIUSupport $3 -cp "$1/criu.jar" $4 $5 $6 >testOutput 2>&1; if [ "$7" != true ]; then diff --git a/test/functional/cmdLineTests/criu/criu_nonPortable.xml b/test/functional/cmdLineTests/criu/criu_nonPortable.xml index e535ccba6c3..256475f1838 100644 --- a/test/functional/cmdLineTests/criu/criu_nonPortable.xml +++ b/test/functional/cmdLineTests/criu/criu_nonPortable.xml @@ -232,7 +232,7 @@ - bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -Xtrace:print=j9criu.11 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ NotCheckpointSafeDeadlock 1 + bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xtrace:print=j9criu.17 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports 
java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ NotCheckpointSafeDeadlock 1 TEST PASSED Pre-checkpoint Checkpoint blocked because thread @@ -246,9 +246,8 @@ Could not dump the JVM processes, err=-70 - + + + bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xdump:system:events=user -Xtrace:print=j9criu.17 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ ClinitTest 1 + TEST PASSED + TEST FAILED + Pre-checkpoint + Checkpoint blocked because thread + Killed + CRIU is not enabled + Operation not permitted + If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. + Thread pid mismatch + do not match expected + Unable to create a thread: + TEST FAILED + In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. + Could not dump the JVM processes, err=-70 + + + + bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:-ThrowOnDelayedCheckpointOperation -Xdump:system:events=user -Xtrace:print=j9criu.17 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ ClinitTest 1 + TEST PASSED + TEST FAILED + Pre-checkpoint + Checkpoint blocked because thread + Taking a checkpoint with active clinit + Killed + CRIU is not enabled + Operation not permitted + If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. + Thread pid mismatch + do not match expected + Unable to create a thread: + TEST FAILED + In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. 
+ Could not dump the JVM processes, err=-70 + bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$" $MAINCLASS_DELAYEDOPERATIONS$ 1 1 false diff --git a/test/functional/cmdLineTests/criu/src/org/openj9/criu/DeadlockTest.java b/test/functional/cmdLineTests/criu/src/org/openj9/criu/DeadlockTest.java index 6f049d36890..38073b7eaa8 100644 --- a/test/functional/cmdLineTests/criu/src/org/openj9/criu/DeadlockTest.java +++ b/test/functional/cmdLineTests/criu/src/org/openj9/criu/DeadlockTest.java @@ -41,6 +41,8 @@ import jdk.internal.misc.Unsafe; public class DeadlockTest { + final static TestResult mainTestResult = new TestResult(true, 0); + final static Object lock = new Object(); public static void main(String[] args) { String test = args[0]; @@ -55,6 +57,12 @@ public static void main(String[] args) { case "MethodTypeDeadlockTest": methodTypeDeadlockTest(); break; + case "ClinitTest": + clinitTest(); + break; + case "ClinitTest2": + clinitTest2(); + break; default: throw new RuntimeException("incorrect parameters"); } @@ -229,6 +237,95 @@ static public byte[] getClassBytesFromResource(Class clazz) { return result; } + public static void clinitTest() { + Path path = Paths.get("cpData"); + + mainTestResult.testPassed = false; + mainTestResult.lockStatus = 0; + + Thread t1 = new Thread(()->{ + new ClinitDeadlock(); + }); + + t1.start(); + + while (mainTestResult.lockStatus == 0) { + Thread.yield(); + } + + try { + System.out.println("Pre-checkpoint"); + CRIUTestUtils.checkPointJVM(path); + mainTestResult.testPassed = false; + } catch (JVMCheckpointException e) { + mainTestResult.testPassed = true; + } finally { + synchronized(lock) { + lock.notify(); + } + } + + if (mainTestResult.testPassed) { + System.out.println("TEST PASSED"); + } else { + System.out.println("TEST FAILED"); + } + + System.exit(0); + } + + public static void clinitTest2() { + Path path = Paths.get("cpData"); + + mainTestResult.testPassed = false; + mainTestResult.lockStatus = 0; + + Thread t1 = 
new Thread(()->{ + new ClinitDeadlock(); + }); + + t1.start(); + + while (mainTestResult.lockStatus == 0) { + Thread.yield(); + } + + try { + System.out.println("Pre-checkpoint"); + CRIUTestUtils.checkPointJVM(path); + mainTestResult.testPassed = true; + } catch (JVMCheckpointException e) { + mainTestResult.testPassed = false; + } finally { + synchronized(lock) { + lock.notify(); + } + } + + if (mainTestResult.testPassed) { + System.out.println("TEST PASSED"); + } else { + System.out.println("TEST FAILED"); + } + + System.exit(0); + } + + static class ClinitDeadlock { + + static { + mainTestResult.lockStatus = 1; + synchronized(lock) { + try { + System.out.println("Thread waiting"); + lock.wait(); + } catch(InterruptedException e) { + e.printStackTrace(); + } + } + } + } + static class A { int x; } From f0cbaa38e4dd22d13f2070d900dce3c854ac144c Mon Sep 17 00:00:00 2001 From: Tobi Ajila Date: Thu, 22 Jun 2023 10:01:08 -0400 Subject: [PATCH 2/3] Fix delay checkpoint tracepoint arg Related: https://github.com/eclipse-openj9/openj9/issues/15806 Signed-off-by: Tobi Ajila --- runtime/criusupport/criusupport.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/criusupport/criusupport.cpp b/runtime/criusupport/criusupport.cpp index 1363b867446..a32fa2c1566 100644 --- a/runtime/criusupport/criusupport.cpp +++ b/runtime/criusupport/criusupport.cpp @@ -378,7 +378,7 @@ checkIfSafeToCheckpoint(J9VMThread *currentThread) vm->walkStackFrames(walkThread, &walkState); if (0 != notSafeToCheckpoint) { - Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlockedVer2(currentThread, walkState.userData2, walkState.userData3, walkState.userData1); + Trc_CRIU_checkpointJVMImpl_checkIfSafeToCheckpointBlockedVer2(currentThread, walkState.userData2, walkState.userData3, *(UDATA*)walkState.userData1); break; } } From 99eee9de3de2c6a8381809a03efc12a2e947c215 Mon Sep 17 00:00:00 2001 From: Tobi Ajila Date: Fri, 23 Jun 2023 09:55:22 -0400 Subject: [PATCH 3/3] Disable method 
deadlock test Signed-off-by: Tobi Ajila --- .../cmdLineTests/criu/criu_nonPortable.xml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/functional/cmdLineTests/criu/criu_nonPortable.xml b/test/functional/cmdLineTests/criu/criu_nonPortable.xml index 256475f1838..ae8b9489f9b 100644 --- a/test/functional/cmdLineTests/criu/criu_nonPortable.xml +++ b/test/functional/cmdLineTests/criu/criu_nonPortable.xml @@ -246,8 +246,9 @@ Could not dump the JVM processes, err=-70 + bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xdump:system:events=user -Xtrace:print=j9criu.17 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ ClinitTest 1 TEST PASSED @@ -273,12 +274,12 @@ Killed CRIU is not enabled Operation not permitted - If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. + Thread pid mismatch do not match expected Unable to create a thread: TEST FAILED - In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. + Could not dump the JVM processes, err=-70 @@ -292,12 +293,12 @@ Killed CRIU is not enabled Operation not permitted - If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. + Thread pid mismatch do not match expected Unable to create a thread: TEST FAILED - In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. + Could not dump the JVM processes, err=-70