Skip to content

More retry #236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,29 @@

package com.coder.gateway

import com.coder.gateway.sdk.humanizeDuration
import com.coder.gateway.sdk.isCancellation
import com.coder.gateway.sdk.isWorkerTimeout
import com.coder.gateway.sdk.suspendingRetryWithExponentialBackOff
import com.coder.gateway.services.CoderRecentWorkspaceConnectionsService
import com.intellij.openapi.application.ApplicationManager
import com.intellij.openapi.components.service
import com.intellij.openapi.diagnostic.Logger
import com.intellij.openapi.rd.util.launchUnderBackgroundProgress
import com.intellij.openapi.ui.Messages
import com.jetbrains.gateway.api.ConnectionRequestor
import com.jetbrains.gateway.api.GatewayConnectionHandle
import com.jetbrains.gateway.api.GatewayConnectionProvider
import com.jetbrains.gateway.api.GatewayUI
import com.jetbrains.gateway.ssh.SshDeployFlowUtil
import com.jetbrains.gateway.ssh.SshMultistagePanelContext
import com.jetbrains.gateway.ssh.deploy.DeployException
import com.jetbrains.rd.util.lifetime.LifetimeDefinition
import kotlinx.coroutines.launch
import net.schmizz.sshj.common.SSHException
import net.schmizz.sshj.connection.ConnectionException
import java.time.Duration
import java.util.concurrent.TimeoutException

class CoderGatewayConnectionProvider : GatewayConnectionProvider {
private val recentConnectionsService = service<CoderRecentWorkspaceConnectionsService>()
Expand All @@ -24,12 +34,53 @@ class CoderGatewayConnectionProvider : GatewayConnectionProvider {
// TODO: If this fails determine if it is an auth error and if so prompt
// for a new token, configure the CLI, then try again.
clientLifetime.launchUnderBackgroundProgress(CoderGatewayBundle.message("gateway.connector.coder.connection.provider.title"), canBeCancelled = true, isIndeterminate = true, project = null) {
val context = SshMultistagePanelContext(parameters.toHostDeployInputs())
logger.info("Deploying and starting IDE with $context")
launch {
@Suppress("UnstableApiUsage") SshDeployFlowUtil.fullDeployCycle(
clientLifetime, context, Duration.ofMinutes(10)
try {
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting")
val context = suspendingRetryWithExponentialBackOff(
action = { attempt ->
logger.info("Connecting... (attempt $attempt")
if (attempt > 1) {
// indicator.text is the text above the progress bar.
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting.retry", attempt)
}
SshMultistagePanelContext(parameters.toHostDeployInputs())
},
retryIf = {
it is ConnectionException || it is TimeoutException
|| it is SSHException || it is DeployException
Comment on lines +48 to +50
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this is neat! <3

},
onException = { attempt, nextMs, e ->
logger.error("Failed to connect (attempt $attempt; will retry in $nextMs ms)")
// indicator.text2 is the text below the progress bar.
indicator.text2 =
if (isWorkerTimeout(e)) "Failed to upload worker binary...it may have timed out"
else e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
},
onCountdown = { remainingMs ->
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting.failed.retry", humanizeDuration(remainingMs))
},
)
launch {
logger.info("Deploying and starting IDE with $context")
// At this point JetBrains takes over with their own UI.
@Suppress("UnstableApiUsage") SshDeployFlowUtil.fullDeployCycle(
clientLifetime, context, Duration.ofMinutes(10)
)
}
} catch (e: Exception) {
if (isCancellation(e)) {
logger.info("Connection canceled due to ${e.javaClass}")
} else {
logger.info("Failed to connect (will not retry)", e)
// The dialog will close once we return so write the error
// out into a new dialog.
ApplicationManager.getApplication().invokeAndWait {
Messages.showMessageDialog(
e.message ?: CoderGatewayBundle.message("gateway.connector.no-details"),
CoderGatewayBundle.message("gateway.connector.coder.connection.failed"),
Messages.getErrorIcon())
}
}
}
}

Expand Down
95 changes: 77 additions & 18 deletions src/main/kotlin/com/coder/gateway/sdk/Retry.kt
Original file line number Diff line number Diff line change
@@ -1,23 +1,50 @@
package com.coder.gateway.sdk

import com.intellij.openapi.progress.ProcessCanceledException
import com.intellij.ssh.SshException
import com.jetbrains.gateway.ssh.deploy.DeployException
import kotlinx.coroutines.delay
import kotlinx.datetime.Clock
import java.util.Random
import java.util.concurrent.TimeUnit
import kotlin.concurrent.timer
import kotlin.math.max
import kotlin.coroutines.cancellation.CancellationException
import kotlin.math.min

/**
 * Return the innermost cause of [ex].
 *
 * Follows the `cause` links starting at [ex] until reaching a throwable that
 * has no cause of its own and returns that throwable. If [ex] has no cause at
 * all, [ex] itself is returned.
 */
fun unwrap(ex: Exception): Throwable {
    // Lazily enumerate ex.cause, ex.cause.cause, ... and keep the last link.
    val causeChain = generateSequence(ex.cause) { link -> link.cause }
    return causeChain.lastOrNull() ?: ex
}

/**
* Similar to Intellij's except it gives you the next delay, does not do its own
* logging, updates periodically (for counting down), and runs forever.
* Similar to Intellij's except it adds two new arguments: onCountdown (for
* displaying the time until the next try) and retryIf (to limit which
* exceptions can be retried).
*
* Exceptions that cannot be retried will be thrown.
*
* onException and onCountdown will be called immediately on retryable failures.
* onCountdown will also be called every second until the next try with the time
* left until that next try (the last interval might be less than one second if
* the total delay is not divisible by one second).
*
* Some other differences:
* - onException gives you the time until the next try (intended to be logged
* with the error).
* - Infinite tries.
* - SshException is unwrapped.
*
* It is otherwise identical.
*/
suspend fun <T> suspendingRetryWithExponentialBackOff(
initialDelayMs: Long = TimeUnit.SECONDS.toMillis(5),
backOffLimitMs: Long = TimeUnit.MINUTES.toMillis(3),
backOffFactor: Int = 2,
backOffJitter: Double = 0.1,
update: (attempt: Int, remainingMs: Long, e: Exception) -> Unit,
retryIf: (e: Throwable) -> Boolean,
onException: (attempt: Int, nextMs: Long, e: Throwable) -> Unit,
onCountdown: (remaining: Long) -> Unit,
action: suspend (attempt: Int) -> T
): T {
val random = Random()
Expand All @@ -26,21 +53,53 @@ suspend fun <T> suspendingRetryWithExponentialBackOff(
try {
return action(attempt)
}
catch (e: Exception) {
val end = Clock.System.now().toEpochMilliseconds() + delayMs
val timer = timer(period = TimeUnit.SECONDS.toMillis(1)) {
val now = Clock.System.now().toEpochMilliseconds()
val next = max(end - now, 0)
if (next > 0) {
update(attempt, next, e)
} else {
this.cancel()
}
catch (originalEx: Exception) {
// SshException can happen due to anything from a timeout to being
// canceled so unwrap to find out.
val unwrappedEx = if (originalEx is SshException) unwrap(originalEx) else originalEx
if (!retryIf(unwrappedEx)) {
throw unwrappedEx
}
onException(attempt, delayMs, unwrappedEx)
var remainingMs = delayMs
while (remainingMs > 0) {
onCountdown(remainingMs)
val next = min(remainingMs, TimeUnit.SECONDS.toMillis(1))
remainingMs -= next
delay(next)
}
delay(delayMs)
timer.cancel()
delayMs = min(delayMs * backOffFactor, backOffLimitMs) + (random.nextGaussian() * delayMs * backOffJitter).toLong()
}
}
error("Should never be reached")
}

/**
 * Convert a millisecond duration into a human-readable string.
 *
 * The duration is truncated to whole seconds before formatting:
 *
 * - less than 1 second (including negative values): "now"
 * - exactly 1 whole second: "in 1 second"
 * - more than 1 whole second: "in <seconds> seconds"
 *
 * @param durationMs the duration in milliseconds.
 * @return the phrase describing when the duration elapses.
 */
fun humanizeDuration(durationMs: Long): String {
    val wholeSeconds = TimeUnit.MILLISECONDS.toSeconds(durationMs)
    return when {
        wholeSeconds < 1 -> "now"
        wholeSeconds > 1 -> "in $wholeSeconds seconds"
        else -> "in $wholeSeconds second"
    }
}

/**
 * Return true if the error looks like a worker upload timeout.
 *
 * When the worker upload times out Gateway just reports that it failed; even
 * the root cause (IllegalStateException) is useless and the message includes
 * a very long, unhelpful tmp path. The only signal checked here is that [e] is
 * a [DeployException] whose message contains "Worker binary deploy failed".
 */
fun isWorkerTimeout(e: Throwable): Boolean {
    // Anything other than a DeployException cannot be the worker timeout.
    if (e !is DeployException) {
        return false
    }
    return e.message.contains("Worker binary deploy failed")
}

/**
 * Return true if [e] is one of the exception types this file treats as a
 * cancellation: [InterruptedException], [CancellationException], or
 * [ProcessCanceledException].
 */
fun isCancellation(e: Throwable): Boolean =
    when (e) {
        is InterruptedException,
        is CancellationException,
        is ProcessCanceledException -> true
        else -> false
    }
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import com.coder.gateway.sdk.Arch
import com.coder.gateway.sdk.CoderCLIManager
import com.coder.gateway.sdk.CoderRestClientService
import com.coder.gateway.sdk.OS
import com.coder.gateway.sdk.humanizeDuration
import com.coder.gateway.sdk.isCancellation
import com.coder.gateway.sdk.isWorkerTimeout
import com.coder.gateway.sdk.suspendingRetryWithExponentialBackOff
import com.coder.gateway.sdk.toURL
import com.coder.gateway.sdk.withPath
Expand Down Expand Up @@ -68,7 +71,6 @@ import net.schmizz.sshj.connection.ConnectionException
import java.awt.Component
import java.awt.FlowLayout
import java.util.Locale
import java.util.concurrent.TimeUnit
import java.util.concurrent.TimeoutException
import javax.swing.ComboBoxModel
import javax.swing.DefaultComboBoxModel
Expand All @@ -79,7 +81,6 @@ import javax.swing.JPanel
import javax.swing.ListCellRenderer
import javax.swing.SwingConstants
import javax.swing.event.DocumentEvent
import kotlin.coroutines.cancellation.CancellationException

class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolean) -> Unit) : CoderWorkspacesWizardStep, Disposable {
private val cs = CoroutineScope(Dispatchers.Main)
Expand Down Expand Up @@ -162,6 +163,7 @@ class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolea
// Clear contents from the last attempt if any.
cbIDEComment.foreground = UIUtil.getContextHelpForeground()
cbIDEComment.text = CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.ide.none.comment")
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides"))
ideComboBoxModel.removeAllElements()
setNextButtonEnabled(false)

Expand All @@ -178,54 +180,47 @@ class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolea
terminalLink.url = coderClient.coderURL.withPath("/@${coderClient.me.username}/${selectedWorkspace.name}/terminal").toString()

ideResolvingJob = cs.launch {
val ides = suspendingRetryWithExponentialBackOff(
action={ attempt ->
// Reset text in the select dropdown.
withContext(Dispatchers.Main) {
cbIDE.renderer = IDECellRenderer(
if (attempt > 1) CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.retry.text", attempt)
else CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.loading.text"))
}
try {
try {
val ides = suspendingRetryWithExponentialBackOff(
action = { attempt ->
logger.info("Retrieving IDEs...(attempt $attempt)")
if (attempt > 1) {
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve.ides.retry", attempt))
}
val executor = createRemoteExecutor(CoderCLIManager.getHostName(deploymentURL, selectedWorkspace))
if (ComponentValidator.getInstance(tfProject).isEmpty) {
installRemotePathValidator(executor)
}
retrieveIDEs(executor, selectedWorkspace)
} catch (e: Exception) {
when(e) {
is InterruptedException -> Unit
is CancellationException -> Unit
// Throw to retry these. The main one is
// DeployException which fires when dd times out.
is ConnectionException, is TimeoutException,
is SSHException, is DeployException -> throw e
else -> {
withContext(Dispatchers.Main) {
logger.error("Failed to retrieve IDEs (attempt $attempt)", e)
cbIDEComment.foreground = UIUtil.getErrorForeground()
cbIDEComment.text = e.message ?: "The error did not provide any further details"
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.error.text"), UIUtil.getBalloonErrorIcon())
}
}
}
null
}
},
update = { attempt, retryMs, e ->
logger.error("Failed to retrieve IDEs (attempt $attempt; will retry in $retryMs ms)", e)
cbIDEComment.foreground = UIUtil.getErrorForeground()
cbIDEComment.text = e.message ?: "The error did not provide any further details"
val delayS = TimeUnit.MILLISECONDS.toSeconds(retryMs)
val delay = if (delayS < 1) "now" else "in $delayS second${if (delayS > 1) "s" else ""}"
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.retry-error.text", delay))
},
)
if (ides != null) {
},
retryIf = {
it is ConnectionException || it is TimeoutException
|| it is SSHException || it is DeployException
},
onException = { attempt, nextMs, e ->
logger.error("Failed to retrieve IDEs (attempt $attempt; will retry in $nextMs ms)")
cbIDEComment.foreground = UIUtil.getErrorForeground()
cbIDEComment.text =
if (isWorkerTimeout(e)) "Failed to upload worker binary...it may have timed out. Check the command log for more details."
else e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
},
onCountdown = { remainingMs ->
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides.failed.retry", humanizeDuration(remainingMs)))
},
)
withContext(Dispatchers.Main) {
ideComboBoxModel.addAll(ides)
cbIDE.selectedIndex = 0
}
} catch (e: Exception) {
if (isCancellation(e)) {
logger.info("Connection canceled due to ${e.javaClass}")
} else {
logger.error("Failed to retrieve IDEs (will not retry)", e)
cbIDEComment.foreground = UIUtil.getErrorForeground()
cbIDEComment.text = e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides.failed"), UIUtil.getBalloonErrorIcon())
}
}
}
}
Expand Down
13 changes: 9 additions & 4 deletions src/main/resources/messages/CoderGatewayBundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ gateway.connector.view.workspaces.token.comment=The last used token is shown abo
gateway.connector.view.workspaces.token.rejected=This token was rejected.
gateway.connector.view.workspaces.token.injected=This token was pulled from your CLI config.
gateway.connector.view.workspaces.token.none=No existing token found.
gateway.connector.view.coder.remoteproject.loading.text=Retrieving products...
gateway.connector.view.coder.remoteproject.retry.text=Retrieving products (attempt {0})...
gateway.connector.view.coder.remoteproject.error.text=Failed to retrieve IDEs
gateway.connector.view.coder.remoteproject.retry-error.text=Failed to retrieve IDEs...retrying {0}
gateway.connector.view.coder.retrieve-ides=Retrieving IDEs...
gateway.connector.view.coder.retrieve.ides.retry=Retrieving IDEs (attempt {0})...
gateway.connector.view.coder.retrieve-ides.failed=Failed to retrieve IDEs
gateway.connector.view.coder.retrieve-ides.failed.retry=Failed to retrieve IDEs...retrying {0}
gateway.connector.view.coder.remoteproject.next.text=Start IDE and connect
gateway.connector.view.coder.remoteproject.choose.text=Choose IDE and project for workspace {0}
gateway.connector.view.coder.remoteproject.ide.download.comment=This IDE will be downloaded from jetbrains.com and installed to the default path on the remote host.
Expand All @@ -42,6 +42,10 @@ gateway.connector.recentconnections.new.wizard.button.tooltip=Open a new Coder W
gateway.connector.recentconnections.remove.button.tooltip=Remove from Recent Connections
gateway.connector.recentconnections.terminal.button.tooltip=Open SSH Web Terminal
gateway.connector.coder.connection.provider.title=Connecting to Coder workspace...
gateway.connector.coder.connecting=Connecting...
gateway.connector.coder.connecting.retry=Connecting (attempt {0})...
gateway.connector.coder.connection.failed=Failed to connect
gateway.connector.coder.connecting.failed.retry=Failed to connect...retrying {0}
gateway.connector.settings.binary-source.title=CLI source:
gateway.connector.settings.binary-source.comment=Used to download the Coder \
CLI which is necessary to make SSH connections. The If-None-Match header \
Expand All @@ -54,3 +58,4 @@ gateway.connector.settings.binary-destination.comment=Directories are created \
here that store the CLI and credentials for each domain to which the plugin \
connects. \
Defaults to {0}.
gateway.connector.no-details=The error did not provide any further details