Skip to content

[Bug]: Provisioning does not work in some environments #2963

@jvstme

Description

@jvstme

Steps to reproduce

Try to provision a new instance.

The bug is only reproducible in some environments. It is known to be reproducible on T4 and A10G in AWS, P100 in OCI, and A100 in Azure.

Actual behaviour

The run/instance status is stuck in provisioning for 10 minutes, then fails after the provisioning timeout.

dstack-shim does not run on the instance. Attempts to run it manually result in this error:

# ./dstack-shim-linux-amd64
time=2025-08-08T10:47:01.288114Z level=warning msg=not using DCGM Exporter err=exec: "dcgm-exporter": executable file not found in $PATH
SIGFPE: floating-point exception
PC=0x7c1005975bb9 m=3 sigcode=1
signal arrived during cgo execution
instruction bytes: 0x48 0xf7 0xf6 0x48 0x89 0xca 0x48 0xf 0xaf 0xc6 0x48 0x8d 0x84 0x1 0x0 0x8

goroutine 1 gp=0xc0000061c0 m=3 mp=0xc000056e08 [syscall]:
runtime.cgocall(0x90bdd0, 0xc0001d9508)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/cgocall.go:167 +0x4b fp=0xc0001d94e0 sp=0xc0001d94a8 pc=0x46a90b
github.com/NVIDIA/go-dcgm/pkg/dcgm._Cfunc_dlopen(0x7c0ffc000c90, 0x101)
	_cgo_gotypes.go:1666 +0x4c fp=0xc0001d9508 sp=0xc0001d94e0 pc=0x8b24cc
github.com/NVIDIA/go-dcgm/pkg/dcgm.initDCGM(0x0, {0x0, 0x0, 0x0})
	/home/runner/go/pkg/mod/github.com/!n!v!i!d!i!a/go-dcgm@v0.0.0-20250707210631-823394f2bd9b/pkg/dcgm/admin.go:66 +0x85 fp=0xc0001d9580 sp=0xc0001d9508 pc=0x8b2765
github.com/NVIDIA/go-dcgm/pkg/dcgm.Init(0x0, {0x0, 0x0, 0x0})
	/home/runner/go/pkg/mod/github.com/!n!v!i!d!i!a/go-dcgm@v0.0.0-20250707210631-823394f2bd9b/pkg/dcgm/api.go:34 +0x173 fp=0xc0001d9600 sp=0xc0001d9580 pc=0x8b1a73
github.com/dstackai/dstack/runner/internal/shim/dcgm.NewDCGMWrapper({0x0?, 0xc0002886c0?})
	/home/runner/work/dstack/dstack/runner/internal/shim/dcgm/wrapper.go:55 +0x2a fp=0xc0001d9670 sp=0xc0001d9600 pc=0x8b6a4a
main.start({0xbec670, 0xfed8a0}, {{0x2af6, {0x0, 0x0}, 0x4}, {0x2af7, 0x2726, {0x0, 0x0}, ...}, ...}, ...)
	/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:227 +0xb4c fp=0xc0001d9aa0 sp=0xc0001d9670 pc=0x90ac6c
main.main.func1(0xc000130e80?)
	/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:151 +0x58 fp=0xc0001d9b68 sp=0xc0001d9aa0 pc=0x90a0f8
github.com/urfave/cli/v2.(*Command).Run(0xc000152420, 0xc000130e80, {0xc00011e040, 0x1, 0x1})
	/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/command.go:279 +0x7e2 fp=0xc0001d9de0 sp=0xc0001d9b68 pc=0x8f84a2
github.com/urfave/cli/v2.(*App).RunContext(0xc00013ae00, {0xbec670, 0xfed8a0}, {0xc00011e040, 0x1, 0x1})
	/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/app.go:337 +0x58b fp=0xc0001d9e40 sp=0xc0001d9de0 pc=0x8f4e8b
github.com/urfave/cli/v2.(*App).Run(...)
	/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/app.go:311
main.main()
	/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:155 +0xf6c fp=0xc0001d9f50 sp=0xc0001d9e40 pc=0x90a04c
runtime.main()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:272 +0x28b fp=0xc0001d9fe0 sp=0xc0001d9f50 pc=0x43cb0b
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0001d9fe8 sp=0xc0001d9fe0 pc=0x4788e1

goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000050fa8 sp=0xc000050f88 pc=0x470c0e
runtime.goparkunlock(...)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.forcegchelper()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:337 +0xb3 fp=0xc000050fe0 sp=0xc000050fa8 pc=0x43ce53
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000050fe8 sp=0xc000050fe0 pc=0x4788e1
created by runtime.init.7 in goroutine 1
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:325 +0x1a

goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000051780 sp=0xc000051760 pc=0x470c0e
runtime.goparkunlock(...)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.bgsweep(0xc00007c000)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcsweep.go:277 +0x94 fp=0xc0000517c8 sp=0xc000051780 pc=0x427594
runtime.gcenable.gowrap1()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:204 +0x25 fp=0xc0000517e0 sp=0xc0000517c8 pc=0x41bca5
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0000517e8 sp=0xc0000517e0 pc=0x4788e1
created by runtime.gcenable in goroutine 1
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:204 +0x66

goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0xc00007c000?, 0xbe1e50?, 0x1?, 0x0?, 0xc000007340?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000051f78 sp=0xc000051f58 pc=0x470c0e
runtime.goparkunlock(...)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.(*scavengerState).park(0xfc6bc0)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcscavenge.go:425 +0x49 fp=0xc000051fa8 sp=0xc000051f78 pc=0x424fc9
runtime.bgscavenge(0xc00007c000)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcscavenge.go:653 +0x3c fp=0xc000051fc8 sp=0xc000051fa8 pc=0x42553c
runtime.gcenable.gowrap2()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:205 +0x25 fp=0xc000051fe0 sp=0xc000051fc8 pc=0x41bc45
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000051fe8 sp=0xc000051fe0 pc=0x4788e1
created by runtime.gcenable in goroutine 1
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:205 +0xa5

goroutine 18 gp=0xc000104700 m=nil [finalizer wait]:
runtime.gopark(0xc000050648?, 0x4121e5?, 0xb0?, 0x1?, 0xc0000061c0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000050620 sp=0xc000050600 pc=0x470c0e
runtime.runfinq()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mfinal.go:193 +0x107 fp=0xc0000507e0 sp=0xc000050620 pc=0x41ad27
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0000507e8 sp=0xc0000507e0 pc=0x4788e1
created by runtime.createfing in goroutine 1
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mfinal.go:163 +0x3d

goroutine 19 gp=0xc000104c40 m=nil [chan receive]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc00004c718 sp=0xc00004c6f8 pc=0x470c0e
runtime.chanrecv(0xc0001125b0, 0x0, 0x1)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/chan.go:639 +0x41c fp=0xc00004c790 sp=0xc00004c718 pc=0x40b8fc
runtime.chanrecv1(0x0?, 0x0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/chan.go:489 +0x12 fp=0xc00004c7b8 sp=0xc00004c790 pc=0x40b4b2
runtime.unique_runtime_registerUniqueMapCleanup.func1(...)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1781
runtime.unique_runtime_registerUniqueMapCleanup.gowrap1()
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1784 +0x2f fp=0xc00004c7e0 sp=0xc00004c7b8 pc=0x41eccf
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x4788e1
created by unique.runtime_registerUniqueMapCleanup in goroutine 1
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1779 +0x96

goroutine 21 gp=0xc000104fc0 m=nil [IO wait]:
runtime.gopark(0x30?, 0x8?, 0x8?, 0x81?, 0xb?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000062af0 sp=0xc000062ad0 pc=0x470c0e
runtime.netpollblock(0x4916f8?, 0x408dc6?, 0x0?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/netpoll.go:575 +0xf7 fp=0xc000062b28 sp=0xc000062af0 pc=0x435537
internal/poll.runtime_pollWait(0x7c10071e64f0, 0x72)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/netpoll.go:351 +0x85 fp=0xc000062b48 sp=0xc000062b28 pc=0x46ff05
internal/poll.(*pollDesc).wait(0xc000165780?, 0xc000291000?, 0x0)
	/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000062b70 sp=0xc000062b48 pc=0x4ef207
internal/poll.(*pollDesc).waitRead(...)
	/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000165780, {0xc000291000, 0x1000, 0x1000})
	/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_unix.go:165 +0x27a fp=0xc000062c08 sp=0xc000062b70 pc=0x4f04fa
net.(*netFD).Read(0xc000165780, {0xc000291000?, 0x0?, 0xbe71a0?})
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/fd_posix.go:55 +0x25 fp=0xc000062c50 sp=0xc000062c08 pc=0x5d09e5
net.(*conn).Read(0xc0001242e0, {0xc000291000?, 0x0?, 0x0?})
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/net.go:189 +0x45 fp=0xc000062c98 sp=0xc000062c50 pc=0x5dee85
net.(*UnixConn).Read(0x0?, {0xc000291000?, 0xfc7ee0?, 0xfc83e0?})
	<autogenerated>:1 +0x25 fp=0xc000062cc8 sp=0xc000062c98 pc=0x5f45c5
net/http.(*persistConn).Read(0xc00014aea0, {0xc000291000?, 0x711c65?, 0xa5a9c0?})
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2052 +0x4a fp=0xc000062d28 sp=0xc000062cc8 pc=0x718f4a
bufio.(*Reader).fill(0xc000132480)
	/opt/hostedtoolcache/go/1.23.8/x64/src/bufio/bufio.go:110 +0x103 fp=0xc000062d60 sp=0xc000062d28 pc=0x689da3
bufio.(*Reader).Peek(0xc000132480, 0x1)
	/opt/hostedtoolcache/go/1.23.8/x64/src/bufio/bufio.go:148 +0x53 fp=0xc000062d80 sp=0xc000062d60 pc=0x689ed3
net/http.(*persistConn).readLoop(0xc00014aea0)
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2205 +0x185 fp=0xc000062fc8 sp=0xc000062d80 pc=0x719a85
net/http.(*Transport).dialConn.gowrap2()
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1874 +0x25 fp=0xc000062fe0 sp=0xc000062fc8 pc=0x718485
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000062fe8 sp=0xc000062fe0 pc=0x4788e1
created by net/http.(*Transport).dialConn in goroutine 20
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1874 +0x154f

goroutine 22 gp=0xc000105180 m=nil [select]:
runtime.gopark(0xc000304f48?, 0x2?, 0x80?, 0x51?, 0xc000304ef4?)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000304d90 sp=0xc000304d70 pc=0x470c0e
runtime.selectgo(0xc000304f48, 0xc000304ef0, 0xc000131400?, 0x0, 0x0?, 0x1)
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/select.go:335 +0x7a5 fp=0xc000304eb8 sp=0xc000304d90 pc=0x44e905
net/http.(*persistConn).writeLoop(0xc00014aea0)
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2519 +0xe7 fp=0xc000304fc8 sp=0xc000304eb8 pc=0x71b487
net/http.(*Transport).dialConn.gowrap3()
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1875 +0x25 fp=0xc000304fe0 sp=0xc000304fc8 pc=0x718425
runtime.goexit({})
	/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000304fe8 sp=0xc000304fe0 pc=0x4788e1
created by net/http.(*Transport).dialConn in goroutine 20
	/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1875 +0x15a5

rax    0xffffffffffffffff
rbx    0x0
rcx    0x1000
rdx    0x0
rdi    0x800000
rsi    0x0
rbp    0x7c10081fdbc0
rsp    0x7c10081fdb20
r8     0x963cf85
r9     0x7
r10    0x7c10081fdb20
r11    0x246
r12    0x8
r13    0x40
r14    0x1
r15    0x7c0ffc000ce0
rip    0x7c1005975bb9
rflags 0x10297
cs     0x33
fs     0x0
gs     0x0

Expected behaviour

No response

dstack version

0.19.22

Server logs

Additional information

No response

Metadata

Metadata

Assignees

Labels

bug (Something isn't working), major

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions