-
Notifications
You must be signed in to change notification settings - Fork 222
[Bug]: Provisioning does not work in some environments #2963
Copy link
Copy link
Closed
Labels
Description
Steps to reproduce
Try to provision a new instance.
The bug is only reproducible in some environments. It is known to be reproducible on T4 and A10G in AWS, P100 in OCI, and A100 in Azure.
Actual behaviour
The run/instance status stays stuck in provisioning for 10 minutes and then fails when the provisioning timeout expires.
dstack-shim does not run on the instance. Attempts to run it manually result in this error:
# ./dstack-shim-linux-amd64
time=2025-08-08T10:47:01.288114Z level=warning msg=not using DCGM Exporter err=exec: "dcgm-exporter": executable file not found in $PATH
SIGFPE: floating-point exception
PC=0x7c1005975bb9 m=3 sigcode=1
signal arrived during cgo execution
instruction bytes: 0x48 0xf7 0xf6 0x48 0x89 0xca 0x48 0xf 0xaf 0xc6 0x48 0x8d 0x84 0x1 0x0 0x8
goroutine 1 gp=0xc0000061c0 m=3 mp=0xc000056e08 [syscall]:
runtime.cgocall(0x90bdd0, 0xc0001d9508)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/cgocall.go:167 +0x4b fp=0xc0001d94e0 sp=0xc0001d94a8 pc=0x46a90b
github.com/NVIDIA/go-dcgm/pkg/dcgm._Cfunc_dlopen(0x7c0ffc000c90, 0x101)
_cgo_gotypes.go:1666 +0x4c fp=0xc0001d9508 sp=0xc0001d94e0 pc=0x8b24cc
github.com/NVIDIA/go-dcgm/pkg/dcgm.initDCGM(0x0, {0x0, 0x0, 0x0})
/home/runner/go/pkg/mod/github.com/!n!v!i!d!i!a/go-dcgm@v0.0.0-20250707210631-823394f2bd9b/pkg/dcgm/admin.go:66 +0x85 fp=0xc0001d9580 sp=0xc0001d9508 pc=0x8b2765
github.com/NVIDIA/go-dcgm/pkg/dcgm.Init(0x0, {0x0, 0x0, 0x0})
/home/runner/go/pkg/mod/github.com/!n!v!i!d!i!a/go-dcgm@v0.0.0-20250707210631-823394f2bd9b/pkg/dcgm/api.go:34 +0x173 fp=0xc0001d9600 sp=0xc0001d9580 pc=0x8b1a73
github.com/dstackai/dstack/runner/internal/shim/dcgm.NewDCGMWrapper({0x0?, 0xc0002886c0?})
/home/runner/work/dstack/dstack/runner/internal/shim/dcgm/wrapper.go:55 +0x2a fp=0xc0001d9670 sp=0xc0001d9600 pc=0x8b6a4a
main.start({0xbec670, 0xfed8a0}, {{0x2af6, {0x0, 0x0}, 0x4}, {0x2af7, 0x2726, {0x0, 0x0}, ...}, ...}, ...)
/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:227 +0xb4c fp=0xc0001d9aa0 sp=0xc0001d9670 pc=0x90ac6c
main.main.func1(0xc000130e80?)
/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:151 +0x58 fp=0xc0001d9b68 sp=0xc0001d9aa0 pc=0x90a0f8
github.com/urfave/cli/v2.(*Command).Run(0xc000152420, 0xc000130e80, {0xc00011e040, 0x1, 0x1})
/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/command.go:279 +0x7e2 fp=0xc0001d9de0 sp=0xc0001d9b68 pc=0x8f84a2
github.com/urfave/cli/v2.(*App).RunContext(0xc00013ae00, {0xbec670, 0xfed8a0}, {0xc00011e040, 0x1, 0x1})
/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/app.go:337 +0x58b fp=0xc0001d9e40 sp=0xc0001d9de0 pc=0x8f4e8b
github.com/urfave/cli/v2.(*App).Run(...)
/home/runner/go/pkg/mod/github.com/urfave/cli/v2@v2.27.1/app.go:311
main.main()
/home/runner/work/dstack/dstack/runner/cmd/shim/main.go:155 +0xf6c fp=0xc0001d9f50 sp=0xc0001d9e40 pc=0x90a04c
runtime.main()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:272 +0x28b fp=0xc0001d9fe0 sp=0xc0001d9f50 pc=0x43cb0b
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0001d9fe8 sp=0xc0001d9fe0 pc=0x4788e1
goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000050fa8 sp=0xc000050f88 pc=0x470c0e
runtime.goparkunlock(...)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.forcegchelper()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:337 +0xb3 fp=0xc000050fe0 sp=0xc000050fa8 pc=0x43ce53
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000050fe8 sp=0xc000050fe0 pc=0x4788e1
created by runtime.init.7 in goroutine 1
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:325 +0x1a
goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000051780 sp=0xc000051760 pc=0x470c0e
runtime.goparkunlock(...)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.bgsweep(0xc00007c000)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcsweep.go:277 +0x94 fp=0xc0000517c8 sp=0xc000051780 pc=0x427594
runtime.gcenable.gowrap1()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:204 +0x25 fp=0xc0000517e0 sp=0xc0000517c8 pc=0x41bca5
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0000517e8 sp=0xc0000517e0 pc=0x4788e1
created by runtime.gcenable in goroutine 1
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:204 +0x66
goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0xc00007c000?, 0xbe1e50?, 0x1?, 0x0?, 0xc000007340?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000051f78 sp=0xc000051f58 pc=0x470c0e
runtime.goparkunlock(...)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:430
runtime.(*scavengerState).park(0xfc6bc0)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcscavenge.go:425 +0x49 fp=0xc000051fa8 sp=0xc000051f78 pc=0x424fc9
runtime.bgscavenge(0xc00007c000)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgcscavenge.go:653 +0x3c fp=0xc000051fc8 sp=0xc000051fa8 pc=0x42553c
runtime.gcenable.gowrap2()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:205 +0x25 fp=0xc000051fe0 sp=0xc000051fc8 pc=0x41bc45
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000051fe8 sp=0xc000051fe0 pc=0x4788e1
created by runtime.gcenable in goroutine 1
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:205 +0xa5
goroutine 18 gp=0xc000104700 m=nil [finalizer wait]:
runtime.gopark(0xc000050648?, 0x4121e5?, 0xb0?, 0x1?, 0xc0000061c0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000050620 sp=0xc000050600 pc=0x470c0e
runtime.runfinq()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mfinal.go:193 +0x107 fp=0xc0000507e0 sp=0xc000050620 pc=0x41ad27
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc0000507e8 sp=0xc0000507e0 pc=0x4788e1
created by runtime.createfing in goroutine 1
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mfinal.go:163 +0x3d
goroutine 19 gp=0xc000104c40 m=nil [chan receive]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc00004c718 sp=0xc00004c6f8 pc=0x470c0e
runtime.chanrecv(0xc0001125b0, 0x0, 0x1)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/chan.go:639 +0x41c fp=0xc00004c790 sp=0xc00004c718 pc=0x40b8fc
runtime.chanrecv1(0x0?, 0x0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/chan.go:489 +0x12 fp=0xc00004c7b8 sp=0xc00004c790 pc=0x40b4b2
runtime.unique_runtime_registerUniqueMapCleanup.func1(...)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1781
runtime.unique_runtime_registerUniqueMapCleanup.gowrap1()
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1784 +0x2f fp=0xc00004c7e0 sp=0xc00004c7b8 pc=0x41eccf
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x4788e1
created by unique.runtime_registerUniqueMapCleanup in goroutine 1
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/mgc.go:1779 +0x96
goroutine 21 gp=0xc000104fc0 m=nil [IO wait]:
runtime.gopark(0x30?, 0x8?, 0x8?, 0x81?, 0xb?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000062af0 sp=0xc000062ad0 pc=0x470c0e
runtime.netpollblock(0x4916f8?, 0x408dc6?, 0x0?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/netpoll.go:575 +0xf7 fp=0xc000062b28 sp=0xc000062af0 pc=0x435537
internal/poll.runtime_pollWait(0x7c10071e64f0, 0x72)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/netpoll.go:351 +0x85 fp=0xc000062b48 sp=0xc000062b28 pc=0x46ff05
internal/poll.(*pollDesc).wait(0xc000165780?, 0xc000291000?, 0x0)
/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000062b70 sp=0xc000062b48 pc=0x4ef207
internal/poll.(*pollDesc).waitRead(...)
/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000165780, {0xc000291000, 0x1000, 0x1000})
/opt/hostedtoolcache/go/1.23.8/x64/src/internal/poll/fd_unix.go:165 +0x27a fp=0xc000062c08 sp=0xc000062b70 pc=0x4f04fa
net.(*netFD).Read(0xc000165780, {0xc000291000?, 0x0?, 0xbe71a0?})
/opt/hostedtoolcache/go/1.23.8/x64/src/net/fd_posix.go:55 +0x25 fp=0xc000062c50 sp=0xc000062c08 pc=0x5d09e5
net.(*conn).Read(0xc0001242e0, {0xc000291000?, 0x0?, 0x0?})
/opt/hostedtoolcache/go/1.23.8/x64/src/net/net.go:189 +0x45 fp=0xc000062c98 sp=0xc000062c50 pc=0x5dee85
net.(*UnixConn).Read(0x0?, {0xc000291000?, 0xfc7ee0?, 0xfc83e0?})
<autogenerated>:1 +0x25 fp=0xc000062cc8 sp=0xc000062c98 pc=0x5f45c5
net/http.(*persistConn).Read(0xc00014aea0, {0xc000291000?, 0x711c65?, 0xa5a9c0?})
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2052 +0x4a fp=0xc000062d28 sp=0xc000062cc8 pc=0x718f4a
bufio.(*Reader).fill(0xc000132480)
/opt/hostedtoolcache/go/1.23.8/x64/src/bufio/bufio.go:110 +0x103 fp=0xc000062d60 sp=0xc000062d28 pc=0x689da3
bufio.(*Reader).Peek(0xc000132480, 0x1)
/opt/hostedtoolcache/go/1.23.8/x64/src/bufio/bufio.go:148 +0x53 fp=0xc000062d80 sp=0xc000062d60 pc=0x689ed3
net/http.(*persistConn).readLoop(0xc00014aea0)
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2205 +0x185 fp=0xc000062fc8 sp=0xc000062d80 pc=0x719a85
net/http.(*Transport).dialConn.gowrap2()
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1874 +0x25 fp=0xc000062fe0 sp=0xc000062fc8 pc=0x718485
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000062fe8 sp=0xc000062fe0 pc=0x4788e1
created by net/http.(*Transport).dialConn in goroutine 20
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1874 +0x154f
goroutine 22 gp=0xc000105180 m=nil [select]:
runtime.gopark(0xc000304f48?, 0x2?, 0x80?, 0x51?, 0xc000304ef4?)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/proc.go:424 +0xce fp=0xc000304d90 sp=0xc000304d70 pc=0x470c0e
runtime.selectgo(0xc000304f48, 0xc000304ef0, 0xc000131400?, 0x0, 0x0?, 0x1)
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/select.go:335 +0x7a5 fp=0xc000304eb8 sp=0xc000304d90 pc=0x44e905
net/http.(*persistConn).writeLoop(0xc00014aea0)
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:2519 +0xe7 fp=0xc000304fc8 sp=0xc000304eb8 pc=0x71b487
net/http.(*Transport).dialConn.gowrap3()
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1875 +0x25 fp=0xc000304fe0 sp=0xc000304fc8 pc=0x718425
runtime.goexit({})
/opt/hostedtoolcache/go/1.23.8/x64/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc000304fe8 sp=0xc000304fe0 pc=0x4788e1
created by net/http.(*Transport).dialConn in goroutine 20
/opt/hostedtoolcache/go/1.23.8/x64/src/net/http/transport.go:1875 +0x15a5
rax 0xffffffffffffffff
rbx 0x0
rcx 0x1000
rdx 0x0
rdi 0x800000
rsi 0x0
rbp 0x7c10081fdbc0
rsp 0x7c10081fdb20
r8 0x963cf85
r9 0x7
r10 0x7c10081fdb20
r11 0x246
r12 0x8
r13 0x40
r14 0x1
r15 0x7c0ffc000ce0
rip 0x7c1005975bb9
rflags 0x10297
cs 0x33
fs 0x0
gs 0x0
Expected behaviour
No response
dstack version
0.19.22
Server logs
Additional information
No response
Reactions are currently unavailable