diff --git a/agent/go.mod b/agent/go.mod index 0bdac54c7358..6bfae84ae35d 100644 --- a/agent/go.mod +++ b/agent/go.mod @@ -169,6 +169,14 @@ require ( sigs.k8s.io/yaml v1.2.0 // indirect ) +require github.hpe.com/hpe/hpc-ard-launcher-go/launcher v0.1.2 // indirect + replace github.com/determined-ai/determined/master => ../master replace github.com/determined-ai/determined/proto => ../proto + +// Determined AI's CircleCI doesn't have access to "github.hpe.com/hpe/hpc-ard-launcher-go", +// so the build will fail in CircleCI. Therefore, we had to do a "git clone" of the +// launcher repo to store a local copy. We make use of the "replace" directive to use the +// local copy and not try to pull it from GitHub. +replace github.hpe.com/hpe/hpc-ard-launcher-go/launcher => ../hpc-ard-launcher-go/launcher diff --git a/agent/go.sum b/agent/go.sum index 7c4304eb79f8..e480cb43640f 100644 --- a/agent/go.sum +++ b/agent/go.sum @@ -841,7 +841,6 @@ github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+ github.com/onsi/ginkgo v1.10.2/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.10.3/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.14.2 h1:8mVmC9kjFFmA8H4pKMUhcblgifdkOIXPvbhN1T36q1M= github.com/onsi/ginkgo v1.14.2/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= diff --git a/hpc-ard-launcher-go/README.md b/hpc-ard-launcher-go/README.md new file mode 100644 index 000000000000..b0029b4a0b20 --- /dev/null +++ b/hpc-ard-launcher-go/README.md @@ -0,0 +1,29 @@ +# hpc-ard-launcher-go + +This repo is the home of the Capsules (hpc-ard-capsules-core) dispatch server Go client. + +The code found here is generated automatically using openapi tools from the Capsules REST API specification. 
It can be build wit the following command line executed in the hpc-ard-capsules-core project: + +``` +mvn -pl com.cray.analytics.capsules:capsules-dispatch-client clean generate-sources -P go-client +``` +To install the package to your Go environment: + +If you use ssh to interact with github.hpe.com, add the following to your ~/.gitconfig: +``` +[url "ssh://git@github.hpe.com/"] + insteadOf = https://github.hpe.com/ +``` +Then: +``` +% export GOPRIVATE=github.hpe.com/hpe/hpc-ard-launcher-go +% go get github.hpe.com/hpe/hpc-ard-launcher-go/launcher +``` +Import the launcher package to your Go program thus: +``` +import ( + + + "github.hpe.com/hpe/hpc-ard-launcher-go/launcher" +) +``` \ No newline at end of file diff --git a/hpc-ard-launcher-go/go.mod b/hpc-ard-launcher-go/go.mod new file mode 100644 index 000000000000..ce8980671540 --- /dev/null +++ b/hpc-ard-launcher-go/go.mod @@ -0,0 +1,5 @@ +module github.hpe.com/hpe/hpc-ard-launcher-go + +go 1.13 + +require golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99 diff --git a/hpc-ard-launcher-go/go.sum b/hpc-ard-launcher-go/go.sum new file mode 100644 index 000000000000..ae0eabaa9d07 --- /dev/null +++ b/hpc-ard-launcher-go/go.sum @@ -0,0 +1,362 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= +cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= +cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= +cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= +cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= +cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/pubsub v1.3.1/go.mod 
h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= +cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf 
v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod 
h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= 
+golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200822124328-c89045814202 h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99 h1:5vD4XjIc0X5+kHZjx4UecYdjA6mJo+XXNoaW0EjU5Os= +golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 
+golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= +golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools 
v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= +google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod 
h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= +google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod 
h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/master/determined.code-workspace b/master/determined.code-workspace index 0800c377048f..c1f26ffdac69 100644 --- a/master/determined.code-workspace +++ b/master/determined.code-workspace @@ -9,5 +9,17 @@ { "path": "../proto" } - ] + ], + "launch": { + "version": "0.2.0", + "configurations": [ + { + "name": "Attach to Process", + "type": "go", + "request": "attach", + "mode": "local", + "processId": 0 + } + ] + } } \ No newline at end of file diff --git a/master/go.mod b/master/go.mod index e8e0092572c6..19ff95f12142 100644 --- a/master/go.mod +++ b/master/go.mod @@ -70,6 +70,7 @@ require ( ) require ( + github.hpe.com/hpe/hpc-ard-launcher-go/launcher v0.1.2 
go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.29.0 go.opentelemetry.io/otel v1.6.1 go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.6.1 @@ -185,3 +186,9 @@ require ( ) replace github.com/determined-ai/determined/proto => ../proto + +// Determined AI's CircleCI doesn't have access to "github.hpe.com/hpe/hpc-ard-launcher-go", +// so the build will fail in CircleCI. Therefore, we had to do a "git clone" of the +// launcher repo to store a local copy. We make use of the "replace" directive to use the +// local copy and not try to pull it from GitHub. +replace github.hpe.com/hpe/hpc-ard-launcher-go/launcher => ../hpc-ard-launcher-go/launcher diff --git a/master/internal/config/config.go b/master/internal/config/config.go index b5c2e04d8dc1..b5878e1f16df 100644 --- a/master/internal/config/config.go +++ b/master/internal/config/config.go @@ -353,6 +353,9 @@ func readRMPreemptionStatus(config *Config, rpName string) bool { return config.ResourceManager.AgentRM.Scheduler.GetPreemption() case config.ResourceManager.KubernetesRM != nil: return config.ResourceManager.KubernetesRM.GetPreemption() + case config.ResourceManager.DispatcherRM != nil: + // TODO: Determine if this needs to be enabled for DispatcherRM + return false default: panic("unexpected resource configuration") } diff --git a/master/internal/config/dispatcher_resource_manager_config.go b/master/internal/config/dispatcher_resource_manager_config.go new file mode 100644 index 000000000000..d7f74877b768 --- /dev/null +++ b/master/internal/config/dispatcher_resource_manager_config.go @@ -0,0 +1,127 @@ +package config + +import ( + "encoding/json" + + "github.com/determined-ai/determined/master/pkg/device" + "github.com/determined-ai/determined/master/pkg/model" +) + +// DispatcherResourceManagerConfig is the object that stores the values of +// the "resource_manager" section of "tools/devcluster.yaml". +type DispatcherResourceManagerConfig struct { + MasterHost string `json:"master_host"` + MasterPort int `json:"master_port"` + LauncherHost string `json:"host"` + LauncherPort int `json:"port"` + LauncherProtocol string `json:"protocol"` + SlotType *device.Type `json:"slot_type"` + LauncherAuthFile string `json:"auth_file"` + RendezvousNetworkInterface string `json:"rendezvous_network_interface"` + ProxyNetworkInterface string `json:"proxy_network_interface"` + // Configuration parameters that are proxies for launcher.conf + // and will be applied there by the init script. + UserName string `json:"user_name"` + GroupName string `json:"group_name"` + SingularityImageRoot string `json:"singularity_image_root"` + JobStorageRoot string `json:"job_storage_root"` + Path string `json:"path"` + LdLibraryPath string `json:"ld_library_path"` + TresSupported bool `json:"tres_supported"` + + Security *DispatcherSecurityConfig `json:"security"` + PartitionOverrides map[string]DispatcherPartitionOverrideConfigs `json:"partition_overrides"` +} + +// DispatcherSecurityConfig configures security-related options for the elastic logging backend. +type DispatcherSecurityConfig struct { + TLS model.TLSClientConfig `json:"tls"` +} + +// Validate performs validation. +func (c DispatcherResourceManagerConfig) Validate() error { + return nil +} + +var defaultDispatcherResourceManagerConfig = DispatcherResourceManagerConfig{ + TresSupported: true, +} + +// UnmarshalJSON implements the json.Unmarshaler interface. 
+func (c *DispatcherResourceManagerConfig) UnmarshalJSON(data []byte) error { + *c = defaultDispatcherResourceManagerConfig + type DefaultParser *DispatcherResourceManagerConfig + return json.Unmarshal(data, DefaultParser(c)) +} + +// ResolveSlotType resolves the slot type by first looking for a partition-specific setting, +// then falling back to the master config, and finally falling back to what we can infer. +func (c DispatcherResourceManagerConfig) ResolveSlotType(partition string) *device.Type { + for name, overrides := range c.PartitionOverrides { + if name != partition { + continue + } + if overrides.SlotType == nil { + break + } + return overrides.SlotType + } + return c.SlotType +} + +// ResolveRendezvousNetworkInterface resolves the rendezvous network interface by first looking for +// a partition-specific setting and then falling back to the master config. +func (c DispatcherResourceManagerConfig) ResolveRendezvousNetworkInterface( + partition string) string { + for name, overrides := range c.PartitionOverrides { + if name != partition { + continue + } + if overrides.RendezvousNetworkInterface == nil { + break + } + return *overrides.RendezvousNetworkInterface + } + return c.RendezvousNetworkInterface +} + +// ResolveProxyNetworkInterface resolves the proxy network interface by first looking for a +// partition-specific setting and then falling back to the master config. +func (c DispatcherResourceManagerConfig) ResolveProxyNetworkInterface(partition string) string { + for name, overrides := range c.PartitionOverrides { + if name != partition { + continue + } + if overrides.ProxyNetworkInterface == nil { + break + } + return *overrides.ProxyNetworkInterface + } + return c.ProxyNetworkInterface +} + +// ResolveTaskContainerDefaults resolves the task container defaults by first looking for +// a partition-specific setting and then falling back to the master config. +func (c DispatcherResourceManagerConfig) ResolveTaskContainerDefaults( + partition string, +) *model.TaskContainerDefaultsConfig { + for name, overrides := range c.PartitionOverrides { + if name != partition { + continue + } + if overrides.TaskContainerDefaultsConfig == nil { + break + } + return overrides.TaskContainerDefaultsConfig + } + return nil +} + +// DispatcherPartitionOverrideConfigs describes per-partition overrides. +type DispatcherPartitionOverrideConfigs struct { + //nolint:lll // I honestly don't know how to break this line within Go's grammar. 
+ RendezvousNetworkInterface *string `json:"rendezvous_network_interface"` + ProxyNetworkInterface *string `json:"proxy_network_interface"` + SlotType *device.Type `json:"slot_type"` + TaskContainerDefaultsConfig *model.TaskContainerDefaultsConfig `json:"task_container_defaults"` +} diff --git a/master/internal/config/resource_config.go b/master/internal/config/resource_config.go index 31af2407732a..a6eb9c5692bb 100644 --- a/master/internal/config/resource_config.go +++ b/master/internal/config/resource_config.go @@ -22,7 +22,9 @@ func (r *ResourceConfig) ResolveResource() error { AgentRM: &AgentResourceManagerConfig{}, } } - if r.ResourceManager.AgentRM == nil && r.ResourceManager.KubernetesRM == nil { + if r.ResourceManager.AgentRM == nil && + r.ResourceManager.KubernetesRM == nil && + r.ResourceManager.DispatcherRM == nil { r.ResourceManager.AgentRM = &AgentResourceManagerConfig{} } if r.ResourceManager.AgentRM != nil && r.ResourcePools == nil { diff --git a/master/internal/config/resource_manager_config.go b/master/internal/config/resource_manager_config.go index 23e383d4f655..2915d6613c0a 100644 --- a/master/internal/config/resource_manager_config.go +++ b/master/internal/config/resource_manager_config.go @@ -17,6 +17,7 @@ const defaultResourcePoolName = "default" type ResourceManagerConfig struct { AgentRM *AgentResourceManagerConfig `union:"type,agent" json:"-"` KubernetesRM *KubernetesResourceManagerConfig `union:"type,kubernetes" json:"-"` + DispatcherRM *DispatcherResourceManagerConfig `union:"type,slurm" json:"-"` } // MarshalJSON implements the json.Marshaler interface. @@ -36,7 +37,7 @@ func (r *ResourceManagerConfig) UnmarshalJSON(data []byte) error { } // Fill in the default config. - if r.AgentRM == nil && r.KubernetesRM == nil { + if r.AgentRM == nil && r.KubernetesRM == nil && r.DispatcherRM == nil { r.AgentRM = &AgentResourceManagerConfig{ Scheduler: &SchedulerConfig{ FittingPolicy: defaultFitPolicy, diff --git a/master/internal/core.go b/master/internal/core.go index 1457ebd66783..84ee823e7d14 100644 --- a/master/internal/core.go +++ b/master/internal/core.go @@ -134,7 +134,6 @@ func (m *Master) getTaskContainerDefaults(poolName string) model.TaskContainerDe // Always fall back to the top-level TaskContainerDefaults taskContainerDefaults := m.config.TaskContainerDefaults - // Only look for pool settings with Agent resource managers. if m.config.ResourceManager.AgentRM != nil { // Iterate through configured pools looking for a TaskContainerDefaults setting. for _, pool := range m.config.ResourcePools { @@ -146,6 +145,13 @@ func (m *Master) getTaskContainerDefaults(poolName string) model.TaskContainerDe } } } + + if drm := m.config.ResourceManager.DispatcherRM; drm != nil { + if tcd := drm.ResolveTaskContainerDefaults(poolName); tcd != nil { + taskContainerDefaults = *tcd + } + } + return taskContainerDefaults } diff --git a/master/internal/db/postgres_resource_managers_dispatcher.go b/master/internal/db/postgres_resource_managers_dispatcher.go new file mode 100644 index 000000000000..90da19ac162f --- /dev/null +++ b/master/internal/db/postgres_resource_managers_dispatcher.go @@ -0,0 +1,119 @@ +package db + +import ( + "context" + "fmt" + + "github.com/uptrace/bun" + + "github.com/determined-ai/determined/master/internal/sproto" + "github.com/determined-ai/determined/master/pkg/model" +) + +// Dispatch is the Determined-persisted representation for dispatch existence. 
+type Dispatch struct { + bun.BaseModel `bun:"table:resourcemanagers_dispatcher_dispatches"` + + DispatchID string `bun:"dispatch_id"` + ResourceID sproto.ResourcesID `bun:"resource_id"` + AllocationID model.AllocationID `bun:"allocation_id"` + ImpersonatedUser string `bun:"impersonated_user"` +} + +// InsertDispatch persists the existence for a dispatch. +func InsertDispatch(ctx context.Context, r *Dispatch) error { + _, err := Bun().NewInsert().Model(r).Exec(ctx) + if err != nil { + return fmt.Errorf("inserting dispatch: %w", err) + } + return nil +} + +// DispatchByID retrieves a dispatch by its ID. +func DispatchByID( + ctx context.Context, + id string, +) (*Dispatch, error) { + d := Dispatch{} + err := Bun().NewSelect().Model(&d).Where("dispatch_id = ?", id).Scan(ctx) + if err != nil { + return nil, fmt.Errorf("scanning dispatch by ID (%s): %w", id, err) + } + return &d, nil +} + +// ListDispatchesByJobID returns a list of dispatches associated with the specified job. +func ListDispatchesByJobID( + ctx context.Context, + jobID string, +) ([]*Dispatch, error) { + ds := []*Dispatch{} + err := Bun().NewSelect().Model(&ds).Join( + "join allocations on allocations.allocation_id = dispatch.allocation_id").Join( + "join tasks on tasks.task_id = allocations.task_id").Where("job_id = ?", jobID).Scan(ctx) + if err != nil { + return nil, fmt.Errorf("scanning dispatch by job ID (%s): %w", jobID, err) + } + return ds, nil +} + +// ListAllDispatches lists all dispatches in the DB. +func ListAllDispatches(ctx context.Context) ([]*Dispatch, error) { + return ListDispatches(ctx, func(q *bun.SelectQuery) (*bun.SelectQuery, error) { + return q, nil + }) +} + +// ListDispatchesByAllocationID lists all dispatches for an allocation ID. +func ListDispatchesByAllocationID( + ctx context.Context, + id model.AllocationID, +) ([]*Dispatch, error) { + return ListDispatches(ctx, func(q *bun.SelectQuery) (*bun.SelectQuery, error) { + return q.Where("allocation_id = ?", id), nil + }) +} + +// ListDispatches lists all dispatches according to the options provided. +func ListDispatches( + ctx context.Context, + opts func(*bun.SelectQuery) (*bun.SelectQuery, error), +) ([]*Dispatch, error) { + var ds []*Dispatch + + q, err := opts(Bun().NewSelect().Model(&ds)) + if err != nil { + return nil, fmt.Errorf("building dispatch model query: %w", err) + } + + if err = q.Scan(ctx); err != nil { + return nil, fmt.Errorf("scanning dispatch models: %w", err) + } + + return ds, nil +} + +// DeleteDispatch deletes the specified dispatch and returns the number deleted. +func DeleteDispatch( + ctx context.Context, + id string, +) (int64, error) { + return DeleteDispatches(ctx, func(q *bun.DeleteQuery) *bun.DeleteQuery { + return q.Where("dispatch_id = ?", id) + }) +} + +// DeleteDispatches deletes all dispatches for the specified query +// and returns the number deleted. 
+func DeleteDispatches( + ctx context.Context, + opts func(*bun.DeleteQuery) *bun.DeleteQuery, +) (int64, error) { + var ds []*Dispatch + res, err := opts(Bun().NewDelete().Model(&ds)).Exec(ctx) + if err != nil { + return 0, fmt.Errorf("delete dispatch exec: %w", err) + } + count, _ := res.RowsAffected() + return count, err +} diff --git a/master/internal/db/postgres_resource_managers_dispatcher__intg_test.go b/master/internal/db/postgres_resource_managers_dispatcher__intg_test.go new file mode 100644 index 000000000000..86280897ab30 --- /dev/null +++ b/master/internal/db/postgres_resource_managers_dispatcher__intg_test.go @@ -0,0 +1,65 @@ +//go:build integration +// +build integration + +package db + +import ( + "context" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + + "github.com/determined-ai/determined/master/internal/sproto" + "github.com/determined-ai/determined/master/pkg/etc" +) + +func TestDispatchPersistence(t *testing.T) { + etc.SetRootPath(rootFromDB) + + db := MustResolveTestPostgres(t) + MustMigrateTestPostgres(t, db, migrationsFromDB) + + u := requireMockUser(t, db) + tk := RequireMockTask(t, db, &u.ID) + a := requireMockAllocation(t, db, tk.TaskID) + + // Hack, to avoid circular imports. + rID := sproto.ResourcesID(uuid.NewString()) + _, err := db.sql.Exec(` +INSERT INTO allocation_resources (allocation_id, resource_id) +VALUES ($1, $2) + `, a.AllocationID, rID) + require.NoError(t, err) + + d := Dispatch{ + DispatchID: uuid.NewString(), + ResourceID: rID, + AllocationID: a.AllocationID, + ImpersonatedUser: uuid.NewString(), + } + err = InsertDispatch(context.TODO(), &d) + require.NoError(t, err) + + ds, err := ListDispatchesByAllocationID(context.TODO(), d.AllocationID) + require.Len(t, ds, 1) + require.Equal(t, &d, ds[0]) + + ds, err = ListAllDispatches(context.TODO()) + require.Len(t, ds, 1) + require.Equal(t, &d, ds[0]) + + byID, err := DispatchByID(context.TODO(), d.DispatchID) + require.NoError(t, err) + require.Equal(t, &d, byID) + + count, err := DeleteDispatch(context.TODO(), d.DispatchID) + require.NoError(t, err) + require.Equal(t, int64(1), count) + + ds, err = ListDispatchesByAllocationID(context.TODO(), d.AllocationID) + require.Len(t, ds, 0) + + ds, err = ListAllDispatches(context.TODO()) + require.Len(t, ds, 0) +} diff --git a/master/internal/rm/dispatcher_monitor.go b/master/internal/rm/dispatcher_monitor.go new file mode 100644 index 000000000000..33b193b04303 --- /dev/null +++ b/master/internal/rm/dispatcher_monitor.go @@ -0,0 +1,398 @@ +package resourcemanagers + +// Follow launcher jobs to completion and report status back to Determined. + +import ( + "context" + "fmt" + "io/fs" + "regexp" + "strconv" + "strings" + "time" + + "github.com/sirupsen/logrus" + + "github.com/determined-ai/determined/master/pkg/actor" + + launcher "github.hpe.com/hpe/hpc-ard-launcher-go/launcher" +) + +const ( + pollLoopIntervalSecs = 10 + minItemPollingIntervalSecs = pollLoopIntervalSecs +) + +// A list of WARNING/ERROR level messages that we're interested in, because they contain +// the root cause of the error. The last matching pattern is used. +var messagePatternsOfInterest = []*regexp.Regexp{ + // Remove the carrier prefix and "()" will contain just the Slurm error message. + // The (?s) is a non-capturing option that allows . to match newlines. 
+ regexp.MustCompile("com.cray.analytics.capsules.carriers.hpc.slurm.SingularityOverSlurm" + + " - Slurm job is in a (?s)(.+)"), + + // Whatever matches what's inside the "()" will contain the root cause of the SLURM error. + regexp.MustCompile("Slurm job process terminated with exit code \\d+:\n*(.+)\n*"), +} + +// launcherJob describes a new launcher job, the progress of which we need to track. +type launcherJob struct { + user string + dispatcherID string + payloadName string + timestamp time.Time +} + +// launcherMonitor describes the monitoring of jobs created by the launcher. +type launcherMonitor struct { + monitoredJobs map[string]launcherJob + jobsToRemove map[string]bool + apiClient *launcher.APIClient + newLauncherJob chan launcherJob + removeLauncherJob chan launcherJob + checkLauncherJob chan launcherJob + schedulerTick *time.Ticker + authToken string +} + +// newDispatchWatcher initiates the process of monitoring the progress of launched jobs. +func newDispatchWatcher(apiClient *launcher.APIClient, authToken string) *launcherMonitor { + return &launcherMonitor{ + monitoredJobs: map[string]launcherJob{}, + jobsToRemove: map[string]bool{}, + apiClient: apiClient, + newLauncherJob: make(chan launcherJob), + removeLauncherJob: make(chan launcherJob), + checkLauncherJob: make(chan launcherJob), + // Poll job status this often + schedulerTick: time.NewTicker(time.Second * pollLoopIntervalSecs), + authToken: authToken, + } +} + +// monitorJob adds the specified job to the collection of jobs whose status is monitored. +func (m *launcherMonitor) monitorJob(user string, dispatchID string, payloadName string) { + m.newLauncherJob <- launcherJob{ + user: user, + dispatcherID: dispatchID, + payloadName: payloadName, + timestamp: time.Now(), + } +} + +// removeJob removes the specified job from the collection of jobs whose status is monitored. +func (m *launcherMonitor) removeJob(dispatchID string) { + m.removeLauncherJob <- launcherJob{ + dispatcherID: dispatchID, + } +} + +// Return a starting context for the API client call that includes the authToken +// (may be empty if disabled). +func (m *launcherMonitor) authContext(ctx *actor.Context) context.Context { + return context.WithValue(context.Background(), launcher.ContextAccessToken, m.authToken) +} + +// watch runs asynchronously as a go routine. It receives instructions as +// to what jobs to monitor, and when to monitor them, via channels. +func (m *launcherMonitor) watch(ctx *actor.Context) { + for { + select { + case msg := <-m.newLauncherJob: + ctx.Log().Infof("Starting monitoring of %s", msg.dispatcherID) + // Add job to collection of those being monitored. + m.monitoredJobs[msg.dispatcherID] = msg + + case msg := <-m.removeLauncherJob: + // Save the job to be removed in map. This job will deleted later when processing watched jobs. + _ = m.updateJobStatus(ctx, m.monitoredJobs[msg.dispatcherID]) + m.jobsToRemove[msg.dispatcherID] = true + + case msg := <-m.checkLauncherJob: + // Check the status of the given job. + _ = m.updateJobStatus(ctx, m.monitoredJobs[msg.dispatcherID]) + + case <-m.schedulerTick.C: + m.processWatchedJobs(ctx) + } + } +} + +// This function filters out the noise from the error message, such that only the information that's +// useful to idenfify the root cause is shown in the master output. +//
+// When there's a job error, the launcher may send back too much information. +//
+// For example, +//
+// Capsule corujor/DAI-singularity-over-Slurm:unknown submitted for launch by user corujor +// Attempting to launch Payload DAI-task-container_exp-388-trial-365 ... +// Failed to launch payload DAI-task-container_exp-388-trial-365 ... Slurm job process terminated +// with exit code 1: +// sbatch: error: Batch job submission failed: Requested GRES option unsupported by configured +// SelectType plugin +// Failed to launch payload DAI-task-container_exp-388-trial-365 ... with any of the specified +// carriers +// Transitioned environment from state PENDING to FAILED +// Failed to launch capsule +//
+// Much of this information is of no value to the user and will only serve as noise. +//
+// In this example, the actual root cause of the failure is the line: +//
+// sbatch: error: Batch job submission failed: Requested GRES option unsupported by configured +// SelectType plugin +//
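+// A sketch of the intended reduction (variable names hypothetical):
+//
+//     filtered := filterOutSuperfluousMessages(launcherMessages)
+//     // filtered == []string{"sbatch: error: Batch job submission failed: ..."}
+//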
+// Therefore, we should only be returning the root cause and nothing else. +func filterOutSuperfluousMessages(allMessages []string) []string { + // A list of messages that matched the pattern(s). + messagesMatchingPattern := make([]string, 0) + + // The error messages that are returned from the launcher will be on multiple lines. Iterate + // through all the lines of output. + for _, msg := range allMessages { + // Iterate through all the message patterns to see if the error message matches any of them. + for _, messagePatternOfInterest := range messagePatternsOfInterest { + // Does this error message line match any of the patterns we're looking for? + matches := messagePatternOfInterest.FindAllStringSubmatch(msg, -1) + + // The 1st element (i.e., "matches[0][0]") contains the entire messasge that matched. + // The 2nd element (i.e., "matches[0][1]") contains the substring we want. + if len(matches) > 0 && len(matches[0]) >= 2 { + messagesMatchingPattern = append(messagesMatchingPattern, matches[0][1]) + } + } + } + + return messagesMatchingPattern +} + +// processWatchedJobs is called periodically to poll for the completion status +// of launched jobs. The exit status of any completed job is reported to Determined; such +// jobs are them removed from further consideration. +func (m *launcherMonitor) processWatchedJobs(ctx *actor.Context) { + // Loop through the jobs in the monitoredJobs map and update status accordingly + for _, job := range m.monitoredJobs { + // Check if the current job is in the jobsToRemove map. If it is, then delete the + // job from both monitoredJobs map and jobsToRemove map and continue to process + // the next job. + if _, ok := m.jobsToRemove[job.dispatcherID]; ok { + ctx.Log().Infof("Stopping monitoring of %s", job.dispatcherID) + delete(m.monitoredJobs, job.dispatcherID) + delete(m.jobsToRemove, job.dispatcherID) + continue + } + + if m.shouldSkip(job) { + continue + } + + if removeJob := m.updateJobStatus(ctx, job); removeJob { + ctx.Log().Infof("Stopping monitoring of %s", job.dispatcherID) + delete(m.monitoredJobs, job.dispatcherID) + continue + } + + job.timestamp = time.Now() + } + // There are chances that jobsToRemove might still have some elements remaining. + // These values are stale and can be removed safely. + if len(m.jobsToRemove) > 0 { + m.jobsToRemove = map[string]bool{} + } +} + +func (m *launcherMonitor) updateJobStatus(ctx *actor.Context, job launcherJob) bool { + removeJob := false + dispatchID := job.dispatcherID + owner := job.user + ctx.Log().Debugf("Checking status of launcher job %s", dispatchID) + + resp, r, err := m.apiClient.MonitoringApi. + GetEnvironmentStatus(m.authContext(ctx), owner, dispatchID). + Refresh(true). + Execute() + + if err != nil { + if r != nil && r.StatusCode == 404 { + ctx.Log().Infof("DispatchID %s is either COMPLETE or TERMINATED", dispatchID) + removeJob = true + } else { + ctx.Log().WithError(err).Infof("error when calling `GetEnvironmentStatus` for %s:\n%v", + dispatchID, r) + } + return removeJob + } + + ctx.Log().Infof("DispatchID %s state: %s", dispatchID, *resp.State) + + if exitStatus, exitMessages, ok := calculateJobExitStatus(resp); ok { + // Try to filter out messages that offer no value to the user, leaving only the + // message that identifies the root cause of the error. + filteredMessages := filterOutSuperfluousMessages(exitMessages) + + // If we were able to filter out the root cause of the error, then replace the messages + // that we're going to propagate upstream with the filtered messages. 
Otherwise, we'll + // simply propagate the original messages upstream, which will likely include a lot of + // noise that the user doesn't care about, but we have no choice in this case. + if len(filteredMessages) > 0 { + exitMessages = filteredMessages + } + + if exitStatus != 0 && len(exitMessages) == 0 { + // If we have no messages, it may be a connection failure from the container + // and we will have no logs to assist in diagnosis, so insert the last + // few lines of the error and output logs into the failure message. + exitMessages, _ = m.getTaskLogsFromDispatcher(ctx, &job, "error.log") + outputMessages, _ := m.getTaskLogsFromDispatcher(ctx, &job, "output.log") + exitMessages = append(exitMessages, outputMessages...) + } + + ctx.Log().Debugf("Send status to DAI: %d, messages %s", exitStatus, exitMessages) + ctx.Tell(ctx.Self(), DispatchExited{ + DispatchID: dispatchID, + ExitCode: exitStatus, + Message: strings.Join(exitMessages, "\n"), + }) + + // If status sent, remove this job form the monitored list as we are done. + // I tried this approach: + // + // monitoredLauncherJobs.Remove(e) + // + // but it seemed to cause other jobs in the list to be not processed, so instead + // keep list of jobs to be removed for later. + removeJob = true + } else { + ctx.Tell(ctx.Self(), DispatchStateChange{ + DispatchID: dispatchID, + State: *resp.State, + }) + } + return removeJob +} + +// shouldSkip returns true if we should not get the status of the specified job +// this time around the polling loop. The skip is computed on the time elapsed since +// either the time the job was added to the list of those to monitor, or the time +// of the last sample, for the following reasons. If insufficient time has elapsed +// since the job was launched then the launcher GetEnvironmentStatus REST API +// may block awaiting the job status to be come available, so reduce the likelihood of this by +// requiring a minimum time before the first status fetch. If we do encounter a delay +// for the above (or any other) reason, then the next scheduling tick may arrive +// soon thereafter, resulting in the overhead of unnecessarily rapid polling. +// Avoid the latter by applying a rate limit to each job. +func (*launcherMonitor) shouldSkip(job launcherJob) bool { + durationSinceJobAddedOrLastStatusCollection := time.Now().Sub(job.timestamp).Seconds() + return durationSinceJobAddedOrLastStatusCollection < minItemPollingIntervalSecs +} + +type exitCode int + +// calculateJobExitStatus determines an exit status for the specified job. If the job is not +// in a terminal state, there is no exit status (and monitoring continues). +// If in a terminal state, also return the job messages. +func calculateJobExitStatus( + resp launcher.DispatchInfo, +) (exitCode, []string, bool) { + state, ok := resp.GetStateOk() + if ok { + // TODO(HAL-2813): Track and send more of these state changes with sendStatusToDetermined. 
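+		// Note: Go switch cases do not fall through, so the empty UNKNOWN, PENDING,
+		// and RUNNING cases below take no action here and, like TERMINATING, end up
+		// with a (0, nil, false) result, i.e. no exit status yet and monitoring continues.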
+ switch *state { + case "UNKNOWN": + case "PENDING": // Successfully launched; pending Slurm scheduling + case "RUNNING": // Job is executing + case "TERMINATING": // User-initiated termination in process + return 0, nil, false + case "TERMINATED": // User-initiated termination complete + return 1, getJobExitMessages(resp), true + case "FAILED": + return 1, getJobExitMessages(resp), true // exit status TBD -- use 1 for now + case "COMPLETED": // Normal completion + return 0, getJobExitMessages(resp), true + default: + return 0, nil, false + } + } + return 0, nil, false +} + +// getJobExitMessages returns the job messages from the event array (if any). +func getJobExitMessages(resp launcher.DispatchInfo) []string { + var result []string + for _, event := range resp.GetEvents() { + // Only need messages that help diagnose the failure + if *event.Level == "WARNING" || *event.Level == "ERROR" { + result = append(result, *event.Message) + } + } + return result +} + +// getTaskLogsFromDispatcher is used to read the logs direct from the dispatcher. +// It is used on job failure when no messages have been relayed from the job +// as a last-chance to provide context for the failure. +// The baseLogName string is error.log, output.log, submission.log (etc), the +// prefix is taken from the job payload name. +// The logRange expression can be used to limit the size of the logs returned. +// For example "lines=-30" is the last 30 lines of the file. +func (m *launcherMonitor) getTaskLogsFromDispatcher( + ctx *actor.Context, job *launcherJob, baseLogName string, +) ([]string, error) { + dispatchID := job.dispatcherID + + // By default show limited lines, on debug/trace levels show more + linesToShow := 15 + if ctx.Log().Logger.Level == logrus.DebugLevel { + linesToShow = 100 + } else if ctx.Log().Logger.Level == logrus.TraceLevel { + linesToShow = 1000 + } + // The number of lines from error/output logs to display on failure. 
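+	// For example, linesToShow == 15 produces the range expression "lines=-15",
+	// i.e. the last 15 lines of the file.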
+ logRange := fmt.Sprintf("lines=-%d", linesToShow) + + // Compose the file name + logFileName := fmt.Sprintf("%s-%s", job.payloadName, baseLogName) + + logFile, httpResponse, err := m.apiClient.MonitoringApi.LoadEnvironmentLog( + m.authContext(ctx), job.user, dispatchID, logFileName, + ).Range_(logRange).Execute() + if err != nil { + ctx.Log().WithError(err).Warnf("unable to access %s for dispatch", logFileName) + return []string{}, err + } + + contentLength := 0 + // Content-Length is not always set sometimes only Content-Range + contentLengthStr := httpResponse.Header.Get("Content-Length") + if len(contentLengthStr) == 0 { + // No specified length header just read the whole http response + var fileStat fs.FileInfo + fileStat, err = logFile.Stat() + if err != nil { + ctx.Log().Errorf("logFile.Stat() failed: %s", err.Error()) + return []string{}, nil + } + contentLength = int(fileStat.Size()) + } else { + contentLength, err = strconv.Atoi(contentLengthStr) + if err != nil { + ctx.Log().Errorf("atoi(Content-Length) failed: %s", err.Error()) + return []string{}, err + } + if contentLength == 0 { + ctx.Log().Debugf("No content yet for %s", logFileName) + return []string{}, nil + } + } + + buffer := make([]byte, contentLength) + bytesRead, err := logFile.Read(buffer) + if err != nil || bytesRead != contentLength { + ctx.Log().WithError(err).Errorf( + "Failed to read full http response: read %d != contentLength %d", + bytesRead, contentLength) + return nil, err + } + return strings.Split(string(buffer), "\n"), nil +} diff --git a/master/internal/rm/dispatcher_resource_manager.go b/master/internal/rm/dispatcher_resource_manager.go new file mode 100644 index 000000000000..a1536a4f5a77 --- /dev/null +++ b/master/internal/rm/dispatcher_resource_manager.go @@ -0,0 +1,1280 @@ +package resourcemanagers + +import ( + "context" + "crypto/tls" + "fmt" + "io" + "net/http" + "os" + "time" + + "github.com/ghodss/yaml" + "github.com/google/uuid" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + launcher "github.hpe.com/hpe/hpc-ard-launcher-go/launcher" + "golang.org/x/exp/maps" + + "github.com/determined-ai/determined/master/internal/config" + "github.com/determined-ai/determined/master/internal/db" + "github.com/determined-ai/determined/master/internal/job" + "github.com/determined-ai/determined/master/internal/sproto" + "github.com/determined-ai/determined/master/pkg/actor" + "github.com/determined-ai/determined/master/pkg/actor/actors" + "github.com/determined-ai/determined/master/pkg/aproto" + "github.com/determined-ai/determined/master/pkg/device" + "github.com/determined-ai/determined/master/pkg/logger" + "github.com/determined-ai/determined/master/pkg/model" + "github.com/determined-ai/determined/master/pkg/ptrs" + "github.com/determined-ai/determined/master/pkg/tasks" + "github.com/determined-ai/determined/proto/pkg/agentv1" + "github.com/determined-ai/determined/proto/pkg/apiv1" + "github.com/determined-ai/determined/proto/pkg/containerv1" + "github.com/determined-ai/determined/proto/pkg/devicev1" + "github.com/determined-ai/determined/proto/pkg/resourcepoolv1" +) + +const ( + slurmSchedulerType = "slurm" + maxResourceDetailsSampleAgeSeconds = 60 +) + +// hpcResources is a data type describing the HPC resources available +// to Slurm on on the Launcher node. +// Example output of the HPC resource details from the Launcher. 
+// --- +// partitions: +// - totalAvailableNodes: 293 +// totalAllocatedNodes: 21 +// partitionName: workq +// totalAvailableGpuSlots: 16 +// totalNodes: 314 +// totalGpuSlots: 16 +// - totalAvailableNodes: 293 +// ...more partitions. +type hpcResources struct { + Partitions []hpcPartitionDetails `json:"partitions,flow"` + Nodes []hpcNodeDetails `json:"nodes,flow"` +} + +// hpcPartitionDetails holds HPC Slurm partition details. +type hpcPartitionDetails struct { + TotalAvailableNodes int `json:"totalAvailableNodes"` + PartitionName string `json:"partitionName"` + IsDefault bool `json:"default"` + TotalAllocatedNodes int `json:"totalAllocatedNodes"` + TotalAvailableGpuSlots int `json:"totalAvailableGpuSlots"` + TotalNodes int `json:"totalNodes"` + TotalGpuSlots int `json:"totalGpuSlots"` + TotalAvailableCPUSlots int `json:"totalAvailableCpuSlots"` + TotalCPUSlots int `json:"totalCpuSlots"` +} + +// hpcNodeDetails holds HPC Slurm node details. +type hpcNodeDetails struct { + Partitions []string `json:"partitions"` + Addresses []string `json:"addresses"` + Draining bool `json:"draining"` + Allocated bool `json:"allocated"` + Name string `json:"name"` + GpuCount int `json:"gpuCount"` + GpuInUseCount int `json:"gpuInUseCount"` + CPUCount int `json:"cpuCount"` + CPUInUseCount int `json:"cpuInUseCount"` +} + +// hpcResourceDetailsCache stores details of the HPC resource information cache. +type hpcResourceDetailsCache struct { + lastSample hpcResources + sampleTime time.Time +} + +// dispatcherResourceProvider manages the lifecycle of dispatcher resources. +type dispatcherResourceManager struct { + config *config.DispatcherResourceManagerConfig + + apiClient *launcher.APIClient + hpcResourcesManifest *launcher.Manifest + reqList *taskList + groups map[*actor.Ref]*group + addrToResourcesID map[*actor.Ref]sproto.ResourcesID + resourcesIDtoAddr map[sproto.ResourcesID]*actor.Ref + dispatchIDToAllocationID map[string]model.AllocationID + allocationIDToDispatchID map[model.AllocationID]string + slotsUsedPerGroup map[*group]int + masterTLSConfig model.TLSClientConfig + loggingConfig model.LoggingConfig + jobWatcher *launcherMonitor + authToken string + resourceDetails hpcResourceDetailsCache + defaultComputePoolPartition string + defaultAuxPoolPartition string +} + +func newDispatcherResourceManager( + config *config.DispatcherResourceManagerConfig, + masterTLSConfig model.TLSClientConfig, + loggingConfig model.LoggingConfig, +) *dispatcherResourceManager { + // Set up the host address and IP address of the "launcher". + clientConfiguration := launcher.NewConfiguration() + + // Host, port, and protocol are configured in the "resource_manager" section + // of the "tools/devcluster.yaml" file. The host address and port refer to the + // system where the "launcher" is running. + clientConfiguration.Host = fmt.Sprintf("%s:%d", config.LauncherHost, config.LauncherPort) + clientConfiguration.Scheme = config.LauncherProtocol // "http" or "https" + if config.Security != nil { + logrus.Debugf("Launcher communications InsecureSkipVerify: %t", config.Security.TLS.SkipVerify) + transCfg := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: config.Security.TLS.SkipVerify}, //nolint:gosec + } + clientConfiguration.HTTPClient = &http.Client{Transport: transCfg} + } + + apiClient := launcher.NewAPIClient(clientConfiguration) + + // One time activity to create a manifest using SlurmResources carrier. 
+ // This manifiest is used on demand to retrieve details regarding HPC resources + // e.g., nodes, GPUs etc + hpcResourcesManifest := createSlurmResourcesManifest() + + // Authentication token that gets passed to the "launcher" REST API. + authToken := loadAuthToken(config) + + return &dispatcherResourceManager{ + config: config, + + apiClient: apiClient, + hpcResourcesManifest: hpcResourcesManifest, + reqList: newTaskList(), + groups: make(map[*actor.Ref]*group), + addrToResourcesID: make(map[*actor.Ref]sproto.ResourcesID), + resourcesIDtoAddr: make(map[sproto.ResourcesID]*actor.Ref), + dispatchIDToAllocationID: make(map[string]model.AllocationID), + allocationIDToDispatchID: make(map[model.AllocationID]string), + slotsUsedPerGroup: make(map[*group]int), + + masterTLSConfig: masterTLSConfig, + loggingConfig: loggingConfig, + jobWatcher: newDispatchWatcher(apiClient, authToken), + authToken: authToken, + } +} + +// Return a starting context for the API client call that includes the authToken +// (may be empty if disabled). +func (m *dispatcherResourceManager) authContext(ctx *actor.Context) context.Context { + return context.WithValue(context.Background(), launcher.ContextAccessToken, m.authToken) +} + +func (m *dispatcherResourceManager) Receive(ctx *actor.Context) error { + switch msg := ctx.Message().(type) { + case actor.PreStart: + m.killAllActiveDispatches(ctx, ctx.Self()) + go m.jobWatcher.watch(ctx) + actors.NotifyAfter(ctx, actionCoolDown, schedulerTick{}) + + case + sproto.AllocateRequest, + StartDispatcherResources, + KillDispatcherResources, + DispatchStateChange, + DispatchExited, + sproto.SetGroupMaxSlots, + sproto.SetTaskName, + sproto.PendingPreemption, + sproto.ResourcesReleased, + groupActorStopped: + return m.receiveRequestMsg(ctx) + + case + job.GetJobQ, + job.GetJobSummary, + job.GetJobQStats, + job.SetGroupWeight, + job.SetGroupPriority, + job.MoveJob, + job.DeleteJob, + *apiv1.GetJobQueueStatsRequest: + return m.receiveJobQueueMsg(ctx) + + case sproto.GetTaskHandler: + ctx.Respond(getTaskHandler(m.reqList, msg.ID)) + + case sproto.GetTaskSummary: + if resp := getTaskSummary(m.reqList, *msg.ID, m.groups, slurmSchedulerType); resp != nil { + ctx.Respond(*resp) + } + + case sproto.GetTaskSummaries: + ctx.Respond(getTaskSummaries(m.reqList, m.groups, slurmSchedulerType)) + + case *apiv1.GetResourcePoolsRequest: + resourcePoolSummary, err := m.summarizeResourcePool(ctx) + if err != nil { + ctx.Respond(err) + return nil + } + ctx.Respond(&apiv1.GetResourcePoolsResponse{ + ResourcePools: resourcePoolSummary, + }) + + case sproto.GetDefaultComputeResourcePoolRequest: + _, _ = m.fetchHpcResourceDetailsCached(ctx) + // Don't bother to check for errors, a response is required (may have no name) + ctx.Respond(sproto.GetDefaultComputeResourcePoolResponse{ + PoolName: m.defaultComputePoolPartition}) + + case sproto.GetDefaultAuxResourcePoolRequest: + _, _ = m.fetchHpcResourceDetailsCached(ctx) + // Don't bother to check for errors, a response is required (may have no name) + ctx.Respond(sproto.GetDefaultAuxResourcePoolResponse{ + PoolName: m.defaultAuxPoolPartition}) + + case sproto.HasResourcePoolRequest: + // This is a query to see if the specified resource pool exists + hpcDetails, err := m.fetchHpcResourceDetailsCached(ctx) + result := false + if err == nil { + for _, p := range hpcDetails.Partitions { + if p.PartitionName == msg.PoolName { + result = true + break + } + } + } + ctx.Respond(sproto.HasResourcePoolResponse{HasResourcePool: result}) + + case 
sproto.ValidateCommandResourcesRequest: + // TODO(HAL-2862): Use inferred value here if possible. + // fulfillable := m.config.MaxSlotsPerContainer >= msg.Slots + ctx.Respond(sproto.ValidateCommandResourcesResponse{Fulfillable: true}) + + case schedulerTick: + m.schedulePendingTasks(ctx) + actors.NotifyAfter(ctx, actionCoolDown, schedulerTick{}) + + case *apiv1.GetAgentsRequest: + ctx.Respond(m.generateGetAgentsResponse(ctx)) + + default: + ctx.Log().Errorf("unexpected message %T", msg) + return actor.ErrUnexpectedMessage(ctx) + } + + return nil +} + +// generateGetAgentsResponse returns a suitable response to the GetAgentsRequest request. +func (m *dispatcherResourceManager) generateGetAgentsResponse( + ctx *actor.Context) *apiv1.GetAgentsResponse { + response := apiv1.GetAgentsResponse{ + Agents: []*agentv1.Agent{}, + } + _, _ = m.fetchHpcResourceDetailsCached(ctx) + for _, node := range m.resourceDetails.lastSample.Nodes { + agent := agentv1.Agent{ + Id: node.Name, + RegisteredTime: nil, + Slots: map[string]*agentv1.Slot{}, + ResourcePools: node.Partitions, + Addresses: node.Addresses, + Enabled: true, + Draining: node.Draining, + } + response.Agents = append(response.Agents, &agent) + if node.GpuCount == 0 { + addSlotToAgent( + &agent, devicev1.Type_TYPE_CPU, node, node.CPUCount, node.Allocated) // One CPU slot/device + } else { + for i := 0; i < node.GpuCount; i++ { + addSlotToAgent( + &agent, devicev1.Type_TYPE_CUDA, node, i, i < node.GpuInUseCount) // [1:N] CUDA slots + } + } + } + return &response +} + +// addSlotToAgent adds to the specifies agent a slot populated with a device of the specified type. +func addSlotToAgent( + agent *agentv1.Agent, + deviceType devicev1.Type, + node hpcNodeDetails, + slotID int, + slotInUse bool) { + device := devicev1.Device{ + Id: 0, + Brand: "", + Uuid: "", + Type: deviceType, + } + slotRef := fmt.Sprintf("/agents/%s/slots/%d", node.Name, slotID) + slot := agentv1.Slot{ + Id: fmt.Sprintf("%d", slotID), + Device: &device, + Enabled: true, + Draining: false, + } + if slotInUse { + // Claiming a container causes the DAI GUI dashboard to consider the + // slot to be not available; other implications TBD. + slot.Container = &containerv1.Container{} + slot.Container.State = containerv1.State_STATE_RUNNING + } + agent.Slots[slotRef] = &slot +} + +func (m *dispatcherResourceManager) receiveRequestMsg(ctx *actor.Context) error { + switch msg := ctx.Message().(type) { + case sproto.AllocateRequest: + m.addTask(ctx, msg) + + case StartDispatcherResources: + req := m.reqList.taskByHandler[msg.TaskActor] + + slotType, err := m.resolveSlotType(ctx, req.ResourcePool) + if err != nil { + sendResourceStateChangedErrorResponse(ctx, err, msg, + "unable to access resource pool configuration") + return nil + } + + // Make sure we explicitly choose a partition. Use default if unspecified. + partition := req.ResourcePool + if partition == "" { + if slotType == device.CPU { + partition = m.defaultAuxPoolPartition + } else { + partition = m.defaultComputePoolPartition + } + } + + // Create the manifest that will be ultimately sent to the launcher. 
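+		// (The manifest bundles the payload definition, the SingularityOverSlurm
+		// carrier selection, and the batch launch parameters; see
+		// TaskSpec.ToDispatcherManifest in master/pkg/tasks/dispatcher_task.go.)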
+ manifest, impersonatedUser, payloadName, err := msg.Spec.ToDispatcherManifest( + m.config.MasterHost, m.config.MasterPort, m.masterTLSConfig.CertificateName, + req.SlotsNeeded, slotType, partition, m.config.TresSupported) + if err != nil { + sendResourceStateChangedErrorResponse(ctx, err, msg, + "unable to create the Slurm launcher manifest") + return nil + } + + if impersonatedUser == "root" { + sendResourceStateChangedErrorResponse(ctx, + fmt.Errorf( + "agent user not configured for user '%s' or specified as 'root'", + msg.Spec.Owner.Username), + msg, "") + return nil + } + + dispatchID, err := m.sendManifestToDispatcher(ctx, manifest, impersonatedUser) + if err != nil { + sendResourceStateChangedErrorResponse(ctx, err, msg, + "unable to create Slurm job") + return nil + } + + ctx.Log().Info(fmt.Sprintf("DispatchID is %s", dispatchID)) + m.dispatchIDToAllocationID[dispatchID] = req.AllocationID + m.allocationIDToDispatchID[req.AllocationID] = dispatchID + if err := db.InsertDispatch(context.TODO(), &db.Dispatch{ + DispatchID: dispatchID, + ResourceID: msg.ResourcesID, + AllocationID: req.AllocationID, + ImpersonatedUser: impersonatedUser, + }); err != nil { + ctx.Log().WithError(err).Errorf("failed to persist dispatch: %v", dispatchID) + } + m.jobWatcher.monitorJob(impersonatedUser, dispatchID, payloadName) + return nil + + case sproto.PendingPreemption: + ctx.Log().Info(fmt.Sprintf("PendingPreemption of %s. Terminating.", msg.AllocationID)) + allocReq, ok := m.reqList.GetTaskByID(msg.AllocationID) + if ok { + ctx.Tell(allocReq.TaskActor, sproto.ReleaseResources{ForcePreemption: true}) + } else { + ctx.Log().Error(fmt.Sprintf("unable to find Allocation actor for AllocationID %s", + msg.AllocationID)) + } + + case KillDispatcherResources: + + ctx.Log().Debug(fmt.Sprintf("Received request to terminate jobs associated with AllocationID %s", + msg.AllocationID)) + + // Find the Dispatch IDs associated with the allocation ID. We'll need the + // Dispatch ID to cancel the job on the launcher side. + dispatches, err := db.ListDispatchesByAllocationID(context.TODO(), msg.AllocationID) + if err != nil { + ctx.Log().WithError(err).Errorf( + "Failed to retrieve the DispatchIDs associated with AllocationID %s", + msg.AllocationID) + return nil + } + + ctx.Log().Debug(fmt.Sprintf("Found %d jobs associated with AllocationID %s", + len(dispatches), msg.AllocationID)) + + for _, dispatch := range dispatches { + dispatchID := dispatch.DispatchID + impersonatedUser := dispatch.ImpersonatedUser + + ctx.Log().Info(fmt.Sprintf("Terminating job with DispatchID %s initiated by %s", + dispatchID, impersonatedUser)) + + // Terminate and cleanup, on failure leave Dispatch in DB for later retry + if m.terminateDispatcherJob(ctx, dispatchID, impersonatedUser) { + // Debugging aid, if we need access to the dispatcher environment logs + // When trace is enabled leave them until the job is deleted, or we restart. 
+ if ctx.Log().Logger.Level != logrus.TraceLevel { + m.removeDispatchEnvironment(ctx, impersonatedUser, dispatchID) + } + } + m.jobWatcher.removeJob(dispatchID) + } + + case DispatchStateChange: + log := ctx.Log().WithField("dispatch-id", msg.DispatchID) + allocationID, ok := m.dispatchIDToAllocationID[msg.DispatchID] + if !ok { + log.Warnf("received DispatchStateChange for unknown dispatch %s", msg.DispatchID) + return nil + } + + task, ok := m.reqList.GetTaskByID(allocationID) + if !ok { + log.Warnf("received DispatchStateChange for dispatch unknown to task list: %s", allocationID) + return nil + } + + alloc := m.reqList.GetAllocations(task.TaskActor) + if len(alloc.Resources) != 1 { + log.Warnf("allocation has malformed resources: %v", alloc) + return nil + } + r := maps.Values(alloc.Resources)[0] + rID := r.Summary().ResourcesID + + task.State = schedulingStateFromDispatchState(msg.State) + ctx.Tell(task.TaskActor, sproto.ResourcesStateChanged{ + ResourcesID: rID, + ResourcesState: resourcesStateFromDispatchState(msg.State), + ResourcesStarted: &sproto.ResourcesStarted{}, + }) + + case DispatchExited: + log := ctx.Log().WithField("dispatch-id", msg.DispatchID) + allocationID, ok := m.dispatchIDToAllocationID[msg.DispatchID] + if !ok { + log.Warnf("received DispatchExited for unknown dispatch %s", msg.DispatchID) + return nil + } + + task, ok := m.reqList.GetTaskByID(allocationID) + if !ok { + log.Warnf("received DispatchExited for dispatch unknown to task list: %s", allocationID) + return nil + } + + alloc := m.reqList.GetAllocations(task.TaskActor) + if len(alloc.Resources) != 1 { + log.Warnf("allocation has malformed resources: %v", alloc) + return nil + } + r := maps.Values(alloc.Resources)[0] + rID := r.Summary().ResourcesID + + stopped := sproto.ResourcesStopped{} + if msg.ExitCode > 0 { + stopped.Failure = sproto.NewResourcesFailure( + sproto.TaskError, + msg.Message, + ptrs.Ptr(sproto.ExitCode(msg.ExitCode)), + ) + } + + ctx.Tell(task.TaskActor, sproto.ResourcesStateChanged{ + ResourcesID: rID, + ResourcesState: sproto.Terminated, + ResourcesStopped: &stopped, + }) + + // Find the Dispatch IDs associated with the allocation ID. We'll need the + // Dispatch ID to clean up the dispatcher environments for the job. + dispatches, err := db.ListDispatchesByAllocationID(context.TODO(), allocationID) + if err != nil { + ctx.Log().WithError(err).Errorf( + "Failed to retrieve the DispatchIDs associated with AllocationID %s", + allocationID) + return nil + } + ctx.Log().Debug(fmt.Sprintf("Found %d jobs associated with AllocationID %s", + len(dispatches), allocationID)) + + // Cleanup all the dispatcher environments associated with current allocation + for _, dispatch := range dispatches { + dispatchID := dispatch.DispatchID + impersonatedUser := dispatch.ImpersonatedUser + + ctx.Log().Info(fmt.Sprintf( + "Deleting dispatcher environment for job with DispatchID %s initiated by %s", + dispatchID, impersonatedUser)) + + // Cleanup the dispatcher environment + m.removeDispatchEnvironment(ctx, impersonatedUser, dispatchID) + } + + // Remove the dispatch from mapping tables and DB. 
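+		// (The DB row itself is removed by removeDispatchEnvironment above via
+		// db.DeleteDispatch; the deletes below only drop the in-memory mappings.)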
+ delete(m.addrToResourcesID, m.resourcesIDtoAddr[rID]) + delete(m.resourcesIDtoAddr, rID) + delete(m.dispatchIDToAllocationID, msg.DispatchID) + delete(m.allocationIDToDispatchID, allocationID) + + case sproto.SetGroupMaxSlots: + m.getOrCreateGroup(ctx, msg.Handler).maxSlots = msg.MaxSlots + + case groupActorStopped: + delete(m.slotsUsedPerGroup, m.groups[msg.Ref]) + delete(m.groups, msg.Ref) + + case sproto.SetTaskName: + m.receiveSetTaskName(ctx, msg) + + case sproto.ResourcesReleased: + m.resourcesReleased(ctx, msg.TaskActor) + + default: + ctx.Log().Errorf("receiveRequestMsg: unexpected message %T", msg) + return actor.ErrUnexpectedMessage(ctx) + } + return nil +} + +// Log the failure, and send a ResourcesStateChanged describing the failure. +func sendResourceStateChangedErrorResponse( + ctx *actor.Context, err error, + msg StartDispatcherResources, + errMessageStr string) { + ctx.Log().WithError(err).Error(errMessageStr) + stopped := sproto.ResourcesStopped{} + stopped.Failure = sproto.NewResourcesFailure( + sproto.ResourcesFailed, + errors.Wrapf(err, errMessageStr).Error(), + nil, + ) + ctx.Tell(msg.TaskActor, sproto.ResourcesStateChanged{ + ResourcesID: msg.ResourcesID, + // Could be a better message("container failed with non-zero exit code") + ResourcesState: sproto.Terminated, + ResourcesStopped: &stopped, + }) +} + +func (m *dispatcherResourceManager) receiveJobQueueMsg(ctx *actor.Context) error { + switch msg := ctx.Message().(type) { + case job.GetJobQ: + // TODO(HAL-2863): Get the job Q info from slurm, for the proper pool as per the message. + ctx.Log().Debugf("GetJobQ for resource pool %s", msg.ResourcePool) + ctx.Respond(m.jobQInfo(msg.ResourcePool)) + + case *apiv1.GetJobQueueStatsRequest: + // TODO(HAL-2863): Fill this in per-pool as discerned from the slurm resources info job. + ctx.Log().Debugf("GetJobQueueStatsRequest, pool count %d", len(msg.ResourcePools)) + resp := &apiv1.GetJobQueueStatsResponse{ + Results: make([]*apiv1.RPQueueStat, 0), + } + // If no list of resource pools has been specified, return data for all pools. + if (len(msg.ResourcePools)) == 0 { + hpcDetails, err := m.fetchHpcResourceDetailsCached(ctx) + if err != nil { + ctx.Respond(resp) + return nil + } + for _, p := range hpcDetails.Partitions { + msg.ResourcePools = append(msg.ResourcePools, p.PartitionName) + } + } + // Compute RPQueueStat results for each resource pool + for _, resourcePool := range msg.ResourcePools { + resp.Results = append(resp.Results, &apiv1.RPQueueStat{ + Stats: jobStatsByPool(m.reqList, resourcePool), + ResourcePool: resourcePool, + }) + } + ctx.Respond(resp) + + case job.GetJobQStats: + ctx.Log().Debugf("GetJobQStats for resource pool %s", msg.ResourcePool) + // TODO(HAL-2863): Fill this in for the given pool as discerned from the slurm resources + // info job. + ctx.Respond(jobStats(m.reqList)) + + case job.SetGroupWeight, job.SetGroupPriority, job.MoveJob: + // TODO(HAL-2863): We may not be able to support these specific actions, but how we + // let people interact with the job queue in dispatcher/slurm world. 
+ // ctx.Respond(fmt.Errorf("modifying job positions is not yet supported in slurm")) + + case job.DeleteJob: + ctx.Log().Infof("Delete job %s", string(msg.JobID)) + + dispatches, err := db.ListDispatchesByJobID(context.TODO(), string(msg.JobID)) + if err != nil { + ctx.Log().WithError(err).Errorf( + "Failed to retrieve the DispatchIDs associated with Job %s", + msg.JobID) + ctx.Respond(job.DeleteJobResponseOf(err)) + return nil + } + for _, dispatch := range dispatches { + ctx.Log().Debugf("Found dispatch %s associated with job %s", dispatch.DispatchID, msg.JobID) + m.removeDispatchEnvironment(ctx, dispatch.ImpersonatedUser, dispatch.DispatchID) + } + ctx.Log().Debugf("Delete job successful %s", msg.JobID) + ctx.Respond(job.EmptyDeleteJobResponse()) + + default: + return actor.ErrUnexpectedMessage(ctx) + } + return nil +} + +// selectDefaultPools identifies partitions suitable as default compute and default +// aux partitions (if possible). +func (m *dispatcherResourceManager) selectDefaultPools( + ctx *actor.Context, hpcResourceDetails []hpcPartitionDetails) (string, string) { + // The default compute pool is the default partition if it has any GPUS, + // otherwise select any partion with GPUs. + // The AUX partition, use the default partition if available, otherwise any partition. + + defaultComputePar := "" // Selected default Compute/GPU partition + defaultAuxPar := "" // Selected default Aux parittion + + fallbackComputePar := "" // Fallback Compute/GPU partion (has GPUs) + fallbackAuxPar := "" // Fallback parition if no default + + for _, v := range hpcResourceDetails { + if v.IsDefault { + defaultAuxPar = v.PartitionName + if v.TotalGpuSlots > 0 { + defaultComputePar = v.PartitionName + } + } else { + fallbackAuxPar = v.PartitionName + if v.TotalGpuSlots > 0 { + fallbackComputePar = v.PartitionName + } + } + } + + // Ensure we have a default aux, even if no partitions marked as such + if defaultAuxPar == "" { + defaultAuxPar = fallbackAuxPar + } + + // If no default compute/GPU partitions, use a fallback partition + if defaultComputePar == "" { + if fallbackComputePar != "" { + defaultComputePar = fallbackComputePar + } else { + defaultComputePar = defaultAuxPar + } + } + return defaultComputePar, defaultAuxPar +} + +func (m *dispatcherResourceManager) summarizeResourcePool( + ctx *actor.Context) ([]*resourcepoolv1.ResourcePool, error) { + hpcResourceDetails, err := m.fetchHpcResourceDetailsCached(ctx) + if err != nil { + return nil, err + } + var result []*resourcepoolv1.ResourcePool + for _, v := range hpcResourceDetails.Partitions { + slotType, err := m.resolveSlotType(ctx, v.PartitionName) + if err != nil { + return nil, fmt.Errorf("resolving slot type: %w", err) + } + + slotsAvailable := int32(v.TotalGpuSlots) + slotsUsed := int32(v.TotalGpuSlots - v.TotalAvailableGpuSlots) + if slotType == device.CPU { + slotsAvailable = int32(v.TotalCPUSlots) + slotsUsed = int32(v.TotalCPUSlots - v.TotalAvailableCPUSlots) + } + + pool := resourcepoolv1.ResourcePool{ + Name: v.PartitionName, + Description: "Slurm-managed pool of resources", + Type: resourcepoolv1.ResourcePoolType_RESOURCE_POOL_TYPE_STATIC, + NumAgents: int32(v.TotalNodes), + SlotType: slotType.Proto(), + SlotsAvailable: slotsAvailable, + SlotsUsed: slotsUsed, + AuxContainerCapacity: int32(v.TotalCPUSlots), + AuxContainersRunning: int32(v.TotalCPUSlots - v.TotalAvailableCPUSlots), + DefaultComputePool: v.PartitionName == m.defaultComputePoolPartition, + DefaultAuxPool: v.PartitionName == m.defaultAuxPoolPartition, + 
Preemptible: true, + MinAgents: int32(v.TotalNodes), + MaxAgents: int32(v.TotalNodes), + SlotsPerAgent: 0, // Must be unspecified + AuxContainerCapacityPerAgent: 0, + SchedulerType: resourcepoolv1.SchedulerType_SCHEDULER_TYPE_SLURM, + SchedulerFittingPolicy: resourcepoolv1.FittingPolicy_FITTING_POLICY_SLURM, + Location: "Slurm", + ImageId: "", + InstanceType: "Slurm", + Details: &resourcepoolv1.ResourcePoolDetail{}, + } + result = append(result, &pool) + } + return result, nil +} + +// fetchHpcResourceDetailsCached fetches cached Slurm resource details from the launcher node. +// If the cached info is too old, a cache reload will occur, and the candidates for the +// default compute & aux resource pools will be reevaluated. +func (m *dispatcherResourceManager) fetchHpcResourceDetailsCached(ctx *actor.Context) ( + hpcResources, error) { + // If anyone is viewing the 'Cluster' section of the DAI GUI then there is activity here + // about every 10s per user. To mitigate concerns of overloading slurmd with polling + // activity, we will return a cached result, updating the cache only every so often. + if time.Since(m.resourceDetails.sampleTime).Seconds() > maxResourceDetailsSampleAgeSeconds { + newSample, err := m.fetchHpcResourceDetails(ctx) + if err != nil { + return hpcResources{}, err + } + m.resourceDetails.lastSample = newSample + m.resourceDetails.sampleTime = time.Now() + m.defaultComputePoolPartition, m.defaultAuxPoolPartition = + m.selectDefaultPools(ctx, m.resourceDetails.lastSample.Partitions) + ctx.Log().Infof("default resource pools are '%s', '%s'", + m.defaultComputePoolPartition, m.defaultAuxPoolPartition) + } + return m.resourceDetails.lastSample, nil +} + +// resolveSlotType resolves the correct slot type for a job targeting the given partition. If the +// slot type is specified in the master config, use that. Otherwise if the partiton is specified and +// known, and has no GPUs select CPU as the processor type, else default to CUDA. +func (m *dispatcherResourceManager) resolveSlotType( + ctx *actor.Context, + partition string, +) (device.Type, error) { + if slotType := m.config.ResolveSlotType(partition); slotType != nil { + return *slotType, nil + } + + hpc, err := m.fetchHpcResourceDetailsCached(ctx) + if err != nil { + return "", fmt.Errorf("inferring slot type for resource info: %w", err) + } + + for _, v := range hpc.Partitions { + if v.PartitionName == partition && v.TotalGpuSlots == 0 { + return device.CPU, nil + } + } + return device.CUDA, nil +} + +// fetchHpcResourceDetails retrieves the details about HPC Resources. +// This function uses HPC Resources manifest to retrieve the required details. +// This function performs the following steps: +// 1. Launch the manifest. +// 2. Read the log file with details on HPC resources. +// 3. Parse and load the details into a predefined struct - HpcResourceDetails +// 4. Terminate the manifest. +// Returns struct with HPC resource details - HpcResourceDetails. +func (m *dispatcherResourceManager) fetchHpcResourceDetails( + ctx *actor.Context) (hpcResources, error) { + impersonatedUser := "" + + // Launch the HPC Resources manifest. Launch() method will ensure + // the manifest is in the RUNNING state on successful completion. + dispatchInfo, response, err := m.apiClient.LaunchApi. + Launch(m.authContext(ctx)). + Manifest(*m.hpcResourcesManifest). + Impersonate(impersonatedUser). 
+ Execute() + if err != nil { + ctx.Log().Errorf("Failed to launch Manifest.\n%v\n%v", response, err) + return hpcResources{}, err + } + ctx.Log().Debug(fmt.Sprintf("Launched Manifest with DispatchID %s", dispatchInfo.GetDispatchId())) + + dispatchID := dispatchInfo.GetDispatchId() + owner := "launcher" + + defer m.resourceQueryPostActions(ctx, dispatchID, owner) + + logFileName := "slurm-resources-info" + // HPC resource details will be listed in a log file with name + // 'slurm-resources-info' in YAML format. Use LoadEnvironmentLog() + // method to retrieve the log file. + // + // Because we're using "launch()" instead of "launchAsync()" to get + // the HPC resources, we can expect that the "slurm-resources-info" log + // file containing the SLURM partition info will be available, because + // "launch()" will not return until the "slurm-resources-info" file is + // written. Had we used "launchAsync()", we would have to poll the launcher + // for job completion, but that's tricky, because the monitoring API will + // go through the SlurmCarrier on the launcher side, which expects a job ID. + // The SlurmCarrier will hang for a while waiting for the SLURM job ID to be + // written, which it never will, because SlurmResources only queries SLURM + // to get the partition info and does not create a job, so no job ID is ever + // generated. Eventually it will timeout waiting and return, but that's too + // long of a delay for us to deal with. + resp, _, err := m.apiClient.MonitoringApi. + LoadEnvironmentLog(m.authContext(ctx), owner, dispatchID, logFileName). + Execute() + if err != nil { + ctx.Log().WithError(err).Errorf("failed to retrieve HPC Resource details") + return hpcResources{}, err + } + + // Parse the HPC resources file and extract the details into a + // HpcResourceDetails object using YAML package. + resourcesBytes, err := io.ReadAll(resp) + if err != nil { + ctx.Log().WithError(err).Errorf("failed to read response") + return hpcResources{}, err + } + resources := hpcResources{} + if err = yaml.Unmarshal(resourcesBytes, &resources); err != nil { + ctx.Log().WithError(err).Errorf("failed to parse HPC Resource details") + return hpcResources{}, err + } + m.hpcResourcesToDebugLog(ctx, resources) + return resources, nil +} + +// hpcResourcesToDebugLog puts a summary of the available HPC resources to the debug log. +func (m *dispatcherResourceManager) hpcResourcesToDebugLog( + ctx *actor.Context, resources hpcResources) { + if ctx.Log().Logger.Level != logrus.DebugLevel { + return + } + ctx.Log().Debugf("HPC Resource details: %+v", resources.Partitions) + nodesWithGpu := 0 + gpusFound := 0 + nodesAllocated := 0 + gpusAllocated := 0 + cpusFound := 0 + cpusAllocated := 0 + for _, node := range resources.Nodes { + gpusFound += node.GpuCount + cpusFound += node.CPUCount + if node.GpuCount > 0 { + nodesWithGpu++ + } + if node.Allocated { + nodesAllocated++ + } + gpusAllocated += node.GpuInUseCount + cpusAllocated += node.CPUInUseCount + } + ctx.Log(). + WithField("nodes", len(resources.Nodes)). + WithField("allocated", nodesAllocated). + WithField("nodes with GPU", nodesWithGpu). + WithField("GPUs", gpusFound). + WithField("GPUs allocated", gpusAllocated). + WithField("CPUs", cpusFound). + WithField("CPUs allocated", cpusAllocated). + Debug("Node summary") +} + +// resourceQueryPostActions performs actions to clean up after any dispatch +// completion (either a Slurm resource query, or launched manifest allocation). 
+// We use dispatcher REST API calls to instruct the dispatcher to clean up. +// On success, the Dispatch (if present) is removed from the DB (if present). +// When querying Slurm resource information, the DispatchID is not registered +// with the DB, so we do not log an error if we fail to delete it. +// On any REST failure where we cannot confirm the dispatch has been removed +// by the launcher, we skip any attempt to delete the Dispatch from the DB. +// The Dispatch is left in the DB, for a future cleanup attempt on startup. +func (m *dispatcherResourceManager) resourceQueryPostActions(ctx *actor.Context, + dispatchID string, owner string) { + if m.terminateDispatcherJob(ctx, dispatchID, owner) { + m.removeDispatchEnvironment(ctx, owner, dispatchID) + } +} + +// terminateDispatcherJob terminates the dispatcher job with the given ID. +// Return true to indicate if the DB dispatch should additionally be deleted. +func (m *dispatcherResourceManager) terminateDispatcherJob(ctx *actor.Context, + dispatchID string, owner string) bool { + if dispatchID == "" { + ctx.Log().Warn("Missing dispatchID, so no environment clean-up") + return false + } + var err error + var response *http.Response + if _, response, err = m.apiClient.RunningApi.TerminateRunning(m.authContext(ctx), + owner, dispatchID).Force(true).Execute(); err != nil { + if response == nil || response.StatusCode != 404 { + ctx.Log().WithError(err).Errorf("Failed to terminate job with Dispatch ID %s", + dispatchID) + // We failed to delete, and not 404/notfound so leave in DB. + return false + } + } + ctx.Log().Debug(fmt.Sprintf("Terminated manifest with DispatchID %s", dispatchID)) + return true +} + +// removeDispatchEnvironment uses the dispatcher REST API to remove +// the environment created on the launcher node in support of the +// job with the specified dispatch ID. This prevents stale information +// from accumulating in the dispatcher. Upon success, it additionally +// attempts to remove the dispatchID association (if present) with the allocation +// in the DB. On failure, the attempt to remove the Dispatch +// from the DB is skipped and left for a future cleanup attempt on startup. +// When querying Slurm resource information, the DispatchID is not registered +// with the DB, so we do not log an error if we fail to remove it. +func (m *dispatcherResourceManager) removeDispatchEnvironment( + ctx *actor.Context, owner string, dispatchID string) { + if response, err := m.apiClient.MonitoringApi.DeleteEnvironment(m.authContext(ctx), + owner, dispatchID).Execute(); err != nil { + if response == nil || response.StatusCode != 404 { + ctx.Log().WithError(err).Errorf("Failed to remove environment for Dispatch ID %s", + dispatchID) + // We failed to delete, and not 404/notfound so leave in DB for later retry + return + } + } else { + ctx.Log().Debug(fmt.Sprintf("Deleted environment with DispatchID %s", dispatchID)) + } + count, err := db.DeleteDispatch(context.TODO(), dispatchID) + if err != nil { + ctx.Log().WithError(err).Errorf("Failed to delete DispatchID %s from DB", dispatchID) + } + // On Slurm resource query there may be no Dispatch in the DB, so only log as trace. + ctx.Log().Tracef("Deleted DispatchID %s from DB, count %d", dispatchID, count) +} + +// Sends the manifest to the launcher. 
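+// A minimal usage sketch (error handling simplified; mirrors the
+// StartDispatcherResources path above):
+//
+//     dispatchID, err := m.sendManifestToDispatcher(ctx, manifest, impersonatedUser)
+//     if err == nil {
+//         m.jobWatcher.monitorJob(impersonatedUser, dispatchID, payloadName)
+//     }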
+func (m *dispatcherResourceManager) sendManifestToDispatcher( + ctx *actor.Context, + manifest *launcher.Manifest, + impersonatedUser string) (string, error) { + /* + * "LaunchAsync()" does not wait for the "launcher" to move the job to the "RUNNING" + * state and returns right away while the job is still in the "PENDING" state. If it + * becomes necessary to wait for the job to be in the "RUNNING" state, we can switch + * to using "Launch()". + * + * The "manifest" describes the job to be launched and includes any environment + * variables, mount points, etc., that are needed by the job. + * + * The "impersonatedUser" is the user that we want to run the job as on the cluster. + * Of course, that user must be known to the cluster as either a local Linux user + * (e.g. "/etc/passwd"), LDAP, or some other authentication mechanism. + */ + dispatchInfo, response, err := m.apiClient.LaunchApi. + LaunchAsync(m.authContext(ctx)). + Manifest(*manifest). + Impersonate(impersonatedUser). + Execute() + if err != nil { + httpStatus := "" + if response != nil { + // So we can show the HTTP status code, if available. + httpStatus = fmt.Sprintf("(HTTP status %d)", response.StatusCode) + } + return "", errors.Wrapf(err, "LaunchApi.LaunchAsync() returned an error %s", httpStatus) + } + return dispatchInfo.GetDispatchId(), nil +} + +func (m *dispatcherResourceManager) addTask(ctx *actor.Context, msg sproto.AllocateRequest) { + actors.NotifyOnStop(ctx, msg.TaskActor, sproto.ResourcesReleased{TaskActor: msg.TaskActor}) + + if len(msg.AllocationID) == 0 { + msg.AllocationID = model.AllocationID(uuid.New().String()) + } + if msg.Group == nil { + msg.Group = msg.TaskActor + } + m.getOrCreateGroup(ctx, msg.Group) + if len(msg.Name) == 0 { + msg.Name = "Unnamed-Slurm-Job" + } + + ctx.Log().Infof( + "resources are requested by %s (Allocation ID: %s)", + msg.TaskActor.Address(), msg.AllocationID, + ) + m.reqList.AddTask(&msg) +} + +func (m *dispatcherResourceManager) jobQInfo(rp string) map[model.JobID]*job.RMJobInfo { + var reqs []*sproto.AllocateRequest + for it := m.reqList.iterator(); it.next(); { + if it.value().ResourcePool == rp { + reqs = append(reqs, it.value()) + } + } + return reduceToJobQInfo(reqs) +} + +func (m *dispatcherResourceManager) receiveSetTaskName(ctx *actor.Context, msg sproto.SetTaskName) { + if task, found := m.reqList.GetTaskByHandler(msg.TaskHandler); found { + task.Name = msg.Name + } +} + +func (m *dispatcherResourceManager) assignResources( + ctx *actor.Context, req *sproto.AllocateRequest, +) { + m.slotsUsedPerGroup[m.groups[req.Group]] += req.SlotsNeeded + + rID := sproto.ResourcesID(uuid.NewString()) + allocations := sproto.ResourceList{ + rID: &DispatcherResources{ + id: rID, + req: req, + rm: ctx.Self(), + group: m.groups[req.Group], + defaultRendezvousIface: m.config.ResolveRendezvousNetworkInterface(req.ResourcePool), + defaultProxyIface: m.config.ResolveProxyNetworkInterface(req.ResourcePool), + }, + } + m.addrToResourcesID[req.TaskActor] = rID + m.resourcesIDtoAddr[rID] = req.TaskActor + + assigned := sproto.ResourcesAllocated{ID: req.AllocationID, Resources: allocations} + m.reqList.SetAllocationsRaw(req.TaskActor, &assigned) + req.TaskActor.System().Tell(req.TaskActor, assigned) + + ctx.Log(). + WithField("allocation-id", req.AllocationID). + WithField("task-handler", req.TaskActor.Address()). 
+ Infof("resources assigned") +} + +func (m *dispatcherResourceManager) resourcesReleased(ctx *actor.Context, handler *actor.Ref) { + ctx.Log().Infof("resources are released for %s", handler.Address()) + m.reqList.RemoveTaskByHandler(handler) + + delete(m.resourcesIDtoAddr, m.addrToResourcesID[handler]) + delete(m.addrToResourcesID, handler) + + if req, ok := m.reqList.GetTaskByHandler(handler); ok { + if group := m.groups[handler]; group != nil { + m.slotsUsedPerGroup[group] -= req.SlotsNeeded + } + } +} + +// Used on startup, to queue terminate and delete all dispatches in the DB +// such that we do not get duplicate tasks queued on the system. +func (m *dispatcherResourceManager) killAllActiveDispatches( + ctx *actor.Context, handler *actor.Ref) { + ctx.Log().Infof("Releasing all resources due to master restart") + + // Find the Dispatch IDs associated with the allocation ID. We'll need the + // Dispatch ID to cancel the job on the launcher side. + dispatches, err := db.ListAllDispatches(context.TODO()) + if err != nil { + ctx.Log().WithError(err).Errorf("Failed to retrieve all Dispatches") + return + } + ctx.Log().Debug(fmt.Sprintf("Found %d Dispatches to release", len(dispatches))) + for _, dispatch := range dispatches { + ctx.Log().Debug(fmt.Sprintf("Queuing cleanup of AllocationID %s, DispatchID %s", + dispatch.AllocationID, dispatch.DispatchID)) + ctx.Tell(handler, KillDispatcherResources{ + ResourcesID: dispatch.ResourceID, + AllocationID: dispatch.AllocationID}) + } +} + +func (m *dispatcherResourceManager) getOrCreateGroup( + ctx *actor.Context, + handler *actor.Ref, +) *group { + if g, ok := m.groups[handler]; ok { + return g + } + priority := config.KubernetesDefaultPriority + g := &group{handler: handler, weight: 1, priority: &priority} + m.groups[handler] = g + m.slotsUsedPerGroup[g] = 0 + + if ctx != nil && handler != nil { // ctx is nil only for testing purposes. + actors.NotifyOnStop(ctx, handler, groupActorStopped{}) + } + return g +} + +func (m *dispatcherResourceManager) schedulePendingTasks(ctx *actor.Context) { + for it := m.reqList.iterator(); it.next(); { + req := it.value() + group := m.groups[req.Group] + assigned := m.reqList.GetAllocations(req.TaskActor) + if !assignmentIsScheduled(assigned) { + if maxSlots := group.maxSlots; maxSlots != nil { + if m.slotsUsedPerGroup[group]+req.SlotsNeeded > *maxSlots { + continue + } + } + m.assignResources(ctx, req) + } + } +} + +type ( + // DispatcherResources information. + DispatcherResources struct { + id sproto.ResourcesID + req *sproto.AllocateRequest + rm *actor.Ref + group *group + + defaultRendezvousIface string + defaultProxyIface string + } + + // StartDispatcherResources comment to keep "golint" from complaining. + StartDispatcherResources struct { + AllocationID model.AllocationID + ResourcesID sproto.ResourcesID + TaskActor *actor.Ref + Spec tasks.TaskSpec + } + + // KillDispatcherResources tells the dispatcher RM to clean up the resources with the given + // resources ID. + KillDispatcherResources struct { + ResourcesID sproto.ResourcesID + AllocationID model.AllocationID + } + + // DispatchStateChange notifies the dispatcher that the give dispatch has changed state. + DispatchStateChange struct { + DispatchID string + State launcher.DispatchState + } + + // DispatchExited notifies the dispatcher that the give dispatch exited. + DispatchExited struct { + DispatchID string + ExitCode exitCode + Message string + } +) + +// Summary summarizes a container allocation. 
+func (r DispatcherResources) Summary() sproto.ResourcesSummary { + return sproto.ResourcesSummary{ + ResourcesID: r.id, + ResourcesType: sproto.ResourcesTypeSlurmJob, + AllocationID: r.req.AllocationID, + AgentDevices: map[aproto.ID][]device.Device{}, + ContainerID: nil, + } +} + +// Start notifies the pods actor that it should launch a pod for the provided task spec. +func (r DispatcherResources) Start( + ctx *actor.Context, _ logger.Context, spec tasks.TaskSpec, rri sproto.ResourcesRuntimeInfo, +) error { + spec.ResourcesID = string(r.id) + spec.AllocationID = string(r.req.AllocationID) + spec.AllocationSessionToken = rri.Token + spec.TaskID = string(r.req.TaskID) + spec.UseHostMode = rri.IsMultiAgent + spec.ResourcesConfig.SetPriority(r.group.priority) + if spec.LoggingFields == nil { + spec.LoggingFields = map[string]string{} + } + spec.LoggingFields["allocation_id"] = spec.AllocationID + spec.LoggingFields["task_id"] = spec.TaskID + spec.ExtraEnvVars[sproto.ResourcesTypeEnvVar] = string(sproto.ResourcesTypeSlurmJob) + spec.ExtraEnvVars[sproto.SlurmRendezvousIfaceEnvVar] = r.defaultRendezvousIface + spec.ExtraEnvVars[sproto.SlurmProxyIfaceEnvVar] = r.defaultProxyIface + ctx.Tell(r.rm, StartDispatcherResources{ + AllocationID: r.req.AllocationID, + ResourcesID: r.id, + TaskActor: r.req.TaskActor, + Spec: spec, + }) + return nil +} + +// Kill notifies the pods actor that it should stop the pod. +func (r DispatcherResources) Kill(ctx *actor.Context, _ logger.Context) { + ctx.Tell(r.rm, KillDispatcherResources{ResourcesID: r.id, AllocationID: r.req.AllocationID}) +} + +// CreateSlurmResourcesManifest creates a Manifest for SlurmResources Carrier. +// This Manifest is used to retrieve information about resources available on the HPC system. +func createSlurmResourcesManifest() *launcher.Manifest { + payload := launcher.NewPayloadWithDefaults() + payload.SetName("DAI-HPC-Resources") + payload.SetId("com.cray.analytics.capsules.hpc.resources") + payload.SetVersion("latest") + payload.SetCarriers([]string{"com.cray.analytics.capsules.carriers.hpc.slurm.SlurmResources"}) + + // Create payload launch parameters + launchParameters := launcher.NewLaunchParameters() + launchParameters.SetMode("interactive") + payload.SetLaunchParameters(*launchParameters) + + clientMetadata := launcher.NewClientMetadataWithDefaults() + clientMetadata.SetName("DAI-Slurm-Resources") + + // Create & populate the manifest + manifest := *launcher.NewManifest("v1", *clientMetadata) + manifest.SetPayloads([]launcher.Payload{*payload}) + + return &manifest +} + +// If an auth_file was specified, load the content and return it to enable authorization +// with the launcher. If the auth_file is configured, but does not exist we panic. +func loadAuthToken(config *config.DispatcherResourceManagerConfig) string { + if len(config.LauncherAuthFile) > 0 { + authToken, err := os.ReadFile(config.LauncherAuthFile) + if err != nil { + panic("Configuration resource_manager.auth_file not readable: " + config.LauncherAuthFile) + } + return string(authToken) + } + return "" +} + +// schedulingStateFromDispatchState returns SchedulingState from DispatchState representation. +func schedulingStateFromDispatchState(state launcher.DispatchState) job.SchedulingState { + switch state { + case launcher.PENDING: + return job.SchedulingStateQueued + default: + return job.SchedulingStateScheduled + } +} + +// resourcesStateFromDispatchState returns ResourcesState from DispatchState representation. 
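+// For example, launcher.PENDING maps to sproto.Starting, and TERMINATING is
+// still reported as sproto.Running until a terminal COMPLETED or FAILED state
+// arrives.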
+func resourcesStateFromDispatchState(state launcher.DispatchState) sproto.ResourcesState { + switch state { + case launcher.PENDING: + return sproto.Starting + case launcher.RUNNING: + return sproto.Running + case launcher.TERMINATING: + return sproto.Running + case launcher.COMPLETED: + return sproto.Terminated + case launcher.FAILED: + return sproto.Terminated + default: + return sproto.Unknown + } +} diff --git a/master/internal/rm/dispatcher_resource_manager_test.go b/master/internal/rm/dispatcher_resource_manager_test.go new file mode 100644 index 000000000000..65616606b78b --- /dev/null +++ b/master/internal/rm/dispatcher_resource_manager_test.go @@ -0,0 +1,144 @@ +package resourcemanagers + +import ( + "testing" + + launcher "github.hpe.com/hpe/hpc-ard-launcher-go/launcher" + + "github.com/determined-ai/determined/master/internal/config" + "github.com/determined-ai/determined/master/internal/sproto" + "github.com/determined-ai/determined/master/pkg/actor" + "github.com/determined-ai/determined/master/pkg/model" +) + +func Test_dispatcherResourceManager_selectDefaultPools(t *testing.T) { + type fields struct { + config *config.DispatcherResourceManagerConfig + apiClient *launcher.APIClient + hpcResourcesManifest *launcher.Manifest + reqList *taskList + groups map[*actor.Ref]*group + addrToResourcesID map[*actor.Ref]sproto.ResourcesID + resourcesIDToAddr map[sproto.ResourcesID]*actor.Ref + slotsUsedPerGroup map[*group]int + dispatchIDToAllocationID map[string]model.AllocationID + masterTLSConfig model.TLSClientConfig + loggingConfig model.LoggingConfig + jobWatcher *launcherMonitor + authToken string + resourceDetails hpcResourceDetailsCache + DefaultComputePoolPartition string + DefaultAuxPoolPartition string + } + type args struct { + ctx *actor.Context + hpcResourceDetails []hpcPartitionDetails + } + + p1 := hpcPartitionDetails{ + TotalAvailableNodes: 0, + PartitionName: "worf", + IsDefault: true, + TotalAllocatedNodes: 0, + TotalAvailableGpuSlots: 0, + TotalNodes: 0, + TotalGpuSlots: 0, + } + p2 := hpcPartitionDetails{ + TotalAvailableNodes: 0, + PartitionName: "data", + IsDefault: false, + TotalAllocatedNodes: 0, + TotalAvailableGpuSlots: 0, + TotalNodes: 0, + TotalGpuSlots: 1, + } + p3 := hpcPartitionDetails{ + TotalAvailableNodes: 0, + PartitionName: "picard", + IsDefault: false, + TotalAllocatedNodes: 0, + TotalAvailableGpuSlots: 0, + TotalNodes: 0, + TotalGpuSlots: 0, + } + hpc := []hpcPartitionDetails{ + p1, + } + hpc2 := []hpcPartitionDetails{ + p1, p2, + } + hpc3 := []hpcPartitionDetails{ + p1, p2, p3, + } + // One partition, no GPUs + hpc4 := []hpcPartitionDetails{ + p3, + } + + tests := []struct { + name string + fields fields + args args + wantCompute string + wantAux string + }{ + { + name: "One partition test", + fields: fields{}, + args: args{hpcResourceDetails: hpc}, + wantCompute: "worf", + wantAux: "worf", + }, + { + name: "Two partition test", + fields: fields{}, + args: args{hpcResourceDetails: hpc2}, + wantCompute: "data", + wantAux: "worf", + }, + { + name: "Three partition test", + fields: fields{}, + args: args{hpcResourceDetails: hpc3}, + wantCompute: "data", + wantAux: "worf", + }, + { + name: "No GPU partition test", + fields: fields{}, + args: args{hpcResourceDetails: hpc4}, + wantCompute: "picard", + wantAux: "picard", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := &dispatcherResourceManager{ + config: tt.fields.config, + apiClient: tt.fields.apiClient, + hpcResourcesManifest: tt.fields.hpcResourcesManifest, + 
reqList: tt.fields.reqList, + groups: tt.fields.groups, + addrToResourcesID: tt.fields.addrToResourcesID, + resourcesIDtoAddr: tt.fields.resourcesIDToAddr, + slotsUsedPerGroup: tt.fields.slotsUsedPerGroup, + dispatchIDToAllocationID: tt.fields.dispatchIDToAllocationID, + masterTLSConfig: tt.fields.masterTLSConfig, + loggingConfig: tt.fields.loggingConfig, + jobWatcher: tt.fields.jobWatcher, + authToken: tt.fields.authToken, + resourceDetails: tt.fields.resourceDetails, + defaultComputePoolPartition: tt.fields.DefaultComputePoolPartition, + defaultAuxPoolPartition: tt.fields.DefaultAuxPoolPartition, + } + compute, aux := m.selectDefaultPools(tt.args.ctx, tt.args.hpcResourceDetails) + if compute != tt.wantCompute { + t.Errorf("selectDefaultPools() compute got = %v, want %v", compute, tt.wantCompute) + } + if aux != tt.wantAux { + t.Errorf("selectDefaultPools() aux got = %v, want %v", aux, tt.wantAux) + } + }) + } +} diff --git a/master/internal/sproto/globals.go b/master/internal/sproto/globals.go index 902980b5f8bf..f2156b6b40f3 100644 --- a/master/internal/sproto/globals.go +++ b/master/internal/sproto/globals.go @@ -9,6 +9,8 @@ var ( AgentRMAddr = actor.Addr("agentRM") // K8sRMAddr is the actor address of the k8s resource manager. K8sRMAddr = actor.Addr("kubernetesRM") + // DispatcherRMAddr is the actor address of the dispatcher. + DispatcherRMAddr = actor.Addr("SlurmRM") // AgentsAddr is the actor address of the agents. AgentsAddr = actor.Addr("agents") // PodsAddr is the actor address of the pods. diff --git a/master/pkg/tasks/dispatcher_task.go b/master/pkg/tasks/dispatcher_task.go new file mode 100644 index 000000000000..ecec79476717 --- /dev/null +++ b/master/pkg/tasks/dispatcher_task.go @@ -0,0 +1,517 @@ +package tasks + +import ( + "archive/tar" + "encoding/base64" + "fmt" + "path/filepath" + "regexp" + "strconv" + "strings" + + "github.com/docker/docker/api/types/mount" + "github.com/sirupsen/logrus" + launcher "github.hpe.com/hpe/hpc-ard-launcher-go/launcher" + + "github.com/determined-ai/determined/master/pkg/archive" + "github.com/determined-ai/determined/master/pkg/cproto" + "github.com/determined-ai/determined/master/pkg/device" + "github.com/determined-ai/determined/master/pkg/etc" + "github.com/determined-ai/determined/master/pkg/model" +) + +const ( + trueValue = "true" + falseValue = "false" + // dispatcherEntrypointScriptResource is the script to handle container initialization + // before transferring to the defined entrypoint script. + dispatcherEntrypointScriptResource = "dispatcher-wrapper.sh" + dispatcherEntrypointScriptMode = 0700 + + // Content managed by dispatcher-wrapper.sh script for container-local volumes. + determinedLocalFs = "/determined_local_fs" + // Location of container-local temporary directory. + containerTmpDeterminedDir = "/determined/" +) + +// The "launcher" is very sensitive when it comes to the payload name. There +// are certain characters, such as parenthesis, commas, spaces, etc, that will +// cause the "launcher" to bomb out during the processing of the manifest. +// Therefore, we'll stick to only alpha-numberic characters, plus dashes and +// underscores. This regular expression is used to filter out all characters +// that are NOT alpha-numberic, dashes, or underscores from the task +// description that we use to construct the payload name. 
Presently, the task
+// description looks something like "exp-118-trial-104", which contains all
+// legit characters, but we must protect ourselves from any changes in the
+// future which may cause this format to change and introduce, say, parentheses
+// or spaces.
+var payloadNameCompiledRegEx = regexp.MustCompile(`[^a-zA-Z0-9\-_]+`)
+
+// ToDispatcherManifest creates the manifest that will ultimately be sent to the launcher.
+// Returns:
+//	Manifest, launchingUserName, PayloadName, err
+//
+// Note: Cannot pass "req *sproto.AllocateRequest" as an argument, as it requires
+// import of "github.com/determined-ai/determined/master/internal/sproto", which
+// results in an "import cycle not allowed" error.
+func (t *TaskSpec) ToDispatcherManifest(
+	masterHost string,
+	masterPort int,
+	certificateName string,
+	numSlots int,
+	slotType device.Type,
+	slurmPartition string,
+	tresSupported bool) (*launcher.Manifest, string, string, error) {
+	/*
+	 * The user that the "launcher" is going to run the Determined task
+	 * container as. Eventually, the impersonated user will likely come from the
+	 * UID and GID that's embedded in the authentication token. But, since we're
+	 * not performing authentication currently, pending HAL-2746, we'll just let
+	 * the impersonated user be accepted by the "launcher" without worrying about
+	 * the lack of security.
+	 */
+	impersonatedUser := ""
+
+	/*
+	 * The "AgentUserGroup.User" will be the username of the user that we will be
+	 * launching the Determined task container as. In launcher lingo, this will
+	 * be the "impersonated" user. There needs to be a mapping of the Determined
+	 * user to the username that we wish to launch the Determined task container
+	 * as. This mapping can be done via the following command, for example:
+	 *
+	 *    det user link-with-agent-user --agent-uid 504 \
+	 *        --agent-gid 20 \
+	 *        --agent-user crayuser \
+	 *        --agent-group staff \
+	 *        determined
+	 *
+	 * where "determined" is the name of the Determined user and "crayuser" is
+	 * the user we're going to be impersonating.
+	 *
+	 * Note that the command above needs to be run as a privileged Determined
+	 * user, such as the "admin" user, so you may need to switch users in order
+	 * to execute the command. For example,
+	 *
+	 *    det user login admin
+	 *
+	 */
+	if t.AgentUserGroup != nil {
+		impersonatedUser = t.AgentUserGroup.User
+	}
+
+	payloadName := getPayloadName(t)
+
+	// Create a payload
+	payload := launcher.NewPayloadWithDefaults()
+
+	payload.SetName(payloadName)
+	payload.SetId("com.cray.analytics.capsules.generic.container")
+	payload.SetVersion("latest")
+
+	payload.SetCarriers([]string{
+		"com.cray.analytics.capsules.carriers.hpc.slurm.SingularityOverSlurm",
+	})
+
+	// Create payload launch parameters
+	launchParameters := launcher.NewLaunchParameters()
+	launchParameters.SetMode("batch")
+
+	mounts, userWantsDirMountedOnTmp := getDataVolumes(t.Mounts)
+
+	// Use the workDir as-is if it is user-specified. If the workDir is the
+	// default (/run/determined/workdir), it does not exist on the launcher node
+	// and so causes an error log; in that case dispatcher-wrapper.sh sets the
+	// working directory via DET_WORKDIR, so we use /var/tmp here to eliminate
+	// spurious error logs. We avoid using /tmp here because dispatcher-wrapper.sh
+	// by default relinks /tmp to a container-private directory, and if /tmp is in
+	// use we fail with EBUSY.
+ workDir := t.WorkDir + if workDir == DefaultWorkDir { + workDir = "/var/tmp" + } + + enableNvidia := falseValue + if slotType == device.CUDA { + enableNvidia = trueValue + } + + launchParameters.SetConfiguration(map[string]string{ + "workingDir": workDir, + "enableNvidia": enableNvidia, // triggers 'singularity run --nv ...' + "enableWritableTmpFs": trueValue, // Make container filesystem writable (for links in /) + }) + if slurmPartition != "" { + launchParameters.GetConfiguration()["partition"] = slurmPartition + } + + // Determined generates tar archives including initialization, garbage collection, + // and security configuration and then maps them into generic containers when + // they are launched. The equivalent capability is provided by the launcher + // via the --custom Archive capsules argument. Encode the archives + // into a format that can be set as custom launch arguments. + encodedArchiveParams, err := encodeArchiveParameters( + dispatcherArchive(t.AgentUserGroup, + generateRunDeterminedLinkNames(t.Archives())), t.Archives()) + if err != nil { + return nil, "", "", err + } + var slurmArgs []string + slurmArgs = append(slurmArgs, t.TaskContainerDefaults.Slurm...) + slurmArgs = append(slurmArgs, t.Environment.Slurm()...) + logrus.Debugf("Custom slurm arguments: %s", slurmArgs) + encodedArchiveParams["slurmArgs"] = slurmArgs + errList := model.ValidateSlurm(slurmArgs) + if len(errList) > 0 { + logrus.WithError(errList[0]).Error("Forbidden slurm option specified") + return nil, "", "", errList[0] + } + launchParameters.SetCustom(encodedArchiveParams) + + // Add entrypoint command as argument + wrappedEntryPoint := append( + []string{determinedLocalFs + "/" + dispatcherEntrypointScriptResource}, + t.Entrypoint...) + launchParameters.SetArguments(wrappedEntryPoint) + + // We just pass through the image reference here. It may be any scheme that + // singularity supports including (docker, library, file path, etc). If + // a docker reference without scheme (the default), the launcher will attempt + // to match to a locally cached image. + launchParameters.SetImages(map[string]string{ + "default": t.Environment.Image().For(slotType), + }) + + // Add some data volumes + launchParameters.SetData(mounts) + + envVars, err := getEnvVarsForLauncherManifest( + t, masterHost, masterPort, certificateName, userWantsDirMountedOnTmp, slotType) + if err != nil { + return nil, "", "", err + } + + launchParameters.SetEnvironment(envVars) + + payload.SetLaunchParameters(*launchParameters) + + // Create payload resource requirements + resources := launcher.NewResourceRequirementsWithDefaults() + + // One task per node. + if tresSupported || numSlots == 0 { + resources.SetInstances(map[string]int32{"per-node": 1}) + } else { + // When tresSupported==false then we can't use --gpus in slurm, so map the total nodes to + // the total GPUs which will cause launcher to map SetGpus below into --gres:gpus. + resources.SetInstances(map[string]int32{ + "nodes": int32(numSlots), + "total": int32(numSlots)}) + } + // Set the required number of GPUs if the device type is CUDA (Nvidia) or RCOM (AMD). 
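+	// For illustration only (the exact Slurm options are chosen by the launcher, not here):
+	// a request for 4 CUDA slots ends up as gpus{"total": 4}, which the launcher is expected
+	// to surface as --gpus=4 when TRES is supported, or as a --gres-style GPU request when it
+	// is not; a 4-slot request with slot_type cpu instead becomes cores{"total": 4}.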
+ if slotType == device.CUDA || slotType == device.ROCM { + resources.SetGpus(map[string]int32{"total": int32(numSlots)}) + } else { + resources.SetCores(map[string]float32{"total": float32(numSlots)}) + } + + payload.SetResourceRequirements(*resources) + + clientMetadata := launcher.NewClientMetadataWithDefaults() + clientMetadata.SetName("det") + + // Create & populate the manifest + manifest := *launcher.NewManifest("v1", *clientMetadata) // Manifest | The manifest to launch + manifest.SetPayloads([]launcher.Payload{*payload}) + // manifest.SetManifestVersion("latest") //? + + return &manifest, impersonatedUser, payloadName, err +} + +// Return true if the archive specified should be treated +// as per-process and not a shared volume for all processes. +// Unless configured in this list, all items are shared. It +// saves additional softlinks if we properly identify read-only +// scripts below, but it does not cause breakage if we miss one. +func makeLocalVolume(archiveItem cproto.RunArchive) bool { + // We cannot clone the ssh config because sshd will not process softlinks + if archiveItem.Archive.ContainsFilePrefix(sshDir) { + return false + } + // The helper scripts are read-only, so leave that archive as shared + if archiveItem.Archive.ContainsFilePrefix(etc.TaskLoggingSetupScriptResource) { + return false + } + // The helper scripts are read-only, so leave that archive as shared + if archiveItem.Archive.ContainsFilePrefix( + filepath.Join(runDir, etc.CommandEntrypointResource)) { + return false + } + // The helper scripts are read-only, so leave that archive as shared + if archiveItem.Archive.ContainsFilePrefix( + filepath.Join(runDir, etc.ShellEntrypointResource)) { + return false + } + // We create the run dir (/run/determined) to contain links + if archiveItem.Path == runDir || archiveItem.Path == DefaultWorkDir { + return true + } + // If the archive maps content under /run/determined, make a local volume + if archiveItem.Archive.ContainsFilePrefix(runDir) || + archiveItem.Archive.ContainsFilePrefix(DefaultWorkDir) { + return true + } + return false +} + +// Return the archives in an argument format for launcher custom Archive args. +// Encoding the files to Base64 string arguments. +func encodeArchiveParameters( + dispatcherArchive cproto.RunArchive, + archives []cproto.RunArchive) (map[string][]string, error) { + // Insert the dispatcherArchive into the list for processing (first in list) + archives = append([]cproto.RunArchive{dispatcherArchive}, archives...) + archiveStrings := make([]string, len(archives)) + + for idx, archiveItem := range archives { + runDirPrefix := "" + // Other than the dispatcherArchive (first in list), if the archive provides files + // that should be local per-container instance copies, redirect to the /dispatcher + // directory for processing during container initialization. + if idx != 0 && makeLocalVolume(archiveItem) { + runDirPrefix = determinedLocalFs + } + bytesString, err := archive.ToRelocatedTarGz( + runDirPrefix+archiveItem.Path+"/", + archiveItem.Archive) + if err != nil { + logrus.Error("Failure to create TarGz Archive", err) + return nil, err + } + archiveStrings[idx] = base64.StdEncoding.EncodeToString(bytesString) + } + + customArgs := make(map[string][]string) + customArgs["Archives"] = archiveStrings + return customArgs, nil +} + +// Gets the environment variables that are to be added to the Launcher's manifest. 
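+// For illustration (host, port, and values are hypothetical): a task whose master is at
+// 10.0.0.5:8081 running on a CUDA slot would get entries such as DET_MASTER=10.0.0.5:8081,
+// DET_SLOT_TYPE=cuda, DET_AGENT_ID=launcher, and SLURM_KILL_BAD_EXIT=1 in the returned map.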
+func getEnvVarsForLauncherManifest( + taskSpec *TaskSpec, masterHost string, masterPort int, certificateName string, + tmpMount bool, slotType device.Type, +) (map[string]string, error) { + // Hash map containing the environment variables. + m := make(map[string]string) + + // These represent the environment variables that are set by Determined AI. + for k, v := range taskSpec.EnvVars() { + m[k] = v + } + + // For some reason, getting the user-defined environment variable requires a device type. + // Merely copying the same code that's in "ToDockerSpec()" without fully understanding + // the connection between the deviceType and the user-defined environment variables. + deviceType := device.CPU + + if len(taskSpec.Devices) > 0 { + deviceType = taskSpec.Devices[0].Type + } + + // The user-defined environment variables, if any. These come from the experiment's + // YAML file. For example, + // + // environment: + // image: "environment:cuda-11.2-tf-2.5-gpu-0.17.7.sif" + // environment_variables: + // - DETECTRON2_DATASETS=/mnt/dtrain-fsx/detectron2 + // - MY_ENV_VAR1=abc + // - MY_ENV_VAR2=xyz + envVars := taskSpec.Environment.EnvironmentVariables().For(deviceType) + + // Add each user-defined environment variable to the map. + for _, s := range envVars { + tokens := strings.Split(s, "=") + + if len(tokens) > 1 { + m[tokens[0]] = tokens[1] + } else { + return nil, fmt.Errorf("invalid user-defined environment variable '%s'", s) + } + } + + // These environment variables are required in "harness/determined/_info.py". If + // they are not set, then task container will fail. + m["DET_MASTER"] = fmt.Sprintf("%s:%d", masterHost, masterPort) + m["DET_MASTER_HOST"] = masterHost + m["DET_MASTER_IP"] = masterHost + m["DET_MASTER_PORT"] = fmt.Sprintf("%d", masterPort) + m["DET_CONTAINER_ID"] = taskSpec.ContainerID + m["DET_CLUSTER_ID"] = taskSpec.ClusterID + // On non-zero exit of any component/step of the sbatch job, terminate with an error + m["SLURM_KILL_BAD_EXIT"] = "1" + + // Some in-container setup in slurm needs to know the slot type to set other envvars correctly. + m["DET_SLOT_TYPE"] = string(slotType) + + // The "entrypoint.sh" script that's mounted by the Singularity task container + // will set the DET_SLOT_IDS environment variable when it sees that DET_AGENT_ID is + // set to "launcher". So, if you change the value here, you also need to make the + // corresponding change to "entrypoint.sh". + m["DET_AGENT_ID"] = "launcher" + + // The "master/internal/resourcemanagers/kubernetes/spec.go" checks if the + // certificate name is set before assigning it to an environment variable, so + // we're duplicating that same behavior here. + if certificateName != "" { + m["DET_MASTER_CERT_NAME"] = certificateName + } + + // If the user has not configured a bind mount of /tmp trigger + // dispatcher-wrapper.sh to make it local to the container. 
+	if !tmpMount {
+		m["DET_CONTAINER_LOCAL_TMP"] = "1"
+	}
+
+	if taskSpec.Environment.RegistryAuth() != nil {
+		m["SINGULARITY_DOCKER_USERNAME"] = taskSpec.Environment.RegistryAuth().Username
+		m["SINGULARITY_DOCKER_PASSWORD"] = taskSpec.Environment.RegistryAuth().Password
+		if len(taskSpec.Environment.RegistryAuth().ServerAddress) > 0 {
+			logrus.Warningf(
+				"NOT SUPPORTED: environment.registry_auth.serveraddress: %s ",
+				taskSpec.Environment.RegistryAuth().ServerAddress)
+		}
+		if len(taskSpec.Environment.RegistryAuth().Email) > 0 {
+			logrus.Warningf(
+				"NOT SUPPORTED: environment.registry_auth.email: %s ",
+				taskSpec.Environment.RegistryAuth().Email)
+		}
+	}
+
+	if taskSpec.Environment.ForcePullImage() {
+		m["SINGULARITY_DISABLE_CACHE"] = trueValue
+	}
+
+	if len(taskSpec.Environment.AddCapabilities()) > 0 {
+		m["SINGULARITY_ADD_CAPS"] = strings.Join(taskSpec.Environment.AddCapabilities(), ",")
+	}
+
+	if len(taskSpec.Environment.DropCapabilities()) > 0 {
+		m["SINGULARITY_DROP_CAPS"] = strings.Join(taskSpec.Environment.DropCapabilities(), ",")
+	}
+
+	// Do not auto-mount the host /tmp within the container
+	m["SINGULARITY_NO_MOUNT"] = "tmp"
+
+	return m, nil
+}
+
+// getPayloadName assigns the name for the payload we're going to send to the launcher.
+// We give the payload a name that can be associated with the experiment being run, which
+// makes it easier to correlate what's in the launcher's log file with what's in the
+// Determined log file when debugging problems.
+//
+// For example, if I'm running the "determined-ee/examples/computer_vision/cifar10_pytorch"
+// experiment and the resulting task description is "exp-118-trial-104", then the payload
+// name would be:
+//
+//	ai_exp-118-trial-104
+//
+// The launcher, or whatever is processing the manifest sent to the launcher, doesn't
+// like certain characters in the name, such as spaces, colons, or commas.
+func getPayloadName(taskSpec *TaskSpec) string {
+	payloadName := "ai"
+
+	// Remove all characters that are not alphanumeric, dashes, or underscores.
+	experimentDescription := payloadNameCompiledRegEx.ReplaceAllString(taskSpec.Description, "")
+
+	if len(experimentDescription) > 0 {
+		payloadName += "_" + experimentDescription
+	}
+
+	return payloadName
+}
+
+// Provide all task mount points as data volumes, and return true if there is a bind mount
+// for /tmp. Launcher requires that a Data object has a name; source, target & read-only are
+// all that matter to Singularity.
+func getDataVolumes(mounts []mount.Mount) ([]launcher.Data, bool) {
+	volumes := []launcher.Data{}
+	userWantsDirMountedOnTmp := false
+
+	for i, mount := range mounts {
+		var volume = *launcher.NewData()
+		volume.SetName("ds" + strconv.Itoa(i))
+		volume.SetSource(mount.Source)
+		volume.SetTarget(mount.Target)
+		volume.SetReadOnly(mount.ReadOnly)
+		volumes = append(volumes, volume)
+		if mount.Target == "/tmp" {
+			userWantsDirMountedOnTmp = true
+		}
+	}
+
+	return volumes, userWantsDirMountedOnTmp
+}
+
+// Create a softlink archive entry for the specified file name in the
+// '/run/determined' directory to the local container temp version.
+func getRunSubdirLink(aug *model.AgentUserGroup, name string) archive.Item {
+	return aug.OwnedArchiveItem(runDir+"/"+name,
+		[]byte(containerTmpDeterminedDir+name), 0700, tar.TypeSymlink)
+}
+
+// Return any paths that need to be created within /run/determined
+// for unshared directories and files.
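+// For illustration only (file names are hypothetical): an archive rooted at /run/determined
+// whose items include workdir/startup-hook.sh and a top-level setup.sh would yield the link
+// names "workdir" and "setup.sh", provided the archive qualifies as a local volume per
+// makeLocalVolume above.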
+func generateRunDeterminedLinkNames( + archives []cproto.RunArchive) []string { + // Use a map as a set to avoid duplicates + linksSet := make(map[string]bool) + + for _, archive := range archives { + // If archive will be in a local volume, determine the required links + if makeLocalVolume(archive) { + for _, archiveItem := range archive.Archive { + filePath := filepath.Join(archive.Path, archiveItem.Path) + // Not the toplevel runDir, but is under it + if strings.HasPrefix(filePath, runDir) && filePath != runDir { + contained := strings.TrimPrefix(strings.TrimPrefix(filePath, runDir), "/") + // If not a file, then extract the directory name + if filepath.Base(contained) != contained { + dir, _ := filepath.Split(contained) + contained = filepath.Dir(dir) + } + linksSet[contained] = true + } + } + } + } + + // Conver the map keys to the list of link names + linkNames := []string{} + for k := range linksSet { + linkNames = append(linkNames, k) + } + return linkNames +} + +// Archive with dispatcher wrapper entrypoint script, /run/determined directory, +// and links for each entry under /run/determined for unshared files/directories. +func dispatcherArchive(aug *model.AgentUserGroup, linksNeeded []string) cproto.RunArchive { + dispatherArchive := archive.Archive{ + // Add the dispatcher wrapper script + aug.OwnedArchiveItem( + determinedLocalFs+"/"+dispatcherEntrypointScriptResource, + etc.MustStaticFile(dispatcherEntrypointScriptResource), + dispatcherEntrypointScriptMode, + tar.TypeReg, + ), + aug.OwnedArchiveItem(runDir, nil, 0700, tar.TypeDir), + } + + // Create and add each link + for _, linkName := range linksNeeded { + dispatherArchive = append(dispatherArchive, getRunSubdirLink(aug, linkName)) + logrus.Tracef("Created link for %s", linkName) + } + + return wrapArchive(dispatherArchive, "/") +} diff --git a/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.down.sql b/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.down.sql new file mode 100644 index 000000000000..d9d12efe812b --- /dev/null +++ b/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.down.sql @@ -0,0 +1 @@ +DROP TABLE resourcemanagers_dispatcher_dispatches; diff --git a/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.up.sql b/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.up.sql new file mode 100644 index 000000000000..27091b9a73b2 --- /dev/null +++ b/master/static/migrations/20220628065553_dispatcher-state-persistence.tx.up.sql @@ -0,0 +1,5 @@ +CREATE TABLE resourcemanagers_dispatcher_dispatches ( + dispatch_id text PRIMARY KEY, + resource_id text NOT NULL REFERENCES allocation_resources(resource_id) ON DELETE CASCADE NOT NULL, + allocation_id text NOT NULL +); diff --git a/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.down.sql b/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.down.sql new file mode 100644 index 000000000000..117b875e0ae0 --- /dev/null +++ b/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.down.sql @@ -0,0 +1,3 @@ +ALTER TABLE resourcemanagers_dispatcher_dispatches + -- Used to cancel the job, since it must be the original user that cancels it. 
+    DROP COLUMN impersonated_user;
diff --git a/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.up.sql b/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.up.sql
new file mode 100644
index 000000000000..1efd05567e9d
--- /dev/null
+++ b/master/static/migrations/20220628070126_add-original-users-to-dispatches.tx.up.sql
@@ -0,0 +1,3 @@
+ALTER TABLE resourcemanagers_dispatcher_dispatches
+    -- Used to cancel the job, since it must be the original user that cancels it.
+    ADD COLUMN impersonated_user text NOT NULL;
diff --git a/master/static/srv/dispatcher-wrapper.sh b/master/static/srv/dispatcher-wrapper.sh
new file mode 100644
index 000000000000..864203a3fb1b
--- /dev/null
+++ b/master/static/srv/dispatcher-wrapper.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+# Usage:
+#   dispatcher-wrapper.sh {realEntryPointArgs}...
+#
+# This is the wrapper script added around the intended determined
+# entrypoint script to provide dispatcher-specific initialization
+# for singularity. In particular, it processes the /determined_local_fs volume
+# and clones it under /determined_local_fs/procs/# for this particular process ($SLURM_PROCID).
+# It then adds softlinks for each subdirectory to redirect it
+# (via $LOCALTMP/determined/xx) -> /determined_local_fs/procs/#/run/determined/xx
+#
+# The links from /run/determined are provided by the DAI master dispatcher RM
+# via softlinks in the archives provided to the dispatcher and expanded in-place,
+# so this script only needs to handle the cloning of the file system per process
+# and setting up links from $LOCALTMP/determined/xx to the local copy
+# in the /determined_local_fs/procs/# tree.
+#
+# This is additionally a place for all common behavior specific to SLURM/Singularity
+# which addresses:
+#   - DET_SLOT_IDS inherited from SLURM-provided CUDA_VISIBLE_DEVICES
+#   - DET_UNIQUE_PORT_OFFSET inherited from SLURM-provided least(CUDA_VISIBLE_DEVICES)
+
+# Fail on unexpected non-zero exit statuses.
+set -e
+
+# Controls debug logging for this method
+DEBUG=0
+
+
+# When the task container is invoked via SLURM, we have
+# to set the slot IDs from the Slurm-provided variable.
+if [ "$DET_RESOURCES_TYPE" == "slurm-job" ]; then
+    # One case for each device.Type in the Determined master source supported by slurm.
+    case $DET_SLOT_TYPE in
+        "cuda")
+            export DET_SLOT_IDS="[${CUDA_VISIBLE_DEVICES}]"
+            export DET_UNIQUE_PORT_OFFSET=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f1)
+            export DET_UNIQUE_PORT_OFFSET=${DET_UNIQUE_PORT_OFFSET:=0}
+
+            if [ ! -z "$CUDA_VISIBLE_DEVICES" ]; then
+                # Test if "nvidia-smi" exists in the PATH before trying to invoke it.
+                if type nvidia-smi > /dev/null 2>&1 ; then
+                    # For Nvidia GPUs, the slot IDs are the device index. Replace the
+                    # newline characters with commas and enclose in square brackets.
+                    # But only include GPUs that are in the CUDA_VISIBLE_DEVICES=0,1,...
+                    VISIBLE_SLOTS="$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -z 's/\n/,/g;s/,$/\n/')"
+                    for device in ${CUDA_VISIBLE_DEVICES//,/ } ; do
+                        if [[ ! "$VISIBLE_SLOTS" == *"$device"* ]]; then
+                            echo "WARNING: nvidia-smi reports visible CUDA devices as ${VISIBLE_SLOTS} but does not contain ${device}. May be unable to perform CUDA operations." 1>&2
+                        fi
+                    done
+
+                else
+                    echo "WARNING: nvidia-smi not found. May be unable to perform CUDA operations." 1>&2
+                fi
+            else
+                # If CUDA_VISIBLE_DEVICES is not set, then we default DET_SLOT_IDS the same way
+                # that a Determined agents deployment would, which should indicate to Determined
+                # to just use the CPU.
+                export DET_SLOT_IDS="[0]"
+            fi
+            ;;
+
+        "cpu")
+            # For CPU only training, the "slot" we get is just the CPU, but it needs to be set.
+            export DET_SLOT_IDS="[0]"
+            export DET_UNIQUE_PORT_OFFSET=0
+            ;;
+
+        *)
+            echo "ERROR: unsupported slot type: ${DET_SLOT_TYPE}"
+            exit 1
+            ;;
+    esac
+fi
+
+
+# Debug log method
+# Args: {Level} {Message}...
+log() {
+    if [ $DEBUG == 1 ]; then
+        echo -e "$*" >&2
+    fi
+}
+
+# Container-local directory to host determined directory
+# With --writable-tmpfs option / is writable by the user
+# and private to the container instance.
+LOCALTMP=/
+# Source volume of all archives to be cloned
+ROOT="/determined_local_fs"
+# Base of the per-proc copy of tree
+PROCDIR_ROOT="$ROOT/procs"
+# Private copy of $ROOT for this $SLURM_PROCID
+PROCDIR="$PROCDIR_ROOT/$SLURM_PROCID"
+
+# Create a clone of any directories under $ROOT for this process and set up links
+if [ -d $ROOT/run ] ; then
+    mkdir -p $PROCDIR
+    for dir in $ROOT/*; do
+        if [[ -d $dir && $dir != $PROCDIR_ROOT ]] ; then
+            log "INFO: Clone $dir -> $PROCDIR"
+            cp -p -R $dir $PROCDIR >&2
+        fi
+    done
+
+    if [ -d $LOCALTMP/determined ]; then
+        log "ERROR: Container-private directory $LOCALTMP/determined already exists.\n$(ls -ld $LOCALTMP/determined)\nSingularity 3.7 or greater is required."
+        log "INFO: ls -ld $LOCALTMP $(ls -ld $LOCALTMP)"
+    fi
+
+    # Container-local directory for links to container-specific /run/determined content
+    log "INFO: Creating $LOCALTMP/determined"
+    mkdir -m 0700 -p $LOCALTMP/determined >&2
+    for dir in $ROOT/run/determined/*; do
+        dirname=${dir##*/}
+        log "INFO: ln -sfnT $PROCDIR/run/determined/${dirname} $LOCALTMP/determined/${dirname}"
+        if [ ! -w $PROCDIR/run/determined ]; then
+            log "ERROR: User $(id) does not have write access to $PROCDIR/run/determined/${dirname}. You may not have properly configured your determined agent user/group."
+        fi
+        if [ ! -w $LOCALTMP/determined ]; then
+            log "ERROR: User $(id) does not have write access to $LOCALTMP/determined/${dirname}. You may not have properly configured your determined agent user/group."
+        fi
+        ln -sfnT $PROCDIR/run/determined/${dirname} $LOCALTMP/determined/${dirname} >&2
+    done
+fi
+
+# Localize /tmp as a private folder in the container, if requested.
+if [ "$DET_CONTAINER_LOCAL_TMP" == "1" ]; then
+    # Create a per-container tmp
+    mkdir -p $PROCDIR/tmp
+    # Replace /tmp with a link to our private copy
+    rm -rf /tmp
+    ln -fs $PROCDIR/tmp /
+    log "DEBUG: Replaced tmp $(ls -l /tmp)"
+fi
+
+
+log "INFO: Resetting workdir to $DET_WORKDIR"
+cd $DET_WORKDIR
+
+log "INFO: executing $*" >&2
+exec $*
diff --git a/tools/devcluster-casablanca-login.yaml b/tools/devcluster-casablanca-login.yaml
new file mode 100644
index 000000000000..e1da10c3bd4a
--- /dev/null
+++ b/tools/devcluster-casablanca-login.yaml
@@ -0,0 +1,81 @@
+# This startup input will cause the harness to rebuild on startup.
+startup_input: "p"
+
+commands:
+  p: make -C harness build  # rebuild Python
+  w: make -C webui build  # rebuild Webui
+  c: make -C docs build  # rebuild doCs
+
+# Three stages: db, master, and agent.
+stages:
+  - db:
+      port: 5431
+      db_name: determined
+      password: postgres
+      container_name: determined_db
+      image_name: "postgres:10.14"
+
+      # data_dir is where the persistent files will be saved to.
If this key + # is not present, the database will not persist at all. + data_dir: ~/.postgres + + - master: + pre: + - sh: make -C proto build + - sh: make -C master build + - sh: make -C tools prep-root + - sh: mkdir -p /tmp/determined-cp + post: + - logcheck: + regex: accepting incoming connections on port + cmdline: + - master/build/determined-master + - --config-file + - :config + + # config_file is just a master.yaml + config_file: + port: 8081 + db: + host: localhost + port: 5431 + password: postgres + user: postgres + name: determined + checkpoint_storage: + type: shared_fs + host_path: /lus/scratch/foundation_engineering/determined-cp + log: + level: debug + resource_manager: + master_host: casablanca-login.us.cray.com + master_port: 8082 + security: + tls: + skip_verify: true + host: casablanca-login.us.cray.com + port: 8043 + protocol: https + type: slurm + # Type of slot to be allocated by the SLURM scheduler. + # Default is gpu-based allocation (cuda/rocm), but this requires SLURM to be + # configure with SelectType=select/cons_tres. For systems without this + # Specify cpu, and add contraints to ensure select node have GPUs if desired. + slot_type: cuda + # File containing the authorization token for communication with the launcher -- if blank then none. + # This would typically be a full path where the determined master is running generated by + # the `dev-keytool token` command. + # If using devcluster relative to the directory from which it was invoked (typically determined-ee). + auth_file: + # Specify per-partition overrides for submitted tasks. + # partition_overrides: + # defq: + # rendezvous_network_interface: eth0 + # task_container_defaults: + # dtrain_network_interface: ib0 + # force_pull_image: true + + # This is important: we have to use the symbolic links in the + # tools/build directory to run properly. + root: tools/build + diff --git a/tools/devcluster-casablanca.yaml b/tools/devcluster-casablanca.yaml new file mode 100644 index 000000000000..ea416d4087c0 --- /dev/null +++ b/tools/devcluster-casablanca.yaml @@ -0,0 +1,80 @@ +# This startup input will cause the harness to rebuild on startup. +startup_input: "p" + +commands: + p: make -C harness build # rebuild Python + w: make -C webui build # rebuild Webui + c: make -C docs build # rebuild doCs + +# Three stages: db, master, and agent. +stages: + - db: + port: 5431 + db_name: determined + password: postgres + container_name: determined_db + image_name: "postgres:10.14" + + # data_dir is where the persistent files will be saved to. If this key + # is not present, the database will not persist at all. + data_dir: ~/.postgres + + - master: + pre: + - sh: make -C proto build + - sh: make -C master build + - sh: make -C tools prep-root + - sh: mkdir -p /tmp/determined-cp + post: + - logcheck: + regex: accepting incoming connections on port + cmdline: + - master/build/determined-master + - --config-file + - :config + + # config_file is just a master.yaml + config_file: + port: 8081 + db: + host: localhost + port: 5431 + password: postgres + user: postgres + name: determined + checkpoint_storage: + type: shared_fs + host_path: /lus/scratch/foundation_engineering/determined-cp + log: + level: debug + resource_manager: + master_host: casablanca + master_port: 8082 + host: casablanca.us.cray.com + port: 8181 + protocol: http + type: slurm + # File containing the authorization token for communication with the launcher -- if blank then none. 
+ # This would typically be a full path where the determined master is running generated by + # the `dev-keytool token` command. + # If using devcluster relative to the directory from which it was invoked (typically determined-ee). + auth_file: + # When slurm is configured with SelectType=select/cons_tres, setting tres_supported: true + # allows us to use it is schedule GPUs more easily. For systems without this plugin, set + # this to false and add contraints to ensure select node have GPUs if desired. + tres_supported: true + # Specify per-partition overrides for submitted tasks. + # partition_overrides: + # defq: + # rendezvous_network_interface: eth0 + # # Slot type for jobs submitted to the partition. Inferred from the capabilities of + # # the partition by default. + # slot_type: [cuda,cpu,rocm] + # task_container_defaults: + # dtrain_network_interface: ib0 + # force_pull_image: true + + # This is important: we have to use the symbolic links in the + # tools/build directory to run properly. + root: tools/build + diff --git a/tools/devcluster-horizon.yaml b/tools/devcluster-horizon.yaml new file mode 100644 index 000000000000..cf7ef67b33e6 --- /dev/null +++ b/tools/devcluster-horizon.yaml @@ -0,0 +1,68 @@ +# This startup input will cause the harness to rebuild on startup. +startup_input: "p" + +commands: + p: make -C harness build # rebuild Python + w: make -C webui build # rebuild Webui + c: make -C docs build # rebuild doCs + +# Three stages: db, master, and agent. +stages: + - db: + port: 5431 + db_name: determined + password: postgres + container_name: determined_db + image_name: "postgres:10.14" + + # data_dir is where the persistent files will be saved to. If this key + # is not present, the database will not persist at all. + data_dir: ~/.postgres + + - master: + pre: + - sh: make -C proto build + - sh: make -C master build + - sh: make -C tools prep-root + - sh: mkdir -p /tmp/determined-cp + post: + - logcheck: + regex: accepting incoming connections on port + cmdline: + - master/build/determined-master + - --config-file + - :config + + # config_file is just a master.yaml + config_file: + port: 8081 + db: + host: localhost + port: 5431 + password: postgres + user: postgres + name: determined + checkpoint_storage: + type: shared_fs + host_path: /lus/scratch/foundation_engineering/determined-cp + log: + level: debug + enable_cors: true + resource_manager: + master_host: horizon + master_port: 8082 + host: horizon.us.cray.com + port: 8181 + protocol: http + type: slurm + # File containing the authorization token for communication with the launcher -- if blank then none. + # This would typically be a full path where the determined master is running generated by + # the `dev-keytool token` command. + # If using devcluster relative to the directory from which it was invoked (typically determined-ee). + auth_file: + tres_supported: false + + # This is important: we have to use the symbolic links in the + # tools/build directory to run properly. + root: tools/build + diff --git a/tools/devcluster-shuco.yaml b/tools/devcluster-shuco.yaml new file mode 100644 index 000000000000..cbdfa336e07f --- /dev/null +++ b/tools/devcluster-shuco.yaml @@ -0,0 +1,78 @@ +# This startup input will cause the harness to rebuild on startup. +startup_input: "p" + +commands: + p: make -C harness build # rebuild Python + w: make -C webui build # rebuild Webui + c: make -C docs build # rebuild doCs + +# Three stages: db, master, and agent. 
+stages: + - db: + port: 5431 + db_name: determined + password: postgres + container_name: determined_db + image_name: "postgres:10.14" + + # data_dir is where the persistent files will be saved to. If this key + # is not present, the database will not persist at all. + data_dir: ~/.postgres + + - master: + pre: + - sh: make -C proto build + - sh: make -C master build + - sh: make -C tools prep-root + - sh: mkdir -p /tmp/determined-cp + post: + - logcheck: + regex: accepting incoming connections on port + cmdline: + - master/build/determined-master + - --config-file + - :config + + # config_file is just a master.yaml + config_file: + port: 8081 + db: + host: localhost + port: 5431 + password: postgres + user: postgres + name: determined + checkpoint_storage: + type: shared_fs + host_path: /home/launcher/determined-cp + log: + level: debug + resource_manager: + master_host: shuco + master_port: 8082 + host: shuco.us.cray.com + port: 8181 + protocol: http + type: slurm + # Type of slot to be allocated by the SLURM scheduler. + # Default is gpu-based allocation (cuda/rocm), but this requires SLURM to be + # configure with SelectType=select/cons_tres. For systems without this + # Specify cpu, and add contraints to ensure select node have GPUs if desired. + slot_type: cpu + # File containing the authorization token for communication with the launcher -- if blank then none. + # This would typically be a full path where the determined master is running generated by + # the `dev-keytool token` command. + # If using devcluster relative to the directory from which it was invoked (typically determined-ee). + auth_file: + # Specify per-partition overrides for submitted tasks. + # partition_overrides: + # defq: + # rendezvous_network_interface: eth0 + # task_container_defaults: + # dtrain_network_interface: ib0 + # force_pull_image: true + + # This is important: we have to use the symbolic links in the + # tools/build directory to run properly. + root: tools/build + diff --git a/tools/devcluster-slurm.yaml b/tools/devcluster-slurm.yaml new file mode 100644 index 000000000000..047280f49c3f --- /dev/null +++ b/tools/devcluster-slurm.yaml @@ -0,0 +1,88 @@ +# This is a generic devcluster config with variables for the target +# Slurm test systems. It is intended to be invoked via the +# tools/slurmcluster.sh script, which is customized to support +# per-user tunnel configuration, and per-system settings. + +# This startup input will cause the harness to rebuild on startup. +startup_input: "p" + +commands: + p: make -C harness build # rebuild Python + w: make -C webui build # rebuild Webui + c: make -C docs build # rebuild doCs + +# Three stages: db, master, and agent. +stages: + - db: + port: 5431 + db_name: determined + password: postgres + container_name: determined_db + image_name: "postgres:10.14" + + # data_dir is where the persistent files will be saved to. If this key + # is not present, the database will not persist at all. 
+ data_dir: ~/.postgres + + - master: + pre: + - sh: make -C proto build + - sh: make -C master build + - sh: make -C tools prep-root + - sh: mkdir -p /tmp/determined-cp + post: + - logcheck: + regex: accepting incoming connections on port + cmdline: + - master/build/determined-master + - --config-file + - :config + + # config_file is just a master.yaml + config_file: + port: 8081 + db: + host: localhost + port: 5431 + password: postgres + user: postgres + name: determined + checkpoint_storage: + type: shared_fs + host_path: $OPT_CHECKPOINTPATH + log: + level: $OPT_DEBUGLEVEL + resource_manager: + master_host: $OPT_MASTERHOST + master_port: $OPT_MASTERPORT + host: $OPT_LAUNCHERHOST + port: $OPT_LAUNCHERPORT + protocol: $OPT_LAUNCHERPROTOCOL + security: + tls: + skip_verify: true + type: slurm + # File containing the authorization token for communication with the launcher -- if blank then none. + # This would typically be a full path where the determined master is running generated by + # the `dev-keytool token` command. + # If using devcluster relative to the directory from which it was invoked (typically determined-ee). + auth_file: + # When slurm is configured with SelectType=select/cons_tres, setting tres_supported: true + # allows us to use it is schedule GPUs more easily. For systems without this plugin, set + # this to false and add contraints to ensure select node have GPUs if desired. + tres_supported: $OPT_TRESSUPPORTED + rendezvous_network_interface: $OPT_RENDEVOUSIFACE + # Specify per-partition overrides for submitted tasks. + # partition_overrides: + # defq: + # rendezvous_network_interface: eth0 + # # Slot type for jobs submitted to the partition. Inferred from the capabilities of + # # the partition by default. + # slot_type: [cuda,cpu,rocm] + # task_container_defaults: + # dtrain_network_interface: ib0 + # force_pull_image: true + + # This is important: we have to use the symbolic links in the + # tools/build directory to run properly. + root: tools/build diff --git a/tools/slurmcluster.sh b/tools/slurmcluster.sh new file mode 100755 index 000000000000..e535502864df --- /dev/null +++ b/tools/slurmcluster.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# +# This dev script is a wrapper on the devcluster tool, and provides +# per-user and per cluster configuration of the devcluster-slurm.yaml +# file to enable it to be used for our various clusters. It dynamically +# fills in the variables within devcluster-slurm.yaml such that the original +# source need not be modified. By default it also starts/stops SSH +# tunnels inbound to launcher, and outbound to the desktop master. +# +# Pre-requisites: +# 1) Configure your USERPORT_${USER} port below using your login name on +# the desktop that you are using. +# 2) Unless you specify both -n -x, you must have password-less ssh configured to the +# target cluster, to enable the ssh connection without prompts. +# +# ssh-copy-id {cluster} +# +INTUNNEL=1 +TUNNEL=1 +if [[ $1 == '-n' ]]; then + INTUNNEL= + shift +fi +if [[ $1 == '-x' ]]; then + TUNNEL= + shift +fi +if [[ $1 == '-t' ]]; then + TRACE=1 + shift +fi + +if [[ $1 == '-h' || $1 == '--help' || -z $1 ]] ; then + echo "Usage: $0 [-h] [-n] [-x] [-t] {cluster}" + echo " -h This help message. Options are order sensitive." + echo " -n Disable start of the inbound tunnel (when using Cisco AnyConnect)." + echo " -x Disable start of personal tunnel back to master (if you have done so manually)." + echo " -t Force debug level to trace regardless of cluster configuration value." 
+ echo + echo "Documentation:" + head -n 17 $0 + exit 1 +fi + +CLUSTER=$1 + +function lookup() { + echo "${!1}" +} + +# Setup the reverse tunnel back to the master running locally +function mktunnel() { + MASTER_HOST=$1 + MASTER_PORT=$2 + SSH_HOST=$3 + ssh -NR ${MASTER_HOST}:${MASTER_PORT}:localhost:8081 ${SSH_HOST} +} + +# Setup the inbound tunnel to enable access to the launcher +function mkintunnel() { + MASTER_HOST=$1 + MASTER_PORT=$2 + SSH_HOST=$3 + ssh -NL ${MASTER_PORT}:${MASTER_HOST}:${MASTER_PORT} ${SSH_HOST} +} + +# Update your username/port pair +USERPORT_stokc=8084 +USERPORT_rcorujo=8085 +USERPORT_phillipgaisford=8086 +USERPORT_pankaj=8087 +USERPORT_alyssa=8088 +USERPORT_charles=8089 +USERPORT_jerryharrow=8090 +USERPORT_cam=8091 +USERPORT_cobble=8092 + +USERPORT=$(lookup "USERPORT_$USER") +if [ -z $USERPORT ]; then + echo "$0: User $USER does not have a configured port, update the script." + exit 1 +fi + +if [ $CLUSTER == "casablanca-login" ]; then + CLUSTER=casablanca_login +elif [ $CLUSTER != "casablanca" -a $CLUSTER != "horizon" -a $CLUSTER != "shuco" ]; then + echo "$0: Cluster name $CLUSTER does not have a configuration. Specify one of: casablanca, casablanca-login, horizon, shuco" + exit 1 +fi + +# Configuration for casablanca +OPT_name_casablanca=casablanca.us.cray.com +OPT_LAUNCHERHOST_casablanca=localhost +OPT_LAUNCHERPORT_casablanca=8181 +OPT_LAUNCHERPROTOCOL_casablanca=http +OPT_CHECKPOINTPATH_casablanca=/lus/scratch/foundation_engineering/determined-cp +OPT_DEBUGLEVEL_casablanca=debug +OPT_MASTERHOST_casablanca=casablanca +OPT_MASTERPORT_casablanca=$USERPORT +OPT_TRESSUPPORTED_casablanca=true +OPT_PROTOCOL_casablanca=http + +# Configuration for horizon +OPT_name_horizon=horizon.us.cray.com +OPT_LAUNCHERHOST_horizon=localhost +OPT_LAUNCHERPORT_horizon=8181 +OPT_LAUNCHERPROTOCOL_horizon=http +OPT_CHECKPOINTPATH_horizon=/lus/scratch/foundation_engineering/determined-cp +OPT_DEBUGLEVEL_horizon=debug +OPT_MASTERHOST_horizon=horizon +OPT_MASTERPORT_horizon=$USERPORT +OPT_TRESSUPPORTED_horizon=false +OPT_PROTOCOL_horizon=http + +# Configuration for casablanca-login (uses suffix casablanca_login) +OPT_name_casablanca_login=casablanca-login.us.cray.com +OPT_LAUNCHERHOST_casablanca_login=localhost +OPT_LAUNCHERPORT_casablanca_login=8443 +OPT_LAUNCHERPROTOCOL_casablanca_login=https +OPT_CHECKPOINTPATH_casablanca_login=/lus/scratch/foundation_engineering/determined-cp +OPT_DEBUGLEVEL_casablanca_login=debug +OPT_MASTERHOST_casablanca_login=casablanca-login +OPT_MASTERPORT_casablanca_login=$USERPORT +OPT_TRESSUPPORTED_casablanca_login=true + +# Configuration for shuco +OPT_name_shuco=shuco.us.cray.com +OPT_LAUNCHERHOST_shuco=localhost +OPT_LAUNCHERPORT_shuco=8181 +OPT_LAUNCHERPROTOCOL_shuco=http +OPT_CHECKPOINTPATH_shuco=/home/launcher/determined-cp +OPT_DEBUGLEVEL_shuco=debug +OPT_MASTERHOST_shuco=admin.head.cm.us.cray.com +OPT_MASTERPORT_shuco=$USERPORT +OPT_TRESSUPPORTED_shuco=false +OPT_PROTOCOL_shuco=http +OPT_RENDEVOUSIFACE_shuco=bond0 + +export OPT_LAUNCHERHOST=$(lookup "OPT_LAUNCHERHOST_$CLUSTER") +export OPT_LAUNCHERPORT=$(lookup "OPT_LAUNCHERPORT_$CLUSTER") +export OPT_LAUNCHERPROTOCOL=$(lookup "OPT_LAUNCHERPROTOCOL_$CLUSTER") +export OPT_CHECKPOINTPATH=$(lookup "OPT_CHECKPOINTPATH_$CLUSTER") +export OPT_DEBUGLEVEL=$(lookup "OPT_DEBUGLEVEL_$CLUSTER") +export OPT_MASTERHOST=$(lookup "OPT_MASTERHOST_$CLUSTER") +export OPT_MASTERPORT=$(lookup "OPT_MASTERPORT_$CLUSTER") +export OPT_TRESSUPPORTED=$(lookup "OPT_TRESSUPPORTED_$CLUSTER") +export OPT_RENDEVOUSIFACE=$(lookup 
"OPT_RENDEVOUSIFACE_$CLUSTER") + + +SLURMCLUSTER=$(lookup "OPT_name_$CLUSTER") +if [[ -z $SLURMCLUSTER ]]; then + echo "$0: Cluster name $CLUSTER does not have a configuration. Specify one of: $(set -o posix; set | grep OPT_name | cut -f 2 -d =)." + exit 1 +fi + +if [[ -z $INTUNNEL ]]; then + OPT_LAUNCHERHOST=$SLURMCLUSTER +fi + +if [[ -n $TRACE ]]; then + export OPT_DEBUGLEVEL=trace +fi + + +echo +echo "Configuration Used:" +printenv |grep OPT_ +echo + +# Terminate our tunnels on exit +trap "kill 0" EXIT +if [[ -n $INTUNNEL ]]; then + mkintunnel $OPT_MASTERHOST $OPT_LAUNCHERPORT $SLURMCLUSTER & +fi +if [[ -n $TUNNEL ]]; then + mktunnel $OPT_MASTERHOST $OPT_MASTERPORT $SLURMCLUSTER & +fi + + +# Although devcluster supports variables, numeric values fail to load, so +# Manually apply those into a temp file. +TEMPYAML=/tmp/devcluster-$CLUSTER.yaml +rm -f $TEMPYAML +envsubst < tools/devcluster-slurm.yaml > $TEMPYAML +devcluster -c $TEMPYAML --oneshot