diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9cd4fd846..287f2caef 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -114,7 +114,7 @@ build_server: # cd cmd/dotmesh-server; # docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .) # cd ../.. -# ./smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server +# ./scripts/smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server # docker system prune -fa # " @@ -127,14 +127,8 @@ macos_docker_stable: - macos - docker-stable script: - - cd cmd/dotmesh-server - # Possible to build just the "prod" image because binaries got passed - # as artifact in cmd/dotmesh-server/target - # TODO: pull the built image from $CI_DOCKER_REGISTRY, rather than - # rebuilding it here. - - docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .) - - cd ../.. - - ./smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server + - docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG + - ./scripts/smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server - docker version - which docker - docker container prune -f @@ -148,14 +142,23 @@ macos_docker_edge: - macos - docker-edge script: - - cd cmd/dotmesh-server - # Possible to build just the "prod" image because binaries got passed - # as artifact in cmd/dotmesh-server/target - # TODO: pull the built image from $CI_DOCKER_REGISTRY, rather than - # rebuilding it here. - - docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .) - - cd ../.. 
- - ./smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server + - docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG + - ./scripts/smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server + - docker version + - which docker + - docker container prune -f + +linux_smoke: + stage: test + dependencies: + - build_server + - build_client_linux + tags: + - ubuntu + - fast + script: + - docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG + - ./scripts/smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server - docker version - which docker - docker container prune -f diff --git a/cmd/dotmesh-server/pkg/main/docker.go b/cmd/dotmesh-server/pkg/main/docker.go index 9cfeb6274..e36bc4b09 100644 --- a/cmd/dotmesh-server/pkg/main/docker.go +++ b/cmd/dotmesh-server/pkg/main/docker.go @@ -129,7 +129,7 @@ func newContainerMountSymlink(name VolumeName, filesystemId string, subvolume st // Do we need to create the subvolume directory? if _, err := os.Stat(result); err != nil { if os.IsNotExist(err) { - if err := os.MkdirAll(result, 0755); err != nil { + if err := os.MkdirAll(result, 0777); err != nil { log.Printf("[newContainerMountSymlink] error creating subdot %s: %+v", result, err) return "", err } diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 1decbd8e0..d0a53fa9f 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -689,9 +689,7 @@ func activeState(f *fsMachine) stateFn { // refuse to move if we have any containers running containers, err := f.containersRunning() if err != nil { - log.Printf( - "Can't move filesystem while we can't list whether containers are using it", - ) + log.Printf("Can't move filesystem while we can't list whether containers are using it") f.innerResponses <- &Event{ Name: "error-listing-containers-during-move", Args: &EventArgs{"err": err}, @@ -849,7 +847,7 @@ func activeState(f *fsMachine) stateFn { // fail if any containers running 
containers, err := f.containersRunning() if err != nil { - log.Printf("Can't unmount filesystem while containers are using it") + log.Printf("Can't unmount filesystem when we are unable to list containers using it") f.innerResponses <- &Event{ Name: "error-listing-containers-during-unmount", Args: &EventArgs{"err": err}, @@ -857,6 +855,7 @@ func activeState(f *fsMachine) stateFn { return backoffState } if len(containers) > 0 { + log.Printf("Can't unmount filesystem while containers are using it") f.innerResponses <- &Event{ Name: "cannot-unmount-while-running-containers", Args: &EventArgs{"containers": containers}, diff --git a/cmd/dotmesh-server/pkg/operator/main.go b/cmd/dotmesh-server/pkg/operator/main.go index e77566458..aaa58139a 100644 --- a/cmd/dotmesh-server/pkg/operator/main.go +++ b/cmd/dotmesh-server/pkg/operator/main.go @@ -472,6 +472,17 @@ func (c *dotmeshController) process() error { // At this point, we believe this is a valid running Dotmesh pod. // That node has a dotmesh, so isn't undotted. + + // IDEA: We could try and health-check the pod if we find its IP + // and send it a Dotmesh RPC call, but we need to be careful NOT + // to consider pods still in the throes of startup broken and + // mark them for death. Perhaps we need to compare their age + // against a timeout value, and allow health-check failures for + // pods younger than a certain age. But how to set that age? On + // a busy cluster with a flakey Internet connection, could image + // fetching take an age? Perhaps we only eliminate "Running" + // pods that don't respond to a health-check over a certain age? + // Where do we draw the line? 
glog.V(2).Infof("Observing pod %s running %s on %s (status: %s)", podName, image, boundNode, dotmesh.Status.Phase) delete(undottedNodes, boundNode) } diff --git a/kubernetes/rebuild.sh b/kubernetes/rebuild.sh index 1ae3efed6..7031f75e5 100755 --- a/kubernetes/rebuild.sh +++ b/kubernetes/rebuild.sh @@ -17,9 +17,7 @@ fi sed "s/DOCKER_TAG/$CI_DOCKER_TAG/" < dotmesh.yaml > $OUT/dotmesh-k8s-1.7.yaml sed "s_rbac.authorization.k8s.io/v1beta1_rbac.authorization.k8s.io/v1_"< $OUT/dotmesh-k8s-1.7.yaml > $OUT/dotmesh-k8s-1.8.yaml -cp $OUT/dotmesh-k8s-1.8.yaml $OUT/dotmesh-k8s-1.8.gke.yaml sed "s_ClusterIP_LoadBalancer_" < $OUT/dotmesh-k8s-1.8.yaml > $OUT/dotmesh-k8s-1.8.aks.yaml -cp $OUT/dotmesh-k8s-1.7.yaml $OUT/dotmesh-k8s-1.7.gke.yaml cp configmap.yaml $OUT/configmap.yaml sed "s_/usr/libexec/kubernetes/kubelet-plugins/volume/exec_/home/kubernetes/flexvolume_" < configmap.yaml > $OUT/configmap.gke.yaml diff --git a/scripts/smoke.sh b/scripts/smoke.sh new file mode 100755 index 000000000..4a04484d1 --- /dev/null +++ b/scripts/smoke.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -xe + +# Smoke test to see whether basics still work on e.g. macOS; also tests commit and, if SMOKE_TEST_REMOTE is set, push/delete against a remote + +DM="$1" +VOL="volume_`date +%s`" +IMAGE="${CI_DOCKER_REGISTRY:-`hostname`.local:80/dotmesh}/"$2":${CI_DOCKER_TAG:-latest}" + +# We use a bespoke config path to isolate us from other runs (although +# we do hog the node's docker state, so it's far from perfect) + +CONFIG=/tmp/smoke_test_$$.dmconfig +trap 'rm "$CONFIG" || true' EXIT + +sudo "$DM" -c "$CONFIG" cluster reset || (sleep 30; sudo "$DM" -c "$CONFIG" cluster reset) || true + +echo "### Installing image ${IMAGE}" + +"$DM" -c "$CONFIG" cluster init --offline --image "$IMAGE" + +echo "### Testing docker run..." + +docker run --rm -i --name smoke -v "$VOL:/foo" --volume-driver dm ubuntu touch /foo/X + +echo "### Testing list..." + +OUT=`"$DM" -c "$CONFIG" list` + +if [[ $OUT == *"$VOL"* ]]; then + echo "String '$VOL' found, yay!"
+else + echo "String '$VOL' not found, boo :(" + exit 1 +fi + +echo "### Testing commit..." + +"$DM" -c "$CONFIG" switch "$VOL" +"$DM" -c "$CONFIG" commit -m 'Test commit' + +OUT=`"$DM" -c "$CONFIG" log` + +if [[ $OUT == *"Test commit"* ]]; then + echo "Commit found, yay!" +else + echo "Commit not found, boo :(" + exit 1 +fi + +if [ -n "${SMOKE_TEST_REMOTE:-}" ] +then + echo "### Testing push to remote..." + REMOTE="smoke_test_`date +%s`" + + (set +x; echo "$SMOKE_TEST_APIKEY"; set -x) | "$DM" -c "$CONFIG" remote add "$REMOTE" "$SMOKE_TEST_REMOTE" + + "$DM" -c "$CONFIG" push "$REMOTE" "$VOL" + + "$DM" -c "$CONFIG" remote switch "$REMOTE" + OUT=`"$DM" -c "$CONFIG" list` + + if [[ $OUT == *"$VOL"* ]]; then + echo "String '$VOL' found on the remote, yay!" + else + echo "String '$VOL' not found on the remote, boo :(" + exit 1 + fi + + echo "### Testing delete on remote..." + + REMOTE_NAME="${SMOKE_TEST_REMOTE%%@*}" + "$DM" -c "$CONFIG" dot delete -f "$REMOTE_NAME"/"$VOL" + + "$DM" -c "$CONFIG" remote switch local + "$DM" -c "$CONFIG" remote rm "$REMOTE" +fi + +exit 0 diff --git a/smoke.sh b/smoke.sh deleted file mode 100755 index 8899bc618..000000000 --- a/smoke.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -set -xe - -# Smoke test to see whether basics still work on e.g. macOS - -DM=$1 -VOL="volume_`date +%s`" - -sudo $DM cluster reset || (sleep 30; sudo $DM cluster reset) || true - -$DM cluster init --offline --image dotmesh-server - -docker run --rm -i --name smoke -v $VOL:/foo --volume-driver dm ubuntu touch /foo/X -OUT=`$DM list` - -if [[ $OUT == *"$VOL"* ]]; then - echo "String '$VOL' found, yay!"
- exit 0 -else - echo "String '$VOL' not found, boo :(" - exit 1 -fi diff --git a/tests/acceptance_test.go b/tests/acceptance_test.go index dd3510b47..0a350fa92 100644 --- a/tests/acceptance_test.go +++ b/tests/acceptance_test.go @@ -952,9 +952,9 @@ func TestTwoNodesSameCluster(t *testing.T) { citools.RunOnNode(t, node1, citools.DockerRun(fsname)+" sh -c 'echo WORLD > /foo/HELLO'") citools.RunOnNode(t, node1, "dm switch "+fsname) citools.RunOnNode(t, node1, "dm commit -m 'First commit'") - ensureCurrentDotIsFullyReplicated(t, node1) - citools.RunOnNode(t, node1, "dm dot show") - citools.RunOnNode(t, node2, "dm dot show") + ensureDotIsFullyReplicated(t, node1, fsname) + ensureDotIsFullyReplicated(t, node2, fsname) + time.Sleep(3 * time.Second) fsId := strings.TrimSpace(citools.OutputFromRunOnNode(t, node1, "dm dot show -H | grep masterBranchId | cut -f 2")) @@ -2249,13 +2249,14 @@ spec: citools.DumpTiming() } -func ensureCurrentDotIsFullyReplicated(t *testing.T, node string) { +func ensureDotIsFullyReplicated(t *testing.T, node string, fsname string) { for try := 1; try <= 5; try++ { - fmt.Printf("Dotmesh containers running on %s: ", node) - st := citools.OutputFromRunOnNode(t, node, "dm dot show | grep missing || true") - if st == "" { + st := citools.OutputFromRunOnNode(t, node, fmt.Sprintf("dm dot show %s", fsname)) + if !strings.Contains(st, "missing") { + fmt.Print("Replicated") return } else { + fmt.Print("Failed to replicate, sleeping and retrying") time.Sleep(1 * time.Second) } }