Merge branch 'master' into 391-dm-run
* master:
  NFC: More logging
  dotscience#3 make subdot roots writeable by all, for containers which run as non-root
  FIX: Missed space :-( Testing stuff in CI is tedious.
  FIX: Missed the `-c` option to the `dm dot delete...`
  FIX: Typo...
  #17: Pull the right image, use a dedicated config, and test `dm dot delete` on the remote
  NFC: Test adding sleep to ensure replication.
  #17: Avoid echoing the API key, and run the smoke tests on Linux (it's easier for me to debug them there)
  #17: Made the smoke test push to a remote cluster (if credentials are passed into SMOKE_TEST_REMOTE and SMOKE_TEST_APIKEY).
  NFC: Fix logging on error messages
  #352: Attempt to reduce flakiness by checking replication status on both nodes in a cluster
  NFC: Comments concerning pod health checking
  NFC: Re-enable flaky test for debugging
  #344: We no longer need the GKE yamls (that's handled in the ConfigMap), and they're not referenced from the docs any more.
  NFC: fix typo sneaked into yaml
  NFC: Comment out test until we can work out how to fix it
binocarlos committed May 9, 2018
2 parents 43d80f9 + 277d71f commit 1059f0c
Showing 8 changed files with 122 additions and 53 deletions.
37 changes: 20 additions & 17 deletions .gitlab-ci.yml
@@ -114,7 +114,7 @@ build_server:
# cd cmd/dotmesh-server;
# docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .)
# cd ../..
# ./smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server
# ./scripts/smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server
# docker system prune -fa
# "

@@ -127,14 +127,8 @@ macos_docker_stable:
- macos
- docker-stable
script:
- cd cmd/dotmesh-server
# Possible to build just the "prod" image because binaries got passed
# as artifact in cmd/dotmesh-server/target
# TODO: pull the built image from $CI_DOCKER_REGISTRY, rather than
# rebuilding it here.
- docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .)
- cd ../..
- ./smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server
- docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG
- ./scripts/smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server
- docker version
- which docker
- docker container prune -f
@@ -148,14 +142,23 @@ macos_docker_edge:
- macos
- docker-edge
script:
- cd cmd/dotmesh-server
# Possible to build just the "prod" image because binaries got passed
# as artifact in cmd/dotmesh-server/target
# TODO: pull the built image from $CI_DOCKER_REGISTRY, rather than
# rebuilding it here.
- docker build -t dotmesh-server . || (sleep 30; docker build -t dotmesh-server .)
- cd ../..
- ./smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server
- docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG
- ./scripts/smoke.sh $(pwd)/binaries/Darwin/dm dotmesh-server
- docker version
- which docker
- docker container prune -f

linux_smoke:
stage: test
dependencies:
- build_server
- build_client_linux
tags:
- ubuntu
- fast
script:
- docker pull $CI_DOCKER_REGISTRY/dotmesh-server:$CI_DOCKER_TAG
- ./scripts/smoke.sh $(pwd)/binaries/Linux/dm dotmesh-server
- docker version
- which docker
- docker container prune -f
2 changes: 1 addition & 1 deletion cmd/dotmesh-server/pkg/main/docker.go
@@ -129,7 +129,7 @@ func newContainerMountSymlink(name VolumeName, filesystemId string, subvolume st
// Do we need to create the subvolume directory?
if _, err := os.Stat(result); err != nil {
if os.IsNotExist(err) {
if err := os.MkdirAll(result, 0755); err != nil {
if err := os.MkdirAll(result, 0777); err != nil {
log.Printf("[newContainerMountSymlink] error creating subdot %s: %+v", result, err)
return "", err
}
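One subtlety worth noting about the 0755 → 0777 change above: the mode passed to `os.MkdirAll` is filtered through the process umask, so whether the subdot root actually ends up world-writable depends on the umask dotmesh-server runs with, which this diff does not show. Below is a minimal, hypothetical sketch (not dotmesh's code) of how permissions can be forced regardless of umask:

```go
package main

import (
	"log"
	"os"
)

// ensureWorldWritableDir is an illustrative sketch: os.MkdirAll applies the
// process umask to the mode it is given, so a 0777 literal often ends up as
// 0755 on disk. An explicit Chmod afterwards is not subject to the umask and
// guarantees the final directory is writable by non-root container users.
func ensureWorldWritableDir(path string) error {
	if err := os.MkdirAll(path, 0777); err != nil {
		return err
	}
	// Chmod only adjusts the leaf directory, which is the one containers mount.
	return os.Chmod(path, 0777)
}

func main() {
	if err := ensureWorldWritableDir("/tmp/example-subdot"); err != nil {
		log.Fatalf("creating subdot root: %+v", err)
	}
}
```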
7 changes: 3 additions & 4 deletions cmd/dotmesh-server/pkg/main/statemachines.go
@@ -689,9 +689,7 @@ func activeState(f *fsMachine) stateFn {
// refuse to move if we have any containers running
containers, err := f.containersRunning()
if err != nil {
log.Printf(
"Can't move filesystem while we can't list whether containers are using it",
)
log.Printf("Can't move filesystem while we can't list whether containers are using it")
f.innerResponses <- &Event{
Name: "error-listing-containers-during-move",
Args: &EventArgs{"err": err},
@@ -849,14 +847,15 @@ func activeState(f *fsMachine) stateFn {
// fail if any containers running
containers, err := f.containersRunning()
if err != nil {
log.Printf("Can't unmount filesystem while containers are using it")
log.Printf("Can't unmount filesystem when we are unable to list containers using it")
f.innerResponses <- &Event{
Name: "error-listing-containers-during-unmount",
Args: &EventArgs{"err": err},
}
return backoffState
}
if len(containers) > 0 {
log.Printf("Can't unmount filesystem while containers are using it")
f.innerResponses <- &Event{
Name: "cannot-unmount-while-running-containers",
Args: &EventArgs{"containers": containers},
11 changes: 11 additions & 0 deletions cmd/dotmesh-server/pkg/operator/main.go
@@ -472,6 +472,17 @@ func (c *dotmeshController) process() error {

// At this point, we believe this is a valid running Dotmesh pod.
// That node has a dotmesh, so isn't undotted.

// IDEA: We could try and health-check the pod if we find its IP
// and send it a Dotmesh RPC call, but we need to be careful NOT
// to consider pods still in the throes of startup broken and
// mark them for death. Perhaps we need to compare their age
// against a timeout value, and allow health-check failures for
// pods younger than a certain age. But how to set that age? On
// a busy cluster with a flakey Internet connection, could image
// fetching take an age? Perhaps we only eliminate "Running"
// pods that don't respond to a health-check over a certain age?
// Where do we draw the line?
glog.V(2).Infof("Observing pod %s running %s on %s (status: %s)", podName, image, boundNode, dotmesh.Status.Phase)
delete(undottedNodes, boundNode)
}
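The IDEA comment above asks how to avoid reaping pods that are merely slow to start. One common pattern is to treat a failed health check as fatal only once the pod is older than a startup grace period. A rough, hypothetical sketch of that policy follows; the names `gracePeriod` and `shouldMarkForDeath` are illustrative and not part of the operator:

```go
package main

import (
	"fmt"
	"time"
)

// gracePeriod is a hypothetical startup allowance: pods younger than this are
// never marked for deletion on a failed health check, since they may still be
// pulling images or initialising on a slow node.
const gracePeriod = 5 * time.Minute

// shouldMarkForDeath sketches the policy described in the comment above: a
// "Running" pod is only treated as broken if it both fails the health check
// and has been around longer than the grace period.
func shouldMarkForDeath(podStarted time.Time, healthy bool, now time.Time) bool {
	if healthy {
		return false
	}
	return now.Sub(podStarted) > gracePeriod
}

func main() {
	now := time.Now()
	young := now.Add(-1 * time.Minute) // still starting up: spared despite failing
	old := now.Add(-30 * time.Minute)  // long-running and unhealthy: reaped

	fmt.Println(shouldMarkForDeath(young, false, now)) // false
	fmt.Println(shouldMarkForDeath(old, false, now))   // true
}
```

Picking the grace period itself remains the open question the comment raises: too short and slow image pulls get punished, too long and genuinely broken pods linger.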
2 changes: 0 additions & 2 deletions kubernetes/rebuild.sh
@@ -17,9 +17,7 @@ fi

sed "s/DOCKER_TAG/$CI_DOCKER_TAG/" < dotmesh.yaml > $OUT/dotmesh-k8s-1.7.yaml
sed "s_rbac.authorization.k8s.io/v1beta1_rbac.authorization.k8s.io/v1_"< $OUT/dotmesh-k8s-1.7.yaml > $OUT/dotmesh-k8s-1.8.yaml
cp $OUT/dotmesh-k8s-1.8.yaml $OUT/dotmesh-k8s-1.8.gke.yaml
sed "s_ClusterIP_LoadBalancer_" < $OUT/dotmesh-k8s-1.8.yaml > $OUT/dotmesh-k8s-1.8.aks.yaml
cp $OUT/dotmesh-k8s-1.7.yaml $OUT/dotmesh-k8s-1.7.gke.yaml

cp configmap.yaml $OUT/configmap.yaml
sed "s_/usr/libexec/kubernetes/kubelet-plugins/volume/exec_/home/kubernetes/flexvolume_" < configmap.yaml > $OUT/configmap.gke.yaml
79 changes: 79 additions & 0 deletions scripts/smoke.sh
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
set -xe

# Smoke test to see whether basics still work on e.g. macOS; also tests the

DM="$1"
VOL="volume_`date +%s`"
IMAGE="${CI_DOCKER_REGISTRY:-`hostname`.local:80/dotmesh}/"$2":${CI_DOCKER_TAG:-latest}"

# We use a bespoke config path to isolate us from other runs (although
# we do hog the node's docker state, so it's far from perfect)

CONFIG=/tmp/smoke_test_$$.dmconfig
trap 'rm "$CONFIG" || true' EXIT

sudo "$DM" -c "$CONFIG" cluster reset || (sleep 30; sudo "$DM" cluster reset) || true

echo "### Installing image ${IMAGE}"

"$DM" -c "$CONFIG" cluster init --offline --image "$IMAGE"

echo "### Testing docker run..."

docker run --rm -i --name smoke -v "$VOL:/foo" --volume-driver dm ubuntu touch /foo/X

echo "### Testing list..."

OUT=`"$DM" -c "$CONFIG" list`

if [[ $OUT == *"$VOL"* ]]; then
echo "String '$VOL' found, yay!"
else
echo "String '$VOL' not found, boo :("
exit 1
fi

echo "### Testing commit..."

"$DM" -c "$CONFIG" switch "$VOL"
"$DM" -c "$CONFIG" commit -m 'Test commit'

OUT=`"$DM" -c "$CONFIG" log`

if [[ $OUT == *"Test commit"* ]]; then
echo "Commit found, yay!"
else
echo "Commit not found, boo :("
exit 1
fi

if [ x$SMOKE_TEST_REMOTE != x ]
then
echo "### Testing push to remote..."
REMOTE="smoke_test_`date +%s`"

(set +x; echo "$SMOKE_TEST_APIKEY"; set -x) | "$DM" -c "$CONFIG" remote add "$REMOTE" "$SMOKE_TEST_REMOTE"

"$DM" -c "$CONFIG" push "$REMOTE" "$VOL"

"$DM" -c "$CONFIG" remote switch "$REMOTE"
OUT=`"$DM" -c "$CONFIG" list`

if [[ $OUT == *"$VOL"* ]]; then
echo "String '$VOL' found on the remote, yay!"
else
echo "String '$VOL' not found on the remote, boo :("
exit 1
fi

echo "### Testing delete on remote..."

REMOTE_NAME="`echo $SMOKE_TEST_REMOTE | sed s/@.*$//`"
"$DM" -c "$CONFIG" dot delete -f "$REMOTE_NAME"/"$VOL"

"$DM" -c "$CONFIG" remote switch local
"$DM" -c "$CONFIG" remote rm "$REMOTE"
fi

exit 0
22 changes: 0 additions & 22 deletions smoke.sh

This file was deleted.

15 changes: 8 additions & 7 deletions tests/acceptance_test.go
@@ -952,9 +952,9 @@ func TestTwoNodesSameCluster(t *testing.T) {
citools.RunOnNode(t, node1, citools.DockerRun(fsname)+" sh -c 'echo WORLD > /foo/HELLO'")
citools.RunOnNode(t, node1, "dm switch "+fsname)
citools.RunOnNode(t, node1, "dm commit -m 'First commit'")
ensureCurrentDotIsFullyReplicated(t, node1)
citools.RunOnNode(t, node1, "dm dot show")
citools.RunOnNode(t, node2, "dm dot show")
ensureDotIsFullyReplicated(t, node1, fsname)
ensureDotIsFullyReplicated(t, node2, fsname)
time.Sleep(3 * time.Second)

fsId := strings.TrimSpace(citools.OutputFromRunOnNode(t, node1, "dm dot show -H | grep masterBranchId | cut -f 2"))

@@ -2249,13 +2249,14 @@
citools.DumpTiming()
}

func ensureCurrentDotIsFullyReplicated(t *testing.T, node string) {
func ensureDotIsFullyReplicated(t *testing.T, node string, fsname string) {
for try := 1; try <= 5; try++ {
fmt.Printf("Dotmesh containers running on %s: ", node)
st := citools.OutputFromRunOnNode(t, node, "dm dot show | grep missing || true")
if st == "" {
st := citools.OutputFromRunOnNode(t, node, fmt.Sprintf("dm dot show %s", fsname))
if !strings.Contains(st, "missing") {
fmt.Print("Replicated")
return
} else {
fmt.Print("Failed to replicate, sleeping and retrying")
time.Sleep(1 * time.Second)
}
}
