From f507159c6dbb22f3a0726f2dd20cb8a6c692388b Mon Sep 17 00:00:00 2001 From: nouseforaname Date: Fri, 2 Dec 2022 15:32:51 +0100 Subject: [PATCH] scsi_volume_id_device_path_resolver flaky on ubuntu jammy We've gotten reports of intermittent agent timeouts for deployments on vSphere. These were caused by the agent failing to set up the ephemeral disk attached to the VM because the resolver code never entered the if statement: `if path.Base(rootDevicePath) == "sda"` Investigation on the VM showed that: `ls /sys/bus/scsi/devices/32:0:0:0/block` => `sdb` `ls /sys/bus/scsi/devices/32:0:1:0/block` => `sda` We relied on the root disk always being exposed at `/dev/sda`. This has worked for ages (the code has essentially not changed since 2014), but it was always problematic because it assumes that the naming of `/dev/sd*` devices is deterministic. According to https://www.suse.com/support/kb/doc/?id=00001844 that was the case until kernel version 5.3; the kernel commit can be found here: https://github.com/torvalds/linux/commit/f049cf1a7b6737c75884247c3f6383ef104d255a The reliance seems unnecessary though, since both disks (os-root from the stemcell and the ephemeral disk) are associated with the same SCSI host. Fixes: ``` [linuxPlatform] 2022/12/02 13:56:04 INFO - Setting up raw ephemeral disks [File System] 2022/12/02 13:56:04 DEBUG - Glob '/sys/bus/scsi/devices/*:0:0:0/block/*' [linuxPlatform] 2022/12/02 13:56:04 DEBUG - Error getting ephermeral disk path Zero length hostID [linuxPlatform] 2022/12/02 13:56:04 INFO - Setting up ephemeral disk... 
[File System] 2022/12/02 13:56:04 DEBUG - Glob '/var/vcap/data/*' [File System] 2022/12/02 13:56:04 DEBUG - Making dir /var/vcap/data with perm 0750 [main] 2022/12/02 13:56:04 ERROR - App setup Running bootstrap: Setting up ephemeral disk: No ephemeral disk found, cannot use root partition as ephemeral disk [main] 2022/12/02 13:56:04 ERROR - Agent exited with error: Running bootstrap: Setting up ephemeral disk: No ephemeral disk found, cannot use root partition as ephemeral disk [arping] 2022/12/02 13:56:04 DEBUG - Broadcasting MAC addresses [File System] 2022/12/02 13:56:04 DEBUG - Checking if file exists /sys/class/net/eth0 [main] 2022/12/02 13:56:04 DEBUG - Starting agent [File System] 2022/12/02 13:56:04 DEBUG - Reading file /var/vcap/bosh/agent.json [File System] 2022/12/02 13:56:04 DEBUG - Read content ``` --- .../scsi_volume_id_device_path_resolver.go | 2 +- ...csi_volume_id_device_path_resolver_test.go | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver.go b/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver.go index f04640e21..8a4ad1a26 100644 --- a/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver.go +++ b/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver.go @@ -40,7 +40,7 @@ func (devicePathResolver SCSIVolumeIDDevicePathResolver) GetRealDevicePath(diskS volumeID := diskSettings.VolumeID for _, rootDevicePath := range devicePaths { - if path.Base(rootDevicePath) == "sda" { + if strings.HasPrefix(path.Base(rootDevicePath), "sd") { rootDevicePathSplits := strings.Split(rootDevicePath, "/") if len(rootDevicePathSplits) > 5 { scsiPath := rootDevicePathSplits[5] diff --git a/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver_test.go b/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver_test.go index 348476488..13304df3a 100644 --- 
a/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver_test.go +++ b/infrastructure/devicepathresolver/scsi_volume_id_device_path_resolver_test.go @@ -57,6 +57,38 @@ var _ = Describe("SCSIVolumeIDDevicePathResolver", func() { Expect(devicePath).To(Equal("/dev/sdf")) }) + Context("when root disk is not on /dev/sda", func() { + BeforeEach(func() { + fs.SetGlob("/sys/bus/scsi/devices/*:0:0:0/block/*", []string{ + "/sys/bus/scsi/devices/0:0:0:0/block/sr0", + "/sys/bus/scsi/devices/fake-host-id:0:0:0/block/sdb", + }) + + fs.SetGlob("/sys/bus/scsi/devices/fake-host-id:0:1:0/block/*", []string{ + "/sys/bus/scsi/devices/fake-host-id:0:0:0/block/sda", + }) + fs.SetGlob("/sys/bus/scsi/devices/fake-host-id:0:0:0/block/*", []string{ + "/sys/bus/scsi/devices/fake-host-id:0:0:0/block/sdb", + }) + diskSettings = boshsettings.DiskSettings{ + VolumeID: "1", + } + + }) + It("reliably detects the scsi_host", func() { + devicePath, _, err := resolver.GetRealDevicePath(diskSettings) + Expect(err).NotTo(HaveOccurred()) + Expect(devicePath).To(Equal("/dev/sda")) + + diskSettingsSystem := boshsettings.DiskSettings{ + VolumeID: "0", + } + devicePath, _, err = resolver.GetRealDevicePath(diskSettingsSystem) + Expect(err).NotTo(HaveOccurred()) + Expect(devicePath).To(Equal("/dev/sdb")) + }) + }) + Context("when device does not immediately appear", func() { It("retries detection of device", func() { fs.SetGlob("/sys/bus/scsi/devices/fake-host-id:0:fake-disk-id:0/block/*",