Skip to content

Commit

Permalink
yield cpu to other procs while flashing
Browse files Browse the repository at this point in the history
Summary:
I think we have been observing occasional watchdog resets while flashing
bletchley OpenBMC devices, which effectively bricks the device.

flashy was both wiping and flashing MTD devices with a single system call.
I suspect the underlying device driver / file system code does not include a
call to yield the CPU and therefore will keep the CPU in kernel mode for as
long as it takes to erase/write the flash, giving no opportunity for other
timeshared processes to run on the BMC.  If this occurred, systemd would
be unable to tickle the watchdog timer.  Indeed while flashing I have
observed it taking a very long time (~1m) to log into the BMC over SSH.

Give other processes the chance to run by splitting both operations up into
smaller units of work (i.e. many more system calls needed).

Test Plan:
```
0 ~/local/openbmc/tools/flashy $ ./build.sh && ./build_dev.sh && go test ./...
ok      github.com/facebook/openbmc/tools/flashy        (cached)
ok      github.com/facebook/openbmc/tools/flashy/checks_and_remediations/bletchley      (cached)
ok      github.com/facebook/openbmc/tools/flashy/checks_and_remediations/common (cached)
ok      github.com/facebook/openbmc/tools/flashy/checks_and_remediations/galaxy100      (cached)
ok      github.com/facebook/openbmc/tools/flashy/checks_and_remediations/wedge100       (cached)
ok      github.com/facebook/openbmc/tools/flashy/checks_and_remediations/yamp   (cached)
?       github.com/facebook/openbmc/tools/flashy/flash_procedure        [no test files]
ok      github.com/facebook/openbmc/tools/flashy/install        (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/fileutils  (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/flash      (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/flash/flashcp      (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/flash/flashutils   (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/flash/flashutils/devices   (cached)
?       github.com/facebook/openbmc/tools/flashy/lib/logger     [no test files]
ok      github.com/facebook/openbmc/tools/flashy/lib/step       (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/utils      (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/validate   (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/validate/image     (cached)
ok      github.com/facebook/openbmc/tools/flashy/lib/validate/partition (cached)
?       github.com/facebook/openbmc/tools/flashy/tests  [no test files]
?       github.com/facebook/openbmc/tools/flashy/utilities      [no test files]
0 ~/local/openbmc/tools/flashy $ echo $?
0
```

Reviewed By: williamspatrick

Differential Revision: D45150174

fbshipit-source-id: 535b630225c964bbf16ed230610eeddba51e0c67
  • Loading branch information
doranand authored and facebook-github-bot committed Apr 20, 2023
1 parent c765dce commit 03759b7
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 29 deletions.
66 changes: 43 additions & 23 deletions tools/flashy/lib/flash/flashcp/flashcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,22 +273,26 @@ var eraseFlashDevice = func(
return errors.Errorf("Failed to get erase length: %v", err)
}
// length if erasesize is 0 (won't over/under-flow here due to m.erasesize > 0)
imageErasesizeLength := uint32((imageAndEraseSize-1)/m.erasesize) * m.erasesize
eraseLength := imageErasesizeLength - eraseStart
eraseEnd := uint32((imageAndEraseSize-1)/m.erasesize) * m.erasesize

// erase the flash one block at a time, like flash_eraseall does.
// this used to wipe the entire device with a single ioctl, but
// that could lock up the system in kernel mode for the duration.
log.Printf("Erasing flash device: start: %v, length: %v (end: %v)",
eraseStart, eraseLength, eraseStart+eraseLength)
e := erase_info_user{
start: eraseStart,
length: eraseLength,
}
eraseStart, eraseEnd-eraseStart, eraseEnd)
for off := uint32(eraseStart); off < eraseEnd; off += m.erasesize {
e := erase_info_user{
start: off,
length: m.erasesize,
}

err = IOCTL(deviceFile.Fd(), MEMERASE, uintptr(unsafe.Pointer(&e)))
if err != nil {
errMsg := fmt.Sprintf("Flash device '%v' erase failed: %v",
deviceFile.Name(), err)
log.Print(errMsg)
return errors.Errorf("%v", errMsg)
err = IOCTL(deviceFile.Fd(), MEMERASE, uintptr(unsafe.Pointer(&e)))
if err != nil {
errMsg := fmt.Sprintf("Flash device '%v' erase failed: %v",
deviceFile.Name(), err)
log.Print(errMsg)
return errors.Errorf("%v", errMsg)
}
}

log.Printf("Finished erasing flash device '%v'", deviceFile.Name())
Expand All @@ -302,20 +306,36 @@ var flashImage = func(
imFile imageFile,
roOffset uint32,
) error {
const chunkSize = uint32(1048576)
fileSize := uint32(len(imFile.data))

log.Printf("Flashing image '%v' on to flash device '%v'", imFile.name, deviceFile.Name())

activeImageData, err := utils.BytesSliceRange(imFile.data, roOffset, uint32(len(imFile.data)))
if err != nil {
return errors.Errorf("Unable to get image data after roOffset (%v): %v", roOffset, err)
if roOffset >= fileSize {
return errors.Errorf("roOffset (%v) >= file size (%v)", roOffset, fileSize)
}

// use Pwrite, WriteAt may call Pwrite multiple times under the hood
n, err := fileutils.Pwrite(int(deviceFile.Fd()), activeImageData, int64(roOffset))
if err != nil {
return errors.Errorf("Failed to flash image '%v' on to flash device '%v': "+
"%vB flashed: %v",
imFile.name, deviceFile.Name(), n, err,
)
// flash the device in many large chunks rather than attempting to
// do it all with a single system call. prevents us from locking
// up the system in kernel mode in the event of a kernel bug.
for off := roOffset; off < fileSize; off += chunkSize {
size := uint32(fileSize - off)
if size > chunkSize {
size = chunkSize
}
activeImageData, err := utils.BytesSliceRange(imFile.data, off, off + size)
if err != nil {
return errors.Errorf("Unable to get image data after roOffset (%v): %v", roOffset, err)
}

// use Pwrite, WriteAt may call Pwrite multiple times under the hood
n, err := fileutils.Pwrite(int(deviceFile.Fd()), activeImageData, int64(off))
if err != nil {
return errors.Errorf("Failed to flash image '%v' on to flash device '%v': "+
"%vB flashed: %v",
imFile.name, deviceFile.Name(), n, err,
)
}
}

log.Printf("Finished flashing image '%v' on to flash device '%v'",
Expand Down
17 changes: 11 additions & 6 deletions tools/flashy/lib/flash/flashcp/flashcp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ func TestEraseFlashDevice(t *testing.T) {
roOffset: 0,
mtdErasesize: 4,
wantEraseStart: 0,
wantEraseLen: 8,
wantEraseLen: 4,
ioctlErr: nil,
want: nil,
},
Expand All @@ -484,7 +484,7 @@ func TestEraseFlashDevice(t *testing.T) {
roOffset: 0,
mtdErasesize: 4,
wantEraseStart: 0,
wantEraseLen: 8,
wantEraseLen: 4,
ioctlErr: errors.Errorf("erase failed"),
want: errors.Errorf("Flash device '/dev/mtd5' erase failed: erase failed"),
},
Expand All @@ -502,7 +502,7 @@ func TestEraseFlashDevice(t *testing.T) {
roOffset: 2,
mtdErasesize: 4,
wantEraseStart: 0,
wantEraseLen: 8,
wantEraseLen: 4,
ioctlErr: nil,
want: nil,
},
Expand All @@ -511,7 +511,7 @@ func TestEraseFlashDevice(t *testing.T) {
roOffset: 1,
mtdErasesize: 2,
wantEraseStart: 0,
wantEraseLen: 6,
wantEraseLen: 2,
ioctlErr: nil,
want: nil,
},
Expand All @@ -520,7 +520,7 @@ func TestEraseFlashDevice(t *testing.T) {
roOffset: 0,
mtdErasesize: math.MaxUint32 - 4,
wantEraseStart: 0,
wantEraseLen: 8,
wantEraseLen: math.MaxUint32 - 4,
ioctlErr: nil,
want: errors.Errorf("Failed to get erase length: Unsigned integer overflow for (6+4294967291)"),
},
Expand All @@ -535,7 +535,12 @@ func TestEraseFlashDevice(t *testing.T) {
m := mtd_info_user{
erasesize: tc.mtdErasesize,
}
again := false
IOCTL = func(fd, name, data uintptr) error {
if again {
return tc.ioctlErr
}
again = true
if fd != 42 {
t.Errorf("fd: want '42' got '%v'", fd)
}
Expand Down Expand Up @@ -602,7 +607,7 @@ func TestFlashImage(t *testing.T) {
name: "roOffset too large",
roOffset: 100,
writeErr: nil,
want: errors.Errorf("Unable to get image data after roOffset (100): Slice start (100) > end (6)"),
want: errors.Errorf("roOffset (100) >= file size (6)"),
},
}
for _, tc := range cases {
Expand Down

0 comments on commit 03759b7

Please sign in to comment.