Skip to content

Commit

Permalink
Migrate from using io.ReadSeeker to io.ReaderAt.
Browse files Browse the repository at this point in the history
This provides the following benefits:
- We can now use pkg/fd package which does not take ownership
  of the file descriptor. So it does not close the fd when garbage collected.
  This reduces the scope of errors from unexpected garbage collection of os.File.
- It enforces the offset parameter in every read call.
  It does not affect the fd offset, nor is it affected by it, hence reducing
  the scope for errors from using stale offsets when reading.
- We do not need to serialize the usage of any global file descriptor anymore.
  So this drops the mutual exclusion requirement, hence reducing complexity and
  congestion.

PiperOrigin-RevId: 260635174
  • Loading branch information
ayushr2 authored and gvisor-bot committed Jul 30, 2019
1 parent ddf25e3 commit 8da9f8a
Show file tree
Hide file tree
Showing 12 changed files with 55 additions and 96 deletions.
1 change: 1 addition & 0 deletions pkg/sentry/fs/ext/BUILD
Expand Up @@ -37,6 +37,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/fd",
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",
Expand Down
8 changes: 6 additions & 2 deletions pkg/sentry/fs/ext/block_map_file.go
Expand Up @@ -16,6 +16,7 @@ package ext

import (
"io"
"sync"

"gvisor.dev/gvisor/pkg/binary"
)
Expand All @@ -25,18 +26,21 @@ import (
type blockMapFile struct {
regFile regularFile

// mu serializes changes to fileToPhysBlks.
mu sync.RWMutex

// fileToPhysBlks maps the file block numbers to the physical block numbers.
// The physical block number for the (i)th file block is stored in the (i)th
// index. This is initialized (at max) with the first 12 entries. The rest
// have to be read in from disk when required.
// have to be read in from disk when required. Protected by mu.
fileToPhysBlks []uint32
}

// Compiles only if blockMapFile implements fileReader.
var _ fileReader = (*blockMapFile)(nil)

// getFileReader implements fileReader.getFileReader.
func (f *blockMapFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
panic("unimplemented")
}

Expand Down
23 changes: 11 additions & 12 deletions pkg/sentry/fs/ext/ext.go
Expand Up @@ -19,9 +19,9 @@ import (
"errors"
"fmt"
"io"
"os"

"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
Expand All @@ -35,32 +35,31 @@ type filesystemType struct{}
// Compiles only if filesystemType implements vfs.FilesystemType.
var _ vfs.FilesystemType = (*filesystemType)(nil)

// getDeviceFd returns the read seeker to the underlying device.
// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
// 2. Expose the device to the container and mount it from application layer.
func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
if opts.InternalData == nil {
// User mount call.
// TODO(b/134676337): Open the device specified by `source` and return that.
panic("unimplemented")
}

// NewFilesystem call originated from within the sentry.
fd, ok := opts.InternalData.(uintptr)
devFd, ok := opts.InternalData.(int)
if !ok {
return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
}

// We do not close this file because that would close the underlying device
// file descriptor (which is required for reading the fs from disk).
// TODO(b/134676337): Use pkg/fd instead.
deviceFile := os.NewFile(fd, source)
if deviceFile == nil {
return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
if devFd < 0 {
return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
}

return deviceFile, nil
// The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
// of the file descriptor and hence will not close it when it is garbage
// collected.
return fd.NewReadWriter(devFd), nil
}

// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/fs/ext/ext_test.go
Expand Up @@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v

// Mount the ext4 fs and retrieve the inode structure for the file.
mockCtx := contexttest.Context(t)
fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err
Expand Down
25 changes: 11 additions & 14 deletions pkg/sentry/fs/ext/extent_file.go
Expand Up @@ -36,7 +36,7 @@ type extentFile struct {
var _ fileReader = (*extentFile)(nil)

// getFileReader implements fileReader.getFileReader.
func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
return &extentReader{
dev: dev,
file: f,
Expand All @@ -47,10 +47,8 @@ func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uin

// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*extentFile, error) {
func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) {
file := &extentFile{regFile: regFile}
file.regFile.impl = file
err := file.buildExtTree(dev, blkSize)
Expand All @@ -65,10 +63,8 @@ func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*ext
// memory. Then it recursively builds the rest of the tree by reading it off
// disk.
//
// Preconditions:
// - Must hold the mutex of the filesystem containing dev.
// - Inode flag InExtents must be set.
func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
// Precondition: inode flag InExtents must be set.
func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error {
rootNodeData := f.regFile.inode.diskInode.Data()

binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
Expand Down Expand Up @@ -110,9 +106,7 @@ func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
// by the ExtentEntry.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
var header disklayout.ExtentHeader
off := entry.PhysicalBlock() * blkSize
err := readFromDisk(dev, int64(off), &header)
Expand Down Expand Up @@ -155,7 +149,7 @@ func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSi
// extentReader implements io.Reader which can traverse the extent tree and
// read file data. This is not thread safe.
type extentReader struct {
dev io.ReadSeeker
dev io.ReaderAt
file *extentFile
fileOff uint64 // Represents the current file offset being read from.
blkSize uint64
Expand Down Expand Up @@ -247,9 +241,12 @@ func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, e
toRead = len(dst)
}

n, err := readFull(r.dev, int64(readStart), dst[:toRead])
n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart))
r.fileOff += uint64(n)
return n, err
if n < toRead {
return n, syserror.EIO
}
return n, nil
}

// fileBlock returns the file block number we are currently reading.
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/fs/ext/extent_test.go
Expand Up @@ -201,7 +201,7 @@ func TestBuildExtentTree(t *testing.T) {
// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
// tree. It also constructs a mock extent file with the same tree built in it.
// It also writes random file data and returns it.
func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReadSeeker, *extentFile, []byte) {
func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) {
t.Helper()

mockDisk := make([]byte, mockExtentBlkSize*10)
Expand Down
26 changes: 10 additions & 16 deletions pkg/sentry/fs/ext/filesystem.go
Expand Up @@ -31,22 +31,16 @@ type filesystem struct {

vfsfs vfs.Filesystem

// mu serializes changes to the Dentry tree and the usage of the read seeker.
mu sync.Mutex
// mu serializes changes to the Dentry tree.
mu sync.RWMutex

// dev is the ReadSeeker for the underlying fs device. It is protected by mu.
//
// The ext filesystems aim to maximize locality, i.e. place all the data
// blocks of a file close together. On a spinning disk, locality reduces the
// amount of movement of the head hence speeding up IO operations. On an SSD
// there are no moving parts but locality increases the size of each transfer
// request. Hence, having mutual exclusion on the read seeker while reading a
// file *should* help in achieving the intended performance gains.
//
// Note: This synchronization was not coupled with the ReadSeeker itself
// because we want to synchronize across read/seek operations for the
// performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
dev io.ReadSeeker
// dev is the io.ReaderAt for the underlying fs device. It does not require
// protection because io.ReaderAt permits concurrent read calls to it. It
// translates to the pread syscall which passes on the read request directly
// to the device driver. Device drivers are intelligent in serving multiple
// concurrent read requests in the optimal order (taking locality into
// consideration).
dev io.ReaderAt

// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
Expand All @@ -69,7 +63,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// getOrCreateInode gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
//
// Preconditions: must be holding fs.mu.
// Precondition: must be holding fs.mu.
func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/fs/ext/inline_file.go
Expand Up @@ -28,7 +28,7 @@ type inlineFile struct {
var _ fileReader = (*inlineFile)(nil)

// getFileReader implements fileReader.getFileReader.
func (f *inlineFile) getFileReader(_ io.ReadSeeker, _ uint64, offset uint64) io.Reader {
func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader {
diskInode := f.regFile.inode.diskInode
return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]}
}
Expand Down
6 changes: 2 additions & 4 deletions pkg/sentry/fs/ext/inode.go
Expand Up @@ -75,7 +75,7 @@ func (in *inode) tryIncRef() bool {
// decRef decrements the inode ref count and releases the inode resources if
// the ref count hits 0.
//
// Preconditions: Must have locked fs.mu.
// Precondition: Must have locked fs.mu.
func (in *inode) decRef(fs *filesystem) {
if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
delete(fs.inodeCache, in.inodeNum)
Expand All @@ -86,9 +86,7 @@ func (in *inode) decRef(fs *filesystem) {

// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newInode(ctx context.Context, dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}
Expand Down
9 changes: 2 additions & 7 deletions pkg/sentry/fs/ext/regular_file.go
Expand Up @@ -29,10 +29,7 @@ type fileReader interface {
//
// This reader is not meant to be retained across Read operations as it needs
// to be reinitialized with the correct offset for every Read.
//
// Precondition: Must hold the mutex of the filesystem containing dev while
// using the Reader.
getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader
getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader
}

// regularFile represents a regular file's inode. This too follows the
Expand All @@ -48,9 +45,7 @@ type regularFile struct {

// newRegularFile is the regularFile constructor. It figures out what kind of
// file this is and initializes the fileReader.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newRegularFile(dev io.ReadSeeker, blkSize uint64, inode inode) (*regularFile, error) {
func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) {
regFile := regularFile{
inode: inode,
}
Expand Down
4 changes: 1 addition & 3 deletions pkg/sentry/fs/ext/symlink.go
Expand Up @@ -28,9 +28,7 @@ type symlink struct {

// newSymlink is the symlink constructor. It reads out the symlink target from
// the inode (however it might have been stored).
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newSymlink(dev io.ReadSeeker, blkSize uint64, inode inode) (*symlink, error) {
func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) {
var file *symlink
var link []byte

Expand Down
43 changes: 8 additions & 35 deletions pkg/sentry/fs/ext/utils.go
Expand Up @@ -15,55 +15,30 @@
package ext

import (
"encoding/binary"
"io"

"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)

// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
//
// All disk reads should use this helper so we avoid reading from stale
// previously used offsets. This function forces the offset parameter.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
return syserror.EIO
}

if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
n := binary.Size(v)
buf := make([]byte, n)
if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}

binary.Unmarshal(buf, binary.LittleEndian, v)
return nil
}

// readFull is a wrapper around io.ReadFull which enforces the absolute offset
// parameter so that we can ensure that we do not perform incorrect reads from
// stale previously used offsets.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readFull(dev io.ReadSeeker, abOff int64, dst []byte) (int, error) {
if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
return 0, syserror.EIO
}

n, err := io.ReadFull(dev, dst)
if err != nil {
err = syserror.EIO
}
return n, err
}

// readSuperBlock reads the SuperBlock from block group 0 in the underlying
// device. There are three versions of the superblock. This function identifies
// and returns the correct version.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
return nil, err
Expand Down Expand Up @@ -98,9 +73,7 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {

// readBlockGroups reads the block group descriptor table from block group 0 in
// the underlying device.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
bgCount := blockGroupsCount(sb)
bgdSize := uint64(sb.BgDescSize())
is64Bit := sb.IncompatibleFeatures().Is64Bit
Expand Down

0 comments on commit 8da9f8a

Please sign in to comment.