diff --git a/pkg/sentry/fsimpl/erofs/BUILD b/pkg/sentry/fsimpl/erofs/BUILD index b452af8c4b..4eff45de67 100644 --- a/pkg/sentry/fsimpl/erofs/BUILD +++ b/pkg/sentry/fsimpl/erofs/BUILD @@ -18,6 +18,18 @@ go_template_instance( }, ) +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "erofs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + go_template_instance( name = "dentry_refs", out = "dentry_refs.go", @@ -43,6 +55,7 @@ go_template_instance( go_library( name = "erofs", srcs = [ + "dentry_list.go", "dentry_refs.go", "directory.go", "erofs.go", diff --git a/pkg/sentry/fsimpl/erofs/directory.go b/pkg/sentry/fsimpl/erofs/directory.go index 13203375a6..4ee3b032bc 100644 --- a/pkg/sentry/fsimpl/erofs/directory.go +++ b/pkg/sentry/fsimpl/erofs/directory.go @@ -86,19 +86,23 @@ func (i *inode) lookup(name string) (uint64, error) { return dirents[idx].Ino, nil } +// lookup returns a ref'd dentry for the child with the given name. func (d *dentry) lookup(ctx context.Context, name string) (*dentry, error) { // Fast path, dentry already exists. d.dirMu.RLock() child, ok := d.childMap[name] - d.dirMu.RUnlock() if ok { + child.IncRef() + d.dirMu.RUnlock() return child, nil } + d.dirMu.RUnlock() // Slow path, create a new dentry. d.dirMu.Lock() defer d.dirMu.Unlock() if child, ok := d.childMap[name]; ok { + child.IncRef() return child, nil } @@ -117,7 +121,18 @@ func (d *dentry) lookup(ctx context.Context, name string) (*dentry, error) { } child.parent.Store(d) child.name = name + + // newDentry returned refs=1, that's the parent's ref via childMap. d.childMap[name] = child + + // Return another ref to caller. + child.IncRef() + + // Parent is no longer a leaf, remove from cache. We use uncacheSelf() + // instead of checkCaching() because we hold d.dirMu and checkCaching() + // tries to acquire d.dirMu.RLock(). + d.uncacheSelf() + return child, nil } diff --git a/pkg/sentry/fsimpl/erofs/erofs.go b/pkg/sentry/fsimpl/erofs/erofs.go index 7e1bff57b3..0d50a17cdd 100644 --- a/pkg/sentry/fsimpl/erofs/erofs.go +++ b/pkg/sentry/fsimpl/erofs/erofs.go @@ -47,6 +47,8 @@ const ( // +stateify savable type FilesystemType struct{} +const defaultMaxCachedDentries = 1000 + // filesystem implements vfs.FilesystemImpl. // // +stateify savable @@ -76,6 +78,13 @@ type filesystem struct { // ancestryMu is required by genericfstree. ancestryMu sync.RWMutex `state:"nosave"` + + cacheMu sync.Mutex `state:"nosave"` + // +checklocks:cacheMu + cachedDentries dentryList + // +checklocks:cacheMu + cachedDentriesLen uint64 + maxCachedDentries uint64 } // InternalFilesystemOptions may be passed as @@ -128,11 +137,12 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs := &filesystem{ - mopts: opts.Data, - iopts: iopts, - image: image, - devMinor: devMinor, - mf: imageMemmapFile{image: image}, + mopts: opts.Data, + iopts: iopts, + image: image, + devMinor: devMinor, + mf: imageMemmapFile{image: image}, + maxCachedDentries: defaultMaxCachedDentries, } fs.vfsfs.Init(vfsObj, &fstype, fs) cu.Add(func() { fs.vfsfs.DecRef(ctx) }) @@ -359,8 +369,7 @@ func (i *inode) fileType() uint16 { // dentry implements vfs.DentryImpl. // -// The filesystem is read-only and currently we never drop the cached dentries -// until the filesystem is unmounted. The reference model works like this: +// The reference model works like this: // // - The initial reference count of each dentry is one, which is the reference // held by the parent (so when the reference count is one, it also means that @@ -372,8 +381,9 @@ func (i *inode) fileType() uint16 { // - The reference count of root dentry is two. One reference is returned to // the caller of `GetFilesystem()`, and the other is held by `fs`. // -// TODO: This can lead to unbounded memory growth in sentry due to the ever-growing -// dentry tree. We should have a dentry LRU cache, similar to what fsimpl/gofer does. +// - Dentries with refs==1 and no children are placed on an LRU cache list. +// When the cache is full, the least recently used dentry is evicted by +// removing it from its parent's childMap and DecRef'ing it. // // +stateify savable type dentry struct { @@ -400,6 +410,12 @@ type dentry struct { // dentry represents a directory. // +checklocks:dirMu childMap map[string]*dentry + + // cached indicates whether this dentry is on the fs.cachedDentries LRU list. + // +checklocks:inode.fs.cacheMu + cached bool + + dentryEntry } // The caller is expected to handle dentry insertion into dentry tree. @@ -418,15 +434,140 @@ func (fs *filesystem) newDentry(nid uint64) (*dentry, error) { // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { + destroyed := false d.dentryRefs.DecRef(func() { - d.dirMu.Lock() - for _, c := range d.childMap { - c.DecRef(ctx) - } - d.childMap = nil - d.dirMu.Unlock() - d.inode.DecRef(ctx) + destroyed = true + d.destroy(ctx) }) + if !destroyed { + d.checkCaching(ctx) + } +} + +// checkCaching evaluates whether d should be on the LRU cache list. +// A dentry is cache-eligible when refs == 1 (only the parent's reference via +// childMap) and it has no children. Called after every DecRef that does not +// destroy d. +// +// Lock ordering: dirMu before cacheMu. +func (d *dentry) checkCaching(ctx context.Context) { + fs := d.inode.fs + + // Hold dirMu.RLock across cacheMu so that hasChildren and refs are + // read atomically. Without this, a concurrent lookup could add a child + // between reading hasChildren and locking cacheMu, causing a dentry + // with children to incorrectly enter the cache. + d.dirMu.RLock() + fs.cacheMu.Lock() + + hasChildren := len(d.childMap) > 0 + refs := d.ReadRefs() + if refs <= 0 { + fs.cacheMu.Unlock() + d.dirMu.RUnlock() + return + } + + // Not eligible for caching — remove from cache if it was there. + if refs > 1 || hasChildren { + if d.cached { + fs.cachedDentries.Remove(d) + fs.cachedDentriesLen-- + d.cached = false + } + fs.cacheMu.Unlock() + d.dirMu.RUnlock() + return + } + + // Already cached — move to front (most recently used). + if d.cached { + fs.cachedDentries.Remove(d) + fs.cachedDentries.PushFront(d) + fs.cacheMu.Unlock() + d.dirMu.RUnlock() + return + } + + // Add to cache. + fs.cachedDentries.PushFront(d) + fs.cachedDentriesLen++ + d.cached = true + shouldEvict := fs.cachedDentriesLen > fs.maxCachedDentries + fs.cacheMu.Unlock() + d.dirMu.RUnlock() + + if shouldEvict { + fs.evictLRUDentry(ctx) + } +} + +// uncacheSelf removes d from the LRU cache list without severing d from the +// dentry tree (d stays in its parent's childMap). Used when d is no longer +// cache-eligible because it just gained a child. +// +// Precondition: the caller holds d.dirMu. +func (d *dentry) uncacheSelf() { + fs := d.inode.fs + fs.cacheMu.Lock() + if d.cached { + fs.cachedDentries.Remove(d) + fs.cachedDentriesLen-- + d.cached = false + } + fs.cacheMu.Unlock() +} + +// evictLRUDentry removes the least recently used dentry from the cache and +// severs it from the dentry tree by deleting it from its parent's childMap and +// dropping the parent's reference. +func (fs *filesystem) evictLRUDentry(ctx context.Context) { + // Pop the least recently used dentry from the cache. + fs.cacheMu.Lock() + victim := fs.cachedDentries.Back() + if victim == nil { + fs.cacheMu.Unlock() + return + } + fs.cachedDentries.Remove(victim) + fs.cachedDentriesLen-- + victim.cached = false // +checklocksforce: fs == victim.inode.fs. + fs.cacheMu.Unlock() + + parent := victim.parent.Load() + if parent == nil { + return + } + + // Revalidate under both locks (lock ordering: dirMu before cacheMu). + // Between removing victim from the cache and acquiring these locks, a + // concurrent lookup may have taken and released a ref on victim, causing + // checkCaching to re-add it to the LRU. A concurrent lookup may also + // still be holding an extra ref. In either case, abort the eviction. + parent.dirMu.Lock() + fs.cacheMu.Lock() + if victim.cached || victim.ReadRefs() > 1 { + fs.cacheMu.Unlock() + parent.dirMu.Unlock() + return + } + fs.cacheMu.Unlock() + + // Sever victim from the dentry tree and destroy it. + delete(parent.childMap, victim.name) + parent.dirMu.Unlock() + victim.DecRef(ctx) + parent.checkCaching(ctx) +} + +func (d *dentry) destroy(ctx context.Context) { + d.dirMu.Lock() + for _, child := range d.childMap { + child.DecRef(ctx) + } + d.childMap = nil + d.dirMu.Unlock() + d.inode.DecRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. diff --git a/pkg/sentry/fsimpl/erofs/filesystem.go b/pkg/sentry/fsimpl/erofs/filesystem.go index 14b5ac5bf5..0d537ff288 100644 --- a/pkg/sentry/fsimpl/erofs/filesystem.go +++ b/pkg/sentry/fsimpl/erofs/filesystem.go @@ -26,6 +26,7 @@ import ( ) // step resolves rp.Component() to an existing file, starting from the given directory. +// Returns a ref'd dentry. // // step is loosely analogous to fs/namei.c:walk_component(). // @@ -41,6 +42,7 @@ func step(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, name := rp.Component() if name == "." { rp.Advance() + d.IncRef() return d, false, nil } if name == ".." { @@ -49,12 +51,14 @@ func step(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, return nil, false, err } else if isRoot || parent == nil { rp.Advance() + d.IncRef() return d, false, nil } if err := rp.CheckMount(ctx, &parent.vfsd); err != nil { return nil, false, err } rp.Advance() + parent.IncRef() return parent, false, nil } if len(name) > erofs.MaxNameLen { @@ -65,14 +69,20 @@ func step(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, return nil, false, err } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { + child.DecRef(ctx) return nil, false, err } if child.inode.IsSymlink() && rp.ShouldFollowSymlink() { target, err := child.inode.Readlink() if err != nil { + child.DecRef(ctx) return nil, false, err } + child.DecRef(ctx) followedSymlink, err := rp.HandleSymlink(target) + if followedSymlink && err == nil { + d.IncRef() + } return d, followedSymlink, err } rp.Advance() @@ -80,40 +90,48 @@ func step(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, } // walkParentDir resolves all but the last path component of rp to an existing -// directory, starting from the gvien directory. It does not check that the +// directory, starting from the given directory. It does not check that the // returned directory is searchable by the provider of rp. // +// The caller is responsible for DecRef'ing the returned dentry. +// // walkParentDir is loosely analogous to Linux's fs/namei.c:path_parentat(). // // Preconditions: // - !rp.Done(). func walkParentDir(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + d.IncRef() for !rp.Final() { next, _, err := step(ctx, rp, d) + d.DecRef(ctx) if err != nil { return nil, err } d = next } if !d.inode.IsDir() { + d.DecRef(ctx) return nil, linuxerr.ENOTDIR } return d, nil } -// resolve resolves rp to an existing file. +// resolve resolves rp to an existing file; returns a ref'd dentry. // // resolve is loosely analogous to Linux's fs/namei.c:path_lookupat(). func resolve(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) + d.IncRef() for !rp.Done() { next, _, err := step(ctx, rp, d) + d.DecRef(ctx) if err != nil { return nil, err } d = next } if rp.MustBeDir() && !d.inode.IsDir() { + d.DecRef(ctx) return nil, linuxerr.ENOTDIR } return d, nil @@ -132,6 +150,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } + defer parentDir.DecRef(ctx) // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -144,7 +163,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if len(name) > erofs.MaxNameLen { return linuxerr.ENAMETOOLONG } - if _, err := parentDir.lookup(ctx, name); err == nil { + child, err := parentDir.lookup(ctx, name) + if err == nil { + child.DecRef(ctx) return linuxerr.EEXIST } else if !linuxerr.Equals(linuxerr.ENOENT, err) { return err @@ -166,6 +187,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds if err != nil { return err } + defer d.DecRef(ctx) if ats.MayWrite() { return linuxerr.EROFS } @@ -180,13 +202,14 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } if opts.CheckSearchable { if !d.inode.IsDir() { + d.DecRef(ctx) return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + d.DecRef(ctx) return nil, err } } - d.IncRef() return &d.vfsd, nil } @@ -196,7 +219,6 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa if err != nil { return nil, err } - dir.IncRef() return &dir.vfsd, nil } @@ -226,7 +248,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return nil, err } - return d.open(ctx, rp, &opts) + fd, err := d.open(ctx, rp, &opts) + d.DecRef(ctx) + return fd, err } mustCreate := opts.Flags&linux.O_EXCL != 0 @@ -239,8 +263,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, linuxerr.EEXIST } - return start.open(ctx, rp, &opts) + start.IncRef() + fd, err := start.open(ctx, rp, &opts) + start.DecRef(ctx) + return fd, err } + afterTrailingSymlink: parentDir, err := walkParentDir(ctx, rp, start) if err != nil { @@ -248,14 +276,19 @@ afterTrailingSymlink: } // Check for search permission in the parent directory. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + parentDir.DecRef(ctx) return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { + parentDir.DecRef(ctx) return nil, linuxerr.EISDIR } child, followedSymlink, err := step(ctx, rp, parentDir) if followedSymlink { + // step returned child == parentDir with an extra IncRef. + child.DecRef(ctx) + parentDir.DecRef(ctx) if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST @@ -268,6 +301,7 @@ afterTrailingSymlink: start = parentDir goto afterTrailingSymlink } + parentDir.DecRef(ctx) if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, linuxerr.EROFS } @@ -275,12 +309,16 @@ afterTrailingSymlink: return nil, err } if mustCreate { + child.DecRef(ctx) return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.inode.IsDir() { + child.DecRef(ctx) return nil, linuxerr.ENOTDIR } - return child.open(ctx, rp, &opts) + fd, err := child.open(ctx, rp, &opts) + child.DecRef(ctx) + return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -289,6 +327,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if err != nil { return "", err } + defer d.DecRef(ctx) return d.inode.Readlink() } @@ -299,6 +338,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } + defer newParentDir.DecRef(ctx) newName := rp.Component() if len(newName) > erofs.MaxNameLen { return linuxerr.ENAMETOOLONG @@ -323,6 +363,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } + defer parentDir.DecRef(ctx) if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } @@ -338,9 +379,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return err } + d.DecRef(ctx) return linuxerr.EROFS } @@ -350,6 +393,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + defer d.DecRef(ctx) var stat linux.Statx d.inode.statTo(&stat) return stat, nil @@ -357,9 +401,11 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return linux.Statfs{}, err } + d.DecRef(ctx) return fs.statFS(), nil } @@ -374,6 +420,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } + defer parentDir.DecRef(ctx) if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } @@ -390,6 +437,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if err != nil { return nil, err } + defer d.DecRef(ctx) if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } @@ -398,33 +446,41 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return nil, err } + d.DecRef(ctx) return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return "", err } + d.DecRef(ctx) return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return err } + d.DecRef(ctx) return linuxerr.EROFS } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - if _, err := resolve(ctx, rp); err != nil { + d, err := resolve(ctx, rp) + if err != nil { return err } + d.DecRef(ctx) return linuxerr.EROFS }