Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cron method to gc LFS MetaObjects #22385

Merged
merged 9 commits into from
Jan 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 22 additions & 0 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2213,6 +2213,28 @@ ROUTER = console
;SCHEDULE = @every 168h
;OLDER_THAN = 8760h

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Garbage collect LFS pointers in repositories
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;[cron.gc_lfs]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Garbage collect LFS pointers in repositories (default false)
;ENABLED = false
;; Run the garbage collection at start up time, if ENABLED (default false)
;RUN_AT_START = false
;; Interval as a duration between each gc run (default every 24h)
;SCHEDULE = @every 24h
;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
;OLDER_THAN = 168h
;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
;LAST_UPDATED_MORE_THAN_AGO = 72h
;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
;NUMBER_TO_CHECK_PER_REPO = 100
;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
;PROPORTION_TO_CHECK_PER_REPO = 0.6


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Git Operation timeout in seconds
Expand Down
10 changes: 10 additions & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,16 @@ Default templates for project boards:
- `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check.
- `OLDER_THAN`: **8760h**: any system notice older than this duration will be deleted from the database.

#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`)

- `ENABLED`: **false**: Enable service.
- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED).
- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check.
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)

## Git (`git`)

- `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment.
Expand Down
69 changes: 66 additions & 3 deletions models/git/lfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ type LFSMetaObject struct {
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
Existing bool `xorm:"-"`
CreatedUnix timeutil.TimeStamp `xorm:"created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
}

func init() {
Expand Down Expand Up @@ -334,8 +335,45 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
return lfsSize, nil
}

// IterateRepositoryIDsWithLFSMetaObjects iterates across the repositories that have LFSMetaObjects,
// calling f with each repository ID and the number of LFSMetaObjects recorded for it.
//
// Repositories are visited in ascending repository ID order and are fetched in
// batches of setting.Database.IterateBufferSize; each batch resumes after the
// last repository ID seen in the previous one. Iteration stops at the first
// error returned by the query or by f, and that error is returned unchanged.
func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error {
	batchSize := setting.Database.IterateBufferSize
	engine := db.GetEngine(ctx)

	type RepositoryCount struct {
		RepositoryID int64
		Count        int64
	}

	id := int64(0)
	for {
		counts := make([]*RepositoryCount, 0, batchSize)

		// Build and execute the query as a single chain. The builder methods
		// return the session carrying the statement, so the clauses must not be
		// set up in a separate expression whose result is thrown away:
		// Find would then run without the table, filter, grouping and ordering.
		if err := engine.Select("repository_id, COUNT(id) AS count").
			Table("lfs_meta_object").
			Where("repository_id > ?", id).
			GroupBy("repository_id").
			OrderBy("repository_id ASC").
			Limit(batchSize, 0).
			Find(&counts); err != nil {
			return err
		}
		if len(counts) == 0 {
			return nil
		}

		for _, count := range counts {
			if err := f(ctx, count.RepositoryID, count.Count); err != nil {
				return err
			}
		}

		// Resume the next batch after the highest repository ID handled so far.
		id = counts[len(counts)-1].RepositoryID
	}
}

// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo
type IterateLFSMetaObjectsForRepoOptions struct {
	// OlderThan, when non-zero, limits iteration to LFSMetaObjects whose
	// created_unix is before this time.
	OlderThan time.Time
	// UpdatedLessRecentlyThan, when non-zero, limits iteration to LFSMetaObjects
	// whose updated_unix is before this time.
	UpdatedLessRecentlyThan time.Time
	// OrderByUpdated orders results by updated_unix ascending. When false,
	// results are ordered by id ascending and each batch only selects ids
	// greater than the last id already seen.
	OrderByUpdated bool
	// LoopFunctionAlwaysUpdates stops the batch offset from advancing between
	// batches.
	// NOTE(review): presumably for callbacks that always touch updated_unix, so
	// that handled rows no longer match UpdatedLessRecentlyThan - confirm against callers.
	LoopFunctionAlwaysUpdates bool
}

// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
Expand All @@ -348,28 +386,53 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
LFSMetaObject
}

id := int64(0)

for {
beans := make([]*CountLFSMetaObject, 0, batchSize)
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
Where("`lfs_meta_object`.repository_id = ?", repoID)
if !opts.OlderThan.IsZero() {
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
}
if !opts.UpdatedLessRecentlyThan.IsZero() {
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
}
sess.GroupBy("`lfs_meta_object`.id")
if opts.OrderByUpdated {
sess.OrderBy("`lfs_meta_object`.updated_unix ASC")
} else {
sess.And("`lfs_meta_object`.id > ?", id)
sess.OrderBy("`lfs_meta_object`.id ASC")
}
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
return err
}
if len(beans) == 0 {
return nil
}
start += len(beans)
if !opts.LoopFunctionAlwaysUpdates {
start += len(beans)
}

for _, bean := range beans {
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
return err
}
}
id = beans[len(beans)-1].ID
}
}

// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject
//
// id is the ID of the LFSMetaObject row to touch. An error from the update is
// returned as is. If the update succeeds but does not affect exactly one row,
// the anomaly is logged and nil is still returned.
func MarkLFSMetaObject(ctx context.Context, id int64) error {
	obj := &LFSMetaObject{
		UpdatedUnix: timeutil.TimeStampNow(),
	}
	count, err := db.GetEngine(ctx).ID(id).Update(obj)
	if err != nil {
		// The row count is meaningless when the update itself failed; report
		// only the error rather than a misleading "updated 0" message as well.
		return err
	}
	if count != 1 {
		log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id)
	}
	return nil
}
5 changes: 5 additions & 0 deletions models/migrations/migrations.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ var migrations = []Migration{
NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts),
// v230 -> v231
NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable),

// Gitea 1.18.0 ends at v231

// v231 -> v232
NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask),
// v232 -> v233
Expand All @@ -446,6 +449,8 @@ var migrations = []Migration{
NewMigration("Create secrets table", v1_19.CreateSecretsTable),
// v237 -> v238
NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable),
// v238 -> v239
NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject),
}

// GetCurrentDBVersion returns the current db version
Expand Down
27 changes: 27 additions & 0 deletions models/migrations/v1_19/v238.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package v1_19 //nolint

import (
"code.gitea.io/gitea/modules/timeutil"

"xorm.io/xorm"
)

// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection
func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error {
// Drop the table introduced in `v211`, it's considered badly designed and doesn't look like to be used.
// See: https://github.com/go-gitea/gitea/issues/21086#issuecomment-1318217453
// LFSMetaObject stores metadata for LFS tracked files.
type LFSMetaObject struct {
ID int64 `xorm:"pk autoincr"`
Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"`
Size int64 `json:"size" xorm:"NOT NULL"`
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
CreatedUnix timeutil.TimeStamp `xorm:"created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
}

return x.Sync(new(LFSMetaObject))
lunny marked this conversation as resolved.
Show resolved Hide resolved
}
16 changes: 15 additions & 1 deletion modules/doctor/lfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package doctor
import (
"context"
"fmt"
"time"

"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
Expand All @@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool
return fmt.Errorf("LFS support is disabled")
}

if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{
Logger: logger,
AutoFix: autofix,
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
// objects.
//
// It is likely that a week is potentially excessive but it should definitely be enough that any
// unassociated LFS object is genuinely unassociated.
OlderThan: time.Now().Add(-24 * time.Hour * 7),
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
}); err != nil {
return err
}

Expand Down
1 change: 1 addition & 0 deletions options/locale/locale_en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2554,6 +2554,7 @@ dashboard.delete_old_actions = Delete all old actions from database
dashboard.delete_old_actions.started = Delete all old actions from database started.
dashboard.update_checker = Update checker
dashboard.delete_old_system_notices = Delete all old system notices from database
dashboard.gc_lfs = Garbage collect LFS meta objects

users.user_manage_panel = User Account Management
users.new_account = Create User Account
Expand Down
43 changes: 43 additions & 0 deletions services/cron/tasks_extended.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,48 @@ func registerDeleteOldSystemNotices() {
})
}

// registerGCLFS registers the "gc_lfs" cron task, which garbage collects LFS
// meta objects. Nothing is registered when the LFS server is not started.
func registerGCLFS() {
	if !setting.LFS.StartServer {
		return
	}

	// GCLFSConfig is the task configuration: the shared older-than settings
	// plus the options that are specific to LFS garbage collection.
	type GCLFSConfig struct {
		OlderThanConfig
		LastUpdatedMoreThanAgo   time.Duration
		NumberToCheckPerRepo     int64
		ProportionToCheckPerRepo float64
	}

	defaults := &GCLFSConfig{
		OlderThanConfig: OlderThanConfig{
			BaseConfig: BaseConfig{
				Enabled:    false,
				RunAtStart: false,
				Schedule:   "@every 24h",
			},
			// Only consider LFS meta objects older than a week. The order of
			// the git lfs upload and the git object upload is not guaranteed:
			// an LFS object may be uploaded before its branch is, or rapid
			// changes in new branches may leave LFS objects temporarily
			// unassociated with git objects.
			//
			// A week is probably more than is needed, but it should be long
			// enough that any unassociated LFS object is genuinely unassociated.
			OlderThan: 7 * 24 * time.Hour,
		},
		// Only GC things that haven't been looked at in the past 3 days
		LastUpdatedMoreThanAgo:   3 * 24 * time.Hour,
		NumberToCheckPerRepo:     100,
		ProportionToCheckPerRepo: 0.6,
	}

	runGC := func(ctx context.Context, _ *user_model.User, config Config) error {
		cfg := config.(*GCLFSConfig)
		// NOTE(review): NumberToCheckPerRepo and ProportionToCheckPerRepo are
		// configurable but are not forwarded to the garbage collector here -
		// confirm whether that is intended.
		opts := repo_service.GarbageCollectLFSMetaObjectsOptions{
			AutoFix:                 true,
			OlderThan:               time.Now().Add(-cfg.OlderThan),
			UpdatedLessRecentlyThan: time.Now().Add(-cfg.LastUpdatedMoreThanAgo),
		}
		return repo_service.GarbageCollectLFSMetaObjects(ctx, opts)
	}

	RegisterTaskFatal("gc_lfs", defaults, runGC)
}

func initExtendedTasks() {
registerDeleteInactiveUsers()
registerDeleteRepositoryArchives()
Expand All @@ -188,4 +230,5 @@ func initExtendedTasks() {
registerDeleteOldActions()
registerUpdateGiteaChecker()
registerDeleteOldSystemNotices()
registerGCLFS()
}