From cd6e1afb51b140938a887fc211d6edaca3e3d270 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Tue, 10 Mar 2026 23:52:45 -0400 Subject: [PATCH] Upgrade DuckDB to 1.5.0 + automatic parquet compaction + monthly retention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DuckDB 1.5.0: non-blocking checkpointing, free block reuse, 17% throughput improvement. Storage format v67→v68 upgrades transparently on first open. ArchiveService: compact per-cycle parquet files into monthly files (YYYYMM_tablename.parquet) after each archive cycle. Strips dead query_plan_text column from query_store_stats during compaction. Uses in-memory DuckDB connection — no contention with collectors. Reduces steady-state archive from thousands of files to ~75 (25 tables × 3 months). RetentionService: switch from 90-day file deletion to 3-month monthly file deletion. Recognizes all naming formats (YYYYMM_, YYYYMMDD_, YYYY-MM_). Benchmarked: v_wait_stats query dropped from 1.7s to 0.03s after compaction. Lite refresh stable at 1.3-1.5s (down from 6-13s pre-optimization). Co-Authored-By: Claude Opus 4.6 --- Lite/PerformanceMonitorLite.csproj | 4 +- Lite/Services/ArchiveService.cs | 225 +++++++++++++++++++ Lite/Services/CollectionBackgroundService.cs | 2 +- Lite/Services/RetentionService.cs | 49 ++-- 4 files changed, 259 insertions(+), 21 deletions(-) diff --git a/Lite/PerformanceMonitorLite.csproj b/Lite/PerformanceMonitorLite.csproj index a40ad9d1..d0be7ba2 100644 --- a/Lite/PerformanceMonitorLite.csproj +++ b/Lite/PerformanceMonitorLite.csproj @@ -36,8 +36,8 @@ - - + + diff --git a/Lite/Services/ArchiveService.cs b/Lite/Services/ArchiveService.cs index c80ba201..cbc199f9 100644 --- a/Lite/Services/ArchiveService.cs +++ b/Lite/Services/ArchiveService.cs @@ -7,7 +7,11 @@ */ using System; +using System.Collections.Generic; +using System.Globalization; using System.IO; +using System.Linq; +using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; using DuckDB.NET.Data; @@ -134,6 +138,9 @@ Archive views use glob (*_table.parquet) to pick up all files. */ } } + /* Compact per-cycle files into monthly parquet before refreshing views */ + CompactParquetFiles(); + /* Refresh archive views outside write lock — view creation is fast and safe */ await _duckDb.CreateArchiveViewsAsync(); } @@ -161,6 +168,218 @@ private static async Task ExportToParquet(DuckDBConnection connection, string ta await cmd.ExecuteNonQueryAsync(); } + /* Columns to exclude during compaction — dead weight from legacy archives */ + private static readonly Dictionary CompactionExcludeColumns = new() + { + ["query_store_stats"] = ["query_plan_text"] + }; + + /// + /// Compacts all per-cycle parquet files into monthly files (YYYYMM_tablename.parquet). + /// This keeps the archive directory small (~75 files for 3 months of 25 tables) + /// and dramatically improves DuckDB read_parquet glob performance. + /// + private void CompactParquetFiles() + { + if (!Directory.Exists(_archivePath)) + { + return; + } + + var allFiles = Directory.GetFiles(_archivePath, "*.parquet") + .Select(f => Path.GetFileName(f)) + .ToList(); + + /* Group files by (month, table). Recognized formats: + - YYYYMMDD_HHMM_tablename.parquet (per-cycle) + - YYYYMMDD_tablename.parquet (consolidated daily) + - YYYY-MM_tablename.parquet (legacy monthly) + - all_tablename.parquet (manual consolidation) + - YYYYMM_tablename.parquet (monthly — our target format) */ + var groups = new Dictionary<(string Month, string Table), List>(); + + foreach (var file in allFiles) + { + var name = Path.GetFileNameWithoutExtension(file); + + string? month = null; + string? table = null; + + /* YYYYMMDD_HHMM_tablename */ + var m = Regex.Match(name, @"^(\d{8})_\d{4}_(.+)$"); + if (m.Success) + { + month = m.Groups[1].Value[..6]; /* YYYYMM */ + table = m.Groups[2].Value; + } + + /* YYYYMMDD_tablename (no HHMM) */ + if (month == null) + { + m = Regex.Match(name, @"^(\d{8})_([a-z].+)$"); + if (m.Success) + { + month = m.Groups[1].Value[..6]; + table = m.Groups[2].Value; + } + } + + /* YYYY-MM_tablename (legacy monthly) */ + if (month == null) + { + m = Regex.Match(name, @"^(\d{4})-(\d{2})_(.+)$"); + if (m.Success) + { + month = m.Groups[1].Value + m.Groups[2].Value; + table = m.Groups[3].Value; + } + } + + /* all_tablename (manual consolidation from earlier) */ + if (month == null) + { + m = Regex.Match(name, @"^all_(.+)$"); + if (m.Success) + { + /* Put in the earliest month we can find, or current month */ + month = "orphan"; + table = m.Groups[1].Value; + } + } + + /* YYYYMM_tablename (already monthly — our target format) */ + if (month == null) + { + m = Regex.Match(name, @"^(\d{6})_(.+)$"); + if (m.Success) + { + month = m.Groups[1].Value; + table = m.Groups[2].Value; + } + } + + if (month != null && table != null) + { + var key = (month, table); + if (!groups.ContainsKey(key)) + { + groups[key] = []; + } + groups[key].Add(file); + } + else + { + _logger?.LogWarning("Unrecognized parquet file format: {File}", file); + } + } + + /* Compact each group that has more than one file (or any non-monthly files) */ + using var con = new DuckDBConnection("DataSource=:memory:"); + con.Open(); + + var totalMerged = 0; + var totalRemoved = 0; + + foreach (var ((month, table), files) in groups) + { + /* If there's exactly one file and it's already in monthly format, skip */ + if (files.Count == 1) + { + var name = Path.GetFileNameWithoutExtension(files[0]); + if (Regex.IsMatch(name, @"^\d{6}_")) + { + continue; + } + } + + /* Resolve month for orphan files — use current month */ + var targetMonth = month == "orphan" + ? DateTime.UtcNow.ToString("yyyyMM") + : month; + + var targetFile = $"{targetMonth}_{table}.parquet"; + var targetPath = Path.Combine(_archivePath, targetFile).Replace("\\", "/"); + var tempPath = targetPath + ".tmp"; + + try + { + var sourcePaths = files + .Select(f => Path.Combine(_archivePath, f).Replace("\\", "/")) + .ToList(); + var pathList = string.Join(", ", sourcePaths.Select(p => $"'{p}'")); + + /* Build SELECT with column exclusions for specific tables. + Only exclude columns that actually exist in the source files + (they may have been stripped in a previous compaction). */ + var selectClause = "*"; + if (CompactionExcludeColumns.TryGetValue(table, out var excludeCols)) + { + using var schemaCmd = con.CreateCommand(); + schemaCmd.CommandText = $"SELECT column_name FROM (DESCRIBE SELECT * FROM read_parquet([{pathList}], union_by_name=true))"; + using var reader = schemaCmd.ExecuteReader(); + var existingCols = new HashSet(StringComparer.OrdinalIgnoreCase); + while (reader.Read()) existingCols.Add(reader.GetString(0)); + + var colsToExclude = excludeCols.Where(c => existingCols.Contains(c)).ToArray(); + if (colsToExclude.Length > 0) + { + selectClause = $"* EXCLUDE ({string.Join(", ", colsToExclude)})"; + } + } + + using var cmd = con.CreateCommand(); + cmd.CommandText = $"COPY (SELECT {selectClause} FROM read_parquet([{pathList}], union_by_name=true)) " + + $"TO '{tempPath}' (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 122880)"; + cmd.ExecuteNonQuery(); + + /* Remove originals */ + var removed = 0; + foreach (var f in files) + { + var fullPath = Path.Combine(_archivePath, f); + try + { + File.Delete(fullPath); + removed++; + } + catch (IOException ex) + { + _logger?.LogWarning("Could not delete {File} during compaction: {Message}", f, ex.Message); + } + } + + /* Rename temp to final */ + if (File.Exists(targetPath)) + { + File.Delete(targetPath); + } + File.Move(tempPath, targetPath); + + totalMerged++; + totalRemoved += removed; + + _logger?.LogDebug("Compacted {Count} files into {Target}", files.Count, targetFile); + } + catch (Exception ex) + { + _logger?.LogError(ex, "Failed to compact {Month}/{Table} ({Count} files)", month, table, files.Count); + + /* Clean up temp file on failure */ + if (File.Exists(tempPath)) + { + try { File.Delete(tempPath); } catch { /* best effort */ } + } + } + } + + if (totalMerged > 0) + { + var remaining = Directory.GetFiles(_archivePath, "*.parquet").Length; + _logger?.LogInformation("Parquet compaction complete: merged {Groups} groups, removed {Removed} files, {Remaining} files remaining", + totalMerged, totalRemoved, remaining); + } + } + /// /// Archives ALL data from every table to parquet, then deletes and reinitializes the database. /// Called when the database exceeds the size threshold. Data remains queryable through archive views. @@ -214,6 +433,12 @@ Archive views use glob (*_table.parquet) to pick up all files. */ } } + /* Compact per-cycle files into monthly parquet files before reset. + This runs outside the write lock using an in-memory DuckDB connection + and only touches filesystem files — no contention with collectors. */ + _logger?.LogInformation("Compacting parquet files into monthly archives"); + CompactParquetFiles(); + /* Nuke and reinitialize outside the using-connection scope so all handles are closed */ _logger?.LogInformation("Deleting and reinitializing database"); await _duckDb.ResetDatabaseAsync(); diff --git a/Lite/Services/CollectionBackgroundService.cs b/Lite/Services/CollectionBackgroundService.cs index 4641dd61..6db67e31 100644 --- a/Lite/Services/CollectionBackgroundService.cs +++ b/Lite/Services/CollectionBackgroundService.cs @@ -173,7 +173,7 @@ private void RunRetentionIfDue() try { - _retentionService.CleanupOldArchives(retentionDays: 90); + _retentionService.CleanupOldArchives(retentionMonths: 3); _lastRetentionTime = DateTime.UtcNow; } catch (Exception ex) diff --git a/Lite/Services/RetentionService.cs b/Lite/Services/RetentionService.cs index 743c8346..a6447027 100644 --- a/Lite/Services/RetentionService.cs +++ b/Lite/Services/RetentionService.cs @@ -29,54 +29,67 @@ public RetentionService(string archivePath, ILogger? logger = /// /// Deletes Parquet files older than the specified retention period. - /// Supports two naming formats: + /// Supports naming formats: + /// - Monthly compacted: "202602_wait_stats.parquet" (yyyyMM prefix) /// - Timestamped: "20260221_1328_wait_stats.parquet" (yyyyMMdd prefix) + /// - Consolidated daily: "20260221_wait_stats.parquet" (yyyyMMdd prefix) /// - Legacy monthly: "2026-02_wait_stats.parquet" (yyyy-MM prefix) /// - public void CleanupOldArchives(int retentionDays = 90) + public void CleanupOldArchives(int retentionMonths = 3) { if (!Directory.Exists(_archivePath)) { return; } - var cutoffDate = DateTime.UtcNow.AddDays(-retentionDays); + var cutoffDate = DateTime.UtcNow.AddMonths(-retentionMonths); foreach (var file in Directory.GetFiles(_archivePath, "*.parquet")) { try { var fileName = Path.GetFileNameWithoutExtension(file); + DateTime? fileDate = null; - /* Try timestamped format first: "20260221_1328_wait_stats" -> "20260221" */ - if (fileName.Length >= 8 && + /* Monthly compacted format: "202602_wait_stats" -> "202602" */ + if (fileName.Length >= 6 && + DateTime.TryParseExact( + fileName[..6], + "yyyyMM", + CultureInfo.InvariantCulture, + DateTimeStyles.None, + out var monthDate) && + fileName.Length > 6 && fileName[6] == '_') + { + fileDate = monthDate; + } + /* Timestamped or daily format: "20260221..." -> "20260221" */ + else if (fileName.Length >= 8 && DateTime.TryParseExact( fileName[..8], "yyyyMMdd", CultureInfo.InvariantCulture, DateTimeStyles.None, - out var fileDate)) + out var dayDate)) { - if (fileDate < cutoffDate) - { - File.Delete(file); - _logger?.LogInformation("Deleted expired archive: {File}", file); - } + fileDate = dayDate; } - /* Fall back to legacy monthly format: "2026-02_wait_stats" -> "2026-02" */ + /* Legacy monthly format: "2026-02_wait_stats" -> "2026-02" */ else if (fileName.Length >= 7 && DateTime.TryParseExact( fileName[..7], "yyyy-MM", CultureInfo.InvariantCulture, DateTimeStyles.None, - out var fileMonth)) + out var legacyMonth)) + { + fileDate = legacyMonth; + } + + if (fileDate.HasValue && fileDate.Value < cutoffDate) { - if (fileMonth < cutoffDate) - { - File.Delete(file); - _logger?.LogInformation("Deleted expired archive: {File}", file); - } + File.Delete(file); + _logger?.LogInformation("Deleted expired archive: {File}", file); } } catch (Exception ex)