Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lite/PerformanceMonitorLite.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@

<ItemGroup>
<!-- DuckDB for local data storage -->
<PackageReference Include="DuckDB.NET.Data" Version="1.4.4" />
<PackageReference Include="DuckDB.NET.Bindings.Full" Version="1.4.4" />
<PackageReference Include="DuckDB.NET.Data" Version="1.5.0" />
<PackageReference Include="DuckDB.NET.Bindings.Full" Version="1.5.0" />

<!-- SQL Server connectivity -->
<PackageReference Include="Microsoft.Data.SqlClient" Version="6.1.4" />
Expand Down
225 changes: 225 additions & 0 deletions Lite/Services/ArchiveService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
*/

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using DuckDB.NET.Data;
Expand Down Expand Up @@ -134,6 +138,9 @@
}
}

/* Compact per-cycle files into monthly parquet before refreshing views */
CompactParquetFiles();

/* Refresh archive views outside write lock — view creation is fast and safe */
await _duckDb.CreateArchiveViewsAsync();
}
Expand Down Expand Up @@ -161,6 +168,218 @@
await cmd.ExecuteNonQueryAsync();
}

/* Columns to exclude during compaction — dead weight from legacy archives */
private static readonly Dictionary<string, string[]> CompactionExcludeColumns = new()
{
["query_store_stats"] = ["query_plan_text"]
};

/// <summary>
/// Compacts all per-cycle parquet files into monthly files (YYYYMM_tablename.parquet).
/// This keeps the archive directory small (~75 files for 3 months of 25 tables)
/// and dramatically improves DuckDB read_parquet glob performance.
/// </summary>
private void CompactParquetFiles()
{
if (!Directory.Exists(_archivePath))
{
return;
}

var allFiles = Directory.GetFiles(_archivePath, "*.parquet")
.Select(f => Path.GetFileName(f))
.ToList();

/* Group files by (month, table). Recognized formats:
- YYYYMMDD_HHMM_tablename.parquet (per-cycle)
- YYYYMMDD_tablename.parquet (consolidated daily)
- YYYY-MM_tablename.parquet (legacy monthly)
- all_tablename.parquet (manual consolidation)
- YYYYMM_tablename.parquet (monthly — our target format) */
var groups = new Dictionary<(string Month, string Table), List<string>>();

foreach (var file in allFiles)
{
var name = Path.GetFileNameWithoutExtension(file);

string? month = null;
string? table = null;

/* YYYYMMDD_HHMM_tablename */
var m = Regex.Match(name, @"^(\d{8})_\d{4}_(.+)$");
if (m.Success)
{
month = m.Groups[1].Value[..6]; /* YYYYMM */
table = m.Groups[2].Value;
}

/* YYYYMMDD_tablename (no HHMM) */
if (month == null)
{
m = Regex.Match(name, @"^(\d{8})_([a-z].+)$");
if (m.Success)
{
month = m.Groups[1].Value[..6];
table = m.Groups[2].Value;
}
}

/* YYYY-MM_tablename (legacy monthly) */
if (month == null)
{
m = Regex.Match(name, @"^(\d{4})-(\d{2})_(.+)$");
if (m.Success)
{
month = m.Groups[1].Value + m.Groups[2].Value;
table = m.Groups[3].Value;
}
}

/* all_tablename (manual consolidation from earlier) */
if (month == null)
{
m = Regex.Match(name, @"^all_(.+)$");
if (m.Success)
{
/* Put in the earliest month we can find, or current month */
month = "orphan";
table = m.Groups[1].Value;
}
}

/* YYYYMM_tablename (already monthly — our target format) */
if (month == null)
{
m = Regex.Match(name, @"^(\d{6})_(.+)$");
if (m.Success)
{
month = m.Groups[1].Value;
table = m.Groups[2].Value;
}
}

if (month != null && table != null)
{
var key = (month, table);
if (!groups.ContainsKey(key))

Check warning on line 264 in Lite/Services/ArchiveService.cs

View workflow job for this annotation

GitHub Actions / build

Prefer a 'TryGetValue' call over a Dictionary indexer access guarded by a 'ContainsKey' check to avoid double lookup (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1854)
{
groups[key] = [];
}
groups[key].Add(file);
}
else
{
_logger?.LogWarning("Unrecognized parquet file format: {File}", file);
}
}

/* Compact each group that has more than one file (or any non-monthly files) */
using var con = new DuckDBConnection("DataSource=:memory:");
con.Open();

var totalMerged = 0;
var totalRemoved = 0;

foreach (var ((month, table), files) in groups)
{
/* If there's exactly one file and it's already in monthly format, skip */
if (files.Count == 1)
{
var name = Path.GetFileNameWithoutExtension(files[0]);
if (Regex.IsMatch(name, @"^\d{6}_"))
{
continue;
}
}

/* Resolve month for orphan files — use current month */
var targetMonth = month == "orphan"
? DateTime.UtcNow.ToString("yyyyMM")
: month;

var targetFile = $"{targetMonth}_{table}.parquet";
var targetPath = Path.Combine(_archivePath, targetFile).Replace("\\", "/");
var tempPath = targetPath + ".tmp";

try
{
var sourcePaths = files
.Select(f => Path.Combine(_archivePath, f).Replace("\\", "/"))
.ToList();
var pathList = string.Join(", ", sourcePaths.Select(p => $"'{p}'"));

/* Build SELECT with column exclusions for specific tables.
Only exclude columns that actually exist in the source files
(they may have been stripped in a previous compaction). */
var selectClause = "*";
if (CompactionExcludeColumns.TryGetValue(table, out var excludeCols))
{
using var schemaCmd = con.CreateCommand();
schemaCmd.CommandText = $"SELECT column_name FROM (DESCRIBE SELECT * FROM read_parquet([{pathList}], union_by_name=true))";
using var reader = schemaCmd.ExecuteReader();
var existingCols = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
while (reader.Read()) existingCols.Add(reader.GetString(0));

var colsToExclude = excludeCols.Where(c => existingCols.Contains(c)).ToArray();
if (colsToExclude.Length > 0)
{
selectClause = $"* EXCLUDE ({string.Join(", ", colsToExclude)})";
}
}

using var cmd = con.CreateCommand();
cmd.CommandText = $"COPY (SELECT {selectClause} FROM read_parquet([{pathList}], union_by_name=true)) " +
$"TO '{tempPath}' (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 122880)";
cmd.ExecuteNonQuery();

/* Remove originals */
var removed = 0;
foreach (var f in files)
{
var fullPath = Path.Combine(_archivePath, f);
try
{
File.Delete(fullPath);
removed++;
}
catch (IOException ex)
{
_logger?.LogWarning("Could not delete {File} during compaction: {Message}", f, ex.Message);
}
}

/* Rename temp to final */
if (File.Exists(targetPath))
{
File.Delete(targetPath);
}
File.Move(tempPath, targetPath);

totalMerged++;
totalRemoved += removed;

_logger?.LogDebug("Compacted {Count} files into {Target}", files.Count, targetFile);
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to compact {Month}/{Table} ({Count} files)", month, table, files.Count);

/* Clean up temp file on failure */
if (File.Exists(tempPath))
{
try { File.Delete(tempPath); } catch { /* best effort */ }
}
}
}

if (totalMerged > 0)
{
var remaining = Directory.GetFiles(_archivePath, "*.parquet").Length;
_logger?.LogInformation("Parquet compaction complete: merged {Groups} groups, removed {Removed} files, {Remaining} files remaining",
totalMerged, totalRemoved, remaining);
}
}

/// <summary>
/// Archives ALL data from every table to parquet, then deletes and reinitializes the database.
/// Called when the database exceeds the size threshold. Data remains queryable through archive views.
Expand Down Expand Up @@ -214,6 +433,12 @@
}
}

/* Compact per-cycle files into monthly parquet files before reset.
This runs outside the write lock using an in-memory DuckDB connection
and only touches filesystem files — no contention with collectors. */
_logger?.LogInformation("Compacting parquet files into monthly archives");
CompactParquetFiles();

/* Nuke and reinitialize outside the using-connection scope so all handles are closed */
_logger?.LogInformation("Deleting and reinitializing database");
await _duckDb.ResetDatabaseAsync();
Expand Down
2 changes: 1 addition & 1 deletion Lite/Services/CollectionBackgroundService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ private void RunRetentionIfDue()

try
{
_retentionService.CleanupOldArchives(retentionDays: 90);
_retentionService.CleanupOldArchives(retentionMonths: 3);
_lastRetentionTime = DateTime.UtcNow;
}
catch (Exception ex)
Expand Down
49 changes: 31 additions & 18 deletions Lite/Services/RetentionService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,54 +29,67 @@ public RetentionService(string archivePath, ILogger<RetentionService>? logger =

/// <summary>
/// Deletes Parquet files older than the specified retention period.
/// Supports two naming formats:
/// Supports naming formats:
/// - Monthly compacted: "202602_wait_stats.parquet" (yyyyMM prefix)
/// - Timestamped: "20260221_1328_wait_stats.parquet" (yyyyMMdd prefix)
/// - Consolidated daily: "20260221_wait_stats.parquet" (yyyyMMdd prefix)
/// - Legacy monthly: "2026-02_wait_stats.parquet" (yyyy-MM prefix)
/// </summary>
public void CleanupOldArchives(int retentionDays = 90)
public void CleanupOldArchives(int retentionMonths = 3)
{
if (!Directory.Exists(_archivePath))
{
return;
}

var cutoffDate = DateTime.UtcNow.AddDays(-retentionDays);
var cutoffDate = DateTime.UtcNow.AddMonths(-retentionMonths);

foreach (var file in Directory.GetFiles(_archivePath, "*.parquet"))
{
try
{
var fileName = Path.GetFileNameWithoutExtension(file);
DateTime? fileDate = null;

/* Try timestamped format first: "20260221_1328_wait_stats" -> "20260221" */
if (fileName.Length >= 8 &&
/* Monthly compacted format: "202602_wait_stats" -> "202602" */
if (fileName.Length >= 6 &&
DateTime.TryParseExact(
fileName[..6],
"yyyyMM",
CultureInfo.InvariantCulture,
DateTimeStyles.None,
out var monthDate) &&
fileName.Length > 6 && fileName[6] == '_')
{
fileDate = monthDate;
}
/* Timestamped or daily format: "20260221..." -> "20260221" */
else if (fileName.Length >= 8 &&
DateTime.TryParseExact(
fileName[..8],
"yyyyMMdd",
CultureInfo.InvariantCulture,
DateTimeStyles.None,
out var fileDate))
out var dayDate))
{
if (fileDate < cutoffDate)
{
File.Delete(file);
_logger?.LogInformation("Deleted expired archive: {File}", file);
}
fileDate = dayDate;
}
/* Fall back to legacy monthly format: "2026-02_wait_stats" -> "2026-02" */
/* Legacy monthly format: "2026-02_wait_stats" -> "2026-02" */
else if (fileName.Length >= 7 &&
DateTime.TryParseExact(
fileName[..7],
"yyyy-MM",
CultureInfo.InvariantCulture,
DateTimeStyles.None,
out var fileMonth))
out var legacyMonth))
{
fileDate = legacyMonth;
}

if (fileDate.HasValue && fileDate.Value < cutoffDate)
{
if (fileMonth < cutoffDate)
{
File.Delete(file);
_logger?.LogInformation("Deleted expired archive: {File}", file);
}
File.Delete(file);
_logger?.LogInformation("Deleted expired archive: {File}", file);
}
}
catch (Exception ex)
Expand Down
Loading