diff --git a/Directory.Packages.props b/Directory.Packages.props
index 566dca579..6d43beffa 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -23,6 +23,7 @@
+
diff --git a/src/api/Elastic.Documentation.Api.Core/TelemetryConstants.cs b/src/api/Elastic.Documentation.Api.Core/TelemetryConstants.cs
index 6a8d2683c..e6d8d4bd8 100644
--- a/src/api/Elastic.Documentation.Api.Core/TelemetryConstants.cs
+++ b/src/api/Elastic.Documentation.Api.Core/TelemetryConstants.cs
@@ -31,4 +31,10 @@ public static class TelemetryConstants
/// Used to trace frontend telemetry proxying.
///
public const string OtlpProxySourceName = "Elastic.Documentation.Api.OtlpProxy";
+
+ ///
+ /// ActivitySource name for distributed cache operations.
+ /// Used to trace cache hits, misses, and performance.
+ ///
+ public const string CacheSourceName = "Elastic.Documentation.Api.Cache";
}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Aws/IParameterProvider.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Aws/IParameterProvider.cs
index c35dac454..b911b856d 100644
--- a/src/api/Elastic.Documentation.Api.Infrastructure/Aws/IParameterProvider.cs
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Aws/IParameterProvider.cs
@@ -4,6 +4,10 @@
namespace Elastic.Documentation.Api.Infrastructure.Aws;
+///
+/// Abstraction for retrieving configuration parameters.
+/// Infrastructure concern: Used by other Infrastructure adapters to get configuration.
+///
public interface IParameterProvider
{
Task GetParam(string name, bool withDecryption = true, Cancel ctx = default);
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Caching/CacheKey.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/CacheKey.cs
new file mode 100644
index 000000000..67e565463
--- /dev/null
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/CacheKey.cs
@@ -0,0 +1,45 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Security.Cryptography;
+using System.Text;
+
+namespace Elastic.Documentation.Api.Infrastructure.Caching;
+
+///
+/// Represents a cache key with automatic hashing of sensitive identifiers.
+/// Prevents exposing sensitive data in cache keys (CodeQL security requirement).
+///
+public sealed class CacheKey
+{
+ ///
+ /// Gets the hashed key string for use in cache operations.
+ ///
+ public string Value { get; }
+
+ private CacheKey(string category, string identifier)
+ {
+ // Hash the identifier to prevent exposing sensitive data (CodeQL security requirement)
+ var bytes = Encoding.UTF8.GetBytes(identifier);
+ var hash = SHA256.HashData(bytes);
+ var hashBase64 = Convert.ToBase64String(hash);
+ // Use base64url encoding for cache key (URL-safe)
+ var hashBase64Url = hashBase64.Replace('+', '-').Replace('/', '_').TrimEnd('=');
+ Value = $"{category}:{hashBase64Url}";
+ }
+
+ ///
+ /// Creates a cache key from a category and identifier.
+ /// The identifier is automatically hashed to prevent exposing sensitive data.
+ ///
+ /// Cache category (e.g., "idtoken", "search")
+ /// Identifier that may contain sensitive data (will be hashed)
+ /// A CacheKey instance with the hashed key
+ public static CacheKey Create(string category, string identifier) => new(category, identifier);
+
+ ///
+ /// Implicit conversion to string for convenience.
+ ///
+ public static implicit operator string(CacheKey key) => key.Value;
+}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Caching/DynamoDbDistributedCache.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/DynamoDbDistributedCache.cs
new file mode 100644
index 000000000..3b6f70791
--- /dev/null
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/DynamoDbDistributedCache.cs
@@ -0,0 +1,153 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Diagnostics;
+using System.Globalization;
+using Amazon.DynamoDBv2;
+using Amazon.DynamoDBv2.Model;
+using Elastic.Documentation.Api.Core;
+using Microsoft.Extensions.Logging;
+
+namespace Elastic.Documentation.Api.Infrastructure.Caching;
+
+///
+/// DynamoDB implementation of for Lambda environments.
+/// Provides distributed caching across all Lambda containers using DynamoDB as backing store.
+/// Clean Code: Constructor injection (Dependency Inversion), small focused methods.
+///
+public sealed class DynamoDbDistributedCache(IAmazonDynamoDB dynamoDb, string tableName, ILogger logger) : IDistributedCache
+{
+ private static readonly ActivitySource ActivitySource = new(TelemetryConstants.CacheSourceName);
+
+ private readonly IAmazonDynamoDB _dynamoDb = dynamoDb;
+ private readonly string _tableName = tableName;
+ private readonly ILogger _logger = logger;
+
+ // DynamoDB attribute names
+ private const string AttributeCacheKey = "CacheKey";
+ private const string AttributeValue = "Value";
+ private const string AttributeTtl = "TTL";
+
+ public async Task GetAsync(CacheKey key, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ using var activity = ActivitySource.StartActivity("get cache", ActivityKind.Client);
+ _ = (activity?.SetTag("cache.key", hashedKey));
+ _ = (activity?.SetTag("cache.table", _tableName));
+ _ = (activity?.SetTag("cache.backend", "dynamodb"));
+
+ try
+ {
+ var response = await _dynamoDb.GetItemAsync(new GetItemRequest
+ {
+ TableName = _tableName,
+ Key = new Dictionary
+ {
+ [AttributeCacheKey] = new AttributeValue { S = hashedKey }
+ }
+ }, ct);
+
+ if (!response.IsItemSet)
+ {
+ _ = (activity?.SetTag("cache.hit", false));
+ _logger.LogDebug("Cache miss for key: {CacheKey}", hashedKey);
+ return null;
+ }
+
+ // DynamoDB TTL handles expiration automatically
+ // Items may still be returned briefly after expiration until DynamoDB deletes them
+ var value = response.Item.TryGetValue(AttributeValue, out var valueAttr)
+ ? valueAttr.S
+ : null;
+
+ _ = (activity?.SetTag("cache.hit", value != null));
+ if (value != null)
+ {
+ _logger.LogDebug("Cache hit for key: {CacheKey}", hashedKey);
+ }
+
+ return value;
+ }
+ catch (ResourceNotFoundException ex)
+ {
+ // Table doesn't exist - return null gracefully
+ // Infrastructure should create table, but don't fail hard in dev
+ _ = (activity?.SetTag("cache.error", "table_not_found"));
+ _logger.LogWarning(ex, "DynamoDB table {TableName} not found. Cache operations will fail gracefully.", _tableName);
+ return null;
+ }
+ catch (ProvisionedThroughputExceededException ex)
+ {
+ _ = (activity?.SetTag("cache.error", "provisioned_throughput_exceeded"));
+ _logger.LogWarning(ex, "Provisioned throughput exceeded for DynamoDB cache table {TableName}.", _tableName);
+ return null;
+ }
+ catch (InternalServerErrorException ex)
+ {
+ _ = (activity?.SetTag("cache.error", "internal_server_error"));
+ _logger.LogError(ex, "Internal server error retrieving cache key {CacheKey} from DynamoDB", hashedKey);
+ return null;
+ }
+ catch (Exception ex) when (ex is not OperationCanceledException and not TaskCanceledException)
+ {
+ _ = (activity?.SetStatus(ActivityStatusCode.Error, ex.Message));
+ _logger.LogError(ex, "Error retrieving cache key {CacheKey} from DynamoDB", hashedKey);
+ return null; // Fail gracefully
+ }
+ // Allow cancellation exceptions to propagate to respect request lifetimes
+ }
+
+ public async Task SetAsync(CacheKey key, string value, TimeSpan ttl, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ using var activity = ActivitySource.StartActivity("set cache", ActivityKind.Client);
+ _ = (activity?.SetTag("cache.key", hashedKey));
+ _ = (activity?.SetTag("cache.table", _tableName));
+ _ = (activity?.SetTag("cache.backend", "dynamodb"));
+ _ = (activity?.SetTag("cache.ttl", ttl.TotalSeconds));
+
+ try
+ {
+ var expiresAt = DateTimeOffset.UtcNow.Add(ttl);
+ var ttlTimestamp = expiresAt.ToUnixTimeSeconds();
+
+ _ = await _dynamoDb.PutItemAsync(new PutItemRequest
+ {
+ TableName = _tableName,
+ Item = new Dictionary
+ {
+ [AttributeCacheKey] = new AttributeValue { S = hashedKey },
+ [AttributeValue] = new AttributeValue { S = value },
+ [AttributeTtl] = new AttributeValue { N = ttlTimestamp.ToString(CultureInfo.InvariantCulture) }
+ }
+ }, ct);
+
+ _logger.LogDebug("Cache set for key: {CacheKey}, TTL: {TTL}s", hashedKey, ttl.TotalSeconds);
+ }
+ catch (ResourceNotFoundException ex)
+ {
+ // Table doesn't exist - fail silently in dev, log in production
+ // Infrastructure should create table before deployment
+ _ = (activity?.SetTag("cache.error", "table_not_found"));
+ _logger.LogWarning(ex, "DynamoDB table {TableName} not found. Unable to cache key {CacheKey}.", _tableName, hashedKey);
+ }
+ catch (ProvisionedThroughputExceededException ex)
+ {
+ _ = (activity?.SetTag("cache.error", "provisioned_throughput_exceeded"));
+ _logger.LogWarning(ex, "Provisioned throughput exceeded for DynamoDB cache table {TableName}. Unable to cache key {CacheKey}.", _tableName, hashedKey);
+ }
+ catch (InternalServerErrorException ex)
+ {
+ _ = (activity?.SetTag("cache.error", "internal_server_error"));
+ _logger.LogError(ex, "Internal server error setting cache key {CacheKey} in DynamoDB", hashedKey);
+ }
+ catch (Exception ex) when (ex is not OperationCanceledException and not TaskCanceledException)
+ {
+ // Allow cancellation exceptions to propagate to respect request lifetimes
+ _ = activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
+ _logger.LogError(ex, "Error setting cache key {CacheKey} in DynamoDB", hashedKey);
+ // Fail gracefully - don't throw
+ }
+ }
+}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Caching/IDistributedCache.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/IDistributedCache.cs
new file mode 100644
index 000000000..de875cb36
--- /dev/null
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/IDistributedCache.cs
@@ -0,0 +1,38 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+namespace Elastic.Documentation.Api.Infrastructure.Caching;
+
+///
+/// Abstraction for distributed caching across Lambda invocations.
+/// Infrastructure concern: Used by other Infrastructure adapters for caching.
+///
+///
+///
+/// Cache keys should be created using to automatically hash
+/// sensitive identifiers and prevent exposing sensitive data in cache keys (CodeQL security requirement).
+///
+///
+/// Key format: {category}:{hashed-identifier} (e.g., "idtoken:{hash}" where hash is SHA256 of the identifier)
+///
+///
+public interface IDistributedCache
+{
+ ///
+ /// Retrieves a cached value by key.
+ ///
+ /// Cache key created using (format: {category}:{hashed-identifier})
+ /// Cancellation token
+ /// Cached value as string, or null if not found or expired
+ Task GetAsync(CacheKey key, Cancel ct = default);
+
+ ///
+ /// Stores a value in the cache with a time-to-live.
+ ///
+ /// Cache key created using (format: {category}:{hashed-identifier})
+ /// Value to cache (typically JSON-serialized data)
+ /// Time-to-live duration
+ /// Cancellation token
+ Task SetAsync(CacheKey key, string value, TimeSpan ttl, Cancel ct = default);
+}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Caching/InMemoryDistributedCache.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/InMemoryDistributedCache.cs
new file mode 100644
index 000000000..9733ff574
--- /dev/null
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/InMemoryDistributedCache.cs
@@ -0,0 +1,57 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Collections.Concurrent;
+
+namespace Elastic.Documentation.Api.Infrastructure.Caching;
+
+///
+/// In-memory implementation of for local development.
+/// Uses ConcurrentDictionary for thread-safe storage with TTL-based expiration.
+/// Clean Code: Sealed class (not meant for inheritance), single responsibility.
+///
+public sealed class InMemoryDistributedCache : IDistributedCache
+{
+ private readonly ConcurrentDictionary _cache = new();
+
+ ///
+ /// Immutable cache entry with value and expiration timestamp.
+ /// Clean Code: Record type ensures immutability.
+ ///
+ private sealed record CacheEntry(string Value, DateTimeOffset ExpiresAt);
+
+ public Task GetAsync(CacheKey key, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ if (_cache.TryGetValue(hashedKey, out var entry))
+ {
+ if (IsExpired(entry))
+ {
+ // Remove expired entry
+ _ = _cache.TryRemove(hashedKey, out _);
+ return Task.FromResult(null);
+ }
+
+ return Task.FromResult(entry.Value);
+ }
+
+ return Task.FromResult(null);
+ }
+
+ public Task SetAsync(CacheKey key, string value, TimeSpan ttl, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ var expiresAt = DateTimeOffset.UtcNow.Add(ttl);
+ var entry = new CacheEntry(value, expiresAt);
+ _ = _cache.AddOrUpdate(hashedKey, entry, (_, _) => entry);
+ return Task.CompletedTask;
+ }
+
+ ///
+ /// Checks if a cache entry has expired.
+ /// Clean Code: Single-purpose helper method with intention-revealing name.
+ ///
+ private static bool IsExpired(CacheEntry entry) =>
+ entry.ExpiresAt <= DateTimeOffset.UtcNow;
+}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Caching/MultiLayerCache.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/MultiLayerCache.cs
new file mode 100644
index 000000000..273dd1edb
--- /dev/null
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Caching/MultiLayerCache.cs
@@ -0,0 +1,131 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Collections.Concurrent;
+using System.Diagnostics;
+using Elastic.Documentation.Api.Core;
+using Microsoft.Extensions.Logging;
+
+namespace Elastic.Documentation.Api.Infrastructure.Caching;
+
+///
+/// Multi-layer cache decorator implementing L1 (in-memory) + L2 (distributed) caching strategy.
+/// Optimizes Lambda warm-start performance while maintaining cross-container cache sharing.
+/// Design Pattern: Decorator - Adds L1 layer to existing cache implementation.
+/// Caching Strategy: Cache-aside (read-through) and write-through patterns.
+///
+public sealed class MultiLayerCache(IDistributedCache l2Cache, ILogger logger) : IDistributedCache
+{
+ private static readonly ActivitySource ActivitySource = new(TelemetryConstants.CacheSourceName);
+
+ // L1 Cache: Static in-memory cache survives across Lambda warm starts
+ // Thread-safe for concurrent requests within same Lambda container
+ private static readonly ConcurrentDictionary L1Cache = new();
+
+ // L2 Cache: DynamoDB for cross-container sharing
+ private readonly IDistributedCache _l2Cache = l2Cache;
+ private readonly ILogger _logger = logger;
+
+ ///
+ /// Immutable L1 cache entry with value and expiration timestamp.
+ /// Clean Code: Record type ensures immutability, intention-revealing name.
+ ///
+ private sealed record L1CacheEntry(string Value, DateTimeOffset ExpiresAt);
+
+ public async Task GetAsync(CacheKey key, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ using var activity = ActivitySource.StartActivity("get cache", ActivityKind.Internal);
+ _ = (activity?.SetTag("cache.key", hashedKey));
+ _ = (activity?.SetTag("cache.backend", "multilayer"));
+
+ // L1: Check in-memory cache first (fastest)
+ if (TryGetFromL1(hashedKey, out var value))
+ {
+ _ = (activity?.SetTag("cache.l1.hit", true));
+ _ = (activity?.SetTag("cache.l2.hit", false));
+ _logger.LogDebug("L1 cache hit for key: {CacheKey}", hashedKey);
+ return value;
+ }
+
+ _ = (activity?.SetTag("cache.l1.hit", false));
+
+ // L2: Check distributed cache (DynamoDB)
+ var l2Value = await _l2Cache.GetAsync(key, ct);
+
+ if (l2Value != null)
+ {
+ _ = (activity?.SetTag("cache.l2.hit", true));
+ _logger.LogDebug("L2 cache hit for key: {CacheKey}, populating L1", hashedKey);
+
+ // Populate L1 cache for future requests in this container
+ // Use a reasonable TTL for L1 (1 hour) to match ID token lifetime
+ PopulateL1(hashedKey, l2Value, TimeSpan.FromHours(1));
+ }
+ else
+ {
+ _ = (activity?.SetTag("cache.l2.hit", false));
+ _logger.LogDebug("Cache miss (L1 and L2) for key: {CacheKey}", hashedKey);
+ }
+
+ return l2Value;
+ }
+
+ public async Task SetAsync(CacheKey key, string value, TimeSpan ttl, Cancel ct = default)
+ {
+ var hashedKey = key.Value;
+ using var activity = ActivitySource.StartActivity("set cache", ActivityKind.Internal);
+ _ = (activity?.SetTag("cache.key", hashedKey));
+ _ = (activity?.SetTag("cache.backend", "multilayer"));
+ _ = (activity?.SetTag("cache.ttl", ttl.TotalSeconds));
+
+ // Write-through: Update both L1 and L2
+ PopulateL1(hashedKey, value, ttl);
+ _logger.LogDebug("Writing to L1 and L2 cache for key: {CacheKey}, TTL: {TTL}s", hashedKey, ttl.TotalSeconds);
+
+ await _l2Cache.SetAsync(key, value, ttl, ct);
+ }
+
+ ///
+ /// Attempts to retrieve value from L1 cache.
+ /// Clean Code: Single Responsibility - only handles L1 retrieval logic.
+ ///
+ private static bool TryGetFromL1(string key, out string? value)
+ {
+ if (L1Cache.TryGetValue(key, out var entry))
+ {
+ if (IsExpired(entry))
+ {
+ // Remove expired entry from L1
+ _ = L1Cache.TryRemove(key, out _);
+ value = null;
+ return false;
+ }
+
+ value = entry.Value;
+ return true;
+ }
+
+ value = null;
+ return false;
+ }
+
+ ///
+ /// Populates L1 cache with value and expiration.
+ /// Clean Code: Single Responsibility - only handles L1 population logic.
+ ///
+ private static void PopulateL1(string key, string value, TimeSpan ttl)
+ {
+ var expiresAt = DateTimeOffset.UtcNow.Add(ttl);
+ var entry = new L1CacheEntry(value, expiresAt);
+ _ = L1Cache.AddOrUpdate(key, entry, (_, _) => entry);
+ }
+
+ ///
+ /// Checks if L1 cache entry has expired.
+ /// Clean Code: Single-purpose helper with intention-revealing name.
+ ///
+ private static bool IsExpired(L1CacheEntry entry) =>
+ entry.ExpiresAt <= DateTimeOffset.UtcNow;
+}
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Elastic.Documentation.Api.Infrastructure.csproj b/src/api/Elastic.Documentation.Api.Infrastructure/Elastic.Documentation.Api.Infrastructure.csproj
index 7724ea935..58e3c4880 100644
--- a/src/api/Elastic.Documentation.Api.Infrastructure/Elastic.Documentation.Api.Infrastructure.csproj
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Elastic.Documentation.Api.Infrastructure.csproj
@@ -17,6 +17,7 @@
+
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Gcp/GcpIdTokenProvider.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Gcp/GcpIdTokenProvider.cs
index e6df30897..bb731d8f8 100644
--- a/src/api/Elastic.Documentation.Api.Infrastructure/Gcp/GcpIdTokenProvider.cs
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/Gcp/GcpIdTokenProvider.cs
@@ -2,29 +2,36 @@
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information
-using System.Collections.Concurrent;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
+using Elastic.Documentation.Api.Infrastructure.Caching;
namespace Elastic.Documentation.Api.Infrastructure.Gcp;
// This is a custom implementation to create an ID token for GCP.
// Because Google.Api.Auth.OAuth2 is not compatible with AOT
-public class GcpIdTokenProvider(HttpClient httpClient) : IGcpIdTokenProvider
+// Clean Architecture: Depends on IDistributedCache abstraction from Infrastructure layer
+public class GcpIdTokenProvider(IHttpClientFactory httpClientFactory, IDistributedCache cache) : IGcpIdTokenProvider
{
- // Cache tokens by target audience to avoid regenerating them on every request
- private static readonly ConcurrentDictionary TokenCache = new();
-
- private sealed record CachedToken(string Token, DateTimeOffset ExpiresAt);
+ private readonly IHttpClientFactory _httpClientFactory = httpClientFactory;
+ private readonly IDistributedCache _cache = cache;
public async Task GenerateIdTokenAsync(string serviceAccount, string targetAudience, Cancel cancellationToken = default)
{
- // Check if we have a valid cached token
- if (TokenCache.TryGetValue(targetAudience, out var cachedToken) &&
- cachedToken.ExpiresAt > DateTimeOffset.UtcNow.AddMinutes(1)) // Refresh 1 minute before expiry
- return cachedToken.Token;
+ // Check distributed cache first (works across all Lambda containers)
+ // CacheKey automatically hashes the identifier to prevent exposing sensitive data
+ // DynamoDB ExpiresAt attribute handles expiration checking
+ var cacheKey = CacheKey.Create("idtoken", targetAudience);
+ var cachedToken = await _cache.GetAsync(cacheKey, cancellationToken);
+
+ if (cachedToken != null)
+ {
+ // Cache implementation (DynamoDbDistributedCache) already checked ExpiresAt
+ // If we get here, the token is still valid
+ return cachedToken;
+ }
// Read and parse service account key file using System.Text.Json source generation (AOT compatible)
var serviceAccountJson = JsonSerializer.Deserialize(serviceAccount, GcpJsonContext.Default.ServiceAccountKey);
@@ -72,10 +79,12 @@ public async Task GenerateIdTokenAsync(string serviceAccount, string tar
// Exchange JWT for ID token
var idToken = await ExchangeJwtForIdToken(jwt, targetAudience, cancellationToken);
- var expiresAt = expirationTime.Subtract(TimeSpan.FromMinutes(1));
- _ = TokenCache.AddOrUpdate(targetAudience,
- new CachedToken(idToken, expiresAt),
- (_, _) => new CachedToken(idToken, expiresAt));
+ // Cache the token in distributed cache (shared across all Lambda containers)
+ // Use 15-minute buffer for maximum safety against clock skew and edge cases
+ // DynamoDB will use ExpiresAt attribute for expiration checking and TTL for cleanup
+ var cacheExpiry = expirationTime.Subtract(TimeSpan.FromMinutes(15));
+ var cacheTtl = cacheExpiry - now;
+ await _cache.SetAsync(cacheKey, idToken, cacheTtl, cancellationToken);
return idToken;
}
@@ -89,6 +98,7 @@ private async Task ExchangeJwtForIdToken(string jwt, string targetAudien
new KeyValuePair("target_audience", targetAudience)
]);
+ var httpClient = _httpClientFactory.CreateClient();
var response = await httpClient.PostAsync("https://oauth2.googleapis.com/token", requestContent, cancellationToken);
_ = response.EnsureSuccessStatusCode();
@@ -107,6 +117,7 @@ private static string Base64UrlEncode(byte[] input)
// Convert base64 to base64url encoding
return base64.Replace('+', '-').Replace('/', '_').TrimEnd('=');
}
+
}
internal readonly record struct ServiceAccountKey(
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/OpenTelemetry/OpenTelemetryExtensions.cs b/src/api/Elastic.Documentation.Api.Infrastructure/OpenTelemetry/OpenTelemetryExtensions.cs
index d6572a751..26af4ee00 100644
--- a/src/api/Elastic.Documentation.Api.Infrastructure/OpenTelemetry/OpenTelemetryExtensions.cs
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/OpenTelemetry/OpenTelemetryExtensions.cs
@@ -36,6 +36,7 @@ public static TracerProviderBuilder AddDocsApiTracing(this TracerProviderBuilder
.AddSource(TelemetryConstants.AskAiSourceName)
.AddSource(TelemetryConstants.StreamTransformerSourceName)
.AddSource(TelemetryConstants.OtlpProxySourceName)
+ .AddSource(TelemetryConstants.CacheSourceName)
.AddAspNetCoreInstrumentation(aspNetCoreOptions =>
{
// Don't trace root API endpoint (health check)
diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/ServicesExtension.cs b/src/api/Elastic.Documentation.Api.Infrastructure/ServicesExtension.cs
index dea5311e0..88ebaba3d 100644
--- a/src/api/Elastic.Documentation.Api.Infrastructure/ServicesExtension.cs
+++ b/src/api/Elastic.Documentation.Api.Infrastructure/ServicesExtension.cs
@@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information
using System.ComponentModel.DataAnnotations;
+using Amazon.DynamoDBv2;
using Elastic.Documentation.Api.Core;
using Elastic.Documentation.Api.Core.AskAi;
using Elastic.Documentation.Api.Core.Search;
@@ -11,6 +12,7 @@
using Elastic.Documentation.Api.Infrastructure.Adapters.Search;
using Elastic.Documentation.Api.Infrastructure.Adapters.Telemetry;
using Elastic.Documentation.Api.Infrastructure.Aws;
+using Elastic.Documentation.Api.Infrastructure.Caching;
using Elastic.Documentation.Api.Infrastructure.Gcp;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
@@ -71,6 +73,7 @@ private static void AddElasticDocsApiUsecases(this IServiceCollection services,
});
// Register AppEnvironment as a singleton for dependency injection
_ = services.AddSingleton(new AppEnvironment { Current = appEnv });
+ AddDistributedCache(services, appEnv);
AddParameterProvider(services, appEnv);
AddAskAiUsecase(services, appEnv);
AddSearchUsecase(services, appEnv);
@@ -121,6 +124,58 @@ private static void AddParameterProvider(IServiceCollection services, AppEnv app
}
}
+ private static void AddDistributedCache(IServiceCollection services, AppEnv appEnv)
+ {
+ var logger = GetLogger(services);
+
+ switch (appEnv)
+ {
+ case AppEnv.Dev:
+ {
+ logger?.LogInformation("Configuring InMemoryDistributedCache for environment {AppEnvironment}", appEnv);
+ _ = services.AddSingleton();
+ logger?.LogInformation("InMemoryDistributedCache registered for local development");
+ break;
+ }
+ case AppEnv.Prod:
+ case AppEnv.Staging:
+ case AppEnv.Edge:
+ {
+ logger?.LogInformation("Configuring DynamoDB distributed cache for environment {AppEnvironment}", appEnv);
+ try
+ {
+ // Register AWS DynamoDB client
+ _ = services.AddSingleton();
+ logger?.LogInformation("AmazonDynamoDB client registered");
+
+ // Register multi-layer cache (L1: in-memory + L2: DynamoDB)
+ _ = services.AddSingleton(sp =>
+ {
+ var dynamoDb = sp.GetRequiredService();
+ var tableName = $"docs-api-cache-{appEnv.ToStringFast()}";
+ var dynamoLogger = sp.GetRequiredService>();
+ var multiLogger = sp.GetRequiredService>();
+
+ var dynamoCache = new DynamoDbDistributedCache(dynamoDb, tableName, dynamoLogger);
+ var multiLayerCache = new MultiLayerCache(dynamoCache, multiLogger);
+ logger?.LogInformation("Multi-layer cache registered with DynamoDB table: {TableName}", tableName);
+ return multiLayerCache;
+ });
+ }
+ catch (Exception ex)
+ {
+ logger?.LogError(ex, "Failed to configure distributed cache for environment {AppEnvironment}", appEnv);
+ throw;
+ }
+ break;
+ }
+ default:
+ {
+ throw new ArgumentOutOfRangeException(nameof(appEnv), appEnv, "Unsupported environment for distributed cache.");
+ }
+ }
+ }
+
private static void AddAskAiUsecase(IServiceCollection services, AppEnv appEnv)
{
var logger = GetLogger(services);
@@ -128,8 +183,10 @@ private static void AddAskAiUsecase(IServiceCollection services, AppEnv appEnv)
try
{
+ // Register GcpIdTokenProvider with distributed cache dependency
+ // Clean: Let DI handle everything automatically
_ = services.AddSingleton();
- logger?.LogInformation("GcpIdTokenProvider registered successfully");
+ logger?.LogInformation("GcpIdTokenProvider registered with distributed cache support");
_ = services.AddScoped();
logger?.LogInformation("LlmGatewayOptions registered successfully");
diff --git a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Caching/DistributedCacheTests.cs b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Caching/DistributedCacheTests.cs
new file mode 100644
index 000000000..a980e1b63
--- /dev/null
+++ b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Caching/DistributedCacheTests.cs
@@ -0,0 +1,323 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Globalization;
+using System.Text.Json;
+using Amazon.DynamoDBv2;
+using Amazon.DynamoDBv2.Model;
+using Elastic.Documentation.Api.Infrastructure.Caching;
+using Elastic.Documentation.Api.Infrastructure.Gcp;
+using FakeItEasy;
+using FluentAssertions;
+using Microsoft.Extensions.Logging.Abstractions;
+
+namespace Elastic.Documentation.Api.Infrastructure.Tests.Caching;
+
+public class InMemoryDistributedCacheTests
+{
+ private readonly InMemoryDistributedCache _cache = new();
+
+ [Fact]
+ public async Task GetAsyncWhenKeyDoesNotExistReturnsNull()
+ {
+ // Arrange
+ var key = CacheKey.Create("test", "nonexistent-key");
+
+ // Act
+ var result = await _cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().BeNull();
+ }
+
+ [Fact]
+ public async Task SetAndGetWhenKeyIsSetReturnsValue()
+ {
+ // Arrange
+ var key = CacheKey.Create("test", "test-key");
+ const string value = "test-value";
+
+ // Act
+ await _cache.SetAsync(key, value, TimeSpan.FromMinutes(1), TestContext.Current.CancellationToken);
+ var result = await _cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().Be(value);
+ }
+
+ [Fact]
+ public async Task GetAsyncWhenEntryExpiredReturnsNull()
+ {
+ // Arrange
+ var key = CacheKey.Create("test", "expiring-key");
+
+ // Act
+ await _cache.SetAsync(key, "value", TimeSpan.FromMilliseconds(10), TestContext.Current.CancellationToken);
+ await Task.Delay(50, TestContext.Current.CancellationToken);
+ var result = await _cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().BeNull("expired entries should be removed");
+ }
+
+ [Fact]
+ public async Task SetAsyncOverwritesExistingValue()
+ {
+ // Arrange
+ var key = CacheKey.Create("test", "key");
+
+ // Act
+ await _cache.SetAsync(key, "first", TimeSpan.FromMinutes(1), TestContext.Current.CancellationToken);
+ await _cache.SetAsync(key, "second", TimeSpan.FromMinutes(1), TestContext.Current.CancellationToken);
+ var result = await _cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().Be("second");
+ }
+}
+
+public class MultiLayerCacheTests
+{
+ [Fact]
+ public async Task GetAsyncWhenL1HitDoesNotCallL2Again()
+ {
+ // Arrange
+ var fakeL2 = A.Fake();
+ var cache = new MultiLayerCache(fakeL2, NullLogger.Instance);
+ var uniqueKey = CacheKey.Create("test", $"test-key-{Guid.NewGuid()}"); // Use unique key to avoid L1 cache pollution from other tests
+
+ // Pre-populate L1 by setting a value
+ await cache.SetAsync(uniqueKey, "value", TimeSpan.FromMinutes(1), TestContext.Current.CancellationToken);
+
+ // Act - Second get should hit L1
+ var result1 = await cache.GetAsync(uniqueKey, TestContext.Current.CancellationToken);
+ var result2 = await cache.GetAsync(uniqueKey, TestContext.Current.CancellationToken);
+
+ // Assert
+ result1.Should().Be("value");
+ result2.Should().Be("value");
+ // L2 should only be called once (during SetAsync), not on subsequent Gets
+ A.CallTo(() => fakeL2.GetAsync(A._, A._))
+ .MustNotHaveHappened();
+ }
+
+ [Fact]
+ public async Task GetAsyncWhenL1MissCallsL2AndPopulatesL1()
+ {
+ // Arrange
+ var fakeL2 = A.Fake();
+ var uniqueKey = CacheKey.Create("test", $"test-key-{Guid.NewGuid()}"); // Use unique key to avoid L1 cache pollution from other tests
+ A.CallTo(() => fakeL2.GetAsync(uniqueKey, A._))
+ .Returns("l2-value");
+
+ var cache = new MultiLayerCache(fakeL2, NullLogger.Instance);
+
+ // Act - First call misses L1, hits L2
+ var result1 = await cache.GetAsync(uniqueKey, TestContext.Current.CancellationToken);
+ // Second call should hit L1 (populated from previous call)
+ var result2 = await cache.GetAsync(uniqueKey, TestContext.Current.CancellationToken);
+
+ // Assert
+ result1.Should().Be("l2-value");
+ result2.Should().Be("l2-value");
+ A.CallTo(() => fakeL2.GetAsync(uniqueKey, A._))
+ .MustHaveHappenedOnceExactly();
+ }
+
+ [Fact]
+ public async Task SetAsyncWritesToBothL1AndL2()
+ {
+ // Arrange
+ var fakeL2 = A.Fake();
+ var cache = new MultiLayerCache(fakeL2, NullLogger.Instance);
+ var uniqueKey = CacheKey.Create("test", $"test-key-{Guid.NewGuid()}"); // Use unique key to avoid L1 cache pollution from other tests
+
+ // Act
+ await cache.SetAsync(uniqueKey, "value", TimeSpan.FromMinutes(1), TestContext.Current.CancellationToken);
+
+ // Get from cache (should hit L1)
+ var result = await cache.GetAsync(uniqueKey, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().Be("value", "L1 should have the value");
+ A.CallTo(() => fakeL2.SetAsync(uniqueKey, "value", TimeSpan.FromMinutes(1), A._))
+ .MustHaveHappenedOnceExactly();
+ }
+
+ [Fact]
+ public async Task GetAsyncWhenBothCachesMissReturnsNull()
+ {
+ // Arrange
+ var fakeL2 = A.Fake();
+ A.CallTo(() => fakeL2.GetAsync(A._, A._))
+ .Returns((string?)null);
+
+ var cache = new MultiLayerCache(fakeL2, NullLogger.Instance);
+ var key = CacheKey.Create("test", "missing-key");
+
+ // Act
+ var result = await cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().BeNull();
+ }
+}
+
+public class DynamoDbDistributedCacheTests
+{
+ [Fact]
+ public async Task GetAsyncWhenItemExistsReturnsValue()
+ {
+ // Arrange
+ var fakeDynamoDb = A.Fake();
+ var key = CacheKey.Create("test", "test-key");
+ // Note: We no longer use ExpiresAt - DynamoDB TTL handles expiration automatically
+
+ var response = new GetItemResponse
+ {
+ Item = new Dictionary
+ {
+ ["CacheKey"] = new AttributeValue { S = key.Value },
+ ["Value"] = new AttributeValue { S = "test-value" },
+ ["TTL"] = new AttributeValue { N = DateTimeOffset.UtcNow.AddMinutes(30).ToUnixTimeSeconds().ToString(CultureInfo.InvariantCulture) }
+ },
+ IsItemSet = true
+ };
+
+ A.CallTo(() => fakeDynamoDb.GetItemAsync(A._, A._))
+ .Returns(response);
+
+ var cache = new DynamoDbDistributedCache(fakeDynamoDb, "test-table", NullLogger.Instance);
+
+ // Act
+ var result = await cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().Be("test-value");
+ }
+
+ [Fact]
+ public async Task GetAsyncWhenItemDoesNotExistReturnsNull()
+ {
+ // Arrange
+ var fakeDynamoDb = A.Fake();
+ var key = CacheKey.Create("test", "missing-key");
+ var response = new GetItemResponse { IsItemSet = false };
+
+ A.CallTo(() => fakeDynamoDb.GetItemAsync(A._, A._))
+ .Returns(response);
+
+ var cache = new DynamoDbDistributedCache(fakeDynamoDb, "test-table", NullLogger.Instance);
+
+ // Act
+ var result = await cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().BeNull();
+ }
+
+ [Fact]
+ public async Task SetAsyncCallsDynamoDbPutItem()
+ {
+ // Arrange
+ var fakeDynamoDb = A.Fake();
+ var cache = new DynamoDbDistributedCache(fakeDynamoDb, "test-table", NullLogger.Instance);
+ var key = CacheKey.Create("test", "key");
+
+ // Act
+ await cache.SetAsync(key, "value", TimeSpan.FromMinutes(30), TestContext.Current.CancellationToken);
+
+ // Assert
+ A.CallTo(() => fakeDynamoDb.PutItemAsync(
+ A.That.Matches(r =>
+ r.TableName == "test-table" &&
+ r.Item["CacheKey"].S == key.Value &&
+ r.Item["Value"].S == "value"
+ ),
+ A._
+ )).MustHaveHappenedOnceExactly();
+ }
+
+ [Fact]
+ public async Task GetAsyncWhenTableNotFoundReturnsNullGracefully()
+ {
+ // Arrange
+ var fakeDynamoDb = A.Fake();
+ var key = CacheKey.Create("test", "key");
+ var exception = new ResourceNotFoundException("Table not found");
+ A.CallTo(() => fakeDynamoDb.GetItemAsync(A._, A._))
+ .Throws(exception);
+
+ var cache = new DynamoDbDistributedCache(fakeDynamoDb, "missing-table", NullLogger.Instance);
+
+ // Act
+ var result = await cache.GetAsync(key, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().BeNull("should handle missing table gracefully");
+ }
+}
+
+public class GcpIdTokenProviderCachingIntegrationTests
+{
+ [Fact]
+ public async Task GenerateIdTokenAsyncUsesCachedTokenWhenValid()
+ {
+ // Arrange
+ // Use HttpMessageHandler directly with method name matching to verify ExchangeJwtForIdToken was not called
+ // See: https://fakeiteasy.github.io/docs/8.1.0/Recipes/faking-http-client/
+ var fakeHandler = A.Fake();
+ using var httpClient = new HttpClient(fakeHandler);
+ var fakeHttpClientFactory = A.Fake();
+ A.CallTo(() => fakeHttpClientFactory.CreateClient(A._))
+ .Returns(httpClient);
+
+ var cache = new InMemoryDistributedCache();
+
+ var provider = new GcpIdTokenProvider(fakeHttpClientFactory, cache);
+ const string targetAudience = "https://test-audience.googleapis.com";
+
+ // Pre-populate cache with valid token (TTL of 45 minutes - matches cache expiry logic)
+ const string cachedToken = "fake-cached-token";
+ var cacheKey = CacheKey.Create("idtoken", targetAudience);
+ await cache.SetAsync(cacheKey, cachedToken, TimeSpan.FromMinutes(45), TestContext.Current.CancellationToken);
+
+ // Act
+ var result = await provider.GenerateIdTokenAsync("{}", targetAudience, TestContext.Current.CancellationToken);
+
+ // Assert
+ result.Should().Be("fake-cached-token", "should return cached token without calling Google OAuth");
+ // Verify that ExchangeJwtForIdToken was not called (PostAsync -> SendAsync)
+ // Using method name matching since SendAsync is protected
+ A.CallTo(fakeHandler)
+ .WithReturnType>()
+ .Where(call => call.Method.Name == "SendAsync")
+ .MustNotHaveHappened();
+ }
+
+ [Fact]
+ public async Task GenerateIdTokenAsyncIgnoresExpiredCachedToken()
+ {
+ // Arrange
+ var cache = new InMemoryDistributedCache();
+ const string targetAudience = "https://test.com";
+
+ // Pre-populate cache with token that has very short TTL (will expire quickly)
+ // InMemoryDistributedCache will remove it when expired
+ const string expiredToken = "expired-token";
+ var cacheKey = CacheKey.Create("idtoken", targetAudience);
+ await cache.SetAsync(cacheKey, expiredToken, TimeSpan.FromMilliseconds(10), TestContext.Current.CancellationToken);
+
+ // Wait for expiration
+ await Task.Delay(50, TestContext.Current.CancellationToken);
+
+ // Act - Try to get the expired token
+ var cachedValue = await cache.GetAsync(cacheKey, TestContext.Current.CancellationToken);
+
+ // Assert - Expired cache entry should return null
+ // GcpIdTokenProvider checks `if (cachedToken != null)` - when cache returns null,
+ // it will generate a new token, effectively ignoring the expired cached token
+ cachedValue.Should().BeNull("expired cache entries should return null, allowing provider to generate new token");
+ }
+}
diff --git a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Elastic.Documentation.Api.Infrastructure.Tests.csproj b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Elastic.Documentation.Api.Infrastructure.Tests.csproj
index 6513b2c7d..3e8bc29f7 100644
--- a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Elastic.Documentation.Api.Infrastructure.Tests.csproj
+++ b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Elastic.Documentation.Api.Infrastructure.Tests.csproj
@@ -10,6 +10,7 @@
+