Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Active Rebalancing #8877

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
1aa656f
Non-reentrant timers
ReubenBond Apr 21, 2024
51fdad8
Active Rebalancing
May 19, 2024
915293c
Try to improve candidate vertex max-heap perf
ReubenBond May 20, 2024
d40485b
WIP
ReubenBond May 21, 2024
4315acf
The breaking
ReubenBond May 21, 2024
f79d6e7
Fix & clean up
ReubenBond May 21, 2024
51f7608
wip
ReubenBond May 21, 2024
e463a27
Log 1st chance exceptions in fan-out test
ReubenBond May 21, 2024
cc7bfab
Fixes
ReubenBond May 22, 2024
ba21e6b
Reschedule timer whether sending or receiving a request
ReubenBond May 22, 2024
6457d69
minor clean up
ReubenBond May 22, 2024
5c65e8e
Ignore remote vertex instance if it appears in the local set.
ReubenBond May 23, 2024
2fb6430
Improve timer scheduling slightly - consider winding back into Update…
ReubenBond May 23, 2024
36d4ec9
Remove an option, fix tests
ReubenBond May 25, 2024
f158228
Fix message sink cycles per yield
ReubenBond May 26, 2024
5600ca1
Experiment: anchoring grains with globally negative transfer score
ReubenBond May 26, 2024
22183f1
Add initial bloom filter implementation
ReubenBond May 27, 2024
eaf36ee
Add & use bloom filter to filter 'anchored' grains from Top-K
ReubenBond May 27, 2024
390f2af
Mix instead of re-hashing. Yield only every 25ms+
ReubenBond May 27, 2024
20bb8ca
Improvements
ReubenBond May 27, 2024
0bea27d
REVERT - short recovery
ReubenBond May 27, 2024
4e79452
Yield more in MessageSink
ReubenBond May 27, 2024
e438cca
WIP Aspire playground for Active Rebalancing
ReubenBond May 28, 2024
6996c2c
fixups
ReubenBond May 29, 2024
4afc6b0
Added faster variation of the bloom filter, and adjusted benchmarks
Jun 2, 2024
3f8b7e9
Introduced variable filtering options for anchored grains filtering
Jun 9, 2024
3458a73
fixed some tests
Jun 19, 2024
caf32c6
fixed all tests
Jun 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
<PackageVersion Include="Azure.Messaging.EventHubs" Version="5.9.3" />
<PackageVersion Include="Azure.Storage.Blobs" Version="12.18.0" />
<PackageVersion Include="Azure.Storage.Queues" Version="12.16.0" />
<!-- Aspire -->
<PackageVersion Include="Aspire.Hosting.AppHost" Version="8.0.1" />
<PackageVersion Include="Aspire.Hosting.Orleans" Version="8.0.1" />
<PackageVersion Include="Aspire.Hosting.Redis" Version="8.0.1" />
<PackageVersion Include="Aspire.StackExchange.Redis" Version="8.0.1" />
<!-- 3rd party packages -->
<PackageVersion Include="Google.Cloud.PubSub.V1" Version="1.0.0-beta13" />
<PackageVersion Include="AWSSDK.DynamoDBv2" Version="3.7.300.6" />
Expand Down
26 changes: 26 additions & 0 deletions Orleans.sln
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,16 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Orleans.Streaming.AdoNet",
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Benchmarks.AdoNet", "test\Benchmarks.AdoNet\Benchmarks.AdoNet.csproj", "{B8F43537-2D2E-42A0-BE67-5E07E4313AEA}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "playground", "playground", "{A41DE3D1-F8AA-4234-BE6F-3C9646A1507A}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DashboardToy", "DashboardToy", "{316CDCC7-323F-4264-9FC9-667662BB1F80}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DashboardToy.Frontend", "playground\DashboardToy\DashboardToy.Frontend\DashboardToy.Frontend.csproj", "{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DashboardToy.AppHost", "playground\DashboardToy\DashboradToy.AppHost\DashboardToy.AppHost.csproj", "{84B44F1D-B7FE-40E3-82F0-730A55AC8613}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DashboardToy.Common", "playground\DashboardToy\DashboardToy.Common\DashboardToy.Common.csproj", "{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -593,6 +603,18 @@ Global
{B8F43537-2D2E-42A0-BE67-5E07E4313AEA}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B8F43537-2D2E-42A0-BE67-5E07E4313AEA}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B8F43537-2D2E-42A0-BE67-5E07E4313AEA}.Release|Any CPU.Build.0 = Release|Any CPU
{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B}.Release|Any CPU.Build.0 = Release|Any CPU
{84B44F1D-B7FE-40E3-82F0-730A55AC8613}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{84B44F1D-B7FE-40E3-82F0-730A55AC8613}.Debug|Any CPU.Build.0 = Debug|Any CPU
{84B44F1D-B7FE-40E3-82F0-730A55AC8613}.Release|Any CPU.ActiveCfg = Release|Any CPU
{84B44F1D-B7FE-40E3-82F0-730A55AC8613}.Release|Any CPU.Build.0 = Release|Any CPU
{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -702,6 +724,10 @@ Global
{A073C0EE-8732-42F9-A22E-D47034E25076} = {4CD3AA9E-D937-48CA-BB6C-158E12257D23}
{2B994F33-16CF-4679-936A-5AEABC529D2C} = {EB2EDE59-5021-42EE-A97A-D59939B39C66}
{B8F43537-2D2E-42A0-BE67-5E07E4313AEA} = {2CAB7894-777C-42B1-8B1E-322868CE92C7}
{316CDCC7-323F-4264-9FC9-667662BB1F80} = {A41DE3D1-F8AA-4234-BE6F-3C9646A1507A}
{C4DD4F96-3EC6-47C6-97AA-9B14F0F2099B} = {316CDCC7-323F-4264-9FC9-667662BB1F80}
{84B44F1D-B7FE-40E3-82F0-730A55AC8613} = {316CDCC7-323F-4264-9FC9-667662BB1F80}
{10F842A4-D5F9-41A7-B328-6D5A02BBE4C9} = {316CDCC7-323F-4264-9FC9-667662BB1F80}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {7BFB3429-B5BB-4DB1-95B4-67D77A864952}
Expand Down
51 changes: 48 additions & 3 deletions distributed-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ variables:
clusterId: '{{ "now" | date: "%s" }}'
serviceId: '{{ "now" | date: "%s" }}'
secretSource: KeyVault
framework: net7.0
framework: net8.0

jobs:
server:
source:
localFolder: Artifacts/DistributedTests/DistributedTests.Server/{{framework}}
executable: DistributedTests.Server.exe
readyStateText: Orleans Silo started.
framework: net7.0
framework: net8.0
arguments: "{{configurator}} --clusterId {{clusterId}} --serviceId {{serviceId}} --secretSource {{secretSource}} {{configuratorOptions}}"
onConfigure:
- if (job.endpoints.Count > 0) {
Expand All @@ -21,7 +21,7 @@ jobs:
localFolder: Artifacts/DistributedTests/DistributedTests.Client/{{framework}}
executable: DistributedTests.Client.exe
waitForExit: true
framework: net7.0
framework: net8.0
arguments: "{{command}} --clusterId {{clusterId}} --serviceId {{serviceId}} --secretSource {{secretSource}} {{commandOptions}}"
onConfigure:
- if (job.endpoints.Count > 0) {
Expand All @@ -46,6 +46,22 @@ scenarios:
requestsPerBlock: 500
duration: 120
commandOptions: "--numWorkers {{numWorkers}} --blocksPerWorker {{blocksPerWorker}} --requestsPerBlock {{requestsPerBlock}} --duration {{duration}}"
fanout:
server:
job: server
variables:
instances: 10
configurator: SimpleSilo
client:
job: client
variables:
command: fan-out
instances: 1
numWorkers: 1
blocksPerWorker: 0
requestsPerBlock: 50
duration: 240
commandOptions: "--numWorkers {{numWorkers}} --blocksPerWorker {{blocksPerWorker}} --requestsPerBlock {{requestsPerBlock}} --duration {{duration}}"
streaming:
server:
job: server
Expand Down Expand Up @@ -125,6 +141,35 @@ scenarios:
duration: 180
commandOptions: "--numWorkers {{numWorkers}} --blocksPerWorker {{blocksPerWorker}} --requestsPerBlock {{requestsPerBlock}} --duration {{duration}}"

counters:
- provider: Microsoft.Orleans
values:
- name: app-requests
measurement: orleans-counter/requests-per-second
description: Request rate

- name: activation-count
measurement: orleans-counter/grain-activation-count
description: Total number of grains

results:
# Microsoft.Orleans counters
- name: orleans-counter/requests-per-second
measurement: orleans-counter/requests-per-second
description: Request rate
format: "n0"
aggregate: max
reduce: max
- name: orleans-counter/requests-per-second/95
measurement: orleans-counter/requests-per-second
description: Request rate
format: "n0"
aggregate: percentile95
reduce: max
- name: activation-count
measurement: orleans-counter/grain-activation-count
description: Active grains

profiles:
local:
variables:
Expand Down
6 changes: 6 additions & 0 deletions playground/DashboardToy/DashboardToy.Common/Class1.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace DashboardToy.Common;

public class Class1
{

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<OrleansBuildTimeCodeGen>true</OrleansBuildTimeCodeGen>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Orleans.Runtime\Orleans.Runtime.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<OrleansBuildTimeCodeGen>true</OrleansBuildTimeCodeGen>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Aspire.StackExchange.Redis" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Orleans.Runtime\Orleans.Runtime.csproj" />
<ProjectReference Include="..\..\..\src\Redis\Orleans.Clustering.Redis\Orleans.Clustering.Redis.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
using System.Runtime.InteropServices;
using Orleans.Core.Internal;

namespace DashboardToy.Frontend.Data;

public class ClusterDiagnosticsService(IGrainFactory grainFactory)
{
private readonly Dictionary<SiloAddress, int> _hostKeys = [];
private readonly Dictionary<SiloAddress, HostDetails> _hostDetails = [];
private readonly Dictionary<GrainId, GrainDetails> _grainDetails = []; // Grain to host id
private readonly Dictionary<Key, ulong> _edges = [];
private readonly IManagementGrain _managementGrain = grainFactory.GetGrain<IManagementGrain>(0);
private readonly record struct GrainDetails(int GrainKey, int HostKey);
private readonly record struct HostDetails(int HostKey, int ActivationCount);
private int _version;

public async ValueTask<CallGraph> GetGrainCallFrequencies()
{
var loaderGrain = grainFactory.GetGrain<ILoaderGrain>("root");
var loaderGrainType = loaderGrain.GetGrainId().Type;
var resetCount = await loaderGrain.GetResetCount();
if (resetCount > _version)
{
_version = resetCount;
await ResetAsync();
}

_edges.Clear();
var maxEdgeValue = 0;
var maxActivationCount = 0;

var silos = (await _managementGrain.GetHosts(onlyActive: true)).Keys.Order();
foreach (var silo in silos)
{
var hostKey = GetHostVertex(silo);
var activationCount = 0;
foreach (var activation in await _managementGrain.GetDetailedGrainStatistics(hostsIds: [silo]))
{
if (activation.GrainId.Type.Equals(loaderGrainType)) continue;
if (activation.GrainId.IsSystemTarget()) continue;
var details = GetGrainVertex(activation.GrainId, hostKey);
_grainDetails[activation.GrainId] = new(details.GrainKey, hostKey);
++activationCount;
}

maxActivationCount = Math.Max(maxActivationCount, activationCount);
_hostDetails[silo] = new(hostKey, activationCount);
}

foreach (var edge in await _managementGrain.GetGrainCallFrequencies())
{
if (edge.TargetGrain.Type.Equals(loaderGrainType) || edge.SourceGrain.Type.Equals(loaderGrainType)) continue;
if (edge.TargetGrain.IsSystemTarget() || edge.SourceGrain.IsSystemTarget()) continue;
var sourceHostId = GetHostVertex(edge.SourceHost);
var targetHostId = GetHostVertex(edge.TargetHost);
var sourceVertex = GetGrainVertex(edge.SourceGrain, sourceHostId);
var targetVertex = GetGrainVertex(edge.TargetGrain, targetHostId);
maxEdgeValue = Math.Max(maxEdgeValue, (int)edge.CallCount);
UpdateEdge(new(sourceVertex.GrainKey, targetVertex.GrainKey), edge.CallCount);
}

var grainIds = new List<GraphNode>(_grainDetails.Count);
CollectionsMarshal.SetCount(grainIds, _grainDetails.Count);
foreach ((var grainId, var (grainKey, hostKey)) in _grainDetails)
{
grainIds[grainKey] = new(grainId.ToString(), grainId.Key.ToString()!, hostKey, 1.0);
}

var hostIds = new List<HostNode>(_hostKeys.Count);
CollectionsMarshal.SetCount(hostIds, _hostKeys.Count);
foreach ((var hostId, var key) in _hostKeys)
{
var details = _hostDetails[hostId];
hostIds[key] = new(hostId.ToString(), details.ActivationCount);
}

var edges = new List<GraphEdge>();

foreach (var edge in _edges)
{
edges.Add(new(edge.Key.Source, edge.Key.Target, edge.Value));
}

return new(grainIds, hostIds, edges, maxEdgeValue, maxActivationCount);
}

internal async ValueTask ResetAsync()
{
var fanoutType = grainFactory.GetGrain<IFanOutGrain>(0, "0").GetGrainId().Type;
foreach (var activation in await _managementGrain.GetDetailedGrainStatistics())
{
if (!activation.GrainId.Type.Equals(fanoutType)) continue;
await grainFactory.GetGrain<IGrainManagementExtension>(activation.GrainId).DeactivateOnIdle();
}

Reset();
}

internal void Reset()
{
_hostKeys.Clear();
_hostDetails.Clear();
_grainDetails.Clear();
_edges.Clear();
}

private GrainDetails GetGrainVertex(GrainId grainId, int hostKey)
{
ref var key = ref CollectionsMarshal.GetValueRefOrAddDefault(_grainDetails, grainId, out var exists);
if (!exists)
{
key = new (_grainDetails.Count - 1, hostKey);
}

return key;
}

private int GetHostVertex(SiloAddress silo)
{
ref var key = ref CollectionsMarshal.GetValueRefOrAddDefault(_hostKeys, silo, out var exists);
if (!exists)
{
key = _hostKeys.Count - 1;
}

return key;
}

private void UpdateEdge(Key key, ulong increment)
{
ref var count = ref CollectionsMarshal.GetValueRefOrAddDefault(_edges, key, out var exists);
count += increment;
}
}

public record class CallGraph(List<GraphNode> GrainIds, List<HostNode> HostIds, List<GraphEdge> Edges, int MaxEdgeValue, int MaxActivationCount);

public record struct HostNode(string Name, int ActivationCount);
public record struct GraphNode(string Name, string Key, int Host, double Weight);
public record struct Key(int Source, int Target);
public record struct GraphEdge(int Source, int Target, double Weight);