Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Activation Rebalancing #9140

Open
wants to merge 56 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
6a56ef2
Add in-process test cluster to simplify testing of services which are…
ReubenBond Sep 25, 2024
45b526a
working version
Aug 16, 2024
591bb36
wip
Aug 31, 2024
9eaaa77
refactor
Aug 31, 2024
a40e2ec
wip
Sep 1, 2024
ccaaf40
/
Sep 1, 2024
b350743
/
Sep 1, 2024
39c6e60
/
Sep 1, 2024
dc45412
made grain Immovable, and guarding against non-zero activation key
Sep 1, 2024
898327d
tests & added SiloRebalancingStatistics
Sep 1, 2024
0cedbac
increase the cycle number in test to see effect better
Sep 2, 2024
dbafeef
/
Sep 2, 2024
ceec30f
added more to stats
Sep 2, 2024
1d9d1b7
allowing silo weight to go all the way to 0 + tests
Sep 3, 2024
bdb7e21
added failed session backoff provider
Sep 3, 2024
48f4ba6
switched to using the existing IBackoffProvider and FixedBackoff and …
Sep 3, 2024
021c1cf
refactoring into RebalancingTestBase
Sep 3, 2024
2ae3f75
added dynamic rebalancing test
Sep 4, 2024
ec91211
switched to LoggerMessage templates for logging
Sep 4, 2024
8cb4efb
Added migratability checker
Sep 4, 2024
bccd685
try the best to fulfill MigrateRandomActivations up to the provided …
Sep 4, 2024
b12db9c
Monitor system, and auto-migration in case of shutdowns (graceful once)
Sep 5, 2024
90dcfa6
added status listeners
Sep 7, 2024
2591f6e
removed old testing apps
Sep 7, 2024
a79c5eb
refactoring towards rebalancer report
Sep 7, 2024
5dbb302
more tests and fixed xml docs
Sep 7, 2024
9c89591
report listening & tests for controlling the rebalancer
Sep 7, 2024
e509c15
renamed test
Sep 10, 2024
b492172
added toy project for showing rebalancing in playground
Sep 11, 2024
f3e1b7c
adjusted code after merging main, introduced GetLocalDetailedGrainSta…
Sep 11, 2024
8a386de
xml docs
Sep 11, 2024
a2d37af
simplified GetDetailedGrainStatistics
Sep 11, 2024
c8a9767
reverted back
Sep 12, 2024
1a873c1
added line charts to show rebalancing over time in toy project + incr…
Sep 14, 2024
412fb86
/
Sep 14, 2024
20fe731
added ClusterImbalance to RebalancingReport to be used to determine i…
Sep 15, 2024
72054c3
tweaked options slightly + toy project
Sep 15, 2024
84a7f42
avoiding some allocations + fixing tests
Sep 15, 2024
eb5f87d
added support for scaled allowed entropy deviation to conform the fac…
Sep 15, 2024
1921e66
xml docs
Sep 15, 2024
e75fde9
added ActivationMigrationCountLimit
Sep 15, 2024
af0d723
xml docs
Sep 17, 2024
18bd7a3
addressed some of the PR comments
Sep 20, 2024
0d44de7
PR feedback
ReubenBond Sep 21, 2024
ffef570
PR feedback 2
ReubenBond Sep 21, 2024
e7499a4
Use cancellation tokens with timers
ReubenBond Sep 21, 2024
3ad801a
PR feedback
ReubenBond Sep 21, 2024
ae62957
Use relative time
ReubenBond Sep 21, 2024
9cac2ea
Small fixes
ReubenBond Sep 21, 2024
3d649ed
Changes
ReubenBond Sep 21, 2024
85b0cce
wording
ReubenBond Sep 21, 2024
c96f09f
Implement `ISiloStatisticsChangeListener` safely in grain
ReubenBond Sep 23, 2024
a907ae1
WIP: add & use in-process test cluster
ReubenBond Sep 24, 2024
86ce2bc
addressed remaining comments
Sep 25, 2024
70bee4d
xml doc
Sep 25, 2024
e46bbb6
missed setting default value of ScaledEntropyDeviationActivationThres…
Sep 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Orleans.sln
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,12 @@ Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Orleans.Serialization.FShar
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Orleans.Serialization.MessagePack", "src\Orleans.Serialization.MessagePack\Orleans.Serialization.MessagePack.csproj", "{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ActivationRebalancing", "ActivationRebalancing", "{B0DC8B8D-29CD-4CA3-A874-471F75595829}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ActivationRebalancing.Cluster", "playground\ActivationRebalancing\ActivationRebalancing.Cluster\ActivationRebalancing.Cluster.csproj", "{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ActivationRebalancing.Frontend", "playground\ActivationRebalancing\ActivationRebalancing.Frontend\ActivationRebalancing.Frontend.csproj", "{DFAF9FFC-EBD9-45F0-A121-010D29A296C1}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -632,6 +638,14 @@ Global
{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Release|Any CPU.Build.0 = Release|Any CPU
{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00}.Release|Any CPU.Build.0 = Release|Any CPU
{DFAF9FFC-EBD9-45F0-A121-010D29A296C1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DFAF9FFC-EBD9-45F0-A121-010D29A296C1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DFAF9FFC-EBD9-45F0-A121-010D29A296C1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DFAF9FFC-EBD9-45F0-A121-010D29A296C1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -748,6 +762,9 @@ Global
{84B44F1D-B7FE-40E3-82F0-730A55AC8613} = {316CDCC7-323F-4264-9FC9-667662BB1F80}
{B2D53D3C-E44A-4C9B-AAEE-28FB8C1BDF62} = {A6573187-FD0D-4DF7-91D1-03E07E470C0A}
{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C} = {4CD3AA9E-D937-48CA-BB6C-158E12257D23}
{B0DC8B8D-29CD-4CA3-A874-471F75595829} = {A41DE3D1-F8AA-4234-BE6F-3C9646A1507A}
{2D109E60-E9BF-4F57-BBCD-DF5FA7768B00} = {B0DC8B8D-29CD-4CA3-A874-471F75595829}
{DFAF9FFC-EBD9-45F0-A121-010D29A296C1} = {B0DC8B8D-29CD-4CA3-A874-471F75595829}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {7BFB3429-B5BB-4DB1-95B4-67D77A864952}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<OrleansBuildTimeCodeGen>true</OrleansBuildTimeCodeGen>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Orleans.Server\Orleans.Server.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
using System.Diagnostics;
using System.Net;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Orleans.Configuration;
using Orleans.Runtime.Placement;

#nullable enable

// Ledjon: The silos run in the same process, so they all have the same memory usage.
// I previously had 4 console apps to run the example, but didn't want to add that many projects to the solution.
// Something like Aspire would probably make that easier, but for now I'll leave it like this.
// You (the reader) should feel free to run the silos in separate processes for a more realistic example.

// Entry point of the rebalancing playground: starts four silos in this process, creates a
// deliberately uneven number of grain activations on each of them, then keeps the cluster
// running for a fixed number of sessions while adding more activations (and a fifth silo)
// at predefined points.

var host0 = await StartSiloHost(0);
var host1 = await StartSiloHost(1);
var host2 = await StartSiloHost(2);
var host3 = await StartSiloHost(3);

// The fifth silo (index 4) joins the cluster part-way through the simulation.
IHost? lateHost = null;

Console.WriteLine("All silos have started.");

var grainFactory = host0.Services.GetRequiredService<IGrainFactory>();
var mgmtGrain = grainFactory.GetGrain<IManagementGrain>(0);

var silos = await mgmtGrain.GetHosts(onlyActive: true);
Debug.Assert(silos.Count == 4);
var addresses = silos.Select(x => x.Key).ToArray();

// Ping calls that have been issued but not yet awaited.
var tasks = new List<Task>();

// Initial, intentionally imbalanced distribution.
PingNewGrains(siloIndex: 0, count: 300);
PingNewGrains(siloIndex: 1, count: 30);
PingNewGrains(siloIndex: 2, count: 410);
PingNewGrains(siloIndex: 3, count: 120);
await AwaitPendingPings();

// The original loop ran for session numbers 0 through 55 inclusive.
const int TotalSessions = 56;
var sessionDuration = TimeSpan.FromMilliseconds(5000);

for (var sessionCount = 0; sessionCount < TotalSessions; sessionCount++)
{
    switch (sessionCount)
    {
        case 25:
            PingNewGrains(siloIndex: 0, count: 50);
            PingNewGrains(siloIndex: 1, count: 50);
            break;

        case 35:
            PingNewGrains(siloIndex: 1, count: 50);
            PingNewGrains(siloIndex: 2, count: 50);
            break;

        case 40:
            lateHost = await StartSiloHost(4);
            break;

        case 45:
            PingNewGrains(siloIndex: 2, count: 50);
            PingNewGrains(siloIndex: 3, count: 50);
            break;
    }

    // Observe the calls issued in this session (no-op when nothing was issued).
    await AwaitPendingPings();

    await Task.Delay(sessionDuration); // session duration
}

Console.WriteLine("Simulation has finished. Press Enter to terminate...");
Console.ReadLine();

await host0.StopAsync();
await host1.StopAsync();
await host2.StopAsync();
await host3.StopAsync();

if (lateHost is not null)
{
    await lateHost.StopAsync();
}

// Issues `count` Ping calls to brand-new grain ids, with the placement hint set to the silo
// at `siloIndex` in `addresses`. The calls are queued in `tasks`; they are not awaited here.
// Deliberately synchronous so the RequestContext value is set in the caller's flow, exactly
// as when the loops were written inline.
void PingNewGrains(int siloIndex, int count)
{
    RequestContext.Set(IPlacementDirector.PlacementHintKey, addresses[siloIndex]);
    for (var i = 0; i < count; i++)
    {
        tasks.Add(grainFactory.GetGrain<IRebalancingTestGrain>(Guid.NewGuid()).Ping());
    }
}

// Awaits every queued Ping so that failures are observed instead of being silently dropped,
// then empties the queue. A failure is reported on the console but does not abort the
// simulation, which keeps the sample running like it did before the calls were awaited.
async Task AwaitPendingPings()
{
    if (tasks.Count == 0)
    {
        return;
    }

    try
    {
        await Task.WhenAll(tasks);
    }
    catch (Exception ex)
    {
        Console.WriteLine($"One or more grain calls failed: {ex.Message}");
    }
    finally
    {
        tasks.Clear();
    }
}

// Builds and starts one localhost silo for the sample.
// `num` is added to the default silo and gateway ports so that several silos can share the
// machine; every silo uses the default silo port on loopback as the primary silo endpoint.
// Returns the started host; the caller is responsible for stopping it.
static async Task<IHost> StartSiloHost(int num)
{
    var siloEndpointPort = EndpointOptions.DEFAULT_SILO_PORT + num;
    var gatewayEndpointPort = EndpointOptions.DEFAULT_GATEWAY_PORT + num;
    var primaryEndpoint = new IPEndPoint(IPAddress.Loopback, EndpointOptions.DEFAULT_SILO_PORT);

#pragma warning disable ORLEANSEXP002
    var siloHost = Host.CreateDefaultBuilder()
        .ConfigureLogging(logging =>
        {
            // Only errors by default, but full tracing for the rebalancing category.
            logging.AddFilter("", LogLevel.Error);
            logging.AddFilter("Orleans.Runtime.Placement.Rebalancing", LogLevel.Trace);
            logging.AddConsole();
        })
        .UseOrleans(silo =>
        {
            silo.Configure<ActivationRebalancerOptions>(rebalancerOptions =>
            {
                rebalancerOptions.RebalancerDueTime = TimeSpan.FromSeconds(5);
                rebalancerOptions.SessionCyclePeriod = TimeSpan.FromSeconds(5);
                // uncomment these below, if you want higher migration rate
                //rebalancerOptions.CycleNumberWeight = 1;
                //rebalancerOptions.SiloNumberWeight = 0;
            });

            silo.UseLocalhostClustering(
                siloPort: siloEndpointPort,
                gatewayPort: gatewayEndpointPort,
                primarySiloEndpoint: primaryEndpoint);

            silo.AddActivationRebalancer();
        })
        .Build();
#pragma warning restore ORLEANSEXP002

    await siloHost.StartAsync();
    Console.WriteLine($"Silo{num} started.");

    return siloHost;
}

/// <summary>
/// Guid-keyed grain interface used by this sample; the simulation calls <see cref="Ping"/>
/// on grains with freshly generated ids.
/// </summary>
public interface IRebalancingTestGrain : IGrainWithGuidKey
{
/// <summary>Call with no arguments and no result value.</summary>
/// <returns>A task representing the call.</returns>
Task Ping();
}

/// <summary>
/// Implementation of <see cref="IRebalancingTestGrain"/> that holds no state and does no work.
/// </summary>
public class RebalancingTestGrain : Grain, IRebalancingTestGrain
{
    /// <summary>Completes immediately without doing anything.</summary>
    /// <returns>An already-completed task.</returns>
    public Task Ping()
    {
        return Task.CompletedTask;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Orleans.Server\Orleans.Server.csproj" />
<ProjectReference Include="..\ActivationRebalancing.Cluster\ActivationRebalancing.Cluster.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using Microsoft.AspNetCore.Mvc;
using Orleans.Runtime;
using Orleans;

namespace ActivationRebalancing.Frontend.Controllers;

/// <summary>
/// API controller that reports how many grain activations each silo currently hosts.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class StatsController(IClusterClient clusterClient) : ControllerBase
{
    /// <summary>Largest number of entries this endpoint will return.</summary>
    private const int MaxSupportedSilos = 5;

    /// <summary>Entry count at which a placeholder entry is appended to the result.</summary>
    private const int SiloCountNeedingPlaceholder = 4;

    /// <summary>
    /// Returns one <see cref="SiloData"/> entry per silo address found in the detailed grain
    /// statistics, with the number of statistics entries for that silo as its activation count.
    /// </summary>
    /// <returns>A 200 response whose body is the list of entries.</returns>
    /// <exception cref="NotSupportedException">
    /// More than <see cref="MaxSupportedSilos"/> silos were found.
    /// </exception>
    [HttpGet("silos")]
    public async Task<IActionResult> GetStats()
    {
        var grainStats = await clusterClient
            .GetGrain<IManagementGrain>(0)
            .GetDetailedGrainStatistics();

        var siloData = grainStats
            .GroupBy(stat => stat.SiloAddress)
            .Select(g => new SiloData(g.Key.ToString(), g.Count()))
            .ToList();

        // With exactly four silos, append an empty "x" entry.
        // NOTE(review): presumably this keeps the frontend charts at five series until the
        // fifth silo hosts activations - confirm against the frontend script.
        if (siloData.Count == SiloCountNeedingPlaceholder)
        {
            siloData.Add(new SiloData("x", 0));
        }

        if (siloData.Count > MaxSupportedSilos)
        {
            // The message now states the limit that is actually enforced.
            throw new NotSupportedException(
                $"The frontend can't support more than {MaxSupportedSilos} silos");
        }

        return Ok(siloData);
    }
}

/// <summary>Activation count for a single silo, as returned by the stats endpoint.</summary>
/// <param name="Host">String form of the silo address, or "x" for the placeholder entry.</param>
/// <param name="Activations">Number of activations counted for that silo.</param>
public record SiloData(string Host, int Activations);
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
using Orleans.Hosting;

// Frontend entry point: an ASP.NET Core app that connects to the localhost Orleans cluster
// as a client, serves static files (with index.html as the only default document) and
// exposes the API controllers.
var webBuilder = WebApplication.CreateBuilder(args);

webBuilder.UseOrleansClient(orleansClient =>
{
    orleansClient.UseLocalhostClustering();
});
webBuilder.Services.AddControllers();

var webApp = webBuilder.Build();

// Replace the built-in list of default file names with just index.html.
var defaultFiles = new DefaultFilesOptions();
defaultFiles.DefaultFileNames.Clear();
defaultFiles.DefaultFileNames.Add("index.html");

webApp.UseDefaultFiles(defaultFiles);
webApp.UseStaticFiles();
webApp.MapControllers();
webApp.Run();
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"profiles": {
"http": {
"commandName": "Project",
"dotnetRunMessages": true,
"launchBrowser": true,
"launchUrl": "index.html",
"applicationUrl": "http://localhost:5000",
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
}
},
"AllowedHosts": "*"
}
Loading
Loading