Skip to content

Commit

Permalink
Partition SuperPMI replay task (#66065)
Browse files Browse the repository at this point in the history
* Partition SuperPMI replay task

Split per-platform/architecture work into multiple partitions to increase pipeline
parallelism and reduce overall job time. The partitions are sets of
different JitStressRegs options.

We could create a partition for each JitStressRegs option, but the concern
is that there is potentially a lot of overhead downloading the large set
of MCH files, and we might want to share that overhead between work partitions.

* Add comment
  • Loading branch information
BruceForstall committed Mar 2, 2022
1 parent d21a17f commit 83f204e
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 15 deletions.
29 changes: 20 additions & 9 deletions src/coreclr/scripts/superpmi-replay.proj
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,34 @@
</ItemGroup>

<ItemGroup Condition="'$(Architecture)' == 'x64'">
<SPMI_Partition Include="win-x64" Platform="windows" Architecture="x64" />
<SPMI_Partition Include="win-arm64" Platform="windows" Architecture="arm64" />
<SPMI_Partition Include="unix-x64" Platform="Linux" Architecture="x64" />
<SPMI_Partition Include="linux-arm64" Platform="Linux" Architecture="arm64" />
<SPMI_Partition Include="osx-arm64" Platform="OSX" Architecture="arm64" />
<!-- Use 2 partitions for each run on an x64 machine -->
<SPMI_Partition Include="win-x64-1" Platform="windows" Architecture="x64" Partition="1" PartitionCount="2"/>
<SPMI_Partition Include="win-x64-2" Platform="windows" Architecture="x64" Partition="2" PartitionCount="2"/>
<SPMI_Partition Include="win-arm64-1" Platform="windows" Architecture="arm64" Partition="1" PartitionCount="2"/>
<SPMI_Partition Include="win-arm64-2" Platform="windows" Architecture="arm64" Partition="2" PartitionCount="2"/>
<SPMI_Partition Include="unix-x64-1" Platform="Linux" Architecture="x64" Partition="1" PartitionCount="2"/>
<SPMI_Partition Include="unix-x64-2" Platform="Linux" Architecture="x64" Partition="2" PartitionCount="2"/>
<SPMI_Partition Include="linux-arm64-1" Platform="Linux" Architecture="arm64" Partition="1" PartitionCount="2"/>
<SPMI_Partition Include="linux-arm64-2" Platform="Linux" Architecture="arm64" Partition="2" PartitionCount="2"/>
<SPMI_Partition Include="osx-arm64-1" Platform="OSX" Architecture="arm64" Partition="1" PartitionCount="2"/>
<SPMI_Partition Include="osx-arm64-2" Platform="OSX" Architecture="arm64" Partition="2" PartitionCount="2"/>
</ItemGroup>

<ItemGroup Condition="'$(Architecture)' == 'x86'">
<SPMI_Partition Include="win-x86" Platform="windows" Architecture="x86" />
<SPMI_Partition Include="unix-arm" Platform="Linux" Architecture="arm" />
<!-- The x86 machine replays are slower than x64, so use 3 partitions for each run on x86 -->
<SPMI_Partition Include="win-x86-1" Platform="windows" Architecture="x86" Partition="1" PartitionCount="3"/>
<SPMI_Partition Include="win-x86-2" Platform="windows" Architecture="x86" Partition="2" PartitionCount="3"/>
<SPMI_Partition Include="win-x86-3" Platform="windows" Architecture="x86" Partition="3" PartitionCount="3"/>
<SPMI_Partition Include="unix-arm-1" Platform="Linux" Architecture="arm" Partition="1" PartitionCount="3"/>
<SPMI_Partition Include="unix-arm-2" Platform="Linux" Architecture="arm" Partition="2" PartitionCount="3"/>
<SPMI_Partition Include="unix-arm-3" Platform="Linux" Architecture="arm" Partition="3" PartitionCount="3"/>
</ItemGroup>

<ItemGroup>
<HelixWorkItem Include="@(SPMI_Partition)">
<Command>$(WorkItemCommand) -arch %(HelixWorkItem.Architecture) -platform %(HelixWorkItem.Platform) -log_directory $(SuperpmiLogsLocation)</Command>
<Command>$(WorkItemCommand) -arch %(HelixWorkItem.Architecture) -platform %(HelixWorkItem.Platform) -partition %(HelixWorkItem.Partition) -partition_count %(HelixWorkItem.PartitionCount) -log_directory $(SuperpmiLogsLocation)</Command>
<Timeout>$(WorkItemTimeout)</Timeout>
<DownloadFilesFromResults>superpmi_%(HelixWorkItem.Platform)_%(HelixWorkItem.Architecture).log</DownloadFilesFromResults>
<DownloadFilesFromResults>superpmi_%(HelixWorkItem.Platform)_%(HelixWorkItem.Architecture)_%(HelixWorkItem.Partition).log</DownloadFilesFromResults>
</HelixWorkItem>
</ItemGroup>
</Project>
67 changes: 61 additions & 6 deletions src/coreclr/scripts/superpmi_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
parser.add_argument("-platform", help="OS platform")
parser.add_argument("-jit_directory", help="path to the directory containing clrjit binaries")
parser.add_argument("-log_directory", help="path to the directory containing superpmi log files")
parser.add_argument("-partition", help="Partition number specifying which set of flags to use: between 1 and the `-partition_count` value")
parser.add_argument("-partition_count", help="Count of the total number of partitions we are using: should be <= 9 (number of jit_flags_all elements)")

jit_flags = [
jit_flags_all = [
"JitStressRegs=0",
# JitStressRegs=1 disabled due to https://github.com/dotnet/runtime/issues/65332
# "JitStressRegs=1",
Expand All @@ -38,6 +40,20 @@
"JitStressRegs=0x1000",
]

def split(a, n):
    """ Splits array `a` into `n` contiguous partitions of nearly equal size.

    Slightly modified from https://stackoverflow.com/a/2135920.

    The first `len(a) % n` partitions each receive one extra element, so partition
    sizes differ by at most one. If `n` is larger than `len(a)`, the trailing
    partitions are empty.

    Args:
        a (list): sequence to split (anything that supports `len()` and slicing)
        n (int): number of partitions to create; must be >= 1

    Returns:
        list of `n` slices of `a`, in original order, which concatenate back to `a`

    Raises:
        ValueError: if `n` is less than 1
    """
    # Fail with a clear message instead of an opaque ZeroDivisionError (n == 0)
    # or a silently empty result (n < 0).
    if n < 1:
        raise ValueError("Partition count must be >= 1, got {}".format(n))

    # k is the base partition size; the first m partitions get one extra element.
    k, m = divmod(len(a), n)
    return [a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]


def setup_args(args):
""" Setup the args for SuperPMI to use.
Expand Down Expand Up @@ -69,9 +85,39 @@ def setup_args(args):

coreclr_args.verify(args,
"log_directory",
lambda log_directory: True,
lambda log_directory: os.path.isdir(log_directory),
"log_directory doesn't exist")

coreclr_args.verify(args,
"partition",
lambda partition: True,
"Unable to set partition")

coreclr_args.verify(args,
"partition_count",
lambda partition: True,
"Unable to set partition_count")

try:
coreclr_args.partition = int(coreclr_args.partition)
except ValueError as e:
print("Illegal `-partition` value: " + str(coreclr_args.partition))
sys.exit(1)

try:
coreclr_args.partition_count = int(coreclr_args.partition_count)
except ValueError as e:
print("Illegal `-partition_count` value: " + str(coreclr_args.partition_count))
sys.exit(1)

if coreclr_args.partition_count <= 0:
print("Illegal `-partition_count` value: " + str(coreclr_args.partition_count))
sys.exit(1)

if coreclr_args.partition < 1 or coreclr_args.partition > coreclr_args.partition_count:
print("Illegal `-partition` value: " + str(coreclr_args.partition))
sys.exit(1)

return coreclr_args


Expand All @@ -81,7 +127,6 @@ def main(main_args):
Args:
main_args ([type]): Arguments to the script
"""

python_path = sys.executable
cwd = os.path.dirname(os.path.realpath(__file__))
coreclr_args = setup_args(main_args)
Expand All @@ -94,9 +139,19 @@ def main(main_args):
os_name = "universal" if arch_name.startswith("arm") else os_name
jit_path = os.path.join(coreclr_args.jit_directory, 'clrjit_{}_{}_{}.dll'.format(os_name, arch_name, host_arch_name))

jit_flags_partitioned = split(jit_flags_all, coreclr_args.partition_count)
jit_flags = jit_flags_partitioned[coreclr_args.partition - 1] # partition number is 1-based

print("Running superpmi.py download")
run_command([python_path, os.path.join(cwd, "superpmi.py"), "download", "--no_progress", "-target_os", platform_name,
"-target_arch", arch_name, "-core_root", cwd, "-spmi_location", spmi_location], _exit_on_fail=True)
run_command([python_path,
os.path.join(cwd, "superpmi.py"),
"download",
"--no_progress",
"-target_os", platform_name,
"-target_arch", arch_name,
"-core_root", cwd,
"-spmi_location", spmi_location,
"-log_level", "debug"], _exit_on_fail=True)

failed_runs = []
for jit_flag in jit_flags:
Expand All @@ -122,7 +177,7 @@ def main(main_args):
failed_runs.append("Failure in {}".format(log_file))

# Consolidate all superpmi_*.logs in superpmi_platform_architecture_partition.log
final_log_name = os.path.join(log_directory, "superpmi_{}_{}.log".format(platform_name, arch_name))
final_log_name = os.path.join(log_directory, "superpmi_{}_{}_{}.log".format(platform_name, arch_name, coreclr_args.partition))
print("Consolidating final {}".format(final_log_name))
with open(final_log_name, "a") as final_superpmi_log:
for superpmi_log in os.listdir(log_directory):
Expand Down

0 comments on commit 83f204e

Please sign in to comment.