1 change: 1 addition & 0 deletions .github/workflows/testing.yml
@@ -58,6 +58,7 @@ jobs:
if: github.actor != 'dependabot[bot]'
env:
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
GITHUB_BRANCH: ${{ github.head_ref || github.ref_name }}
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v4
4 changes: 2 additions & 2 deletions src/kernelbot/api/api_utils.py
@@ -213,9 +213,9 @@ async def to_submit_info(

allowed_modes = [
SubmissionMode.TEST,
SubmissionMode.BENCHMARK,
SubmissionMode.PRIVATE,
SubmissionMode.PROFILE,
SubmissionMode.LEADERBOARD,
SubmissionMode.PUBLIC,
]
if submission_mode_enum not in allowed_modes:
raise HTTPException(
16 changes: 8 additions & 8 deletions src/kernelbot/cogs/leaderboard_cog.py
@@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int):
for run in sub_data["runs"]:
if (
not run["secret"]
and run["mode"] == SubmissionMode.LEADERBOARD.value
and run["mode"] == SubmissionMode.PUBLIC.value
and run["passed"]
):
result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data))
@@ -134,7 +134,7 @@ async def submit(
reporter = MultiProgressReporterDiscord(interaction)
sub_id, results = await self.bot.backend.submit_full(req, mode, reporter)

if mode == SubmissionMode.LEADERBOARD:
if mode == SubmissionMode.PUBLIC:
await self.post_submit_hook(interaction, sub_id)
return sub_id

@@ -157,23 +157,23 @@ async def submit_test(
interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu
)

@app_commands.command(name="benchmark", description="Start a benchmarking run")
@app_commands.command(name="private", description="Start a private benchmarking run")
@app_commands.describe(
leaderboard_name="Name of the competition / kernel to optimize",
script="The Python / CUDA script file to run",
gpu="Select GPU. Leave empty for interactive or automatic selection.",
)
@app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
@with_error_handling
async def submit_bench(
async def submit_private(
self,
interaction: discord.Interaction,
script: discord.Attachment,
leaderboard_name: Optional[str],
gpu: Optional[str],
):
return await self.submit(
interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu
interaction, leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu
)

@app_commands.command(name="profile", description="Start a profiling run")
@@ -196,7 +196,7 @@ async def submit_profile(
)

@app_commands.command(
name="ranked", description="Start a ranked run for an official leaderboard submission"
name="public", description="Start a public run for an official leaderboard submission"
)
@app_commands.describe(
leaderboard_name="Name of the competition / kernel to optimize",
@@ -205,15 +205,15 @@ async def submit_profile(
)
@app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
@with_error_handling
async def submit_ranked(
async def submit_public(
self,
interaction: discord.Interaction,
script: discord.Attachment,
leaderboard_name: Optional[str] = None,
gpu: Optional[str] = None,
):
return await self.submit(
interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu
interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu
)


8 changes: 4 additions & 4 deletions src/kernelbot/cogs/verify_run_cog.py
@@ -171,8 +171,8 @@ async def verify_modal_run(
@app_commands.choices(
mode=[
Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value),
Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value),
Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value),
Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value),
Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value),
Choice(name="All", value="all"),
]
)
@@ -194,9 +194,9 @@ async def verify_task(

modes = []
if mode is None:
modes = [SubmissionMode.LEADERBOARD]
modes = [SubmissionMode.PUBLIC]
elif mode.value == "all":
modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD]
modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC]
else:
modes = [SubmissionMode(mode.value)]

22 changes: 12 additions & 10 deletions src/libkernelbot/backend.py
@@ -86,7 +86,7 @@ async def submit_full(
for gpu in selected_gpus
]

if mode == SubmissionMode.LEADERBOARD:
if mode == SubmissionMode.PUBLIC:
tasks += [
self.submit_leaderboard(
sub_id,
@@ -95,7 +95,7 @@ async def submit_full(
gpu,
reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"),
req.task,
SubmissionMode.PRIVATE,
SubmissionMode.SECRET,
req.secret_seed,
)
for gpu in selected_gpus
@@ -142,12 +142,14 @@ async def submit_leaderboard( # noqa: C901

if result.success:
score = None
# Check for the mode's result key (public or secret)
mode_key = mode.value
if (
"leaderboard" in result.runs
and result.runs["leaderboard"].run.success
and result.runs["leaderboard"].run.passed
mode_key in result.runs
and result.runs[mode_key].run.success
and result.runs[mode_key].run.passed
Comment on lines +146 to +150

Copilot AI, Feb 2, 2026:

This logic uses mode_key = mode.value and then computes a score whenever mode_key exists in result.runs, which means PRIVATE submissions will also receive a non-null score. Because create_submission_run later writes that score with secret=False, PRIVATE (non-ranked) runs will end up contributing to the public leaderboard, contradicting the new semantics where only PUBLIC (and internal SECRET) runs should affect ranking. To align behavior with the new mode definitions, restrict score calculation (and setting a non-None score) to ranked modes only (e.g., PUBLIC/SECRET), and keep PRIVATE runs’ score as None so they are excluded by the DB queries that filter on score IS NOT NULL and NOT secret.
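A minimal sketch of the restriction suggested above (hypothetical, not code from this PR; it reuses result, task, submission_id, and mode from the surrounding submit_leaderboard scope, and the import path is assumed):

from libkernelbot.consts import SubmissionMode  # assumed import path

# Only ranked modes produce a non-None score; PRIVATE runs keep score = None,
# so queries filtering on score IS NOT NULL AND NOT secret exclude them.
RANKED_MODES = {SubmissionMode.PUBLIC, SubmissionMode.SECRET}

score = None
mode_key = mode.value
if mode in RANKED_MODES and mode_key in result.runs:
    ranked_run = result.runs[mode_key].run
    if ranked_run.success and ranked_run.passed:
        score = compute_score(result, task, submission_id, mode_key)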
):
score = compute_score(result, task, submission_id)
score = compute_score(result, task, submission_id, mode_key)

# verifyruns uses a fake submission id of -1
if submission_id != -1:
@@ -159,8 +161,8 @@
end=value.end,
mode=key,
runner=gpu_type.name,
score=None if key != "leaderboard" else score,
secret=mode == SubmissionMode.PRIVATE,
score=None if key != mode_key else score,
secret=mode == SubmissionMode.SECRET,
compilation=value.compilation,
result=value.run,
system=result.system,
@@ -207,7 +209,7 @@ async def handle_submission(
await reporter.update_title(reporter.title + " ✅ success")

short_report = make_short_report(
result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD]
result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET]
)

stream_msg = (
@@ -222,7 +224,7 @@
)

await reporter.push(short_report)
if mode != SubmissionMode.PRIVATE:
if mode != SubmissionMode.SECRET:
try:
# does the last message of the short report start with ✅ or ❌?
verdict = short_report[-1][0]
19 changes: 10 additions & 9 deletions src/libkernelbot/consts.py
@@ -82,21 +82,22 @@ class SubmissionMode(Enum):
"""
Different types of submission that can be made:
Test: Run tests and give detailed results about passed/failed tests. These have short timeouts.
Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times.
Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times.
Returns detailed timing results but doesn't affect leaderboard ranking.
Profile: Gather profiling information. One selected benchmark is run under the profiler. No
testing is performed in this mode (sometimes, you need to profile deliberately broken code)
Leaderboard: Official submission to the leaderboard. This first runs public tests, then a
repeated invocation of a single benchmark. Feedback for the secret benchmark is only very
limited (no stdout/stderr).
Private: Special run that does test followed by leaderboard (on a secret seed), but gives only
very limited feedback.
Public: Official submission to the leaderboard. This first runs public tests, then a
repeated invocation of a single benchmark. If all tests pass, the submission is evaluated
and ranked on the public leaderboard.
Secret: Internal mode for running the full evaluation flow with a secret seed. This is used
for secret validation runs that accompany public submissions.
"""

TEST = "test"
BENCHMARK = "benchmark"
PROFILE = "profile"
LEADERBOARD = "leaderboard"
PRIVATE = "private"
PROFILE = "profile"
PUBLIC = "public"
SECRET = "secret"


class Language(Enum):
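For context, a small sketch of which entries each renamed mode is expected to leave in result.runs on a fully successful run, mirroring the run_evaluation dispatch in run_eval.py further down in this diff (the import path is assumed; PROFILE is omitted because it produces one "profile.<i>" entry per selected benchmark):

from libkernelbot.consts import SubmissionMode  # assumed src-layout import

# Result keys produced per mode when every stage passes, per run_evaluation:
EXPECTED_RESULT_KEYS = {
    SubmissionMode.TEST: ["test"],
    SubmissionMode.PRIVATE: ["private"],
    SubmissionMode.PUBLIC: ["test", "private", "public"],
    SubmissionMode.SECRET: ["test", "private", "secret"],
}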
4 changes: 2 additions & 2 deletions src/libkernelbot/launchers/github.py
@@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int:
mode = config.get("mode")
sec_map = {
SubmissionMode.TEST.value: config.get("test_timeout"),
SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"),
SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"),
SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"),
SubmissionMode.PUBLIC.value: config.get("ranked_timeout"),
}
seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60
return math.ceil(seconds / 60)
28 changes: 16 additions & 12 deletions src/libkernelbot/report.py
@@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n
elif full:
result.append("❌ Tests missing")

if "benchmark" in runs:
bench_run = runs["benchmark"].run
if "private" in runs:
bench_run = runs["private"].run
if not bench_run.success:
result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run))
return result
@@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n
else:
result.append("✅ Profiling successful")

if "leaderboard" in runs:
lb_run = runs["leaderboard"].run
# Check for public or secret run results
ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
if ranked_key:
lb_run = runs[ranked_key].run
if not lb_run.success:
result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run))
result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run))
elif not lb_run.passed:
result.append("❌ Leaderboard run failed")
result.append("❌ Ranked submission failed")
else:
result.append("✅ Leaderboard run successful")
result.append("✅ Ranked submission successful")
elif full:
result.append("❌ Leaderboard missing")
result.append("❌ Ranked submission missing")
return result


@@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
num_tests = int(test_run.result.get("test-count", 0))
report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run))

if "benchmark" in runs:
bench_run = runs["benchmark"]
if "private" in runs:
bench_run = runs["private"]
if _handle_crash_report(report, bench_run):
return report

@@ -378,8 +380,10 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
base64.b64decode(prof_run.profile.trace),
)

if "leaderboard" in runs:
bench_run = runs["leaderboard"]
# Check for public or secret run results
ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
if ranked_key:
bench_run = runs[ranked_key]
if _handle_crash_report(report, bench_run):
return report

16 changes: 8 additions & 8 deletions src/libkernelbot/run_eval.py
@@ -556,8 +556,8 @@ def run_single_evaluation(
if mode == "test":
timeout = test_timeout
cases.write(tests)
elif mode in ["benchmark", "profile", "leaderboard"]:
timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout
elif mode in ["private", "profile", "public", "secret"]:
timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout
if ranking_by == "last":
cases.write(benchmarks.splitlines(keepends=True)[-1])
else:
@@ -801,22 +801,22 @@ def run_evaluation(
common_args["benchmarks"] = benchmark
results[f"{mode}.{i}"] = call(mode=mode, **common_args)

elif mode in ["test", "benchmark"]:
elif mode in ["test", "private"]:
results[mode] = call(mode=mode, **common_args)
elif mode in ["private", "leaderboard"]:
elif mode in ["public", "secret"]:
# first, run the tests
results["test"] = call(mode="test", **common_args)

if not results["test"].run or not results["test"].run.passed:
return results

results["benchmark"] = call(mode="benchmark", **common_args)
results["private"] = call(mode="private", **common_args)

if not results["benchmark"].run or not results["benchmark"].run.passed:
if not results["private"].run or not results["private"].run.passed:
return results

# if they pass, run the leaderboard validation
results["leaderboard"] = call(mode="leaderboard", **common_args)
# if they pass, run the public/secret validation
results[mode] = call(mode=mode, **common_args)
else:
raise AssertionError("Invalid mode")

8 changes: 4 additions & 4 deletions src/libkernelbot/submission.py
@@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict: # noqa: C901
return popcorn_info


def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float:
num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"])
def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float:
num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"])
if task.ranking_by == RankCriterion.LAST:
if num_benchmarks != 1:
logger.error(
@@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int)
raise KernelBotError(
f"Expected submission to have exactly one benchmark, got {num_benchmarks}."
)
score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9
score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9
else:
scores = []
for i in range(num_benchmarks):
scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9)
scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9)
if task.ranking_by == RankCriterion.MEAN:
score = sum(scores) / len(scores)
elif task.ranking_by == RankCriterion.GEOM: