From da7a1de3705790e49ea2f68bf8cc7046ccb69c22 Mon Sep 17 00:00:00 2001
From: cmyui
Date: Sat, 7 Feb 2026 12:57:00 -0500
Subject: [PATCH] Add benchmark suite and request timing instrumentation

Benchmark suite (make bench) measures operation latencies across all
major subsystems: slab allocator, database, string utilities, JSON
builder, and HTTP request parsing. Uses the ARM64 Generic Timer and
reports results in nanoseconds.

Request timing logs wall-clock time for every HTTP request handled by
slowapi_handle, printed to UART on completion.

Co-Authored-By: Claude Opus 4.6
---
 Makefile                  |  32 ++-
 src/bench/bench_db.s      | 187 +++++++++++++++
 src/bench/bench_harness.s | 467 ++++++++++++++++++++++++++++++++++++++
 src/bench/bench_json.s    | 139 ++++++++++++
 src/bench/bench_main.s    |  27 +++
 src/bench/bench_request.s |  72 ++++++
 src/bench/bench_slab.s    | 117 ++++++++++
 src/bench/bench_string.s  | 109 +++++++++
 src/drivers/timer.s       |  20 ++
 src/slowapi/slowapi.s     |  75 ++++++
 10 files changed, 1243 insertions(+), 2 deletions(-)
 create mode 100644 src/bench/bench_db.s
 create mode 100644 src/bench/bench_harness.s
 create mode 100644 src/bench/bench_json.s
 create mode 100644 src/bench/bench_main.s
 create mode 100644 src/bench/bench_request.s
 create mode 100644 src/bench/bench_slab.s
 create mode 100644 src/bench/bench_string.s
 create mode 100644 src/drivers/timer.s

diff --git a/Makefile b/Makefile
index 842f1be..019e4a8 100644
--- a/Makefile
+++ b/Makefile
@@ -7,10 +7,12 @@ OBJCOPY = aarch64-none-elf-objcopy
 KERNEL_ELF = kernel.elf
 KERNEL_BIN = kernel.bin
 TEST_ELF = kernel_test.elf
+BENCH_ELF = kernel_bench.elf
 
 # Common source files (shared between server and tests)
 COMMON_SRCS = src/drivers/uart.s \
               src/drivers/virtio.s \
+              src/drivers/timer.s \
               src/net/ethernet.s \
               src/net/arp.s \
               src/net/ipv4.s \
@@ -61,11 +63,24 @@ TEST_SRCS = src/test/test_main.s \
             src/app.s
 TEST_OBJS = $(TEST_SRCS:.s=.o)
 
+# Benchmark sources
+BENCH_SRCS = src/bench/bench_main.s \
+             src/bench/bench_harness.s \
+             src/bench/bench_slab.s \
+             src/bench/bench_db.s \
+             src/bench/bench_string.s \
+             src/bench/bench_json.s \
+             src/bench/bench_request.s \
+             $(COMMON_SRCS) \
+             $(MEM_DB_SRCS) \
+             $(SLOWAPI_SRCS)
+BENCH_OBJS = $(BENCH_SRCS:.s=.o)
+
 # Flags
 ASFLAGS = -g
 LDFLAGS = -T linker.ld -nostdlib
 
-.PHONY: all clean run test
+.PHONY: all clean run test bench
 
 all: $(KERNEL_ELF)
 
@@ -75,6 +90,9 @@ $(KERNEL_ELF): $(SERVER_OBJS) linker.ld
 $(TEST_ELF): $(TEST_OBJS) linker.ld
 	$(LD) $(LDFLAGS) -o $@ $(TEST_OBJS)
 
+$(BENCH_ELF): $(BENCH_OBJS) linker.ld
+	$(LD) $(LDFLAGS) -o $@ $(BENCH_OBJS)
+
 $(KERNEL_BIN): $(KERNEL_ELF)
 	$(OBJCOPY) -O binary $< $@
 
@@ -82,7 +100,7 @@ $(KERNEL_BIN): $(KERNEL_ELF)
 	$(AS) $(ASFLAGS) -o $@ $<
 
 clean:
-	rm -f $(SERVER_OBJS) $(TEST_OBJS) $(KERNEL_ELF) $(KERNEL_BIN) $(TEST_ELF)
+	rm -f $(SERVER_OBJS) $(TEST_OBJS) $(BENCH_OBJS) $(KERNEL_ELF) $(KERNEL_BIN) $(TEST_ELF) $(BENCH_ELF)
 
 run: $(KERNEL_ELF)
 	qemu-system-aarch64 \
@@ -103,3 +121,13 @@ test: $(TEST_ELF)
 		-kernel $(TEST_ELF) \
 		-device virtio-net-device,netdev=net0 \
 		-netdev user,id=net0
+
+bench: $(BENCH_ELF)
+	qemu-system-aarch64 \
+		-machine virt \
+		-cpu cortex-a72 \
+		-nographic \
+		-global virtio-mmio.force-legacy=true \
+		-kernel $(BENCH_ELF) \
+		-device virtio-net-device,netdev=net0 \
+		-netdev user,id=net0
diff --git a/src/bench/bench_db.s b/src/bench/bench_db.s
new file mode 100644
index 0000000..f743859
--- /dev/null
+++ b/src/bench/bench_db.s
@@ -0,0 +1,187 @@
+// Database Benchmarks
+
+.section .text
+.global run_db_benchmarks
+
+run_db_benchmarks:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =section_name
+    bl bench_section
+
+    ldr x0, =ctx_db_create
+    bl bench_run
+    ldr x0, =ctx_db_get
+    bl bench_run
+    ldr x0, =ctx_db_delete
+    bl bench_run
+    ldr x0, =ctx_db_list
+    bl bench_run
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark functions
+//=============================================================================
+
+// Create + delete a record (self-contained per iteration)
+bench_fn_db_create:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =test_record
+    mov x1, #8
+    bl db_create
+    // Delete to keep db clean for next iteration (x0 from db_create is passed on)
+    bl db_delete
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Get a record by ID (record created in setup)
+bench_fn_db_get:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =db_saved_id
+    ldr w0, [x0]
+    bl db_get
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Create + delete (both calls are timed together; same work as db_create)
+bench_fn_db_delete:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =test_record
+    mov x1, #8
+    bl db_create
+    bl db_delete
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Count records
+bench_fn_db_list:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    bl db_count
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Setup: reinit mem + db
+bench_db_setup_reinit:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    bl mem_init
+    bl db_init
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Setup: reinit + create one record (for get benchmark)
+bench_db_setup_create:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    bl mem_init
+    bl db_init
+    ldr x0, =test_record
+    mov x1, #8
+    bl db_create
+    ldr x1, =db_saved_id
+    str w0, [x1]
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Setup: reinit + populate 10 records (for list benchmark)
+bench_db_setup_populate:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, xzr, [sp, #-16]!
+
+    bl mem_init
+    bl db_init
+
+    mov w19, #10
+.populate_loop:
+    cbz w19, .populate_done
+    ldr x0, =test_record
+    mov x1, #8
+    bl db_create
+    sub w19, w19, #1
+    b .populate_loop
+.populate_done:
+
+    ldp x19, xzr, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark contexts
+//=============================================================================
+
+.section .data
+.balign 8
+ctx_db_create:
+    .quad name_db_create
+    .quad bench_fn_db_create
+    .quad bench_db_setup_reinit
+    .quad 0
+    .word 500
+    .skip 28
+
+.balign 8
+ctx_db_get:
+    .quad name_db_get
+    .quad bench_fn_db_get
+    .quad bench_db_setup_create
+    .quad 0
+    .word 1000
+    .skip 28
+
+.balign 8
+ctx_db_delete:
+    .quad name_db_delete
+    .quad bench_fn_db_delete
+    .quad bench_db_setup_reinit
+    .quad 0
+    .word 500
+    .skip 28
+
+.balign 8
+ctx_db_list:
+    .quad name_db_list
+    .quad bench_fn_db_list
+    .quad bench_db_setup_populate
+    .quad 0
+    .word 100
+    .skip 28
+
+.section .rodata
+section_name:
+    .asciz "database"
+name_db_create:
+    .asciz "db_create"
+name_db_get:
+    .asciz "db_get"
+name_db_delete:
+    .asciz "db_delete"
+name_db_list:
+    .asciz "db_list"
+
+test_record:
+    .ascii "testdata"
+
+.section .bss
+.balign 4
+db_saved_id:
+    .skip 4
diff --git a/src/bench/bench_harness.s b/src/bench/bench_harness.s
new file mode 100644
index 0000000..fb8faa1
--- /dev/null
+++ b/src/bench/bench_harness.s
@@ -0,0 +1,467 @@
+// Benchmark Harness
+// Run N iterations, track min/avg/max ticks, print results
+
+.section .text
+.global bench_start
+.global bench_end
+.global bench_section
+.global bench_run
+
+// Benchmark context layout (all 8-byte fields must be 8-byte aligned)
+.equ BENCH_NAME, 0          // 8 bytes: pointer to name string
+.equ BENCH_FUNC, 8          // 8 bytes: pointer to benchmark function
+.equ BENCH_SETUP, 16        // 8 bytes: setup function (0 = none)
+.equ BENCH_TEARDOWN, 24     // 8 bytes: teardown function (0 = none)
+.equ BENCH_ITERS, 32        // 4 bytes: iteration count
+                            // 4 bytes: padding
+.equ BENCH_MIN, 40          // 8 bytes: min ticks
+.equ BENCH_MAX, 48          // 8 bytes: max ticks
+.equ BENCH_TOTAL, 56        // 8 bytes: total ticks
+.equ BENCH_CTX_SIZE, 64
+
+// bench_start: Print header, cache timer frequency
+bench_start:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    // Cache timer frequency
+    bl timer_freq
+    ldr x1, =cached_freq
+    str x0, [x1]
+
+    // Reset bench count
+    ldr x0, =bench_count
+    str wzr, [x0]
+
+    // Print header
+    ldr x0, =msg_bench_header
+    bl uart_puts
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_end: Print summary
+bench_end:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =msg_bench_summary
+    bl uart_puts
+
+    // Print count
+    ldr x0, =msg_bench_run
+    bl uart_puts
+    ldr x0, =bench_count
+    ldr w0, [x0]
+    bl bench_print_decimal
+    bl uart_newline
+
+    // Print frequency
+    ldr x0, =msg_timer_freq
+    bl uart_puts
+    ldr x0, =cached_freq
+    ldr x0, [x0]
+    // Print 64-bit frequency as decimal
+    bl bench_print_decimal_64
+    ldr x0, =msg_hz
+    bl uart_puts
+    bl uart_newline
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_section: Print section header
+// Input: x0 = section name string
+bench_section:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, xzr, [sp, #-16]!
+
+    mov x19, x0
+
+    ldr x0, =msg_section_start
+    bl uart_puts
+    mov x0, x19
+    bl uart_puts
+    ldr x0, =msg_section_end
+    bl uart_puts
+
+    ldp x19, xzr, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_run: Run a benchmark
+// Input: x0 = pointer to benchmark context
+bench_run:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+    stp x21, x22, [sp, #-16]!
+    stp x23, x24, [sp, #-16]!
+
+    mov x19, x0                     // benchmark context
+
+    // Initialize min/max/total
+    mov x0, #-1                     // 0xFFFFFFFFFFFFFFFF
+    str x0, [x19, #BENCH_MIN]
+    str xzr, [x19, #BENCH_MAX]
+    str xzr, [x19, #BENCH_TOTAL]
+
+    // Increment bench count
+    ldr x0, =bench_count
+    ldr w1, [x0]
+    add w1, w1, #1
+    str w1, [x0]
+
+    // Call setup once if present
+    ldr x0, [x19, #BENCH_SETUP]
+    cbz x0, .bench_loop_init
+    blr x0
+
+.bench_loop_init:
+    ldr w20, [x19, #BENCH_ITERS]    // iteration count
+    mov w21, #0                     // current iteration
+
+.bench_loop:
+    cmp w21, w20
+    b.hs .bench_loop_done           // unsigned: both are counts
+
+    // Read start time
+    bl timer_read
+    mov x22, x0
+
+    // Call benchmark function
+    ldr x0, [x19, #BENCH_FUNC]
+    blr x0
+
+    // Read end time
+    bl timer_read
+    sub x23, x0, x22                // elapsed = end - start
+
+    // Update min
+    ldr x0, [x19, #BENCH_MIN]
+    cmp x23, x0
+    b.hs .skip_min
+    str x23, [x19, #BENCH_MIN]
+.skip_min:
+
+    // Update max
+    ldr x0, [x19, #BENCH_MAX]
+    cmp x23, x0
+    b.ls .skip_max
+    str x23, [x19, #BENCH_MAX]
+.skip_max:
+
+    // Update total
+    ldr x0, [x19, #BENCH_TOTAL]
+    add x0, x0, x23
+    str x0, [x19, #BENCH_TOTAL]
+
+    add w21, w21, #1
+    b .bench_loop
+
+.bench_loop_done:
+    // Call teardown if present
+    ldr x0, [x19, #BENCH_TEARDOWN]
+    cbz x0, .bench_print
+    blr x0
+
+.bench_print:
+    // Print: name | N iters | min: X (Y ns) | avg: X (Y ns) | max: X (Y ns)
+
+    // Print name (space-padded on the right to 20 chars)
+    ldr x0, [x19, #BENCH_NAME]
+    bl bench_print_padded_name
+
+    // Print " | "
+    ldr x0, =msg_sep
+    bl uart_puts
+
+    // Print iteration count (right-aligned to 4 digits)
+    ldr w0, [x19, #BENCH_ITERS]
+    bl bench_print_rjust4
+    ldr x0, =msg_iters
+    bl uart_puts
+
+    // Print min
+    ldr x0, =msg_min
+    bl uart_puts
+    ldr x0, [x19, #BENCH_MIN]
+    bl bench_print_decimal_64
+    ldr x0, =msg_paren_open
+    bl uart_puts
+    ldr x0, [x19, #BENCH_MIN]
+    bl bench_print_ns
+    ldr x0, =msg_ns_close
+    bl uart_puts
+
+    // Print avg
+    ldr x0, =msg_avg
+    bl uart_puts
+    ldr x0, [x19, #BENCH_TOTAL]
+    ldr w1, [x19, #BENCH_ITERS]
+    udiv x0, x0, x1
+    mov x24, x0                     // save avg for ns conversion
+    bl bench_print_decimal_64
+    ldr x0, =msg_paren_open
+    bl uart_puts
+    mov x0, x24
+    bl bench_print_ns
+    ldr x0, =msg_ns_close
+    bl uart_puts
+
+    // Print max
+    ldr x0, =msg_max
+    bl uart_puts
+    ldr x0, [x19, #BENCH_MAX]
+    bl bench_print_decimal_64
+    ldr x0, =msg_paren_open
+    bl uart_puts
+    ldr x0, [x19, #BENCH_MAX]
+    bl bench_print_ns
+    ldr x0, =msg_ns_end
+    bl uart_puts
+
+    bl uart_newline
+
+    ldp x23, x24, [sp], #16
+    ldp x21, x22, [sp], #16
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_print_ns: Print ticks converted to nanoseconds
+// Input: x0 = ticks
+bench_print_ns:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    // ns = ticks * 1000000000 / freq (mul keeps only the low 64 bits)
+    // 1000000000 = 0x3B9ACA00
+    mov x1, #0xCA00
+    movk x1, #0x3B9A, lsl #16
+    mul x0, x0, x1
+    ldr x1, =cached_freq
+    ldr x1, [x1]
+    udiv x0, x0, x1
+    bl bench_print_decimal_64
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_print_padded_name: Print name, then pad with spaces to 20 chars
+// Input: x0 = name string
+bench_print_padded_name:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+
+    mov x19, x0
+    // Get length
+    bl strlen_simple
+    mov x20, x0                     // length
+
+    // Print name
+    mov x0, x19
+    bl uart_puts
+
+    // Pad with spaces
+    mov x1, #20
+.pad_loop:
+    cmp x20, x1
+    b.hs .pad_done                  // unsigned: length vs. column width
+    mov w0, #' '
+    stp x1, x20, [sp, #-16]!
+    bl uart_putc
+    ldp x1, x20, [sp], #16
+    add x20, x20, #1
+    b .pad_loop
+.pad_done:
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_print_rjust4: Print number right-justified in 4 chars
+// Input: w0 = number
+bench_print_rjust4:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+
+    mov w19, w0
+
+    // Count digits
+    mov w20, #0                     // digit count
+    mov w1, w19
+    cbz w1, .rj_one_digit
+.rj_count:
+    cbz w1, .rj_pad
+    mov w2, #10
+    udiv w1, w1, w2
+    add w20, w20, #1
+    b .rj_count
+
+.rj_one_digit:
+    mov w20, #1
+
+.rj_pad:
+    // Print leading spaces (4 - digits may be negative, so compare signed)
+    mov w1, #4
+    sub w1, w1, w20
+.rj_space:
+    cmp w1, #0
+    b.le .rj_print
+    mov w0, #' '
+    stp x1, xzr, [sp, #-16]!
+    bl uart_putc
+    ldp x1, xzr, [sp], #16
+    sub w1, w1, #1
+    b .rj_space
+
+.rj_print:
+    mov w0, w19
+    bl bench_print_decimal
+
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_print_decimal: Print 32-bit decimal number
+// Input: w0 = number
+bench_print_decimal:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+
+    mov w19, w0
+    mov x20, sp                     // save sp before any stack manipulation
+
+    cbz w19, .bpd_zero
+
+    // Build digits in reverse on stack
+    sub sp, sp, #16
+    mov x1, sp
+
+    mov w2, #10
+.bpd_loop:
+    cbz w19, .bpd_print
+    udiv w3, w19, w2
+    msub w4, w3, w2, w19
+    add w4, w4, #'0'
+    strb w4, [x1], #1
+    mov w19, w3
+    b .bpd_loop
+
+.bpd_print:
+    mov x3, sp
+.bpd_print_loop:
+    cmp x1, x3
+    b.ls .bpd_done                  // unsigned: x1/x3 are addresses
+    sub x1, x1, #1
+    ldrb w0, [x1]
+    stp x1, x3, [sp, #-16]!
+    bl uart_putc
+    ldp x1, x3, [sp], #16
+    b .bpd_print_loop
+
+.bpd_zero:
+    mov w0, #'0'
+    bl uart_putc
+    b .bpd_done
+
+.bpd_done:
+    mov sp, x20
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// bench_print_decimal_64: Print 64-bit decimal number
+// Input: x0 = number
+bench_print_decimal_64:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+
+    mov x19, x0
+    mov x20, sp                     // save sp before any stack manipulation
+
+    cbz x19, .bpd64_zero
+
+    sub sp, sp, #32
+    mov x1, sp
+
+    mov x2, #10
+.bpd64_loop:
+    cbz x19, .bpd64_print
+    udiv x3, x19, x2
+    msub x4, x3, x2, x19
+    add w4, w4, #'0'
+    strb w4, [x1], #1
+    mov x19, x3
+    b .bpd64_loop
+
+.bpd64_print:
+    mov x3, sp
+.bpd64_print_loop:
+    cmp x1, x3
+    b.ls .bpd64_done                // unsigned: x1/x3 are addresses
+    sub x1, x1, #1
+    ldrb w0, [x1]
+    stp x1, x3, [sp, #-16]!
+    bl uart_putc
+    ldp x1, x3, [sp], #16
+    b .bpd64_print_loop
+
+.bpd64_zero:
+    mov w0, #'0'
+    bl uart_putc
+    b .bpd64_done
+
+.bpd64_done:
+    mov sp, x20
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Data
+//=============================================================================
+
+.section .rodata
+msg_bench_header:
+    .asciz "\n========== RUNNING BENCHMARKS ==========\n"
+msg_bench_summary:
+    .asciz "\n========== BENCHMARK SUMMARY ==========\n"
+msg_bench_run:
+    .asciz "Benchmarks run: "
+msg_timer_freq:
+    .asciz "Timer frequency: "
+msg_hz:
+    .asciz " Hz"
+msg_section_start:
+    .asciz "\n--- "
+msg_section_end:
+    .asciz " ---\n"
+msg_sep:
+    .asciz " | "
+msg_iters:
+    .asciz " iters | "
+msg_min:
+    .asciz "min: "
+msg_avg:
+    .asciz "avg: "
+msg_max:
+    .asciz "max: "
+msg_paren_open:
+    .asciz " ("
+msg_ns_close:
+    .asciz " ns) | "
+msg_ns_end:
+    .asciz " ns)"
+
+.section .bss
+.balign 8
+cached_freq:
+    .skip 8
+bench_count:
+    .skip 4
diff --git a/src/bench/bench_json.s b/src/bench/bench_json.s
new file mode 100644
index 0000000..ea2935f
--- /dev/null
+++ b/src/bench/bench_json.s
@@ -0,0 +1,139 @@
+// JSON Builder Benchmarks
+
+.section .text
+.global run_json_benchmarks
+
+.include "src/slowapi/macros.s"
+
+.equ JSON_CTX_SIZE, 16
+
+run_json_benchmarks:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =section_name
+    bl bench_section
+
+    ldr x0, =ctx_json_init
+    bl bench_run
+    ldr x0, =ctx_json_build_object
+    bl bench_run
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark functions
+//=============================================================================
+
+// Initialize JSON context (stack-allocated buffer)
+bench_fn_json_init:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    // Allocate JSON context + buffer on stack
+    sub sp, sp, #272                // 16 (ctx) + 256 (buffer)
+    mov x0, sp                      // context
+    add x1, sp, #JSON_CTX_SIZE      // buffer
+    mov x2, #256
+    bl json_init
+
+    add sp, sp, #272
+    ldp x29, x30, [sp], #16
+    ret
+
+// Build {"id":42,"name":"test"} end-to-end
+bench_fn_json_build_object:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, xzr, [sp, #-16]!
+
+    // Allocate JSON context + buffer on stack
+    sub sp, sp, #272
+    mov x19, sp                     // save context pointer
+
+    // Init
+    mov x0, x19
+    add x1, x19, #JSON_CTX_SIZE
+    mov x2, #256
+    bl json_init
+
+    // Start object
+    mov x0, x19
+    bl json_start_obj
+
+    // Add "id": 42
+    mov x0, x19
+    ldr x1, =key_id
+    mov x2, #2
+    bl json_add_key
+
+    mov x0, x19
+    mov w1, #42
+    bl json_add_int
+
+    // Comma
+    mov x0, x19
+    bl json_comma
+
+    // Add "name": "test"
+    mov x0, x19
+    ldr x1, =key_name
+    mov x2, #4
+    bl json_add_key
+
+    mov x0, x19
+    ldr x1, =val_test
+    mov x2, #4
+    bl json_add_string
+
+    // End object
+    mov x0, x19
+    bl json_end_obj
+
+    // Finish
+    mov x0, x19
+    bl json_finish
+
+    add sp, sp, #272
+    ldp x19, xzr, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark contexts
+//=============================================================================
+
+.section .data
+.balign 8
+ctx_json_init:
+    .quad name_json_init
+    .quad bench_fn_json_init
+    .quad 0
+    .quad 0
+    .word 1000
+    .skip 28
+
+.balign 8
+ctx_json_build_object:
+    .quad name_json_build_object
+    .quad bench_fn_json_build_object
+    .quad 0
+    .quad 0
+    .word 500
+    .skip 28
+
+.section .rodata
+section_name:
+    .asciz "json builder"
+name_json_init:
+    .asciz "json_init"
+name_json_build_object:
+    .asciz "json_build_object"
+
+key_id:
+    .asciz "id"
+key_name:
+    .asciz "name"
+val_test:
+    .asciz "test"
diff --git a/src/bench/bench_main.s b/src/bench/bench_main.s
new file mode 100644
index 0000000..37def05
--- /dev/null
+++ b/src/bench/bench_main.s
@@ -0,0 +1,27 @@
+// Benchmark Runner Entry Point
+
+.section .text.boot
+.global _start
+
+_start:
+    // Set up stack pointer
+    ldr x0, =_stack_top
+    mov sp, x0
+
+    // Initialize subsystems
+    bl uart_init
+    bl mem_init
+    bl db_init
+
+    // Run benchmarks
+    bl bench_start
+    bl run_slab_benchmarks
+    bl run_db_benchmarks
+    bl run_string_benchmarks
+    bl run_json_benchmarks
+    bl run_request_benchmarks
+    bl bench_end
+
+halt:
+    wfe
+    b halt
diff --git a/src/bench/bench_request.s b/src/bench/bench_request.s
new file mode 100644
index 0000000..5147ab5
--- /dev/null
+++ b/src/bench/bench_request.s
@@ -0,0 +1,72 @@
+// HTTP Request Parsing Benchmarks
+
+.section .text
+.global run_request_benchmarks
+
+.include "src/slowapi/macros.s"
+
+run_request_benchmarks:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =section_name
+    bl bench_section
+
+    ldr x0, =ctx_parse_request
+    bl bench_run
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark functions
+//=============================================================================
+
+// Parse a complete HTTP request
+bench_fn_parse_request:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    // Allocate request context on stack
+    sub sp, sp, #REQ_SIZE
+
+    ldr x0, =http_request
+    ldr x1, =http_request_len
+    ldr w1, [x1]
+    mov x2, sp                      // request context
+    bl slowapi_parse_request
+
+    add sp, sp, #REQ_SIZE
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark contexts
+//=============================================================================
+
+.section .data
+.balign 8
+ctx_parse_request:
+    .quad name_parse_request
+    .quad bench_fn_parse_request
+    .quad 0
+    .quad 0
+    .word 500
+    .skip 28
+
+.section .rodata
+section_name:
+    .asciz "request parsing"
+name_parse_request:
+    .asciz "parse_request"
+
+http_request:
+    .ascii "GET /api/hotels?city=tokyo HTTP/1.1\r\n"
+    .ascii "Host: localhost\r\n"
+    .ascii "Accept: application/json\r\n"
+    .ascii "\r\n"
+http_request_end:
+
+.balign 4
+http_request_len:
+    .word http_request_end - http_request
diff --git a/src/bench/bench_slab.s b/src/bench/bench_slab.s
new file mode 100644
index 0000000..9a783eb
--- /dev/null
+++ b/src/bench/bench_slab.s
@@ -0,0 +1,117 @@
+// Slab Allocator Benchmarks
+
+.section .text
+.global run_slab_benchmarks
+
+run_slab_benchmarks:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =section_name
+    bl bench_section
+
+    ldr x0, =ctx_mem_alloc
+    bl bench_run
+    ldr x0, =ctx_mem_free
+    bl bench_run
+    ldr x0, =ctx_alloc_free_cycle
+    bl bench_run
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark functions
+//=============================================================================
+
+// Allocate 64 bytes (free immediately to avoid exhausting blocks)
+bench_fn_mem_alloc:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, xzr, [sp, #-16]!
+
+    mov x0, #64
+    bl mem_alloc
+    mov x19, x0
+
+    // Free to keep pool available for next iteration
+    mov x0, x19
+    bl mem_free
+
+    ldp x19, xzr, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+// Free a block (alloc first, then free)
+bench_fn_mem_free:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    mov x0, #64
+    bl mem_alloc
+    bl mem_free
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Allocate + free together
+bench_fn_alloc_free_cycle:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    mov x0, #64
+    bl mem_alloc
+    bl mem_free
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// Setup: reinit allocator
+bench_setup_reinit:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    bl mem_init
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark contexts
+//=============================================================================
+
+.section .data
+.balign 8
+ctx_mem_alloc:
+    .quad name_mem_alloc            // BENCH_NAME
+    .quad bench_fn_mem_alloc        // BENCH_FUNC
+    .quad bench_setup_reinit        // BENCH_SETUP
+    .quad 0                         // BENCH_TEARDOWN
+    .word 1000                      // BENCH_ITERS
+    .skip 28                        // min/max/total (runtime)
+
+.balign 8
+ctx_mem_free:
+    .quad name_mem_free
+    .quad bench_fn_mem_free
+    .quad bench_setup_reinit
+    .quad 0
+    .word 1000
+    .skip 28
+
+.balign 8
+ctx_alloc_free_cycle:
+    .quad name_alloc_free_cycle
+    .quad bench_fn_alloc_free_cycle
+    .quad bench_setup_reinit
+    .quad 0
+    .word 1000
+    .skip 28
+
+.section .rodata
+section_name:
+    .asciz "slab allocator"
+name_mem_alloc:
+    .asciz "mem_alloc"
+name_mem_free:
+    .asciz "mem_free"
+name_alloc_free_cycle:
+    .asciz "alloc_free_cycle"
diff --git a/src/bench/bench_string.s b/src/bench/bench_string.s
new file mode 100644
index 0000000..047d45b
--- /dev/null
+++ b/src/bench/bench_string.s
@@ -0,0 +1,109 @@
+// String Utility Benchmarks
+
+.section .text
+.global run_string_benchmarks
+
+run_string_benchmarks:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =section_name
+    bl bench_section
+
+    ldr x0, =ctx_parse_int
+    bl bench_run
+    ldr x0, =ctx_strlen
+    bl bench_run
+    ldr x0, =ctx_find_char
+    bl bench_run
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark functions
+//=============================================================================
+
+// Parse "12345"
+bench_fn_parse_int:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =str_number
+    mov w1, #5
+    bl parse_int
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// strlen on 50-char string
+bench_fn_strlen:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =str_50chars
+    bl strlen_simple
+
+    ldp x29, x30, [sp], #16
+    ret
+
+// find_char in 50-char string ('Z' is the last character)
+bench_fn_find_char:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+
+    ldr x0, =str_50chars
+    mov w1, #50
+    mov w2, #'Z'
+    bl find_char
+
+    ldp x29, x30, [sp], #16
+    ret
+
+//=============================================================================
+// Benchmark contexts
+//=============================================================================
+
+.section .data
+.balign 8
+ctx_parse_int:
+    .quad name_parse_int
+    .quad bench_fn_parse_int
+    .quad 0                         // no setup
+    .quad 0                         // no teardown
+    .word 1000
+    .skip 28
+
+.balign 8
+ctx_strlen:
+    .quad name_strlen
+    .quad bench_fn_strlen
+    .quad 0
+    .quad 0
+    .word 1000
+    .skip 28
+
+.balign 8
+ctx_find_char:
+    .quad name_find_char
+    .quad bench_fn_find_char
+    .quad 0
+    .quad 0
+    .word 1000
+    .skip 28
+
+.section .rodata
+section_name:
+    .asciz "string utilities"
+name_parse_int:
+    .asciz "parse_int"
+name_strlen:
+    .asciz "strlen_simple"
+name_find_char:
+    .asciz "find_char"
+
+str_number:
+    .asciz "12345"
+
+str_50chars:
+    .asciz "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxZ"
diff --git a/src/drivers/timer.s b/src/drivers/timer.s
new file mode 100644
index 0000000..85f24f1
--- /dev/null
+++ b/src/drivers/timer.s
@@ -0,0 +1,20 @@
+// ARM64 Generic Timer Access
+// Reads the system counter and its frequency from system registers
+
+.section .text
+.global timer_read
+.global timer_freq
+
+// timer_read: Read the physical counter
+// The ISB keeps the counter read from being performed ahead of earlier code
+// Output: x0 = current counter value
+timer_read:
+    isb
+    mrs x0, cntpct_el0
+    ret
+
+// timer_freq: Read the counter frequency
+// Output: x0 = frequency in Hz
+timer_freq:
+    mrs x0, cntfrq_el0
+    ret
diff --git a/src/slowapi/slowapi.s b/src/slowapi/slowapi.s
index 6b9eb14..e8c5d89 100644
--- a/src/slowapi/slowapi.s
+++ b/src/slowapi/slowapi.s
@@ -49,10 +49,15 @@ slowapi_handle:
     stp x29, x30, [sp, #-16]!
     mov x29, sp
     stp x19, x20, [sp, #-16]!
+    stp x21, x22, [sp, #-16]!
 
     mov x19, x0                     // raw data
     mov x20, x1                     // length
 
+    // Record start time
+    bl timer_read
+    mov x21, x0
+
 .if DEBUG
     ldr x0, =msg_request
     bl uart_puts
@@ -108,6 +113,26 @@ slowapi_handle:
     bl resp_error
 
 .handle_done:
+    // Record end time and print elapsed
+    bl timer_read
+    sub x22, x0, x21                // elapsed ticks
+
+    ldr x0, =msg_timing
+    bl uart_puts
+
+    // Convert ticks to nanoseconds: ns = ticks * 1000000000 / freq
+    bl timer_freq
+    mov x1, x0                      // freq
+    mov x0, #0xCA00
+    movk x0, #0x3B9A, lsl #16
+    mul x0, x22, x0
+    udiv x0, x0, x1
+    bl slowapi_print_decimal_64
+
+    ldr x0, =msg_ns
+    bl uart_puts
+
+    ldp x21, x22, [sp], #16
     ldp x19, x20, [sp], #16
     ldp x29, x30, [sp], #16
     ret
@@ -192,6 +217,52 @@ http_check_complete:
     mov w0, #0
     ret
 
+// slowapi_print_decimal_64: Print 64-bit number as decimal
+// Input: x0 = number
+slowapi_print_decimal_64:
+    stp x29, x30, [sp, #-16]!
+    mov x29, sp
+    stp x19, x20, [sp, #-16]!
+
+    mov x19, x0
+    mov x20, sp
+
+    cbz x19, .spd_zero
+
+    sub sp, sp, #32
+    mov x1, sp
+    mov x2, #10
+.spd_loop:
+    cbz x19, .spd_print
+    udiv x3, x19, x2
+    msub x4, x3, x2, x19
+    add w4, w4, #'0'
+    strb w4, [x1], #1
+    mov x19, x3
+    b .spd_loop
+
+.spd_print:
+    mov x3, sp
+.spd_print_loop:
+    cmp x1, x3
+    b.ls .spd_done                  // unsigned: x1/x3 are addresses
+    sub x1, x1, #1
+    ldrb w0, [x1]
+    stp x1, x3, [sp, #-16]!
+    bl uart_putc
+    ldp x1, x3, [sp], #16
+    b .spd_print_loop
+
+.spd_zero:
+    mov w0, #'0'
+    bl uart_putc
+
+.spd_done:
+    mov sp, x20
+    ldp x19, x20, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
 //=============================================================================
 // Data
 //=============================================================================
@@ -209,6 +280,10 @@ msg_path:
 .endif
 
 .section .rodata
+msg_timing:
+    .asciz "[SlowAPI] request handled in "
+msg_ns:
+    .asciz " ns\n"
 msg_parse_fail:
     .asciz "[SlowAPI] parse failed\n"
 