# Bootloader ARM64

2025-06-15

340-byte bootloader:

In [110]:
%%writefile bootloader.s
// Compact AArch64 Forth-inspired Bootloader
// Commands: 0x01 (store next byte), 0x00 (execute buffer)
// QEMU virt machine (PL011 UART)

.global _start
.section .text

.set UART_BASE,    0x09000000
.set UART_FR,      0x09000018
.set UART_FR_RXFE, (1 << 4)
.set EXEC_BUFFER,  0x40001000

_start:
    movz x0, #0x4100, lsl #16    // Stack at 0x41000000
    mov sp, x0
    movz x19, #0x0900, lsl #16   // UART base
    movz x20, #0x4000, lsl #16   // Buffer base
    add x20, x20, #0x1000        // x20 = EXEC_BUFFER
    mov x21, #0                  // Buffer offset
    mov x22, #0                  // First connection flag

main_loop:
    add x2, x19, #0x18           // UART_FR address
    ldr w3, [x2]
    tst w3, #UART_FR_RXFE
    b.ne main_loop               // Wait for data

    cmp x22, #0                  // First connection?
    b.ne skip_ready
    bl send_ready
    mov x22, #1

skip_ready:
    ldr w0, [x19]                // Read byte
    and w0, w0, #0xFF
    
    cmp w0, #0x00
    b.eq execute
    cmp w0, #0x01
    b.eq store_byte
    b main_loop                  // Ignore other bytes

execute:
    bl send_exec
    blr x20                      // Call stored code
    bl send_done
    mov x21, #0                  // Reset buffer
    b main_loop

store_byte:
    add x2, x19, #0x18
wait_byte:
    ldr w3, [x2]
    tst w3, #UART_FR_RXFE
    b.ne wait_byte
    
    ldr w0, [x19]
    and w0, w0, #0xFF
    strb w0, [x20, x21]          // Store byte
    add x21, x21, #1
    
    mov w0, #'*'                 // Send confirmation
    str w0, [x19]
    b main_loop

send_ready:
    stp x29, x30, [sp, #-16]!
    mov w0, #'R'; str w0, [x19]
    mov w0, #'e'; str w0, [x19]
    mov w0, #'a'; str w0, [x19]
    mov w0, #'d'; str w0, [x19]
    mov w0, #'y'; str w0, [x19]
    mov w0, #'!'; str w0, [x19]
    mov w0, #10;  str w0, [x19]  // Newline
    ldp x29, x30, [sp], #16
    ret

send_exec:
    stp x29, x30, [sp, #-16]!
    mov w0, #'E'; str w0, [x19]
    mov w0, #'x'; str w0, [x19]
    mov w0, #'e'; str w0, [x19]
    mov w0, #'c'; str w0, [x19]
    mov w0, #'!'; str w0, [x19]
    mov w0, #10;  str w0, [x19]
    ldp x29, x30, [sp], #16
    ret

send_done:
    stp x29, x30, [sp, #-16]!
    mov w0, #'D'; str w0, [x19]
    mov w0, #'o'; str w0, [x19]
    mov w0, #'n'; str w0, [x19]
    mov w0, #'e'; str w0, [x19]
    mov w0, #'!'; str w0, [x19]
    mov w0, #10;  str w0, [x19]
    ldp x29, x30, [sp], #16
    ret

Overwriting bootloader.s


In [111]:
%%bash
as -o bootloader.o bootloader.s
ld -Ttext=0x40000000 -o bootloader.elf bootloader.o
objcopy -O binary bootloader.elf bootloader.img
wc -c bootloader.img

340 bootloader.img


84-byte bootloader:

In [102]:
%%writefile bootloader.s
// Ultra-compact AArch64 bootloader: 0x01=store, 0x00=exec
.global _start
_start:
    movz x19,#0x900,lsl#16       // UART base
    movz x20,#0x4000,lsl#16      
    add x20,x20,#0x1000          // Buffer at 0x40001000
    mov x21,#0                   // Offset
L:  add x2,x19,#24              // UART status
    ldr w0,[x2]
    tbnz w0,#4,L                 // Wait for RX data
    ldr w0,[x19]                 // Read byte
    cbz w0,X                     // 0x00 = execute
    cmp w0,#1
    b.ne L                       // Ignore if not 0x01
S:  add x2,x19,#24              // Store mode
    ldr w3,[x2]
    tbnz w3,#4,S                 // Wait for next byte
    ldr w0,[x19]
    strb w0,[x20,x21]            // Store byte
    add x21,x21,#1
    b L
X:  blr x20                     // Execute buffer
    mov x21,#0                   // Reset
    b L

Overwriting bootloader.s


In [103]:
%%bash
as -o bootloader.o bootloader.s
ld -Ttext=0x40000000 -o bootloader.elf bootloader.o
objcopy -O binary bootloader.elf bootloader.img
wc -c bootloader.img

84 bootloader.img


## The bootloader

72-byte bootloader:

In [113]:
%%writefile bootloader.s
// Minimal AArch64 bootloader 
// 0x01=compile, 0x00=execute (15 instructions, 72 bytes)
.global _start
_start:
    movz x19,#0x900,lsl#16       // UART base
    movz x20,#0x4000,lsl#16
    orr x20,x20,#0x1000          // EXEC buffer
    mov x21,x20                  // Code pointer
L:  ldr w0,[x19,#24]             // Check UART status
    tbnz w0,#4,L                 // Wait for RX data
    ldr w0,[x19]                 // Read byte
    cbz w0,X                     // 0=execute  
    cmp w0,#1
    b.ne L                       // Skip if not 1
C:  ldr w0,[x19,#24]             // Wait for compile byte (restored)
    tbnz w0,#4,C                 // Keep waiting
    ldr w0,[x19]                 // Get compile byte
    strb w0,[x21],#1             // Store & increment
    b L
X:  blr x20                      // Execute
    mov x21,x20                  // Reset pointer
    b L

Overwriting bootloader.s


In [114]:
%%bash
as -o bootloader.o bootloader.s
ld -Ttext=0x40000000 -o bootloader.elf bootloader.o
objcopy -O binary bootloader.elf bootloader.img
wc -c bootloader.img

72 bootloader.img


Running on AMD64 host machine with qemu installed:

    qemu-system-aarch64 \
    -machine virt \
    -cpu cortex-a53 \
    -nographic \
    -kernel bootloader.img \
    -serial tcp::44444,server,nowait \
    -monitor none

Example:

    printf '\x01\x00\x01\x09\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xA0\x01\x0C\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x80\x01\x0D\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x80\x01\x0D\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xE0\x01\x0D\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x80\x01\x05\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x00\x01\x04\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xE0\x01\x0E\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xE0\x01\x0D\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x40\x01\x0E\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x80\x01\x0D\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x80\x01\x0C\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x20\x01\x04\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xA0\x01\x01\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\x40\x01\x01\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xB9\x01\xC0\x01\x03\x01\x5F\x01\xD6\x00' | nc localhost 44444

In [106]:
! size bootloader.elf

   text	   data	    bss	    dec	    hex	filename
     72	      0	      0	     72	     48	bootloader.elf


In [107]:
! wc -c bootloader.img

72 bootloader.img


In [108]:
! hexdump -C bootloader.img

00000000  13 20 a1 d2 14 00 a8 d2  94 02 74 b2 f5 03 14 aa  |. ........t.....|
00000010  60 1a 40 b9 e0 ff 27 37  60 02 40 b9 00 01 00 34  |`.@...'7`.@....4|
00000020  1f 04 00 71 61 ff ff 54  60 1a 40 b9 e0 ff 27 37  |...qa..T`.@...'7|
00000030  60 02 40 b9 a0 16 00 38  f6 ff ff 17 80 02 3f d6  |`.@....8......?.|
00000040  f5 03 14 aa f3 ff ff 17                           |........|
00000048


In [109]:
! objdump -D -b binary -m aarch64 bootloader.img


bootloader.img:     file format binary


Disassembly of section .data:

0000000000000000 <.data>:
   0:	d2a12013 	mov	x19, #0x9000000             	// #150994944
   4:	d2a80014 	mov	x20, #0x40000000            	// #1073741824
   8:	b2740294 	orr	x20, x20, #0x1000
   c:	aa1403f5 	mov	x21, x20
  10:	b9401a60 	ldr	w0, [x19, #24]
  14:	3727ffe0 	tbnz	w0, #4, 0x10
  18:	b9400260 	ldr	w0, [x19]
  1c:	34000100 	cbz	w0, 0x3c
  20:	7100041f 	cmp	w0, #0x1
  24:	54ffff61 	b.ne	0x10  // b.any
  28:	b9401a60 	ldr	w0, [x19, #24]
  2c:	3727ffe0 	tbnz	w0, #4, 0x28
  30:	b9400260 	ldr	w0, [x19]
  34:	380016a0 	strb	w0, [x21], #1
  38:	17fffff6 	b	0x10
  3c:	d63f0280 	blr	x20
  40:	aa1403f5 	mov	x21, x20
  44:	17fffff3 	b	0x10


## Examples

Example running in bootloader:

In [92]:
%%writefile a.s
.text                   // Declare the .text section, which contains executable code.
.global _start          // Declare the _start label as a global symbol, making it the entry point for the program.

_start:
    // Move the ASCII value for 'A' (0x41) into the 32-bit wide register w0.
    // w0 is commonly used for function arguments or return values.
    mov w0, #'A'            // Move immediate: w0 = ASCII value of 'A' (0x41).

    // Store the content of w0 (the character 'A') into the UART Data Register (UART_DR).
    // x19 is assumed to hold the base address of the UART (0x09000000, as set by the bootloader).
    // The UART_DR is typically located at offset 0x00 from the UART_BASE.
    str w0, [x19]           // Store Word: Store the 32-bit value in w0 at the address in x19 (UART_DR).

    // Return from the current function (this program) to the caller.
    // In this context, it returns control to the bootloader's main loop.
    ret                     // Return: Return from the current subroutine.

Overwriting a.s


In [93]:
%%bash
as -o a.o a.s
objcopy -O binary -j .text a.o a.bin

**Explanation and Purpose of this Code:**

This small AArch64 assembly program was created specifically for **diagnostic and verification purposes**.

**Why it was made:**

During the debugging process, there was an initial issue where a more complex "Hello, world!" program was not printing, even though a user-provided, manually crafted `printf` command *did* print (albeit using a non-standard UART register offset).

This code was designed to:

1.  **Isolate the core printing functionality:** By reducing the task to printing a single, fixed character ('A'), we could eliminate variables like string addressing (`adr`), loops, and null terminators that were present in the code.
2.  **Verify the UART write mechanism:** It uses the standard and correct method for writing to the PL011 UART Data Register (`str w0, [x19]`, where `x19` points to `UART_BASE + 0x00`). This was crucial to confirm that `x19` was indeed the correct base address and that writes to offset `0x00` were functional, contradicting the behavior of the initial and "working" `printf` command.
3.  **Test the byte transmission pipeline:** When combined with the Python script that generated the `printf` command, this program helped confirm that the entire process – from generating the correct `\x01` prefixed payload in Jupyter, copying it, and sending it via `nc` to the QEMU bootloader – was working as intended.

**What it proved:**

The successful execution and printing of 'A' confirmed that the fundamental UART write operation (`str w0, [x19]`) was correct, and that the method for injecting assembly code into the bootloader was reliable. This allowed us to definitively pinpoint the previous issues as related to the payload generation (`hexdump` output `\x00` instead of `\x01` prefixes) rather than the basic assembly logic itself.

The Python program generates the command that we must use in the second terminal:

In [94]:
import sys

def generate_full_printf_command(binary_file_path, nc_host="localhost", nc_port=44444):
    """
    Reads a binary file and generates the full 'printf' command string,
    including the pipe to 'nc'.
    """
    payload_parts = []
    try:
        with open(binary_file_path, 'rb') as f:
            binary_content = f.read()

        for byte_val in binary_content:
            # Append '\x01' followed by the byte value as '\xXX'
            payload_parts.append(f'\\x01\\x{byte_val:02x}')

        # Append the final '\x00' (execute command)
        payload_parts.append('\\x00')

        # Join all payload parts into a single string
        escaped_payload_string = "".join(payload_parts)
        
        # Construct the full printf command
        full_command = f"printf '{escaped_payload_string}' | nc {nc_host} {nc_port}"
        
        # Print the full command to standard output
        print(full_command, end='') # Use end='' to prevent an extra newline

    except FileNotFoundError:
        print(f"Error: File not found at {binary_file_path}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
        sys.exit(1)


generate_full_printf_command("a.bin")

printf '\x01\x20\x01\x08\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xb9\x01\xc0\x01\x03\x01\x5f\x01\xd6\x00' | nc localhost 44444

qemu runs on an AMD64 host machine:

    $ qemu-system-aarch64 -machine virt -cpu cortex-a53 -nographic -kernel bootloader.img -serial tcp::44444,server,nowait,nodelay -monitor none

In another terminal we run:

    $ printf '\x01\x20\x01\x08\x01\x80\x01\x52\x01\x60\x01\x02\x01\x00\x01\xb9\x01\xc0\x01\x03\x01\x5f\x01\xd6\x00' | nc localhost 44444

and the output should be:

    A

Now let's test a more complete example that prints "Hello, world!":

In [115]:
%%writefile a.s
.text                   // Declare the .text section, which contains executable code.
.global _start          // Declare the _start label as a global symbol, making it the entry point for the program.

_start:
    // Initialize x0 with the address of the 'msg' string.
    // 'adr' (Address Register) is a PC-relative instruction, suitable for position-independent code.
    // It calculates the address of 'msg' relative to the current instruction's address.
    adr x0, msg              // Load the address of the 'msg' string into register x0.

loop:
    // Load a byte from the address pointed to by x0 into w1.
    // The '#1' after '[x0]' indicates post-indexed addressing:
    // After loading, x0 is incremented by 1, effectively moving to the next byte in the string.
    ldrb w1, [x0], #1        // Load Byte: Load a byte from [x0] into w1, then increment x0 by 1.

    // Check if the loaded byte is zero (null terminator).
    // 'cbz' (Compare and Branch on Zero) branches if the value in w1 is zero.
    cbz w1, done             // Compare Byte on Zero: If w1 is 0, branch to 'done'.

    // Load the UART Flag Register (UART_FR) status into w2.
    // x19 is assumed to hold the UART_BASE address (0x09000000).
    // The UART_FR is typically at an offset of 0x18 from UART_BASE.
    ldr w2, [x19, #24]       // Load Word: Load the value from address [x19 + 24] (UART_FR) into w2.

    // Wait until the UART Transmit FIFO is not full (TX ready).
    // 'tbnz' (Test Bit and Branch if Not Zero) checks bit 5 of w2.
    // UART_FR_TXFF (Transmit FIFO Full) is typically bit 5.
    // If bit 5 is set (non-zero), the FIFO is full, so we loop and wait.
    tbnz w2, #5, loop        // Test Bit and Branch if Not Zero: If bit 5 of w2 is 1, branch back to 'loop'.

    // Write the character from w1 to the UART Data Register (UART_DR).
    // UART_DR is typically at an offset of 0x00 from UART_BASE.
    str w1, [x19]            // Store Word: Store the character from w1 into address [x19] (UART_DR).

    // Branch unconditionally back to the beginning of the 'loop' to process the next character.
    b loop                   // Branch: Unconditionally branch to 'loop'.

done:
    // Return from the current function.
    // In this bare-metal context, 'ret' will typically return to the bootloader's execution loop.
    ret                      // Return: Return from the current subroutine.

msg:
    // Define the string "Hello, world!\n" followed by a null terminator.
    // '.ascii' defines a string, and '\n' is the newline character.
    // '\0' is the null terminator, which 'cbz' uses to detect the end of the string.
    .ascii "Hello, world!\n\0" // Define an ASCII string literal.

Overwriting a.s


In [96]:
%%bash
as -o a.o a.s
objcopy -O binary -j .text a.o a.bin

In [97]:
generate_full_printf_command("a.bin")

printf '\x01\x00\x01\x01\x01\x00\x01\x10\x01\x01\x01\x14\x01\x40\x01\x38\x01\xa1\x01\x00\x01\x00\x01\x34\x01\x62\x01\x1a\x01\x40\x01\xb9\x01\xa2\x01\xff\x01\x2f\x01\x37\x01\x61\x01\x02\x01\x00\x01\xb9\x01\xfb\x01\xff\x01\xff\x01\x17\x01\xc0\x01\x03\x01\x5f\x01\xd6\x01\x48\x01\x65\x01\x6c\x01\x6c\x01\x6f\x01\x2c\x01\x20\x01\x77\x01\x6f\x01\x72\x01\x6c\x01\x64\x01\x21\x01\x0a\x01\x00\x00' | nc localhost 44444

Same listing as above, just without comments:

In [116]:
%%writefile hw.s
.text
.global _start
_start:
    adr x0, msg
loop:
    ldrb w1, [x0], #1
    cbz w1, done
    ldr w2, [x19, #24]
    tbnz w2, #5, loop
    str w1, [x19]
    b loop
done:
    ret
msg:
    .ascii "Hello, world!\n\0"

Writing hw.s
