# RISC-V Processor FPGA Verification
## 32-bit Signed Integer Sorting — PYNQ Hardware-Mapped Edition

**Hardware topology:**
```
  PS (ARM / Jupyter)                 PL (FPGA fabric)
  ─────────────────    AXI-Lite      ────────────────────────────────
  MMIO(iram) ──────────────────────► axi_bram_ctrl_1  → Instruction RAM
  MMIO(dram) ──────────────────────► axi_bram_ctrl_0  → Data RAM
  MMIO(gpio) ──────────────────────► axi_gpio_0       → reset_n pin
                                                           │
                                                      RISC-V Core
```
The program is a **fully-unrolled bubble sort** generated as raw 32-bit machine words
directly in Python — no assembler toolchain required.  
A sentinel value `0xDEADBEEF` written to `dram[0x100]` signals completion.


## 1. Imports & Test Parameters

In [14]:
import struct
import time
import random
import numpy as np
from typing import List, Tuple

# ─── Test parameters ─────────────────────────────────────────────────────────
NUM_ELEMENTS_TO_SORT = 32      # fixed: the sort network is hard-wired for 32 elements
PRNG_SEED_VALUE = 42
MAX_EXECUTION_TIMEOUT_SECONDS = 10.0    # seconds to wait for 0xDEADBEEF sentinel
COMPLETION_CHECK_INTERVAL_SECONDS = 0.001   # 1 ms polling interval

# ─── Memory-map constants (RISC-V core's address space) ──────────────────────
# The core boots at PC=0 and uses x0 (hardwired zero) as its base-address
# register, so the data array lives at dram offset 0x000.
DRAM_SORT_ARRAY_BASE_ADDR = 0x000   # byte offset in dram where the 32-element array starts
DRAM_COMPLETION_FLAG_ADDR  = 0x100   # byte offset where the done sentinel is written
COMPLETION_MAGIC_VALUE     = 0xDEADBEAF

SIGNED_INT32_MINIMUM_VALUE = -100
SIGNED_INT32_MAXIMUM_VALUE =  100

print('Imports OK.')
print(f'  Array size : {NUM_ELEMENTS_TO_SORT} x int32  ({NUM_ELEMENTS_TO_SORT*4} bytes)')
print(f'  Sentinel   : 0x{COMPLETION_MAGIC_VALUE:08X} @ dram+0x{DRAM_COMPLETION_FLAG_ADDR:03X}')

Imports OK.
  Array size : 32 x int32  (128 bytes)
  Sentinel   : 0xDEADBEAF @ dram+0x100


## 2. Load Overlay & Map Hardware

In [15]:
from pynq import Overlay, MMIO

BITSTREAM_FILENAME = 'design_1_wrapper.bit'

print('Loading overlay...')
fpga_overlay = Overlay(BITSTREAM_FILENAME)
print('Overlay loaded.')
print('  IPs  :', list(fpga_overlay.ip_dict.keys()))
print('  Mems :', list(fpga_overlay.mem_dict.keys()))

# ─── BRAM controllers ────────────────────────────────────────────────────────
iram_interface_info = fpga_overlay.mem_dict['axi_bram_ctrl_1']   # Instruction RAM
dram_interface_info = fpga_overlay.mem_dict['axi_bram_ctrl_0']   # Data RAM

instruction_memory = MMIO(iram_interface_info['phys_addr'], iram_interface_info['addr_range'])
data_memory = MMIO(dram_interface_info['phys_addr'], dram_interface_info['addr_range'])

# ─── GPIO — reset control ─────────────────────────────────────────────────────
# GPIO DATA register offset 0x0:
#   write 0 → assert reset   (core frozen at PC=0)
#   write 1 → de-assert reset (core runs freely)
gpio_interface_info = fpga_overlay.ip_dict['axi_gpio_0']
reset_controller = MMIO(gpio_interface_info['phys_addr'], gpio_interface_info['addr_range'])

def freeze_processor():
    reset_controller.write(0x0, 0x0)   # assert reset

def release_processor():
    reset_controller.write(0x0, 0x1)   # de-assert reset → execution begins at PC=0

# Start with the core held in reset
freeze_processor()
print('Hardware mapped. Core is HALTED.')
print(f'  iram @ 0x{iram_interface_info["phys_addr"]:08X}  ({iram_interface_info["addr_range"]//1024} KB)')
print(f'  dram @ 0x{dram_interface_info["phys_addr"]:08X}  ({dram_interface_info["addr_range"]//1024} KB)')
print(f'  gpio @ 0x{gpio_interface_info["phys_addr"]:08X}')

Loading overlay...
Overlay loaded.
  IPs  : ['axi_gpio_0', 'processing_system7_0']
  Mems : ['axi_bram_ctrl_0', 'axi_bram_ctrl_1', 'PSDDR']
Hardware mapped. Core is HALTED.
  iram @ 0x42000000  (16 KB)
  dram @ 0x40000000  (16 KB)
  gpio @ 0x41200000


## 3. Machine-Code Program Generator

Builds a **fully-unrolled bubble sort** as raw 32-bit RISC-V instruction words.  
No assembler needed — each `enc_*` function manually packs instruction fields
according to the RV32I spec.

In [16]:
def _extract_bits(x, bits): return x & ((1 << bits) - 1)

# ── RV32I instruction encoders ────────────────────────────────────────────────
def encode_beq(source1_reg, source2_reg, branch_offset_signed_13bit):      # B-format  (imm bits shuffled per RV32I spec)
    assert branch_offset_signed_13bit % 2 == 0, 'branch offset must be even'
    imm = _extract_bits(branch_offset_signed_13bit, 13)
    return (((imm>>12)&1)<<31) | (((imm>>5)&0x3F)<<25) | (_extract_bits(source2_reg,5)<<20) | \
           (_extract_bits(source1_reg,5)<<15) | (0b000<<12) | (((imm>>1)&0xF)<<8) | (((imm>>11)&1)<<7) | 0x63

def encode_slt(destination_reg, source1_reg, source2_reg):       # R-format  signed less-than
    return (0b0000000<<25) | (_extract_bits(source2_reg,5)<<20) | (_extract_bits(source1_reg,5)<<15) | (0b010<<12) | (_extract_bits(destination_reg,5)<<7) | 0x33


def encode_addi(destination_reg, source_reg, signed_immediate_12bit):      # I-format
    return (_extract_bits(signed_immediate_12bit,12)<<20) | (_extract_bits(source_reg,5)<<15) | (0b000<<12) | (_extract_bits(destination_reg,5)<<7) | 0x13

def encode_lw(destination_reg, base_address_reg, byte_offset_12bit):        # I-format  (funct3=010)
    return (_extract_bits(byte_offset_12bit,12)<<20) | (_extract_bits(base_address_reg,5)<<15) | (0b010<<12) | (_extract_bits(destination_reg,5)<<7) | 0x03

def encode_lui(destination_reg, upper_immediate_20bit):          # U-format
    return (_extract_bits(upper_immediate_20bit,20)<<12) | (_extract_bits(destination_reg,5)<<7) | 0x37

def encode_sw(source_data_reg, base_address_reg, byte_offset_12bit):       # S-format  (imm split: bits 11:5 and 4:0)
    imm = _extract_bits(byte_offset_12bit, 12)
    return (((imm>>5)&0x7F)<<25) | (_extract_bits(source_data_reg,5)<<20) | (_extract_bits(base_address_reg,5)<<15) | (0b010<<12) | ((imm&0x1F)<<7) | 0x23


# ── Sort-network generator ────────────────────────────────────────────────────
def generate_unrolled_bubble_sort_program() -> Tuple[List[int], List[str]]:
    """
    Emit a fully-unrolled bubble sort for 32 int32 elements.

    Register map:
      x0  = 0  (hardwired zero, also used as base-address pointer -> array at addr 0)
      x7  = arr[j]       (element a)
      x8  = arr[j+1]     (element b)
      x9  = slt result   (1 = out-of-order, 0 = sorted)
      x12 = scratch for done-sentinel write

    Each compare-swap block (6 instructions, 24 bytes):
      lw  x7, off(x0)        # load a
      lw  x8, off+4(x0)      # load b
      slt x9, x8, x7         # x9 = (b < a) ? 1 : 0
      beq x9, x0, +12        # skip 2 stores if already sorted
      sw  x8, off(x0)        # arr[j]   = b  (swap)
      sw  x7, off+4(x0)      # arr[j+1] = a

    Epilogue writes 0xDEADBEEF to dram[0x100] then spins forever.
    """
    reg_zero, reg_element_a, reg_element_b, reg_comparison_result, reg_sentinel_builder = 0, 7, 8, 9, 12
    instruction_list: List[int] = []
    assembly_listing:  List[str] = []

    for pass_number in range(31):              # 31 passes
        for element_index in range(31 - pass_number):      # shrinking inner window
            byte_offset = 4 * element_index
            instruction_list.append(encode_lw (reg_element_a, reg_zero, byte_offset    ));  assembly_listing.append(f'lw   x7, {byte_offset}(x0)')
            instruction_list.append(encode_lw (reg_element_b, reg_zero, byte_offset + 4));  assembly_listing.append(f'lw   x8, {byte_offset+4}(x0)')
            instruction_list.append(encode_slt(reg_comparison_result, reg_element_b, reg_element_a     ));  assembly_listing.append(f'slt  x9, x8, x7')
            instruction_list.append(encode_beq(reg_comparison_result, reg_zero, 12     ));  assembly_listing.append(f'beq  x9, x0, +12')
            instruction_list.append(encode_sw (reg_element_b, reg_zero, byte_offset    ));  assembly_listing.append(f'sw   x8, {byte_offset}(x0)')
            instruction_list.append(encode_sw (reg_element_a, reg_zero, byte_offset + 4));  assembly_listing.append(f'sw   x7, {byte_offset+4}(x0)')

    # Epilogue: write 0xDEADBEEF to dram[0x100], then infinite loop (halt)
    # 0xDEADBEEF = 0xDEADC000 + (-337)  because addi sign-extends 12-bit imm
    instruction_list.append(encode_lui (reg_sentinel_builder,       0xDEADC));  assembly_listing.append('lui  x12, 0xDEADC')
    instruction_list.append(encode_addi(reg_sentinel_builder, reg_sentinel_builder,  -337   ));  assembly_listing.append('addi x12, x12, -337')
    instruction_list.append(encode_sw  (reg_sentinel_builder, reg_zero,   0x100  ));  assembly_listing.append('sw   x12, 256(x0)')
    instruction_list.append(encode_beq (reg_zero,  reg_zero,   0      ));  assembly_listing.append('beq  x0, x0, 0   # halt')

    return instruction_list, assembly_listing

GENERATED_MACHINE_CODE, ASSEMBLY_CODE_LISTING = generate_unrolled_bubble_sort_program()
print(f'Program generated: {len(GENERATED_MACHINE_CODE)} instructions  ({len(GENERATED_MACHINE_CODE)*4} bytes)')
print(f'  Compare-swap blocks : {len(GENERATED_MACHINE_CODE) - 4}  (496 blocks x 6 instr)')
print(f'  Epilogue            : 4 instructions')

Program generated: 2980 instructions  (11920 bytes)
  Compare-swap blocks : 2976  (496 blocks x 6 instr)
  Epilogue            : 4 instructions


## 4. Load Program into Instruction RAM

In [17]:
def upload_program_to_instruction_memory(instruction_words: List[int]):
    """
    Write each 32-bit instruction word into iram at consecutive 4-byte offsets.
    The core resets to PC=0, so instruction 0 maps to iram byte offset 0.
    The core is kept in reset (halted) during the write.
    """
    freeze_processor()
    for instruction_index, instruction_word in enumerate(instruction_words):
        instruction_memory.write(instruction_index * 4, instruction_word & 0xFFFF_FFFF)
    print(f'Loaded {len(instruction_words)} words into iram'
          f'  (offsets 0x0000 – 0x{(len(instruction_words)-1)*4:04X})')

upload_program_to_instruction_memory(GENERATED_MACHINE_CODE)

# ── Readback sanity check (first 4 instructions) ─────────────────────────────
print('\nReadback check (first 4 instructions):')
verification_passed = True
for instruction_index in range(4):
    expected_value = GENERATED_MACHINE_CODE[instruction_index] & 0xFFFF_FFFF
    readback_value = instruction_memory.read(instruction_index * 4)
    values_match = readback_value == expected_value
    if not values_match:
        verification_passed = False
    status_marker = 'OK' if values_match else 'MISMATCH'
    print(f'  [{instruction_index:4d}] wrote=0x{expected_value:08X}  read=0x{readback_value:08X}  {status_marker}')
print('Readback:', 'PASS' if verification_passed else 'FAIL — check AXI-BRAM wiring')

Loaded 2980 words into iram  (offsets 0x0000 – 0x2E8C)

Readback check (first 4 instructions):
  [   0] wrote=0x00002383  read=0x00002383  OK
  [   1] wrote=0x00402403  read=0x00402403  OK
  [   2] wrote=0x007424B3  read=0x007424B3  OK
  [   3] wrote=0x00048663  read=0x00048663  OK
Readback: PASS


## 5. Helper Functions — dram I/O & Reference Model

In [18]:
# ── Signed <-> unsigned conversion ───────────────────────────────────────────
def convert_signed_to_unsigned_32bit(signed_value: int) -> int:
    return struct.unpack('<I', struct.pack('<i', signed_value))[0]

def convert_unsigned_to_signed_32bit(unsigned_value: int) -> int:
    return struct.unpack('<i', struct.pack('<I', unsigned_value & 0xFFFF_FFFF))[0]

# ── dram helpers ──────────────────────────────────────────────────────────────
def upload_array_to_data_memory(integer_array: List[int], base_address: int = DRAM_SORT_ARRAY_BASE_ADDR):
    for array_index, signed_value in enumerate(integer_array):
        data_memory.write(base_address + array_index * 4, convert_signed_to_unsigned_32bit(signed_value))

def read_array_from_data_memory(num_elements: int, base_address: int = DRAM_SORT_ARRAY_BASE_ADDR) -> List[int]:
    return [convert_unsigned_to_signed_32bit(data_memory.read(base_address + element_index * 4)) for element_index in range(num_elements)]

def clear_completion_sentinel():
    data_memory.write(DRAM_COMPLETION_FLAG_ADDR, 0x00000000)

def read_completion_sentinel() -> int:
    return data_memory.read(DRAM_COMPLETION_FLAG_ADDR) & 0xFFFF_FFFF

# ── Test-vector generator ─────────────────────────────────────────────────────
def generate_random_test_array(array_size: int = NUM_ELEMENTS_TO_SORT, random_seed: int = PRNG_SEED_VALUE) -> List[int]:
    prng = random.Random(random_seed)
    test_array = [prng.randint(SIGNED_INT32_MINIMUM_VALUE, SIGNED_INT32_MAXIMUM_VALUE) for _ in range(array_size)]
    # Force boundary values into the first slots so they're always exercised
    test_array[0] = SIGNED_INT32_MINIMUM_VALUE
    test_array[1] = SIGNED_INT32_MAXIMUM_VALUE
    test_array[2] = 0
    test_array[3] = -1
    return test_array

print('Helpers defined.')

# Preview the main test vector
primary_test_input = generate_random_test_array()
golden_reference_output     = sorted(primary_test_input)
print('Test input :', primary_test_input)
print('Golden     :', golden_reference_output)

Helpers defined.
Test input : [-100, 100, 0, -1, -30, -38, -43, -65, 88, -74, 73, 89, 39, -78, 51, 8, -92, -93, -77, -45, -41, 29, 54, -94, 43, -50, 83, 66, 79, 39, 7, -44]
Golden     : [-100, -94, -93, -92, -78, -77, -74, -65, -50, -45, -44, -43, -41, -38, -30, -1, 0, 7, 8, 29, 39, 39, 43, 51, 54, 66, 73, 79, 83, 88, 89, 100]


## 6. Run on FPGA

In [19]:
def execute_sort_on_fpga_hardware(unsorted_array: List[int],
                     max_timeout_seconds: float = MAX_EXECUTION_TIMEOUT_SECONDS) -> Tuple[List[int], float]:
    """
    Full run sequence:
      1. freeze_processor()              — freeze core while setting up
      2. clear_completion_sentinel()        — zero out sentinel so we don't read stale data
      3. upload_array_to_data_memory()    — load input into data RAM
      4. release_processor()               — release reset; core fetches from iram PC=0
      5. poll dram[0x100]         — wait for 0xDEADBEEF (done sentinel)
      6. freeze_processor()              — re-assert reset
      7. read_array_from_data_memory()   — collect sorted result

    Returns (sorted_array, elapsed_seconds).
    Raises TimeoutError if sentinel never appears within `max_timeout_seconds` seconds.
    """
    freeze_processor()
    clear_completion_sentinel()
    upload_array_to_data_memory(unsorted_array)

    start_time = time.perf_counter()
    release_processor()                        # <-- core starts executing here

    timeout_deadline = start_time + max_timeout_seconds
    while time.perf_counter() < timeout_deadline:
        if read_completion_sentinel() == COMPLETION_MAGIC_VALUE:
            break
        time.sleep(COMPLETION_CHECK_INTERVAL_SECONDS)
    else:
        freeze_processor()
        raise TimeoutError(
            f'Sentinel 0x{COMPLETION_MAGIC_VALUE:08X} not seen after {max_timeout_seconds}s. '
            f'dram[0x100] = 0x{read_completion_sentinel():08X}'
        )

    elapsed_time_seconds = time.perf_counter() - start_time
    freeze_processor()
    sorted_result = read_array_from_data_memory(len(unsorted_array))
    return sorted_result, elapsed_time_seconds


print('Running main sort test on FPGA...')
hardware_sorted_output, execution_time_seconds = execute_sort_on_fpga_hardware(primary_test_input)
print(f'Done in {execution_time_seconds*1000:.3f} ms')
print('DUT output :', hardware_sorted_output)

Running main sort test on FPGA...
Done in 1.798 ms
DUT output : [-100, -94, -93, -92, -78, -77, -74, -65, -50, -45, -44, -43, -41, -38, -30, -1, 0, 7, 8, 29, 39, 39, 43, 51, 54, 66, 73, 79, 83, 88, 89, 100]


## 7. Verification & Reporting

In [20]:
def verify_sort_correctness(hardware_output: List[int], expected_output: List[int], original_input: List[int],
           test_label: str = 'Main test') -> bool:
    """
    Four-check verification suite:
      1. Length preserved
      2. Element-wise equality vs golden reference (Python sorted)
      3. Multiset preservation — no elements lost or duplicated
      4. Monotonicity — output is non-decreasing
    """
    test_passed = True
    error_messages, element_mismatches = [], []

    if len(hardware_output) != len(expected_output):
        error_messages.append(f'Length: DUT={len(hardware_output)}, expected={len(expected_output)}')
        test_passed = False

    for array_index, (hw_value, expected_value) in enumerate(zip(hardware_output, expected_output)):
        if hw_value != expected_value:
            element_mismatches.append((array_index, hw_value, expected_value))
            test_passed = False

    if sorted(hardware_output) != sorted(original_input):
        error_messages.append('Multiset check FAILED — elements lost or corrupted.')
        test_passed = False

    for array_index in range(len(hardware_output) - 1):
        if hardware_output[array_index] > hardware_output[array_index + 1]:
            error_messages.append(f'Not monotone at [{array_index}]: {hardware_output[array_index]} > {hardware_output[array_index+1]}')
            test_passed = False
            break

    separator_line = '=' * 58
    print(separator_line)
    print(f'  {test_label}:  {"PASSED" if test_passed else "FAILED"}')
    print(separator_line)
    if element_mismatches:
        print(f'  Element mismatches ({len(element_mismatches)}):')
        for idx, hw_val, expected_val in element_mismatches[:8]:
            print(f'    [{idx:3d}]  DUT={hw_val:12d}  expected={expected_val:12d}')
        if len(element_mismatches) > 8:
            print(f'    ... and {len(element_mismatches)-8} more')
    for error_msg in error_messages:
        print(f'  ERROR: {error_msg}')
    print(separator_line)
    return test_passed

primary_test_passed = verify_sort_correctness(hardware_sorted_output, golden_reference_output, primary_test_input)

  Main test:  PASSED


## 8. Extended / Corner-Case Tests

All test cases are exactly 32 elements to match the fixed sort network.

In [21]:
ARRAY_LENGTH = NUM_ELEMENTS_TO_SORT   # 32

CORNER_CASE_TEST_VECTORS = {
    'already_sorted'  : list(range(ARRAY_LENGTH)),
    'reverse_sorted'  : list(range(ARRAY_LENGTH - 1, -1, -1)),
    'all_zeros'       : [0] * ARRAY_LENGTH,
    'all_same'        : [42] * ARRAY_LENGTH,
    'all_negative'    : [-(i + 1) for i in range(ARRAY_LENGTH)],
    'boundary_alt'    : ([SIGNED_INT32_MINIMUM_VALUE, SIGNED_INT32_MAXIMUM_VALUE] * (ARRAY_LENGTH // 2))[:ARRAY_LENGTH],
    'random_seed_99'  : generate_random_test_array(ARRAY_LENGTH, random_seed=99),
    'random_seed_1337': generate_random_test_array(ARRAY_LENGTH, random_seed=1337),
}

extended_test_results = {}
print(f'  {"Test name":<25}  {"Time (ms)":>10}  Result')
print('  ' + '-' * 50)

for test_name, test_array in CORNER_CASE_TEST_VECTORS.items():
    try:
        hardware_output, execution_time = execute_sort_on_fpga_hardware(test_array)
        correctness_check = (hardware_output == sorted(test_array))
    except TimeoutError as timeout_error:
        print(f'  {test_name:<25}  {"TIMEOUT":>10}  FAIL  ({timeout_error})')
        extended_test_results[test_name] = False
        continue

    extended_test_results[test_name] = correctness_check
    status_tag = 'PASS' if correctness_check else 'FAIL'
    print(f'  {test_name:<25}  {execution_time*1000:>10.3f}  {status_tag}')

print()
print('Extended tests:', 'ALL PASSED' if all(extended_test_results.values()) else 'SOME FAILED')

  Test name                   Time (ms)  Result
  --------------------------------------------------
  already_sorted                  1.910  PASS
  reverse_sorted                  1.233  PASS
  all_zeros                       1.221  PASS
  all_same                        1.275  PASS
  all_negative                    1.232  PASS
  boundary_alt                    1.223  PASS
  random_seed_99                  1.231  PASS
  random_seed_1337                1.217  PASS

Extended tests: ALL PASSED


## 9. Performance Sweep

In [22]:
# Measure wall-clock time across multiple independent runs with different inputs.
# (The hardware always sorts exactly 32 elements — only the data varies.)

NUMBER_OF_TIMING_RUNS = 10
execution_time_samples  = []

print(f'Running {NUMBER_OF_TIMING_RUNS} timed trials (32 int32 each)...')
for run_number in range(NUMBER_OF_TIMING_RUNS):
    test_array = generate_random_test_array(NUM_ELEMENTS_TO_SORT, random_seed=run_number * 7 + 3)
    _, elapsed_time = execute_sort_on_fpga_hardware(test_array)
    execution_time_samples.append(elapsed_time * 1000)
    print(f'  Run {run_number+1:2d}: {elapsed_time*1000:.3f} ms')

print()
print(f'  Min  : {min(execution_time_samples):.3f} ms')
print(f'  Max  : {max(execution_time_samples):.3f} ms')
print(f'  Mean : {np.mean(execution_time_samples):.3f} ms')
print(f'  Std  : {np.std(execution_time_samples):.4f} ms')

Running 10 timed trials (32 int32 each)...
  Run  1: 1.873 ms
  Run  2: 4.865 ms
  Run  3: 1.247 ms
  Run  4: 1.280 ms
  Run  5: 1.253 ms
  Run  6: 1.299 ms
  Run  7: 1.260 ms
  Run  8: 1.230 ms
  Run  9: 1.221 ms
  Run 10: 1.218 ms

  Min  : 1.218 ms
  Max  : 4.865 ms
  Mean : 1.675 ms
  Std  : 1.0799 ms


## 10. Hex Dump (for `$readmemh` in Verilog testbenches)

In [24]:
HEX_OUTPUT_FILENAME = 'test_sort.hex'
with open(HEX_OUTPUT_FILENAME, 'w') as hex_file:
    for instruction_word in GENERATED_MACHINE_CODE:
        hex_file.write(f'{instruction_word & 0xFFFFFFFF:08x}\n')

print(f'Hex file written: {HEX_OUTPUT_FILENAME}  ({len(GENERATED_MACHINE_CODE)} lines)')
print()
print('First 6 instructions (one compare-swap block):')
for instruction_word, assembly_code in zip(GENERATED_MACHINE_CODE[:6], ASSEMBLY_CODE_LISTING[:6]):
    print(f'  {instruction_word&0xFFFFFFFF:08x}   # {assembly_code}')
print('  ...')
print('Epilogue (last 4):')
for instruction_word, assembly_code in zip(GENERATED_MACHINE_CODE[-4:], ASSEMBLY_CODE_LISTING[-4:]):
    print(f'  {instruction_word&0xFFFFFFFF:08x}   # {assembly_code}')

Hex file written: test_sort.hex  (2980 lines)

First 6 instructions (one compare-swap block):
  00002383   # lw   x7, 0(x0)
  00402403   # lw   x8, 4(x0)
  007424b3   # slt  x9, x8, x7
  00048663   # beq  x9, x0, +12
  00802023   # sw   x8, 0(x0)
  00702223   # sw   x7, 4(x0)
  ...
Epilogue (last 4):
  deadc637   # lui  x12, 0xDEADC
  eaf60613   # addi x12, x12, -337
  10c02023   # sw   x12, 256(x0)
  00000063   # beq  x0, x0, 0   # halt


## 11. Final Summary

In [25]:
overall_pass_status = primary_test_passed and all(extended_test_results.values())

print('=' * 50)
print(f'  FINAL RESULT : {"PASS" if overall_pass_status else "FAIL"}')
print('=' * 50)
print(f'  Main sort test      : {"PASS" if primary_test_passed else "FAIL"}')
for test_name, test_passed in extended_test_results.items():
    print(f'  {test_name:<25} : {"PASS" if test_passed else "FAIL"}')
print('=' * 50)
print(f'  Mean exec time      : {np.mean(execution_time_samples):.3f} ms')
print('=' * 50)

freeze_processor()   # always leave the core in reset
print('Core halted. Done.')

  FINAL RESULT : PASS
  Main sort test      : PASS
  already_sorted            : PASS
  reverse_sorted            : PASS
  all_zeros                 : PASS
  all_same                  : PASS
  all_negative              : PASS
  boundary_alt              : PASS
  random_seed_99            : PASS
  random_seed_1337          : PASS
  Mean exec time      : 1.675 ms
Core halted. Done.
