# **GEMM Accelerator using HLS**

## **Step 01 - Import libraries**

In [1]:
import time
import numpy as np
from pynq import Overlay, allocate

## **Step 02 - Class to compute GEMM on FPGA and Naive Matrix Multiplication function**

In [2]:
class GEMM_DRIVER:
    """Host-side driver for an HLS GEMM kernel loaded through a PYNQ overlay.

    The driver loads the bitstream, copies the operands into buffers obtained
    from ``allocate``, hands the buffers' physical addresses to the kernel
    through its register interface, starts the kernel and polls until it
    reports completion.

    Register offsets used (relative to the ``gemm_kernel_0`` IP):
        0x00  control word: bit 0 is written to start, bit 1 is polled for done
        0x10  address of matrix A
        0x1c  address of matrix B
        0x28  address of matrix C
    """

    # Register map used by this driver.
    CTRL_REG = 0x00
    AP_START = 0x1
    AP_DONE = 0x2
    A_ADDR_REG = 0x10
    B_ADDR_REG = 0x1c
    C_ADDR_REG = 0x28

    def __init__(self, path, debug=False, M=128, K=128, N=128):
        """Load the overlay and record the matrix dimensions.

        Args:
            path: Bitstream path passed to ``Overlay``.
            debug: When true, progress messages are printed.
            M, K, N: Dimensions of the product (M x K) * (K x N). They only
                size the host buffers; they must agree with the dimensions
                the kernel was synthesised for.

        Raises:
            ValueError: If any dimension is not a positive integer.
        """
        self.debug = debug
        if self.debug:
            print("Initializing GEMM driver...")

        for label, dim in (("M", M), ("K", K), ("N", N)):
            if int(dim) != dim or dim <= 0:
                raise ValueError(f"{label} must be a positive integer, got {dim!r}")

        # Create the overlay and initialize the IP
        self.overlay = Overlay(path)

        # Initialize the GEMM kernel IP
        self.gemm = self.overlay.gemm_kernel_0
        if self.debug:
            print("Overlay loaded successfully")

        # Define matrix dimensions
        self.M = int(M)  # Adjust these based on your HLS parameters
        self.K = int(K)
        self.N = int(N)
        if self.debug:
            print(f"Matrix dimensions set to: {self.M}x{self.K} * {self.K}x{self.N}")

    def write_matrix(self, a_buffer, b_buffer, c_buffer):
        """Write the physical addresses of the three buffers to the kernel.

        Args:
            a_buffer, b_buffer, c_buffer: Buffers exposing ``physical_address``.
        """
        if self.debug:
            print("\nWriting matrix addresses to kernel...")

        # Get physical addresses
        a_phys_addr = a_buffer.physical_address
        b_phys_addr = b_buffer.physical_address
        c_phys_addr = c_buffer.physical_address

        # Write the physical addresses to the kernel
        # NOTE(review): a single register write is issued per pointer. If the
        # kernel uses 64-bit addresses, the upper-word registers are never
        # written here -- confirm against the HLS-generated register map.
        self.gemm.write(self.A_ADDR_REG, a_phys_addr)  # Set address for matrix A
        self.gemm.write(self.B_ADDR_REG, b_phys_addr)  # Set address for matrix B
        self.gemm.write(self.C_ADDR_REG, c_phys_addr)  # Set address for matrix C

        if self.debug:
            print(f"Matrix A address: 0x{a_phys_addr:x}")
            print(f"Matrix B address: 0x{b_phys_addr:x}")
            print(f"Matrix C address: 0x{c_phys_addr:x}")

    def gemm_compute(self, A, B, timeout=10.0):
        """Run A @ B on the kernel and return the output buffer.

        Args:
            A: Array-like of shape (M, K); values are stored as float32.
            B: Array-like of shape (K, N); values are stored as float32.
            timeout: Seconds to wait for the done bit before giving up.
                ``None`` waits indefinitely.

        Returns:
            The (M, N) float32 output buffer. Ownership passes to the caller,
            who may release it with ``freebuffer()`` when finished.

        Raises:
            ValueError: If A or B does not have the configured shape.
            TimeoutError: If the kernel does not report completion in time.
        """
        # Reject wrong shapes up front: slice assignment below would otherwise
        # silently broadcast e.g. a 1-D vector across the whole matrix.
        A = np.asarray(A)
        B = np.asarray(B)
        if A.shape != (self.M, self.K):
            raise ValueError(f"A must have shape {(self.M, self.K)}, got {A.shape}")
        if B.shape != (self.K, self.N):
            raise ValueError(f"B must have shape {(self.K, self.N)}, got {B.shape}")

        if self.debug:
            print("\nStarting GEMM computation...")
            print("Allocating memory buffers...")

        a_buffer = b_buffer = c_buffer = None
        try:
            # Allocate physically contiguous memory for matrices
            a_buffer = allocate(shape=(self.M, self.K), dtype=np.float32)
            b_buffer = allocate(shape=(self.K, self.N), dtype=np.float32)
            c_buffer = allocate(shape=(self.M, self.N), dtype=np.float32)

            if self.debug:
                print("Copying input matrices to buffers...")
            # Initialize input matrices
            a_buffer[:] = A
            b_buffer[:] = B
            c_buffer[:] = 0

            if self.debug:
                print("Flushing data cache...")
            # Flush the data cache
            a_buffer.flush()
            b_buffer.flush()
            c_buffer.flush()

            # Write matrices to the kernel
            self.write_matrix(a_buffer, b_buffer, c_buffer)

            if self.debug:
                print("Starting hardware computation...")
            start_time = time.perf_counter()
            deadline = None if timeout is None else start_time + timeout
            # Start the computation
            self.gemm.write(self.CTRL_REG, self.AP_START)  # Set AP_START

            # Wait for completion, but never spin forever on a stalled kernel
            while (self.gemm.read(self.CTRL_REG) & self.AP_DONE) == 0:  # Check AP_DONE
                if deadline is not None and time.perf_counter() > deadline:
                    raise TimeoutError(
                        f"GEMM kernel did not signal completion within {timeout} s"
                    )

            end_time = time.perf_counter()
            if self.debug:
                print(f"Hardware computation completed in {(end_time-start_time)*1000:.2f} ms")

            if self.debug:
                print("Invalidating output buffer...")
            # Invalidate the output buffer
            c_buffer.invalidate()

            # Hand the output to the caller; clearing the local keeps the
            # cleanup below from releasing it.
            result, c_buffer = c_buffer, None
            return result
        finally:
            # Release everything this call still owns: always the two input
            # buffers, plus the output buffer when an error occurred.
            for buf in (a_buffer, b_buffer, c_buffer):
                if buf is not None:
                    buf.freebuffer()

    def verify_result(self, hw_result, sw_result, rtol=1e-5, atol=1e-5):
        """Compare the hardware result with a software reference.

        Args:
            hw_result: Array produced by the kernel.
            sw_result: Reference array of the same shape.
            rtol, atol: Tolerances forwarded to ``np.isclose``.

        Returns:
            True when every element is within tolerance, False otherwise
            (including when the shapes differ).
        """
        if self.debug:
            print("\nVerifying results...")

        hw = np.asarray(hw_result)
        sw = np.asarray(sw_result)
        if hw.shape != sw.shape:
            print("✗ Results don't match!")
            print(f"Shape mismatch: HW {hw.shape} vs SW {sw.shape}")
            return False

        # Verify results match within tolerance
        close = np.isclose(hw, sw, rtol=rtol, atol=atol)
        match = bool(close.all())

        if match:
            if self.debug:
                print("✓ Results match within tolerance!")
                # Print some sample comparisons
                print("\nSample comparisons (first 3 elements):")
                print("HW:", hw.ravel()[:3])
                print("SW:", sw.ravel()[:3])
        else:
            print("✗ Results don't match!")
            diff = np.abs(hw - sw)
            max_diff = np.max(diff)
            print(f"Maximum difference: {max_diff}")
            # Show the elements that actually differ, not just the array head
            mismatched = np.flatnonzero(~close)[:5]
            print("\nFirst differing elements:")
            print("Flat indices:", mismatched)
            print("HW:", hw.ravel()[mismatched])
            print("SW:", sw.ravel()[mismatched])

        if self.debug:
            # Print performance metrics
            print("\nPerformance Analysis:")
            total_ops = self.M * self.N * self.K * 2  # multiply-add operations
            print(f"Total Operations: {total_ops:,}")

        return match


In [3]:
# Function for naive matrix multiplication
def matrix_multiply(A, B):
    """Multiply two 2-D arrays with the textbook triple loop.

    Deliberately unoptimised: it serves as the pure-software baseline.

    Args:
        A: 2-D array of shape (m, k).
        B: 2-D array of shape (k, n).

    Returns:
        A float32 array of shape (m, n) holding the product.

    Raises:
        AssertionError: If the inner dimensions of A and B differ.
    """
    n_rows, inner = A.shape
    inner_b, n_cols = B.shape
    assert inner == inner_b, "Matrix dimensions do not match for multiplication"

    product = np.zeros((n_rows, n_cols), dtype=np.float32)
    # np.ndindex walks (row, col) in row-major order, same as two nested loops
    for row, col in np.ndindex(n_rows, n_cols):
        for idx in range(inner):
            product[row, col] += A[row, idx] * B[idx, col]
    return product

## **Step 03 - Compare speed-up**

In [4]:
# Initialize the driver
print("Creating GEMM driver instance...")
gemm_driver = GEMM_DRIVER("gemm.bit", debug = True)

# Generate test matrices
# Seeded so that Restart & Run All reproduces the same inputs; shapes are taken
# from the driver so they cannot drift from the dimensions it is configured for.
print("\nGenerating test matrices...")
rng = np.random.default_rng(42)
A = rng.random((gemm_driver.M, gemm_driver.K), dtype=np.float32)
B = rng.random((gemm_driver.K, gemm_driver.N), dtype=np.float32)

# Timing the hardware GEMM computation
# (perf_counter is the high-resolution clock intended for measuring intervals;
# this interval covers the whole gemm_compute call, not just the kernel run)
print("\nRunning hardware GEMM...")
start_time_hw = time.perf_counter()
hw_result = gemm_driver.gemm_compute(A, B)
end_time_hw = time.perf_counter()

# Timing the software reference computation
print("\nComputing software reference...")
start_time_sw = time.perf_counter()
sw_result = matrix_multiply(A, B)
end_time_sw = time.perf_counter()

# Compute time taken
hw_time = end_time_hw - start_time_hw
sw_time = end_time_sw - start_time_sw

# Verify results
gemm_driver.verify_result(hw_result, sw_result)

# Print results
print(f"\nTime taken by hardware GEMM: {hw_time:.6f} seconds")
print(f"Time taken by software reference: {sw_time:.6f} seconds")

# Calculate and print speedup
speedup = sw_time / hw_time
print(f"\nSpeedup of hardware over software: {speedup:.2f}x")

Creating GEMM driver instance...
Initializing GEMM driver...


Overlay loaded successfully
Matrix dimensions set to: 128x128 * 128x128

Generating test matrices...

Running hardware GEMM...

Starting GEMM computation...
Allocating memory buffers...
Copying input matrices to buffers...
Flushing data cache...

Writing matrix addresses to kernel...
Matrix A address: 0x16850000
Matrix B address: 0x16860000
Matrix C address: 0x16870000
Starting hardware computation...
Hardware computation completed in 3.43 ms
Invalidating output buffer...

Computing software reference...

Verifying results...
✓ Results match within tolerance!

Sample comparisons (first 3 elements):
HW: [35.34154  29.643805 33.810066]
SW: [35.34154  29.643805 33.810066]

Performance Analysis:
Total Operations: 4,194,304

Time taken by hardware GEMM: 0.014266 seconds
Time taken by software reference: 20.150390 seconds

Speedup of hardware over software: 1412.50x
