# GPU code profiling

## Case 1 - normal matrix multiplication with user-defined block size

In [None]:
import subprocess
import time
# Single timed launch of ./matrixMul.exe for one matrix width / block size pair.
matrix_width = 1024
block_size = 32 # Try also with block_size > 32
print(f"Matrix Width: {matrix_width}, Block Size: {block_size}")
# The measured interval is the wall-clock time of the whole child process as
# seen from Python (process launch through exit).
start = time.perf_counter()
result = subprocess.run(["./matrixMul.exe", str(matrix_width), str(block_size)], capture_output=True, text=True)
end = time.perf_counter()
elapsed_time_ms = (end - start) * 1000
print(f"{elapsed_time_ms:.2f} ms")
print("Program Output:")
print(result.stdout.strip())
# capture_output=True also captures stderr; report it together with a non-zero
# exit status so a failed launch is not mistaken for a (very fast) valid run.
if result.returncode != 0:
    print(f"Exit code: {result.returncode}")
if result.stderr.strip():
    print("Program Errors:")
    print(result.stderr.strip())
print("="*50)

In [None]:
import subprocess
import time
# Sweep every (matrix width, block size) combination and time three launches
# of ./matrixMul.exe for each one.
widths_to_test = [128, 256, 512, 1024]
block_sizes_to_test = [8, 16, 32]
separator = "=" * 50

for matrix_width in widths_to_test:
    for block_size in block_sizes_to_test:
        print(f"Matrix Width: {matrix_width}, Block Size: {block_size}")
        for run in (1, 2, 3):
            t_begin = time.perf_counter()
            result = subprocess.run(
                ["./matrixMul.exe", str(matrix_width), str(block_size)],
                capture_output=True,
                text=True,
            )
            t_end = time.perf_counter()
            elapsed_time_ms = 1000 * (t_end - t_begin)
            # The program's stdout is captured in `result` but deliberately
            # not printed here, to keep the timing table compact.
            print(f"  Run {run}: {elapsed_time_ms:.2f} ms")
        print(separator)

In [None]:
import subprocess
import time
import matplotlib.pyplot as plt

matrix_width = 1024
block_sizes = [8, 16, 32]
runs_per_block = {}

# Run profiling and collect timings: three timed launches per block size,
# stored as {block_size: [elapsed_ms, elapsed_ms, elapsed_ms]}.
for block_size in block_sizes:
    print(f"Matrix Width: {matrix_width}, Block Size: {block_size}")
    runtimes = []
    for run in range(1, 4):
        start = time.perf_counter()
        result = subprocess.run(
            ["./matrixMul.exe", str(matrix_width), str(block_size)],
            capture_output=True,
            text=True
        )
        end = time.perf_counter()
        elapsed_time_ms = (end - start) * 1000
        runtimes.append(elapsed_time_ms)
        print(f"  Run {run}: {elapsed_time_ms:.2f} ms")
    runs_per_block[block_size] = runtimes
    print("=" * 50)

# Plotting: flatten the per-block-size timings into parallel x/y lists, one
# point per individual run. Both comprehensions walk the dict in the same
# (insertion) order, so x_vals[i] and y_vals[i] belong to the same run.
x_vals = [size for size, times in runs_per_block.items() for _ in times]
y_vals = [elapsed for times in runs_per_block.values() for elapsed in times]

plt.figure(figsize=(8, 6))
plt.scatter(x_vals, y_vals, color='blue')
# Derive the title from matrix_width so it stays correct when the width above
# is changed (it was previously hard-coded to 1024).
plt.title(f"Execution Time vs Block Size (Matrix Width = {matrix_width})")
plt.xlabel("Block Size")
plt.ylabel("Execution Time (ms)")
plt.grid(True)
plt.tight_layout()
plt.show()


## Case 2.1 - normal matrix multiplication with Device Properties

In [36]:
import subprocess
import time
# Time three launches of ./matrixMulDevProp.exe and show what each one printed.
matrix_width = 4096
# NOTE(review): originally annotated "should fail due to resource constraint".
# Whether it actually fails depends on the device; a failure is reported below
# through the exit code / stderr instead of passing silently.
block_size = 32
print(f"Matrix Width: {matrix_width}, Block Size: {block_size}")
for run in range(1, 4):  # Run 3 times
    start = time.perf_counter()
    result = subprocess.run(["./matrixMulDevProp.exe", str(matrix_width), str(block_size)], capture_output=True, text=True)
    end = time.perf_counter()
    elapsed_time_ms = (end - start) * 1000
    print(f"Run {run}: {elapsed_time_ms:.2f} ms")
    print("Program Output:")
    print(result.stdout.strip())
    # stderr and the exit status are captured too; print them only when they
    # carry information so a successful run's output is unchanged.
    if result.returncode != 0:
        print(f"Exit code: {result.returncode}")
    if result.stderr.strip():
        print("Program Errors:")
        print(result.stderr.strip())
    print("^"*50)

Matrix Width: 4096, Block Size: 32
Run 1: 814.69 ms
Program Output:
==== CUDA Device Properties ====
Device Name: NVIDIA GeForce RTX 4050 Laptop GPU
Compute Capability: 8.9
Clock Rate: 2.37 GHz
Total Global Memory: 6.00 GB
Shared Memory per Block: 48.00 KB
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dim: (1024, 1024, 64)
Max Grid Size: (2147483647, 65535, 65535)
Multiprocessor Count: 20
Memory Bus Width: 96 bits
Memory Clock Rate: 8.00 GHz
L2 Cache Size: 24576 KB
Sample output:
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Run 2: 333.57 ms
Program Output:
==== CUDA Device Properties ====
Device Name: NVIDIA GeForce RTX 4050 Laptop GPU
Compute Capability: 8.9
Clock Rate: 2.37 GHz
Total Global Memory: 6.00 GB
Shared Memory per Block: 48.00 KB
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dim: (1024,

## Case 2.2 - Automatically choose Block Size using Device Properties

In [None]:
import subprocess
import time
# Time four launches of ./matrixMulDevPropAuto.exe, which receives only the
# matrix width on its command line.
matrix_width = 1024
# No block size is defined in this cell or passed to the program, so the
# banner must not print a `block_size` variable: that would raise NameError on
# a fresh kernel or show a stale value left behind by an earlier cell.
print(f"Matrix Width: {matrix_width}, Block Size: auto")
for run in range(1, 5):  # Run multiple times
    start = time.perf_counter()
    result = subprocess.run(["./matrixMulDevPropAuto.exe", str(matrix_width)], capture_output=True, text=True)
    end = time.perf_counter()
    elapsed_time_ms = (end - start) * 1000
    print(f"{elapsed_time_ms:.2f} ms")
    print("Program Output:")
    print(result.stdout.strip())
    print("="*50)

## Case 3 - Tiled matrix multiplication

In [35]:
import subprocess
import time
# Time four consecutive launches of ./matrixMulTiled.exe at a fixed matrix
# width and tile size, echoing the program's stdout after each launch.
matrix_width, tile_size = 4096, 16
print(f"Matrix Width: {matrix_width}, Tile Size: {tile_size}")

# The command line is the same for every run, so build it once.
tiled_command = ["./matrixMulTiled.exe", str(matrix_width), str(tile_size)]

for run in (1, 2, 3, 4):
    t_begin = time.perf_counter()
    result = subprocess.run(tiled_command, capture_output=True, text=True)
    t_end = time.perf_counter()

    elapsed_time_ms = 1000 * (t_end - t_begin)
    print(f"{elapsed_time_ms:.2f} ms")

    program_stdout = result.stdout.strip()
    print("Program Output:")
    print(program_stdout)
    print("=" * 50)

Matrix Width: 4096, Tile Size: 16
1839.96 ms
Program Output:
Sample output:
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0
340.01 ms
Program Output:
Sample output:
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0
341.81 ms
Program Output:
Sample output:
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0
342.19 ms
Program Output:
Sample output:
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0 
4096.0 4096.0 4096.0 4096.0
