### Exercise 1: Cython Implementation of K-Means

Implement a fast K-means clustering algorithm in Cython:

In [1]:
import os
import sys
import time

import numpy as np

# Check if Cython is available
try:
    import Cython
    print(f"Cython version: {Cython.__version__}")
    cython_available = True
except ImportError:
    print("Cython not available. Install with: pip install cython")
    cython_available = False

if cython_available:
    # Load Cython magic for Jupyter
    %load_ext Cython

Cython version: 3.1.3


In [2]:
%%cython
# Exercise 1: K-means clustering in Cython
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_kmeans_step(cnp.ndarray[double, ndim=2] points,
                       cnp.ndarray[double, ndim=2] centroids):
    """Perform a single K-means iteration (assignment + centroid update).

    Parameters
    ----------
    points : 2-D float64 array, shape (n_points, n_dims)
        Samples to cluster.
    centroids : 2-D float64 array, shape (k, n_dims)
        Current centroid positions. Not modified.

    Returns
    -------
    new_centroids : 2-D float64 array, shape (k, n_dims)
        Mean of the points assigned to each centroid. A centroid that
        receives no points keeps its previous position.
    assignments : 1-D C-int array, shape (n_points,)
        Index of the nearest centroid for every point (ties go to the
        lowest index).

    Raises
    ------
    ValueError
        If the column counts of ``points`` and ``centroids`` differ, or if
        there are points to assign but no centroids.
    """
    cdef int n_points = points.shape[0]
    cdef int n_dims = points.shape[1]
    cdef int k = centroids.shape[0]

    # Bounds checking is switched off for speed, so inconsistent shapes must
    # be rejected here; otherwise the loops below would touch invalid memory.
    if centroids.shape[1] != n_dims:
        raise ValueError("points and centroids must have the same number of columns")
    if k < 1 and n_points > 0:
        raise ValueError("centroids must contain at least one row")

    # np.intc is the NumPy dtype that matches the C `int` buffer type.
    cdef cnp.ndarray[int, ndim=1] assignments = np.zeros(n_points, dtype=np.intc)
    cdef cnp.ndarray[double, ndim=2] new_centroids = np.zeros((k, n_dims), dtype=np.float64)
    cdef cnp.ndarray[int, ndim=1] counts = np.zeros(k, dtype=np.intc)

    cdef double dist, min_dist, diff
    cdef int i, j, d, best_cluster

    # Step 1: assign every point to its nearest centroid.
    # Squared distances are compared directly: the square root is monotonic,
    # so it does not change which centroid is closest.
    for i in range(n_points):
        min_dist = 0.0
        best_cluster = 0

        for j in range(k):
            dist = 0.0
            for d in range(n_dims):
                diff = points[i, d] - centroids[j, d]
                dist += diff * diff

            # The first centroid seeds the minimum, so no sentinel value is needed.
            if j == 0 or dist < min_dist:
                min_dist = dist
                best_cluster = j

        assignments[i] = best_cluster

    # Step 2: accumulate per-cluster coordinate sums and member counts.
    for i in range(n_points):
        j = assignments[i]
        counts[j] += 1
        for d in range(n_dims):
            new_centroids[j, d] += points[i, d]

    # Step 3: turn sums into means; an empty cluster keeps its old centroid
    # instead of collapsing to the origin.
    for j in range(k):
        if counts[j] > 0:
            for d in range(n_dims):
                new_centroids[j, d] /= counts[j]
        else:
            for d in range(n_dims):
                new_centroids[j, d] = centroids[j, d]

    return new_centroids, assignments

In [3]:
# Test Cython K-means
if cython_available:
    # Generate test data with known clusters
    np.random.seed(42)
    n_points, n_dims, k = 10000, 2, 3
    
    # Create 3 clusters
    cluster_centers = np.array([[2, 2], [-2, 2], [0, -3]], dtype=np.float64)
    points = []
    
    for center in cluster_centers:
        cluster_points = center + 0.8 * np.random.randn(n_points//k, 2)
        points.append(cluster_points)
    
    points = np.vstack(points).astype(np.float64)
    print(f"Created {len(points)} points in {k} clusters")
    
    # Initialize centroids randomly
    centroids = np.random.randn(k, n_dims).astype(np.float64)
    
    # Run Cython K-means
    print("\nRunning Cython K-means:")
    for iteration in range(20):
        old_centroids = centroids.copy()
        centroids, assignments = cython_kmeans_step(points, centroids)
        
        # Check convergence
        shift = np.max(np.linalg.norm(centroids - old_centroids, axis=1))
        
        if iteration % 5 == 0 or shift < 1e-4:
            print(f"Iteration {iteration:2d}: max centroid shift = {shift:.6f}")
        
        if shift < 1e-4:
            print(f"Converged after {iteration} iterations")
            break
    
    print(f"\nFinal centroids:")
    for i, centroid in enumerate(centroids):
        print(f"Cluster {i}: [{centroid[0]:6.3f}, {centroid[1]:6.3f}]")
    
    # Compare with scikit-learn
    try:
        from sklearn.cluster import KMeans
        
        print("\nComparison with scikit-learn:")
        sklearn_kmeans = KMeans(n_clusters=k, random_state=42, n_init=1)
        
        # Time both implementations
        print("Cython K-means (single step):")
        %timeit cython_kmeans_step(points, centroids)
        
        print("Scikit-learn K-means (full algorithm):")
        %timeit sklearn_kmeans.fit(points)
        
        # Show scikit-learn results
        sklearn_centroids = sklearn_kmeans.fit(points).cluster_centers_
        print(f"\nScikit-learn centroids:")
        for i, centroid in enumerate(sklearn_centroids):
            print(f"Cluster {i}: [{centroid[0]:6.3f}, {centroid[1]:6.3f}]")
            
    except ImportError:
        print("\nScikit-learn not available for comparison")
else:
    print("Cython not available, skipping K-means exercise")

Created 9999 points in 3 clusters

Running Cython K-means:
Iteration  0: max centroid shift = 2.284860
Iteration  3: max centroid shift = 0.000000
Converged after 3 iterations

Final centroids:
Cluster 0: [ 0.013, -2.980]
Cluster 1: [ 1.992,  1.998]
Cluster 2: [-2.006,  2.002]

Comparison with scikit-learn:
Cython K-means (single step):
85.3 μs ± 5.23 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Scikit-learn K-means (full algorithm):
4.76 ms ± 367 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Scikit-learn centroids:
Cluster 0: [-2.006,  2.002]
Cluster 1: [ 0.013, -2.980]
Cluster 2: [ 1.992,  1.998]


## f2py - Fortran to Python Interface

Many high-performance scientific libraries are written in Fortran. f2py makes them accessible from Python.

In [4]:
# Create a Fortran module.
# The source is kept in a Python string and written to mathops.f90 so that
# f2py can compile it in a later cell.
fortran_source = '''
! Fortran module for mathematical operations
module mathops
    implicit none

contains

    ! Compute sum of squares of array elements
    function sum_of_squares(arr, n) result(total)
        implicit none
        integer, intent(in) :: n
        real(8), intent(in) :: arr(n)
        real(8) :: total
        integer :: i

        total = 0.0d0
        do i = 1, n
            total = total + arr(i) * arr(i)
        end do
    end function sum_of_squares

    ! Matrix multiplication C = A * B
    subroutine matmul_fortran(A, B, C, m, n, p)
        implicit none
        integer, intent(in) :: m, n, p
        real(8), intent(in) :: A(m, n), B(n, p)
        real(8), intent(out) :: C(m, p)
        integer :: i, j, k

        C = 0.0d0

        ! Fortran arrays are column-major: keep the row index i innermost so
        ! that A(:, k) and C(:, j) are traversed contiguously. Each C(i, j)
        ! still accumulates its terms in increasing k.
        do j = 1, p
            do k = 1, n
                do i = 1, m
                    C(i, j) = C(i, j) + A(i, k) * B(k, j)
                end do
            end do
        end do
    end subroutine matmul_fortran

    ! Solve tridiagonal system Ax = b using Thomas algorithm
    !   a: sub-diagonal   (a(1) is not used)
    !   b: main diagonal
    !   c: super-diagonal (c(n) does not affect the solution)
    !   d: right-hand side
    ! No pivoting is performed, so a zero pivot leads to a division by zero.
    subroutine solve_tridiagonal(a, b, c, d, x, n)
        implicit none
        integer, intent(in) :: n
        real(8), intent(in) :: a(n), b(n), c(n), d(n)
        real(8), intent(out) :: x(n)
        real(8) :: cp(n), dp(n)
        real(8) :: denom
        integer :: i

        ! Nothing to solve; also avoids touching element 1 of empty arrays
        if (n < 1) return

        ! Forward elimination
        cp(1) = c(1) / b(1)
        dp(1) = d(1) / b(1)

        do i = 2, n
            denom = b(i) - a(i) * cp(i-1)
            cp(i) = c(i) / denom
            dp(i) = (d(i) - a(i) * dp(i-1)) / denom
        end do

        ! Back substitution
        x(n) = dp(n)
        do i = n-1, 1, -1
            x(i) = dp(i) - cp(i) * x(i+1)
        end do
    end subroutine solve_tridiagonal

end module mathops
'''

# Write Fortran source to file
with open('mathops.f90', 'w') as f:
    f.write(fortran_source)

print("Created Fortran source file: mathops.f90")

Created Fortran source file: mathops.f90


In [5]:
import importlib
import subprocess
import sys

# Compile Fortran module with f2py.
# `fortran_available` is True only when the build finished successfully.
fortran_available = False
try:
    # Check if f2py is available to this interpreter
    result = subprocess.run([sys.executable, '-c', 'import numpy.f2py'],
                            capture_output=True)
    if result.returncode != 0:
        print("f2py not available")
    else:
        # Compile the Fortran module
        f2py_cmd = [sys.executable, '-m', 'numpy.f2py', '-c', 'mathops.f90', '-m', 'mathops']
        print(f"Compiling with f2py: {' '.join(f2py_cmd)}")

        result = subprocess.run(f2py_cmd, capture_output=True, text=True)

        if result.returncode == 0:
            # The extension module was created after this kernel started;
            # drop the import system's cached directory listings so that a
            # following `import mathops` can find it.
            importlib.invalidate_caches()
            print("f2py compilation successful!")
            fortran_available = True
        else:
            print("f2py compilation failed:")
            print(f"stdout: {result.stdout}")
            print(f"stderr: {result.stderr}")

except Exception as e:
    print(f"Error with f2py: {e}")
    fortran_available = False

Compiling with f2py: /home/manuel/Code/HPCP/HS25/.venv/bin/python3 -m numpy.f2py -c mathops.f90 -m mathops
f2py compilation successful!


In [6]:
# Test Fortran functions
if fortran_available:
    try:
        import mathops
        
        # Test sum of squares
        data = np.random.rand(1000000)
        
        fortran_result = mathops.mathops.sum_of_squares(data)
        numpy_result = np.sum(data**2)
        
        print(f"Fortran sum of squares: {fortran_result:.6f}")
        print(f"NumPy sum of squares:   {numpy_result:.6f}")
        print(f"Match: {np.isclose(fortran_result, numpy_result)}")
        
        # Performance comparison
        print("\nPerformance comparison - Sum of squares:")
        print("Fortran (f2py):")
        %timeit mathops.mathops.sum_of_squares(data)
        
        print("NumPy:")
        %timeit np.sum(data**2)
        
        # Test matrix multiplication
        print("\nTesting Fortran matrix multiplication:")
        m, n, p = 200, 150, 100
        A = np.random.rand(m, n)
        B = np.random.rand(n, p)
        
        C_fortran = mathops.mathops.matmul_fortran(A, B)
        C_numpy = A @ B
        
        print(f"Matrix multiplication results match: {np.allclose(C_fortran, C_numpy)}")
        print(f"Max difference: {np.max(np.abs(C_fortran - C_numpy)):.2e}")
        
        # Performance comparison for matrix multiplication
        print("\nMatrix multiplication performance:")
        print("Fortran:")
        %timeit mathops.mathops.matmul_fortran(A, B)
        
        print("NumPy:")
        %timeit A @ B
        
    except ImportError as e:
        print(f"Failed to import compiled Fortran module: {e}")
        fortran_available = False
else:
    print("Fortran/f2py not available")

Fortran sum of squares: 333590.253481
NumPy sum of squares:   333590.253481
Match: True

Performance comparison - Sum of squares:
Fortran (f2py):
454 μs ± 8.19 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
453 μs ± 12.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Testing Fortran matrix multiplication:
Matrix multiplication results match: True
Max difference: 4.97e-14

Matrix multiplication performance:
Fortran:
1.04 ms ± 53.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
92.1 μs ± 2.65 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Exercise 2: Solve Linear Systems with Fortran

Use the tridiagonal solver from our Fortran module:

In [7]:
# Exercise 3: Tridiagonal linear system solver
if fortran_available:
    try:
        # Create a tridiagonal system Ax = b
        # A has the form:
        # [b1  c1   0   0 ... ]
        # [a2  b2  c2   0 ... ]
        # [ 0  a3  b3  c3 ... ]
        # [ ...              ]
        
        n = 1000
        
        # Create tridiagonal matrix coefficients
        a = np.ones(n) * -1.0  # sub-diagonal
        b = np.ones(n) * 2.0   # main diagonal
        c = np.ones(n) * -1.0  # super-diagonal
        
        # Right-hand side
        d = np.ones(n)
        
        # TODO: Use Fortran solver
        x_fortran = mathops.mathops.solve_tridiagonal(a, b, c, d)
        
        # Verify with NumPy (construct full matrix)
        A_full = np.zeros((n, n))
        for i in range(n):
            A_full[i, i] = b[i]
            if i > 0:
                A_full[i, i-1] = a[i]
            if i < n-1:
                A_full[i, i+1] = c[i]
        
        x_numpy = np.linalg.solve(A_full, d)
        
        print(f"Tridiagonal system solution (n={n}):")
        print(f"Solutions match: {np.allclose(x_fortran, x_numpy)}")
        print(f"Max difference: {np.max(np.abs(x_fortran - x_numpy)):.2e}")
        
        # Performance comparison
        print("\nPerformance comparison:")
        print("Fortran tridiagonal solver:")
        %timeit mathops.mathops.solve_tridiagonal(a, b, c, d)
        
        print("NumPy general solver:")
        %timeit np.linalg.solve(A_full, d)
        
        # Show solution sample
        print(f"\nSample solution values: {x_fortran[:5]}")
        
    except Exception as e:
        print(f"Error in tridiagonal solver test: {e}")
else:
    print("Fortran not available, skipping tridiagonal solver exercise")

Tridiagonal system solution (n=1000):
Solutions match: True
Max difference: 8.73e-11

Performance comparison:
Fortran tridiagonal solver:
7.03 μs ± 477 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
NumPy general solver:
14 ms ± 380 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sample solution values: [ 500.  999. 1497. 1994. 2490.]


In [9]:
import os
from importlib.machinery import EXTENSION_SUFFIXES

# Clean up generated files.
# The compiled module's file name depends on the Python version and platform,
# so candidate names are built from the extension suffixes this interpreter
# recognises instead of one hard-coded name.
cleanup_files = ['mathops.f90'] + [f'mathops{suffix}' for suffix in EXTENSION_SUFFIXES]
for filename in cleanup_files:
    try:
        os.remove(filename)
    except FileNotFoundError:
        pass  # Nothing was generated under this name
    except OSError as e:
        # Best-effort cleanup: report the problem but keep going
        print(f"Could not remove {filename}: {e}")