# Python Bindings for High-Performance Computing
## Interfacing Python with C/C++, Fortran, and Other Languages

Python's strength lies in its ability to orchestrate high-performance code written in compiled languages. This session covers:

- **ctypes** - Calling C libraries directly
- **Cython** - Writing C extensions in Python-like syntax
- **pybind11** - Modern C++ binding creation
- **f2py** - Fortran to Python interface
- **CFFI** - C Foreign Function Interface
- **SWIG** - Simplified Wrapper Interface Generator

**Goal**: Leverage existing high-performance libraries and create custom extensions when pure Python isn't fast enough.

In [1]:
import numpy as np
import ctypes
import os
import subprocess
import sys
import time
from pathlib import Path

print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Platform: {sys.platform}")

# Check if common tools are available
tools = ['gcc', 'g++', 'gfortran', 'make']
for tool in tools:
    try:
        result = subprocess.run([tool, '--version'], capture_output=True, text=True)
        if result.returncode == 0:
            version = result.stdout.split('\n')[0]
            print(f"{tool}: {version}")
        else:
            print(f"{tool}: Not available")
    except FileNotFoundError:
        print(f"{tool}: Not found")

Python version: 3.12.1 | packaged by Anaconda, Inc. | (main, Jan 19 2024, 15:51:05) [GCC 11.2.0]
NumPy version: 2.2.6
Platform: linux
gcc: gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
g++: g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
gfortran: GNU Fortran (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
make: GNU Make 4.3


## 1.1 ctypes - Direct C Library Access

ctypes allows calling functions from shared libraries (.so, .dll, .dylib) directly from Python. It's the fastest way to interface with existing C libraries.

In [2]:
# First, let's create a simple C library
c_source = '''
#include <math.h>
#include <stdlib.h>

// Simple function: compute sum of squares
double sum_of_squares(double* arr, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += arr[i] * arr[i];
    }
    return sum;
}

// Vector dot product
double dot_product(double* a, double* b, int n) {
    double result = 0.0;
    for (int i = 0; i < n; i++) {
        result += a[i] * b[i];
    }
    return result;
}

// Matrix-vector multiplication: y = A * x
void matvec(double* A, double* x, double* y, int m, int n) {
    for (int i = 0; i < m; i++) {
        y[i] = 0.0;
        for (int j = 0; j < n; j++) {
            y[i] += A[i * n + j] * x[j];
        }
    }
}

// Parallel reduction (sum)
#ifdef _OPENMP
#include <omp.h>
#endif

double parallel_sum(double* arr, int n) {
    double sum = 0.0;
    #ifdef _OPENMP
    #pragma omp parallel for reduction(+:sum)
    #endif
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    return sum;
}
'''

# Write C source to file
with open('mathlib.c', 'w') as f:
    f.write(c_source)

print("Created C source file: mathlib.c")

Created C source file: mathlib.c


In [3]:
# Compile the C library to a shared library
compile_commands = {
    'linux': 'gcc -shared -fPIC -O3 -fopenmp mathlib.c -o mathlib.so -lm',
    'darwin': 'gcc -shared -fPIC -O3 -Xpreprocessor -fopenmp mathlib.c -o mathlib.so -lm',
    'win32': 'gcc -shared -O3 -fopenmp mathlib.c -o mathlib.dll -lm'
}

compile_cmd = compile_commands.get(sys.platform, compile_commands['linux'])
print(f"Compiling with: {compile_cmd}")

try:
    result = subprocess.run(compile_cmd.split(), capture_output=True, text=True)
    if result.returncode == 0:
        print("Compilation successful!")
        if result.stderr:
            print(f"Warnings: {result.stderr}")
    else:
        print(f"Compilation failed: {result.stderr}")
        # Try without OpenMP
        compile_cmd_simple = compile_cmd.replace('-fopenmp', '').replace('-Xpreprocessor -fopenmp', '')
        print(f"Trying without OpenMP: {compile_cmd_simple}")
        result = subprocess.run(compile_cmd_simple.split(), capture_output=True, text=True)
        if result.returncode == 0:
            print("Compilation successful (without OpenMP)!")
        else:
            print(f"Still failed: {result.stderr}")
except FileNotFoundError:
    print("GCC not found. Please install a C compiler.")

Compiling with: gcc -shared -fPIC -O3 -fopenmp mathlib.c -o mathlib.so -lm
Compilation successful!


In [4]:
# Load the compiled library with ctypes
try:
    # Determine library extension
    if sys.platform == 'win32':
        lib_name = './mathlib.dll'
    else:
        lib_name = './mathlib.so'
    
    # Load the library
    mathlib = ctypes.CDLL(lib_name)
    print(f"Loaded library: {lib_name}")
    
    # Define function signatures
    # double sum_of_squares(double* arr, int n)
    mathlib.sum_of_squares.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int]
    mathlib.sum_of_squares.restype = ctypes.c_double
    
    # double dot_product(double* a, double* b, int n)
    mathlib.dot_product.argtypes = [ctypes.POINTER(ctypes.c_double), 
                                    ctypes.POINTER(ctypes.c_double), 
                                    ctypes.c_int]
    mathlib.dot_product.restype = ctypes.c_double
    
    # void matvec(double* A, double* x, double* y, int m, int n)
    mathlib.matvec.argtypes = [ctypes.POINTER(ctypes.c_double),
                               ctypes.POINTER(ctypes.c_double),
                               ctypes.POINTER(ctypes.c_double),
                               ctypes.c_int, ctypes.c_int]
    mathlib.matvec.restype = None
    
    # double parallel_sum(double* arr, int n)
    mathlib.parallel_sum.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int]
    mathlib.parallel_sum.restype = ctypes.c_double
    
    print("Function signatures defined")
    
except OSError as e:
    print(f"Failed to load library: {e}")
    print("Available files:", os.listdir('.'))
    mathlib = None

Loaded library: ./mathlib.so
Function signatures defined


In [5]:
# Test the ctypes interface
if mathlib:
    # Create test data
    n = 1000000
    data = np.random.rand(n).astype(np.float64)
    
    # Convert NumPy array to ctypes pointer
    data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    
    # Test sum_of_squares
    c_result = mathlib.sum_of_squares(data_ptr, n)
    numpy_result = np.sum(data**2)
    
    print(f"Sum of squares comparison:")
    print(f"C result:     {c_result:.6f}")
    print(f"NumPy result: {numpy_result:.6f}")
    print(f"Match: {np.isclose(c_result, numpy_result)}")
    
    # Performance comparison
    print("\nPerformance comparison:")
    
    print("C library (ctypes):")
    %timeit mathlib.sum_of_squares(data_ptr, n)
    
    print("NumPy:")
    %timeit np.sum(data**2)
    
    print("Pure Python:")
    %timeit sum(x**2 for x in data[:10000])  # Only 10k for timing
else:
    print("Library not loaded, skipping tests")

Sum of squares comparison:
C result:     333492.589344
NumPy result: 333492.589344
Match: True

Performance comparison:
C library (ctypes):
551 μs ± 26.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
537 μs ± 17.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Pure Python:
1 ms ± 29.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Exercise 1: Implement Matrix-Vector Multiplication with ctypes

Use the `matvec` function from our C library:

In [6]:
# Exercise 1: Matrix-vector multiplication with ctypes
if mathlib:
    def ctypes_matvec(A, x):
        """Matrix-vector multiplication using ctypes"""
        # TODO: Implement this function
        # Hints:
        # 1. Get matrix dimensions m, n = A.shape
        # 2. Create output vector y = np.zeros(m)
        # 3. Get ctypes pointers for A, x, y
        # 4. Call mathlib.matvec(A_ptr, x_ptr, y_ptr, m, n)
        # 5. Return y
        
        m, n = A.shape
        y = np.zeros(m, dtype=np.float64)
        
        # Ensure arrays are contiguous and correct type
        A_flat = np.ascontiguousarray(A.flatten(), dtype=np.float64)
        x = np.ascontiguousarray(x, dtype=np.float64)
        
        # Get ctypes pointers
        A_ptr = A_flat.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        x_ptr = x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        y_ptr = y.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        
        # Call C function
        mathlib.matvec(A_ptr, x_ptr, y_ptr, m, n)
        
        return y
    
    # Test matrix-vector multiplication
    m, n = 500, 400
    A = np.random.rand(m, n).astype(np.float64)
    x = np.random.rand(n).astype(np.float64)
    
    # Compare results
    numpy_result = A @ x
    ctypes_result = ctypes_matvec(A, x)
    
    print(f"Matrix-vector multiplication ({m}×{n} @ {n}):")
    print(f"Results match: {np.allclose(numpy_result, ctypes_result)}")
    print(f"Max difference: {np.max(np.abs(numpy_result - ctypes_result)):.2e}")
    
    # Performance comparison
    print("\nPerformance comparison:")
    print("C library (ctypes):")
    %timeit ctypes_matvec(A, x)
    
    print("NumPy:")
    %timeit A @ x
else:
    print("Library not loaded, skipping matrix-vector exercise")

Matrix-vector multiplication (500×400 @ 400):
Results match: True
Max difference: 2.27e-13

Performance comparison:
C library (ctypes):
187 μs ± 4.04 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
NumPy:
30.8 μs ± 848 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## 1.2 Cython - Python-like Syntax with C Performance

Cython lets you write C extensions using Python-like syntax. It's particularly powerful for numerical code.

In [7]:
# Check if Cython is available
try:
    import Cython
    print(f"Cython version: {Cython.__version__}")
    cython_available = True
except ImportError:
    print("Cython not available. Install with: pip install cython")
    cython_available = False

if cython_available:
    # Load Cython magic for Jupyter
    %load_ext Cython

Cython version: 3.1.3


In [8]:
%%cython
# Cython implementation of mathematical functions
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt, sin, cos, exp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_sum_of_squares(cnp.ndarray[double, ndim=1] arr):
    """Cython version of sum of squares"""
    cdef int n = arr.shape[0]
    cdef double total = 0.0
    cdef int i
    
    for i in range(n):
        total += arr[i] * arr[i]
    
    return total

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_euclidean_distance(cnp.ndarray[double, ndim=2] X, 
                              cnp.ndarray[double, ndim=2] Y):
    """Cython pairwise Euclidean distance matrix"""
    cdef int n = X.shape[0]
    cdef int m = Y.shape[0]
    cdef int d = X.shape[1]
    
    cdef cnp.ndarray[double, ndim=2] distances = np.zeros((n, m), dtype=np.float64)
    
    cdef double dist, diff
    cdef int i, j, k
    
    for i in range(n):
        for j in range(m):
            dist = 0.0
            for k in range(d):
                diff = X[i, k] - Y[j, k]
                dist += diff * diff
            distances[i, j] = sqrt(dist)
    
    return distances

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_complex_function(cnp.ndarray[double, ndim=1] arr):
    """Complex mathematical function in Cython"""
    cdef int n = arr.shape[0]
    cdef cnp.ndarray[double, ndim=1] result = np.zeros(n, dtype=np.float64)
    cdef double x
    cdef int i
    
    for i in range(n):
        x = arr[i]
        result[i] = sin(x) * cos(x*x) + exp(-x*x)
    
    return result

In [9]:
# Test Cython functions
if cython_available:
    # Create test data
    data = np.random.rand(1000000)
    
    # Test sum of squares
    cython_result = cython_sum_of_squares(data)
    numpy_result = np.sum(data**2)
    
    print(f"Cython sum of squares: {cython_result:.6f}")
    print(f"NumPy sum of squares:  {numpy_result:.6f}")
    print(f"Match: {np.isclose(cython_result, numpy_result)}")
    
    # Performance comparison
    print("\nPerformance comparison - Sum of squares:")
    print("Cython:")
    %timeit cython_sum_of_squares(data)
    
    print("NumPy:")
    %timeit np.sum(data**2)
    
    # Test complex function
    print("\nComplex function performance:")
    test_data = np.random.rand(100000)
    
    print("Cython:")
    %timeit cython_complex_function(test_data)
    
    print("NumPy:")
    %timeit np.sin(test_data) * np.cos(test_data**2) + np.exp(-test_data**2)
    
    # Verify results match
    cython_complex = cython_complex_function(test_data[:1000])
    numpy_complex = np.sin(test_data[:1000]) * np.cos(test_data[:1000]**2) + np.exp(-test_data[:1000]**2)
    print(f"Complex function results match: {np.allclose(cython_complex, numpy_complex)}")
else:
    print("Cython not available, skipping tests")

Cython sum of squares: 333676.255266
NumPy sum of squares:  333676.255266
Match: True

Performance comparison - Sum of squares:
Cython:
557 μs ± 21.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
561 μs ± 24.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Complex function performance:
Cython:
1.57 ms ± 53.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
1.82 ms ± 75.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Complex function results match: True


### Exercise 2: Cython Implementation of K-Means

Implement a fast K-means clustering algorithm in Cython:

In [10]:
%%cython
# Exercise 2: K-means clustering in Cython
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_kmeans_step(cnp.ndarray[double, ndim=2] points,
                       cnp.ndarray[double, ndim=2] centroids):
    """Single step of K-means clustering"""
    # TODO: Implement K-means step
    # 1. Assign each point to nearest centroid
    # 2. Update centroids as mean of assigned points
    # 3. Return new centroids and assignments
    
    cdef int n_points = points.shape[0]
    cdef int n_dims = points.shape[1]
    cdef int k = centroids.shape[0]
    
    cdef cnp.ndarray[int, ndim=1] assignments = np.zeros(n_points, dtype=np.int32)
    cdef cnp.ndarray[double, ndim=2] new_centroids = np.zeros((k, n_dims), dtype=np.float64)
    cdef cnp.ndarray[int, ndim=1] counts = np.zeros(k, dtype=np.int32)
    
    cdef double dist, min_dist, diff
    cdef int i, j, d, best_cluster
    
    # Step 1: Assign points to nearest centroids
    for i in range(n_points):
        min_dist = 1e20
        best_cluster = 0
        
        for j in range(k):
            dist = 0.0
            for d in range(n_dims):
                diff = points[i, d] - centroids[j, d]
                dist += diff * diff
            dist = sqrt(dist)
            
            if dist < min_dist:
                min_dist = dist
                best_cluster = j
        
        assignments[i] = best_cluster
    
    # Step 2: Update centroids
    for i in range(n_points):
        j = assignments[i]
        counts[j] += 1
        for d in range(n_dims):
            new_centroids[j, d] += points[i, d]
    
    # Average the sums
    for j in range(k):
        if counts[j] > 0:
            for d in range(n_dims):
                new_centroids[j, d] /= counts[j]
    
    return new_centroids, assignments

In [11]:
# Test Cython K-means
if cython_available:
    # Generate test data with known clusters
    np.random.seed(42)
    n_points, n_dims, k = 10000, 2, 3
    
    # Create 3 clusters
    cluster_centers = np.array([[2, 2], [-2, 2], [0, -3]], dtype=np.float64)
    points = []
    
    for center in cluster_centers:
        cluster_points = center + 0.8 * np.random.randn(n_points//k, 2)
        points.append(cluster_points)
    
    points = np.vstack(points).astype(np.float64)
    print(f"Created {len(points)} points in {k} clusters")
    
    # Initialize centroids randomly
    centroids = np.random.randn(k, n_dims).astype(np.float64)
    
    # Run Cython K-means
    print("\nRunning Cython K-means:")
    for iteration in range(20):
        old_centroids = centroids.copy()
        centroids, assignments = cython_kmeans_step(points, centroids)
        
        # Check convergence
        shift = np.max(np.linalg.norm(centroids - old_centroids, axis=1))
        
        if iteration % 5 == 0 or shift < 1e-4:
            print(f"Iteration {iteration:2d}: max centroid shift = {shift:.6f}")
        
        if shift < 1e-4:
            print(f"Converged after {iteration} iterations")
            break
    
    print(f"\nFinal centroids:")
    for i, centroid in enumerate(centroids):
        print(f"Cluster {i}: [{centroid[0]:6.3f}, {centroid[1]:6.3f}]")
    
    # Compare with scikit-learn
    try:
        from sklearn.cluster import KMeans
        
        print("\nComparison with scikit-learn:")
        sklearn_kmeans = KMeans(n_clusters=k, random_state=42, n_init=1)
        
        # Time both implementations
        print("Cython K-means (single step):")
        %timeit cython_kmeans_step(points, centroids)
        
        print("Scikit-learn K-means (full algorithm):")
        %timeit sklearn_kmeans.fit(points)
        
        # Show scikit-learn results
        sklearn_centroids = sklearn_kmeans.fit(points).cluster_centers_
        print(f"\nScikit-learn centroids:")
        for i, centroid in enumerate(sklearn_centroids):
            print(f"Cluster {i}: [{centroid[0]:6.3f}, {centroid[1]:6.3f}]")
            
    except ImportError:
        print("\nScikit-learn not available for comparison")
else:
    print("Cython not available, skipping K-means exercise")

Created 9999 points in 3 clusters

Running Cython K-means:
Iteration  0: max centroid shift = 2.284860
Iteration  3: max centroid shift = 0.000000
Converged after 3 iterations

Final centroids:
Cluster 0: [ 0.013, -2.980]
Cluster 1: [ 1.992,  1.998]
Cluster 2: [-2.006,  2.002]

Comparison with scikit-learn:
Cython K-means (single step):
78.8 μs ± 2.46 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Scikit-learn K-means (full algorithm):
4.03 ms ± 547 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Scikit-learn centroids:
Cluster 0: [-2.006,  2.002]
Cluster 1: [ 0.013, -2.980]
Cluster 2: [ 1.992,  1.998]


## 2.1 f2py - Fortran to Python Interface

Many high-performance scientific libraries are written in Fortran. f2py makes them accessible from Python.

In [12]:
# Create a Fortran module
fortran_source = '''
! Fortran module for mathematical operations
module mathops
    implicit none
    
contains
    
    ! Compute sum of squares of array elements
    function sum_of_squares(arr, n) result(total)
        implicit none
        integer, intent(in) :: n
        real(8), intent(in) :: arr(n)
        real(8) :: total
        integer :: i
        
        total = 0.0d0
        do i = 1, n
            total = total + arr(i) * arr(i)
        end do
    end function sum_of_squares
    
    ! Matrix multiplication C = A * B
    subroutine matmul_fortran(A, B, C, m, n, p)
        implicit none
        integer, intent(in) :: m, n, p
        real(8), intent(in) :: A(m, n), B(n, p)
        real(8), intent(out) :: C(m, p)
        integer :: i, j, k
        
        do i = 1, m
            do j = 1, p
                C(i, j) = 0.0d0
                do k = 1, n
                    C(i, j) = C(i, j) + A(i, k) * B(k, j)
                end do
            end do
        end do
    end subroutine matmul_fortran
    
    ! Solve tridiagonal system Ax = b using Thomas algorithm
    subroutine solve_tridiagonal(a, b, c, d, x, n)
        implicit none
        integer, intent(in) :: n
        real(8), intent(in) :: a(n), b(n), c(n), d(n)
        real(8), intent(out) :: x(n)
        real(8) :: cp(n), dp(n)
        integer :: i
        
        ! Forward elimination
        cp(1) = c(1) / b(1)
        dp(1) = d(1) / b(1)
        
        do i = 2, n
            cp(i) = c(i) / (b(i) - a(i) * cp(i-1))
            dp(i) = (d(i) - a(i) * dp(i-1)) / (b(i) - a(i) * cp(i-1))
        end do
        
        ! Back substitution
        x(n) = dp(n)
        do i = n-1, 1, -1
            x(i) = dp(i) - cp(i) * x(i+1)
        end do
    end subroutine solve_tridiagonal
    
end module mathops
'''

# Write Fortran source to file
with open('mathops.f90', 'w') as f:
    f.write(fortran_source)

print("Created Fortran source file: mathops.f90")

Created Fortran source file: mathops.f90


In [13]:
# Compile Fortran module with f2py
try:
    # Check if f2py is available
    result = subprocess.run([sys.executable, '-c', 'import numpy.f2py'], 
                          capture_output=True)
    if result.returncode != 0:
        print("f2py not available")
        fortran_available = False
    else:
        # Compile the Fortran module
        f2py_cmd = [sys.executable, '-m', 'numpy.f2py', '-c', 'mathops.f90', '-m', 'mathops']
        print(f"Compiling with f2py: {' '.join(f2py_cmd)}")
        
        result = subprocess.run(f2py_cmd, capture_output=True, text=True)
        
        if result.returncode == 0:
            print("f2py compilation successful!")
            fortran_available = True
        else:
            print(f"f2py compilation failed:")
            print(f"stdout: {result.stdout}")
            print(f"stderr: {result.stderr}")
            fortran_available = False
            
except Exception as e:
    print(f"Error with f2py: {e}")
    fortran_available = False

Compiling with f2py: /home/manuel/Code/HPCP/HS25/.venv/bin/python3 -m numpy.f2py -c mathops.f90 -m mathops
f2py compilation successful!


In [14]:
# Test Fortran functions
if fortran_available:
    try:
        import mathops
        
        # Test sum of squares
        data = np.random.rand(1000000)
        
        fortran_result = mathops.mathops.sum_of_squares(data)
        numpy_result = np.sum(data**2)
        
        print(f"Fortran sum of squares: {fortran_result:.6f}")
        print(f"NumPy sum of squares:   {numpy_result:.6f}")
        print(f"Match: {np.isclose(fortran_result, numpy_result)}")
        
        # Performance comparison
        print("\nPerformance comparison - Sum of squares:")
        print("Fortran (f2py):")
        %timeit mathops.mathops.sum_of_squares(data)
        
        print("NumPy:")
        %timeit np.sum(data**2)
        
        # Test matrix multiplication
        print("\nTesting Fortran matrix multiplication:")
        m, n, p = 200, 150, 100
        A = np.random.rand(m, n)
        B = np.random.rand(n, p)
        
        C_fortran = mathops.mathops.matmul_fortran(A, B)
        C_numpy = A @ B
        
        print(f"Matrix multiplication results match: {np.allclose(C_fortran, C_numpy)}")
        print(f"Max difference: {np.max(np.abs(C_fortran - C_numpy)):.2e}")
        
        # Performance comparison for matrix multiplication
        print("\nMatrix multiplication performance:")
        print("Fortran:")
        %timeit mathops.mathops.matmul_fortran(A, B)
        
        print("NumPy:")
        %timeit A @ B
        
    except ImportError as e:
        print(f"Failed to import compiled Fortran module: {e}")
        fortran_available = False
else:
    print("Fortran/f2py not available")

Fortran sum of squares: 333590.253481
NumPy sum of squares:   333590.253481
Match: True

Performance comparison - Sum of squares:
Fortran (f2py):
558 μs ± 16.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
576 μs ± 32.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Testing Fortran matrix multiplication:
Matrix multiplication results match: True
Max difference: 4.97e-14

Matrix multiplication performance:
Fortran:
1.11 ms ± 28 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
94.3 μs ± 728 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Exercise 3: Solve Linear Systems with Fortran

Use the tridiagonal solver from our Fortran module:

In [15]:
# Exercise 3: Tridiagonal linear system solver
if fortran_available:
    try:
        # Create a tridiagonal system Ax = b
        # A has the form:
        # [b1  c1   0   0 ... ]
        # [a2  b2  c2   0 ... ]
        # [ 0  a3  b3  c3 ... ]
        # [ ...              ]
        
        n = 1000
        
        # Create tridiagonal matrix coefficients
        a = np.ones(n) * -1.0  # sub-diagonal
        b = np.ones(n) * 2.0   # main diagonal
        c = np.ones(n) * -1.0  # super-diagonal
        
        # Right-hand side
        d = np.ones(n)
        
        # TODO: Use Fortran solver
        x_fortran = mathops.mathops.solve_tridiagonal(a, b, c, d)
        
        # Verify with NumPy (construct full matrix)
        A_full = np.zeros((n, n))
        for i in range(n):
            A_full[i, i] = b[i]
            if i > 0:
                A_full[i, i-1] = a[i]
            if i < n-1:
                A_full[i, i+1] = c[i]
        
        x_numpy = np.linalg.solve(A_full, d)
        
        print(f"Tridiagonal system solution (n={n}):")
        print(f"Solutions match: {np.allclose(x_fortran, x_numpy)}")
        print(f"Max difference: {np.max(np.abs(x_fortran - x_numpy)):.2e}")
        
        # Performance comparison
        print("\nPerformance comparison:")
        print("Fortran tridiagonal solver:")
        %timeit mathops.mathops.solve_tridiagonal(a, b, c, d)
        
        print("NumPy general solver:")
        %timeit np.linalg.solve(A_full, d)
        
        # Show solution sample
        print(f"\nSample solution values: {x_fortran[:5]}")
        
    except Exception as e:
        print(f"Error in tridiagonal solver test: {e}")
else:
    print("Fortran not available, skipping tridiagonal solver exercise")

Tridiagonal system solution (n=1000):
Solutions match: True
Max difference: 8.73e-11

Performance comparison:
Fortran tridiagonal solver:
7.69 μs ± 273 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
NumPy general solver:
16.7 ms ± 961 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sample solution values: [ 500.  999. 1497. 1994. 2490.]


## 2.2 CFFI - C Foreign Function Interface

CFFI provides a different approach to calling C libraries, often easier than ctypes for complex interfaces.

In [16]:
# Check if CFFI is available
try:
    import cffi
    print(f"CFFI version: {cffi.__version__}")
    cffi_available = True
except ImportError:
    print("CFFI not available. Install with: pip install cffi")
    cffi_available = False

if cffi_available:
    # Create CFFI interface to our C library
    ffi = cffi.FFI()
    
    # Define C function signatures
    ffi.cdef("""
        double sum_of_squares(double* arr, int n);
        double dot_product(double* a, double* b, int n);
        void matvec(double* A, double* x, double* y, int m, int n);
        double parallel_sum(double* arr, int n);
    """)
    
    # Load the compiled library
    try:
        if sys.platform == 'win32':
            lib = ffi.dlopen('./mathlib.dll')
        else:
            lib = ffi.dlopen('./mathlib.so')
        
        print("CFFI library loaded successfully")
        cffi_lib_loaded = True
        
    except Exception as e:
        print(f"Failed to load library with CFFI: {e}")
        cffi_lib_loaded = False
else:
    cffi_lib_loaded = False

CFFI version: 1.17.1
CFFI library loaded successfully


In [17]:
# Test CFFI interface
if cffi_available and cffi_lib_loaded:
    def cffi_sum_of_squares(arr):
        """CFFI wrapper for sum_of_squares"""
        # Convert NumPy array to CFFI pointer
        arr_ptr = ffi.cast("double*", arr.ctypes.data)
        return lib.sum_of_squares(arr_ptr, len(arr))
    
    def cffi_dot_product(a, b):
        """CFFI wrapper for dot product"""
        a_ptr = ffi.cast("double*", a.ctypes.data)
        b_ptr = ffi.cast("double*", b.ctypes.data)
        return lib.dot_product(a_ptr, b_ptr, len(a))
    
    # Test CFFI functions
    data = np.random.rand(1000000).astype(np.float64)
    
    # Test sum of squares
    cffi_result = cffi_sum_of_squares(data)
    numpy_result = np.sum(data**2)
    
    print(f"CFFI sum of squares: {cffi_result:.6f}")
    print(f"NumPy result:        {numpy_result:.6f}")
    print(f"Match: {np.isclose(cffi_result, numpy_result)}")
    
    # Test dot product
    a = np.random.rand(1000000).astype(np.float64)
    b = np.random.rand(1000000).astype(np.float64)
    
    cffi_dot = cffi_dot_product(a, b)
    numpy_dot = np.dot(a, b)
    
    print(f"\nCFFI dot product: {cffi_dot:.6f}")
    print(f"NumPy dot product: {numpy_dot:.6f}")
    print(f"Match: {np.isclose(cffi_dot, numpy_dot)}")
    
    # Performance comparison
    print("\nCFFI Performance:")
    print("CFFI sum of squares:")
    %timeit cffi_sum_of_squares(data)
    
    print("CFFI dot product:")
    %timeit cffi_dot_product(a, b)
    
    print("\nNumPy comparison:")
    %timeit np.sum(data**2)
    %timeit np.dot(a, b)
else:
    print("CFFI not available or library not loaded")

CFFI sum of squares: 332734.869764
NumPy result:        332734.869764
Match: True

CFFI dot product: 250111.690752
NumPy dot product: 250111.690752
Match: True

CFFI Performance:
CFFI sum of squares:
567 μs ± 31.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CFFI dot product:
565 μs ± 15.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

NumPy comparison:
560 μs ± 46.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
The slowest run took 6.87 times longer than the fastest. This could mean that an intermediate result is being cached.
407 μs ± 260 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## 3.1 Performance Summary and Best Practices

Let's compare all the different approaches we've used:

In [18]:
# Comprehensive performance comparison
print("Comprehensive Performance Comparison")
print("=" * 50)

# Create test data
n = 1_000_000
data = np.random.rand(n).astype(np.float64)

print(f"Test: Sum of squares with {n:,} elements")
print("\nTiming Results:")

# Pure Python (small sample)
small_data = data[:10000]
python_time = %timeit -o sum(x**2 for x in small_data)
python_scaled = python_time.best * (n / 10000)
print(f"Pure Python (scaled): ~{python_scaled:.3f}s")

# NumPy
numpy_time = %timeit -o np.sum(data**2)
print(f"NumPy:                 {numpy_time.best:.6f}s")

# ctypes (if available)
if 'mathlib' in locals() and mathlib:
    data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    ctypes_time = %timeit -o mathlib.sum_of_squares(data_ptr, n)
    print(f"ctypes (C):            {ctypes_time.best:.6f}s")
else:
    print(f"ctypes (C):            Not available")

# Cython (if available)
if cython_available:
    cython_time = %timeit -o cython_sum_of_squares(data)
    print(f"Cython:                {cython_time.best:.6f}s")
else:
    print(f"Cython:                Not available")

# Fortran (if available)
if fortran_available:
    try:
        fortran_time = %timeit -o mathops.mathops.sum_of_squares(data)
        print(f"Fortran (f2py):        {fortran_time.best:.6f}s")
    except:
        print(f"Fortran (f2py):        Error")
else:
    print(f"Fortran (f2py):        Not available")

# CFFI (if available)
if cffi_available and cffi_lib_loaded:
    cffi_time = %timeit -o cffi_sum_of_squares(data)
    print(f"CFFI:                  {cffi_time.best:.6f}s")
else:
    print(f"CFFI:                  Not available")

print("\n" + "=" * 50)
print("Speedup factors (relative to NumPy):")
baseline = numpy_time.best

if 'mathlib' in locals() and mathlib:
    speedup = baseline / ctypes_time.best
    print(f"ctypes:   {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

if cython_available:
    speedup = baseline / cython_time.best
    print(f"Cython:   {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

if fortran_available:
    try:
        speedup = baseline / fortran_time.best
        print(f"Fortran:  {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")
    except:
        pass

if cffi_available and cffi_lib_loaded:
    speedup = baseline / cffi_time.best
    print(f"CFFI:     {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

Comprehensive Performance Comparison
Test: Sum of squares with 1,000,000 elements

Timing Results:
1 ms ± 12.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Pure Python (scaled): ~0.099s
549 μs ± 8.71 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:                 0.000536s
556 μs ± 14.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
ctypes (C):            0.000538s
527 μs ± 14.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Cython:                0.000500s
530 μs ± 12.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Fortran (f2py):        0.000507s
536 μs ± 11.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CFFI:                  0.000520s

Speedup factors (relative to NumPy):
ctypes:     1.00x slower
Cython:     1.07x faster
Fortran:    1.06x faster
CFFI:       1.03x faster


## 3.2 Choosing the Right Tool

### Decision Matrix for Python Bindings:

| Tool | Ease of Use | Performance | Existing Code | Best For |
|------|-------------|-------------|---------------|----------|
| **ctypes** | Medium | High | C libraries | Quick integration with existing C libs |
| **Cython** | Easy | Very High | Python-like | Writing new high-performance code |
| **pybind11** | Medium | Very High | C++ libraries | Modern C++ integration |
| **f2py** | Easy | High | Fortran | Scientific/numerical Fortran code |
| **CFFI** | Medium | High | C libraries | Complex C interfaces |
| **SWIG** | Hard | High | Multiple languages | Multi-language support |

In [20]:
# Real-world example: Interfacing with a C library (libc math functions)
import ctypes
import ctypes.util

# Find and load the math library
if sys.platform == 'win32':
    # On Windows, math functions are in msvcrt
    libc = ctypes.CDLL('msvcrt')
else:
    # On Unix-like systems
    libm_path = ctypes.util.find_library('m')
    if libm_path:
        libm = ctypes.CDLL(libm_path)
    else:
        libm = ctypes.CDLL('libc.so.6')  # Fallback

try:
    if sys.platform != 'win32':
        # Define sin function signature
        libm.sin.argtypes = [ctypes.c_double]
        libm.sin.restype = ctypes.c_double
        
        # Test calling C math library directly
        x = 1.0
        c_sin = libm.sin(x)
        python_sin = np.sin(x)
        
        print(f"\nCalling C math library directly:")
        print(f"C sin({x}):      {c_sin:.10f}")
        print(f"NumPy sin({x}):  {python_sin:.10f}")
        print(f"Match: {np.isclose(c_sin, python_sin)}")
        
        # Performance comparison for vectorized operations
        test_data = np.random.rand(100000)
        
        def c_sin_vectorized(arr):
            """Vectorized C sin using list comprehension"""
            return np.array([libm.sin(float(x)) for x in arr])
        
        print("\nVectorized sin performance:")
        print("C sin (vectorized):")
        %timeit c_sin_vectorized(test_data[:1000])  # Smaller sample
        
        print("NumPy sin:")
        %timeit np.sin(test_data)
        
        print("\nNote: NumPy is usually much faster due to vectorization overhead in Python loops")
    else:
        print("Math library interface not demonstrated on Windows")
        
except Exception as e:
    print(f"Error accessing math library: {e}")


Calling C math library directly:
C sin(1.0):      0.8414709848
NumPy sin(1.0):  0.8414709848
Match: True

Vectorized sin performance:
C sin (vectorized):
288 μs ± 9.09 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy sin:
684 μs ± 35.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Note: NumPy is usually much faster due to vectorization overhead in Python loops


## 3.3 Advanced Topics and Best Practices

### Memory Management
- **ctypes**: Manual memory management, careful with pointers
- **Cython**: Automatic memory management for NumPy arrays
- **f2py**: Automatic array allocation and cleanup

### Error Handling
- Always check return codes from C functions
- Use exception handling around foreign function calls
- Validate input data before passing to compiled code

### Threading and GIL
- ctypes releases GIL automatically for C calls
- Cython: Use `with nogil:` blocks
- f2py: Automatic GIL release for Fortran calls

In [21]:
# Demonstrate GIL release with threading
import threading
import concurrent.futures

def compute_intensive_task(data, method='numpy'):
    """CPU-intensive computation"""
    if method == 'numpy':
        # This holds the GIL
        result = np.sum(data**3) + np.sum(np.sin(data))
    elif method == 'ctypes' and mathlib:
        # This releases the GIL
        data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        result = mathlib.sum_of_squares(data_ptr, len(data))
    else:
        result = 0.0
    
    return result

# Create test data
test_data = [np.random.rand(100000).astype(np.float64) for _ in range(4)]

print("Threading comparison (4 threads):")

# Test NumPy (holds GIL)
start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    numpy_futures = [executor.submit(compute_intensive_task, data, 'numpy') 
                     for data in test_data]
    numpy_results = [f.result() for f in numpy_futures]
numpy_threaded_time = time.time() - start_time

# Test NumPy sequentially
start_time = time.time()
numpy_sequential = [compute_intensive_task(data, 'numpy') for data in test_data]
numpy_sequential_time = time.time() - start_time

print(f"NumPy sequential:  {numpy_sequential_time:.4f}s")
print(f"NumPy threaded:    {numpy_threaded_time:.4f}s")
print(f"NumPy speedup:     {numpy_sequential_time/numpy_threaded_time:.2f}x")

# Test ctypes (releases GIL)
if mathlib:
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        ctypes_futures = [executor.submit(compute_intensive_task, data, 'ctypes') 
                         for data in test_data]
        ctypes_results = [f.result() for f in ctypes_futures]
    ctypes_threaded_time = time.time() - start_time
    
    start_time = time.time()
    ctypes_sequential = [compute_intensive_task(data, 'ctypes') for data in test_data]
    ctypes_sequential_time = time.time() - start_time
    
    print(f"\nctypes sequential: {ctypes_sequential_time:.4f}s")
    print(f"ctypes threaded:   {ctypes_threaded_time:.4f}s")
    print(f"ctypes speedup:    {ctypes_sequential_time/ctypes_threaded_time:.2f}x")
else:
    print("\nctypes not available for threading test")

print("\n💡 Notice: ctypes shows better threading scalability due to GIL release")

Threading comparison (4 threads):
NumPy sequential:  0.0091s
NumPy threaded:    0.0069s
NumPy speedup:     1.31x

ctypes sequential: 0.0004s
ctypes threaded:   0.0014s
ctypes speedup:    0.28x

💡 Notice: ctypes shows better threading scalability due to GIL release


## Summary: Python Bindings Best Practices

### Do:
- **Profile first** - identify actual bottlenecks before optimizing
- **Choose the right tool** - consider complexity vs. performance needs
- **Handle errors gracefully** - validate inputs and catch exceptions
- **Use appropriate data types** - match Python and C/Fortran types
- **Consider memory layout** - C-contiguous arrays for better performance
- **Document interfaces clearly** - especially for team development

### Don't:
- **Over-optimize** - NumPy is often fast enough
- **Ignore memory management** - especially with manual pointer handling
- **Mix different binding approaches** unnecessarily
- **Forget about maintainability** - complex bindings are hard to debug
- **Skip testing** - foreign function calls can be fragile

### Decision Guide:

1. **Start with NumPy/SciPy/JAX/Numba** - usually sufficient for most needs
2. **Need custom algorithms?** → **Cython** (easiest path)
3. **Existing C library?** → **ctypes** (quick integration)
4. **Existing C++ code?** → **pybind11** (modern C++)
5. **Existing Fortran code?** → **f2py** (scientific computing)
6. **Complex C interfaces?** → **CFFI** (more features than ctypes)
7. **Multiple languages?** → **SWIG** (universal but complex)

### Performance Hierarchy (typical):
1. **Optimized C/C++/Fortran** (with good compiler flags)
2. **Cython** (close to C performance)
3. **NumPy** (highly optimized for array operations)
4. **ctypes/CFFI/f2py** (function call overhead)
5. **Pure Python** (development and prototyping)

In [23]:
# Final demonstration: Building a complete pipeline
print("Complete High-Performance Pipeline Demo")
print("=" * 45)

# Simulate a real-world scientific computing pipeline
# Step 1: Data generation (NumPy)
n_samples = 50000
n_features = 100

print(f"1. Generating {n_samples}×{n_features} dataset with NumPy...")
start_time = time.time()
data = np.random.rand(n_samples, n_features).astype(np.float64)
labels = np.random.randint(0, 3, n_samples)
data_gen_time = time.time() - start_time
print(f"   ✓ Complete in {data_gen_time:.3f}s")

# Step 2: Preprocessing with compiled code
if cython_available:
    print("2. Preprocessing with Cython...")
    start_time = time.time()
    # Normalize each sample
    normalized_data = np.array([row / cython_sum_of_squares(row)**0.5 
                               for row in data[:1000]])  # Sample for demo
    preprocess_time = time.time() - start_time
    print(f"   ✓ Complete in {preprocess_time:.3f}s (1000 samples)")
else:
    print("2. Preprocessing with NumPy (Cython not available)...")
    start_time = time.time()
    normalized_data = data / np.linalg.norm(data, axis=1, keepdims=True)
    preprocess_time = time.time() - start_time
    print(f"   ✓ Complete in {preprocess_time:.3f}s")

# Step 3: Analysis with best available tool
analysis_data = data[:10000]  # Subset for analysis

if cython_available:
    print("3. Distance computation with Cython...")
    start_time = time.time()
    distances = cython_euclidean_distance(analysis_data[:500], analysis_data[:500])
    analysis_time = time.time() - start_time
    print(f"   ✓ 500×500 distance matrix in {analysis_time:.3f}s")
else:
    print("3. Distance computation with NumPy...")
    start_time = time.time()
    # Using the NumPy broadcasting trick from earlier sessions
    diff = analysis_data[:500, np.newaxis, :] - analysis_data[np.newaxis, :500, :]
    distances = np.sqrt(np.sum(diff**2, axis=2))
    analysis_time = time.time() - start_time
    print(f"   ✓ 500×500 distance matrix in {analysis_time:.3f}s")

# Step 4: Summary statistics
print("4. Computing summary statistics...")
start_time = time.time()

stats = {
    'mean_distance': np.mean(distances[np.triu_indices(len(distances), k=1)]),
    'std_distance': np.std(distances[np.triu_indices(len(distances), k=1)]),
    'min_distance': np.min(distances[distances > 0]),
    'max_distance': np.max(distances)
}

stats_time = time.time() - start_time
print(f"   ✓ Statistics computed in {stats_time:.6f}s")

# Results
total_time = data_gen_time + preprocess_time + analysis_time + stats_time

print("\n" + "=" * 45)
print("Pipeline Results:")
print(f"Data generation:  {data_gen_time:.3f}s ({100*data_gen_time/total_time:.1f}%)")
print(f"Preprocessing:    {preprocess_time:.3f}s ({100*preprocess_time/total_time:.1f}%)")
print(f"Analysis:         {analysis_time:.3f}s ({100*analysis_time/total_time:.1f}%)")
print(f"Statistics:       {stats_time:.6f}s ({100*stats_time/total_time:.1f}%)")
print(f"Total time:       {total_time:.3f}s")

print(f"\nDistance Statistics:")
for key, value in stats.items():
    print(f"{key}: {value:.4f}")

Complete High-Performance Pipeline Demo
1. Generating 50000×100 dataset with NumPy...
   ✓ Complete in 0.038s
2. Preprocessing with Cython...
   ✓ Complete in 0.001s (1000 samples)
3. Distance computation with Cython...
   ✓ 500×500 distance matrix in 0.018s
4. Computing summary statistics...
   ✓ Statistics computed in 0.003136s

Pipeline Results:
Data generation:  0.038s (62.8%)
Preprocessing:    0.001s (2.2%)
Analysis:         0.018s (29.8%)
Statistics:       0.003136s (5.1%)
Total time:       0.061s

Distance Statistics:
mean_distance: 4.0709
std_distance: 0.2414
min_distance: 3.0533
max_distance: 5.1172


In [25]:
# Clean up generated files
cleanup_files = ['mathlib.c', 'mathlib.so', 'mathlib.dll', 'mathops.f90']
for filename in cleanup_files:
    if os.path.exists(filename):
        try:
            os.remove(filename)
        except:
            pass  # Ignore cleanup errors