![Bindings Slide](img/04_Bindings/Folie1.PNG)
![Bindings Slide](img/04_Bindings/Folie2.PNG)
![Bindings Slide](img/04_Bindings/Folie3.PNG)

In [1]:
import numpy as np
import ctypes
import os
import subprocess
import sys
import time
from pathlib import Path

# Report the interpreter, NumPy build and platform this notebook runs on.
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Platform: {sys.platform}")

# Probe the build toolchain: every tool is asked for its version banner.
tools = ['gcc', 'g++', 'gfortran', 'make']
for tool in tools:
    try:
        result = subprocess.run([tool, '--version'], capture_output=True, text=True)
    except FileNotFoundError:
        # The executable could not be started because it does not exist.
        print(f"{tool}: Not found")
        continue

    if result.returncode != 0:
        # The tool started but rejected `--version`.
        print(f"{tool}: Not available")
        continue

    # Only the first line of the banner is reported.
    version = result.stdout.split('\n')[0]
    print(f"{tool}: {version}")

Python version: 3.12.1 | packaged by Anaconda, Inc. | (main, Jan 19 2024, 15:51:05) [GCC 11.2.0]
NumPy version: 2.2.6
Platform: linux
gcc: gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
g++: g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
gfortran: GNU Fortran (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
make: GNU Make 4.3


## 1.1 ctypes - Direct C Library Access

![Bindings Slide](img/04_Bindings/Folie4.PNG)

[ctypes tutorial on python.org](https://docs.python.org/3/library/ctypes.html)

In [2]:
# The C library used by the ctypes and CFFI sections is kept in a Python
# string so the notebook is self-contained; it is written to disk below and
# compiled in the next cell.
c_source = '''
#include <math.h>
#include <stdlib.h>

// Simple function: compute sum of squares
double sum_of_squares(double* arr, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += arr[i] * arr[i];
    }
    return sum;
}

// Vector dot product
double dot_product(double* a, double* b, int n) {
    double result = 0.0;
    for (int i = 0; i < n; i++) {
        result += a[i] * b[i];
    }
    return result;
}

// Matrix-vector multiplication: y = A * x
void matvec(double* A, double* x, double* y, int m, int n) {
    for (int i = 0; i < m; i++) {
        y[i] = 0.0;
        for (int j = 0; j < n; j++) {
            y[i] += A[i * n + j] * x[j];
        }
    }
}

// Parallel reduction (sum)
#ifdef _OPENMP
#include <omp.h>
#endif

double parallel_sum(double* arr, int n) {
    double sum = 0.0;
    #ifdef _OPENMP
    #pragma omp parallel for reduction(+:sum)
    #endif
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    return sum;
}
'''

# Persist the source next to the notebook so the compiler can pick it up.
Path('mathlib.c').write_text(c_source)

print("Created C source file: mathlib.c")

Created C source file: mathlib.c


In [3]:
# Compile the C library to a shared library
compile_commands = {
    'linux': 'gcc -shared -fPIC -O3 -fopenmp mathlib.c -o mathlib.so -lm',
    'darwin': 'gcc -shared -fPIC -O3 -Xpreprocessor -fopenmp mathlib.c -o mathlib.so -lm',
    'win32': 'gcc -shared -O3 -fopenmp mathlib.c -o mathlib.dll -lm'
}

compile_cmd = compile_commands.get(sys.platform, compile_commands['linux'])
print(f"Compiling with: {compile_cmd}")

try:
    result = subprocess.run(compile_cmd.split(), capture_output=True, text=True)
    if result.returncode == 0:
        print("Compilation successful!")
        if result.stderr:
            print(f"Warnings: {result.stderr}")
    else:
        print(f"Compilation failed: {result.stderr}")
        # Try without OpenMP
        compile_cmd_simple = compile_cmd.replace('-fopenmp', '').replace('-Xpreprocessor -fopenmp', '')
        print(f"Trying without OpenMP: {compile_cmd_simple}")
        result = subprocess.run(compile_cmd_simple.split(), capture_output=True, text=True)
        if result.returncode == 0:
            print("Compilation successful (without OpenMP)!")
        else:
            print(f"Still failed: {result.stderr}")
except FileNotFoundError:
    print("GCC not found. Please install a C compiler.")

Compiling with: gcc -shared -fPIC -O3 -fopenmp mathlib.c -o mathlib.so -lm
Compilation successful!


In [4]:
# Load the compiled library with ctypes
try:
    # The shared-library suffix differs between Windows and the other platforms.
    lib_name = './mathlib.dll' if sys.platform == 'win32' else './mathlib.so'

    mathlib = ctypes.CDLL(lib_name)
    print(f"Loaded library: {lib_name}")

    # Declare the exact C prototype of every exported function:
    # name -> (argument types, return type).
    double_ptr = ctypes.POINTER(ctypes.c_double)
    signatures = {
        # double sum_of_squares(double* arr, int n)
        'sum_of_squares': ([double_ptr, ctypes.c_int], ctypes.c_double),
        # double dot_product(double* a, double* b, int n)
        'dot_product': ([double_ptr, double_ptr, ctypes.c_int], ctypes.c_double),
        # void matvec(double* A, double* x, double* y, int m, int n)
        'matvec': ([double_ptr, double_ptr, double_ptr, ctypes.c_int, ctypes.c_int], None),
        # double parallel_sum(double* arr, int n)
        'parallel_sum': ([double_ptr, ctypes.c_int], ctypes.c_double),
    }
    for func_name, (arg_types, return_type) in signatures.items():
        c_func = getattr(mathlib, func_name)
        c_func.argtypes = arg_types
        c_func.restype = return_type

    print("Function signatures defined")

except OSError as e:
    # The library could not be opened; later cells test `mathlib` for truthiness.
    print(f"Failed to load library: {e}")
    print("Available files:", os.listdir('.'))
    mathlib = None

Loaded library: ./mathlib.so
Function signatures defined


In [5]:
# Test the ctypes interface
if mathlib:
    # Create test data
    n = 1000000
    data = np.random.rand(n).astype(np.float64)
    
    # Convert NumPy array to ctypes pointer
    data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    
    # Test sum_of_squares
    c_result = mathlib.sum_of_squares(data_ptr, n)
    numpy_result = np.sum(data**2)
    
    print(f"Sum of squares comparison:")
    print(f"C result:     {c_result:.6f}")
    print(f"NumPy result: {numpy_result:.6f}")
    print(f"Match: {np.isclose(c_result, numpy_result)}")
    
    # Performance comparison
    print("\nPerformance comparison:")
    
    print("C library (ctypes):")
    %timeit mathlib.sum_of_squares(data_ptr, n)
    
    print("NumPy:")
    %timeit np.sum(data**2)
    
    print("Pure Python (1% of the data):")
    %timeit sum(x**2 for x in data[:10000])  # Only 10k for timing
else:
    print("Library not loaded, skipping tests")

Sum of squares comparison:
C result:     333090.155127
NumPy result: 333090.155127
Match: True

Performance comparison:
C library (ctypes):
448 μs ± 13 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
453 μs ± 4.58 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Pure Python (1% of the data):
841 μs ± 21.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
# Real-world ctypes example: Using system math library functions
import ctypes
import ctypes.util

try:
    if sys.platform == 'win32':
        print("Math library interface not demonstrated on Windows")
    else:
        # Ask the platform where its C math library lives.
        libm_path = ctypes.util.find_library('m')
        if not libm_path:
            print("Math library not found")
        else:
            libm = ctypes.CDLL(libm_path)
            # double sin(double)
            libm.sin.argtypes = [ctypes.c_double]
            libm.sin.restype = ctypes.c_double

            # Quick demonstration: the C and NumPy results are printed side by side.
            x = 1.0
            c_sin = libm.sin(x)
            python_sin = np.sin(x)
            print(f"C sin({x}): {c_sin:.6f}, NumPy: {python_sin:.6f}")
            print(f"Note: NumPy is usually faster for vectorized operations")

except Exception as e:
    print(f"Error accessing math library: {e}")

C sin(1.0): 0.841471, NumPy: 0.841471
Note: NumPy is usually faster for vectorized operations


## 1.2 Cython - Python-like Syntax with C Performance

![Bindings Slide](img/04_Bindings/Folie5.PNG)

[Basic Cython Tutorial](https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html)



In [7]:
# Check if Cython is available
try:
    import Cython
    print(f"Cython version: {Cython.__version__}")
    cython_available = True
except ImportError:
    print("Cython not available. Install with: pip install cython")
    cython_available = False

if cython_available:
    # Load Cython magic for Jupyter
    %load_ext Cython

Cython version: 3.1.3


In [8]:
%%cython
# Cython implementation of mathematical functions
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt, sin, cos, exp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_sum_of_squares(cnp.ndarray[double, ndim=1] arr):
    """Return the sum of the squared elements of a 1-D float64 array.

    Parameters
    ----------
    arr : numpy.ndarray of float64, ndim=1
        Input values. An empty array yields 0.0.

    Returns
    -------
    float
        ``sum(arr[i] ** 2)`` accumulated in a C double.
    """
    # Py_ssize_t can hold any array length; the shape value does not fit a
    # C `int` once the array has more than 2**31 - 1 elements.
    cdef Py_ssize_t n = arr.shape[0]
    cdef Py_ssize_t i
    cdef double total = 0.0

    for i in range(n):
        total += arr[i] * arr[i]

    return total

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_euclidean_distance(cnp.ndarray[double, ndim=2] X,
                              cnp.ndarray[double, ndim=2] Y):
    """Return the pairwise Euclidean distance matrix between rows of X and Y.

    Parameters
    ----------
    X : numpy.ndarray of float64, shape (n, d)
    Y : numpy.ndarray of float64, shape (m, d)

    Returns
    -------
    numpy.ndarray of float64, shape (n, m)
        ``distances[i, j]`` is the Euclidean distance between ``X[i]`` and ``Y[j]``.

    Raises
    ------
    ValueError
        If X and Y do not have the same number of columns.
    """
    # Py_ssize_t avoids truncating large dimensions into a C int.
    cdef Py_ssize_t n = X.shape[0]
    cdef Py_ssize_t m = Y.shape[0]
    cdef Py_ssize_t d = X.shape[1]

    # Bounds checking is disabled for this function, so a mismatched feature
    # dimension must be rejected here or the inner loop would index past the
    # end of Y's rows.
    if Y.shape[1] != d:
        raise ValueError(
            f"X and Y must have the same number of columns, got {d} and {Y.shape[1]}"
        )

    # Every element is assigned in the loop below, so no zero fill is needed.
    cdef cnp.ndarray[double, ndim=2] distances = np.empty((n, m), dtype=np.float64)

    cdef double dist, diff
    cdef Py_ssize_t i, j, k

    for i in range(n):
        for j in range(m):
            dist = 0.0
            for k in range(d):
                diff = X[i, k] - Y[j, k]
                dist += diff * diff
            distances[i, j] = sqrt(dist)

    return distances

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_complex_function(cnp.ndarray[double, ndim=1] arr):
    """Evaluate ``sin(x) * cos(x*x) + exp(-x*x)`` for every element of arr.

    Parameters
    ----------
    arr : numpy.ndarray of float64, ndim=1

    Returns
    -------
    numpy.ndarray of float64
        New array with the same length as ``arr``.
    """
    # Py_ssize_t avoids truncating the length of very large arrays into a C int.
    cdef Py_ssize_t n = arr.shape[0]
    cdef Py_ssize_t i
    cdef double x
    # Every element is assigned in the loop below, so no zero fill is needed.
    cdef cnp.ndarray[double, ndim=1] result = np.empty(n, dtype=np.float64)

    for i in range(n):
        x = arr[i]
        result[i] = sin(x) * cos(x*x) + exp(-x*x)

    return result

In [9]:
# Test Cython functions
if cython_available:
    # Create test data
    data = np.random.rand(1000000)
    
    # Test sum of squares
    cython_result = cython_sum_of_squares(data)
    numpy_result = np.sum(data**2)
    
    print(f"Cython sum of squares: {cython_result:.6f}")
    print(f"NumPy sum of squares:  {numpy_result:.6f}")
    print(f"Match: {np.isclose(cython_result, numpy_result)}")
    
    # Performance comparison
    print("\nPerformance comparison - Sum of squares:")
    print("Cython:")
    %timeit cython_sum_of_squares(data)
    
    print("NumPy:")
    %timeit np.sum(data**2)
    
    # Test complex function
    print("\nComplex function performance:")
    test_data = np.random.rand(100000)
    
    print("Cython:")
    %timeit cython_complex_function(test_data)
    
    print("NumPy:")
    %timeit np.sin(test_data) * np.cos(test_data**2) + np.exp(-test_data**2)
    
    # Verify results match
    cython_complex = cython_complex_function(test_data[:1000])
    numpy_complex = np.sin(test_data[:1000]) * np.cos(test_data[:1000]**2) + np.exp(-test_data[:1000]**2)
    print(f"Complex function results match: {np.allclose(cython_complex, numpy_complex)}")
else:
    print("Cython not available, skipping tests")

Cython sum of squares: 332822.207766
NumPy sum of squares:  332822.207766
Match: True

Performance comparison - Sum of squares:
Cython:
460 μs ± 36 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
519 μs ± 20.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Complex function performance:
Cython:
1.38 ms ± 41.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:
1.77 ms ± 174 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Complex function results match: True


## 1.3 CFFI - C Foreign Function Interface

![Bindings Slide](img/04_Bindings/Folie6.PNG)

[CFFI Tutorial](https://cffi.readthedocs.io/en/latest/overview.html)

In [10]:
# Check if CFFI is available
try:
    import cffi
except ImportError:
    cffi_available = False
    print("CFFI not available. Install with: pip install cffi")
else:
    cffi_available = True
    print(f"CFFI version: {cffi.__version__}")

# Stays False unless the shared library is opened successfully below.
cffi_lib_loaded = False

if cffi_available:
    # Create CFFI interface to our C library
    ffi = cffi.FFI()

    # CFFI takes the prototypes as plain C declarations.
    ffi.cdef("""
        double sum_of_squares(double* arr, int n);
        double dot_product(double* a, double* b, int n);
        void matvec(double* A, double* x, double* y, int m, int n);
        double parallel_sum(double* arr, int n);
    """)

    # Open the library compiled earlier in the notebook.
    try:
        lib = ffi.dlopen('./mathlib.dll' if sys.platform == 'win32' else './mathlib.so')
    except Exception as e:
        print(f"Failed to load library with CFFI: {e}")
    else:
        print("CFFI library loaded successfully")
        cffi_lib_loaded = True

CFFI version: 1.17.1
CFFI library loaded successfully


In [11]:
# Test CFFI interface
if cffi_available and cffi_lib_loaded:
    def cffi_sum_of_squares(arr):
        """CFFI wrapper for sum_of_squares"""
        # Convert NumPy array to CFFI pointer
        arr_ptr = ffi.cast("double*", arr.ctypes.data)
        return lib.sum_of_squares(arr_ptr, len(arr))
    
    def cffi_dot_product(a, b):
        """CFFI wrapper for dot product"""
        a_ptr = ffi.cast("double*", a.ctypes.data)
        b_ptr = ffi.cast("double*", b.ctypes.data)
        return lib.dot_product(a_ptr, b_ptr, len(a))
    
    # Test CFFI functions
    data = np.random.rand(1000000).astype(np.float64)
    
    # Test sum of squares
    cffi_result = cffi_sum_of_squares(data)
    numpy_result = np.sum(data**2)
    
    print(f"CFFI sum of squares: {cffi_result:.6f}")
    print(f"NumPy result:        {numpy_result:.6f}")
    print(f"Match: {np.isclose(cffi_result, numpy_result)}")
    
    # Test dot product
    a = np.random.rand(1000000).astype(np.float64)
    b = np.random.rand(1000000).astype(np.float64)
    
    cffi_dot = cffi_dot_product(a, b)
    numpy_dot = np.dot(a, b)
    
    print(f"\nCFFI dot product: {cffi_dot:.6f}")
    print(f"NumPy dot product: {numpy_dot:.6f}")
    print(f"Match: {np.isclose(cffi_dot, numpy_dot)}")
    
    # Performance comparison
    print("\nCFFI Performance:")
    print("CFFI sum of squares:")
    %timeit cffi_sum_of_squares(data)
    
    print("CFFI dot product:")
    %timeit cffi_dot_product(a, b)
    
    print("\nNumPy comparison:")
    %timeit np.sum(data**2)
    %timeit np.dot(a, b)
else:
    print("CFFI not available or library not loaded")

CFFI sum of squares: 333409.998236
NumPy result:        333409.998236
Match: True

CFFI dot product: 250189.045345
NumPy dot product: 250189.045345
Match: True

CFFI Performance:
CFFI sum of squares:
562 μs ± 34.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CFFI dot product:
569 μs ± 46.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

NumPy comparison:
598 μs ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
177 μs ± 68.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 2.1 pybind11 - Modern C++ Python Bindings

![Bindings Slide](img/04_Bindings/Folie7.PNG)

[PyBind11 Tutorial](https://pybind11.readthedocs.io/en/stable/basics.html)

In [12]:
# Check if pybind11 is available
try:
    import pybind11
    print(f"pybind11 version: {pybind11.__version__}")
    pybind11_available = True
except ImportError:
    print("pybind11 not available. Install with: pip install pybind11")
    pybind11_available = False

# Create a C++ source file for pybind11.
# Both array functions take their arguments as `DenseArray`, which makes
# pybind11 hand over a C-contiguous float64 array (converting or copying when
# the caller passes a strided view or another dtype). The loops index the data
# as dense row-major memory, so this is required for correct results.
cpp_source = '''
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <stdexcept>

// C-contiguous float64 array; pybind11 converts or copies the argument when
// the caller passes a strided view (e.g. A.T or A[::2]) or another dtype.
using DenseArray = pybind11::array_t<
    double, pybind11::array::c_style | pybind11::array::forcecast>;

// Fast sum of squares using modern C++
double fast_sum_of_squares(DenseArray input) {
    const double *ptr = input.data();
    const pybind11::ssize_t size = input.size();
    
    double sum = 0.0;
    for (pybind11::ssize_t i = 0; i < size; i++) {
        sum += ptr[i] * ptr[i];
    }
    return sum;
}

// Matrix multiplication with proper error handling
pybind11::array_t<double> matrix_multiply(DenseArray a, DenseArray b) {
    if (a.ndim() != 2 || b.ndim() != 2) {
        throw std::runtime_error("Input arrays must be 2-dimensional");
    }
    
    const pybind11::ssize_t rows_a = a.shape(0);
    const pybind11::ssize_t cols_a = a.shape(1);
    const pybind11::ssize_t rows_b = b.shape(0);
    const pybind11::ssize_t cols_b = b.shape(1);
    
    if (cols_a != rows_b) {
        throw std::runtime_error("Matrix dimensions don't match for multiplication");
    }
    
    // Allocate the (rows_a x cols_b) result directly with its final shape
    std::vector<pybind11::ssize_t> result_shape{rows_a, cols_b};
    pybind11::array_t<double> result(result_shape);
    
    const double *ptr_a = a.data();
    const double *ptr_b = b.data();
    double *ptr_result = result.mutable_data();
    
    // Initialize result to zero
    std::fill(ptr_result, ptr_result + rows_a * cols_b, 0.0);
    
    // Perform matrix multiplication
    for (pybind11::ssize_t i = 0; i < rows_a; i++) {
        for (pybind11::ssize_t j = 0; j < cols_b; j++) {
            for (pybind11::ssize_t k = 0; k < cols_a; k++) {
                ptr_result[i * cols_b + j] += 
                    ptr_a[i * cols_a + k] * ptr_b[k * cols_b + j];
            }
        }
    }
    
    return result;
}

// Class example: Simple statistics calculator
class StatisticsCalculator {
private:
    std::vector<double> data_;
    
public:
    void add_data(const std::vector<double>& new_data) {
        data_.insert(data_.end(), new_data.begin(), new_data.end());
    }
    
    void add_value(double value) {
        data_.push_back(value);
    }
    
    double mean() const {
        if (data_.empty()) return 0.0;
        return std::accumulate(data_.begin(), data_.end(), 0.0) / data_.size();
    }
    
    // Sample standard deviation (divides by n - 1)
    double std_dev() const {
        if (data_.size() < 2) return 0.0;
        double m = mean();
        double sum = 0.0;
        for (double x : data_) {
            sum += (x - m) * (x - m);
        }
        return std::sqrt(sum / (data_.size() - 1));
    }
    
    size_t size() const { return data_.size(); }
    
    std::vector<double> get_data() const { return data_; }
    
    void clear() { data_.clear(); }
};

// Module definition
PYBIND11_MODULE(pybind11_math, m) {
    m.doc() = "pybind11 mathematical operations plugin";
    
    m.def("fast_sum_of_squares", &fast_sum_of_squares, 
          "Calculate sum of squares of array elements");
    
    m.def("matrix_multiply", &matrix_multiply,
          "Multiply two matrices");
    
    pybind11::class_<StatisticsCalculator>(m, "StatisticsCalculator")
        .def(pybind11::init<>())
        .def("add_data", &StatisticsCalculator::add_data)
        .def("add_value", &StatisticsCalculator::add_value)  
        .def("mean", &StatisticsCalculator::mean)
        .def("std_dev", &StatisticsCalculator::std_dev)
        .def("size", &StatisticsCalculator::size)
        .def("get_data", &StatisticsCalculator::get_data)
        .def("clear", &StatisticsCalculator::clear);
}
'''

# Write C++ source to file
with open('pybind11_math.cpp', 'w') as f:
    f.write(cpp_source)

print("Created C++ source file: pybind11_math.cpp")

pybind11 version: 3.0.1
Created C++ source file: pybind11_math.cpp


In [13]:
# Compile the pybind11 module
if pybind11_available:
    # Create setup.py for pybind11 compilation.
    # Pybind11Extension (from pybind11.setup_helpers) adds the pybind11 include
    # directory itself and honours `cxx_std`; a plain setuptools Extension does
    # not know that keyword and ignores it.
    setup_py_content = '''
from setuptools import setup
from pybind11.setup_helpers import Pybind11Extension, build_ext

# Define the extension module
ext_modules = [
    Pybind11Extension(
        'pybind11_math',
        ['pybind11_math.cpp'],
        cxx_std=14,  # Use C++14 standard
    ),
]

setup(
    name='pybind11_math',
    ext_modules=ext_modules,
    cmdclass={'build_ext': build_ext},
    zip_safe=False,
)
'''
    
    with open('setup.py', 'w') as f:
        f.write(setup_py_content)
    
    # Try to compile
    print("Compiling pybind11 module...")
    try:
        # Build in-place so the extension lands next to the notebook and can
        # be imported directly; sys.executable keeps the build on the same
        # interpreter as the kernel.
        result = subprocess.run([
            sys.executable, 'setup.py', 'build_ext', '--inplace'
        ], capture_output=True, text=True)
        
        if result.returncode == 0:
            print("✓ pybind11 compilation successful!")
            pybind11_compiled = True
            
            # Try to import the compiled module
            try:
                import pybind11_math
                print("✓ pybind11 module imported successfully!")
            except ImportError as e:
                print(f"✗ Failed to import pybind11 module: {e}")
                pybind11_compiled = False
        else:
            print(f"✗ Compilation failed:")
            print("STDOUT:", result.stdout)
            print("STDERR:", result.stderr)
            pybind11_compiled = False
            
    except Exception as e:
        print(f"✗ Build process failed: {e}")
        pybind11_compiled = False
        
else:
    pybind11_compiled = False
    print("pybind11 not available, skipping compilation")

Compiling pybind11 module...
✓ pybind11 compilation successful!
✓ pybind11 module imported successfully!


In [14]:
# Test pybind11 functions
if pybind11_compiled and 'pybind11_math' in sys.modules:
    # Test basic functions
    test_data = np.random.rand(1000000).astype(np.float64)
    
    # Test sum of squares
    pybind11_result = pybind11_math.fast_sum_of_squares(test_data)
    numpy_result = np.sum(test_data**2)
    
    print("pybind11 Function Tests:")
    print(f"Sum of squares - pybind11: {pybind11_result:.6f}")
    print(f"Sum of squares - NumPy:    {numpy_result:.6f}")
    print(f"Results match: {np.isclose(pybind11_result, numpy_result)}")
    
    # Test matrix multiplication
    A = np.random.rand(200, 300).astype(np.float64)
    B = np.random.rand(300, 150).astype(np.float64)
    
    pybind11_matmul = pybind11_math.matrix_multiply(A, B)
    numpy_matmul = np.dot(A, B)
    
    print(f"\nMatrix multiplication test:")
    print(f"pybind11 result shape: {pybind11_matmul.shape}")
    print(f"NumPy result shape:    {numpy_matmul.shape}")
    print(f"Results match: {np.allclose(pybind11_matmul, numpy_matmul)}")
    
    # Test the C++ class
    print("\nTesting StatisticsCalculator class:")
    stats = pybind11_math.StatisticsCalculator()
    
    # Add some data
    test_values = [1.0, 2.0, 3.0, 4.0, 5.0]
    stats.add_data(test_values)
    stats.add_value(6.0)
    
    print(f"Data size: {stats.size()}")
    print(f"Mean: {stats.mean():.4f}")
    print(f"Std dev: {stats.std_dev():.4f}")
    
    # Compare with NumPy
    all_data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    print(f"NumPy mean: {np.mean(all_data):.4f}")
    print(f"NumPy std: {np.std(all_data, ddof=1):.4f}")
    
    # Performance comparison
    print("\nPerformance comparison:")
    print("pybind11 sum of squares:")
    %timeit pybind11_math.fast_sum_of_squares(test_data)
    
    print("NumPy sum of squares:")
    %timeit np.sum(test_data**2)
    
    # Matrix multiplication performance
    print("\nMatrix multiplication performance:")
    A_small = np.random.rand(500, 600).astype(np.float64)
    B_small = np.random.rand(600, 400).astype(np.float64)
    
    print("pybind11 matrix multiply:")
    %timeit pybind11_math.matrix_multiply(A_small, B_small)
    
    print("NumPy matrix multiply:")
    %timeit np.dot(A_small, B_small)
    
else:
    print("pybind11 module not available for testing")

pybind11 Function Tests:
Sum of squares - pybind11: 333179.357994
Sum of squares - NumPy:    333179.357994
Results match: True

Matrix multiplication test:
pybind11 result shape: (200, 150)
NumPy result shape:    (200, 150)
Results match: True

Testing StatisticsCalculator class:
Data size: 6
Mean: 3.5000
Std dev: 1.8708
NumPy mean: 3.5000
NumPy std: 1.8708

Performance comparison:
pybind11 sum of squares:
606 μs ± 13.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy sum of squares:
596 μs ± 19.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Matrix multiplication performance:
pybind11 matrix multiply:
103 ms ± 3.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
NumPy matrix multiply:
2.56 ms ± 431 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## 3.1 Performance Summary and Best Practices

Let's compare all the different approaches we've used:

In [15]:
# Comprehensive performance comparison
print("Comprehensive Performance Comparison")
print("=" * 50)

# Create test data
n = 1_000_000
data = np.random.rand(n).astype(np.float64)

print(f"Test: Sum of squares with {n:,} elements")
print("\nTiming Results:")

# Pure Python (small sample)
small_data = data[:10000]
python_time = %timeit -o sum(x**2 for x in small_data)
python_scaled = python_time.best * (n / 10000)
print(f"Pure Python (scaled): ~{python_scaled:.3f}s")

# NumPy
numpy_time = %timeit -o np.sum(data**2)
print(f"NumPy:                 {numpy_time.best:.6f}s")

# ctypes (if available)
if 'mathlib' in locals() and mathlib:
    data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    ctypes_time = %timeit -o mathlib.sum_of_squares(data_ptr, n)
    print(f"ctypes (C):            {ctypes_time.best:.6f}s")
else:
    print(f"ctypes (C):            Not available")

# Cython (if available)
if cython_available:
    cython_time = %timeit -o cython_sum_of_squares(data)
    print(f"Cython:                {cython_time.best:.6f}s")
else:
    print(f"Cython:                Not available")

# CFFI (if available)
if cffi_available and cffi_lib_loaded:
    cffi_time = %timeit -o cffi_sum_of_squares(data)
    print(f"CFFI:                  {cffi_time.best:.6f}s")
else:
    print(f"CFFI:                  Not available")

# pybind11 (if available)
if 'pybind11_math' in sys.modules:
    pybind11_time = %timeit -o pybind11_math.fast_sum_of_squares(data)
    print(f"pybind11 (C++):        {pybind11_time.best:.6f}s")
else:
    print(f"pybind11 (C++):        Not available")

print("\n" + "=" * 50)
print("Speedup factors (relative to NumPy):")
baseline = numpy_time.best

if 'mathlib' in locals() and mathlib:
    speedup = baseline / ctypes_time.best
    print(f"ctypes:   {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

if cython_available:
    speedup = baseline / cython_time.best
    print(f"Cython:   {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

if cffi_available and cffi_lib_loaded:
    speedup = baseline / cffi_time.best
    print(f"CFFI:     {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")
    
if 'pybind11_math' in sys.modules:
    speedup = baseline / pybind11_time.best
    print(f"pybind11: {speedup:6.2f}x {'faster' if speedup > 1 else 'slower'}")

Comprehensive Performance Comparison
Test: Sum of squares with 1,000,000 elements

Timing Results:
1.09 ms ± 21.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Pure Python (scaled): ~0.106s
602 μs ± 15.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
NumPy:                 0.000590s
594 μs ± 14.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
ctypes (C):            0.000582s
606 μs ± 13.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Cython:                0.000590s
612 μs ± 31.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
CFFI:                  0.000580s
661 μs ± 7.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
pybind11 (C++):        0.000649s

Speedup factors (relative to NumPy):
ctypes:     1.01x faster
Cython:     1.00x slower
CFFI:       1.02x faster
pybind11:   0.91x slower


## 3.2 Choosing the Right Tool

### Decision Matrix for Python Bindings:

| Tool | Ease of Use | Performance | Existing Code | Best For |
|------|-------------|-------------|---------------|----------|
| **ctypes** | Medium | High | C libraries | Quick integration with existing C libs |
| **Cython** | Easy | Very High | Python-like | Writing new high-performance code |
| **pybind11** | Easy | Very High | C++ libraries | Modern C++ integration, classes |
| **CFFI** | Medium | High | C libraries | Complex C interfaces |
| **f2py** | Easy | High | Fortran | Scientific/numerical Fortran code |
| **SWIG** | Hard | High | Multiple languages | Multi-language support |

## Summary: Python Bindings Best Practices

### Do:
- **Profile first** - identify actual bottlenecks before optimizing
- **Choose the right tool** - consider complexity vs. performance needs
- **Handle errors gracefully** - validate inputs and catch exceptions
- **Use appropriate data types** - match Python and C/Fortran types
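- **Consider memory layout** - pass C-contiguous arrays of the expected dtype: raw-pointer interfaces (ctypes, CFFI) silently read wrong values from strided views, and contiguous data is also faster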
- **Document interfaces clearly** - especially for team development

### Don't:
- **Over-optimize** - NumPy is often fast enough
- **Ignore memory management** - especially with manual pointer handling
- **Mix different binding approaches** unnecessarily
- **Forget about maintainability** - complex bindings are hard to debug
- **Skip testing** - foreign function calls can be fragile

### Decision Guide:

1. **Start with NumPy/SciPy/JAX/Numba** - usually sufficient for most needs
2. **Need custom algorithms?** → **Cython** (easiest path)
3. **Existing C library?** → **ctypes** (quick integration)
4. **Existing C++ code?** → **pybind11** (modern C++)
5. **Existing Fortran code?** → **f2py** (scientific computing)
6. **Complex C interfaces?** → **CFFI** (more features than ctypes)
7. **Multiple languages?** → **SWIG** (universal but complex)

### Performance Hierarchy (typical):
1. **Optimized C/C++/Fortran** (with good compiler flags)
2. **Cython** (close to C performance)
3. **NumPy** (highly optimized for array operations)
4. **ctypes/CFFI/f2py** (the compiled code runs at full speed, but each call adds wrapper overhead - noticeable mainly with many small calls)
5. **Pure Python** (development and prototyping)

In [16]:
# Final demonstration: Building a complete pipeline
print("Complete High-Performance Pipeline Demo")
print("=" * 45)

# Stage timings use time.perf_counter(): it is monotonic and has the highest
# available resolution, which matters for stages lasting only milliseconds.

# Simulate a real-world scientific computing pipeline
# Step 1: Data generation (NumPy)
n_samples = 50000
n_features = 100

print(f"1. Generating {n_samples}×{n_features} dataset with NumPy...")
start_time = time.perf_counter()
data = np.random.rand(n_samples, n_features).astype(np.float64)
labels = np.random.randint(0, 3, n_samples)
data_gen_time = time.perf_counter() - start_time
print(f"   ✓ Complete in {data_gen_time:.3f}s")

# Step 2: Preprocessing with compiled code
if cython_available:
    print("2. Preprocessing with Cython...")
    start_time = time.perf_counter()
    # Scale each row to unit Euclidean norm (sqrt of its sum of squares).
    # Only the first 1000 rows are processed in this branch.
    normalized_data = np.array([row / cython_sum_of_squares(row)**0.5 
                               for row in data[:1000]])  # Sample for demo
    preprocess_time = time.perf_counter() - start_time
    print(f"   ✓ Complete in {preprocess_time:.3f}s (1000 samples)")
else:
    print("2. Preprocessing with NumPy (Cython not available)...")
    start_time = time.perf_counter()
    normalized_data = data / np.linalg.norm(data, axis=1, keepdims=True)
    preprocess_time = time.perf_counter() - start_time
    print(f"   ✓ Complete in {preprocess_time:.3f}s")

# Step 3: Analysis with best available tool
analysis_data = data[:10000]  # Subset for analysis

if cython_available:
    print("3. Distance computation with Cython...")
    start_time = time.perf_counter()
    distances = cython_euclidean_distance(analysis_data[:500], analysis_data[:500])
    analysis_time = time.perf_counter() - start_time
    print(f"   ✓ 500×500 distance matrix in {analysis_time:.3f}s")
else:
    print("3. Distance computation with NumPy...")
    start_time = time.perf_counter()
    # Using the NumPy broadcasting trick from earlier sessions
    diff = analysis_data[:500, np.newaxis, :] - analysis_data[np.newaxis, :500, :]
    distances = np.sqrt(np.sum(diff**2, axis=2))
    analysis_time = time.perf_counter() - start_time
    print(f"   ✓ 500×500 distance matrix in {analysis_time:.3f}s")

# Step 4: Summary statistics
print("4. Computing summary statistics...")
start_time = time.perf_counter()

# Entries strictly above the diagonal; selected once and shared by the mean
# and the standard deviation.
pair_distances = distances[np.triu_indices(len(distances), k=1)]

stats = {
    'mean_distance': np.mean(pair_distances),
    'std_distance': np.std(pair_distances),
    # The minimum is taken over strictly positive entries only.
    'min_distance': np.min(distances[distances > 0]),
    'max_distance': np.max(distances)
}

stats_time = time.perf_counter() - start_time
print(f"   ✓ Statistics computed in {stats_time:.6f}s")

# Results
total_time = data_gen_time + preprocess_time + analysis_time + stats_time

print("\n" + "=" * 45)
print("Pipeline Results:")
print(f"Data generation:  {data_gen_time:.3f}s ({100*data_gen_time/total_time:.1f}%)")
print(f"Preprocessing:    {preprocess_time:.3f}s ({100*preprocess_time/total_time:.1f}%)")
print(f"Analysis:         {analysis_time:.3f}s ({100*analysis_time/total_time:.1f}%)")
print(f"Statistics:       {stats_time:.6f}s ({100*stats_time/total_time:.1f}%)")
print(f"Total time:       {total_time:.3f}s")

print(f"\nDistance Statistics:")
for key, value in stats.items():
    print(f"{key}: {value:.4f}")

Complete High-Performance Pipeline Demo
1. Generating 50000×100 dataset with NumPy...
   ✓ Complete in 0.038s
2. Preprocessing with Cython...
   ✓ Complete in 0.001s (1000 samples)
3. Distance computation with Cython...
   ✓ 500×500 distance matrix in 0.019s
4. Computing summary statistics...
   ✓ Statistics computed in 0.003010s

Pipeline Results:
Data generation:  0.038s (62.5%)
Preprocessing:    0.001s (2.1%)
Analysis:         0.019s (30.4%)
Statistics:       0.003010s (4.9%)
Total time:       0.061s

Distance Statistics:
mean_distance: 4.0719
std_distance: 0.2408
min_distance: 3.0166
max_distance: 5.0241


In [17]:
# Clean up generated files
cleanup_files = [
    'mathlib.c', 'mathlib.so', 'mathlib.dll',
    'pybind11_math.cpp', 'setup.py', 
    'pybind11_math.cpython-*.so', 'build/'
]

for filename in cleanup_files:
    if filename == 'build/' and os.path.exists(filename):
        import shutil
        try:
            shutil.rmtree(filename)
            print(f"Removed directory: {filename}")
        except:
            pass
    elif os.path.exists(filename):
        try:
            os.remove(filename)
            print(f"Removed file: {filename}")
        except:
            pass

# Also clean up any .so files that match the pybind11 pattern
import glob
for so_file in glob.glob('pybind11_math*.so'):
    try:
        os.remove(so_file)
        print(f"Removed file: {so_file}")
    except:
        pass

Removed file: mathlib.c
Removed file: mathlib.so
Removed file: pybind11_math.cpp
Removed file: setup.py
Removed directory: build/
Removed file: pybind11_math.cpython-312-x86_64-linux-gnu.so
