In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt

# Parameters matching the Fortran version
WIDTH = 1024 * 1024  # 1M elements
MASK_WIDTH = 5
M = np.array([0.1, 0.2, 0.4, 0.2, 0.1])  # Same mask as Fortran version

In [2]:
def convolution_1d(input_data, mask):
    """1D convolution implementation using NumPy"""
    return np.convolve(input_data, mask, mode='same')

# Generate the same input data as Fortran version
input_data = np.sin(np.arange(WIDTH) / 100.0)

In [3]:
def benchmark_numpy():
    print("NumPy Convolution Benchmark:")
    print(f"  Input size: {WIDTH:10d}")
    print(f"  Mask width: {MASK_WIDTH:10d}")
    print()
    
    # Warm-up run
    _ = convolution_1d(input_data, M)
    
    # Timed run
    start_time = time.perf_counter()
    output = convolution_1d(input_data, M)
    end_time = time.perf_counter()
    
    elapsed_ms = (end_time - start_time) * 1000
    giga_elements_per_sec = (WIDTH * 1e-6) / (elapsed_ms * 1e-3)
    
    print(f"Time:          {elapsed_ms:10.2f} ms")
    print(f"Performance:      {giga_elements_per_sec:10.2f} GElements/s")

# Run benchmark
benchmark_numpy()

NumPy Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

Time:                2.70 ms
Performance:          388.79 GElements/s


In [4]:
import numpy as np
import time
import matplotlib.pyplot as plt
import torch
import pandas as pd
from scipy.signal import convolve
from sklearn.linear_model import LinearRegression  # We'll use this to demonstrate why scikit-learn isn't meant for this

# Parameters matching the Fortran version
WIDTH = 1024 * 1024  # 1M elements
MASK_WIDTH = 5
M = np.array([0.1, 0.2, 0.4, 0.2, 0.1])  # Same mask as Fortran version
input_data = np.sin(np.arange(WIDTH) / 100.0)

In [5]:
def benchmark_pytorch():
    print("\nPyTorch Convolution Benchmark:")
    print(f"  Input size: {WIDTH:10d}")
    print(f"  Mask width: {MASK_WIDTH:10d}")
    print()
    
    # Convert to PyTorch tensors
    input_tensor = torch.from_numpy(input_data).float()
    mask_tensor = torch.from_numpy(M).float()
    
    # CPU benchmark
    # Warm-up
    _ = torch.nn.functional.conv1d(input_tensor.view(1, 1, -1), 
                                 mask_tensor.view(1, 1, -1), 
                                 padding=MASK_WIDTH//2)
    
    start_time = time.perf_counter()
    output_cpu = torch.nn.functional.conv1d(input_tensor.view(1, 1, -1), 
                                          mask_tensor.view(1, 1, -1), 
                                          padding=MASK_WIDTH//2)
    end_time = time.perf_counter()
    
    elapsed_ms_cpu = (end_time - start_time) * 1000
    giga_elements_per_sec_cpu = (WIDTH * 1e-6) / (elapsed_ms_cpu * 1e-3)
    
    print(f"CPU Time:          {elapsed_ms_cpu:10.2f} ms")
    print(f"CPU Performance:      {giga_elements_per_sec_cpu:10.2f} GElements/s")
    
    # GPU benchmark (if available)
    if torch.cuda.is_available():
        input_tensor_gpu = input_tensor.cuda()
        mask_tensor_gpu = mask_tensor.cuda()
        
        # Warm-up
        _ = torch.nn.functional.conv1d(input_tensor_gpu.view(1, 1, -1), 
                                     mask_tensor_gpu.view(1, 1, -1), 
                                     padding=MASK_WIDTH//2)
        torch.cuda.synchronize()
        
        start_time = time.perf_counter()
        output_gpu = torch.nn.functional.conv1d(input_tensor_gpu.view(1, 1, -1), 
                                              mask_tensor_gpu.view(1, 1, -1), 
                                              padding=MASK_WIDTH//2)
        torch.cuda.synchronize()
        end_time = time.perf_counter()
        
        elapsed_ms_gpu = (end_time - start_time) * 1000
        giga_elements_per_sec_gpu = (WIDTH * 1e-6) / (elapsed_ms_gpu * 1e-3)
        
        print(f"GPU Time:          {elapsed_ms_gpu:10.2f} ms")
        print(f"GPU Performance:      {giga_elements_per_sec_gpu:10.2f} GElements/s")

benchmark_pytorch()


PyTorch Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

CPU Time:               20.83 ms
CPU Performance:           50.35 GElements/s
GPU Time:                0.36 ms
GPU Performance:         2921.83 GElements/s


In [6]:
def benchmark_scipy():
    print("\nSciPy Convolution Benchmark:")
    print(f"  Input size: {WIDTH:10d}")
    print(f"  Mask width: {MASK_WIDTH:10d}")
    print()
    
    # Warm-up
    _ = convolve(input_data, M, mode='same')
    
    start_time = time.perf_counter()
    output = convolve(input_data, M, mode='same')
    end_time = time.perf_counter()
    
    elapsed_ms = (end_time - start_time) * 1000
    giga_elements_per_sec = (WIDTH * 1e-6) / (elapsed_ms * 1e-3)
    
    print(f"Time:          {elapsed_ms:10.2f} ms")
    print(f"Performance:      {giga_elements_per_sec:10.2f} GElements/s")

benchmark_scipy()


SciPy Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

Time:                2.81 ms
Performance:          373.33 GElements/s


In [8]:
def run_all_benchmarks():
    print("=== Performance Comparison ===")
    print("\nCUDA Fortran Performance (from previous run):")
    print("Performance:      11,130 GElements/s on RTX 4060")
    print("Time:             0.09 ms")
    print("\nRunning Python implementations...")
    
    benchmark_numpy()
    benchmark_pytorch()
    benchmark_scipy()

run_all_benchmarks()

=== Performance Comparison ===

CUDA Fortran Performance (from previous run):
Performance:      11,130 GElements/s on RTX 4060
Time:             0.09 ms

Running Python implementations...
NumPy Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

Time:                2.70 ms
Performance:          388.11 GElements/s

PyTorch Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

CPU Time:               22.38 ms
CPU Performance:           46.85 GElements/s
GPU Time:                0.17 ms
GPU Performance:         6239.22 GElements/s

SciPy Convolution Benchmark:
  Input size:    1048576
  Mask width:          5

Time:                2.28 ms
Performance:          460.16 GElements/s
