# Simple CUDA H2D Memory Copy Benchmark

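This notebook benchmarks CUDA Host-to-Device (H2D) memory transfers using CuPy, runs a Host-to-Host (H2H) memcpy benchmark with the same buffer size and iteration count for comparison, and provides comprehensive visualization of the results.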

## Features
- Configurable buffer size and iteration count
- Async and sync transfer modes
- Detailed statistics and bandwidth calculations
- Multiple visualization plots

In [None]:
# Import required libraries
import numpy as np
import cupy as cp
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Import benchmark functions from external module
from memcpy_benchmark import simple_h2d_benchmark, simple_h2h_benchmark, MemcpyStats

# Confirm the imports worked and report the library versions in use.
startup_messages = (
    "Libraries imported successfully!",
    f"CuPy version: {cp.__version__}",
    f"NumPy version: {np.__version__}",
    "Using Plotly for interactive, zoomable plots!",
)
print(*startup_messages, sep="\n")

## GPU Information

In [None]:
# Query the current CUDA device and its runtime property table.
device = cp.cuda.Device()
device_props = cp.cuda.runtime.getDeviceProperties(device.id)
gpu_name = device_props['name'].decode('utf-8')  # the name entry is decoded as UTF-8

# Emit the summary as one multi-line print; the output is one fact per line.
print(
    f"GPU: {gpu_name}",
    f"Compute Capability: {device_props['major']}.{device_props['minor']}",
    f"Total Memory: {device_props['totalGlobalMem'] / (1024**3):.2f} GB",
    f"Multiprocessors: {device_props['multiProcessorCount']}",
    sep="\n",
)

## Benchmark Configuration

The benchmark functions are imported from `memcpy_benchmark.py`, which can also be run standalone or with nsys for profiling.

## Configure Benchmark Parameters

Adjust these parameters to customize the benchmark.

**Note:** To profile with nsys, run:
```bash
nsys profile python memcpy_benchmark.py [num_elements] [num_iterations] [use_async] [output_file]
```

In [None]:
# Benchmark configuration: edit these three values to change the run.
num_elements = 32768      # Number of int16 elements (64 KB)
# num_elements = 65536    # alternative buffer size, left here for quick switching
num_iterations = 1000000  # Number of benchmark iterations
use_async = True          # H2D only: async transfers (True) or sync (False)

# Derived values used only for the summary below.
buffer_bytes = num_elements * 2  # 2 bytes per int16 element
if use_async:
    h2d_mode = 'Async (with stream)'
else:
    h2d_mode = 'Sync'

print("Benchmark Configuration:")
print(f"  Number of elements: {num_elements:,}")
print(f"  Buffer size: {buffer_bytes / 1024:.1f} KB ({buffer_bytes / (1024*1024):.6f} MB)")
print(f"  Iterations: {num_iterations:,}")
print(f"  H2D Transfer mode: {h2d_mode}")
print("  H2H Transfer mode: Host memcpy")

## Run Benchmarks

In [None]:
def _print_transfer_stats(title, stats, iterations):
    """Print the summary report for one benchmark run.

    Args:
        title: Heading printed between the two rule lines.
        stats: Result object returned by a benchmark function. The attributes
            read here are num_elements, bytes_transferred, min_time, avg_time,
            max_time, total_time, bandwidth_gbps and count_above_50us.
            Times are printed with an "ms" label and scaled by 1000 for the
            "µs" column.
        iterations: Iteration count of the run; used as the denominator of
            the "> 50 µs" percentage.
    """
    rule = "=" * 60
    # Report 0.0% rather than raising ZeroDivisionError for a zero-iteration run.
    pct_above = 100.0 * stats.count_above_50us / iterations if iterations else 0.0

    print(rule)
    print(title)
    print(rule)
    print(f"Number of elements:     {stats.num_elements:,}")
    print(f"Bytes transferred:      {stats.bytes_transferred:,} ({stats.bytes_transferred / 1024:.1f} KB)")
    print(f"Number of iterations:   {iterations:,}")
    print()
    print(f"Min time:               {stats.min_time:.6f} ms  ({stats.min_time * 1000:.3f} µs)")
    print(f"Avg time:               {stats.avg_time:.6f} ms  ({stats.avg_time * 1000:.3f} µs)")
    print(f"Max time:               {stats.max_time:.6f} ms  ({stats.max_time * 1000:.3f} µs)")
    print(f"Range:                  {stats.max_time - stats.min_time:.6f} ms")
    print(f"Total time:             {stats.total_time:.6f} ms")
    print()
    print(f"Bandwidth (avg):        {stats.bandwidth_gbps:.6f} GB/s")
    print()
    print(f"Transfers > 50 µs:      {stats.count_above_50us:,} ({pct_above:.1f}%)")
    print(rule)


# Run the H2D benchmark
print("Running H2D benchmark...")
h2d_stats = simple_h2d_benchmark(num_elements, num_iterations, use_async)
print("H2D benchmark complete!\n")

# Run the H2H benchmark
print("Running H2H benchmark...")
h2h_stats = simple_h2h_benchmark(num_elements, num_iterations)
print("H2H benchmark complete!\n")

# Both reports share one layout; a blank line separates them.
_print_transfer_stats("Host-to-Device (H2D) Statistics", h2d_stats, num_iterations)
print()
_print_transfer_stats("Host-to-Host (H2H) Statistics", h2h_stats, num_iterations)

## Visualization: Time Series Plots

Shows the transfer time of every iteration in run order, with H2D and H2H in side-by-side panels that share the same y-axis range:

In [None]:
def _add_timing_panel(fig, timings_us, avg_us, label, color, col, show_threshold_legend):
    """Add one benchmark's traces to a column of the 1-row subplot figure.

    Three traces are added in drawing order: the per-iteration timing line,
    a dashed horizontal line at the average, and a dotted line at 50 µs.

    Args:
        fig: Figure created by make_subplots with one row.
        timings_us: Per-iteration transfer times in microseconds.
        avg_us: Average transfer time in microseconds.
        label: Short series label ('H2D' or 'H2H'); used in trace names and,
            lower-cased, as the legend group.
        color: Line color of the per-iteration trace.
        col: 1-based subplot column to draw into.
        show_threshold_legend: Whether the threshold trace gets its own legend
            entry (set on one panel only to avoid a duplicate entry).
    """
    legend_group = label.lower()
    x_span = [0, len(timings_us) - 1]

    # NOTE: Scattergl (WebGL) is used because this trace holds one point per
    # iteration, which Plotly's SVG renderer handles poorly for large runs.
    fig.add_trace(go.Scattergl(
        y=timings_us,
        mode='lines',
        name=f'{label} Transfer Time',
        line=dict(width=1, color=color),
        opacity=0.7,
        hovertemplate='Iteration: %{x}<br>Time: %{y:.3f} µs<extra></extra>',
        legendgroup=legend_group,
    ), row=1, col=col)

    # Average line spanning the full iteration range
    fig.add_trace(go.Scatter(
        x=x_span,
        y=[avg_us, avg_us],
        mode='lines',
        name=f'{label} Avg: {avg_us:.2f} µs',
        line=dict(color='red', dash='dash', width=2),
        legendgroup=legend_group,
    ), row=1, col=col)

    # 50 µs threshold line
    fig.add_trace(go.Scatter(
        x=x_span,
        y=[50, 50],
        mode='lines',
        name='50 µs threshold',
        line=dict(color='orange', dash='dot', width=2),
        legendgroup='threshold',
        showlegend=show_threshold_legend,
    ), row=1, col=col)


# Convert timings to microseconds for plotting
h2d_timings_us = [t * 1000 for t in h2d_stats.timings]
h2h_timings_us = [t * 1000 for t in h2h_stats.timings]

# Create subplots side by side
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        f'H2D Transfer Time<br>{h2d_stats.num_elements:,} elements, {"Async" if use_async else "Sync"} mode',
        f'H2H Transfer Time<br>{h2h_stats.num_elements:,} elements, Host memcpy'
    ),
    horizontal_spacing=0.12
)

# H2D on the left, H2H on the right; only the left panel shows the threshold legend entry
_add_timing_panel(fig, h2d_timings_us, h2d_stats.avg_time * 1000,
                  label='H2D', color='steelblue', col=1, show_threshold_legend=True)
_add_timing_panel(fig, h2h_timings_us, h2h_stats.avg_time * 1000,
                  label='H2H', color='purple', col=2, show_threshold_legend=False)

# Calculate common y-axis range so both panels are directly comparable
all_timings = h2d_timings_us + h2h_timings_us
y_min = min(all_timings)
y_max = max(all_timings)
y_range = [y_min * 0.95, y_max * 1.05]  # Add 5% padding

# Update axes labels and synchronize y-axis range
for panel_col in (1, 2):
    fig.update_xaxes(title_text='Iteration', row=1, col=panel_col)
    fig.update_yaxes(title_text='Transfer Time (µs)', range=y_range, row=1, col=panel_col)

# Update layout
fig.update_layout(
    hovermode='closest',
    height=500,
    showlegend=True,
    template='plotly_white'
)

fig.show()