<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/multi_GPU_computing_lambda_ai_cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import numba
from numba import cuda
import math
import time
from scipy.stats import norm # For Black-Scholes comparison

# --- Option Parameters ---
# Contract terms
K = 100.0                # Strike price of the call
T = 1.0                  # Time to maturity, in years

# Market inputs
S0 = 100.0               # Price of the underlying at time zero
r = 0.02                 # Risk-free interest rate
sigma = 0.2              # Volatility of the underlying

# Simulation settings
N_STEPS = 100            # Time steps per simulated path
N_SIMULATIONS = 10**6    # Number of simulated paths (one million)

# --- Numba CUDA Kernel ---
@cuda.jit
def monte_carlo_option_kernel(s0, k, r, sigma, dt, n_steps, paths_device, payoffs_device):
    """
    CUDA kernel that simulates one price path per thread and stores its call payoff.

    The price follows Geometric Brownian Motion. Only the terminal price is
    needed for the payoff, so the per-step log-returns are summed and
    exponentiated once instead of calling exp() at every step.

    Parameters:
        s0: Initial price of the underlying.
        k: Strike price.
        r: Drift rate used in the log-return (the risk-free rate at the call site).
        sigma: Volatility.
        dt: Length of one time step.
        n_steps: Number of time steps per path; must not exceed
            paths_device.shape[1], because reads are not bounds-checked here.
        paths_device: 2-D device array indexed [path, step] holding the
            pre-generated random draws (expected to be standard-normal).
        payoffs_device: 1-D device array indexed by path; receives
            max(0, S_T - k) for every simulated path.
    """
    # Global 1-D thread index; each index maps to one simulated path.
    tid = cuda.grid(1)

    # The grid is rounded up to whole blocks, so surplus threads have no path.
    if tid >= paths_device.shape[0]:
        return

    # Both terms are identical for every step, so compute them once per thread.
    drift_term = (r - 0.5 * sigma**2) * dt
    diffusion_scale = sigma * math.sqrt(dt)

    # Accumulate the log-return of the whole path.
    log_return_total = 0.0
    for i in range(n_steps):
        log_return_total += drift_term + diffusion_scale * paths_device[tid, i]

    # Terminal price and European call payoff at maturity.
    s_t = s0 * math.exp(log_return_total)
    payoffs_device[tid] = max(0.0, s_t - k)

# --- CPU Monte Carlo Function (for comparison) ---
def monte_carlo_option_cpu(s0, k, r, sigma, t, n_steps, n_simulations):
    """Price a European call option by Monte Carlo simulation on the CPU.

    Each path follows Geometric Brownian Motion over ``n_steps`` equal time
    steps. Only the terminal price enters the payoff, so the per-step
    log-returns of a path are summed with NumPy and exponentiated once.

    Args:
        s0: Initial price of the underlying.
        k: Strike price.
        r: Risk-free rate, used both as the drift and for discounting.
        sigma: Volatility.
        t: Time to maturity.
        n_steps: Number of time steps per path (positive integer).
        n_simulations: Number of simulated paths (positive integer).

    Returns:
        The discounted average payoff ``max(0, S_T - k)`` over all paths.

    Raises:
        ValueError: If ``n_steps`` or ``n_simulations`` is not positive.
    """
    if n_steps <= 0 or n_simulations <= 0:
        raise ValueError("n_steps and n_simulations must be positive")

    dt = t / n_steps
    # Loop-invariant pieces of the per-step log-return.
    drift = (r - 0.5 * sigma**2) * dt
    diffusion_scale = sigma * np.sqrt(dt)

    total_payoff = 0.0
    for _ in range(n_simulations):
        # One batch of draws per path keeps memory at O(n_steps).
        shocks = np.random.standard_normal(n_steps)
        s_t = s0 * np.exp(np.sum(drift + diffusion_scale * shocks))
        total_payoff += max(0.0, s_t - k)

    # Discount the average payoff back to present value.
    return (total_payoff / n_simulations) * np.exp(-r * t)

# --- Black-Scholes Formula (for theoretical benchmark) ---
def black_scholes_price(S, K, T, r, sigma):
    """Return the Black-Scholes price of a European call option.

    Args:
        S: Current price of the underlying.
        K: Strike price.
        T: Time to maturity.
        r: Risk-free interest rate.
        sigma: Volatility.
    """
    # Volatility scaled to the option's lifetime; shared by d1 and d2.
    vol_sqrt_t = sigma * np.sqrt(T)

    log_moneyness = np.log(S / K)
    d1 = (log_moneyness + (r + 0.5 * sigma**2) * T) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t

    # Strike discounted back to today.
    discounted_strike = K * np.exp(-r * T)
    return S * norm.cdf(d1) - discounted_strike * norm.cdf(d2)

# --- Main Execution ---
# Banner identifying the run.
print(
    "----------------------------------------",
    "European Call Option Pricing using Monte Carlo Simulation",
    "----------------------------------------",
    sep="\n",
)

# NOTE: the Numba code in this script drives a single GPU per process.
# Spreading the work over several GPUs would need one process per device
# (or a framework that orchestrates them), or explicit multi-device code.

# Summary of the run configuration. The risk-free rate r is shown twice:
# once under the "Expected Return (mu)" label, where it stands in for the
# drift, and once under its own label.
print(
    f"Number of Simulations: {N_SIMULATIONS:,}",
    f"Start Price: {S0:.2f}",
    f"Strike Price: {K:.2f}",
    f"Expected Return (mu): {r:.2f}",
    f"Volatility (sigma): {sigma:.2f}",
    f"Time to Maturity (T): {T:.2f} years",
    f"Number of Time Steps: {N_STEPS}",
    f"Risk-free Interest Rate (r): {r:.2f}",
    "----------------------------------------",
    sep="\n",
)


# --- CPU Calculation ---
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is not affected by system clock adjustments.
start_time_cpu = time.perf_counter()
option_price_cpu = monte_carlo_option_cpu(S0, K, r, sigma, T, N_STEPS, N_SIMULATIONS)
end_time_cpu = time.perf_counter()
cpu_time = end_time_cpu - start_time_cpu  # elapsed seconds

print(f"Option Price (CPU): {option_price_cpu:.4f}")
print(f"Execution Time (CPU): {cpu_time:.4f} seconds")


# --- GPU Calculation ---
# Prepare data and kernel launch configuration
dt = T / N_STEPS
# Generate all random numbers on the CPU first, shape (N_SIMULATIONS, N_STEPS).
# standard_normal already returns float64, so no astype() copy is needed.
random_numbers_cpu = np.random.standard_normal((N_SIMULATIONS, N_STEPS))

# One thread per simulated path; round the block count up so every path
# gets a thread (the kernel ignores the surplus threads).
threads_per_block = 512  # Common choice, can be tuned
blocks_per_grid = (N_SIMULATIONS + threads_per_block - 1) // threads_per_block

# Copy the random draws to the device and allocate the per-path output buffer.
random_numbers_device = cuda.to_device(random_numbers_cpu)
payoffs_device = cuda.device_array(N_SIMULATIONS, dtype=np.float64)

# Warm-up launch: the first call of a @cuda.jit kernel triggers JIT
# compilation, which would otherwise be counted as GPU execution time.
# The timed launch below overwrites payoffs_device with identical values.
monte_carlo_option_kernel[blocks_per_grid, threads_per_block](
    S0, K, r, sigma, dt, N_STEPS, random_numbers_device, payoffs_device
)
cuda.synchronize()

# Timed launch. Random-number generation and the host-to-device copy above
# are outside the timed region.
start_time_gpu = time.perf_counter()

monte_carlo_option_kernel[blocks_per_grid, threads_per_block](
    S0, K, r, sigma, dt, N_STEPS, random_numbers_device, payoffs_device
)

# Kernel launches are asynchronous; wait for completion before stopping the timer.
cuda.synchronize()

end_time_gpu = time.perf_counter()
gpu_time = end_time_gpu - start_time_gpu  # elapsed seconds

# Transfer payoffs back to host (CPU)
payoffs_cpu = payoffs_device.copy_to_host()

# Average and discount on the CPU; this reduction could also run on the GPU.
total_payoff_gpu = np.sum(payoffs_cpu)
option_price_gpu = (total_payoff_gpu / N_SIMULATIONS) * np.exp(-r * T)

print(f"Option Price (GPU): {option_price_gpu:.4f}")
print(f"Execution Time (GPU): {gpu_time:.4f} seconds")


# --- Speedup and Black-Scholes Comparison ---
# Ratio of CPU time to GPU time; a non-positive GPU time is reported as an
# infinite speedup instead of dividing by zero.
if gpu_time > 0:
    speedup = cpu_time / gpu_time
else:
    speedup = float('inf')
print(f"Speedup (CPU vs. GPU): {speedup:.2f}x")

# Closed-form price as a reference for both Monte Carlo estimates.
black_scholes = black_scholes_price(S0, K, T, r, sigma)
print(f"Option Price (Black-Scholes): {black_scholes:.4f}")

print(
    "----------------------------------------",
    "Device used for calculations: GPU (via Numba CUDA kernel)",
    "----------------------------------------",
    sep="\n",
)