16 changes: 13 additions & 3 deletions code/matx/CMakeLists.txt
@@ -11,7 +11,7 @@ FetchContent_Declare(matx
GIT_TAG main
)

-FetchContent_GetProperties(arrayfire)
+FetchContent_GetProperties(matx)
if(NOT matx_POPULATED)
FetchContent_Populate(matx)
add_subdirectory(${matx_SOURCE_DIR} ${matx_BINARY_DIR} EXCLUDE_FROM_ALL)
@@ -27,8 +27,18 @@ if (NOT CMAKE_CXX_COMPILER)
set(CMAKE_CXX_COMPILER "/usr/bin/g++")
endif()

-set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
+set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/24.11/compilers/bin/nvcc)

-add_executable(test test.cu)
+set(CMAKE_CUDA_ARCHITECTURES 75)

+# Optimization flags
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ftree-vectorize -march=native")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -use_fast_math -arch=compute_75 -code=sm_75")

+# Enable Link Time Optimization (LTO)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto")
+# nvcc does not accept -flto directly; forward it to the host compiler
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-flto")

+add_executable(test bs.cu)
set_property(TARGET test PROPERTY CXX_STANDARD 20)
target_link_libraries(test PRIVATE matx::matx)
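
# A typical build/run with this file (a sketch, assuming the nvcc path above exists):
#   cmake -B build -S . && cmake --build build && ./build/test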
138 changes: 138 additions & 0 deletions code/matx/bs.cu
@@ -0,0 +1,138 @@
#include "matx.h"
#include <cassert>
#include <cstdio>
#include <math.h>
#include <memory>

using namespace matx;

/**
* MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated
* type for executing on the device. Currently, nvcc cannot apply certain optimizations
* when building the expression tree that would be obvious from reading the code. Specifically, any code that reuses
* the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load
* instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
* kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
* boilerplate code around the original expression. This custom operator can then be used either alone or inside
* other arithmetic expressions, and only a single load is issued for each tensor.
*
* This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
* shows the performance difference.
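*
* For reference, the closed-form call price computed below is
*   C  = S * Phi(d1) - K * exp(-r*T) * Phi(d2)
*   d1 = (ln(S/K) + (r + 0.5*V*V) * T) / (V * sqrt(T))
*   d2 = d1 - V * sqrt(T)
* where Phi is the standard normal CDF (normcdff / normcdf in the code).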
*/

/* Custom operator */
template <class O, class I1, class I2, class I3, class I4, class I5>
class BlackScholes : public BaseOp<BlackScholes<O, I1, I2, I3, I4, I5>>
{
private:
O out_;
I1 K_;
I2 V_;
I3 S_;
I4 r_;
I5 T_;

public:
BlackScholes(O out, I1 K, I2 V, I3 S, I4 r, I5 T)
: out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}

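// Called once per output element; the rank-0 inputs (K, V, r, T) are each read
// once into locals below, so the compiler keeps them in registers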
__device__ inline void operator()(index_t idx)
{
auto V = V_();
auto K = K_();
auto S = S_(idx);
auto T = T_();
auto r = r_();

auto VsqrtT = V * sqrt(T);
auto d1 = (log(S / K) + (r + 0.5f * V * V) * T) / VsqrtT;
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdff(d1);
auto cdf_d2 = normcdff(d2);
auto expRT = exp(-1.f * r * T);

out_(idx) = S * cdf_d1 - K * expRT * cdf_d2;
}

__host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); }
static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); }
};

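// Expression-template version: because S, K, V, r, and T are reused inside one
// expression, the generated kernel may load some tensors more than once (see the
// note at the top of the file)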
template <typename T1>
void compute_black_scholes_matx(tensor_t<T1, 0> &K,
tensor_t<T1, 1> &S,
tensor_t<T1, 0> &V,
tensor_t<T1, 0> &r,
tensor_t<T1, 0> &T,
tensor_t<T1, 1> &output,
cudaExecutor &exec)
{
auto VsqrtT = V * sqrt(T);
auto d1 = (log(S / K) + (r + 0.5f * V * V) * T) / VsqrtT;
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1.f * r * T);

(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
}

int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

using dtype = float;

index_t input_size = 100000;
auto output_tensor = make_tensor<dtype>({input_size});
auto S_tensor = make_tensor<dtype>({input_size});
auto K_tensor = make_tensor<dtype>({});
auto V_tensor = make_tensor<dtype>({});
auto r_tensor = make_tensor<dtype>({});
auto T_tensor = make_tensor<dtype>({});
float time_ms;
int num_iterations = 99;

for (index_t i = 0; i < input_size; i++)
{
S_tensor(i) = (dtype)90 + dtype(i % 20);
}
K_tensor() = (dtype)100.;
V_tensor() = (dtype)0.1;
r_tensor() = (dtype)0.05;
T_tensor() = (dtype)1.0;

cudaStream_t stream;
cudaStreamCreate(&stream);
cudaExecutor exec{stream};
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

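// Warm-up run, excluded from the timing below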
BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec);
exec.sync();

cudaEventRecord(start, stream);
for (int i = 0; i < num_iterations; i++)
{
BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec);
}
cudaEventRecord(stop, stream);
exec.sync();
cudaEventElapsedTime(&time_ms, start, stop);

printf("Black-Scholes time = %.2fus per iteration\n",
time_ms * 1e3 / num_iterations);

compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

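// Time the expression-template version the same way so the two implementations
// can be compared (a sketch reusing the events created above; the sync also
// makes the results visible on the host before printing)
cudaEventRecord(start, stream);
for (int i = 0; i < num_iterations; i++)
{
compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
}
cudaEventRecord(stop, stream);
exec.sync();
cudaEventElapsedTime(&time_ms, start, stop);

printf("Black-Scholes (expression) time = %.2fus per iteration\n",
time_ms * 1e3 / num_iterations);
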
printf("First 20 values of computed Black-Scholes output:\n");
for (index_t i = 0; i < 20; i++)
{
printf("%f\n", static_cast<float>(output_tensor(i)));
}

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}