coreylowman · coreylowman · Oct 25, 2023 · Sep 29, 2023 · Sep 29, 2023 · Sep 29, 2023
diff --git a/.github/workflows/cargo-check.yaml b/.github/workflows/cargo-check.yaml
@@ -20,4 +20,4 @@ jobs:
       - uses: actions-rs/cargo@v1
         with:
           command: check
-          args: --no-default-features --features ci-check,no-std,cudnn,cublas,nvrtc,driver,curand,nccl
+          args: --no-default-features --features ci-check,no-std,cudnn,cublas,cublaslt,nvrtc,driver,curand,nccl
diff --git a/.github/workflows/cargo-clippy.yaml b/.github/workflows/cargo-clippy.yaml
@@ -15,4 +15,4 @@ jobs:
       - uses: actions-rs/cargo@v1
         with:
           command: clippy
-          args: --no-default-features --features ci-check,no-std,cudnn,cublas,nvrtc,driver,curand,nccl -- -D warnings
+          args: --no-default-features --features ci-check,no-std,cudnn,cublas,cublaslt,nvrtc,driver,curand,nccl -- -D warnings
diff --git a/Cargo.toml b/Cargo.toml
@@ -28,6 +28,7 @@ default = ["std", "driver", "nvrtc", "cublas", "curand"]
 nvrtc = []
 driver = ["nvrtc"]
 cublas = ["driver"]
+cublaslt = ["driver"]
 cudnn = ["driver"]
 curand = ["driver"]
 nccl = ["driver"]

diff --git a/README.md b/README.md
@@ -11,6 +11,7 @@ Safe abstractions over:
 2. [NVRTC API](https://docs.nvidia.com/cuda/nvrtc/index.html)
 3. [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
 4. [cuBLAS API](https://docs.nvidia.com/cuda/cublas/index.html)
+5. [cuBLASLt API](https://docs.nvidia.com/cuda/cublas/#using-the-cublaslt-api)
 
 **Pre-alpha state**, expect breaking changes and not all cuda functions
 contain a safe wrapper. **Contributions welcome for any that aren't included!**
@@ -27,6 +28,7 @@ To that end there are three levels to each wrapper (by default the safe api is e
 use cudarc::driver::{safe, result, sys};
 use cudarc::nvrtc::{safe, result, sys};
 use cudarc::cublas::{safe, result, sys};
+use cudarc::cublaslt::{safe, result, sys};
 use cudarc::curand::{safe, result, sys};
 ```
 

diff --git a/build.rs b/build.rs
@@ -30,27 +30,37 @@ fn link_cuda() {
 
     #[cfg(feature = "driver")]
     println!("cargo:rustc-link-lib=dylib=cuda");
-    #[cfg(feature = "nvrtc")]
-    println!("cargo:rustc-link-lib=dylib=nvrtc");
-    #[cfg(feature = "curand")]
-    println!("cargo:rustc-link-lib=dylib=curand");
     #[cfg(feature = "nccl")]
     println!("cargo:rustc-link-lib=dylib=nccl");
 
     #[cfg(feature = "static-linking")]
     {
-        #[cfg(feature = "cublas")]
         println!("cargo:rustc-link-lib=dylib=stdc++");
+        #[cfg(any(feature = "cublas", feature = "cublaslt"))] {
+            println!("cargo:rustc-link-lib=dylib=cudart");
+            println!("cargo:rustc-link-lib=static=cublasLt_static");
+        }
         #[cfg(feature = "cublas")]
         println!("cargo:rustc-link-lib=static=cublas_static");
-        #[cfg(feature = "cublas")]
-        println!("cargo:rustc-link-lib=static=cublasLt_static");
+        #[cfg(feature = "curand")] {
+            println!("cargo:rustc-link-lib=dylib=culibos");
+            println!("cargo:rustc-link-lib=static=curand_static");
+        }
+        #[cfg(feature = "nvrtc")] {
+            println!("cargo:rustc-link-lib=static=nvrtc_static");
+            println!("cargo:rustc-link-lib=static=nvptxcompiler_static");
+            println!("cargo:rustc-link-lib=static=nvrtc-builtins_static");
+        }
     }
     #[cfg(not(feature = "static-linking"))]
     {
+        #[cfg(feature = "nvrtc")]
+        println!("cargo:rustc-link-lib=dylib=nvrtc");
+        #[cfg(feature = "curand")]
+        println!("cargo:rustc-link-lib=dylib=curand");
         #[cfg(feature = "cublas")]
         println!("cargo:rustc-link-lib=dylib=cublas");
-        #[cfg(feature = "cublas")]
+        #[cfg(any(feature = "cublas", feature = "cublaslt"))]
         println!("cargo:rustc-link-lib=dylib=cublasLt");
     }
 

diff --git a/src/cublaslt/bindgen.sh b/src/cublaslt/bindgen.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Requires rust-bindgen 0.68.1 or newer
+set -exu
+BINDGEN_EXTRA_CLANG_ARGS="-D__CUDA_BF16_TYPES_EXIST__" \
+bindgen \
+  --allowlist-type="^cublasLt.*" \
+  --allowlist-var="^cublasLt.*" \
+  --allowlist-function="^cublasLt.*" \
+  --default-enum-style=rust \
+  --no-doc-comments \
+  --with-derive-default \
+  --with-derive-eq \
+  --with-derive-hash \
+  --with-derive-ord \
+  --use-core \
+  wrapper.h -- -I/usr/local/cuda/include \
+  > sys.rs
diff --git a/src/cublaslt/mod.rs b/src/cublaslt/mod.rs
@@ -0,0 +1,6 @@
+pub mod result;
+pub mod safe;
+#[allow(warnings)]
+pub mod sys;
+
+pub use safe::*;
diff --git a/src/cublaslt/result.rs b/src/cublaslt/result.rs
@@ -0,0 +1,233 @@
+use super::sys;
+use crate::cublaslt::sys::cublasLtMatmulAlgo_t;
+use core::ffi::c_void;
+use core::mem::MaybeUninit;
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub struct CublasError(pub sys::cublasStatus_t);
+
+impl sys::cublasStatus_t {
+    /// Converts a raw status code into a `Result`: `CUBLAS_STATUS_SUCCESS` becomes `Ok(())`,
+    /// and every other status is returned as `Err`, wrapped in a [CublasError].
+    pub fn result(self) -> Result<(), CublasError> {
+        if matches!(self, sys::cublasStatus_t::CUBLAS_STATUS_SUCCESS) {
+            Ok(())
+        } else {
+            Err(CublasError(self))
+        }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::fmt::Display for CublasError {
+    /// Renders the error using its `Debug` representation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{:?}", self))
+    }
+}
+
+// Marks `CublasError` as a standard error type when the `std` feature is enabled.
+#[cfg(feature = "std")]
+impl std::error::Error for CublasError {}
+
+/// Initializes a new cuBLASLt library context and returns its handle. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltcreate)
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by `cublasLtCreate`.
+pub fn create_handle() -> Result<sys::cublasLtHandle_t, CublasError> {
+    let mut raw = MaybeUninit::uninit();
+    // SAFETY: `raw` is a live local, so its pointer is a valid out-parameter for this call.
+    let status = unsafe { sys::cublasLtCreate(raw.as_mut_ptr()) };
+    status.result()?;
+    // SAFETY: only reached when the call reported success, in which case the out-parameter
+    // is assumed to have been written by the library.
+    Ok(unsafe { raw.assume_init() })
+}
+
+/// Releases a cuBLASLt handle that was obtained from [create_handle()]. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltdestroy)
+///
+/// # Safety
+///
+/// `handle` must not have been freed already.
+pub unsafe fn destroy_handle(handle: sys::cublasLtHandle_t) -> Result<(), CublasError> {
+    let status = sys::cublasLtDestroy(handle);
+    status.result()
+}
+
+/// Allocates a matrix layout descriptor for a matrix of the given element type and
+/// dimensions. `matrix_type`, `rows`, `cols` and `ld` are forwarded unchanged to
+/// `cublasLtMatrixLayoutCreate`. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatrixlayoutcreate)
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by the call.
+pub fn create_matrix_layout(
+    matrix_type: sys::cudaDataType,
+    rows: u64,
+    cols: u64,
+    ld: i64,
+) -> Result<sys::cublasLtMatrixLayout_t, CublasError> {
+    let mut layout = MaybeUninit::uninit();
+    // SAFETY: `layout` is a live local, so its pointer is a valid out-parameter for this call.
+    let status = unsafe {
+        sys::cublasLtMatrixLayoutCreate(layout.as_mut_ptr(), matrix_type, rows, cols, ld)
+    };
+    // SAFETY: the closure only runs when the call reported success, in which case the
+    // out-parameter is assumed to have been written by the library.
+    status.result().map(|()| unsafe { layout.assume_init() })
+}
+
+/// Releases a matrix layout descriptor that was obtained from [create_matrix_layout(...)]. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatrixlayoutdestroy)
+///
+/// # Safety
+///
+/// `matrix_layout` must not have been freed already.
+pub unsafe fn destroy_matrix_layout(
+    matrix_layout: sys::cublasLtMatrixLayout_t,
+) -> Result<(), CublasError> {
+    let status = sys::cublasLtMatrixLayoutDestroy(matrix_layout);
+    status.result()
+}
+
+/// Allocates a matrix multiply descriptor. `compute_type` and `scale_type` are forwarded
+/// unchanged to `cublasLtMatmulDescCreate`. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmuldesccreate)
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by the call.
+pub fn create_matmul_desc(
+    compute_type: sys::cublasComputeType_t,
+    scale_type: sys::cudaDataType,
+) -> Result<sys::cublasLtMatmulDesc_t, CublasError> {
+    let mut desc = MaybeUninit::uninit();
+    // SAFETY: `desc` is a live local, so its pointer is a valid out-parameter for this call.
+    let status =
+        unsafe { sys::cublasLtMatmulDescCreate(desc.as_mut_ptr(), compute_type, scale_type) };
+    match status.result() {
+        // SAFETY: the call reported success, in which case the out-parameter is assumed to
+        // have been written by the library.
+        Ok(()) => Ok(unsafe { desc.assume_init() }),
+        Err(err) => Err(err),
+    }
+}
+
+/// Sets the value of the specified attribute belonging to a previously created matrix multiply
+/// descriptor. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmuldescsetattribute)
+///
+/// `buf` and `buf_size` are forwarded unchanged to `cublasLtMatmulDescSetAttribute`.
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by the call.
+///
+/// # Safety
+///
+/// - `matmul_desc` must be a live descriptor, i.e. one that has been created and not yet
+///   destroyed.
+/// - `buf` must be valid for reads of `buf_size` bytes and must hold a value of the type that
+///   the linked documentation specifies for `attr`.
+pub unsafe fn set_matmul_desc_attribute(
+    matmul_desc: sys::cublasLtMatmulDesc_t,
+    attr: sys::cublasLtMatmulDescAttributes_t,
+    buf: *const c_void,
+    buf_size: usize,
+) -> Result<(), CublasError> {
+    sys::cublasLtMatmulDescSetAttribute(matmul_desc, attr, buf, buf_size).result()
+}
+
+/// Releases a matrix multiply descriptor that was obtained from [create_matmul_desc(...)]. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmuldescdestroy)
+///
+/// # Safety
+///
+/// `matmul_desc` must not have been freed already.
+pub unsafe fn destroy_matmul_desc(
+    matmul_desc: sys::cublasLtMatmulDesc_t,
+) -> Result<(), CublasError> {
+    let status = sys::cublasLtMatmulDescDestroy(matmul_desc);
+    status.result()
+}
+
+/// Allocates a descriptor holding the search preferences used by the matrix multiply
+/// heuristic. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmulpreferencecreate)
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by the call.
+pub fn create_matmul_pref() -> Result<sys::cublasLtMatmulPreference_t, CublasError> {
+    let mut pref = MaybeUninit::uninit();
+    // SAFETY: `pref` is a live local, so its pointer is a valid out-parameter for this call.
+    let status = unsafe { sys::cublasLtMatmulPreferenceCreate(pref.as_mut_ptr()) };
+    status.result()?;
+    // SAFETY: only reached when the call reported success, in which case the out-parameter
+    // is assumed to have been written by the library.
+    let pref = unsafe { pref.assume_init() };
+    Ok(pref)
+}
+
+/// Sets the value of the specified attribute belonging to a previously created matrix multiply
+/// preferences descriptor. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmulpreferencesetattribute)
+///
+/// `buf` and `buf_size` are forwarded unchanged to `cublasLtMatmulPreferenceSetAttribute`.
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by the call.
+///
+/// # Safety
+///
+/// - `matmul_pref` must be a live preferences descriptor, i.e. one that has been created and
+///   not yet destroyed.
+/// - `buf` must be valid for reads of `buf_size` bytes and must hold a value of the type that
+///   the linked documentation specifies for `attr`.
+pub unsafe fn set_matmul_pref_attribute(
+    matmul_pref: sys::cublasLtMatmulPreference_t,
+    attr: sys::cublasLtMatmulPreferenceAttributes_t,
+    buf: *const c_void,
+    buf_size: usize,
+) -> Result<(), CublasError> {
+    sys::cublasLtMatmulPreferenceSetAttribute(matmul_pref, attr, buf, buf_size).result()
+}
+
+/// Releases a matrix multiply preferences descriptor that was obtained
+/// from [create_matmul_pref()]. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmulpreferencedestroy)
+///
+/// # Safety
+///
+/// `matmul_pref` must not have been freed already.
+pub unsafe fn destroy_matmul_pref(
+    matmul_pref: sys::cublasLtMatmulPreference_t,
+) -> Result<(), CublasError> {
+    let status = sys::cublasLtMatmulPreferenceDestroy(matmul_pref);
+    status.result()
+}
+
+/// Asks the cuBLASLt heuristic for a single algorithm candidate for the matrix multiply
+/// described by `matmul_desc`, the layouts of the inputs A, B and C, and the layout of the
+/// output D. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmulalgogetheuristic)
+///
+/// # Errors
+///
+/// - Any non-success status reported by `cublasLtMatmulAlgoGetHeuristic`.
+/// - `CUBLAS_STATUS_NOT_SUPPORTED` when the heuristic returns zero candidates.
+/// - The non-success `state` recorded inside the returned heuristic result, if any.
+///
+/// NOTE(review): this function is not marked `unsafe`, yet it forwards every handle and
+/// descriptor to the FFI call unchecked, so callers are responsible for passing live ones.
+/// Consider making it an `unsafe fn` in the next breaking release.
+pub fn get_matmul_algo_heuristic(
+    handle: sys::cublasLtHandle_t,
+    matmul_desc: sys::cublasLtMatmulDesc_t,
+    a_layout: sys::cublasLtMatrixLayout_t,
+    b_layout: sys::cublasLtMatrixLayout_t,
+    c_layout: sys::cublasLtMatrixLayout_t,
+    d_layout: sys::cublasLtMatrixLayout_t,
+    matmul_pref: sys::cublasLtMatmulPreference_t,
+) -> Result<sys::cublasLtMatmulHeuristicResult_t, CublasError> {
+    let mut best = MaybeUninit::uninit();
+    let mut found = 0;
+
+    // SAFETY: `best` and `found` are live locals, so both out-pointers are valid for the call.
+    // Validity of the handle and descriptors is the caller's responsibility (see NOTE above).
+    let status = unsafe {
+        sys::cublasLtMatmulAlgoGetHeuristic(
+            handle,
+            matmul_desc,
+            a_layout,
+            b_layout,
+            c_layout,
+            d_layout,
+            matmul_pref,
+            1, // request exactly one candidate
+            best.as_mut_ptr(),
+            &mut found,
+        )
+    };
+    status.result()?;
+
+    // A successful call that produced no candidate is reported as "not supported".
+    if found == 0 {
+        let not_supported = sys::cublasStatus_t::CUBLAS_STATUS_NOT_SUPPORTED;
+        return Err(CublasError(not_supported));
+    }
+
+    // SAFETY: the call succeeded and reported at least one result, so the single requested
+    // slot is assumed to have been written by the library.
+    let best = unsafe { best.assume_init() };
+    // The candidate carries its own status; surface it before handing the result back.
+    best.state.result().map(|()| best)
+}
+
+/// Computes the matrix multiplication of matrices A and B to produce the output matrix D,
+/// according to the following operation: D = alpha*(A*B) + beta*(C)
+/// where A, B, and C are input matrices, and alpha and beta are input scalars. See
+/// [nvidia docs](https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmul)
+///
+/// Note that this wrapper takes `alpha` and `beta` next to each other, whereas the underlying
+/// `cublasLtMatmul` call receives `beta` after `b_layout`; the body reorders them accordingly.
+///
+/// # Errors
+///
+/// Returns a [CublasError] wrapping any non-success status reported by `cublasLtMatmul`.
+///
+/// # Safety
+///
+/// Every argument is forwarded to `cublasLtMatmul` without any checks, so the caller must
+/// ensure that:
+///
+/// - `handle`, `matmul_desc` and all four layouts are live, i.e. created and not yet
+///   destroyed.
+/// - `alpha`, `beta`, `a`, `b`, `c`, `d`, `algo` and `workspace` meet the requirements that
+///   the linked documentation states for the operation described by `matmul_desc` and the
+///   layouts, including the memory space each pointer must live in.
+/// - `workspace` is valid for `workspace_size` bytes.
+/// - `stream` is a valid stream for the underlying call.
+#[allow(clippy::too_many_arguments)] // mirrors the parameter list of the `cublasLtMatmul` FFI call
+pub unsafe fn matmul(
+    handle: sys::cublasLtHandle_t,
+    matmul_desc: sys::cublasLtMatmulDesc_t,
+    alpha: *const c_void,
+    beta: *const c_void,
+    a: *const c_void,
+    a_layout: sys::cublasLtMatrixLayout_t,
+    b: *const c_void,
+    b_layout: sys::cublasLtMatrixLayout_t,
+    c: *const c_void,
+    c_layout: sys::cublasLtMatrixLayout_t,
+    d: *mut c_void,
+    d_layout: sys::cublasLtMatrixLayout_t,
+    algo: *const cublasLtMatmulAlgo_t,
+    workspace: *mut c_void,
+    workspace_size: usize,
+    stream: sys::cudaStream_t,
+) -> Result<(), CublasError> {
+    sys::cublasLtMatmul(
+        handle,
+        matmul_desc,
+        alpha,
+        a,
+        a_layout,
+        b,
+        b_layout,
+        beta, // the FFI signature places `beta` here, after B and before C
+        c,
+        c_layout,
+        d,
+        d_layout,
+        algo,
+        workspace,
+        workspace_size,
+        stream,
+    )
+    .result()
+}