diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h
index 2a900a19d..b29ab353b 100644
--- a/include/cutlass/cutlass.h
+++ b/include/cutlass/cutlass.h
@@ -46,11 +46,6 @@
 #pragma once
 
 #include "cutlass/detail/helper_macros.hpp"
-
-#if defined(CUTLASS_ENABLE_SYCL)
-#include "syclcompat.hpp"
-#endif
-
 #include <cutlass/gpu_generics.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h
index 6d9e152d5..e4efcb326 100644
--- a/include/cutlass/gpu_generics.h
+++ b/include/cutlass/gpu_generics.h
@@ -36,6 +36,11 @@
  * frameworks such as CUDA and SYCL.
  */
 
+#if defined(CUTLASS_ENABLE_SYCL)
+#include <sycl/sycl.hpp>
+#include <syclcompat.hpp>
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 static const int NumThreadsPerWarp = 32;
diff --git a/include/cutlass/matrix.h b/include/cutlass/matrix.h
index ab32597e3..5d8ccb3c1 100644
--- a/include/cutlass/matrix.h
+++ b/include/cutlass/matrix.h
@@ -7825,7 +7825,7 @@ struct Matrix<Element_, 3, 3> {
 
     Matrix m;
 
-    m.set_slice3x3({
+    m.set_slice_3x3({  // entry (1,0) mirrors (0,1): same product, opposite sign on the z * s term
       c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
-      y * x * one_minus_cos * z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
+      y * x * one_minus_cos + z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
       z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
@@ -7845,7 +7845,7 @@ struct Matrix<Element_, 3, 3> {
 
     Matrix m = Matrix::identity();
 
-    m.set_slice3x3({
+    m.set_slice_3x3({  // entries below are I - 2 * n * n^T for n = (a, b, c)
       Element(1) - Element(2) * a * a, Element(-2) * a * b, Element(-2) * a * c,
       Element(-2) * a * b, Element(1) - Element(2) * b * b, Element(-2) * b * c,
       Element(-2) * a * c, Element(-2) * b * c, Element(1) - Element(2) * c * c
@@ -14005,7 +14005,7 @@ struct Matrix<Element_, 4, 4> {
 
     Matrix m;
 
-    m.set_slice3x3({
+    m.set_slice_3x3({  // entry (1,0) mirrors (0,1): same product, opposite sign on the z * s term
       c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
-      y * x * one_minus_cos * z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
+      y * x * one_minus_cos + z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
       z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
@@ -14025,7 +14025,7 @@ struct Matrix<Element_, 4, 4> {
 
     Matrix m = Matrix::identity();
 
-    m.set_slice3x3({
+    m.set_slice_3x3({  // entries below are I - 2 * n * n^T for n = (a, b, c)
       Element(1) - Element(2) * a * a, Element(-2) * a * b, Element(-2) * a * c,
       Element(-2) * a * b, Element(1) - Element(2) * b * b, Element(-2) * b * c,
       Element(-2) * a * c, Element(-2) * b * c, Element(1) - Element(2) * c * c
diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h
index 66262ac00..0b78640be 100644
--- a/tools/util/include/cutlass/util/device_memory.h
+++ b/tools/util/include/cutlass/util/device_memory.h
@@ -56,13 +56,14 @@ T* allocate(size_t count = 1) {
   T* ptr = 0;
 
 #if defined(CUTLASS_ENABLE_SYCL)
-  ptr = syclcompat::malloc<T>(count);
-  if (ptr == nullptr) {
-    throw std::runtime_error("Failed to allocate memory");
+  if (count > 0) {  // a zero-element request skips the allocator and leaves ptr null
+    ptr = static_cast<T*>(syclcompat::malloc(count * sizeof(T)));  // size in bytes; `bytes` exists only in the #else branch
+    if (ptr == nullptr) {
+      throw std::runtime_error("Failed to allocate memory");
+    }
   }
 #else
   size_t bytes = 0;
-
   bytes = count * sizeof(T);
 
   cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
@@ -78,7 +79,5 @@ T* allocate(size_t count = 1) {
 template <typename T>
 void free(T* ptr) {
 #if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::free((void*)ptr);
-    if (ptr != nullptr) {
-      throw std::runtime_error("Failed to free device memory");
-    }
+    // No post-check: `ptr` is a by-value parameter, so testing it after the call says nothing about the free.
+    syclcompat::free(ptr);