
Subgroup Level Bluestein Algorithm #145

Status: Open. Wants to merge 23 commits into base: main. Changes shown are from 22 of the 23 commits.

Commits:
cef2ad0
subgroup level batch_interleaved and packed bluestein algorithm working
AD2605 Mar 8, 2024
8cc05db
resolve warnings and add backward and multi-dim subgroup level small …
AD2605 Mar 8, 2024
5237a37
further changes
AD2605 Mar 9, 2024
5292f41
modifier offset calculation bugfix
AD2605 Mar 11, 2024
a3aa5d6
initialize local memory with zeros to avoid nans
AD2605 Mar 12, 2024
5b89d10
format
AD2605 Mar 12, 2024
b0d1661
not copy in between an aligned pointer
AD2605 Mar 12, 2024
5afe834
prevent OOB read/writes in packed format
AD2605 Mar 12, 2024
8a83350
prevent OOB read writes in PACKED bluestein condition
AD2605 Mar 13, 2024
eb3291f
Merge remote-tracking branch 'origin/main' into atharva/sg_wg_bluestein
AD2605 Mar 13, 2024
011e780
refactor
AD2605 Mar 13, 2024
8c3b40b
bugfix after refactor
AD2605 Mar 13, 2024
20e3c78
doxygens and logging, and lower tolerance value
AD2605 Mar 14, 2024
ae929d3
remove unused shuffle_transpose function
AD2605 Mar 14, 2024
708893c
remove unused bluestein header from workgroup_dispatcher.hpp
AD2605 Mar 14, 2024
594d224
remove unused headers from subgroup.hpp
AD2605 Mar 14, 2024
5f1ab4d
add missing array header
AD2605 Mar 14, 2024
99d8cfb
slightly bump tolerance value for tests to pass on Nvidia
AD2605 Mar 14, 2024
243c793
format
AD2605 Mar 14, 2024
3a77953
review comments 1
AD2605 Mar 18, 2024
cfd2ab8
updated doxygens
AD2605 Mar 18, 2024
d0e705d
addressed missed comments from first round of review
AD2605 Mar 18, 2024
823b84f
prevent OOB read from global memory
AD2605 Mar 19, 2024
131 changes: 93 additions & 38 deletions src/portfft/committed_descriptor_impl.hpp
@@ -30,7 +30,7 @@
#include <vector>

#include "common/exceptions.hpp"
#include "common/subgroup.hpp"
#include "common/subgroup_ct.hpp"
#include "defines.hpp"
#include "enums.hpp"
#include "specialization_constant.hpp"
@@ -89,8 +89,8 @@ class committed_descriptor_impl {
Scalar1* output, const TIn& input_imag, Scalar1* output_imag, const Scalar1* twiddles_ptr,
const IdxGlobal* factors_triple, IdxGlobal intermediate_twiddle_offset, IdxGlobal subimpl_twiddle_offset,
IdxGlobal input_global_offset, IdxGlobal committed_size, Idx num_batches_in_l2, IdxGlobal n_transforms,
IdxGlobal batch_start, Idx factor_id, Idx total_factors, complex_storage storage,
const std::vector<sycl::event>& dependencies, sycl::queue& queue);
IdxGlobal batch_start, Idx total_factors, complex_storage storage, const std::vector<sycl::event>& dependencies,
sycl::queue& queue);

template <typename Scalar1, domain Domain1, typename TOut>
friend sycl::event detail::transpose_level(
@@ -148,18 +148,30 @@ class committed_descriptor_impl {
std::vector<kernel_data_struct> transpose_kernels;
std::shared_ptr<IdxGlobal> factors_and_scan;
detail::level level;
// The size of DFT transform which will be computed for the given dimension. Will be different from the
// committed_length when the Bluestein / Rader algorithms are used
std::size_t length;
// The committed length (as in the user specified length) for the particular dimension
std::size_t committed_length;
Contributor:

Suggested change:
std::size_t committed_length;
std::size_t padded_length;

The other length is also committed, making this variable name confusing.

Contributor:

I still think committed_length is confusing. How about ct_length (standing for Cooley-Tukey)?

Contributor Author:

committed_length is not the Cooley-Tukey length. The committed length is the user-provided length (i.e. the length of the dimension as specified in the descriptor).

Contributor:

In that case the other one could be ct_length. We just need something to make it obvious which is which.

Contributor Author:

It cannot be ct_length; we should not tie it to the name of an algorithm, as it isn't always Cooley-Tukey. I believe length, along with its description above, should be sufficient.

Contributor:

Which other algorithm can it be?

Idx used_sg_size;
Idx num_batches_in_l2;
Idx num_factors;
detail::fft_algorithm algorithm;

dimension_struct(std::vector<kernel_data_struct> forward_kernels, std::vector<kernel_data_struct> backward_kernels,
detail::level level, std::size_t length, Idx used_sg_size)
detail::level level, std::size_t length, std::size_t committed_length, Idx used_sg_size,
detail::fft_algorithm algorithm)
: forward_kernels(std::move(forward_kernels)),
backward_kernels(std::move(backward_kernels)),
level(level),
length(length),
used_sg_size(used_sg_size) {}
committed_length(committed_length),
used_sg_size(used_sg_size),
algorithm(algorithm) {
if (algorithm == detail::fft_algorithm::BLUESTEIN && level != detail::level::SUBGROUP) {
throw unsupported_configuration("Prime sizes that do not fit in the subgroup implementation are not supported");
}
}
};

std::vector<dimension_struct> dimensions;
@@ -203,12 +215,12 @@ class committed_descriptor_impl {
* set of kernels that need to be JIT compiled.
*
* @tparam SubgroupSize size of the subgroup
* @param kernel_num the consecutive number of the kernel to prepare
* @return implementation to use for the dimension and a vector of tuples of: implementation to use for a kernel,
* vector of kernel ids, factors
* @param fft_size The size for which kernel needs to be prepared
* @return implementation to use for the dimension and a vector of tuples of: implementation to use for a kernel, the
* size of the fft for which the implementation was prepared and the vector of kernel ids, factors
*/
template <Idx SubgroupSize>
std::tuple<detail::level, kernel_ids_and_metadata_t> prepare_implementation(std::size_t kernel_num) {
std::tuple<detail::level, std::size_t, kernel_ids_and_metadata_t> prepare_implementation(IdxGlobal fft_size) {
Contributor:

I do not think the change of parameter type to IdxGlobal makes sense here, as the function needs to cast it back to std::size_t anyway.

Contributor Author:

It's to avoid additional casts inside prepare_implementation, which currently works with IdxGlobal. Note that even previously we used to cast the size to IdxGlobal inside prepare_implementation; this version directly accepts an IdxGlobal instead.

It returns a std::size_t because a lot of our calculation pertaining to that field works on std::size_t, hence returning std::size_t to avoid casts there.

Contributor:

We use std::size_t in the public interface and IdxGlobal in kernels. Somewhere in between we need to cast from one type to the other. I just think this should happen only once, without any casts back to std::size_t. I am not particularly attached to where that is.

Contributor Author:

The field length in the dimension_struct is stored as std::size_t, which is passed to other functions like num_scalars_in_local_mem, which accepts it as a std::size_t, hence the cast back to std::size_t upon return.

We used to cast the fft_size to IdxGlobal inside prepare_implementation previously, presumably to avoid warnings, and seeing that, I simply made it accept IdxGlobal.

Contributor:

But that requires additional casts, which make the code harder to read. Make it so that we only cast to IdxGlobal, not back.

PORTFFT_LOG_FUNCTION_ENTRY();
// TODO: check and support all the parameter values
if constexpr (Domain != domain::COMPLEX) {
@@ -217,11 +229,12 @@

std::vector<sycl::kernel_id> ids;
std::vector<Idx> factors;
IdxGlobal fft_size = static_cast<IdxGlobal>(params.lengths[kernel_num]);
if (detail::fits_in_wi<Scalar>(fft_size)) {
ids = detail::get_ids<detail::workitem_kernel, Scalar, Domain, SubgroupSize>();
PORTFFT_LOG_TRACE("Prepared workitem impl for size: ", fft_size);
return {detail::level::WORKITEM, {{detail::level::WORKITEM, ids, factors}}};
return {detail::level::WORKITEM,
static_cast<std::size_t>(fft_size),
{{detail::level::WORKITEM, ids, {static_cast<Idx>(fft_size)}}}};
}
if (detail::fits_in_sg<Scalar>(fft_size, SubgroupSize)) {
Idx factor_sg = detail::factorize_sg(static_cast<Idx>(fft_size), SubgroupSize);
@@ -232,14 +245,11 @@
factors.push_back(factor_sg);
ids = detail::get_ids<detail::subgroup_kernel, Scalar, Domain, SubgroupSize>();
PORTFFT_LOG_TRACE("Prepared subgroup impl with factor_wi:", factor_wi, "and factor_sg:", factor_sg);
return {detail::level::SUBGROUP, {{detail::level::SUBGROUP, ids, factors}}};
return {detail::level::SUBGROUP, static_cast<std::size_t>(fft_size), {{detail::level::SUBGROUP, ids, factors}}};
}
IdxGlobal n_idx_global = detail::factorize(fft_size);
if (detail::can_cast_safely<IdxGlobal, Idx>(n_idx_global) &&
detail::can_cast_safely<IdxGlobal, Idx>(fft_size / n_idx_global)) {
if (n_idx_global == 1) {
throw unsupported_configuration("FFT size ", fft_size, " : Large Prime sized FFT currently is unsupported");
}
Idx n = static_cast<Idx>(n_idx_global);
Idx m = static_cast<Idx>(fft_size / n_idx_global);
Idx factor_sg_n = detail::factorize_sg(n, SubgroupSize);
@@ -265,7 +275,8 @@
ids = detail::get_ids<detail::workgroup_kernel, Scalar, Domain, SubgroupSize>();
PORTFFT_LOG_TRACE("Prepared workgroup impl with factor_wi_n:", factor_wi_n, " factor_sg_n:", factor_sg_n,
" factor_wi_m:", factor_wi_m, " factor_sg_m:", factor_sg_m);
return {detail::level::WORKGROUP, {{detail::level::WORKGROUP, ids, factors}}};
return {
detail::level::WORKGROUP, static_cast<std::size_t>(fft_size), {{detail::level::WORKGROUP, ids, factors}}};
}
}
PORTFFT_LOG_TRACE("Preparing global impl");
@@ -286,12 +297,10 @@
if (detail::can_cast_safely<IdxGlobal, Idx>(factor_sg) && detail::can_cast_safely<IdxGlobal, Idx>(factor_wi)) {
std::size_t input_scalars =
num_scalars_in_local_mem(detail::level::SUBGROUP, static_cast<std::size_t>(factor_size), SubgroupSize,
{static_cast<Idx>(factor_sg), static_cast<Idx>(factor_wi)}, temp_num_sgs_in_wg,
{static_cast<Idx>(factor_wi), static_cast<Idx>(factor_sg)}, temp_num_sgs_in_wg,
batch_interleaved_layout ? layout::BATCH_INTERLEAVED : layout::PACKED);
std::size_t store_modifiers = batch_interleaved_layout ? input_scalars : 0;
std::size_t twiddle_scalars = 2 * static_cast<std::size_t>(factor_size);
return (sizeof(Scalar) * (input_scalars + store_modifiers + twiddle_scalars)) <
static_cast<std::size_t>(local_memory_size);
return (sizeof(Scalar) * (input_scalars + twiddle_scalars)) <= static_cast<std::size_t>(local_memory_size);
}
return false;
}();
@@ -303,13 +312,17 @@
"and factor_sg:", factor_sg);
param_vec.emplace_back(detail::level::SUBGROUP,
detail::get_ids<detail::global_kernel, Scalar, Domain, SubgroupSize>(),
std::vector<Idx>{factor_sg, factor_wi});
std::vector<Idx>{factor_wi, factor_sg});
return true;
}
return false;
};
detail::factorize_input(fft_size, check_and_select_target_level);
return {detail::level::GLOBAL, param_vec};
bool encountered_large_prime = detail::factorize_input(fft_size, check_and_select_target_level);
if (encountered_large_prime) {
IdxGlobal padded_size = detail::get_bluestein_padded_size(fft_size);
return prepare_implementation<SubgroupSize>(padded_size);
}
return {detail::level::GLOBAL, static_cast<std::size_t>(fft_size), param_vec};
}
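The diff above falls back to a Bluestein-padded size when factorize_input encounters a large prime, but get_bluestein_padded_size itself is not shown in this hunk. As a hedged sketch (the function name and exact rule are assumptions, not confirmed by this diff), a common padding rule for Bluestein is the smallest power of two that is at least 2n - 1:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical sketch of a Bluestein padding rule: the smallest power of
// two that is at least 2*n - 1. A power of two keeps the padded FFT on a
// fast, well-factorizable path, and 2*n - 1 samples are the minimum so
// the linear convolution of two length-n sequences fits in the cyclic
// convolution without aliasing.
std::uint64_t bluestein_padded_size(std::uint64_t n) {
  std::uint64_t target = 2 * n - 1;
  std::uint64_t padded = 1;
  while (padded < target) {
    padded *= 2;
  }
  return padded;
}
```

With such a rule, the recursive call to prepare_implementation always receives a highly composite size, so the padded transform never hits the large-prime branch again.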

/**
@@ -463,6 +476,7 @@
const bool is_global = top_level == detail::level::GLOBAL;
const bool is_final_factor = counter == (prepared_vec.size() - 1);
const bool is_final_dim = dimension_num == (params.lengths.size() - 1);
const Idx factor_size = std::accumulate(factors.begin(), factors.end(), 1, std::multiplies<Idx>());
const bool is_backward = compute_direction == direction::BACKWARD;
if (is_multi_dim && is_global) {
throw unsupported_configuration("multidimensional global transforms are not supported.");
@@ -477,29 +491,26 @@
const auto apply_scale = is_final_factor && is_final_dim ? detail::apply_scale_factor::APPLIED
: detail::apply_scale_factor::NOT_APPLIED;

Idx length{};
IdxGlobal forward_stride{};
IdxGlobal backward_stride{};
IdxGlobal forward_distance{};
IdxGlobal backward_distance{};

if (is_global) {
length = std::accumulate(factors.begin(), factors.end(), Idx(1), std::multiplies<Idx>());

remaining_factors_prod /= length;
remaining_factors_prod /= factor_size;
forward_stride = remaining_factors_prod;
backward_stride = remaining_factors_prod;
forward_distance = is_final_factor ? length : 1;
backward_distance = is_final_factor ? length : 1;
forward_distance = is_final_factor ? factor_size : 1;
backward_distance = is_final_factor ? factor_size : 1;

} else {
length = static_cast<Idx>(params.lengths[dimension_num]);
Idx committed_length = static_cast<Idx>(params.lengths[dimension_num]);
forward_stride = static_cast<IdxGlobal>(params.forward_strides[dimension_num]);
backward_stride = static_cast<IdxGlobal>(params.backward_strides[dimension_num]);
if (is_multi_dim) {
if (is_final_dim) {
forward_distance = length;
backward_distance = length;
forward_distance = committed_length;
backward_distance = committed_length;
} else {
forward_distance = 1;
backward_distance = 1;
@@ -517,14 +528,34 @@

auto in_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(queue.get_context(), ids);

set_spec_constants(top_level, in_bundle, length, factors, detail::elementwise_multiply::NOT_APPLIED,
if (factor_size != static_cast<Idx>(params.lengths[dimension_num]) && !is_global) {
in_bundle.template set_specialization_constant<detail::SpecConstFFTAlgorithm>(detail::fft_algorithm::BLUESTEIN);
in_bundle.template set_specialization_constant<detail::SpecConstCommittedLength>(
static_cast<Idx>(params.lengths[dimension_num]));
} else {
// TODO: This needs to change in the case of global
if (is_global) {
in_bundle.template set_specialization_constant<detail::SpecConstCommittedLength>(
static_cast<Idx>(factor_size));
} else {
in_bundle.template set_specialization_constant<detail::SpecConstCommittedLength>(
static_cast<Idx>(params.lengths[dimension_num]));
}

in_bundle.template set_specialization_constant<detail::SpecConstFFTAlgorithm>(
detail::fft_algorithm::COOLEY_TUKEY);
}

set_spec_constants(top_level, in_bundle, factor_size, factors, detail::elementwise_multiply::NOT_APPLIED,
multiply_on_store, apply_scale, level, conjugate_on_load, conjugate_on_store, scale_factor,
input_stride, output_stride, input_distance, output_distance, static_cast<Idx>(counter),
static_cast<Idx>(prepared_vec.size()));
try {
PORTFFT_LOG_TRACE("Building kernel bundle with subgroup size", SubgroupSize);
result.emplace_back(sycl::build(in_bundle), factors, params.lengths[dimension_num], SubgroupSize,
PORTFFT_SGS_IN_WG, std::shared_ptr<Scalar>(), level);
result.emplace_back(
sycl::build(in_bundle), factors,
static_cast<std::size_t>(std::accumulate(factors.begin(), factors.end(), 1, std::multiplies<Idx>())),
SubgroupSize, PORTFFT_SGS_IN_WG, std::shared_ptr<Scalar>(), level);
PORTFFT_LOG_TRACE("Kernel bundle build complete.");
} catch (std::exception& e) {
PORTFFT_LOG_WARNING("Build for subgroup size", SubgroupSize, "failed with message:\n", e.what());
@@ -549,7 +580,8 @@
dimension_struct build_w_spec_const(std::size_t dimension_num) {
PORTFFT_LOG_FUNCTION_ENTRY();
if (std::count(supported_sg_sizes.begin(), supported_sg_sizes.end(), SubgroupSize)) {
auto [top_level, prepared_vec] = prepare_implementation<SubgroupSize>(dimension_num);
auto [top_level, fft_size, prepared_vec] =
prepare_implementation<SubgroupSize>(static_cast<IdxGlobal>(params.lengths[dimension_num]));
bool is_compatible = true;
for (auto [level, ids, factors] : prepared_vec) {
is_compatible = is_compatible && sycl::is_compatible(ids, dev);
@@ -563,9 +595,23 @@
set_spec_constants_driver<SubgroupSize>(top_level, prepared_vec, direction::FORWARD, dimension_num);
auto backward_kernels =
set_spec_constants_driver<SubgroupSize>(top_level, prepared_vec, direction::BACKWARD, dimension_num);
detail::fft_algorithm algorithm;
if (fft_size == params.lengths[dimension_num]) {
algorithm = detail::fft_algorithm::COOLEY_TUKEY;
} else if (fft_size > params.lengths[dimension_num]) {
algorithm = detail::fft_algorithm::BLUESTEIN;
} else {
throw internal_error("Invalid FFT size encountered while preparing the implementation");
}

if (forward_kernels.has_value() && backward_kernels.has_value()) {
return {forward_kernels.value(), backward_kernels.value(), top_level, params.lengths[dimension_num],
SubgroupSize};
return {forward_kernels.value(),
backward_kernels.value(),
top_level,
fft_size,
params.lengths[dimension_num],
SubgroupSize,
algorithm};
}
}
}
@@ -912,6 +958,15 @@
const auto input_layout = detail::get_layout(params, compute_direction);
const auto output_layout = detail::get_layout(params, inv(compute_direction));

if (dimensions.back().algorithm == detail::fft_algorithm::BLUESTEIN) {
if (input_layout == detail::layout::UNPACKED || output_layout == detail::layout::UNPACKED) {
throw unsupported_configuration("Unsupported configuration for prime sized DFTs");
}
if (input_layout == detail::layout::PACKED && output_layout != detail::layout::PACKED) {
throw unsupported_configuration("Unsupported configuration for prime sized DFTs");
}
}

// currently multi-dimensional transforms are implemented just for default (PACKED) data layout
const bool multi_dim_supported = input_layout == detail::layout::PACKED && output_layout == detail::layout::PACKED;
if (n_dimensions != 1 && !multi_dim_supported) {
76 changes: 76 additions & 0 deletions src/portfft/common/bluestein.hpp
@@ -0,0 +1,76 @@
/***************************************************************************
*
* Copyright (C) Codeplay Software Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Codeplay's portFFT
*
**************************************************************************/

#ifndef PORTFFT_COMMON_BLUESTEIN_HPP
#define PORTFFT_COMMON_BLUESTEIN_HPP

#include "portfft/common/host_dft.hpp"
#include "portfft/defines.hpp"

#include <complex>
#include <sycl/sycl.hpp>

namespace portfft {
namespace detail {
/**
* Utility function to get the dft transform of the chirp signal
* @tparam T Scalar Type
* @param ptr Host Pointer containing the load/store modifiers.
* @param committed_size original problem size
* @param dimension_size padded size
*/
template <typename T>
void populate_fft_chirp_signal(T* ptr, std::size_t committed_size, std::size_t dimension_size) {
using complex_t = std::complex<T>;
std::vector<complex_t> chirp_signal(dimension_size, 0);
std::vector<complex_t> chirp_fft(dimension_size, 0);
for (std::size_t i = 0; i < committed_size; i++) {
double theta = M_PI * static_cast<double>(i * i) / static_cast<double>(committed_size);
chirp_signal[i] = complex_t(static_cast<T>(std::cos(theta)), static_cast<T>(std::sin(theta)));
}
std::size_t num_zeros = dimension_size - 2 * committed_size + 1;
for (std::size_t i = 1; i < committed_size; i++) {
chirp_signal[committed_size + num_zeros + i - 1] = chirp_signal[committed_size - i];
}
Comment on lines +50 to +51

Contributor:

I'm confused by this. I thought the signal was just zero-padded?

Contributor Author:

The chirp signal is wrapped around before computing its DFT, because the convolution requires the chirp at both positive and negative index values.

Contributor Author:

Note that the chirp signal is the one which is convolved with the input in the Bluestein algorithm.
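To illustrate the wrap-around being discussed, here is a standalone restatement of the loop from populate_fft_chirp_signal (wrapped_chirp is an illustrative name, not a function from this PR): the chirp b[k] = exp(i*pi*k^2/n) is even in k, and its value at the negative index -k is stored at the wrapped position m - k of the padded buffer, with zeros in between.

```cpp
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Sketch of the chirp wrap-around: for a committed size n and padded
// size m, indices 0..n-1 hold b[k] = exp(i*pi*k^2/n), indices m-1 down
// to m-n+1 hold the "negative index" values b[-k] (== b[k], since the
// chirp is even in k), and the middle stays zero.
std::vector<std::complex<double>> wrapped_chirp(std::size_t n, std::size_t m) {
  std::vector<std::complex<double>> b(m, {0.0, 0.0});
  for (std::size_t k = 0; k < n; ++k) {
    double theta = M_PI * static_cast<double>(k * k) / static_cast<double>(n);
    b[k] = {std::cos(theta), std::sin(theta)};
    if (k != 0) {
      b[m - k] = b[k];  // b[-k] wraps around to index m - k
    }
  }
  return b;
}
```

The index arithmetic in the diff (committed_size + num_zeros + i - 1, with num_zeros = m - 2n + 1) places element chirp_signal[n - i] at position m - (n - i), which is the same m - k placement as above.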

host_cooley_tukey(chirp_signal.data(), chirp_fft.data(), dimension_size);
std::memcpy(ptr, reinterpret_cast<T*>(chirp_fft.data()), 2 * dimension_size * sizeof(T));
}

/**
* Populates input modifiers required for bluestein
* @tparam T Scalar Type
* @param ptr Host Pointer containing the load/store modifiers.
* @param committed_size committed problem length
* @param dimension_size padded dft length
*/
template <typename T>
void populate_bluestein_input_modifiers(T* ptr, std::size_t committed_size, std::size_t dimension_size) {
using complex_t = std::complex<T>;
std::vector<complex_t> scratch(dimension_size, 0);
for (std::size_t i = 0; i < committed_size; i++) {
double theta = -M_PI * static_cast<double>(i * i) / static_cast<double>(committed_size);
scratch[i] = complex_t(static_cast<T>(std::cos(theta)), static_cast<T>(std::sin(theta)));
}
std::memcpy(ptr, reinterpret_cast<T*>(scratch.data()), 2 * dimension_size * sizeof(T));
}
} // namespace detail
} // namespace portfft

#endif
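The two modifier tables above implement the Bluestein identity X[k] = a[k] * sum_j (x[j] * a[j]) * b[k - j], with input modifier a[j] = exp(-i*pi*j^2/n) (the theta with a minus sign in populate_bluestein_input_modifiers) and chirp b[k] = exp(+i*pi*k^2/n) (populate_fft_chirp_signal). A hedged sketch checking that identity against a direct DFT follows; naive_dft and bluestein_dft are illustrative names, and the convolution is evaluated directly here rather than via the padded-FFT route the library uses:

```cpp
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

using cd = std::complex<double>;

// Reference O(n^2) DFT, used only to check the Bluestein identity below.
std::vector<cd> naive_dft(const std::vector<cd>& x) {
  std::size_t n = x.size();
  std::vector<cd> out(n);
  for (std::size_t k = 0; k < n; ++k) {
    cd acc{0.0, 0.0};
    for (std::size_t j = 0; j < n; ++j) {
      double theta = -2.0 * M_PI * static_cast<double>(j * k) / static_cast<double>(n);
      acc += x[j] * cd{std::cos(theta), std::sin(theta)};
    }
    out[k] = acc;
  }
  return out;
}

// Bluestein via the identity jk = (j^2 + k^2 - (k - j)^2) / 2:
// X[k] = a[k] * sum_j (x[j] * a[j]) * b[k - j], a[j] = exp(-i*pi*j^2/n),
// b[d] = exp(+i*pi*d^2/n). The sum is a linear convolution, which the
// library evaluates as a cyclic convolution of padded length via FFTs.
std::vector<cd> bluestein_dft(const std::vector<cd>& x) {
  std::size_t n = x.size();
  auto a = [n](std::size_t j) {
    double theta = -M_PI * static_cast<double>(j * j) / static_cast<double>(n);
    return cd{std::cos(theta), std::sin(theta)};
  };
  std::vector<cd> out(n);
  for (std::size_t k = 0; k < n; ++k) {
    cd acc{0.0, 0.0};
    for (std::size_t j = 0; j < n; ++j) {
      long d = static_cast<long>(k) - static_cast<long>(j);
      double theta = M_PI * static_cast<double>(d * d) / static_cast<double>(n);
      acc += x[j] * a(j) * cd{std::cos(theta), std::sin(theta)};
    }
    out[k] = acc * a(k);
  }
  return out;
}
```

Expanding the phases confirms the identity term by term: a[j] * b[k-j] * a[k] carries exp(i*pi*(-j^2 + (k-j)^2 - k^2)/n) = exp(-2*i*pi*j*k/n), the DFT twiddle.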