Skip to content

Commit

Permalink
Merge pull request #39321 from fwyzard/cuda_backport_alpaka_tests
Browse files Browse the repository at this point in the history
Reimplement the PortableCollection and related tests in CUDA [12.5.x]
  • Loading branch information
cmsbuild committed Sep 11, 2022
2 parents 5b55544 + d2e962a commit 75fe478
Show file tree
Hide file tree
Showing 27 changed files with 718 additions and 12 deletions.
1 change: 1 addition & 0 deletions CUDADataFormats/Common/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<iftool name="cuda">
<use name="cuda"/>
<use name="rootcore"/>
<use name="DataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
Expand Down
67 changes: 67 additions & 0 deletions CUDADataFormats/Common/interface/PortableDeviceCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
#define CUDADataFormats_Common_interface_PortableDeviceCollection_h

#include <cassert>
#include <cstdlib>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

namespace cms::cuda {

// generic SoA-based product in device memory
template <typename T>
class PortableDeviceCollection {
public:
using Layout = T;
using View = typename Layout::View;
using ConstView = typename Layout::ConstView;
using Buffer = cms::cuda::device::unique_ptr<std::byte[]>;

PortableDeviceCollection() = default;

PortableDeviceCollection(int32_t elements, cudaStream_t stream)
: buffer_{cms::cuda::make_device_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
layout_{buffer_.get(), elements},
view_{layout_} {
// CUDA device memory uses a default alignment of at least 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

// non-copyable
PortableDeviceCollection(PortableDeviceCollection const&) = delete;
PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;

// movable
PortableDeviceCollection(PortableDeviceCollection&&) = default;
PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;

// default destructor
~PortableDeviceCollection() = default;

// access the View
View& view() { return view_; }
ConstView const& view() const { return view_; }
ConstView const& const_view() const { return view_; }

View& operator*() { return view_; }
ConstView const& operator*() const { return view_; }

View* operator->() { return &view_; }
ConstView const* operator->() const { return &view_; }

// access the Buffer
Buffer& buffer() { return buffer_; }
Buffer const& buffer() const { return buffer_; }
Buffer const& const_buffer() const { return buffer_; }

size_t bufferSize() const { return layout_.metadata().byteSize(); }

private:
Buffer buffer_; //!
Layout layout_; //
View view_; //!
};

} // namespace cms::cuda

#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h
85 changes: 85 additions & 0 deletions CUDADataFormats/Common/interface/PortableHostCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
#define CUDADataFormats_Common_interface_PortableHostCollection_h

#include <cassert>
#include <cstdlib>

#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

namespace cms::cuda {

// generic SoA-based product in host memory
template <typename T>
class PortableHostCollection {
public:
using Layout = T;
using View = typename Layout::View;
using ConstView = typename Layout::ConstView;
using Buffer = cms::cuda::host::unique_ptr<std::byte[]>;

PortableHostCollection() = default;

PortableHostCollection(int32_t elements)
// allocate pageable host memory
: buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements))},
layout_{buffer_.get(), elements},
view_{layout_} {
// make_host_unique for pageable host memory uses a default alignment of 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

PortableHostCollection(int32_t elements, cudaStream_t stream)
// allocate pinned host memory, accessible by the current device
: buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
layout_{buffer_.get(), elements},
view_{layout_} {
// CUDA pinned host memory uses a default alignment of at least 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

// non-copyable
PortableHostCollection(PortableHostCollection const&) = delete;
PortableHostCollection& operator=(PortableHostCollection const&) = delete;

// movable
PortableHostCollection(PortableHostCollection&&) = default;
PortableHostCollection& operator=(PortableHostCollection&&) = default;

// default destructor
~PortableHostCollection() = default;

// access the View
View& view() { return view_; }
ConstView const& view() const { return view_; }
ConstView const& const_view() const { return view_; }

View& operator*() { return view_; }
ConstView const& operator*() const { return view_; }

View* operator->() { return &view_; }
ConstView const* operator->() const { return &view_; }

// access the Buffer
Buffer& buffer() { return buffer_; }
Buffer const& buffer() const { return buffer_; }
Buffer const& const_buffer() const { return buffer_; }

size_t bufferSize() const { return layout_.metadata().byteSize(); }

// part of the ROOT read streamer
static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
newObj->~PortableHostCollection();
// allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams
new (newObj) PortableHostCollection(layout.metadata().size());
newObj->layout_.ROOTReadStreamer(layout);
}

private:
Buffer buffer_; //!
Layout layout_; //
View view_; //!
};

} // namespace cms::cuda

#endif // CUDADataFormats_Common_interface_PortableHostCollection_h
7 changes: 7 additions & 0 deletions CUDADataFormats/PortableTestObjects/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<use name="rootcore"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Common"/>
<use name="DataFormats/PortableTestObjects"/>
<export>
<lib name="1"/>
</export>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h

#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"

namespace cudatest {

// SoA with x, y, z, id fields in device global memory
using TestDeviceCollection = cms::cuda::PortableDeviceCollection<portabletest::TestSoA>;

} // namespace cudatest

#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
14 changes: 14 additions & 0 deletions CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h

#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"

namespace cudatest {

// SoA with x, y, z, id fields in host memory
using TestHostCollection = cms::cuda::PortableHostCollection<portabletest::TestSoA>;

} // namespace cudatest

#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
5 changes: 5 additions & 0 deletions CUDADataFormats/PortableTestObjects/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
#include "DataFormats/Common/interface/Wrapper.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
21 changes: 21 additions & 0 deletions CUDADataFormats/PortableTestObjects/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<lcgdict>
<class name="cudatest::TestHostCollection"/>
<read
sourceClass="cudatest::TestHostCollection"
targetClass="cudatest::TestHostCollection"
version="[1-]"
source="portabletest::TestSoA layout_;"
target="buffer_"
embed="false">
<![CDATA[
cudatest::TestHostCollection::ROOTReadStreamer(newObj, onfile.layout_);
]]>
</read>
<class name="edm::Wrapper<cudatest::TestHostCollection>" splitLevel="0"/>

<class name="cudatest::TestDeviceCollection" persistent="false"/>
<class name="edm::Wrapper<cudatest::TestDeviceCollection>" persistent="false"/>

<class name="cms::cuda::Product<cudatest::TestDeviceCollection>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<cudatest::TestDeviceCollection>>" persistent="false"/>
</lcgdict>
1 change: 1 addition & 0 deletions DataFormats/Portable/interface/PortableDeviceCollection.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableDeviceCollection_h
#define DataFormats_Portable_interface_PortableDeviceCollection_h

#include <cassert>
#include <optional>
#include <type_traits>

Expand Down
1 change: 1 addition & 0 deletions DataFormats/Portable/interface/PortableHostCollection.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableHostCollection_h
#define DataFormats_Portable_interface_PortableHostCollection_h

#include <cassert>
#include <optional>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
Expand Down
1 change: 1 addition & 0 deletions DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <cassert>
#include <cstdlib>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
#define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h

#include <cassert>
#include <memory>

#include "FWCore/Utilities/interface/thread_safety_macros.h"
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/host.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
#define HeterogeneousCore_AlpakaInterface_interface_host_h

#include <cassert>

namespace cms::alpakatools {

namespace detail {
Expand Down
5 changes: 1 addition & 4 deletions HeterogeneousCore/AlpakaTest/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
<test
name="testHeterogeneousCoreAlpakaTestWriteRead"
command="cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py; cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
/>
<test name="testHeterogeneousCoreAlpakaTestWriteRead" command="testHeterogeneousCoreAlpakaTestWriteRead.sh"/>
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#! /bin/bash -e

if ! [ "${LOCALTOP}" ]; then
export LOCALTOP=${CMSSW_BASE}
cd ${CMSSW_BASE}
fi

mkdir -p testHeterogeneousCoreAlpakaTestWriteRead
cd testHeterogeneousCoreAlpakaTestWriteRead
rm -f test.root
echo "--------------------------------------------------------------------------------"
echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py"
echo
cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py
echo
echo "--------------------------------------------------------------------------------"
echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
echo
cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py
echo
echo "--------------------------------------------------------------------------------"
echo "$ edmDumpEventContent test.root"
echo
edmDumpEventContent test.root
19 changes: 19 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"

#include "TestAlgo.h"

namespace cudatest {

static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
view.r() = 1.;

for (auto i = 0; i < size; ++i) {
view[i] = {0., 0., 0., i};
}
}

void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
testAlgoKernel(collection.view(), collection->metadata().size());
}

} // namespace cudatest
30 changes: 30 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"

#include "TestAlgo.h"

namespace cudatest {

static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
const int32_t stride = blockDim.x * gridDim.x;

if (thread == 0) {
view.r() = 1.;
}
for (auto i = thread; i < size; i += stride) {
view[i] = {0., 0., 0., i};
}
}

void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
const uint32_t maxThreadsPerBlock = 1024;

uint32_t threadsPerBlock = maxThreadsPerBlock;
uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;

testAlgoKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(collection.view(), collection->metadata().size());
}

} // namespace cudatest
17 changes: 17 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h

#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"

namespace cudatest {

class TestAlgo {
public:
void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
void fill(cudatest::TestHostCollection& collection) const;
};

} // namespace cudatest

#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h

0 comments on commit 75fe478

Please sign in to comment.