Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement the PortableCollection and related tests in CUDA [12.5.x] #39321

Merged
merged 5 commits into from
Sep 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CUDADataFormats/Common/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<iftool name="cuda">
<use name="cuda"/>
<use name="rootcore"/>
<use name="DataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
Expand Down
67 changes: 67 additions & 0 deletions CUDADataFormats/Common/interface/PortableDeviceCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
#define CUDADataFormats_Common_interface_PortableDeviceCollection_h

#include <cassert>
#include <cstdlib>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

namespace cms::cuda {

// generic SoA-based product in device memory
template <typename T>
class PortableDeviceCollection {
public:
using Layout = T;
using View = typename Layout::View;
using ConstView = typename Layout::ConstView;
using Buffer = cms::cuda::device::unique_ptr<std::byte[]>;

PortableDeviceCollection() = default;

PortableDeviceCollection(int32_t elements, cudaStream_t stream)
: buffer_{cms::cuda::make_device_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
layout_{buffer_.get(), elements},
view_{layout_} {
// CUDA device memory uses a default alignment of at least 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

// non-copyable
PortableDeviceCollection(PortableDeviceCollection const&) = delete;
PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;

// movable
PortableDeviceCollection(PortableDeviceCollection&&) = default;
PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;

// default destructor
~PortableDeviceCollection() = default;

// access the View
View& view() { return view_; }
ConstView const& view() const { return view_; }
ConstView const& const_view() const { return view_; }

View& operator*() { return view_; }
ConstView const& operator*() const { return view_; }

View* operator->() { return &view_; }
ConstView const* operator->() const { return &view_; }

// access the Buffer
Buffer& buffer() { return buffer_; }
Buffer const& buffer() const { return buffer_; }
Buffer const& const_buffer() const { return buffer_; }

size_t bufferSize() const { return layout_.metadata().byteSize(); }

private:
Buffer buffer_; //!
Layout layout_; //
View view_; //!
};

} // namespace cms::cuda

#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h
85 changes: 85 additions & 0 deletions CUDADataFormats/Common/interface/PortableHostCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
#define CUDADataFormats_Common_interface_PortableHostCollection_h

#include <cassert>
#include <cstdlib>

#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

namespace cms::cuda {

// generic SoA-based product in host memory
template <typename T>
class PortableHostCollection {
public:
using Layout = T;
using View = typename Layout::View;
using ConstView = typename Layout::ConstView;
using Buffer = cms::cuda::host::unique_ptr<std::byte[]>;

PortableHostCollection() = default;

PortableHostCollection(int32_t elements)
// allocate pageable host memory
: buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements))},
layout_{buffer_.get(), elements},
view_{layout_} {
// make_host_unique for pageable host memory uses a default alignment of 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

PortableHostCollection(int32_t elements, cudaStream_t stream)
// allocate pinned host memory, accessible by the current device
: buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
layout_{buffer_.get(), elements},
view_{layout_} {
// CUDA pinned host memory uses a default alignment of at least 128 bytes
assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
}

// non-copyable
PortableHostCollection(PortableHostCollection const&) = delete;
PortableHostCollection& operator=(PortableHostCollection const&) = delete;

// movable
PortableHostCollection(PortableHostCollection&&) = default;
PortableHostCollection& operator=(PortableHostCollection&&) = default;

// default destructor
~PortableHostCollection() = default;

// access the View
View& view() { return view_; }
ConstView const& view() const { return view_; }
ConstView const& const_view() const { return view_; }

View& operator*() { return view_; }
ConstView const& operator*() const { return view_; }

View* operator->() { return &view_; }
ConstView const* operator->() const { return &view_; }

// access the Buffer
Buffer& buffer() { return buffer_; }
Buffer const& buffer() const { return buffer_; }
Buffer const& const_buffer() const { return buffer_; }

size_t bufferSize() const { return layout_.metadata().byteSize(); }

// part of the ROOT read streamer
static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
newObj->~PortableHostCollection();
// allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams
new (newObj) PortableHostCollection(layout.metadata().size());
newObj->layout_.ROOTReadStreamer(layout);
}

private:
Buffer buffer_; //!
Layout layout_; //
View view_; //!
};

} // namespace cms::cuda

#endif // CUDADataFormats_Common_interface_PortableHostCollection_h
7 changes: 7 additions & 0 deletions CUDADataFormats/PortableTestObjects/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<use name="rootcore"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Common"/>
<use name="DataFormats/PortableTestObjects"/>
<export>
<lib name="1"/>
</export>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h

#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"

namespace cudatest {

// SoA with x, y, z, id fields in device global memory
using TestDeviceCollection = cms::cuda::PortableDeviceCollection<portabletest::TestSoA>;

} // namespace cudatest

#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
14 changes: 14 additions & 0 deletions CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h

#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"

namespace cudatest {

// SoA with x, y, z, id fields in host memory
using TestHostCollection = cms::cuda::PortableHostCollection<portabletest::TestSoA>;

} // namespace cudatest

#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
5 changes: 5 additions & 0 deletions CUDADataFormats/PortableTestObjects/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
#include "DataFormats/Common/interface/Wrapper.h"
#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
21 changes: 21 additions & 0 deletions CUDADataFormats/PortableTestObjects/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<lcgdict>
<class name="cudatest::TestHostCollection"/>
<read
sourceClass="cudatest::TestHostCollection"
targetClass="cudatest::TestHostCollection"
version="[1-]"
source="portabletest::TestSoA layout_;"
target="buffer_"
embed="false">
<![CDATA[
cudatest::TestHostCollection::ROOTReadStreamer(newObj, onfile.layout_);
]]>
</read>
<class name="edm::Wrapper<cudatest::TestHostCollection>" splitLevel="0"/>

<class name="cudatest::TestDeviceCollection" persistent="false"/>
<class name="edm::Wrapper<cudatest::TestDeviceCollection>" persistent="false"/>

<class name="cms::cuda::Product<cudatest::TestDeviceCollection>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<cudatest::TestDeviceCollection>>" persistent="false"/>
</lcgdict>
1 change: 1 addition & 0 deletions DataFormats/Portable/interface/PortableDeviceCollection.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableDeviceCollection_h
#define DataFormats_Portable_interface_PortableDeviceCollection_h

#include <cassert>
#include <optional>
#include <type_traits>

Expand Down
1 change: 1 addition & 0 deletions DataFormats/Portable/interface/PortableHostCollection.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef DataFormats_Portable_interface_PortableHostCollection_h
#define DataFormats_Portable_interface_PortableHostCollection_h

#include <cassert>
#include <optional>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
Expand Down
1 change: 1 addition & 0 deletions DataFormats/SoATemplate/test/SoALayoutAndView_t.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <cassert>
#include <cstdlib>
#include <memory>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
#define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h

#include <cassert>
#include <memory>

#include "FWCore/Utilities/interface/thread_safety_macros.h"
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/host.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
#define HeterogeneousCore_AlpakaInterface_interface_host_h

#include <cassert>

namespace cms::alpakatools {

namespace detail {
Expand Down
5 changes: 1 addition & 4 deletions HeterogeneousCore/AlpakaTest/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
<test
name="testHeterogeneousCoreAlpakaTestWriteRead"
command="cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py; cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
/>
<test name="testHeterogeneousCoreAlpakaTestWriteRead" command="testHeterogeneousCoreAlpakaTestWriteRead.sh"/>
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#! /bin/bash -e

if ! [ "${LOCALTOP}" ]; then
export LOCALTOP=${CMSSW_BASE}
cd ${CMSSW_BASE}
fi

mkdir -p testHeterogeneousCoreAlpakaTestWriteRead
cd testHeterogeneousCoreAlpakaTestWriteRead
rm -f test.root
echo "--------------------------------------------------------------------------------"
echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py"
echo
cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/writer.py
echo
echo "--------------------------------------------------------------------------------"
echo "$ cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py"
echo
cmsRun ${LOCALTOP}/src/HeterogeneousCore/AlpakaTest/test/reader.py
echo
echo "--------------------------------------------------------------------------------"
echo "$ edmDumpEventContent test.root"
echo
edmDumpEventContent test.root
19 changes: 19 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"

#include "TestAlgo.h"

namespace cudatest {

static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
view.r() = 1.;

for (auto i = 0; i < size; ++i) {
view[i] = {0., 0., 0., i};
}
}

void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
testAlgoKernel(collection.view(), collection->metadata().size());
}

} // namespace cudatest
30 changes: 30 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <cuda_runtime.h>

#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"

#include "TestAlgo.h"

namespace cudatest {

static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
const int32_t stride = blockDim.x * gridDim.x;

if (thread == 0) {
view.r() = 1.;
}
for (auto i = thread; i < size; i += stride) {
view[i] = {0., 0., 0., i};
}
}

void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
const uint32_t maxThreadsPerBlock = 1024;

uint32_t threadsPerBlock = maxThreadsPerBlock;
uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;

testAlgoKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(collection.view(), collection->metadata().size());
}

} // namespace cudatest
17 changes: 17 additions & 0 deletions HeterogeneousCore/CUDATest/plugins/TestAlgo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h

#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"

namespace cudatest {

class TestAlgo {
public:
void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
void fill(cudatest::TestHostCollection& collection) const;
};

} // namespace cudatest

#endif // HeterogeneousCore_CUDATest_plugins_TestAlgo_h