Skip to content

Commit

Permalink
Add VideoReaderDecoder GPU (NVIDIA#3668)
Browse files Browse the repository at this point in the history
* Add VideoReaderDecoderGpu op

Signed-off-by: Albert Wolant <awolant@nvidia.com>
  • Loading branch information
awolant authored and cyyever committed Jun 7, 2022
1 parent 45b993a commit 974cf7d
Show file tree
Hide file tree
Showing 11 changed files with 465 additions and 4 deletions.
5 changes: 4 additions & 1 deletion dali/operators/reader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/sequence_reader_op.c

if (BUILD_FFMPEG)
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_cpu_op.cc")
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
endif()

if (BUILD_NVDEC)
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_op.cc")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_resize_op.cc")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_gpu_op.cc")
if (BUILD_TEST)
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
endif()
endif()

if (BUILD_LIBTAR)
Expand Down
2 changes: 2 additions & 0 deletions dali/operators/reader/loader/video/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ if (BUILD_NVDEC)
add_subdirectory(nvdecode)
list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.h")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.cc")
list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.h")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.cc")
if (BUILD_TEST)
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_test.cc")
endif()
Expand Down
9 changes: 7 additions & 2 deletions dali/operators/reader/loader/video/frames_decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "dali/operators/reader/loader/video/frames_decoder.h"
#include <memory>
#include <iomanip>
#include "dali/core/error_handling.h"


Expand Down Expand Up @@ -173,13 +174,14 @@ bool FramesDecoder::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
break;
}

LOG_LINE << "Read frame (ReadRegularFrame), index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
if (!copy_to_output) {
++next_frame_idx_;
return true;
}

CopyToOutput(data);
LOG_LINE << "Read frame (ReadRegularFrame), timestamp " << av_state_->frame_->pts << std::endl;
++next_frame_idx_;
return true;
}
Expand Down Expand Up @@ -257,10 +259,13 @@ bool FramesDecoder::ReadFlushFrame(uint8_t *data, bool copy_to_output) {

if (copy_to_output) {
CopyToOutput(data);
LOG_LINE << "Read frame (ReadFlushFrame), timestamp " << av_state_->frame_->pts << std::endl;
}

LOG_LINE << "Read frame (ReadFlushFrame), index " << next_frame_idx_ << " timestamp " <<
std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
++next_frame_idx_;

// TODO(awolant): Figure out how to handle this during index building
if (next_frame_idx_ >= NumFrames()) {
next_frame_idx_ = -1;
}
Expand Down
2 changes: 1 addition & 1 deletion dali/operators/reader/loader/video/frames_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ class DLL_PUBLIC FramesDecoder {

std::vector<IndexEntry> index_;

int next_frame_idx_;
int next_frame_idx_ = 0;

private:
/**
Expand Down
13 changes: 13 additions & 0 deletions dali/operators/reader/loader/video/frames_decoder_gpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <string>
#include <memory>
#include <iomanip>

#include "dali/core/error_handling.h"
#include "dali/core/cuda_utils.h"
Expand Down Expand Up @@ -125,6 +126,10 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
if (current_pts == NextFramePts()) {
// Currently decoded frame is actually the one we wanted
frame_returned_ = true;

LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << current_pts << ", current copy " << current_copy_to_output_ << std::endl;

if (current_copy_to_output_ == false) {
return 1;
}
Expand Down Expand Up @@ -155,12 +160,17 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
Width(),
Height(),
stream_);
// TODO(awolant): Alterantive is to copy the data to a buffer
// and then process it on the stream. Check, if this is faster, when
// the benchmark is ready.
CUDA_CALL(cudaStreamSynchronize(stream_));
CUDA_CALL(cuvidUnmapVideoFrame(nvdecode_state_->decoder, frame));

return 1;
}

void FramesDecoderGpu::SeekFrame(int frame_id) {
// TODO(awolant): This seek can be optimized - for consecutive frames not needed etc.
SendLastPacket(true);
FramesDecoder::SeekFrame(frame_id);
}
Expand All @@ -177,6 +187,9 @@ bool FramesDecoderGpu::ReadNextFrame(uint8_t *data, bool copy_to_output) {
if (copy_to_output) {
copyD2D(data, frame.frame_.data(), FrameSize());
}
LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << frame.pts_ << ", current copy " << copy_to_output << std::endl;

frame.pts_ = -1;

++next_frame_idx_;
Expand Down
2 changes: 2 additions & 0 deletions dali/operators/reader/loader/video/frames_decoder_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder {

int ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *picture_params);

FramesDecoderGpu(FramesDecoderGpu&&) = default;

~FramesDecoderGpu();

private:
Expand Down
124 changes: 124 additions & 0 deletions dali/operators/reader/loader/video/video_loader_decoder_gpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/operators/reader/loader/video/video_loader_decoder_gpu.h"

#include "dali/util/nvml.h"

namespace dali {
void VideoSampleGpu::Decode() {
TensorShape<4> shape = {
sequence_len_,
video_file_->Height(),
video_file_->Width(),
video_file_->Channels()};

data_.Resize(
shape,
DALIDataType::DALI_UINT8);

for (int i = 0; i < sequence_len_; ++i) {
int frame_id = span_->start_ + i * span_->stride_;
video_file_->SeekFrame(frame_id);
video_file_->ReadNextFrame(
static_cast<uint8_t *>(data_.raw_mutable_data()) + i * video_file_->FrameSize());
}
}

VideoLoaderDecoderGpu::~VideoLoaderDecoderGpu() {
CUDA_DTOR_CALL(cudaStreamDestroy(cuda_stream_));
}

cudaStream_t VideoLoaderDecoderGpu::GetCudaStream() {
#if NVML_ENABLED
{
nvml::Init();
static float driver_version = nvml::GetDriverVersion();
if (driver_version > 460 && driver_version < 470.21) {
DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
return 0;
}
}
#else
{
int driver_cuda_version = 0;
CUDA_CALL(cuDriverGetVersion(&driver_cuda_version));
if (driver_cuda_version >= 11030 && driver_cuda_version < 11040) {
DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
return 0;
}
}
#endif

// TODO(awolant): Check per decoder stream
cudaStream_t stream;
DeviceGuard dg(device_id_);
CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
}

void VideoLoaderDecoderGpu::PrepareEmpty(VideoSampleGpu &sample) {
sample = {};
}

void VideoLoaderDecoderGpu::ReadSample(VideoSampleGpu &sample) {
auto &sample_span = sample_spans_[current_index_];

// Bind sample to the video and span, so it can be decoded later
sample.span_ = &sample_span;
sample.video_file_ = &video_files_[sample_span.video_idx_];
sample.sequence_len_ = sequence_len_;

if (has_labels_) {
sample.label_ = labels_[sample_span.video_idx_];
}

++current_index_;
MoveToNextShard(current_index_);
}

Index VideoLoaderDecoderGpu::SizeImpl() {
return sample_spans_.size();
}

void VideoLoaderDecoderGpu::PrepareMetadataImpl() {
video_files_.reserve(filenames_.size());
for (auto &filename : filenames_) {
video_files_.emplace_back(filename, cuda_stream_);
}

for (size_t video_idx = 0; video_idx < video_files_.size(); ++video_idx) {
for (int start = 0;
start + stride_ * sequence_len_ <= video_files_[video_idx].NumFrames();
start += step_) {
sample_spans_.push_back(
VideoSampleDesc(start, start + stride_ * sequence_len_, stride_, video_idx));
}
}
if (shuffle_) {
// seeded with hardcoded value to get
// the same sequence on every shard
std::mt19937 g(kDaliDataloaderSeed);
std::shuffle(std::begin(sample_spans_), std::end(sample_spans_), g);
}

// set the initial index for each shard
Reset(true);
}

void VideoLoaderDecoderGpu::Reset(bool wrap_to_shard) {
current_index_ = wrap_to_shard ? start_index(shard_id_, num_shards_, SizeImpl()) : 0;
}

} // namespace dali
86 changes: 86 additions & 0 deletions dali/operators/reader/loader/video/video_loader_decoder_gpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
#define DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_

#include <string>
#include <vector>

#include "dali/operators/reader/loader/loader.h"
#include "dali/operators/reader/loader/video/video_loader_decoder_cpu.h"
#include "dali/operators/reader/loader/video/frames_decoder_gpu.h"

namespace dali {
class VideoSampleGpu {
public:
void Decode();

FramesDecoderGpu *video_file_ = nullptr;
VideoSampleDesc *span_ = nullptr;
int sequence_len_ = 0;
Tensor<GPUBackend> data_;
int label_ = -1;
};


class VideoLoaderDecoderGpu : public Loader<GPUBackend, VideoSampleGpu> {
public:
explicit inline VideoLoaderDecoderGpu(const OpSpec &spec) :
Loader<GPUBackend, VideoSampleGpu>(spec),
filenames_(spec.GetRepeatedArgument<std::string>("filenames")),
sequence_len_(spec.GetArgument<int>("sequence_length")),
stride_(spec.GetArgument<int>("stride")),
step_(spec.GetArgument<int>("step")),
cuda_stream_(GetCudaStream()) {
if (step_ <= 0) {
step_ = stride_ * sequence_len_;
}
has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels");
}

void ReadSample(VideoSampleGpu &sample) override;

void PrepareEmpty(VideoSampleGpu &sample) override;

~VideoLoaderDecoderGpu();

protected:
Index SizeImpl() override;

void PrepareMetadataImpl() override;

private:
void Reset(bool wrap_to_shard) override;

cudaStream_t GetCudaStream();

std::vector<std::string> filenames_;
std::vector<int> labels_;
bool has_labels_ = false;
std::vector<FramesDecoderGpu> video_files_;
std::vector<VideoSampleDesc> sample_spans_;

Index current_index_ = 0;

int sequence_len_;
int stride_;
int step_;

cudaStream_t cuda_stream_;
};

} // namespace dali

#endif // DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
Loading

0 comments on commit 974cf7d

Please sign in to comment.