@@ -8,38 +8,63 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "../src/aligner_global_ukkonen.hpp"
#include "../src/aligner_global_myers.hpp"
#include "../src/aligner_global_hirschberg_myers.hpp"

#include <claragenomics/cudaaligner/alignment.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>
#include <claragenomics/utils/genomeutils.hpp>

#include <random>
#include "gtest/gtest.h"
#include "../src/aligner_global.hpp"
#include "cudaaligner/alignment.hpp"
#include <utils/signed_integer_utils.hpp>
#include <utils/genomeutils.hpp>

namespace claragenomics
{

namespace cudaaligner
{

enum class AlignmentAlgorithm
{
    Default = 0,
    Ukkonen,
    Myers,
    HirschbergMyers
};

/// Returns a human-readable name for the given alignment algorithm,
/// or an empty string for values outside the enumeration.
std::string get_algorithm_name(AlignmentAlgorithm x)
{
    if (x == AlignmentAlgorithm::Default)
        return "Default";
    if (x == AlignmentAlgorithm::Ukkonen)
        return "Ukkonen";
    if (x == AlignmentAlgorithm::Myers)
        return "Myers";
    if (x == AlignmentAlgorithm::HirschbergMyers)
        return "Hirschberg + Myers";
    return "";
}

// Common data structures and functions.
typedef struct
struct AlignerTestData
{
std::vector<std::pair<std::string, std::string>> inputs;
std::vector<std::string> cigars;
} AlignerTestData;
AlignmentAlgorithm algorithm = AlignmentAlgorithm::Ukkonen;
};

// Test adding alignments to Aligner objects
TEST(TestCudaAligner, TestAlignmentAddition)
{
std::unique_ptr<AlignerGlobal> aligner = std::make_unique<AlignerGlobal>(5, 5, 5, 0);
std::unique_ptr<AlignerGlobal> aligner = std::make_unique<AlignerGlobalUkkonen>(10, 10, 5, nullptr, 0);
ASSERT_EQ(StatusType::success, aligner->add_alignment("ATCG", 4, "TACG", 4));
ASSERT_EQ(StatusType::success, aligner->add_alignment("ATCG", 4, "TACG", 4));
ASSERT_EQ(StatusType::success, aligner->add_alignment("ATCG", 4, "TACG", 4));

ASSERT_EQ(3, aligner->num_alignments());

ASSERT_EQ(StatusType::exceeded_max_length, aligner->add_alignment("ATCGAT", 6, "TACG", 4));
ASSERT_EQ(StatusType::exceeded_max_length, aligner->add_alignment("ATCG", 4, "TACGAG", 6));
ASSERT_EQ(StatusType::exceeded_max_length, aligner->add_alignment("ATCGATTACGC", 11, "TACGTACGGA", 10));
ASSERT_EQ(StatusType::exceeded_max_length, aligner->add_alignment("ATCGATTACG", 10, "ATACGTAGCGA", 11));

ASSERT_EQ(3, aligner->num_alignments());

@@ -60,65 +85,121 @@ std::vector<AlignerTestData> create_aligner_test_cases()
AlignerTestData data;

// Test case 1
data.inputs = {{"AAAA", "TTAT"}};
data.cigars = {"4M"};
data.inputs = {{"AAAA", "TTAT"}};
data.cigars = {"4M"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

// Test case 2
data.inputs = {{"ATAAAAAAAA", "AAAAAAAAA"}};
data.cigars = {"1M1I8M"};
data.inputs = {{"ATAAAAAAAA", "AAAAAAAAA"}};
data.cigars = {"1M1D8M"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

// Test case 3
data.inputs = {{"AAAAAAAAA", "ATAAAAAAAA"}};
data.cigars = {"1M1D8M"};
data.inputs = {{"AAAAAAAAA", "ATAAAAAAAA"}};
data.cigars = {"1M1I8M"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

// Test case 3
data.inputs = {{"ACTGA", "GCTAG"}};
data.cigars = {"3M1I1M1D"};
// Test case 4
data.inputs = {{"ACTGA", "GCTAG"}};
data.cigars = {"3M1D1M1I"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

// Test case 4
data.inputs = {{"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}};
data.cigars = {"3M1I1M1D", "4M", "1M"};
// Test case 5
data.inputs = {{"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}};
data.cigars = {"3M1D1M1I", "4M", "1M"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

return test_cases;
// Test case 6
data.inputs = {
{"AAAA", "TTAT"}, {"ATAAAAAAAA", "AAAAAAAAA"}, {"AAAAAAAAA", "ATAAAAAAAA"}, {"ACTGA", "GCTAG"}, {"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}, {"AAAA", "TTAT"}, {"ATAAAAAAAA", "AAAAAAAAA"}, {"AAAAAAAAA", "ATAAAAAAAA"}, {"ACTGA", "GCTAG"}, {"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}, {"AAAA", "TTAT"}, {"ATAAAAAAAA", "AAAAAAAAA"}, {"AAAAAAAAA", "ATAAAAAAAA"}, {"ACTGA", "GCTAG"}, {"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}, {"AAAA", "TTAT"}, {"ATAAAAAAAA", "AAAAAAAAA"}, {"AAAAAAAAA", "ATAAAAAAAA"}, {"ACTGA", "GCTAG"}, {"ACTGA", "GCTAG"}, {"ACTG", "ACTG"}, {"A", "T"}};
data.cigars = {
"4M", "1M1D8M", "1M1I8M", "3M1D1M1I", "3M1D1M1I", "4M", "1M",
"4M", "1M1D8M", "1M1I8M", "3M1D1M1I", "3M1D1M1I", "4M", "1M",
"4M", "1M1D8M", "1M1I8M", "3M1D1M1I", "3M1D1M1I", "4M", "1M",
"4M", "1M1D8M", "1M1I8M", "3M1D1M1I", "3M1D1M1I", "4M", "1M"};
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

std::minstd_rand rng(1);
data.inputs = {{claragenomics::genomeutils::generate_random_genome(4800, rng), claragenomics::genomeutils::generate_random_genome(5000, rng)}};
data.cigars = {}; // do not test cigars
data.algorithm = AlignmentAlgorithm::Default;
test_cases.push_back(data);

std::vector<AlignerTestData> test_cases_final;
test_cases_final.reserve(4 * test_cases.size());
test_cases_final.insert(test_cases_final.end(), test_cases.begin(), test_cases.end());
std::transform(test_cases.begin(), test_cases.end(), std::back_inserter(test_cases_final), [](AlignerTestData td) { td.algorithm = AlignmentAlgorithm::Ukkonen; return td; });
std::transform(test_cases.begin(), test_cases.end(), std::back_inserter(test_cases_final), [](AlignerTestData td) { td.algorithm = AlignmentAlgorithm::Myers; return td; });
std::transform(test_cases.begin(), test_cases.end(), std::back_inserter(test_cases_final), [](AlignerTestData td) { td.algorithm = AlignmentAlgorithm::HirschbergMyers; return td; });

return test_cases_final;
};

class TestAlignerGlobalImpl : public ::testing::TestWithParam<AlignerTestData>
class TestAlignerGlobal : public ::testing::TestWithParam<AlignerTestData>
{
public:
virtual void SetUp()
};

int32_t get_max_sequence_length(std::vector<std::pair<std::string, std::string>> const& inputs)
{
int64_t max_string_size = 0;
for (auto const& pair : inputs)
{
param = GetParam();
int64_t max_string_size = 0;
for (auto& pair : param.inputs)
{
max_string_size = std::max(max_string_size, get_size(pair.first));
max_string_size = std::max(max_string_size, get_size(pair.second));
}
max_string_size++;
aligner = std::make_unique<AlignerGlobal>(max_string_size,
max_string_size,
param.inputs.size(),
0);
aligner->set_cuda_stream(0);
max_string_size = std::max(max_string_size, get_size(pair.first));
max_string_size = std::max(max_string_size, get_size(pair.second));
}
return static_cast<int32_t>(max_string_size);
}

protected:
std::unique_ptr<AlignerGlobal> aligner;
AlignerTestData param;
};

TEST_P(TestAlignerGlobalImpl, TestAlignmentKernel)
TEST_P(TestAlignerGlobal, TestAlignmentKernel)
{
AlignerTestData param = GetParam();
const std::vector<std::pair<std::string, std::string>>& inputs = param.inputs;
const std::vector<std::string>& cigars = param.cigars;

ASSERT_EQ(inputs.size(), cigars.size()) << "Input data length mismatch";
if (!cigars.empty())
{
ASSERT_EQ(inputs.size(), cigars.size()) << "Input data length mismatch";
}

const int32_t max_string_size = get_max_sequence_length(inputs) + 1;
std::unique_ptr<Aligner> aligner;
switch (param.algorithm)
{
case AlignmentAlgorithm::Default:
aligner = claragenomics::cudaaligner::create_aligner(max_string_size,
max_string_size,
param.inputs.size(),
claragenomics::cudaaligner::AlignmentType::global,
nullptr,
0);
case AlignmentAlgorithm::Ukkonen:
aligner = std::make_unique<AlignerGlobalUkkonen>(max_string_size,
max_string_size,
param.inputs.size(),
nullptr,
0);
break;
case AlignmentAlgorithm::HirschbergMyers:
aligner = std::make_unique<AlignerGlobalHirschbergMyers>(max_string_size,
max_string_size,
param.inputs.size(),
nullptr,
0);
break;
default:
case AlignmentAlgorithm::Myers:
aligner = std::make_unique<AlignerGlobalMyers>(max_string_size,
max_string_size,
param.inputs.size(),
nullptr,
0);
}
for (auto& pair : inputs)
{
auto& query = pair.first;
@@ -138,11 +219,19 @@ TEST_P(TestAlignerGlobalImpl, TestAlignmentKernel)
auto alignment = alignments[a];
EXPECT_EQ(StatusType::success, alignment->get_status()) << "Alignment status is not success";
EXPECT_EQ(AlignmentType::global, alignment->get_alignment_type()) << "Alignment type is not global";
EXPECT_STREQ(cigars[a].c_str(), alignment->convert_to_cigar().c_str()) << "CIGAR doesn't match for alignment " << a;
if (!cigars.empty())
{
EXPECT_EQ(cigars[a], alignment->convert_to_cigar()) << "CIGAR doesn't match for alignment of\n"
<< alignment->get_query_sequence()
<< "\nand\n"
<< alignment->get_target_sequence()
<< "\nindex: " << a
<< "\nusing " << get_algorithm_name(param.algorithm);
}
}
}

INSTANTIATE_TEST_SUITE_P(TestCudaAligner, TestAlignerGlobalImpl, ::testing::ValuesIn(create_aligner_test_cases()));
INSTANTIATE_TEST_SUITE_P(TestCudaAligner, TestAlignerGlobal, ::testing::ValuesIn(create_aligner_test_cases()));

// Test performance of kernel for large genomes
std::vector<AlignerTestData> create_aligner_perf_test_cases()
@@ -165,13 +254,20 @@ std::vector<AlignerTestData> create_aligner_perf_test_cases()
return test_cases;
};

class TestAlignerGlobalImplPerf : public TestAlignerGlobalImpl
class TestAlignerGlobalImplPerf : public TestAlignerGlobal
{
};

TEST_P(TestAlignerGlobalImplPerf, TestAlignmentKernelPerf)
{
AlignerTestData param = GetParam();
const std::vector<std::pair<std::string, std::string>>& inputs = param.inputs;
const int32_t max_string_size = get_max_sequence_length(inputs) + 1;
std::unique_ptr<Aligner> aligner = std::make_unique<AlignerGlobalUkkonen>(max_string_size,
max_string_size,
param.inputs.size(),
nullptr,
0);

for (auto& pair : inputs)
{
@@ -8,11 +8,12 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <memory>
#include "../src/alignment_impl.hpp"

#include <claragenomics/utils/signed_integer_utils.hpp>

#include "gtest/gtest.h"
#include "../src/alignment_impl.hpp"
#include <utils/signed_integer_utils.hpp>
#include <memory>

namespace claragenomics
{
@@ -45,7 +46,7 @@ TEST(TestAlignmentImplIndividual, Type)
typedef struct AlignmentTestData
{
std::string query;
std::string subject;
std::string target;
std::vector<AlignmentState> alignment;
FormattedAlignment formatted_alignment;
std::string cigar;
@@ -57,62 +58,62 @@ std::vector<AlignmentTestData> create_alignment_test_cases()
AlignmentTestData data;

// Test case 1
data.subject = "TTATG";
data.query = "AAAA";
data.target = "TTATG";
data.alignment = {
AlignmentState::mismatch,
AlignmentState::mismatch,
AlignmentState::match,
AlignmentState::mismatch,
AlignmentState::deletion};
AlignmentState::insertion};
data.formatted_alignment = std::make_pair("AAAA-", "TTATG");
data.cigar = "4M1I";
test_cases.push_back(data);

// Test case 2
data.subject = "CATAA";
data.query = "CGATAATG";
data.target = "CATAA";
data.alignment = {
AlignmentState::insertion,
AlignmentState::deletion,
AlignmentState::mismatch,
AlignmentState::match,
AlignmentState::match,
AlignmentState::match,
AlignmentState::match,
AlignmentState::insertion,
AlignmentState::insertion};
AlignmentState::deletion,
AlignmentState::deletion};
data.formatted_alignment = std::make_pair("CGATAATG", "-CATAA--");
data.cigar = "1D5M2D";
test_cases.push_back(data);

// Test case 3
data.subject = "AAGTCTAGAA";
data.query = "GTTAG";
data.target = "AAGTCTAGAA";
data.alignment = {
AlignmentState::deletion,
AlignmentState::deletion,
AlignmentState::insertion,
AlignmentState::insertion,
AlignmentState::match,
AlignmentState::match,
AlignmentState::deletion,
AlignmentState::insertion,
AlignmentState::match,
AlignmentState::match,
AlignmentState::match,
AlignmentState::deletion,
AlignmentState::deletion,
AlignmentState::insertion,
AlignmentState::insertion,
};
data.formatted_alignment = std::make_pair("--GT-TAG--", "AAGTCTAGAA");
data.cigar = "2I2M1I3M2I";
test_cases.push_back(data);

// Test case 4
data.subject = "GATTCA";
data.query = "GTTACA";
data.target = "GATTCA";
data.alignment = {
AlignmentState::match,
AlignmentState::deletion,
AlignmentState::insertion,
AlignmentState::match,
AlignmentState::match,
AlignmentState::insertion,
AlignmentState::deletion,
AlignmentState::match,
AlignmentState::match};
data.formatted_alignment = std::make_pair("G-TTACA", "GATT-CA");
@@ -130,8 +131,8 @@ class TestAlignmentImpl : public ::testing::TestWithParam<AlignmentTestData>
param_ = GetParam();
alignment_ = std::make_unique<AlignmentImpl>(param_.query.c_str(),
param_.query.size(),
param_.subject.c_str(),
param_.subject.size());
param_.target.c_str(),
param_.target.size());
alignment_->set_alignment(param_.alignment);
}

@@ -142,11 +143,8 @@ class TestAlignmentImpl : public ::testing::TestWithParam<AlignmentTestData>

TEST_P(TestAlignmentImpl, StringGetters)
{
ASSERT_STREQ(param_.query.c_str(), alignment_->get_query_sequence().c_str()) << "Query doesn't match original string";
ASSERT_EQ(param_.query.size(), alignment_->get_query_sequence().size()) << "Query length match original string";

ASSERT_STREQ(param_.subject.c_str(), alignment_->get_subject_sequence().c_str()) << "Subject doesn't match original string";
ASSERT_EQ(param_.subject.size(), alignment_->get_subject_sequence().size()) << "Subject length match original string";
ASSERT_EQ(param_.query, alignment_->get_query_sequence()) << "Query doesn't match original string";
ASSERT_EQ(param_.target, alignment_->get_target_sequence()) << "Target doesn't match original string";
}

TEST_P(TestAlignmentImpl, AlignmentState)
@@ -163,15 +161,15 @@ TEST_P(TestAlignmentImpl, AlignmentFormatting)
{
FormattedAlignment formatted_alignment = alignment_->format_alignment();
std::string query = formatted_alignment.first;
std::string subject = formatted_alignment.second;
ASSERT_STREQ(param_.formatted_alignment.first.c_str(), query.c_str());
ASSERT_STREQ(param_.formatted_alignment.second.c_str(), subject.c_str());
std::string target = formatted_alignment.second;
ASSERT_EQ(param_.formatted_alignment.first, query);
ASSERT_EQ(param_.formatted_alignment.second, target);
}

TEST_P(TestAlignmentImpl, CigarFormatting)
{
std::string cigar = alignment_->convert_to_cigar();
ASSERT_STREQ(param_.cigar.c_str(), cigar.c_str());
ASSERT_EQ(param_.cigar, cigar);
}

INSTANTIATE_TEST_SUITE_P(TestAlignment, TestAlignmentImpl, ValuesIn(create_alignment_test_cases()));
@@ -0,0 +1,197 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "../src/hirschberg_myers_gpu.cu"
#include "../src/batched_device_matrices.cuh"
#include <claragenomics/utils/device_buffer.cuh>
#include <claragenomics/utils/signed_integer_utils.hpp>
#include <vector>
#include <gtest/gtest.h>

namespace claragenomics
{

namespace cudaaligner
{

using WordType = hirschbergmyers::WordType;

namespace test
{

// Device-side wrapper to run hirschbergmyers::myers_preprocess in a test.
// Fills matrix 0 of batched_query_pattern (viewed as n_words x 8) with the
// query's bit patterns; per the expectations in myers_preprocess_test below,
// columns 0-3 hold the patterns for A,C,G,T and columns 4-7 the patterns of
// the reversed query.
__global__ void myers_preprocess_kernel(batched_device_matrices<WordType>::device_interface* batched_query_pattern, char const* query, int32_t query_size)
{
CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT; // bits per pattern word
const int32_t n_words = ceiling_divide<int32_t>(query_size, word_size); // words needed to cover the query
device_matrix_view<WordType> query_pattern = batched_query_pattern->get_matrix_view(0, n_words, 8);
hirschbergmyers::myers_preprocess(query_pattern, query, query_size);
}

// Device-side wrapper around hirschbergmyers::get_query_pattern for tests.
// Each of the first 32 threads stores one pattern word into result[i],
// calling get_query_pattern with its thread index i as the second index
// argument; per the tests below, result[i] matches the pattern of the query
// shifted by i characters — confirm against get_query_pattern's definition.
// `idx` selects the pattern word row, `x` the character, and `reverse`
// selects the reversed-query patterns.
__global__ void myers_get_query_pattern_test_kernel(int32_t n_words, WordType* result, batched_device_matrices<WordType>::device_interface* batched_query_pattern, int32_t idx, char x, bool reverse)
{
int const i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < 32)
{
device_matrix_view<WordType> patterns = batched_query_pattern->get_matrix_view(0, n_words, 8);
result[i] = hirschbergmyers::get_query_pattern(patterns, idx, i, x, reverse);
}
}

// Computes the Myers query bit-pattern matrix for query_host on the GPU
// (via myers_preprocess_kernel) and copies it back to the host.
// Returns an (n_words x 8) matrix, where n_words is the number of WordType
// words needed to cover the query.
// Takes the query by const reference to avoid an unnecessary copy.
matrix<WordType> compute_myers_preprocess_matrix(std::string const& query_host)
{
    CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT;
    cudaStream_t stream = nullptr; // default stream
    const int32_t n_words = ceiling_divide<int32_t>(query_host.size(), word_size);

    // Copy the query to device memory.
    device_buffer<char> query(query_host.size());
    cudaMemcpy(query.data(), query_host.data(), sizeof(char) * query.size(), cudaMemcpyHostToDevice);

    // One batched matrix with 8 * n_words entries, viewed as (n_words x 8) by the kernel.
    batched_device_matrices<WordType> query_pattern(1, 8 * n_words, stream);
    myers_preprocess_kernel<<<1, 32>>>(query_pattern.get_device_interface(), query.data(), query.size());
    // NOTE(review): assumes get_matrix synchronizes on `stream` before
    // returning host data — confirm in batched_device_matrices.
    return query_pattern.get_matrix(0, n_words, 8, stream);
}

// Uploads query_host, builds its bit-pattern matrix on the device, then runs
// myers_get_query_pattern_test_kernel and returns its 32 pattern words.
// Element i corresponds to index i passed to get_query_pattern; per the tests
// below it matches the pattern of the query with i characters dropped.
// `idx` selects the pattern word row, `x` the character, and `reverse` the
// reversed-query patterns.
// Takes the query by const reference to avoid an unnecessary copy.
std::vector<WordType> myers_get_query_pattern_test(std::string const& query_host, int32_t idx, char x, bool reverse)
{
    CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT;
    cudaStream_t stream = nullptr; // default stream
    const int32_t n_words = ceiling_divide<int32_t>(query_host.size(), word_size);

    // Upload the query and preprocess its bit patterns on the device.
    device_buffer<char> query(query_host.size());
    cudaMemcpy(query.data(), query_host.data(), sizeof(char) * query.size(), cudaMemcpyHostToDevice);
    batched_device_matrices<WordType> query_pattern(1, 8 * n_words, stream);
    myers_preprocess_kernel<<<1, 32>>>(query_pattern.get_device_interface(), query.data(), query.size());

    // Fetch one pattern word per thread (32 in total).
    device_buffer<WordType> result(32);
    myers_get_query_pattern_test_kernel<<<1, 32>>>(n_words, result.data(), query_pattern.get_device_interface(), idx, x, reverse);

    // Copy the device results back to the host.
    std::vector<WordType> result_host(result.size());
    cudaMemcpy(result_host.data(), result.data(), sizeof(WordType) * result.size(), cudaMemcpyDeviceToHost);
    return result_host;
}

} // namespace test

// Verifies the device-side Myers preprocessing of the query bit patterns.
// For the 47-character query below (two 32-bit pattern words), columns 0-3
// hold the patterns for A,C,G,T and columns 4-7 the patterns of the reversed
// query; the expected words were derived by hand from the query.
TEST(HirschbergMyers, myers_preprocess_test)
{
    CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT;
    static_assert(word_size == 32, "This test assumes word_size = 32bit.");
    using test::compute_myers_preprocess_matrix;
    std::string query =
        "AACCGGTTACGTACGT"
        "AAACCCGGGTTTACGT"
        "AAACCCGGGTTTACG";
    matrix<WordType> patterns = compute_myers_preprocess_matrix(query);
    ASSERT_EQ(patterns.num_rows(), 2);
    ASSERT_EQ(patterns.num_cols(), 8);
    // A=0, C=1, G=2, T=3
    EXPECT_EQ(patterns(0, 0), 0b00010000000001110001000100000011u);
    EXPECT_EQ(patterns(0, 1), 0b00100000001110000010001000001100u);
    EXPECT_EQ(patterns(0, 2), 0b01000001110000000100010000110000u);
    EXPECT_EQ(patterns(0, 3), 0b10001110000000001000100011000000u);
    EXPECT_EQ(patterns(1, 0), 0b001000000000111u);
    EXPECT_EQ(patterns(1, 1), 0b010000000111000u);
    EXPECT_EQ(patterns(1, 2), 0b100000111000000u);
    EXPECT_EQ(patterns(1, 3), 0b000111000000000u);
    // reverse: A=4, C=5, G=6, T=7
    EXPECT_EQ(patterns(0, 4), 0b01110000000001000111000000000100u);
    EXPECT_EQ(patterns(0, 5), 0b00001110000000100000111000000010u);
    EXPECT_EQ(patterns(0, 6), 0b00000001110000010000000111000001u);
    EXPECT_EQ(patterns(0, 7), 0b10000000001110001000000000111000u);
    EXPECT_EQ(patterns(1, 4), 0b110000001000100u);
    EXPECT_EQ(patterns(1, 5), 0b001100000100010u);
    EXPECT_EQ(patterns(1, 6), 0b000011000010001u);
    EXPECT_EQ(patterns(1, 7), 0b000000110001000u);

    // Preprocessing the reversed query must swap the forward and reverse
    // pattern columns.
    std::reverse(query.begin(), query.end());
    matrix<WordType> patterns_reversed = compute_myers_preprocess_matrix(query);
    // Fix: assert the dimensions of patterns_reversed — the original
    // re-checked `patterns`, which had already been verified above.
    ASSERT_EQ(patterns_reversed.num_rows(), 2);
    ASSERT_EQ(patterns_reversed.num_cols(), 8);
    EXPECT_EQ(patterns_reversed(0, 0), patterns(0, 4));
    EXPECT_EQ(patterns_reversed(1, 0), patterns(1, 4));
    EXPECT_EQ(patterns_reversed(0, 1), patterns(0, 5));
    EXPECT_EQ(patterns_reversed(1, 1), patterns(1, 5));
    EXPECT_EQ(patterns_reversed(0, 2), patterns(0, 6));
    EXPECT_EQ(patterns_reversed(1, 2), patterns(1, 6));
    EXPECT_EQ(patterns_reversed(0, 3), patterns(0, 7));
    EXPECT_EQ(patterns_reversed(1, 3), patterns(1, 7));
    EXPECT_EQ(patterns_reversed(0, 4), patterns(0, 0));
    EXPECT_EQ(patterns_reversed(1, 4), patterns(1, 0));
    EXPECT_EQ(patterns_reversed(0, 5), patterns(0, 1));
    EXPECT_EQ(patterns_reversed(1, 5), patterns(1, 1));
    EXPECT_EQ(patterns_reversed(0, 6), patterns(0, 2));
    EXPECT_EQ(patterns_reversed(1, 6), patterns(1, 2));
    EXPECT_EQ(patterns_reversed(0, 7), patterns(0, 3));
    EXPECT_EQ(patterns_reversed(1, 7), patterns(1, 3));
}

// Checks get_query_pattern with reverse=false: the word returned for offset i
// must equal the corresponding word of the preprocessing matrix computed for
// the query with its first i characters dropped.
TEST(HirschbergMyers, myers_get_query_pattern)
{
    CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT;
    static_assert(word_size == 32, "This test assumes word_size = 32bit.");
    using test::compute_myers_preprocess_matrix;
    using test::myers_get_query_pattern_test;
    std::string query =
        "AACCGGTTACGTACGT"
        "AAACCCGGGTTTACGT"
        "AAACCCGGGTTTACG";
    // Pattern words 0 and 1 for character 'A' at each of the 32 offsets.
    std::vector<WordType> const word0 = myers_get_query_pattern_test(query, 0, 'A', false);
    std::vector<WordType> const word1 = myers_get_query_pattern_test(query, 1, 'A', false);
    ASSERT_EQ(get_size(word0), get_size(word1));
    int32_t const n_shifts = get_size(word0);
    for (int32_t shift = 0; shift < n_shifts; ++shift)
    {
        std::string const suffix(query.begin() + shift, query.end());
        matrix<WordType> const expected = compute_myers_preprocess_matrix(suffix);
        EXPECT_EQ(word0[shift], expected(0, 0)) << "for shift:" << shift << std::endl;
        if (get_size(suffix) > word_size)
        {
            // The shortened query still needs two words.
            ASSERT_EQ(expected.num_rows(), 2);
            EXPECT_EQ(word1[shift], expected(1, 0)) << "for shift:" << shift << std::endl;
        }
        else
        {
            // Shorter than one word: the second pattern word must be empty.
            EXPECT_EQ(word1[shift], WordType(0)) << "for shift:" << shift << std::endl;
        }
    }
}

// Checks get_query_pattern with reverse=true: the word returned for offset i
// must equal the reverse-pattern column (4) of the preprocessing matrix
// computed for the query with its last i characters dropped.
TEST(HirschbergMyers, myers_get_query_pattern_reverse)
{
    CGA_CONSTEXPR int32_t word_size = sizeof(WordType) * CHAR_BIT;
    static_assert(word_size == 32, "This test assumes word_size = 32bit.");
    using test::compute_myers_preprocess_matrix;
    using test::myers_get_query_pattern_test;
    std::string query =
        "AACCGGTTACGTACGT"
        "AAACCCGGGTTTACGT"
        "AAACCCGGGTTTACG";
    // Reversed pattern words 0 and 1 for character 'A' at each of the 32 offsets.
    std::vector<WordType> const word0 = myers_get_query_pattern_test(query, 0, 'A', true);
    std::vector<WordType> const word1 = myers_get_query_pattern_test(query, 1, 'A', true);
    ASSERT_EQ(get_size(word0), get_size(word1));
    int32_t const n_shifts = get_size(word0);
    for (int32_t shift = 0; shift < n_shifts; ++shift)
    {
        std::string const prefix(query.begin(), query.end() - shift);
        matrix<WordType> const expected = compute_myers_preprocess_matrix(prefix);
        EXPECT_EQ(word0[shift], expected(0, 4)) << "for shift:" << shift << std::endl;
        if (get_size(prefix) > word_size)
        {
            // The shortened query still needs two words.
            ASSERT_EQ(expected.num_rows(), 2);
            EXPECT_EQ(word1[shift], expected(1, 4)) << "for shift:" << shift << std::endl;
        }
        else
        {
            // Shorter than one word: the second pattern word must be empty.
            EXPECT_EQ(word1[shift], WordType(0)) << "for shift:" << shift << std::endl;
        }
    }
}

} // namespace cudaaligner
} // namespace claragenomics
@@ -0,0 +1,33 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <claragenomics/utils/mathutils.hpp>

#include "gtest/gtest.h"

namespace claragenomics
{

namespace cudaaligner
{

// Table-driven check of ceiling_divide: exact multiples divide evenly,
// everything else rounds up.
TEST(TestCudaAlignerMisc, CeilingDivide)
{
    struct DivideCase
    {
        int dividend;
        int divisor;
        int expected;
    };
    const DivideCase cases[] = {
        {0, 5, 0},  // zero dividend
        {5, 5, 1},  // exact multiples
        {10, 5, 2},
        {20, 5, 4},
        {6, 5, 2},  // just above a multiple rounds up
        {4, 5, 1}}; // below one divisor still rounds up to 1
    for (const DivideCase& c : cases)
    {
        EXPECT_EQ(ceiling_divide(c.dividend, c.divisor), c.expected);
    }
}

} // namespace cudaaligner
} // namespace claragenomics
@@ -0,0 +1,136 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "../src/myers_gpu.cuh"
#include "../src/needleman_wunsch_cpu.hpp"

#include <claragenomics/utils/genomeutils.hpp>

#include <random>
#include <gtest/gtest.h>

namespace claragenomics
{

namespace cudaaligner
{

// A single Myers edit-distance test case: a pair of sequences together with
// their expected edit distance.
struct TestCaseData
{
std::string target; // target (reference) sequence
std::string query; // query sequence
int32_t edit_distance; // expected edit distance between query and target
};

std::vector<TestCaseData> create_myers_test_cases()
{
std::vector<TestCaseData> tests;

TestCaseData t;

t.target = "AAAAAAAAAA";
t.query = "CGTCGTCGTC";
t.edit_distance = 10;
tests.push_back(t);

t.target = "AATAATAATA";
t.query = "CGTCGTCGTC";
t.edit_distance = 7;
tests.push_back(t);

t.target = "AATAATAATA";
t.query = "";
t.edit_distance = 10;
tests.push_back(t);

t.target = "";
t.query = "CGTCGTCGTC";
t.edit_distance = 10;
tests.push_back(t);

t.target = "AATAATAATA";
t.query = "C";
t.edit_distance = 10;
tests.push_back(t);

t.target = "CGTCGTCGTC";
t.query = "CGTCGTCGTC";
t.edit_distance = 0;
tests.push_back(t);

t.target = "CGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGT";
t.query = "AGTCGTCGTCCGTAATCGTCCGTCGTCGTCGA";
t.edit_distance = 4;
tests.push_back(t);

t.target = "CGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTC";
t.query = "AGTCGTCGTCCGTAATCGTCCGTCGTCGTCGTA";
t.edit_distance = 4;
tests.push_back(t);

t.target = "GTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTCGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTCGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTC";
t.query = "GTCGTCGTCCGTCGTCGTCCGTCGTCGTCGAAAACGTCGTCCGTCGTCGTCCGTCGTCGAAAACGTCGTCGTCCGTAGTCGTCCGACGTCGTCGTC";
t.edit_distance = 10;
tests.push_back(t);

t.target = "GTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTCGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTCGTCGTCGTCCGTCGTCGTCCGTCGTCGTCGTC";
t.query = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
t.edit_distance = 96;
tests.push_back(t);

std::minstd_rand rng(1);
t.target = claragenomics::genomeutils::generate_random_genome(5000, rng);
t.query = claragenomics::genomeutils::generate_random_genome(4800, rng);
matrix<int> s = needleman_wunsch_build_score_matrix_naive(t.target, t.query);
t.edit_distance = s(s.num_rows() - 1, s.num_cols() - 1);
tests.push_back(t);
return tests;
}

// Parameterized fixture for the edit-distance checks; instantiated with
// create_myers_test_cases() below.
class TestMyersEditDistance : public ::testing::TestWithParam<TestCaseData>
{
};

// The GPU Myers edit distance must match the precomputed expected distance.
TEST_P(TestMyersEditDistance, TestCases)
{
    TestCaseData const testcase = GetParam();
    int32_t const distance     = myers_compute_edit_distance(testcase.target, testcase.query);
    ASSERT_EQ(distance, testcase.edit_distance);
}

// Parameterized fixture for the full-score-matrix checks; instantiated with
// create_myers_test_cases() below.
class TestMyersScoreMatrix : public ::testing::TestWithParam<TestCaseData>
{
};

// The full Myers score matrix must match the naive Needleman-Wunsch score
// matrix entry by entry.
TEST_P(TestMyersScoreMatrix, TestCases)
{
    TestCaseData const testcase = GetParam();

    matrix<int32_t> const myers_matrix = myers_get_full_score_matrix(testcase.target, testcase.query);
    matrix<int32_t> const reference    = needleman_wunsch_build_score_matrix_naive(testcase.target, testcase.query);

    // Dimensions must agree before comparing entries.
    ASSERT_EQ(myers_matrix.num_rows(), reference.num_rows());
    ASSERT_EQ(myers_matrix.num_cols(), reference.num_cols());

    for (int32_t col = 0; col < myers_matrix.num_cols(); ++col)
    {
        for (int32_t row = 0; row < myers_matrix.num_rows(); ++row)
        {
            EXPECT_EQ(myers_matrix(row, col), reference(row, col)) << "index: (" << row << "," << col << ")";
        }
    }
}

INSTANTIATE_TEST_SUITE_P(TestMyersAlgorithm, TestMyersEditDistance, ::testing::ValuesIn(create_myers_test_cases()));
INSTANTIATE_TEST_SUITE_P(TestMyersAlgorithm, TestMyersScoreMatrix, ::testing::ValuesIn(create_myers_test_cases()));

} // namespace cudaaligner
} // namespace claragenomics
@@ -8,17 +8,19 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "gtest/gtest.h"
#include "../src/needleman_wunsch_cpu.hpp"
#include "../src/ukkonen_cpu.hpp"
#include "../src/ukkonen_gpu.cuh"
#include "../src/device_storage.cuh"
#include "../src/batched_device_matrices.cuh"
#include <utils/signed_integer_utils.hpp>
#include <utils/genomeutils.hpp>

#include <claragenomics/utils/signed_integer_utils.hpp>
#include <claragenomics/utils/genomeutils.hpp>
#include <claragenomics/utils/device_buffer.cuh>

#include <cuda_runtime_api.h>
#include <random>
#include <algorithm>
#include "gtest/gtest.h"

namespace claragenomics
{
@@ -108,7 +110,7 @@ class AlignerImplementation : public TestWithParam<TestAlignmentPair>
if (j - i >= -p && j - i <= std::abs(n - m) + p)
{
int32_t k, l;
std::tie(k, l) = nw_cpu::to_band_indices(i, j, p);
std::tie(k, l) = to_band_indices(i, j, p);
ASSERT_EQ(banded_matrix(k, l), regular_matrix_ref(i, j)) << "(" << k << "," << l << ")d=(" << i << "," << j << ") -- " << banded_matrix(k, l) << " != " << regular_matrix_ref(i, j);
}
}
@@ -151,19 +153,19 @@ matrix<int> ukkonen_gpu_build_score_matrix(const std::string& target, const std:
int32_t max_length_difference = std::abs(target_length - query_length);

auto score_matrices = std::make_unique<batched_device_matrices<nw_score_t>>(
1, ukkonen_max_score_matrix_size(query_length, target_length, max_length_difference, ukkonen_p), nullptr, 0);
1, ukkonen_max_score_matrix_size(query_length, target_length, max_length_difference, ukkonen_p), nullptr);

device_storage<int8_t> path_d(max_path_length, 0);
device_buffer<int8_t> path_d(max_path_length);
std::vector<int8_t> path_h(max_path_length);

device_storage<int32_t> path_length_d(1, 0);
device_buffer<int32_t> path_length_d(1);
std::vector<int32_t> path_length_h(1);

device_storage<char> sequences_d(2 * max_alignment_length, 0);
device_buffer<char> sequences_d(2 * max_alignment_length);
CGA_CU_CHECK_ERR(cudaMemcpy(sequences_d.data(), query.c_str(), sizeof(char) * query_length, cudaMemcpyHostToDevice));
CGA_CU_CHECK_ERR(cudaMemcpy(sequences_d.data() + max_alignment_length, target.c_str(), sizeof(char) * target_length, cudaMemcpyHostToDevice));

device_storage<int32_t> sequence_lengths_d(2, 0);
device_buffer<int32_t> sequence_lengths_d(2);
CGA_CU_CHECK_ERR(cudaMemcpy(sequence_lengths_d.data(), &query_length, sizeof(int32_t) * 1, cudaMemcpyHostToDevice));
CGA_CU_CHECK_ERR(cudaMemcpy(sequence_lengths_d.data() + 1, &target_length, sizeof(int32_t) * 1, cudaMemcpyHostToDevice));

@@ -187,16 +189,16 @@ matrix<int> ukkonen_gpu_build_score_matrix(const std::string& target, const std:

TEST_P(AlignerImplementation, UkkonenVsNaiveScoringMatrix)
{
matrix<int> u = nw_cpu::ukkonen_build_score_matrix(param_.target, param_.query, param_.p);
matrix<int> r = nw_cpu::needleman_wunsch_build_score_matrix_naive(param_.target, param_.query);
matrix<int> u = ukkonen_build_score_matrix(param_.target, param_.query, param_.p);
matrix<int> r = needleman_wunsch_build_score_matrix_naive(param_.target, param_.query);

compare_banded_score_matrix(r, u, param_.p);
}

TEST_P(AlignerImplementation, UkkonenGpuVsUkkonenCpuScoringMatrix)
{
matrix<int> u = ukkonen_gpu_build_score_matrix(param_.target, param_.query, param_.p);
matrix<int> r = nw_cpu::ukkonen_build_score_matrix(param_.target, param_.query, param_.p);
matrix<int> r = ukkonen_build_score_matrix(param_.target, param_.query, param_.p);
int const m = param_.query.length() + 1;
int const n = param_.target.length() + 1;
int const p = param_.p;
@@ -212,9 +214,11 @@ TEST_P(AlignerImplementation, UkkonenGpuVsUkkonenCpuScoringMatrix)
if (j - i >= -p && j - i <= n - m + p)
{
int k, l;
std::tie(k, l) = nw_cpu::to_band_indices(i, j, p);
std::tie(k, l) = to_band_indices(i, j, p);
if (u(k, l) != r(k, l))
{
ASSERT_EQ(u(k, l), r(k, l)) << "(" << k << "," << l << ")d[=(" << i << "," << j << ")] -- " << u(k, l) << " != " << r(k, l) << std::endl;
}
}
}
}
@@ -229,19 +233,19 @@ std::vector<int8_t> run_ukkonen_gpu(const std::string& target, const std::string
int32_t max_length_difference = std::abs(target_length - query_length);

auto score_matrices = std::make_unique<batched_device_matrices<nw_score_t>>(
1, ukkonen_max_score_matrix_size(query_length, target_length, max_length_difference, ukkonen_p), nullptr, 0);
1, ukkonen_max_score_matrix_size(query_length, target_length, max_length_difference, ukkonen_p), nullptr);

device_storage<int8_t> path_d(max_path_length, 0);
device_buffer<int8_t> path_d(max_path_length);
std::vector<int8_t> path_h(max_path_length);

device_storage<int32_t> path_length_d(1, 0);
device_buffer<int32_t> path_length_d(1);
std::vector<int32_t> path_length_h(1);

device_storage<char> sequences_d(2 * max_alignment_length, 0);
device_buffer<char> sequences_d(2 * max_alignment_length);
CGA_CU_CHECK_ERR(cudaMemcpy(sequences_d.data(), query.c_str(), sizeof(char) * query_length, cudaMemcpyHostToDevice));
CGA_CU_CHECK_ERR(cudaMemcpy(sequences_d.data() + max_alignment_length, target.c_str(), sizeof(char) * target_length, cudaMemcpyHostToDevice));

device_storage<int32_t> sequence_lengths_d(2, 0);
device_buffer<int32_t> sequence_lengths_d(2);
CGA_CU_CHECK_ERR(cudaMemcpy(sequence_lengths_d.data(), &query_length, sizeof(int32_t) * 1, cudaMemcpyHostToDevice));
CGA_CU_CHECK_ERR(cudaMemcpy(sequence_lengths_d.data() + 1, &target_length, sizeof(int32_t) * 1, cudaMemcpyHostToDevice));

@@ -271,7 +275,7 @@ std::vector<int8_t> run_ukkonen_gpu(const std::string& target, const std::string
TEST_P(AlignerImplementation, UkkonenCpuFullVsUkkonenGpuFull)
{
int32_t const p = 1;
std::vector<int8_t> cpu_bt = nw_cpu::ukkonen_cpu(param_.target, param_.query, p);
std::vector<int8_t> cpu_bt = ukkonen_cpu(param_.target, param_.query, p);
std::vector<int8_t> gpu_bt = run_ukkonen_gpu(param_.target, param_.query, p);

compare_backtrace(cpu_bt, gpu_bt);
@@ -8,34 +8,24 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(cudapoa)

# Process data subdirectory first
add_subdirectory(data)

find_package(CUDA 9.0 QUIET REQUIRED)
# Project specific NVCC flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")

if(NOT ${CUDA_FOUND})
message(FATAL_ERROR "CUDA not detected on system. Please install")
else()
message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -lineinfo -use_fast_math -Xcompiler -Wall,-Wno-pedantic")
endif()

cuda_add_library(cudapoa STATIC
get_property(cga_library_type GLOBAL PROPERTY cga_library_type)
cuda_add_library(cudapoa ${cga_library_type}
src/cudapoa.cpp
src/cudapoa_kernels.cu
src/cudapoa_topsort.cu
src/cudapoa_add_alignment.cu
src/cudapoa_nw.cu
src/cudapoa_nw_banded.cu
src/batch.cpp
src/cudapoa_batch.cpp
src/allocate_block.cpp
)

target_link_libraries(cudapoa cudautils logging utils)
target_link_libraries(cudapoa utils logging)

target_compile_options(cudapoa PRIVATE -Werror -Wall -Wextra -march=native)

@@ -55,10 +45,6 @@ target_include_directories(cudapoa

add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include/cudapoa)

# Add tests folder
add_subdirectory(tests)
add_subdirectory(benchmarks)

install(TARGETS cudapoa
COMPONENT logging
EXPORT cudapoa
@@ -68,5 +54,11 @@ install(TARGETS cudapoa
install(DIRECTORY include/ DESTINATION include)
install(EXPORT cudapoa DESTINATION cmake)

# Add other folders.
add_subdirectory(tests)
add_subdirectory(benchmarks)
add_subdirectory(samples)


# Add auto formatting.
cga_enable_auto_formatting("${CMAKE_CURRENT_SOURCE_DIR}")
@@ -7,7 +7,7 @@ of a single batch of POA.

To run the benchmark, execute
```
./benchmarks/cudapoa/singlebatch
./benchmarks/cudapoa/benchmark_cudapoa_singlebatch
```

## Multi Batch
@@ -17,5 +17,5 @@ of several batched CUDA POA stream that fill up the GPU.

To run the benchmark, execute
```
./benchmarks/cudapoa/multibatch
./benchmarks/cudapoa/benchmark_cudapoa_multibatch
```
@@ -14,6 +14,7 @@
#include <vector>
#include <sstream>
#include <fstream>
#include <cassert>

namespace claragenomics
{
@@ -18,8 +18,7 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(multibatch)
project(benchmark_cudapoa_multibatch)

set(SOURCES
main.cpp
@@ -8,13 +8,14 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <benchmark/benchmark.h>
#include <cuda_runtime_api.h>

#include "../common/utils.hpp"
#include "multi_batch.hpp"
#include "file_location.hpp"
#include "../common/utils.hpp"
#include "cudautils/cudautils.hpp"

#include <claragenomics/utils/cudautils.hpp>

#include <benchmark/benchmark.h>
#include <cuda_runtime_api.h>

namespace claragenomics
{
@@ -24,52 +25,22 @@ namespace cudapoa

static void BM_MultiBatchTest(benchmark::State& state)
{
// TODO: Query size per window from CUDAPOA API
// Assuming 9MB per window.
const size_t size_per_window = 9 * 1024 * 1024;

// Query free total and free GPU memory.
size_t free, total;
CGA_CU_CHECK_ERR(cudaMemGetInfo(&free, &total));

int32_t batches = state.range(0);
int32_t batch_size = state.range(1);
size_t total_memory = (batches * batch_size * size_per_window);

if (total_memory >= free)
{
state.SkipWithError("Not enough available memory for config, skipping");
state.KeepRunning(); // Added as a WAR to Google Benchmark crashing in DEBUG mode.
}
else if (total_memory < (free / 2))
{
state.SkipWithError("Config using less than half of available memory, skipping");
state.KeepRunning(); // Added as a WAR to Google Benchmark crashing in DEBUG mode.
}
else
int32_t batches = state.range(0);
const int32_t total_windows = 5500;
MultiBatch mb(batches, std::string(CUDAPOA_BENCHMARK_DATA_DIR) + "/sample-windows.txt", total_windows);
for (auto _ : state)
{
const int32_t total_windows = 5500;
MultiBatch mb(batches, batch_size, std::string(CUDAPOA_BENCHMARK_DATA_DIR) + "/sample-windows.txt", total_windows);
for (auto _ : state)
{
mb.process_batches();
}
mb.process_batches();
}
}

static void CustomArguments(benchmark::internal::Benchmark* b)
{
const int32_t min_total_windows = 512;
const int32_t max_total_windows = 4096;
for (int32_t batches = 1; batches <= 64; batches *= 2)
for (int32_t batches = 1; batches <= 16; batches *= 2)
{
for (int32_t batch_size = 64; batch_size <= 1024; batch_size *= 2)
{
if (batches * batch_size <= max_total_windows && batches * batch_size >= min_total_windows)
{
b->Args({batches, batch_size});
}
}
b->Args({batches});
}
}

@@ -8,11 +8,13 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "../common/utils.hpp"

#include <claragenomics/cudapoa/batch.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>

#include <future>
#include <numeric>
#include "cudapoa/batch.hpp"
#include "../common/utils.hpp"
#include "utils/signed_integer_utils.hpp"

namespace claragenomics
{
@@ -27,22 +29,26 @@ class MultiBatch
/// \brief Construct a multi batch processor
///
/// \param num_batches Number of cudapoa batches
/// \param max_poas_per_batch Batch size
/// \param filename Filename with window data
MultiBatch(int32_t num_batches, int32_t max_poas_per_batch, const std::string& filename, int32_t total_windows = -1)
MultiBatch(int32_t num_batches, const std::string& filename, int32_t total_windows = -1)
: num_batches_(num_batches)
, max_poas_per_batch_(max_poas_per_batch)
{
parse_window_data_file(windows_, filename, total_windows);

assert(get_size(windows_) > 0);

size_t total = 0, free = 0;
cudaSetDevice(0);
cudaMemGetInfo(&free, &total);
size_t mem_per_batch = 0.9 * free / num_batches_;
for (int32_t batch = 0; batch < num_batches_; batch++)
{
cudaStream_t stream;
cudaStreamCreate(&stream);
batches_.emplace_back(create_batch(max_poas_per_batch, 200, 0, OutputType::consensus));
batches_.back()->set_cuda_stream(stream);
batches_.emplace_back(create_batch(200,
0, stream, mem_per_batch,
OutputType::consensus,
-8, -6, 8, false));
}
}

@@ -66,19 +72,33 @@ class MultiBatch

// Lambda function for adding windows to batches.
auto fill_next_batch = [&mutex_windows, &next_window_index, this](Batch* batch) -> std::pair<int32_t, int32_t> {
// Reset batch before adding fresh set of windows.
batch->reset();

// Use mutex to read the vector containing windows in a threadsafe manner.
std::lock_guard<std::mutex> guard(mutex_windows);

// TODO: Reducing window wize by 10 for debugging.
int32_t initial_count = next_window_index;
int32_t count = get_size(windows_);
int32_t num_inserted = 0;
while (next_window_index < count)
{
if (num_inserted < max_poas_per_batch_)
Group poa_group;
const std::vector<std::string>& window = windows_[next_window_index];
for (auto& seq : window)
{
Entry e{};
e.seq = seq.c_str();
e.weights = NULL;
e.length = seq.length();
poa_group.push_back(e);
}

std::vector<StatusType> s;
StatusType status = batch->add_poa_group(s, poa_group);
if (status == StatusType::success)
{
next_window_index++;
num_inserted++;
}
else
{
@@ -93,24 +113,7 @@ class MultiBatch
auto process_batch = [&fill_next_batch, this](Batch* batch) -> void {
while (true)
{
batch->reset();

std::pair<int32_t, int32_t> range = fill_next_batch(batch);
for (int32_t i = range.first; i < range.second; i++)
{
if (batch->add_poa() == StatusType::success)
{
const std::vector<std::string>& window = windows_[i];
for (auto& seq : window)
{
batch->add_seq_to_poa(seq.c_str(), NULL, seq.length());
}
}
else
{
throw std::runtime_error("Could not add new POA to batch");
}
}
if (batch->get_total_poas() > 0)
{
std::vector<std::string> consensus_temp;
@@ -200,7 +203,6 @@ class MultiBatch

private:
int32_t num_batches_;
int32_t max_poas_per_batch_;
std::vector<std::unique_ptr<Batch>> batches_;
std::vector<std::vector<std::string>> windows_;
std::vector<std::string> consensus_;
@@ -18,8 +18,7 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(singlebatch)
project(benchmark_cudapoa_singlebatch)

set(SOURCES
main.cpp
@@ -8,11 +8,11 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <benchmark/benchmark.h>

#include "single_batch.hpp"
#include "file_location.hpp"

#include <benchmark/benchmark.h>

namespace claragenomics
{

@@ -8,9 +8,10 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "cudapoa/batch.hpp"
#include "../common/utils.hpp"
#include "utils/signed_integer_utils.hpp"

#include <claragenomics/cudapoa/batch.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>

namespace claragenomics
{
@@ -34,10 +35,15 @@ class SingleBatch

assert(get_size(windows_) > 0);

batch_ = create_batch(max_poas_per_batch, 200, 0, OutputType::consensus, -8, -6, 8, false);
cudaStream_t stream;
cudaStreamCreate(&stream);
batch_->set_cuda_stream(stream);

size_t total = 0, free = 0;
cudaSetDevice(0);
cudaMemGetInfo(&free, &total);
size_t mem_per_batch = 0.9 * free;

batch_ = create_batch(200, 0, stream, mem_per_batch, OutputType::consensus, -8, -6, 8, false);
}

~SingleBatch()
@@ -53,15 +59,18 @@ class SingleBatch
int32_t total_windows = get_size(windows_);
for (int32_t i = 0; i < max_poas_per_batch_; i++)
{
batch_->add_poa();
Group poa_group;
const auto& window = windows_[i % total_windows];
for (int32_t s = 0; s < get_size(window); s++)
{
batch_->add_seq_to_poa(
window[s].c_str(),
NULL,
window[s].length());
Entry e{};
e.seq = window[s].c_str();
e.weights = NULL;
e.length = window[s].length();
poa_group.push_back(e);
}
std::vector<StatusType> error_status;
batch_->add_poa_group(error_status, poa_group);
}
}

@@ -8,8 +8,6 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)

set(CUDAPOA_BENCHMARK_DATA_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../data")
configure_file(file_location.hpp.in ${PROJECT_BINARY_DIR}/data/file_location.hpp @ONLY)

@@ -0,0 +1,132 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include <claragenomics/cudapoa/cudapoa.hpp>

#include <memory>
#include <vector>
#include <stdint.h>
#include <string>
#include <iostream>
#include <cuda_runtime_api.h>

namespace claragenomics
{

namespace cudapoa
{

/// A structure to represent a single sequence entry within a POA group.
struct Entry
{
/// Pointer to the string representing the sequence. Non-owning: the caller's
/// buffer must stay alive while the batch processes the group.
const char* seq;
/// Pointer to an array with one weight per base in seq. Non-owning; callers
/// in this codebase pass NULL/nullptr when no per-base weights are used.
const int8_t* weights;
/// Length of the sequence pointed to by seq.
int32_t length;
};

/// A type defining the set and order of Entry's in which a POA is processed.
/// Modern alias-declaration form (the file already relies on C++14+ features
/// such as std::make_unique), replacing the legacy typedef.
using Group = std::vector<Entry>;

/// \class Batch
/// Interface for a batched GPU CUDA POA processor: POA groups are added to the
/// batch, processed on the device, and their results queried back.
class Batch
{
public:
/// \brief Concrete implementations own GPU resources (CudapoaBatch has a custom
/// dtor), so the destructor is virtual with a default implementation to
/// guarantee correct cleanup when deleting through a Batch pointer.
virtual ~Batch() = default;

/// \brief Add a new group to the batch to run the POA algorithm on. Based on the constraints
/// of the batch, not all entries in a group may be added. This will be reflected in
/// the per_seq_status of the call. Those entries that were added will be shown with a success.
///
/// \param per_seq_status Reference to an output vector of StatusType that holds
/// the processing status of each entry in the group.
/// NOTE: This API clears old entries in the vector.
/// \param poa_group Vector of Entry's to process in POA. Based on the constraints
/// of the batch, not all entries in a group may be added.
/// This will be reflected in the per_seq_status of the call. Those entries that were
/// added will show a success status. The POA algorithm will run with
/// the sequences that were added.
///
/// \return Status representing whether the POA group was successfully added to the batch.
virtual StatusType add_poa_group(std::vector<StatusType>& per_seq_status,
const Group& poa_group) = 0;

/// \brief Get total number of partial order alignments currently in the batch.
///
/// \return Total POAs in batch.
virtual int32_t get_total_poas() const = 0;

/// \brief Run the partial order alignment algorithm over all POAs in the batch.
virtual void generate_poa() = 0;

/// \brief Get the consensus for each POA. Call after generate_poa().
///
/// \param consensus Reference to vector where consensus strings
/// will be returned, one per POA group
/// \param coverage Reference to vector where coverage of each
/// base in each consensus string is returned
/// \param output_status Reference to vector where per-group errors
/// during kernel execution are captured
virtual StatusType get_consensus(std::vector<std::string>& consensus,
std::vector<std::vector<uint16_t>>& coverage,
std::vector<claragenomics::cudapoa::StatusType>& output_status) = 0;

/// \brief Get the multiple sequence alignments for each POA. Call after generate_poa().
///
/// \param msa Reference to vector where the msa strings of each
/// poa are returned
/// \param output_status Reference to vector where per-group errors
/// during kernel execution are captured
virtual StatusType get_msa(std::vector<std::vector<std::string>>& msa,
std::vector<StatusType>& output_status) = 0;

/// \brief Return batch ID.
///
/// \return Batch ID
virtual int32_t batch_id() const = 0;

/// \brief Reset the batch, clearing all added POA groups. Must be called before re-using the batch.
virtual void reset() = 0;
};

/// \brief Factory function that creates a new CUDA Batch object.
///
/// \param max_sequences_per_poa Maximum number of sequences allowed per POA group
/// \param device_id GPU device on which to run the CUDA POA algorithm
/// \param stream CUDA stream to use on the GPU
/// \param max_mem Maximum GPU memory (bytes) this batch may use; sizes the batch capacity
/// \param output_mask Which outputs to produce from POA (callers in this file pass
/// OutputType::msa or OutputType::consensus)
/// \param gap_score Score to be assigned to a gap (negative penalizes gaps)
/// \param mismatch_score Score to be assigned to a mismatch (negative penalizes mismatches)
/// \param match_score Score to be assigned for a match
/// \param cuda_banded_alignment Whether to use banded alignment
///
/// \return Returns a unique pointer to a new Batch object
std::unique_ptr<Batch> create_batch(int32_t max_sequences_per_poa,
int32_t device_id,
cudaStream_t stream,
size_t max_mem,
int8_t output_mask,
int16_t gap_score,
int16_t mismatch_score,
int16_t match_score,
bool cuda_banded_alignment);

/// \}

} // namespace cudapoa

} // namespace claragenomics
@@ -19,15 +19,18 @@ namespace claragenomics
namespace cudapoa
{
/// CUDA POA error type
enum class StatusType
enum StatusType
{
success = 0,
exceeded_maximum_poas,
exceeded_maximum_sequence_size,
exceeded_maximum_sequences_per_poa,
exceeded_batch_size,
node_count_exceeded_maximum_graph_size,
edge_count_exceeded_maximum_graph_size,
seq_len_exceeded_maximum_nodes_per_window,
loop_count_exceeded_upper_bound,
output_type_unavailable,
generic_error
};

This file was deleted.

@@ -0,0 +1,25 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

# Sample executable demonstrating the CUDAPOA batch API.
project(sample_cudapoa)

# Pick up the include directory exported as a global property by the
# cudapoa data subdirectory (provides file_location.hpp).
get_property(cudapoa_data_include_dir GLOBAL PROPERTY cudapoa_data_include_dir)
include_directories(${cudapoa_data_include_dir})

add_executable(${PROJECT_NAME}
sample_cudapoa.cpp
)

# Only the cudapoa library is linked; it carries its own transitive dependencies.
target_link_libraries(${PROJECT_NAME}
cudapoa
)

install(TARGETS ${PROJECT_NAME}
DESTINATION samples/cudapoa)
@@ -0,0 +1,227 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "../benchmarks/common/utils.hpp"

#include <file_location.hpp>
#include <claragenomics/cudapoa/cudapoa.hpp>
#include <claragenomics/cudapoa/batch.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>
#include <claragenomics/utils/cudautils.hpp>

#include <cuda_runtime_api.h>
#include <vector>
#include <string>
#include <stdexcept>
#include <unistd.h>

using namespace claragenomics;
using namespace claragenomics::cudapoa;

/// \brief Create and configure a CUDAPOA Batch on the first GPU.
///
/// Sizes the batch to use 90% of the currently free device memory.
///
/// \param msa When true the batch is configured to produce MSA output,
///            otherwise consensus output.
/// \return Unique pointer to the newly created Batch.
std::unique_ptr<Batch> initialize_batch(bool msa)
{
    // Get device information.
    int32_t device_count = 0;
    CGA_CU_CHECK_ERR(cudaGetDeviceCount(&device_count));
    assert(device_count > 0);

    size_t total = 0, free = 0;
    // Check these CUDA calls like the one above; silently ignoring a failure
    // here would make the mem_per_batch computation below meaningless.
    CGA_CU_CHECK_ERR(cudaSetDevice(0)); // Using first GPU for sample.
    CGA_CU_CHECK_ERR(cudaMemGetInfo(&free, &total));

    // Initialize internal logging framework.
    Init();

    // Initialize CUDAPOA batch object for batched processing of POAs on the GPU.
    const int32_t max_sequences_per_poa_group = 100;
    const int32_t device_id                   = 0;
    cudaStream_t stream                       = 0; // default stream
    size_t mem_per_batch                      = 0.9 * free; // Using 90% of GPU available memory for CUDAPOA batch.
    const int32_t mismatch_score = -6, gap_score = -8, match_score = 8;
    bool banded_alignment = false;

    std::unique_ptr<Batch> batch = create_batch(max_sequences_per_poa_group,
                                                device_id,
                                                stream,
                                                mem_per_batch,
                                                msa ? OutputType::msa : OutputType::consensus,
                                                gap_score,
                                                mismatch_score,
                                                match_score,
                                                banded_alignment);

    // Return the local by value: NRVO/implicit move applies, and
    // `return std::move(batch)` would actively block copy elision (C++ CG F.48).
    return batch;
}

/// \brief Run POA generation on all groups currently in the batch and fetch results.
///
/// \param batch Batch to process; must already contain the POA groups to run.
/// \param msa   When true fetch multiple sequence alignments, otherwise consensus.
/// \param print When true write the per-group results to stdout; errors always go to stderr.
void process_batch(Batch* batch, bool msa, bool print)
{
    batch->generate_poa();

    StatusType status = StatusType::success;
    if (msa)
    {
        // Grab MSA results for all POA groups in batch.
        // NOTE: named msa_results (not msa) to avoid shadowing the bool parameter.
        std::vector<std::vector<std::string>> msa_results; // MSA per group
        std::vector<StatusType> output_status;             // Status of MSA generation per group

        status = batch->get_msa(msa_results, output_status);
        if (status != StatusType::success)
        {
            std::cerr << "Could not generate MSA for batch : " << status << std::endl;
        }

        for (int32_t g = 0; g < get_size(msa_results); g++)
        {
            if (output_status[g] != StatusType::success)
            {
                std::cerr << "Error generating MSA for POA group " << g << ". Error type " << output_status[g] << std::endl;
            }
            else
            {
                if (print)
                {
                    for (const auto& alignment : msa_results[g])
                    {
                        std::cout << alignment << std::endl;
                    }
                }
            }
        }
    }
    else
    {
        // Grab consensus results for all POA groups in batch.
        std::vector<std::string> consensus;          // Consensus string for each POA group
        std::vector<std::vector<uint16_t>> coverage; // Per base coverage for each consensus
        std::vector<StatusType> output_status;       // Status of consensus generation per group

        status = batch->get_consensus(consensus, coverage, output_status);
        if (status != StatusType::success)
        {
            std::cerr << "Could not generate consensus for batch : " << status << std::endl;
        }

        for (int32_t g = 0; g < get_size(consensus); g++)
        {
            if (output_status[g] != StatusType::success)
            {
                std::cerr << "Error generating consensus for POA group " << g << ". Error type " << output_status[g] << std::endl;
            }
            else
            {
                if (print)
                {
                    std::cout << consensus[g] << std::endl;
                }
            }
        }
    }
}

/// Sample driver: parses flags, loads pre-canned window data, and feeds POA
/// groups through a CUDAPOA batch, processing the batch whenever it fills up
/// and once after the final window has been added.
int main(int argc, char** argv)
{
    // Process command line options.
    int c      = 0;
    bool msa   = false;
    bool help  = false;
    bool print = false;

    while ((c = getopt(argc, argv, "mhp")) != -1)
    {
        switch (c)
        {
        case 'm':
            msa = true;
            break;
        case 'p':
            print = true;
            break;
        case 'h':
            help = true;
            break;
        }
    }

    if (help)
    {
        std::cout << "CUDAPOA API sample program. Runs consensus or MSA generation on pre-canned data." << std::endl;
        std::cout << "Usage:" << std::endl;
        std::cout << "./sample_cudapoa [-m] [-h]" << std::endl;
        std::cout << "-m : Generate MSA (if not provided, generates consensus by default)" << std::endl;
        std::cout << "-p : Print the MSA or consensus output to stdout" << std::endl;
        std::cout << "-h : Print help message" << std::endl;
        std::exit(0);
    }

    // Load input data. Each POA group is represented as a vector of strings. The sample
    // data has many such POA groups to process, hence the data is loaded into a vector
    // of vector of strings.
    const std::string input_data = std::string(CUDAPOA_BENCHMARK_DATA_DIR) + "/sample-windows.txt";
    std::vector<std::vector<std::string>> windows;
    parse_window_data_file(windows, input_data, 1000); // Generate windows.
    assert(get_size(windows) > 0);

    // Initialize batch.
    std::unique_ptr<Batch> batch = initialize_batch(msa);

    // Loop over all the POA groups, add them to the batch and process them.
    int32_t window_count = 0;
    for (int32_t i = 0; i < get_size(windows);)
    {
        const std::vector<std::string>& window = windows[i];

        Group poa_group;
        // Create a new entry for each sequence and add it to the group.
        for (const auto& seq : window)
        {
            Entry poa_entry{};
            poa_entry.seq     = seq.c_str();
            poa_entry.length  = seq.length();
            poa_entry.weights = nullptr;
            poa_group.push_back(poa_entry);
        }

        std::vector<StatusType> seq_status;
        StatusType status = batch->add_poa_group(seq_status, poa_group);

        if (status == StatusType::success)
        {
            // Check if all sequences in the POA group were added successfully.
            for (const auto& s : seq_status)
            {
                if (s == StatusType::exceeded_maximum_sequence_size)
                {
                    std::cerr << "Dropping sequence because sequence exceeded maximum size" << std::endl;
                }
            }
            i++;
        }

        // Process the batch when it is full, or once the last window has been added.
        // BUGFIX: i has already been incremented on success, so the final window is
        // in the batch exactly when i == get_size(windows). The previous check
        // (i == get_size(windows) - 1) fired one window too early and the last
        // window was never processed.
        if (status == StatusType::exceeded_maximum_poas || (i == get_size(windows)))
        {
            // No more POA groups can be added to batch. Now process batch.
            process_batch(batch.get(), msa, print);

            // After results are generated for the batch, reset it to make room for the next set of POA groups.
            batch->reset();

            std::cout << "Processed windows " << window_count << " - " << i << std::endl;
            window_count = i;
        }

        if (status != StatusType::exceeded_maximum_poas && status != StatusType::success)
        {
            // NOTE(review): a persistent per-group error leaves i unchanged and would
            // retry the same window; unchanged from the original control flow.
            std::cerr << "Could not add POA group to batch. Error code " << status << std::endl;
        }
    }

    return 0;
}

Large diffs are not rendered by default.

@@ -10,15 +10,14 @@

#pragma once

#include "cudapoa_kernels.cuh"

#include <memory>
#include <vector>
#include <stdint.h>
#include <string>

#include <cuda_runtime_api.h>

#include "cudapoa_kernels.cuh"

namespace claragenomics
{

@@ -28,7 +27,7 @@ namespace cudapoa
class BatchBlock
{
public:
BatchBlock(int32_t device_id_, int32_t max_poas, int32_t max_sequences_per_poa, int8_t output_mask, bool banded_alignment = false);
BatchBlock(int32_t device_id_, size_t avail_mem, int32_t max_sequences_per_poa, int8_t output_mask, bool banded_alignment = false);
~BatchBlock();

void get_output_details(OutputDetails** output_details_h_p, OutputDetails** output_details_d_p);
@@ -43,8 +42,15 @@ class BatchBlock

uint8_t* get_block_device();

int32_t get_max_poas() const { return max_poas_; };

protected:
void calculate_size();
// Returns amount of host and device memory needed to store metadata per POA entry.
// The first two elements of the tuple are fixed host and device sizes that
// don't vary based on POA count. The latter two are host and device
// buffer sizes that scale with number of POA entries to process. These sizes do
// not include the scoring matrix needs for POA processing.
std::tuple<int64_t, int64_t, int64_t, int64_t> calculate_space_per_poa();

protected:
// Maximum POAs to process in batch.
@@ -61,18 +67,18 @@ class BatchBlock
uint8_t* block_data_d_;

// Accumulator for the memory size
uint64_t total_h_ = 0;
uint64_t total_d_ = 0;
int64_t total_h_ = 0;
int64_t total_d_ = 0;

// Offset index for pointing a buffer to block memory
uint64_t offset_h_ = 0;
uint64_t offset_d_ = 0;

int32_t input_size_;
int32_t output_size_;
int32_t matrix_sequence_dimension_;
int32_t max_graph_dimension_;
uint16_t max_nodes_per_window_;
int64_t offset_h_ = 0;
int64_t offset_d_ = 0;

int64_t input_size_ = 0;
int64_t output_size_ = 0;
int32_t matrix_sequence_dimension_ = 0;
int32_t max_graph_dimension_ = 0;
int32_t max_nodes_per_window_ = 0;
int32_t device_id_;

// Bit field for output type
@@ -8,25 +8,35 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "cudapoa/batch.hpp"
#include "cudapoa_batch.hpp"

#include <claragenomics/cudapoa/batch.hpp>

namespace claragenomics
{

namespace cudapoa
{

std::unique_ptr<Batch> create_batch(int32_t max_poas,
int32_t max_sequences_per_poa,
std::unique_ptr<Batch> create_batch(int32_t max_sequences_per_poa,
int32_t device_id,
cudaStream_t stream,
size_t max_mem,
int8_t output_mask,
int16_t gap_score,
int16_t mismatch_score,
int16_t match_score,
bool cuda_banded_alignment)
{
return std::make_unique<CudapoaBatch>(max_poas, max_sequences_per_poa, device_id, output_mask, gap_score, mismatch_score, match_score, cuda_banded_alignment);
return std::make_unique<CudapoaBatch>(max_sequences_per_poa,
device_id,
stream,
max_mem,
output_mask,
gap_score,
mismatch_score,
match_score,
cuda_banded_alignment);
}

} // namespace cudapoa
@@ -8,8 +8,8 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <cudapoa/cudapoa.hpp>
#include <logging/logging.hpp>
#include <claragenomics/cudapoa/cudapoa.hpp>
#include <claragenomics/logging/logging.hpp>

namespace claragenomics
{
@@ -9,6 +9,9 @@
*/

#include "cudapoa_kernels.cuh"

#include <claragenomics/utils/cudautils.hpp>

#include <stdio.h>

namespace claragenomics
@@ -20,7 +23,8 @@ namespace cudapoa
/**
* @brief Device function for adding a new alignment to the partial order alignment graph.
*
* @param[in/out] nodes Device buffer with unique nodes in graph
* @param[out] new_node_count Number of nodes in graph after update
* @param[in/out] nodes Device buffer with unique nodes in graph
* @param[in] node_count Number of nodes in graph
* @param[in] node_alignments Device buffer with alignment nodes per node in graph
* @param[in] node_alignment_count Device buffer with number of aligned nodes
@@ -43,13 +47,14 @@ namespace cudapoa
* @param[in] s Current sequence id
* @param[in] max_sequences_per_poa Maximum sequences allowed in a graph
*
* @return Number of nodes in graph after update
* @return Status code for any errors encountered.
*/

template <bool msa = false>
__device__
uint16_t
addAlignmentToGraph(uint8_t* nodes,
uint8_t
addAlignmentToGraph(uint16_t& new_node_count,
uint8_t* nodes,
uint16_t node_count,
uint16_t* node_alignments, uint16_t* node_alignment_count,
uint16_t* incoming_edges, uint16_t* incoming_edge_count,
@@ -101,7 +106,7 @@ __device__
curr_node_id = node_count++;
if (node_count >= CUDAPOA_MAX_NODES_PER_WINDOW)
{
break;
return static_cast<uint8_t>(StatusType::node_count_exceeded_maximum_graph_size);
}
//printf("create new node %d\n", curr_node_id);
new_node = true;
@@ -157,7 +162,7 @@ __device__
curr_node_id = node_count++;
if (node_count >= CUDAPOA_MAX_NODES_PER_WINDOW)
{
break;
return static_cast<uint8_t>(StatusType::node_count_exceeded_maximum_graph_size);
}
//printf("create new node %d\n", curr_node_id);
nodes[curr_node_id] = read_base;
@@ -232,7 +237,8 @@ __device__

if (out_count + 1 >= CUDAPOA_MAX_NODE_EDGES || in_count + 1 >= CUDAPOA_MAX_NODE_EDGES)
{
printf("exceeded max edge count\n");
return static_cast<uint8_t>(StatusType::edge_count_exceeded_maximum_graph_size);
//printf("exceeded max edge count\n");
}
}
else if (msa) //if edge exists and for msa generation
@@ -261,7 +267,8 @@ __device__
}
}
//printf("final size %d\n", node_count);
return node_count;
new_node_count = node_count;
return static_cast<uint8_t>(StatusType::success);
}

// kernel that calls the addAlignmentToGraph device funtion
@@ -285,24 +292,26 @@ __global__ void addAlignmentKernel(uint8_t* nodes,
uint32_t max_sequences_per_poa)
{
// all pointers will be allocated in unified memory visible to both host and device
*node_count = addAlignmentToGraph(nodes,
*node_count,
node_alignments, node_alignment_count,
incoming_edges, incoming_edge_count,
outgoing_edges, outgoing_edge_count,
incoming_edge_w, outgoing_edge_w,
*alignment_length,
graph,
alignment_graph,
read,
alignment_read,
node_coverage_counts,
base_weights,
sequence_begin_nodes_ids,
outgoing_edges_coverage,
outgoing_edges_coverage_count,
s,
max_sequences_per_poa);
uint16_t new_node_count;
uint8_t error_code = addAlignmentToGraph(new_node_count, nodes,
*node_count,
node_alignments, node_alignment_count,
incoming_edges, incoming_edge_count,
outgoing_edges, outgoing_edge_count,
incoming_edge_w, outgoing_edge_w,
*alignment_length,
graph,
alignment_graph,
read,
alignment_read,
node_coverage_counts,
base_weights,
sequence_begin_nodes_ids,
outgoing_edges_coverage,
outgoing_edges_coverage_count,
s,
max_sequences_per_poa);
*node_count = new_node_count;
}

// Host function that calls the kernel
@@ -343,6 +352,7 @@ void addAlignment(uint8_t* nodes,
outgoing_edges_coverage_count,
s,
max_sequences_per_poa);
CGA_CU_CHECK_ERR(cudaPeekAtLastError());
}

} // namespace cudapoa
@@ -8,16 +8,16 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <algorithm>
#include <cstring>

#include "allocate_block.hpp"
#include "cudapoa_batch.hpp"
#include "allocate_block.hpp"
#include "cudapoa_kernels.cuh"

#include <cudautils/cudautils.hpp>
#include <logging/logging.hpp>
#include <utils/signed_integer_utils.hpp>
#include <claragenomics/utils/cudautils.hpp>
#include <claragenomics/logging/logging.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>

#include <algorithm>
#include <cstring>

#ifndef TABS
#define TABS printTabs(bid_)
@@ -67,16 +67,29 @@ void CudapoaBatch::initialize_graph_details()
batch_block_->get_graph_details(&graph_details_d_);
}

CudapoaBatch::CudapoaBatch(int32_t max_poas, int32_t max_sequences_per_poa, int32_t device_id, int8_t output_mask, int16_t gap_score, int16_t mismatch_score, int16_t match_score, bool cuda_banded_alignment)
: max_poas_(throw_on_negative(max_poas, "Maximum POAs in batch has to be non-negative"))
, max_sequences_per_poa_(throw_on_negative(max_sequences_per_poa, "Maximum sequences per POA has to be non-negative"))
CudapoaBatch::CudapoaBatch(int32_t max_sequences_per_poa,
int32_t device_id,
cudaStream_t stream,
size_t max_mem,
int8_t output_mask,
int16_t gap_score,
int16_t mismatch_score,
int16_t match_score,
bool cuda_banded_alignment)
: max_sequences_per_poa_(throw_on_negative(max_sequences_per_poa, "Maximum sequences per POA has to be non-negative"))
, device_id_(throw_on_negative(device_id, "Device ID has to be non-negative"))
, stream_(stream)
, output_mask_(output_mask)
, gap_score_(gap_score)
, mismatch_score_(mismatch_score)
, match_score_(match_score)
, banded_alignment_(cuda_banded_alignment)
, batch_block_(new BatchBlock(device_id, max_poas, max_sequences_per_poa, output_mask, cuda_banded_alignment))
, batch_block_(new BatchBlock(device_id,
throw_on_negative(max_mem, "Maximum memory per batch has to be non-negative"),
max_sequences_per_poa,
output_mask,
cuda_banded_alignment))
, max_poas_(batch_block_->get_max_poas())
{
bid_ = CudapoaBatch::batches++;

@@ -86,62 +99,13 @@ CudapoaBatch::CudapoaBatch(int32_t max_poas, int32_t max_sequences_per_poa, int3
print_batch_debug_message(msg);

// Allocate host memory and CUDA memory based on max sequence and target counts.

// Verify that maximum sequence size is in multiples of tb size.
// We subtract one because the matrix dimension needs to be one element larger
// than the sequence size.
if (CUDAPOA_MAX_SEQUENCE_SIZE % CUDAPOA_THREADS_PER_BLOCK != 0)
{
CGA_LOG_CRITICAL("Thread block size needs to be in multiples of 32.");
exit(-1);
}

initialize_input_details();

int32_t input_size = max_poas_ * max_sequences_per_poa_ * CUDAPOA_MAX_SEQUENCE_SIZE; //TODO how big does this need to be
msg = " Allocated input buffers of size " + std::to_string((static_cast<float>(input_size) / (1024 * 1024))) + "MB on device ";
print_batch_debug_message(msg);

initialize_output_details();

input_size += input_size * sizeof(uint16_t);
msg = " Allocated output buffers of size " + std::to_string((static_cast<float>(input_size) / (1024 * 1024))) + "MB on device ";
print_batch_debug_message(msg);

initialize_alignment_details();
initialize_graph_details();
initialize_alignment_details();

// Debug print for size allocated.
int32_t matrix_sequence_dimension = banded_alignment_ ? CUDAPOA_BANDED_MAX_MATRIX_SEQUENCE_DIMENSION : CUDAPOA_MAX_MATRIX_SEQUENCE_DIMENSION;
int32_t max_graph_dimension = cuda_banded_alignment ? CUDAPOA_MAX_MATRIX_GRAPH_DIMENSION_BANDED : CUDAPOA_MAX_MATRIX_GRAPH_DIMENSION;

int32_t temp_size = (sizeof(int16_t) * max_graph_dimension * matrix_sequence_dimension * max_poas_);

temp_size += 2 * (sizeof(int16_t) * max_graph_dimension * max_poas_);
msg = " Allocated temp buffers of size " + std::to_string((static_cast<float>(temp_size) / (1024 * 1024))) + "MB on device ";
print_batch_debug_message(msg);

// Debug print for size allocated.
uint16_t max_nodes_per_window = banded_alignment_ ? CUDAPOA_MAX_NODES_PER_WINDOW_BANDED : CUDAPOA_MAX_NODES_PER_WINDOW;
temp_size = sizeof(uint8_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * CUDAPOA_MAX_NODE_ALIGNMENTS * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * CUDAPOA_MAX_NODE_EDGES * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * CUDAPOA_MAX_NODE_EDGES * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * CUDAPOA_MAX_NODE_EDGES * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * CUDAPOA_MAX_NODE_EDGES * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(int16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(int16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(int8_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(bool) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
temp_size += sizeof(uint16_t) * max_nodes_per_window * max_poas_;
msg = " Allocated temp buffers of size " + std::to_string((static_cast<float>(temp_size) / (1024 * 1024))) + "MB on device ";
print_batch_debug_message(msg);
// Call reset function to cleanly initialize members.
reset();
}

CudapoaBatch::~CudapoaBatch()
@@ -150,6 +114,15 @@ CudapoaBatch::~CudapoaBatch()
print_batch_debug_message(msg);
}

void CudapoaBatch::reset()
{
poa_count_ = 0;
num_nucleotides_copied_ = 0;
global_sequence_idx_ = 0;
next_scores_offset_ = 0;
avail_scorebuf_mem_ = alignment_details_d_->scorebuf_alloc_size;
}

int32_t CudapoaBatch::batch_id() const
{
return bid_;
@@ -190,7 +163,6 @@ void CudapoaBatch::generate_poa()
max_sequences_per_poa_,
output_mask_);

CGA_CU_CHECK_ERR(cudaPeekAtLastError());
msg = " Launched kernel on device ";
print_batch_debug_message(msg);
}
@@ -201,26 +173,36 @@ void CudapoaBatch::decode_cudapoa_kernel_error(claragenomics::cudapoa::StatusTyp
switch (error_type)
{
case claragenomics::cudapoa::StatusType::node_count_exceeded_maximum_graph_size:
CGA_LOG_ERROR("Kernel Error:: Node count exceeded maximum nodes per window\n");
output_status.emplace_back(claragenomics::cudapoa::StatusType::node_count_exceeded_maximum_graph_size);
CGA_LOG_WARN("Kernel Error:: Node count exceeded maximum nodes per graph in batch {}\n", bid_);
output_status.emplace_back(error_type);
break;
case claragenomics::cudapoa::StatusType::edge_count_exceeded_maximum_graph_size:
CGA_LOG_WARN("Kernel Error:: Edge count exceeded maximum edges per graph in batch {}\n", bid_);
output_status.emplace_back(error_type);
break;
case claragenomics::cudapoa::StatusType::seq_len_exceeded_maximum_nodes_per_window:
CGA_LOG_ERROR("Kernel Error::Sequence length exceeded maximum nodes per window\n");
output_status.emplace_back(claragenomics::cudapoa::StatusType::seq_len_exceeded_maximum_nodes_per_window);
CGA_LOG_WARN("Kernel Error::Sequence length exceeded maximum nodes per window in batch {}\n", bid_);
output_status.emplace_back(error_type);
break;
case claragenomics::cudapoa::StatusType::loop_count_exceeded_upper_bound:
CGA_LOG_ERROR("Kernel Error::Loop count exceeded upper bound in nw algorithm\n");
output_status.emplace_back(claragenomics::cudapoa::StatusType::loop_count_exceeded_upper_bound);
CGA_LOG_WARN("Kernel Error::Loop count exceeded upper bound in nw algorithm in batch {}\n", bid_);
output_status.emplace_back(error_type);
break;
default:
break;
}
}

void CudapoaBatch::get_consensus(std::vector<std::string>& consensus,
std::vector<std::vector<uint16_t>>& coverage,
std::vector<StatusType>& output_status)
StatusType CudapoaBatch::get_consensus(std::vector<std::string>& consensus,
std::vector<std::vector<uint16_t>>& coverage,
std::vector<StatusType>& output_status)
{
// Check if consensus was requested at init time.
if (!(OutputType::consensus & output_mask_))
{
return StatusType::output_type_unavailable;
}

std::string msg = " Launching memcpy D2H on device ";
print_batch_debug_message(msg);
CGA_CU_CHECK_ERR(cudaMemcpyAsync(output_details_h_->consensus,
@@ -262,12 +244,21 @@ void CudapoaBatch::get_consensus(std::vector<std::string>& consensus,
&(output_details_h_->coverage[poa * CUDAPOA_MAX_CONSENSUS_SIZE]),
&(output_details_h_->coverage[poa * CUDAPOA_MAX_CONSENSUS_SIZE + get_size(consensus.back())])));
std::reverse(coverage.back().begin(), coverage.back().end());
//std::cout << consensus.back() << std::endl;
}
}

return StatusType::success;
}

void CudapoaBatch::get_msa(std::vector<std::vector<std::string>>& msa, std::vector<StatusType>& output_status)
StatusType CudapoaBatch::get_msa(std::vector<std::vector<std::string>>& msa, std::vector<StatusType>& output_status)
{
// Check if msa was requested at init time.
if (!(OutputType::msa & output_mask_))
{
return StatusType::output_type_unavailable;
}

std::string msg = " Launching memcpy D2H on device for msa ";
print_batch_debug_message(msg);

@@ -309,11 +300,68 @@ void CudapoaBatch::get_msa(std::vector<std::vector<std::string>>& msa, std::vect
}
}
}

return StatusType::success;
}

bool CudapoaBatch::reserve_buf(int32_t max_seq_length)
{
int32_t max_graph_dimension = banded_alignment_ ? CUDAPOA_MAX_MATRIX_GRAPH_DIMENSION_BANDED : CUDAPOA_MAX_MATRIX_GRAPH_DIMENSION;

int32_t scores_width = banded_alignment_ ? CUDAPOA_BANDED_MAX_MATRIX_SEQUENCE_DIMENSION : cudautils::align<int32_t, 4>(max_seq_length + 1 + CELLS_PER_THREAD);
size_t scores_size = scores_width * max_graph_dimension * sizeof(int16_t);

if (scores_size > avail_scorebuf_mem_)
{
return false;
}
else
{
avail_scorebuf_mem_ -= scores_size;
return true;
}
}

void CudapoaBatch::set_cuda_stream(cudaStream_t stream)
StatusType CudapoaBatch::add_poa_group(std::vector<StatusType>& per_seq_status,
const Group& poa_group)
{
stream_ = stream;
// Check if the largest entry in the group fill fit
// in available scoring matrix memory or not.
auto max_length_entry = std::max_element(poa_group.begin(),
poa_group.end(),
[](const Entry& lhs, const Entry& rhs) {
return lhs.length < rhs.length;
});
int32_t max_seq_length = max_length_entry->length;

//std::cout << "Adding new poa group!" << std::endl;

if (!reserve_buf(max_seq_length))
{
return StatusType::exceeded_batch_size;
}

// If matrix fits, see if a new poa group can be added.
per_seq_status.clear();
StatusType status = add_poa();
if (status != StatusType::success)
{
return status;
}

// If a new group can be added, attempt to add all entries
// in the group. If they can't be added, record their status
// and continue adding till the end of the group.
for (auto& entry : poa_group)
{
StatusType entry_status = add_seq_to_poa(entry.seq,
entry.weights,
entry.length);

per_seq_status.push_back(entry_status);
}

return StatusType::success;
}

StatusType CudapoaBatch::add_poa()
@@ -326,19 +374,14 @@ StatusType CudapoaBatch::add_poa()
WindowDetails window_details{};
window_details.seq_len_buffer_offset = global_sequence_idx_;
window_details.seq_starts = num_nucleotides_copied_;
window_details.scores_width = 0;
window_details.scores_offset = next_scores_offset_;
input_details_h_->window_details[poa_count_] = window_details;
poa_count_++;

return StatusType::success;
}

void CudapoaBatch::reset()
{
poa_count_ = 0;
num_nucleotides_copied_ = 0;
global_sequence_idx_ = 0;
}

StatusType CudapoaBatch::add_seq_to_poa(const char* seq, const int8_t* weights, int32_t seq_len)
{
if (seq_len >= CUDAPOA_MAX_SEQUENCE_SIZE)
@@ -347,6 +390,12 @@ StatusType CudapoaBatch::add_seq_to_poa(const char* seq, const int8_t* weights,
}

WindowDetails* window_details = &(input_details_h_->window_details[poa_count_ - 1]);
int32_t scores_width_ = cudautils::align<int32_t, 4>(seq_len + 1 + CELLS_PER_THREAD);
if (scores_width_ > window_details->scores_width)
{
next_scores_offset_ += (scores_width_ - window_details->scores_width);
window_details->scores_width = scores_width_;
}

if (static_cast<int32_t>(window_details->num_seqs) + 1 >= max_sequences_per_poa_)
{
@@ -10,14 +10,13 @@

#pragma once

#include "cudapoa/batch.hpp"
#include <claragenomics/cudapoa/batch.hpp>

#include <memory>
#include <vector>
#include <stdint.h>
#include <string>
#include <iostream>

#include <cuda_runtime_api.h>

namespace claragenomics
@@ -46,14 +45,11 @@ class BatchBlock;
class CudapoaBatch : public Batch
{
public:
CudapoaBatch(int32_t max_poas, int32_t max_sequences_per_poa, int32_t device_id, int8_t output_mask, int16_t gap_score = -8, int16_t mismatch_score = -6, int16_t match_score = 8, bool cuda_banded_alignment = false);
CudapoaBatch(int32_t max_sequences_per_poa, int32_t device_id, cudaStream_t stream, size_t max_mem, int8_t output_mask, int16_t gap_score = -8, int16_t mismatch_score = -6, int16_t match_score = 8, bool cuda_banded_alignment = false);
~CudapoaBatch();

// Add new partial order alignment to batch.
StatusType add_poa();

// Add sequence to last partial order alignment.
StatusType add_seq_to_poa(const char* seq, const int8_t* weights, int32_t seq_len);
virtual StatusType add_poa_group(std::vector<StatusType>& per_seq_status,
const Group& poa_group);

// Get total number of partial order alignments in batch.
int32_t get_total_poas() const;
@@ -62,16 +58,13 @@ class CudapoaBatch : public Batch
void generate_poa();

// Get the consensus for each POA.
void get_consensus(std::vector<std::string>& consensus,
std::vector<std::vector<uint16_t>>& coverage,
std::vector<claragenomics::cudapoa::StatusType>& output_status);
StatusType get_consensus(std::vector<std::string>& consensus,
std::vector<std::vector<uint16_t>>& coverage,
std::vector<claragenomics::cudapoa::StatusType>& output_status);

// Get multiple sequence alignments for each POA
void get_msa(std::vector<std::vector<std::string>>& msa,
std::vector<StatusType>& output_status);

// Set CUDA stream for GPU device.
void set_cuda_stream(cudaStream_t stream);
StatusType get_msa(std::vector<std::vector<std::string>>& msa,
std::vector<StatusType>& output_status);

// Return batch ID.
int32_t batch_id() const;
@@ -99,16 +92,25 @@ class CudapoaBatch : public Batch
void decode_cudapoa_kernel_error(claragenomics::cudapoa::StatusType error_type,
std::vector<StatusType>& output_status);

protected:
// Maximum POAs to process in batch.
int32_t max_poas_ = 0;
// Add new partial order alignment to batch.
StatusType add_poa();

// Add sequence to last partial order alignment.
StatusType add_seq_to_poa(const char* seq, const int8_t* weights, int32_t seq_len);

// Check if seq length can fit in available scoring matrix memory.
bool reserve_buf(int32_t max_seq_length);

protected:
// Maximum sequences per POA.
int32_t max_sequences_per_poa_ = 0;

// GPU Device ID
int32_t device_id_ = 0;

// CUDA stream for launching kernels.
cudaStream_t stream_;

// Bit field for output type
int8_t output_mask_;

@@ -117,9 +119,6 @@ class CudapoaBatch : public Batch
int16_t mismatch_score_;
int16_t match_score_;

// CUDA stream for launching kernels.
cudaStream_t stream_;

// Host and device buffer for output data.
OutputDetails* output_details_h_;
OutputDetails* output_details_d_;
@@ -149,11 +148,20 @@ class CudapoaBatch : public Batch
// Global sequence index.
int32_t global_sequence_idx_ = 0;

// Remaining scores buffer memory available for use.
size_t avail_scorebuf_mem_ = 0;

// Temporary variable to compute the offset to scorebuf.
size_t next_scores_offset_ = 0;

// Use banded POA alignment
bool banded_alignment_;

// Pointer of a seperate class BatchBlock that implements details on calculating and allocating the memory for each batch
std::unique_ptr<BatchBlock> batch_block_;

// Maximum POAs to process in batch.
int32_t max_poas_ = 0;
};

/// \}
@@ -9,6 +9,9 @@
*/

#include "cudapoa_kernels.cuh"

#include <claragenomics/utils/cudautils.hpp>

#include <stdio.h>

namespace claragenomics
@@ -382,6 +385,7 @@ void generateConsensusTestHost(uint8_t* nodes,
node_coverage_counts,
node_alignments,
node_alignment_count);
CGA_CU_CHECK_ERR(cudaPeekAtLastError());
}

} // namespace cudapoa
@@ -1,5 +1,17 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "cudapoa_kernels.cuh"
#include <time.h>

#include <claragenomics/utils/cudautils.hpp>

#include <stdio.h>

namespace claragenomics