diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 952ca2b..bb65602 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: - name: Install ROOT run: | - ROOT_URL="https://root.cern/download/root_v6.34.06.Linux-ubuntu24.04-x86_64-gcc13.3.tar.gz" + ROOT_URL="https://root.cern/download/root_v6.36.00.Linux-ubuntu24.04-x86_64-gcc13.3.tar.gz" wget -O root.tar.gz $ROOT_URL tar -xzf root.tar.gz -C /opt/ echo "/opt/root/bin" >> $GITHUB_PATH diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 84f1946..377ba4f 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -48,7 +48,15 @@ ROOT_EXECUTABLE(region_query_benchmark ramtools_views ) -install(TARGETS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark +ROOT_EXECUTABLE(chromosome_split_benchmark + chromosome_split_benchmark.cxx + LIBRARIES + benchmark::benchmark + ramcore + sam_generator +) + +install(TARGETS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark chromosome_split_benchmark RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) @@ -61,7 +69,10 @@ add_custom_target(benchmark COMMAND ${CMAKE_COMMAND} -E echo "" COMMAND ${CMAKE_COMMAND} -E echo "=== Region Query Benchmark ===" COMMAND region_query_benchmark - DEPENDS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark + COMMAND ${CMAKE_COMMAND} -E echo "" + COMMAND ${CMAKE_COMMAND} -E echo "=== Chromosome Split Benchmark ===" + COMMAND chromosome_split_benchmark + DEPENDS sam_to_ram_benchmark conversion_time_benchmark region_query_benchmark chromosome_split_benchmark WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Running all RAM tools benchmarks" ) diff --git a/benchmark/chromosome_split_benchmark.cxx b/benchmark/chromosome_split_benchmark.cxx new file mode 100644 index 0000000..e5f1622 --- /dev/null +++ b/benchmark/chromosome_split_benchmark.cxx @@ -0,0 +1,191 @@ +#include +#include "ramcore/SamToNTuple.h" +#include "generate_sam_benchmark.h" +#include +#include +#include +#include +#include +#include +#include + +static void CleanupFiles(const std::string &pattern) +{ + for (const auto &entry : std::filesystem::directory_iterator(".")) { + if (entry.path().filename().string().find(pattern) != std::string::npos) { + std::remove(entry.path().c_str()); + } + } +} + +static size_t GetTotalFileSize(const std::string &pattern) +{ + size_t total = 0; + for (const auto &entry : std::filesystem::directory_iterator(".")) { + if (entry.path().filename().string().find(pattern) != std::string::npos) { + total += std::filesystem::file_size(entry.path()); + } + } + return total; +} + +static std::vector GetChromosomes(const std::string &sam_file) +{ + std::vector chroms; + std::ifstream f(sam_file); + std::string line; + + while (std::getline(f, line) && line[0] == '@') { + if (line.find("@SQ\tSN:") == 0) { + size_t start = 7; + size_t end = line.find('\t', start); + chroms.push_back(line.substr(start, end - start)); + } + } + return chroms; +} + +static void BM_SamtoolsSplit(benchmark::State &state) +{ + int num_reads = state.range(0); + std::string sam_file = "bench_st_" + std::to_string(num_reads) + ".sam"; + + GenerateSAMFile(sam_file, num_reads); + auto chromosomes = GetChromosomes(sam_file); + + for (auto _ : state) { + std::string bam_file = "bench_st_tmp.bam"; + std::string sorted_bam = "bench_st_sorted.bam"; + + system(("samtools view -bS " + sam_file + " -o " + bam_file + " 2>/dev/null").c_str()); + + system(("samtools sort " + bam_file + " -o " + sorted_bam + " 2>/dev/null").c_str()); + + system(("samtools index " + sorted_bam + " 2>/dev/null").c_str()); + + for (const auto &chr : chromosomes) { + std::string cmd = "samtools view -b " + sorted_bam + " " + chr + " > bench_st_" + chr + ".bam 2>/dev/null"; + system(cmd.c_str()); + } + + state.counters["size_MB"] = GetTotalFileSize("bench_st_chr") / (1024.0 * 1024.0); + + CleanupFiles("bench_st_chr"); + std::remove(bam_file.c_str()); + std::remove(sorted_bam.c_str()); + std::remove((sorted_bam + ".bai").c_str()); + } + + std::remove(sam_file.c_str()); + state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsRate); +} + +static void BM_SamtoolsSplitThreaded(benchmark::State &state) +{ + int num_reads = state.range(0); + int num_threads = state.range(1); + std::string sam_file = "bench_st_mt_" + std::to_string(num_reads) + ".sam"; + + GenerateSAMFile(sam_file, num_reads); + auto chromosomes = GetChromosomes(sam_file); + + for (auto _ : state) { + std::string bam_file = "bench_st_mt_tmp.bam"; + std::string sorted_bam = "bench_st_mt_sorted.bam"; + + std::string cmd = + "samtools view -@ " + std::to_string(num_threads) + " -bS " + sam_file + " -o " + bam_file + " 2>/dev/null"; + system(cmd.c_str()); + + cmd = "samtools sort -@ " + std::to_string(num_threads) + " -m 1G " + bam_file + " -o " + sorted_bam + + " 2>/dev/null"; + system(cmd.c_str()); + + cmd = "samtools index -@ " + std::to_string(num_threads) + " " + sorted_bam + " 2>/dev/null"; + system(cmd.c_str()); + + std::vector threads; + for (const auto &chr : chromosomes) { + threads.emplace_back([&sorted_bam, &chr]() { + std::string cmd = + "samtools view -@ 2 -b " + sorted_bam + " " + chr + " > bench_st_mt_" + chr + ".bam 2>/dev/null"; + system(cmd.c_str()); + }); + + if (threads.size() >= num_threads) { + for (auto &t : threads) { + t.join(); + } + threads.clear(); + } + } + + for (auto &t : threads) { + t.join(); + } + + state.counters["size_MB"] = GetTotalFileSize("bench_st_mt_chr") / (1024.0 * 1024.0); + state.counters["threads"] = num_threads; + + CleanupFiles("bench_st_mt_chr"); + std::remove(bam_file.c_str()); + std::remove(sorted_bam.c_str()); + std::remove((sorted_bam + ".bai").c_str()); + } + + std::remove(sam_file.c_str()); + state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsRate); +} + +static void BM_ChromosomeSplitThreads(benchmark::State &state) +{ + int num_reads = state.range(0); + int num_threads = state.range(1); + std::string sam_file = "bench_split_par_" + std::to_string(num_reads) + ".sam"; + + GenerateSAMFile(sam_file, num_reads); + + FILE *old_stdout = stdout; + FILE *old_stderr = stderr; + + for (auto _ : state) { + stdout = fopen("/dev/null", "w"); + stderr = fopen("/dev/null", "w"); + + samtoramntuple_split_by_chromosome(sam_file.c_str(), "bench_split_par_out", 505, 1, num_threads); + + fclose(stdout); + fclose(stderr); + stdout = old_stdout; + stderr = old_stderr; + + state.counters["size_MB"] = GetTotalFileSize("bench_split_par_out_") / (1024.0 * 1024.0); + state.counters["threads"] = num_threads; + CleanupFiles("bench_split_par_out_"); + } + + std::remove(sam_file.c_str()); + state.counters["reads/s"] = benchmark::Counter(num_reads, benchmark::Counter::kIsRate); +} + +BENCHMARK(BM_SamtoolsSplit)->Arg(100000)->Arg(500000)->Arg(1000000)->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_SamtoolsSplitThreaded) + ->Args({100000, 2}) + ->Args({100000, 4}) + ->Args({500000, 2}) + ->Args({500000, 4}) + ->Args({1000000, 2}) + ->Args({1000000, 4}) + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChromosomeSplitThreads) + ->Args({100000, 2}) + ->Args({100000, 4}) + ->Args({500000, 2}) + ->Args({500000, 4}) + ->Args({1000000, 2}) + ->Args({1000000, 4}) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/inc/ramcore/SamToNTuple.h b/inc/ramcore/SamToNTuple.h index 429c2fa..1b986c0 100644 --- a/inc/ramcore/SamToNTuple.h +++ b/inc/ramcore/SamToNTuple.h @@ -1,7 +1,8 @@ #ifndef RAMCORE_SAMTONTUPLE_H #define RAMCORE_SAMTONTUPLE_H -#include +#include +#include void samtoramntuple(const char *datafile, const char *treefile, @@ -9,5 +10,7 @@ void samtoramntuple(const char *datafile, int compression_algorithm, uint32_t quality_policy); -#endif // RAMCORE_SAMTONTUPLE_H +void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm, + uint32_t quality_policy, int num_threads = 4); +#endif diff --git a/inc/rntuple/RAMNTupleRecord.h b/inc/rntuple/RAMNTupleRecord.h index eeb66a4..55142c1 100644 --- a/inc/rntuple/RAMNTupleRecord.h +++ b/inc/rntuple/RAMNTupleRecord.h @@ -18,11 +18,11 @@ #include #include -namespace ROOT::Experimental { +namespace ROOT { class RNTupleModel; class RNTupleWriter; class RNTupleReader; -} // namespace ROOT::Experimental +} // namespace ROOT class RAMNTupleRefs; class RAMNTupleIndex; @@ -222,7 +222,7 @@ class RAMNTupleRecord { static RAMNTupleIndex *GetIndex() { return fgIndex.get(); } // File I/O - static std::unique_ptr + static std::unique_ptr OpenRAMFile(const std::string &filename, const std::string &ntupleName = "RAM"); static void WriteAllRefs(TFile &file); static void ReadAllRefs(const std::string &filename = ""); @@ -230,7 +230,7 @@ class RAMNTupleRecord { static void ReadIndex(const std::string &filename = ""); // RNTuple model creation - static std::unique_ptr MakeModel(); + static std::unique_ptr MakeModel(); void SetCompressionMode(uint32_t flags) { compression_flags = flags; } diff --git a/src/ramcore/SamToNTuple.cxx b/src/ramcore/SamToNTuple.cxx index 067af1d..5eaf9c6 100644 --- a/src/ramcore/SamToNTuple.cxx +++ b/src/ramcore/SamToNTuple.cxx @@ -5,10 +5,23 @@ #include #include #include +#include +#include #include #include #include #include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include void samtoramntuple(const char *datafile, const char *treefile, @@ -28,14 +41,13 @@ void samtoramntuple(const char *datafile, RAMNTupleRecord::InitializeRefs(); auto model = RAMNTupleRecord::MakeModel(); - - ROOT::Experimental::RNTupleWriteOptions writeOptions; + + ROOT::RNTupleWriteOptions writeOptions; writeOptions.SetCompression(compression_algorithm); writeOptions.SetMaxUnzippedPageSize(64000); - - auto writer = ROOT::Experimental::RNTupleWriter::Append( - std::move(model), "RAM", *rootFile, writeOptions); - auto defaultEntry = writer->CreateEntry(); + + auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions); + auto defaultEntry = writer->GetModel().CreateEntry(); auto recordPtr = defaultEntry->GetPtr("record"); TList headers; @@ -57,37 +69,34 @@ void samtoramntuple(const char *datafile, } } }; - - auto record_callback = [&](const ramcore::SamRecord& sam_record, size_t record_num) { - RAMNTupleRecord r; - r.SetBit(quality_policy); - - r.SetQNAME(sam_record.qname.c_str()); - r.SetFLAG(sam_record.flag); - r.SetREFID(sam_record.rname.c_str()); - r.SetPOS(sam_record.pos); - r.SetMAPQ(sam_record.mapq); - r.SetCIGAR(sam_record.cigar.c_str()); - r.SetREFNEXT(sam_record.rnext.c_str()); - r.SetPNEXT(sam_record.pnext); - r.SetTLEN(sam_record.tlen); - r.SetSEQ(sam_record.seq.c_str()); - r.SetQUAL(sam_record.qual.c_str()); - - r.ResetNOPT(); - for (const auto& opt : sam_record.optional_fields) { - r.SetOPT(opt.c_str()); - } - - *recordPtr = std::move(r); - writer->Fill(*defaultEntry); - - if (index && record_num % 1000 == 0) { - RAMNTupleRecord::GetIndex()->AddItem(recordPtr->GetREFID(), - recordPtr->GetPOS() - 1, record_num); - } + + auto record_callback = [&](const ramcore::SamRecord &sam_record, size_t record_num) { + recordPtr->SetBit(quality_policy); + + recordPtr->SetQNAME(sam_record.qname.c_str()); + recordPtr->SetFLAG(sam_record.flag); + recordPtr->SetREFID(sam_record.rname.c_str()); + recordPtr->SetPOS(sam_record.pos); + recordPtr->SetMAPQ(sam_record.mapq); + recordPtr->SetCIGAR(sam_record.cigar.c_str()); + recordPtr->SetREFNEXT(sam_record.rnext.c_str()); + recordPtr->SetPNEXT(sam_record.pnext); + recordPtr->SetTLEN(sam_record.tlen); + recordPtr->SetSEQ(sam_record.seq.c_str()); + recordPtr->SetQUAL(sam_record.qual.c_str()); + + recordPtr->ResetNOPT(); + for (const auto &opt : sam_record.optional_fields) { + recordPtr->SetOPT(opt.c_str()); + } + + writer->Fill(*defaultEntry); + + if (index && record_num % 1000 == 0) { + RAMNTupleRecord::GetIndex()->AddItem(recordPtr->GetREFID(), recordPtr->GetPOS() - 1, record_num); + } }; - + if (!parser.ParseFile(datafile, header_callback, record_callback)) { printf("Failed to parse SAM file %s\n", datafile); return; @@ -120,3 +129,144 @@ void samtoramntuple(const char *datafile, stopwatch.Print(); } +void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm, + uint32_t quality_policy, int num_threads) +{ + ROOT::EnableThreadSafety(); + RAMNTupleRecord::InitializeRefs(); + + std::map> chromosome_records; + std::vector> headers; + + ramcore::SamParser parser; + + auto header_callback = [&](const std::string &tag, const std::string &content) { + headers.push_back({tag, content}); + + if (tag == "@SQ") { + size_t sn_pos = content.find("SN:"); + if (sn_pos != std::string::npos) { + sn_pos += 3; + size_t tab_pos = content.find('\t', sn_pos); + std::string ref_name = + content.substr(sn_pos, tab_pos != std::string::npos ? tab_pos - sn_pos : std::string::npos); + RAMNTupleRecord::GetRnameRefs()->GetRefId(ref_name.c_str()); + } + } + }; + + auto record_callback = [&](const ramcore::SamRecord &sam_record, size_t record_num) { + if (sam_record.rname != "*") { + chromosome_records[sam_record.rname].push_back(sam_record); + } + }; + + parser.ParseFile(datafile, header_callback, record_callback); + + std::vector chr_names; + for (const auto &[chr, records] : chromosome_records) { + chr_names.push_back(chr); + } + + std::vector sort_threads; + for (const auto &chr : chr_names) { + sort_threads.emplace_back([&chromosome_records, chr]() { + auto &records = chromosome_records[chr]; + std::sort(records.begin(), records.end(), + [](const ramcore::SamRecord &a, const ramcore::SamRecord &b) { return a.pos < b.pos; }); + }); + } + for (auto &t : sort_threads) { + t.join(); + } + + auto write_chromosome_parallel = [&](const std::string &chr) { + const auto &records = chromosome_records[chr]; + + std::string filename = std::string(output_prefix) + "_" + chr + ".root"; + + auto model = RAMNTupleRecord::MakeModel(); + ROOT::RNTupleWriteOptions writeOptions; + + writeOptions.SetCompression(ROOT::RCompressionSetting::EAlgorithm::kZSTD, 1); + writeOptions.SetApproxZippedClusterSize(200 * 1024 * 1024); + writeOptions.SetMaxUnzippedClusterSize(1024 * 1024 * 1024); + writeOptions.SetMaxUnzippedPageSize(1024 * 1024); + writeOptions.SetUseBufferedWrite(true); + + auto parallel_writer = + ROOT::Experimental::RNTupleParallelWriter::Recreate(std::move(model), "RAM", filename, writeOptions); + + const int contexts_per_file = std::min(4, num_threads); + const size_t records_per_context = (records.size() + contexts_per_file - 1) / contexts_per_file; + + std::vector write_threads; + + for (int ctx = 0; ctx < contexts_per_file && ctx * records_per_context < records.size(); ++ctx) { + write_threads.emplace_back([&, ctx]() { + auto fill_context = parallel_writer->CreateFillContext(); + auto entry = fill_context->GetModel().CreateEntry(); + auto recordPtr = entry->GetPtr("record").get(); + + size_t start = ctx * records_per_context; + size_t end = std::min(start + records_per_context, records.size()); + + for (size_t i = start; i < end; ++i) { + const auto &sam_record = records[i]; + + recordPtr->SetBit(quality_policy); + recordPtr->SetQNAME(sam_record.qname.c_str()); + recordPtr->SetFLAG(sam_record.flag); + recordPtr->SetREFID(sam_record.rname.c_str()); + recordPtr->SetPOS(sam_record.pos); + recordPtr->SetMAPQ(sam_record.mapq); + recordPtr->SetCIGAR(sam_record.cigar.c_str()); + recordPtr->SetREFNEXT(sam_record.rnext.c_str()); + recordPtr->SetPNEXT(sam_record.pnext); + recordPtr->SetTLEN(sam_record.tlen); + recordPtr->SetSEQ(sam_record.seq.c_str()); + recordPtr->SetQUAL(sam_record.qual.c_str()); + + recordPtr->ResetNOPT(); + for (const auto &opt : sam_record.optional_fields) { + recordPtr->SetOPT(opt.c_str()); + } + + fill_context->Fill(*entry); + } + }); + } + + for (auto &t : write_threads) { + t.join(); + } + + parallel_writer.reset(); + + std::unique_ptr file(TFile::Open(filename.c_str(), "UPDATE")); + + TList h; + h.SetName("headers"); + for (const auto &[tag, content] : headers) { + h.Add(new TNamed(tag.c_str(), content.c_str())); + } + + RAMNTupleRecord::WriteAllRefs(*file); + h.Write(); + + file->Close(); + }; + + size_t chr_idx = 0; + while (chr_idx < chr_names.size()) { + std::vector threads; + + for (int i = 0; i < num_threads && chr_idx < chr_names.size(); ++i, ++chr_idx) { + threads.emplace_back(write_chromosome_parallel, chr_names[chr_idx]); + } + + for (auto &t : threads) { + t.join(); + } + } +} diff --git a/src/rntuple/RAMNTupleRecord.cxx b/src/rntuple/RAMNTupleRecord.cxx index efba067..fe60c5b 100644 --- a/src/rntuple/RAMNTupleRecord.cxx +++ b/src/rntuple/RAMNTupleRecord.cxx @@ -13,7 +13,7 @@ #include #include -using namespace ROOT::Experimental; +using namespace ROOT; std::unique_ptr RAMNTupleRecord::fgRnameRefs = nullptr; std::unique_ptr RAMNTupleRecord::fgRnextRefs = nullptr; @@ -203,7 +203,7 @@ void RAMNTupleRecord::WriteAllRefs(TFile &file) writeOptions.SetCompression(505); auto metaWriter = RNTupleWriter::Append(std::move(metaModel), "METADATA", file, writeOptions); - auto metaEntry = metaWriter->CreateEntry(); + auto metaEntry = metaWriter->GetModel().CreateEntry(); auto rnamePtr = metaEntry->GetPtr>("rname_refs"); auto rnextPtr = metaEntry->GetPtr>("rnext_refs"); @@ -260,7 +260,7 @@ void RAMNTupleRecord::WriteIndex(TFile &file) writeOptions.SetCompression(505); auto indexWriter = RNTupleWriter::Append(std::move(indexModel), "INDEX", file, writeOptions); - auto indexEntry = indexWriter->CreateEntry(); + auto indexEntry = indexWriter->GetModel().CreateEntry(); auto indexPtr = indexEntry->GetPtr>("index_entries"); *indexPtr = fgIndex->GetEntries(); @@ -534,7 +534,7 @@ void RAMNTupleConverter::ConvertSAMToRAMNTuple(const std::string &sam_file, cons writeOptions.SetCompression(505); // ZSTD level 5 auto writer = RNTupleWriter::Append(std::move(model), "RAM", *file, writeOptions); - auto defaultEntry = writer->CreateEntry(); + auto defaultEntry = writer->GetModel().CreateEntry(); auto recordPtr = defaultEntry->GetPtr("record"); std::ifstream sam(sam_file); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1cddd96..0920747 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,8 +24,9 @@ function(add_ramcore_test test_name) endfunction() add_ramcore_test(ramcoretests ramcoretests.cxx) +add_ramcore_test(chromosome_split_test chromosome_split_test.cxx) -install(TARGETS ramcoretests +install(TARGETS ramcoretests chromosome_split_test RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) diff --git a/test/chromosome_split_test.cxx b/test/chromosome_split_test.cxx new file mode 100644 index 0000000..91274cb --- /dev/null +++ b/test/chromosome_split_test.cxx @@ -0,0 +1,100 @@ +#include +#include "ramcore/SamToNTuple.h" +#include "rntuple/RAMNTupleRecord.h" +#include "generate_sam_benchmark.h" +#include +#include +#include + +class ChromosomeSplitTest : public ::testing::Test { +protected: + void SetUp() override + { + if (!std::filesystem::exists("test.sam")) { + GenerateSAMFile("test.sam", 100); + } + CleanupTestFiles(); + } + + void TearDown() override { CleanupTestFiles(); } + + void CleanupTestFiles() + { + std::remove("test_regular.root"); + for (const auto &entry : std::filesystem::directory_iterator(".")) { + std::string filename = entry.path().filename().string(); + if (filename.find("test_split_") == 0 && filename.find(".root") != std::string::npos) { + std::remove(filename.c_str()); + } + } + } +}; + +TEST_F(ChromosomeSplitTest, NoDataLoss) +{ + samtoramntuple("test.sam", "test_regular.root", false, true, true, 505, 1); + auto regularReader = ROOT::RNTupleReader::Open("RAM", "test_regular.root"); + Long64_t totalEntries = regularReader->GetNEntries(); + + samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1, 4); + + Long64_t splitEntriesSum = 0; + for (const auto &entry : std::filesystem::directory_iterator(".")) { + std::string filename = entry.path().filename().string(); + if (filename.find("test_split_") == 0 && filename.find(".root") != std::string::npos) { + auto reader = ROOT::RNTupleReader::Open("RAM", filename); + if (reader) { + splitEntriesSum += reader->GetNEntries(); + } + } + } + + EXPECT_EQ(totalEntries, splitEntriesSum); +} + +TEST_F(ChromosomeSplitTest, CorrectChromosomeAssignment) +{ + samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1, 4); + + for (const auto &entry : std::filesystem::directory_iterator(".")) { + std::string filename = entry.path().filename().string(); + if (filename.find("test_split_") == 0 && filename.find(".root") != std::string::npos) { + size_t pos = filename.find("test_split_"); + size_t end = filename.find(".root"); + std::string expectedChr = filename.substr(pos + 11, end - pos - 11); + + auto reader = ROOT::RNTupleReader::Open("RAM", filename); + ASSERT_NE(reader, nullptr); + + auto viewRecord = reader->GetView("record"); + + for (auto i : reader->GetEntryRange()) { + const auto &record = viewRecord(i); + std::string actualChr = record.GetRNAME(); + EXPECT_EQ(expectedChr, actualChr) << "Wrong chromosome in " << filename << " at entry " << i; + } + } + } +} + +TEST_F(ChromosomeSplitTest, MetadataPresent) +{ + samtoramntuple_split_by_chromosome("test.sam", "test_split", 505, 1, 4); + + int filesChecked = 0; + for (const auto &entry : std::filesystem::directory_iterator(".")) { + std::string filename = entry.path().filename().string(); + if (filename.find("test_split_") == 0 && filename.find(".root") != std::string::npos) { + auto metaReader = ROOT::RNTupleReader::Open("METADATA", filename); + EXPECT_NE(metaReader, nullptr) << "Missing METADATA in " << filename; + + if (metaReader) { + EXPECT_GT(metaReader->GetNEntries(), 0); + } + + filesChecked++; + } + } + + EXPECT_GT(filesChecked, 0); +} diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index 2974400..458ba50 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -43,7 +43,7 @@ TEST_F(ramcoreTest, ConversionProducesEqualEntries) { ASSERT_NE(ttree, nullptr) << "Failed to get TTree"; Long64_t ttreeEntries = ttree->GetEntries(); - auto reader = ROOT::Experimental::RNTupleReader::Open("RAM", rntupleFile); + auto reader = ROOT::RNTupleReader::Open("RAM", rntupleFile); ASSERT_NE(reader, nullptr) << "Failed to open RNTuple"; Long64_t rntupleEntries = reader->GetNEntries(); diff --git a/tools/samtoramntuple.cxx b/tools/samtoramntuple.cxx index 242f7cf..55d0be1 100644 --- a/tools/samtoramntuple.cxx +++ b/tools/samtoramntuple.cxx @@ -6,48 +6,57 @@ int main(int argc, char* argv[]) { if (argc < 2) { - std::cout << "Usage: " << argv[0] << " [output.ram]\n"; - std::cout << "Options:\n"; - std::cout << " -noindex Disable indexing\n"; - std::cout << " -illumina Use Illumina quality binning\n"; - std::cout << " -dropqual Drop quality scores\n"; - return 1; + std::cout << "Usage: " << argv[0] << " [output]\n"; + std::cout << "Options:\n"; + std::cout << " -split Split by chromosome\n"; + std::cout << " -noindex Disable indexing\n"; + std::cout << " -illumina Use Illumina quality binning\n"; + std::cout << " -dropqual Drop quality scores\n"; + return 1; } const char* input = argv[1]; const char* output = nullptr; - - std::string outfile; - if (argc > 2 && argv[2][0] != '-') { - output = argv[2]; - } else { - outfile = std::string(input); - size_t pos = outfile.rfind(".sam"); - if (pos != std::string::npos) { - outfile.replace(pos, 4, ".ram"); - } else { - outfile += ".ram"; - } - output = outfile.c_str(); - } - - + + bool do_split = false; bool do_index = true; uint32_t quality_mode = RAMNTupleRecord::kPhred33; for (int i = 2; i < argc; i++) { std::string arg = argv[i]; - if (arg == "-noindex") { - do_index = false; + if (arg == "-split") { + do_split = true; + } else if (arg == "-noindex") { + do_index = false; } else if (arg == "-illumina") { - quality_mode = RAMNTupleRecord::kIlluminaBinning; + quality_mode = RAMNTupleRecord::kIlluminaBinning; } else if (arg == "-dropqual") { - quality_mode = RAMNTupleRecord::kDrop; + quality_mode = RAMNTupleRecord::kDrop; + } else if (arg[0] != '-') { + output = argv[i]; } } - + + std::string outfile; + if (!output) { + outfile = std::string(input); + size_t pos = outfile.rfind(".sam"); + if (pos != std::string::npos) { + outfile.erase(pos); + } + output = outfile.c_str(); + } + try { - samtoramntuple(input, output, do_index, true, true, 505, quality_mode); + if (do_split) { + samtoramntuple_split_by_chromosome(input, output, 505, quality_mode, 4); + } else { + std::string ramfile = std::string(output); + if (ramfile.find(".root") == std::string::npos && ramfile.find(".ram") == std::string::npos) { + ramfile += ".ram"; + } + samtoramntuple(input, ramfile.c_str(), do_index, true, true, 505, quality_mode); + } } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; return 1;