Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify backup coordination for file infos #48095

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
146 changes: 146 additions & 0 deletions src/Backups/BackupCoordinationFileInfos.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#include <Backups/BackupCoordinationFileInfos.h>
#include <Common/quoteString.h>


namespace DB
{

namespace ErrorCodes
{
extern const int BACKUP_ENTRY_ALREADY_EXISTS;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}

using SizeAndChecksum = std::pair<UInt64, UInt128>;


/// Stores the list of file infos collected by the host `host_id_`.
/// Throws LOGICAL_ERROR if the preparation has already been done.
void BackupCoordinationFileInfos::addFileInfos(BackupFileInfos && file_infos_, const String & host_id_)
{
    if (prepared)
    {
        throw Exception(ErrorCodes::LOGICAL_ERROR, "addFileInfos() must not be called after preparing");
    }

    /// emplace() doesn't replace an existing element, so if some infos were already added for this host they stay as they are.
    file_infos.emplace(host_id_, std::move(file_infos_));
}

/// Returns a copy of the prepared file infos which were added for the host `host_id_`,
/// or an empty list if nothing was added for that host.
BackupFileInfos BackupCoordinationFileInfos::getFileInfos(const String & host_id_) const
{
    prepare();

    if (auto found = file_infos.find(host_id_); found != file_infos.end())
        return found->second;

    return {};
}

/// Returns copies of the prepared file infos of all hosts, in the order of the prepared list
/// (prepare() sorts that list by file name).
BackupFileInfos BackupCoordinationFileInfos::getFileInfosForAllHosts() const
{
    prepare();

    const size_t num_infos = file_infos_for_all_hosts.size();

    BackupFileInfos all_infos;
    all_infos.reserve(num_infos);

    for (size_t pos = 0; pos != num_infos; ++pos)
    {
        const BackupFileInfo & info = *file_infos_for_all_hosts[pos];
        all_infos.emplace_back(info);
    }

    return all_infos;
}

/// Returns a copy of the file info located at the position `data_file_index` in the prepared list of all file infos.
/// Throws BAD_ARGUMENTS if the index is out of range.
BackupFileInfo BackupCoordinationFileInfos::getFileInfoByDataFileIndex(size_t data_file_index) const
{
    prepare();

    const size_t num_infos = file_infos_for_all_hosts.size();
    if (data_file_index < num_infos)
        return *file_infos_for_all_hosts[data_file_index];

    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid data file index: {}", data_file_index);
}

/// Builds everything the getters need from the per-host lists of file infos; does nothing if it has been done already.
/// It collects pointers to all file infos into `file_infos_for_all_hosts`, sorts them by file name,
/// assigns `data_file_name` and `data_file_index` to each file info, and calculates `num_files` and `total_size_of_files`.
/// Throws BACKUP_ENTRY_ALREADY_EXISTS if two file infos have the same file name.
void BackupCoordinationFileInfos::prepare() const
{
    if (prepared)
        return;

    /// A previous attempt could have thrown after the list had been filled (`prepared` stays false in that case).
    /// Start from scratch, otherwise the same pointers would be appended again on each retry.
    file_infos_for_all_hosts.clear();

    /// Make a list of all file infos from all hosts.
    size_t total_num_infos = 0;
    for (const auto & [_, infos] : file_infos)
        total_num_infos += infos.size();

    file_infos_for_all_hosts.reserve(total_num_infos);
    for (auto & [_, infos] : file_infos)
        for (auto & info : infos)
            file_infos_for_all_hosts.emplace_back(&info);

    /// Sort the list of all file infos by file name (file names must be unique).
    std::sort(file_infos_for_all_hosts.begin(), file_infos_for_all_hosts.end(), BackupFileInfo::LessByFileName{});

    /// After sorting, file infos with equal names are adjacent.
    auto adjacent_it = std::adjacent_find(file_infos_for_all_hosts.begin(), file_infos_for_all_hosts.end(), BackupFileInfo::EqualByFileName{});
    if (adjacent_it != file_infos_for_all_hosts.end())
    {
        throw Exception(
            ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Entry {} added multiple times to backup", quoteString((*adjacent_it)->file_name));
    }

    num_files = 0;
    total_size_of_files = 0;

    if (plain_backup)
    {
        /// For plain backup all file infos are stored as is, without checking for duplicates or skipping empty files.
        for (size_t i = 0; i != file_infos_for_all_hosts.size(); ++i)
        {
            auto & info = *(file_infos_for_all_hosts[i]);
            info.data_file_name = info.file_name;
            info.data_file_index = i;
            info.base_size = 0; /// Base backup must not be used while creating a plain backup.
            info.base_checksum = 0;
            total_size_of_files += info.size;
        }
        num_files = file_infos_for_all_hosts.size();
    }
    else
    {
        /// For non-plain backups files with the same size and checksum are stored only once,
        /// in order to find those files we'll use this map.
        /// The mapped value is the position (in the sorted list) of the first file with that size and checksum.
        std::map<SizeAndChecksum, size_t> data_file_index_by_checksum;

        for (size_t i = 0; i != file_infos_for_all_hosts.size(); ++i)
        {
            auto & info = *(file_infos_for_all_hosts[i]);
            if (info.size == info.base_size)
            {
                /// A file is either empty or can be taken from the base backup as a whole,
                /// so no data file is assigned to it.
                info.data_file_name.clear();
                info.data_file_index = static_cast<size_t>(-1);
            }
            else
            {
                SizeAndChecksum size_and_checksum{info.size, info.checksum};
                auto [it, inserted] = data_file_index_by_checksum.emplace(size_and_checksum, i);
                if (inserted)
                {
                    /// Found a new file.
                    info.data_file_name = info.file_name;
                    info.data_file_index = i;
                    ++num_files;
                    /// Only the part exceeding `base_size` is counted.
                    total_size_of_files += info.size - info.base_size;
                }
                else
                {
                    /// Found a file with the same size and checksum as some file before, reuse old `data_file_index` and `data_file_name`.
                    info.data_file_index = it->second;
                    info.data_file_name = file_infos_for_all_hosts[it->second]->data_file_name;
                }
            }
        }
    }

    prepared = true;
}

/// Returns the number of files calculated by prepare() (the preparation is done here if it hasn't been done yet).
size_t BackupCoordinationFileInfos::getNumFiles() const
{
    /// `num_files` is valid only after the preparation.
    prepare();

    return num_files;
}

/// Returns the total size of files calculated by prepare() (the preparation is done here if it hasn't been done yet).
size_t BackupCoordinationFileInfos::getTotalSizeOfFiles() const
{
    /// `total_size_of_files` is valid only after the preparation.
    prepare();

    return total_size_of_files;
}

}
56 changes: 56 additions & 0 deletions src/Backups/BackupCoordinationFileInfos.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once

#include <map>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <Backups/BackupFileInfo.h>


namespace DB
{

/// Hosts use this class to coordinate lists of files they are going to write to a backup,
/// because different hosts shouldn't write the same file twice, or even files with different names but with the same checksum.
/// Also the initiator of the BACKUP query uses this class to get a whole list of files written by all hosts to write that list
/// as a part of the contents of the .backup file (the backup metadata file).
///
/// The class works in two phases: first addFileInfos() is called for each host, then the first call of any getter
/// prepares the collected file infos. After that addFileInfos() must not be called anymore.
/// NOTE(review): no synchronization is visible in this class; presumably callers serialize access to it - confirm.
class BackupCoordinationFileInfos
{
public:
    /// plain_backup sets that we're writing a plain backup, which means all duplicates are written as is, and empty files are written as is.
    /// (For normal backups only the first file amongst duplicates is actually stored, and empty files are not stored).
    BackupCoordinationFileInfos(bool plain_backup_) : plain_backup(plain_backup_) {}

    /// Adds file infos for the specified host.
    /// Throws LOGICAL_ERROR if called after the preparation.
    void addFileInfos(BackupFileInfos && file_infos, const String & host_id);

    /// Returns file infos for the specified host after preparation.
    /// Returns an empty list if no file infos were added for that host.
    BackupFileInfos getFileInfos(const String & host_id) const;

    /// Returns file infos for all hosts after preparation, sorted by file name.
    BackupFileInfos getFileInfosForAllHosts() const;

    /// Returns a file info by data file index (see BackupFileInfo::data_file_index).
    /// Throws BAD_ARGUMENTS if the index is out of range.
    BackupFileInfo getFileInfoByDataFileIndex(size_t data_file_index) const;

    /// Returns the number of files after deduplication and excluding empty files.
    /// For a plain backup every file is counted.
    size_t getNumFiles() const;

    /// Returns the total size of files after deduplication and excluding empty files.
    /// For a plain backup it's the sum of sizes of all files.
    size_t getTotalSizeOfFiles() const;

private:
    /// Prepares the collected file infos on the first call, does nothing on subsequent calls.
    /// It's const (and the members below are mutable) because it's called lazily from the const getters.
    void prepare() const;

    /// before preparation
    const bool plain_backup;
    /// File infos by host id. The preparation modifies the stored file infos in place.
    mutable std::unordered_map<String, BackupFileInfos> file_infos;

    /// after preparation
    mutable bool prepared = false;
    /// Pointers to the elements stored in `file_infos`, sorted by file name.
    mutable std::vector<BackupFileInfo *> file_infos_for_all_hosts;
    /// Initialized explicitly so these are never indeterminate, even before the preparation.
    mutable size_t num_files = 0;
    mutable size_t total_size_of_files = 0;
};

}