Merge pull request #380 from cwida/benchmarkdatareuse
Efficient CSV Reader and Benchmark Data Re-Use
Mytherin committed Dec 28, 2019
2 parents ba82470 + 0c4d0a9 commit 5df60e7
Showing 12 changed files with 1,088 additions and 234 deletions.
81 changes: 76 additions & 5 deletions benchmark/benchmark_runner.cpp
@@ -2,11 +2,14 @@

#include "duckdb/common/profiler.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb.hpp"

#define CATCH_CONFIG_RUNNER
#include "catch.hpp"
#include "re2/re2.h"

#include <fstream>
#include <sstream>
#include <thread>

using namespace duckdb;
@@ -22,6 +25,78 @@ Benchmark::Benchmark(bool register_benchmark, string name, string group) : name(
}
}

void BenchmarkRunner::SaveDatabase(DuckDB &db, string name) {
auto &fs = *db.file_system;
// check if the database directory exists; if not create it
if (!fs.DirectoryExists(DUCKDB_BENCHMARK_DIRECTORY)) {
fs.CreateDirectory(DUCKDB_BENCHMARK_DIRECTORY);
}
// first export the schema
// create two files, "[name].sql" and "[name].list"
// [name].sql contains the SQL used to re-create the tables
// [name].list contains a list of the exported tables
ofstream sql_file(fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".sql"));
ofstream list_file(fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".list"));

vector<string> table_list;
Connection con(db);
auto result = con.Query("SELECT name, sql FROM sqlite_master()");
for (auto &row : *result) {
auto table_name = row.GetValue<string>(0);
auto table_sql = row.GetValue<string>(1);
table_list.push_back(table_name);

list_file << table_name << std::endl;
sql_file << table_sql << std::endl;
}
sql_file.close();
list_file.close();

// now for each table, write it to a separate file "[name]_[tablename].csv"
for (auto &table : table_list) {
auto target_path = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + "_" + table + ".csv");
result = con.Query("COPY " + table + " TO '" + target_path + "'");
if (!result->success) {
throw Exception("Failed to save database: " + result->error);
}
}
}

bool BenchmarkRunner::TryLoadDatabase(DuckDB &db, string name) {
auto &fs = *db.file_system;
if (!fs.DirectoryExists(DUCKDB_BENCHMARK_DIRECTORY)) {
return false;
}
auto sql_fname = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".sql");
auto list_fname = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".list");
// check if the [name].list and [name].sql files exist
if (!fs.FileExists(list_fname) || !fs.FileExists(sql_fname)) {
return false;
}
Connection con(db);
// the files exist, load the data into the database
// first load the entire SQL and execute it
ifstream sql_file(sql_fname);
std::stringstream buffer;
buffer << sql_file.rdbuf();
auto result = con.Query(buffer.str());
if (!result->success) {
throw Exception("Failed to load database: " + result->error);
}
// now read the tables line by line
ifstream list_file(list_fname);
string table_name;
while (getline(list_file, table_name)) {
// for each table, copy the files
auto target_path = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + "_" + table_name + ".csv");
result = con.Query("COPY " + table_name + " FROM '" + target_path + "'");
if (!result->success) {
throw Exception("Failed to load database: " + result->error);
}
}
return true;
}

volatile bool is_active = false;
volatile bool timeout = false;

@@ -136,11 +211,7 @@ void print_help() {
"e.g., DS.* for TPC-DS benchmarks\n");
}

enum class BenchmarkMetaType {
NONE,
INFO,
GROUP
};
enum class BenchmarkMetaType { NONE, INFO, GROUP };

struct BenchmarkConfiguration {
std::string name_pattern{};
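Taken together, SaveDatabase and TryLoadDatabase let a benchmark generate its data once, export it into duckdb_benchmark_data/ as a [name].sql schema file, a [name].list table list, and one [name]_[table].csv per table, and then reload that export on subsequent runs instead of regenerating it. A minimal sketch of the intended caller pattern (it mirrors the TPC-H and TPC-DS Load macros changed further down in this commit; tpch::dbgen stands in for whatever generator a benchmark uses):

void Load(DuckDBBenchmarkState *state) {
	// try to reuse a previously exported copy of the data set
	if (!BenchmarkRunner::TryLoadDatabase(state->db, "tpch")) {
		// no cached export yet: generate the data and save it for the next run
		tpch::dbgen(SF, state->db);
		BenchmarkRunner::SaveDatabase(state->db, "tpch");
	}
}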
8 changes: 8 additions & 0 deletions benchmark/include/benchmark_runner.hpp
@@ -15,18 +15,26 @@
#include "duckdb/common/fstream.hpp"

namespace duckdb {
class DuckDB;

//! The benchmark runner class is responsible for running benchmarks
class BenchmarkRunner {
BenchmarkRunner() {
}

public:
static constexpr const char *DUCKDB_BENCHMARK_DIRECTORY = "duckdb_benchmark_data";

static BenchmarkRunner &GetInstance() {
static BenchmarkRunner instance;
return instance;
}

//! Save the current database state, exporting it to a set of CSVs in the DUCKDB_BENCHMARK_DIRECTORY directory
static void SaveDatabase(DuckDB &db, string name);
//! Try to initialize the database from the DUCKDB_BENCHMARK_DIRECTORY
static bool TryLoadDatabase(DuckDB &db, string name);

//! Register a benchmark in the Benchmark Runner, this is done automatically
//! as long as the proper macros are used
static void RegisterBenchmark(Benchmark *benchmark);
5 changes: 4 additions & 1 deletion benchmark/tpcds/sf1.cpp
@@ -10,7 +10,10 @@ using namespace std;

#define TPCDS_QUERY_BODY(QNR) \
virtual void Load(DuckDBBenchmarkState *state) { \
tpcds::dbgen(SF, state->db); \
if (!BenchmarkRunner::TryLoadDatabase(state->db, "tpcds")) { \
tpcds::dbgen(SF, state->db); \
BenchmarkRunner::SaveDatabase(state->db, "tpcds"); \
} \
} \
virtual string GetQuery() { \
return tpcds::get_query(QNR); \
38 changes: 38 additions & 0 deletions benchmark/tpch/read_lineitem.cpp
@@ -46,6 +46,44 @@ string BenchmarkInfo() override {
}
FINISH_BENCHMARK(ReadLineitemCSV)

DUCKDB_BENCHMARK(ReadLineitemCSVUnicode, "[csv]")
int64_t count = 0;
void Load(DuckDBBenchmarkState *state) override {
// load the data into the tpch schema
state->conn.Query("CREATE SCHEMA tpch");
tpch::dbgen(SF, state->db, "tpch");
// create the CSV file
auto result = state->conn.Query("COPY tpch.lineitem TO 'lineitem_unicode.csv' DELIMITER '🦆' HEADER");
assert(result->success);
count = result->collection.chunks[0]->data[0].GetValue(0).GetNumericValue();
// delete the database
state->conn.Query("DROP SCHEMA tpch CASCADE");
// create the empty schema to load into
tpch::dbgen(0, state->db);
}
string GetQuery() override {
return "COPY lineitem FROM 'lineitem_unicode.csv' DELIMITER '🦆' HEADER";
}
void Cleanup(DuckDBBenchmarkState *state) override {
state->conn.Query("DROP TABLE lineitem");
tpch::dbgen(0, state->db);
}
string VerifyResult(QueryResult *result) override {
if (!result->success) {
return result->error;
}
auto &materialized = (MaterializedQueryResult &)*result;
auto expected_count = materialized.collection.chunks[0]->data[0].GetValue(0).GetNumericValue();
if (expected_count != count) {
return StringUtil::Format("Count mismatch, expected %lld elements but got %lld", count, expected_count);
}
return string();
}
string BenchmarkInfo() override {
return "Read the lineitem table from SF 0.1 from CSV format";
}
FINISH_BENCHMARK(ReadLineitemCSVUnicode)

DUCKDB_BENCHMARK(WriteLineitemCSV, "[csv]")
void Load(DuckDBBenchmarkState *state) override {
// load the data into the tpch schema
5 changes: 4 additions & 1 deletion benchmark/tpch/sf1.cpp
@@ -10,7 +10,10 @@ using namespace std;

#define TPCH_QUERY_BODY(QNR) \
virtual void Load(DuckDBBenchmarkState *state) { \
tpch::dbgen(SF, state->db); \
if (!BenchmarkRunner::TryLoadDatabase(state->db, "tpch")) { \
tpch::dbgen(SF, state->db); \
BenchmarkRunner::SaveDatabase(state->db, "tpch"); \
} \
} \
virtual string GetQuery() { \
return tpch::get_query(QNR); \
58 changes: 58 additions & 0 deletions src/common/types/value.cpp
@@ -9,6 +9,7 @@
#include "duckdb/common/printer.hpp"
#include "duckdb/common/serializer.hpp"
#include "duckdb/common/types/date.hpp"
#include "duckdb/common/types/null_value.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/common/types/timestamp.hpp"
#include "duckdb/common/types/vector.hpp"
@@ -172,6 +173,9 @@ Value Value::TIMESTAMP(int32_t year, int32_t month, int32_t day, int32_t hour, i
return Value::TIMESTAMP(Date::FromDate(year, month, day), Time::FromTime(hour, min, sec, msec));
}

//===--------------------------------------------------------------------===//
// CreateValue
//===--------------------------------------------------------------------===//
template <> Value Value::CreateValue(bool value) {
return Value::BOOLEAN(value);
}
@@ -208,6 +212,60 @@ template <> Value Value::CreateValue(double value) {
return Value::DOUBLE(value);
}

//===--------------------------------------------------------------------===//
// GetValue
//===--------------------------------------------------------------------===//
template <class T> T Value::GetValueInternal() {
if (is_null) {
return NullValue<T>();
}
switch (type) {
case TypeId::BOOLEAN:
return Cast::Operation<bool, T>(value_.boolean);
case TypeId::TINYINT:
return Cast::Operation<int8_t, T>(value_.tinyint);
case TypeId::SMALLINT:
return Cast::Operation<int16_t, T>(value_.smallint);
case TypeId::INTEGER:
return Cast::Operation<int32_t, T>(value_.integer);
case TypeId::BIGINT:
return Cast::Operation<int64_t, T>(value_.bigint);
case TypeId::FLOAT:
return Cast::Operation<float, T>(value_.float_);
case TypeId::DOUBLE:
return Cast::Operation<double, T>(value_.double_);
case TypeId::VARCHAR:
return Cast::Operation<const char *, T>(str_value.c_str());
default:
throw NotImplementedException("Unimplemented type for GetValue()");
}
}

template <> bool Value::GetValue() {
return GetValueInternal<bool>();
}
template <> int8_t Value::GetValue() {
return GetValueInternal<int8_t>();
}
template <> int16_t Value::GetValue() {
return GetValueInternal<int16_t>();
}
template <> int32_t Value::GetValue() {
return GetValueInternal<int32_t>();
}
template <> int64_t Value::GetValue() {
return GetValueInternal<int64_t>();
}
template <> string Value::GetValue() {
return GetValueInternal<string>();
}
template <> float Value::GetValue() {
return GetValueInternal<float>();
}
template <> double Value::GetValue() {
return GetValueInternal<double>();
}

Value Value::Numeric(TypeId type, int64_t value) {
assert(!TypeIsIntegral(type) ||
(value >= duckdb::MinimumValue(type) && (value < 0 || (uint64_t)value <= duckdb::MaximumValue(type))));
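The new GetValue<T>() specializations are what the benchmark runner's SaveDatabase relies on above (row.GetValue<string>(0)) to pull typed values out of query results. A small illustrative use of the Value-level API, assuming a Value holding an INTEGER (not part of this commit, just a sketch of the call pattern):

Value v = Value::INTEGER(42);
int64_t as_bigint = v.GetValue<int64_t>(); // casts int32 -> int64 via Cast::Operation
double as_double = v.GetValue<double>();   // casts int32 -> double
// a NULL value returns NullValue<T>() rather than throwing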