diff --git a/benchmark/benchmark_runner.cpp b/benchmark/benchmark_runner.cpp index 27e3809ed8a..cc5e9966649 100644 --- a/benchmark/benchmark_runner.cpp +++ b/benchmark/benchmark_runner.cpp @@ -2,11 +2,14 @@ #include "duckdb/common/profiler.hpp" #include "duckdb/common/string_util.hpp" +#include "duckdb.hpp" #define CATCH_CONFIG_RUNNER #include "catch.hpp" #include "re2/re2.h" +#include +#include #include using namespace duckdb; @@ -22,6 +25,78 @@ Benchmark::Benchmark(bool register_benchmark, string name, string group) : name( } } +void BenchmarkRunner::SaveDatabase(DuckDB &db, string name) { + auto &fs = *db.file_system; + // check if the database directory exists; if not create it + if (!fs.DirectoryExists(DUCKDB_BENCHMARK_DIRECTORY)) { + fs.CreateDirectory(DUCKDB_BENCHMARK_DIRECTORY); + } + // first export the schema + // create two files, "[name].sql" and "[name].list" + // [name].sql contains the SQL used to re-create the tables + // [name].list contains a list of the exported tables + ofstream sql_file(fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".sql")); + ofstream list_file(fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".list")); + + vector table_list; + Connection con(db); + auto result = con.Query("SELECT name, sql FROM sqlite_master()"); + for (auto &row : *result) { + auto table_name = row.GetValue(0); + auto table_sql = row.GetValue(1); + table_list.push_back(table_name); + + list_file << table_name << std::endl; + sql_file << table_sql << std::endl; + } + sql_file.close(); + list_file.close(); + + // now for each table, write it to a separate file "[name]_[tablename].csv" + for (auto &table : table_list) { + auto target_path = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + "_" + table + ".csv"); + result = con.Query("COPY " + table + " TO '" + target_path + "'"); + if (!result->success) { + throw Exception("Failed to save database: " + result->error); + } + } +} + +bool BenchmarkRunner::TryLoadDatabase(DuckDB &db, string name) { + auto &fs = *db.file_system; + if (!fs.DirectoryExists(DUCKDB_BENCHMARK_DIRECTORY)) { + return false; + } + auto sql_fname = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".sql"); + auto list_fname = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + ".list"); + // check if the [name].list and [name].sql files exist + if (!fs.FileExists(list_fname) || !fs.FileExists(sql_fname)) { + return false; + } + Connection con(db); + // the files exist, load the data into the database + // first load the entire SQL and execute it + ifstream sql_file(sql_fname); + std::stringstream buffer; + buffer << sql_file.rdbuf(); + auto result = con.Query(buffer.str()); + if (!result->success) { + throw Exception("Failed to load database: " + result->error); + } + // now read the tables line by line + ifstream list_file(list_fname); + string table_name; + while (getline(list_file, table_name)) { + // for each table, copy the files + auto target_path = fs.JoinPath(DUCKDB_BENCHMARK_DIRECTORY, name + "_" + table_name + ".csv"); + result = con.Query("COPY " + table_name + " FROM '" + target_path + "'"); + if (!result->success) { + throw Exception("Failed to load database: " + result->error); + } + } + return true; +} + volatile bool is_active = false; volatile bool timeout = false; @@ -136,11 +211,7 @@ void print_help() { "e.g., DS.* for TPC-DS benchmarks\n"); } -enum class BenchmarkMetaType { - NONE, - INFO, - GROUP -}; +enum class BenchmarkMetaType { NONE, INFO, GROUP }; struct BenchmarkConfiguration { std::string name_pattern{}; diff --git a/benchmark/include/benchmark_runner.hpp b/benchmark/include/benchmark_runner.hpp index 5e6a2724cbc..0590ac3b3d9 100644 --- a/benchmark/include/benchmark_runner.hpp +++ b/benchmark/include/benchmark_runner.hpp @@ -15,6 +15,7 @@ #include "duckdb/common/fstream.hpp" namespace duckdb { +class DuckDB; //! The benchmark runner class is responsible for running benchmarks class BenchmarkRunner { @@ -22,11 +23,18 @@ class BenchmarkRunner { } public: + static constexpr const char *DUCKDB_BENCHMARK_DIRECTORY = "duckdb_benchmark_data"; + static BenchmarkRunner &GetInstance() { static BenchmarkRunner instance; return instance; } + //! Save the current database state, exporting it to a set of CSVs in the DUCKDB_BENCHMARK_DIRECTORY directory + static void SaveDatabase(DuckDB &db, string name); + //! Try to initialize the database from the DUCKDB_BENCHMARK_DIRECTORY + static bool TryLoadDatabase(DuckDB &db, string name); + //! Register a benchmark in the Benchmark Runner, this is done automatically //! as long as the proper macro's are used static void RegisterBenchmark(Benchmark *benchmark); diff --git a/benchmark/tpcds/sf1.cpp b/benchmark/tpcds/sf1.cpp index a89511d17b2..5b98a620b9b 100644 --- a/benchmark/tpcds/sf1.cpp +++ b/benchmark/tpcds/sf1.cpp @@ -10,7 +10,10 @@ using namespace std; #define TPCDS_QUERY_BODY(QNR) \ virtual void Load(DuckDBBenchmarkState *state) { \ - tpcds::dbgen(SF, state->db); \ + if (!BenchmarkRunner::TryLoadDatabase(state->db, "tpcds")) { \ + tpcds::dbgen(SF, state->db); \ + BenchmarkRunner::SaveDatabase(state->db, "tpcds"); \ + } \ } \ virtual string GetQuery() { \ return tpcds::get_query(QNR); \ diff --git a/benchmark/tpch/read_lineitem.cpp b/benchmark/tpch/read_lineitem.cpp index f66783670e8..edf32a5c8bf 100644 --- a/benchmark/tpch/read_lineitem.cpp +++ b/benchmark/tpch/read_lineitem.cpp @@ -46,6 +46,44 @@ string BenchmarkInfo() override { } FINISH_BENCHMARK(ReadLineitemCSV) +DUCKDB_BENCHMARK(ReadLineitemCSVUnicode, "[csv]") +int64_t count = 0; +void Load(DuckDBBenchmarkState *state) override { + // load the data into the tpch schema + state->conn.Query("CREATE SCHEMA tpch"); + tpch::dbgen(SF, state->db, "tpch"); + // create the CSV file + auto result = state->conn.Query("COPY tpch.lineitem TO 'lineitem_unicode.csv' DELIMITER '🦆' HEADER"); + assert(result->success); + count = result->collection.chunks[0]->data[0].GetValue(0).GetNumericValue(); + // delete the database + state->conn.Query("DROP SCHEMA tpch CASCADE"); + // create the empty schema to load into + tpch::dbgen(0, state->db); +} +string GetQuery() override { + return "COPY lineitem FROM 'lineitem_unicode.csv' DELIMITER '🦆' HEADER"; +} +void Cleanup(DuckDBBenchmarkState *state) override { + state->conn.Query("DROP TABLE lineitem"); + tpch::dbgen(0, state->db); +} +string VerifyResult(QueryResult *result) override { + if (!result->success) { + return result->error; + } + auto &materialized = (MaterializedQueryResult &)*result; + auto expected_count = materialized.collection.chunks[0]->data[0].GetValue(0).GetNumericValue(); + if (expected_count != count) { + return StringUtil::Format("Count mismatch, expected %lld elements but got %lld", count, expected_count); + } + return string(); +} +string BenchmarkInfo() override { + return "Read the lineitem table from SF 0.1 from CSV format"; +} +FINISH_BENCHMARK(ReadLineitemCSVUnicode) + DUCKDB_BENCHMARK(WriteLineitemCSV, "[csv]") void Load(DuckDBBenchmarkState *state) override { // load the data into the tpch schema diff --git a/benchmark/tpch/sf1.cpp b/benchmark/tpch/sf1.cpp index a8ad5929231..1251f9b330c 100644 --- a/benchmark/tpch/sf1.cpp +++ b/benchmark/tpch/sf1.cpp @@ -10,7 +10,10 @@ using namespace std; #define TPCH_QUERY_BODY(QNR) \ virtual void Load(DuckDBBenchmarkState *state) { \ - tpch::dbgen(SF, state->db); \ + if (!BenchmarkRunner::TryLoadDatabase(state->db, "tpch")) { \ + tpch::dbgen(SF, state->db); \ + BenchmarkRunner::SaveDatabase(state->db, "tpch"); \ + } \ } \ virtual string GetQuery() { \ return tpch::get_query(QNR); \ diff --git a/src/common/types/value.cpp b/src/common/types/value.cpp index 04b2e45684b..34231393f1b 100644 --- a/src/common/types/value.cpp +++ b/src/common/types/value.cpp @@ -9,6 +9,7 @@ #include "duckdb/common/printer.hpp" #include "duckdb/common/serializer.hpp" #include "duckdb/common/types/date.hpp" +#include "duckdb/common/types/null_value.hpp" #include "duckdb/common/types/time.hpp" #include "duckdb/common/types/timestamp.hpp" #include "duckdb/common/types/vector.hpp" @@ -172,6 +173,9 @@ Value Value::TIMESTAMP(int32_t year, int32_t month, int32_t day, int32_t hour, i return Value::TIMESTAMP(Date::FromDate(year, month, day), Time::FromTime(hour, min, sec, msec)); } +//===--------------------------------------------------------------------===// +// CreateValue +//===--------------------------------------------------------------------===// template <> Value Value::CreateValue(bool value) { return Value::BOOLEAN(value); } @@ -208,6 +212,60 @@ template <> Value Value::CreateValue(double value) { return Value::DOUBLE(value); } +//===--------------------------------------------------------------------===// +// GetValue +//===--------------------------------------------------------------------===// +template T Value::GetValueInternal() { + if (is_null) { + return NullValue(); + } + switch (type) { + case TypeId::BOOLEAN: + return Cast::Operation(value_.boolean); + case TypeId::TINYINT: + return Cast::Operation(value_.tinyint); + case TypeId::SMALLINT: + return Cast::Operation(value_.smallint); + case TypeId::INTEGER: + return Cast::Operation(value_.integer); + case TypeId::BIGINT: + return Cast::Operation(value_.bigint); + case TypeId::FLOAT: + return Cast::Operation(value_.float_); + case TypeId::DOUBLE: + return Cast::Operation(value_.double_); + case TypeId::VARCHAR: + return Cast::Operation(str_value.c_str()); + default: + throw NotImplementedException("Unimplemented type for GetValue()"); + } +} + +template <> bool Value::GetValue() { + return GetValueInternal(); +} +template <> int8_t Value::GetValue() { + return GetValueInternal(); +} +template <> int16_t Value::GetValue() { + return GetValueInternal(); +} +template <> int32_t Value::GetValue() { + return GetValueInternal(); +} +template <> int64_t Value::GetValue() { + return GetValueInternal(); +} +template <> string Value::GetValue() { + return GetValueInternal(); +} +template <> float Value::GetValue() { + return GetValueInternal(); +} +template <> double Value::GetValue() { + return GetValueInternal(); +} + Value Value::Numeric(TypeId type, int64_t value) { assert(!TypeIsIntegral(type) || (value >= duckdb::MinimumValue(type) && (value < 0 || (uint64_t)value <= duckdb::MaximumValue(type)))); diff --git a/src/execution/operator/persistent/buffered_csv_reader.cpp b/src/execution/operator/persistent/buffered_csv_reader.cpp index bc2ea718f86..cf5c038541c 100644 --- a/src/execution/operator/persistent/buffered_csv_reader.cpp +++ b/src/execution/operator/persistent/buffered_csv_reader.cpp @@ -19,8 +19,37 @@ static char is_newline(char c) { return c == '\n' || c == '\r'; } +TextSearchShiftArray::TextSearchShiftArray(string search_term) : length(search_term.size()) { + if (length > 255) { + throw Exception("Size of delimiter/quote/escape in CSV reader is limited to 255 bytes"); + } + // initialize the shifts array + shifts = unique_ptr(new uint8_t[length * 255]); + memset(shifts.get(), 0, length * 255 * sizeof(uint8_t)); + // iterate over each of the characters in the array + for (index_t main_idx = 0; main_idx < length; main_idx++) { + uint8_t current_char = (uint8_t)search_term[main_idx]; + // now move over all the remaining positions + for (index_t i = main_idx; i < length; i++) { + bool is_match = true; + // check if the prefix matches at this position + // if it does, we move to this position after encountering the current character + for (index_t j = 0; j < main_idx; j++) { + if (search_term[i - main_idx + j] != search_term[j]) { + is_match = false; + } + } + if (!is_match) { + continue; + } + shifts[i * 255 + current_char] = main_idx + 1; + } + } +} + BufferedCSVReader::BufferedCSVReader(CopyInfo &info, vector sql_types, istream &source) - : info(info), sql_types(sql_types), source(source), buffer_size(0), position(0), start(0) { + : info(info), sql_types(sql_types), source(source), buffer_size(0), position(0), start(0), + delimiter_search(info.delimiter), escape_search(info.escape), quote_search(info.quote) { if (info.force_not_null.size() == 0) { info.force_not_null.resize(sql_types.size(), false); } @@ -40,218 +69,377 @@ BufferedCSVReader::BufferedCSVReader(CopyInfo &info, vector sql_types, } } -void BufferedCSVReader::MatchBufferPosition(bool &prev_pos_matches, index_t &control_str_offset, index_t &tmp_position, - bool &match, string &control_str) { - if (prev_pos_matches && control_str_offset < control_str.length()) { - if (buffer[tmp_position] != control_str[control_str_offset]) { - prev_pos_matches = false; - } else { - if (control_str_offset == control_str.length() - 1) { - prev_pos_matches = false; - match = true; +void BufferedCSVReader::ParseComplexCSV(DataChunk &insert_chunk) { + // used for parsing algorithm + bool finished_chunk = false; + index_t column = 0; + vector escape_positions; + uint8_t delimiter_pos = 0, escape_pos = 0, quote_pos = 0; + index_t offset = 0; + + // read values into the buffer (if any) + if (position >= buffer_size) { + if (!ReadBuffer(start)) { + return; + } + } + // start parsing the first value + start = position; + goto value_start; +value_start: + /* state: value_start */ + // this state parses the first characters of a value + offset = 0; + delimiter_pos = 0; + quote_pos = 0; + do { + index_t count = 0; + for (; position < buffer_size; position++) { + quote_search.Match(quote_pos, buffer[position]); + delimiter_search.Match(delimiter_pos, buffer[position]); + count++; + if (delimiter_pos == info.delimiter.size()) { + // found a delimiter, add the value + offset = info.delimiter.size() - 1; + goto add_value; + } else if (is_newline(buffer[position])) { + // found a newline, add the row + goto add_row; + } + if (count > quote_pos) { + // did not find a quote directly at the start of the value, stop looking for the quote now + goto normal; + } + if (quote_pos == info.quote.size()) { + // found a quote, go to quoted loop and skip the initial quote + start += info.quote.size(); + goto in_quotes; } } + } while (ReadBuffer(start)); + // file ends while scanning for quote/delimiter, go to final state + goto final_state; +normal: + /* state: normal parsing state */ + // this state parses the remainder of a non-quoted value until we reach a delimiter or newline + position++; + do { + for (; position < buffer_size; position++) { + delimiter_search.Match(delimiter_pos, buffer[position]); + if (delimiter_pos == info.delimiter.size()) { + offset = info.delimiter.size() - 1; + goto add_value; + } else if (is_newline(buffer[position])) { + goto add_row; + } + } + } while (ReadBuffer(start)); + goto final_state; +add_value: + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + // increase position by 1 and move start to the new position + offset = 0; + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after delimiter, go to final state + goto final_state; + } + goto value_start; +add_row : { + // check type of newline (\r or \n) + bool carriage_return = buffer[position] == '\r'; + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + finished_chunk = AddRow(insert_chunk, column); + // increase position by 1 and move start to the new position + offset = 0; + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after newline, go to final state + goto final_state; + } + if (carriage_return) { + // \r newline, go to special state that parses an optional \n afterwards + goto carriage_return; + } else { + // \n newline, move to value start + if (finished_chunk) { + return; + } + goto value_start; } } - -bool BufferedCSVReader::MatchControlString(bool &delim_match, bool "e_match, bool &escape_match) { - index_t tmp_position = position; - index_t control_str_offset = 0; - - bool delim = true; - bool quote = true; - bool escape = true; - - while (true) { - // check if the delimiter string matches - MatchBufferPosition(delim, control_str_offset, tmp_position, delim_match, info.delimiter); - // check if the quote string matches - MatchBufferPosition(quote, control_str_offset, tmp_position, quote_match, info.quote); - // check if the escape string matches - MatchBufferPosition(escape, control_str_offset, tmp_position, escape_match, info.escape); - - // return if matching is not possible any longer - if (!delim && !quote && !escape) { - return false; +in_quotes: + /* state: in_quotes */ + // this state parses the remainder of a quoted value + quote_pos = 0; + escape_pos = 0; + position++; + do { + for (; position < buffer_size; position++) { + quote_search.Match(quote_pos, buffer[position]); + escape_search.Match(escape_pos, buffer[position]); + if (quote_pos == info.quote.size()) { + goto unquote; + } else if (escape_pos == info.escape.size()) { + escape_positions.push_back(position - start - (info.escape.size() - 1)); + goto handle_escape; + } } - - tmp_position++; - control_str_offset++; - - // make sure not to exceed buffer size, and return if there cannot be any further control strings - if (tmp_position >= buffer_size) { - return true; + } while (ReadBuffer(start)); + // still in quoted state at the end of the file, error: + throw ParserException("Error on line %lld: unterminated quotes", linenr); +unquote: + /* state: unquote */ + // this state handles the state directly after we unquote + // in this state we expect either another quote (entering the quoted state again, and escaping the quote) + // or a delimiter/newline, ending the current value and moving on to the next value + delimiter_pos = 0; + quote_pos = 0; + position++; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after unquote, go to final state + offset = info.quote.size(); + goto final_state; + } + if (is_newline(buffer[position])) { + // quote followed by newline, add row + offset = info.quote.size(); + goto add_row; + } + do { + index_t count = 0; + for (; position < buffer_size; position++) { + quote_search.Match(quote_pos, buffer[position]); + delimiter_search.Match(delimiter_pos, buffer[position]); + count++; + if (count > delimiter_pos && count > quote_pos) { + throw ParserException( + "Error on line %lld: quote should be followed by end of value, end of row or another quote", + linenr); + } + if (delimiter_pos == info.delimiter.size()) { + // quote followed by delimiter, add value + offset = info.quote.size() + info.delimiter.size() - 1; + goto add_value; + } else if (quote_pos == info.quote.size()) { + // quote followed by quote, go back to quoted state and add to escape + escape_positions.push_back(position - start - (info.quote.size() - 1)); + goto in_quotes; + } } + } while (ReadBuffer(start)); + throw ParserException("Error on line %lld: quote should be followed by end of value, end of row or another quote", + linenr); +handle_escape: + escape_pos = 0; + quote_pos = 0; + position++; + do { + index_t count = 0; + for (; position < buffer_size; position++) { + quote_search.Match(quote_pos, buffer[position]); + escape_search.Match(escape_pos, buffer[position]); + count++; + if (count > escape_pos && count > quote_pos) { + throw ParserException("Error on line %lld: neither QUOTE nor ESCAPE is proceeded by ESCAPE", linenr); + } + if (quote_pos == info.quote.size() || escape_pos == info.escape.size()) { + // found quote or escape: move back to quoted state + goto in_quotes; + } + } + } while (ReadBuffer(start)); + throw ParserException("Error on line %lld: neither QUOTE nor ESCAPE is proceeded by ESCAPE", linenr); +carriage_return: + /* state: carriage_return */ + // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line + if (buffer[position] == '\n') { + // newline after carriage return: skip + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after newline, go to final state + goto final_state; + } + } + if (finished_chunk) { + return; + } + goto value_start; +final_state: + if (finished_chunk) { + return; } + if (column > 0 || position > start) { + // remaining values to be added to the chunk + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + finished_chunk = AddRow(insert_chunk, column); + } + // final stage, only reached after parsing the file is finished + // flush the parsed chunk and finalize parsing + Flush(insert_chunk); } -void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) { - cached_buffers.clear(); - +void BufferedCSVReader::ParseSimpleCSV(DataChunk &insert_chunk) { // used for parsing algorithm - bool in_quotes = false; bool finished_chunk = false; - bool seen_escape = false; - bool reset_quotes = false; - bool quote_or_escape = false; - bool exhausted_buffer = false; index_t column = 0; index_t offset = 0; - std::queue escape_positions; - - // used for fast control sequence detection - bool delimiter = false; - bool quote = false; - bool escape = false; + vector escape_positions; + // read values into the buffer (if any) if (position >= buffer_size) { if (!ReadBuffer(start)) { return; } } - - // read until we exhaust the stream - while (true) { + // start parsing the first value + goto value_start; +value_start: + offset = 0; + /* state: value_start */ + // this state parses the first character of a value + if (buffer[position] == info.quote[0]) { + // quote: actual value starts in the next position + // move to in_quotes state + start = position + 1; + goto in_quotes; + } else { + // no quote, move to normal parsing state + start = position; + goto normal; + } +normal: + /* state: normal parsing state */ + // this state parses the remainder of a non-quoted value until we reach a delimiter or newline + do { + for (; position < buffer_size; position++) { + if (buffer[position] == info.delimiter[0]) { + // delimiter: end the value and add it to the chunk + goto add_value; + } else if (is_newline(buffer[position])) { + // newline: add row + goto add_row; + } + } + } while (ReadBuffer(start)); + // file ends during normal scan: go to end state + goto final_state; +add_value: + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + // increase position by 1 and move start to the new position + offset = 0; + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after delimiter, go to final state + goto final_state; + } + goto value_start; +add_row : { + // check type of newline (\r or \n) + bool carriage_return = buffer[position] == '\r'; + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + finished_chunk = AddRow(insert_chunk, column); + // increase position by 1 and move start to the new position + offset = 0; + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after delimiter, go to final state + goto final_state; + } + if (carriage_return) { + // \r newline, go to special state that parses an optional \n afterwards + goto carriage_return; + } else { + // \n newline, move to value start if (finished_chunk) { return; } - - // detect control strings - exhausted_buffer = MatchControlString(delimiter, quote, escape); - - if (!exhausted_buffer) { - // if QUOTE equals ESCAPE we might need to determine which one we detected in the previous loop - if (quote_or_escape) { - if (delimiter || is_newline(buffer[position]) || (source.eof() && position + 1 == buffer_size)) { - // found quote without escape, end quote - offset = info.quote.length(); - in_quotes = false; - } else { - // found escape - seen_escape = true; - } - quote_or_escape = false; - } - - if (in_quotes) { - if (!quote && !escape && !seen_escape) { - // plain value character - seen_escape = false; - } else if (!quote && !escape && seen_escape) { - throw ParserException("Error on line %lld: neither QUOTE nor ESCAPE is proceeded by ESCAPE", - linenr); - } else if (!quote && escape && !seen_escape) { - // escape - seen_escape = true; - position += info.escape.length() - 1; - } else if (!quote && escape && seen_escape) { - // escaped escape - // we store the position of the escape so we can skip it when adding the value - escape_positions.push(position); - position += info.escape.length() - 1; - seen_escape = false; - } else if (quote && !escape && !seen_escape) { - // found quote without escape, end quote - offset = info.quote.length(); - position += info.quote.length() - 1; - in_quotes = false; - } else if (quote && !escape && seen_escape) { - // escaped quote - // we store the position of the escape so we can skip it when adding the value - escape_positions.push(position); - position += info.quote.length() - 1; - seen_escape = false; - } else if (quote && escape && !seen_escape) { - // either escape or end of quote, decide depending on next character - // NOTE: QUOTE and ESCAPE cannot be subsets of each other - position += info.escape.length() - 1; - quote_or_escape = true; - } else if (quote && escape && seen_escape) { - // we store the position of the escape so we can skip it when adding the value - escape_positions.push(position); - position += info.escape.length() - 1; - seen_escape = false; - } - } else { - if (quote) { - // start quotes can only occur at the start of a field - if (position == start) { - in_quotes = true; - // increment start by quote length - start += info.quote.length(); - reset_quotes = in_quotes; - position += info.quote.length() - 1; - } else { - throw ParserException("Error on line %lld: unterminated quotes", linenr); - } - } else if (delimiter) { - // encountered delimiter - AddValue(buffer.get() + start, position - start - offset, column, escape_positions); - start = position + info.delimiter.length(); - reset_quotes = in_quotes; - position += info.delimiter.length() - 1; - offset = 0; - } - - if (is_newline(buffer[position]) || (source.eof() && position + 1 == buffer_size)) { - char newline = buffer[position]; - // encountered a newline, add the current value and push the row - AddValue(buffer.get() + start, position - start - offset, column, escape_positions); - finished_chunk = AddRow(insert_chunk, column); - - // move to the next character - start = position + 1; - reset_quotes = in_quotes; - offset = 0; - if (newline == '\r') { - // \r, skip subsequent \n - if (position + 1 >= buffer_size) { - if (!ReadBuffer(start)) { - break; - } - if (buffer[position] == '\n') { - start++; - position++; - } - continue; - } - if (buffer[position + 1] == '\n') { - start++; - position++; - } - } - } - if (offset != 0) { - in_quotes = true; - } + goto value_start; + } +} +in_quotes: + /* state: in_quotes */ + // this state parses the remainder of a quoted value + position++; + do { + for (; position < buffer_size; position++) { + if (buffer[position] == info.quote[0]) { + // quote: move to unquoted state + goto unquote; + } else if (buffer[position] == info.escape[0]) { + // escape: store the escaped position and move to handle_escape state + escape_positions.push_back(position - start); + goto handle_escape; } } - - position++; - if (position >= buffer_size) { - // exhausted the buffer - if (!ReadBuffer(start)) { - break; - } - // restore the current state after reading from the buffer - in_quotes = reset_quotes; - seen_escape = false; - position = start; - quote_or_escape = false; - while (!escape_positions.empty()) { - escape_positions.pop(); - } + } while (ReadBuffer(start)); + // still in quoted state at the end of the file, error: + throw ParserException("Error on line %lld: unterminated quotes", linenr); +unquote: + /* state: unquote */ + // this state handles the state directly after we unquote + // in this state we expect either another quote (entering the quoted state again, and escaping the quote) + // or a delimiter/newline, ending the current value and moving on to the next value + position++; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after unquote, go to final state + offset = 1; + goto final_state; + } + if (buffer[position] == info.quote[0]) { + // escaped quote, return to quoted state and store escape position + escape_positions.push_back(position - start); + goto in_quotes; + } else if (buffer[position] == info.delimiter[0]) { + // delimiter, add value + offset = 1; + goto add_value; + } else if (is_newline(buffer[position])) { + offset = 1; + goto add_row; + } else { + throw ParserException( + "Error on line %lld: quote should be followed by end of value, end of row or another quote", linenr); + } +handle_escape: + /* state: handle_escape */ + // escape should be followed by a quote or another escape character + position++; + if (position >= buffer_size && !ReadBuffer(start)) { + throw ParserException("Error on line %lld: neither QUOTE nor ESCAPE is proceeded by ESCAPE", linenr); + } + if (buffer[position] != info.quote[0] && buffer[position] != info.escape[0]) { + throw ParserException("Error on line %lld: neither QUOTE nor ESCAPE is proceeded by ESCAPE", linenr); + } + // escape was followed by quote or escape, go back to quoted state + goto in_quotes; +carriage_return: + /* state: carriage_return */ + // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line + if (buffer[position] == '\n') { + // newline after carriage return: skip + // increase position by 1 and move start to the new position + start = ++position; + if (position >= buffer_size && !ReadBuffer(start)) { + // file ends right after delimiter, go to final state + goto final_state; } - - // reset values for control string matching - delimiter = false; - quote = false; - escape = false; } - - if (in_quotes) { - throw ParserException("Error on line %lld: unterminated quotes", linenr); + if (finished_chunk) { + return; } + goto value_start; +final_state: + if (finished_chunk) { + return; + } + if (column > 0 || position > start) { + // remaining values to be added to the chunk + AddValue(buffer.get() + start, position - start - offset, column, escape_positions); + finished_chunk = AddRow(insert_chunk, column); + } + // final stage, only reached after parsing the file is finished + // flush the parsed chunk and finalize parsing Flush(insert_chunk); } @@ -277,7 +465,7 @@ bool BufferedCSVReader::ReadBuffer(index_t &start) { index_t read_count = source.eof() ? source.gcount() : buffer_read_size; buffer_size = remaining + read_count; buffer[buffer_size] = '\0'; - if (old_buffer && start != 0) { + if (old_buffer) { cached_buffers.push_back(move(old_buffer)); } start = 0; @@ -286,12 +474,17 @@ bool BufferedCSVReader::ReadBuffer(index_t &start) { return read_count > 0; } -void BufferedCSVReader::AddValue(char *str_val, index_t length, index_t &column, - std::queue &escape_positions) { - // used to remove escape characters - index_t pos = start; - bool in_escape = false; +void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) { + cached_buffers.clear(); + + if (info.quote.size() == 1 && info.escape.size() == 1 && info.delimiter.size() == 1) { + ParseSimpleCSV(insert_chunk); + } else { + ParseComplexCSV(insert_chunk); + } +} +void BufferedCSVReader::AddValue(char *str_val, index_t length, index_t &column, vector &escape_positions) { if (column == sql_types.size() && length == 0) { // skip a single trailing delimiter column++; @@ -306,35 +499,27 @@ void BufferedCSVReader::AddValue(char *str_val, index_t length, index_t &column, str_val[length] = '\0'; // test against null string - if (info.null_str == str_val && !info.force_not_null[column]) { + if (strcmp(info.null_str.c_str(), str_val) == 0 && !info.force_not_null[column]) { parse_chunk.data[column].nullmask[row_entry] = true; } else { - // optionally remove escape(s) - string new_val = ""; - for (const char *val = str_val; *val; val++) { - if (!escape_positions.empty()) { - if (escape_positions.front() == pos) { - in_escape = false; - escape_positions.pop(); - } else if (escape_positions.front() - info.escape.length() == pos) { - in_escape = true; - } - } - if (!in_escape) { - new_val += *val; + auto &v = parse_chunk.data[column]; + auto parse_data = ((const char **)v.data); + if (escape_positions.size() > 0) { + // remove escape characters (if any) + string old_val = str_val; + string new_val = ""; + index_t prev_pos = 0; + for (index_t i = 0; i < escape_positions.size(); i++) { + index_t next_pos = escape_positions[i]; + new_val += old_val.substr(prev_pos, next_pos - prev_pos); + prev_pos = next_pos + info.escape.size(); } - pos++; - } - while (!escape_positions.empty()) { - escape_positions.pop(); - } - // test for valid utf-8 string - if (!Value::IsUTF8String(new_val.c_str())) { - throw ParserException("Error on line %lld: file is not valid UTF8", linenr); + new_val += old_val.substr(prev_pos, old_val.size() - prev_pos); + escape_positions.clear(); + parse_data[row_entry] = v.string_heap.AddString(new_val.c_str()); + } else { + parse_data[row_entry] = str_val; } - - auto &v = parse_chunk.data[column]; - ((const char **)v.data)[row_entry] = v.string_heap.AddString(new_val.c_str()); } // move to the next column @@ -362,7 +547,16 @@ void BufferedCSVReader::Flush(DataChunk &insert_chunk) { // convert the columns in the parsed chunk to the types of the table for (index_t col_idx = 0; col_idx < sql_types.size(); col_idx++) { if (sql_types[col_idx].id == SQLTypeId::VARCHAR) { - // target type is varchar: just move the parsed chunk + // target type is varchar: no need to convert + // just test that all strings are valid utf-8 strings + auto parse_data = (const char **)parse_chunk.data[col_idx].data; + VectorOperations::Exec(parse_chunk.data[col_idx], [&](index_t i, index_t k) { + if (!parse_chunk.data[col_idx].nullmask[i]) { + if (!Value::IsUTF8String(parse_data[i])) { + throw ParserException("Error on line %lld: file is not valid UTF8", linenr); + } + } + }); parse_chunk.data[col_idx].Move(insert_chunk.data[col_idx]); } else { // target type is not varchar: perform a cast diff --git a/src/include/duckdb/common/types/value.hpp b/src/include/duckdb/common/types/value.hpp index f8ee9e01d83..3ba3dad5ab1 100644 --- a/src/include/duckdb/common/types/value.hpp +++ b/src/include/duckdb/common/types/value.hpp @@ -90,8 +90,11 @@ class Value { //! Create a double Value from a specified value static Value DOUBLE(double value); + template T GetValue() { + throw NotImplementedException("Unimplemented template type for Value::GetValue"); + } template static Value CreateValue(T value) { - throw NotImplementedException("Unimplemented template type for value creation"); + throw NotImplementedException("Unimplemented template type for Value::CreateValue"); } int64_t GetNumericValue(); @@ -175,6 +178,7 @@ class Value { void Print(); private: + template T GetValueInternal(); //! Templated helper function for casting template static DST _cast(const Value &v); @@ -196,4 +200,13 @@ template <> Value Value::CreateValue(string value); template <> Value Value::CreateValue(float value); template <> Value Value::CreateValue(double value); +template <> bool Value::GetValue(); +template <> int8_t Value::GetValue(); +template <> int16_t Value::GetValue(); +template <> int32_t Value::GetValue(); +template <> int64_t Value::GetValue(); +template <> string Value::GetValue(); +template <> float Value::GetValue(); +template <> double Value::GetValue(); + } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp b/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp index f5a878ed3be..adcee6a29d2 100644 --- a/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +++ b/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp @@ -11,11 +11,31 @@ #include "duckdb/execution/physical_operator.hpp" #include "duckdb/parser/parsed_data/copy_info.hpp" -#include - namespace duckdb { struct CopyInfo; +//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next +//! position given that we encounter a byte with the given value. +/*! For example, if we have a string "ABAC", the shifts array will have the following values: + * [0] --> ['A'] = 1, all others = 0 + * [1] --> ['B'] = 2, ['A'] = 1, all others = 0 + * [2] --> ['A'] = 3, all others = 0 + * [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0 + * Suppose we then search in the following string "ABABAC", our progression will be as follows: + * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!) + */ +struct TextSearchShiftArray { + TextSearchShiftArray(string search_term); + + inline bool Match(uint8_t &position, uint8_t byte_value) { + position = shifts[position * 255 + byte_value]; + return position == length; + } + + index_t length; + unique_ptr shifts; +}; + //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file class BufferedCSVReader { static constexpr index_t INITIAL_BUFFER_SIZE = 16384; @@ -36,6 +56,8 @@ class BufferedCSVReader { index_t linenr = 0; index_t nr_elements = 0; + TextSearchShiftArray delimiter_search, escape_search, quote_search; + vector> cached_buffers; DataChunk parse_chunk; @@ -45,19 +67,19 @@ class BufferedCSVReader { void ParseCSV(DataChunk &insert_chunk); private: + //! Parses a CSV file with a one-byte delimiter, escape and quote character + void ParseSimpleCSV(DataChunk &insert_chunk); + //! Parses more complex CSV files with multi-byte delimiters, escapes or quotes + void ParseComplexCSV(DataChunk &insert_chunk); + //! Adds a value to the current row - void AddValue(char *str_val, index_t length, index_t &column, std::queue &escape_positions); + void AddValue(char *str_val, index_t length, index_t &column, vector &escape_positions); //! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added bool AddRow(DataChunk &insert_chunk, index_t &column); //! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk void Flush(DataChunk &insert_chunk); //! Reads a new buffer from the CSV file if the current one has been exhausted bool ReadBuffer(index_t &start); - //! Sets the control strings starting at the current buffer position, returns false if the buffer was exhausted - bool MatchControlString(bool &delim_str, bool "e_str, bool &escape_str); - //! Matches one position of the buffer against a corresponding char in a control string - void MatchBufferPosition(bool &prev_pos_matches, index_t &control_str_offset, index_t &tmp_position, bool &match, - string &control_str); }; } // namespace duckdb diff --git a/src/include/duckdb/main/query_result.hpp b/src/include/duckdb/main/query_result.hpp index 4a5418d2e6f..b12f88285ef 100644 --- a/src/include/duckdb/main/query_result.hpp +++ b/src/include/duckdb/main/query_result.hpp @@ -58,6 +58,70 @@ class QueryResult { //! Fetch() until both results are exhausted. The data in the results will be lost. bool Equals(QueryResult &other); +private: + //! The current chunk used by the iterator + unique_ptr iterator_chunk; + + class QueryResultIterator; + + class QueryResultRow { + public: + QueryResultRow(QueryResultIterator &iterator) : iterator(iterator), row(0) { + } + + QueryResultIterator &iterator; + index_t row; + + template T GetValue(index_t col_idx) const { + return iterator.result->iterator_chunk->data[col_idx].GetValue(iterator.row_idx).GetValue(); + } + }; + //! The row-based query result iterator. Invoking the + class QueryResultIterator { + public: + QueryResultIterator(QueryResult *result) : current_row(*this), result(result), row_idx(0) { + if (result) { + result->iterator_chunk = result->Fetch(); + } + } + + QueryResultRow current_row; + QueryResult *result; + index_t row_idx; + + public: + void Next() { + if (!result->iterator_chunk) { + return; + } + current_row.row++; + row_idx++; + if (row_idx >= result->iterator_chunk->size()) { + result->iterator_chunk = result->Fetch(); + row_idx = 0; + } + } + + QueryResultIterator &operator++() { + Next(); + return *this; + } + bool operator!=(const QueryResultIterator &other) const { + return result->iterator_chunk && result->iterator_chunk->column_count > 0; + } + const QueryResultRow &operator*() const { + return current_row; + } + }; + +public: + QueryResultIterator begin() { + return QueryResultIterator(this); + } + QueryResultIterator end() { + return QueryResultIterator(nullptr); + } + protected: string HeaderToString(); diff --git a/test/api/test_results.cpp b/test/api/test_results.cpp index 4be0ff344c7..21becb4ccb3 100644 --- a/test/api/test_results.cpp +++ b/test/api/test_results.cpp @@ -34,6 +34,25 @@ TEST_CASE("Test results API", "[api]") { REQUIRE(!str.empty()); } +TEST_CASE("Test iterating over results", "[api]") { + DuckDB db(nullptr); + Connection con(db); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE data(i INTEGER, j VARCHAR)")); + REQUIRE_NO_FAIL(con.Query("INSERT INTO data VALUES (1, 'hello'), (2, 'test')")); + + vector i_values = {1, 2}; + vector j_values = {"hello", "test"}; + index_t row_count = 0; + auto result = con.Query("SELECT * FROM data;"); + for (auto &row : *result) { + REQUIRE(row.GetValue(0) == i_values[row.row]); + REQUIRE(row.GetValue(1) == j_values[row.row]); + row_count++; + } + REQUIRE(row_count == 2); +} + TEST_CASE("Error in streaming result after initial query", "[api]") { DuckDB db(nullptr); Connection con(db); diff --git a/test/sql/copy/test_copy.cpp b/test/sql/copy/test_copy.cpp index 67ffcabff83..a8c03fb08eb 100644 --- a/test/sql/copy/test_copy.cpp +++ b/test/sql/copy/test_copy.cpp @@ -193,6 +193,355 @@ TEST_CASE("Test copy statement", "[copy]") { REQUIRE(CHECK_COLUMN(result, 0, {1024})); } +TEST_CASE("Test CSV file without trailing newline", "[copy]") { + unique_ptr result; + DuckDB db(nullptr); + Connection con(db); + + auto csv_path = GetCSVPath(); + + // no newline at end of file with simple delimiter + ofstream csv_no_newline(fs.JoinPath(csv_path, "no_newline.csv")); + for (int i = 0; i < 1024; i++) { + csv_no_newline << i << "," << i << ", test" << (i + 1 < 1024 ? "\n" : ""); + } + csv_no_newline.close(); + + // load CSV file into a table + REQUIRE_NO_FAIL(con.Query("CREATE TABLE no_newline (a INTEGER, b INTEGER, c VARCHAR(10));")); + result = con.Query("COPY no_newline FROM '" + fs.JoinPath(csv_path, "no_newline.csv") + "';"); + REQUIRE(CHECK_COLUMN(result, 0, {1024})); + + // no newline at end of file with unicode delimiter + ofstream csv_no_newline_unicode(fs.JoinPath(csv_path, "no_newline_unicode.csv")); + for (int i = 0; i < 1024; i++) { + csv_no_newline_unicode << i << "🦆" << i << "🦆 test" << (i + 1 < 1024 ? "\n" : ""); + } + csv_no_newline_unicode.close(); + + // load CSV file into a table + REQUIRE_NO_FAIL(con.Query("CREATE TABLE no_newline_unicode (a INTEGER, b INTEGER, c VARCHAR(10));")); + result = con.Query("COPY no_newline_unicode FROM '" + fs.JoinPath(csv_path, "no_newline_unicode.csv") + + "' DELIMITER '🦆';"); + REQUIRE(CHECK_COLUMN(result, 0, {1024})); +} + +TEST_CASE("Test CSVs with repeating patterns in delimiter/escape/quote", "[copy]") { + unique_ptr result; + DuckDB db(nullptr); + Connection con(db); + + auto csv_dir = GetCSVPath(); + auto csv_path = fs.JoinPath(csv_dir, "abac.csv"); + + SECTION("ABAC delimiter") { + ofstream csv_stream(csv_path); + // this is equivalent to "AB|ABAB|" + csv_stream << "ABABACABABABAC"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR, c VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'ABAC';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"AB"})); + REQUIRE(CHECK_COLUMN(result, 1, {"ABAB"})); + REQUIRE(CHECK_COLUMN(result, 2, {Value()})); + + // do the same but with a large unused quote specifier + REQUIRE_NO_FAIL(con.Query("DELETE FROM abac_tbl;")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'ABAC' QUOTE 'ABABABABABAB';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"AB"})); + REQUIRE(CHECK_COLUMN(result, 1, {"ABAB"})); + REQUIRE(CHECK_COLUMN(result, 2, {Value()})); + } + SECTION("Mix of complex quotes/delimiters/escapes") { + ofstream csv_stream(csv_path); + // quote -> "ABAB" + // escape -> "ABAC" + // delimiter -> "ABAD" + // first value is an escaped quote (ABAB) + // second value is a quoted delimiter followed by an escaped quote + // third value is an escape outside of a set of quotes (interpreted as a literal value) + csv_stream << "ABABABACABABABABABADABABABADABABABABABABABADABAC"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR, c VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'ABAD' QUOTE 'ABAB' ESCAPE 'ABAC';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"ABAB"})); + REQUIRE(CHECK_COLUMN(result, 1, {"ABADABAB"})); + REQUIRE(CHECK_COLUMN(result, 2, {"ABAC"})); + } + SECTION("CSV terminates in the middle of quote parsing") { + ofstream csv_stream(csv_path); + csv_stream << "ABAB"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABABABABAB';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"ABAB"})); + } + SECTION("Newline in the middle of quote parsing") { + ofstream csv_stream(csv_path); + csv_stream << "ABAB\nABAB"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABABABABAB';"); + REQUIRE(CHECK_COLUMN(result, 0, {2})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"ABAB", "ABAB"})); + } + SECTION("Simple quote terminates immediately results in error") { + ofstream csv_stream(csv_path); + csv_stream << "\""; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"';")); + } + SECTION("File ends in quoted value (simple)") { + ofstream csv_stream(csv_path); + csv_stream << "\"\""; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {Value()})); + } + SECTION("File ends in quoted value (complex)") { + ofstream csv_stream(csv_path); + csv_stream << "\"\""; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"' DELIMITER 'AAAB';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {Value()})); + } + SECTION("Simple quote terminates after escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "\"\\\""; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"' ESCAPE '\\';")); + } + SECTION("Simple quote terminates after quote escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "\"\"\""; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"' ESCAPE '\"';")); + } + SECTION("Simple quote terminates after escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "\"\\"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '\"' ESCAPE '\\';")); + } + SECTION("Multi-byte quote terminates immediately results in error") { + ofstream csv_stream(csv_path); + csv_stream << "ABABAC"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABABAC';")); + } + SECTION("Quote followed by incomplete multi-byte delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "\"\"AB"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'ABAC';")); + REQUIRE_NO_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'AB';")); + } + SECTION("Multi-byte quote terminates after escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "ABACABABABAC"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABAC' ESCAPE 'ABAB';")); + } + SECTION("Multi-byte quote terminates after quote escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "ABACABACABAC"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABAC' ESCAPE 'ABAC';")); + } + SECTION("Multi-byte quote terminates after escape results in error") { + ofstream csv_stream(csv_path); + csv_stream << "ABACABAB"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'ABAC' ESCAPE 'ABAB';")); + } + SECTION("Delimiter, quote and escape have a maximum size of 255 bytes") { + ofstream csv_stream(csv_path); + csv_stream << "ABAB"; + csv_stream.close(); + + string long_string(1000, 'a'); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE '" + long_string + "';")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' ESCAPE '" + long_string + "';")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER '" + long_string + "';")); + REQUIRE_NO_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' QUOTE 'BLABLABLA';")); + } + SECTION("Test \r newline with multi-byte delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "ABAB\rABAC\r"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA';"); + REQUIRE(CHECK_COLUMN(result, 0, {2})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"A", "A"})); + REQUIRE(CHECK_COLUMN(result, 1, {"B", "C"})); + } + SECTION("Test \r\n newline with multi-byte delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "ABAB\r\nABAC\r\n"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA';"); + REQUIRE(CHECK_COLUMN(result, 0, {2})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"A", "A"})); + REQUIRE(CHECK_COLUMN(result, 1, {"B", "C"})); + } + SECTION("Test unterminated quotes with multi-line delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "\"AAA"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA';")); + } + SECTION("Test unquote not followed by delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "\"AAA\"BB"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA';")); + } + SECTION("Test escape followed by non-quote and non-escape (single-byte)") { + ofstream csv_stream(csv_path); + csv_stream << "\"AAA\\BB\"|A"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER '|' ESCAPE '\\';")); + } + SECTION("Test escape followed by non-quote and non-escape (multi-byte)") { + ofstream csv_stream(csv_path); + csv_stream << "\"AAA\\BB\"BAA"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR, b VARCHAR);")); + REQUIRE_FAIL(con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA' ESCAPE '\\';")); + } + SECTION("Test file end after delimiter with multi-byte delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "AAABA"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER 'BA';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"AAA"})); + } + SECTION("Test file end after delimiter with single-byte delimiter") { + ofstream csv_stream(csv_path); + csv_stream << "AAA|"; + csv_stream.close(); + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE abac_tbl (a VARCHAR);")); + result = con.Query("COPY abac_tbl FROM '" + csv_path + "' DELIMITER '|';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM abac_tbl"); + REQUIRE(CHECK_COLUMN(result, 0, {"AAA"})); + } +} + +TEST_CASE("Test long value with escapes", "[copy]") { + unique_ptr result; + DuckDB db(nullptr); + Connection con(db); + + auto csv_path = GetCSVPath(); + + string value = string(10000, 'a') + "\"\"" + string(20000, 'b'); + string expected_value = string(10000, 'a') + "\"" + string(20000, 'b'); + + // long value with escape and simple delimiter + ofstream long_escaped_value(fs.JoinPath(csv_path, "long_escaped_value.csv")); + long_escaped_value << 1 << "🦆" << 2 << "🦆" + << "\"" << value << "\"" << endl; + long_escaped_value.close(); + + // load CSV file into a table + REQUIRE_NO_FAIL(con.Query("CREATE TABLE long_escaped_value (a INTEGER, b INTEGER, c VARCHAR);")); + result = con.Query("COPY long_escaped_value FROM '" + fs.JoinPath(csv_path, "long_escaped_value.csv") + + "' DELIMITER '🦆';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM long_escaped_value"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + REQUIRE(CHECK_COLUMN(result, 1, {2})); + REQUIRE(CHECK_COLUMN(result, 2, {expected_value})); + + // long value with escape and complex delimiter + ofstream long_escaped_value_unicode(fs.JoinPath(csv_path, "long_escaped_value_unicode.csv")); + long_escaped_value_unicode << 1 << "," << 2 << "," + << "\"" << value << "\"" << endl; + long_escaped_value_unicode.close(); + + // load CSV file into a table + REQUIRE_NO_FAIL(con.Query("CREATE TABLE long_escaped_value_unicode (a INTEGER, b INTEGER, c VARCHAR);")); + result = con.Query("COPY long_escaped_value_unicode FROM '" + + fs.JoinPath(csv_path, "long_escaped_value_unicode.csv") + "';"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + + result = con.Query("SELECT * FROM long_escaped_value_unicode"); + REQUIRE(CHECK_COLUMN(result, 0, {1})); + REQUIRE(CHECK_COLUMN(result, 1, {2})); + REQUIRE(CHECK_COLUMN(result, 2, {expected_value})); +} + TEST_CASE("Test NULL option of copy statement", "[copy]") { unique_ptr result; DuckDB db(nullptr); @@ -891,6 +1240,18 @@ TEST_CASE("Test Windows Newlines with a long file", "[copy]") { REQUIRE(CHECK_COLUMN(result, 3, {Value::BIGINT(5 * line_count)})); REQUIRE(CHECK_COLUMN(result, 4, {Value::BIGINT(sum_c)})); + REQUIRE_NO_FAIL(con.Query("DELETE FROM test;")); + // now do the same with a multi-byte quote that is not actually used + result = con.Query("COPY test FROM '" + fs.JoinPath(csv_path, "test.csv") + "' QUOTE 'BLABLABLA';"); + REQUIRE(CHECK_COLUMN(result, 0, {Value::BIGINT(line_count)})); + + result = con.Query("SELECT SUM(a), MIN(LENGTH(b)), MAX(LENGTH(b)), SUM(LENGTH(b)), SUM(c) FROM test;"); + REQUIRE(CHECK_COLUMN(result, 0, {Value::BIGINT(sum_a)})); + REQUIRE(CHECK_COLUMN(result, 1, {Value::BIGINT(5)})); + REQUIRE(CHECK_COLUMN(result, 2, {Value::BIGINT(5)})); + REQUIRE(CHECK_COLUMN(result, 3, {Value::BIGINT(5 * line_count)})); + REQUIRE(CHECK_COLUMN(result, 4, {Value::BIGINT(sum_c)})); + REQUIRE_NO_FAIL(con.Query("DROP TABLE test;")); // generate a csv file with one value and many empty values