Commit

Merge pull request #11717 from pdet/implicit_cast_sniffer

Getting Rid of Value.TryCast in the CSV Sniffer

Mytherin committed Apr 20, 2024
2 parents fd102a3 + 1c0ad20 commit 4750ce2
Showing 8 changed files with 206 additions and 78 deletions.
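
For context on the change itself: the sniffer previously materialized every sniffed cell as a duckdb::Value and pushed it through the generic Value casting machinery (the Value.TryCast of the commit title); the new CanYouCastIt path instead checks whether the raw string bytes can be cast to the candidate type, without constructing a Value at all. The standalone C++ sketch below illustrates the difference in spirit only; OldStyleCheck and NewStyleCheck are invented stand-ins, not DuckDB code.

#include <charconv>
#include <iostream>
#include <string>

// Stand-in for the old path: copy the cell into an owning string (analogous to
// building a duckdb::Value) and run a generic, exception-based cast on it.
static bool OldStyleCheck(const char *data, size_t size) {
	std::string owned(data, size); // heap copy for every sniffed cell
	try {
		size_t pos = 0;
		(void)std::stoll(owned, &pos); // generic cast machinery
		return pos == owned.size();
	} catch (...) {
		return false;
	}
}

// Stand-in for the new path: validate the raw bytes in place, with no
// intermediate value object and no allocation.
static bool NewStyleCheck(const char *data, size_t size) {
	long long out;
	auto result = std::from_chars(data, data + size, out);
	return result.ec == std::errc() && result.ptr == data + size;
}

int main() {
	const char cell[] = "12345";
	std::cout << OldStyleCheck(cell, 5) << ' ' << NewStyleCheck(cell, 5) << '\n'; // prints: 1 1
}

Skipping the per-cell Value construction (and its allocation) matters most when many small files are sniffed back to back, which is presumably why the multiple_small_files benchmark below was added alongside the change.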
3 changes: 2 additions & 1 deletion .github/regression/csv.csv
@@ -5,4 +5,5 @@ benchmark/micro/csv/null_padding.benchmark
benchmark/micro/csv/projection_pushdown.benchmark
benchmark/micro/csv/1_byte_values.benchmark
benchmark/micro/csv/16_byte_values.benchmark
-benchmark/micro/csv/time_type.benchmark
+benchmark/micro/csv/multiple_small_files.benchmark
+benchmark/micro/csv/time_type.benchmark
1 change: 1 addition & 0 deletions .github/regression/micro_extended.csv
@@ -79,6 +79,7 @@ benchmark/micro/copy/to_parquet_partition_by_many.benchmark
benchmark/micro/csv/16_byte_values.benchmark
benchmark/micro/csv/1_byte_values.benchmark
benchmark/micro/csv/multiple_read.benchmark
+benchmark/micro/csv/multiple_small_files.benchmark
benchmark/micro/csv/multiple_small_read_csv.benchmark
benchmark/micro/csv/null_padding.benchmark
benchmark/micro/csv/projection_pushdown.benchmark
46 changes: 46 additions & 0 deletions benchmark/micro/csv/multiple_small_files.benchmark
@@ -0,0 +1,46 @@
# name: benchmark/micro/csv/multiple_small_files.benchmark
# description: Run CSV scan over multiple small (2048 rows) files
# group: [csv]

name CSV Read Benchmark over multiple small files
group csv

load
CREATE TABLE t1 AS select i,i,i,i,i,i,i,i,i,i from range(0,2048) tbl(i);
COPY t1 TO '${BENCHMARK_DIR}/small_csv.csv' (FORMAT CSV, HEADER 1);

run
SELECT * from
read_csv(['${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv'],delim= ',', header = 0)
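
Usage note, assuming a standard DuckDB source build: benchmarks are compiled in with the BUILD_BENCHMARK flag, after which an individual .benchmark file can be handed to the benchmark runner, roughly:

BUILD_BENCHMARK=1 make
build/release/benchmark/benchmark_runner benchmark/micro/csv/multiple_small_files.benchmark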
src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -276,17 +276,6 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
	chunk_col_id++;
}

-Value StringValueResult::GetValue(idx_t row_idx, idx_t col_idx) {
-	if (validity_mask[col_idx]->AllValid()) {
-		return Value(static_cast<string_t *>(vector_ptr[col_idx])[row_idx]);
-	} else {
-		if (validity_mask[col_idx]->RowIsValid(row_idx)) {
-			return Value(static_cast<string_t *>(vector_ptr[col_idx])[row_idx]);
-		} else {
-			return Value();
-		}
-	}
-}
DataChunk &StringValueResult::ToChunk() {
	parse_chunk.SetCardinality(number_of_rows);
	return parse_chunk;
20 changes: 10 additions & 10 deletions src/execution/operator/csv_scanner/sniffer/header_detection.cpp
@@ -1,6 +1,8 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
#include "duckdb/common/types/value.hpp"

#include "utf8proc.hpp"

namespace duckdb {
@@ -110,7 +112,7 @@ bool CSVSniffer::DetectHeaderWithSetColumn() {
		if (best_header_row[i].IsNull()) {
			return false;
		}
-		if (best_header_row[i] != (*set_columns.names)[i]) {
+		if (best_header_row[i].value.GetString() != (*set_columns.names)[i]) {
			has_header = false;
			break;
		}
@@ -119,12 +121,12 @@ bool CSVSniffer::DetectHeaderWithSetColumn() {
	if (!has_header) {
		// We verify if the types are consistent
		for (idx_t col = 0; col < set_columns.Size(); col++) {
-			auto dummy_val = best_header_row[col];
			// try cast to sql_type of column
			const auto &sql_type = (*set_columns.types)[col];
			if (sql_type != LogicalType::VARCHAR) {
				all_varchar = false;
-				if (!TryCastValue(options.dialect_options, options.decimal_separator, dummy_val, sql_type)) {
+				if (!CanYouCastIt(best_header_row[col].value, sql_type, options.dialect_options,
+				                  best_header_row[col].IsNull())) {
					first_row_consistent = false;
				}
			}
@@ -168,16 +170,15 @@ void CSVSniffer::DetectHeader() {
		has_header = DetectHeaderWithSetColumn();
	} else {
		for (idx_t col = 0; col < best_header_row.size(); col++) {
-			auto dummy_val = best_header_row[col];
-			if (!dummy_val.IsNull()) {
+			if (!best_header_row[col].IsNull()) {
				first_row_nulls = false;
			}
			// try cast to sql_type of column
			const auto &sql_type = best_sql_types_candidates_per_column_idx[col].back();
			if (sql_type != LogicalType::VARCHAR) {
				all_varchar = false;
-				if (!TryCastValue(sniffer_state_machine.dialect_options,
-				                  sniffer_state_machine.options.decimal_separator, dummy_val, sql_type)) {
+				if (!CanYouCastIt(best_header_row[col].value, sql_type, sniffer_state_machine.dialect_options,
+				                  best_header_row[col].IsNull())) {
					first_row_consistent = false;
				}
			}
@@ -208,11 +209,10 @@ void CSVSniffer::DetectHeader() {

	// get header names from CSV
	for (idx_t col = 0; col < best_header_row.size(); col++) {
-		const auto &val = best_header_row[col];
-		string col_name = val.ToString();
+		string col_name = best_header_row[col].value.GetString();

		// generate name if field is empty
-		if (col_name.empty() || val.IsNull()) {
+		if (col_name.empty() || best_header_row[col].IsNull()) {
			col_name = GenerateColumnName(sniffer_state_machine.dialect_options.num_cols, col);
		}

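From the call sites above, the replacement helper receives the raw parsed string plus an explicit null flag rather than a Value, so its declaration is presumably along these lines (inferred from usage, not copied from the headers):

// Inferred shape only; the actual declaration lives in the sniffer sources.
bool CanYouCastIt(const string_t value, const LogicalType &sql_type,
                  const DialectOptions &dialect_options, const bool is_null);

Passing is_null explicitly lets a NULL cell count as castable to any candidate type without ever materializing a Value, and the old decimal_separator parameter no longer appears at the call sites.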