Commit

Merge pull request #11717 from pdet/implicit_cast_sniffer

Getting Rid of Value.TryCast in the CSV Sniffer

Mytherin committed Apr 20, 2024
2 parents fd102a3 + 1c0ad20 commit 4750ce2
Showing 8 changed files with 206 additions and 78 deletions.
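
For context on the change itself: the sniffer previously materialized every sniffed cell as a duckdb::Value and pushed it through the generic Value casting machinery (the Value.TryCast of the commit title); the new CanYouCastIt path instead checks whether the raw string bytes can be cast to the candidate type, without constructing a Value at all. The standalone C++ sketch below illustrates the difference in spirit only; OldStyleCheck and NewStyleCheck are invented stand-ins, not DuckDB code.

#include <charconv>
#include <iostream>
#include <string>

// Stand-in for the old path: copy the cell into an owning string (analogous to
// building a duckdb::Value) and run a generic, exception-based cast on it.
static bool OldStyleCheck(const char *data, size_t size) {
	std::string owned(data, size); // heap copy for every sniffed cell
	try {
		size_t pos = 0;
		(void)std::stoll(owned, &pos); // generic cast machinery
		return pos == owned.size();
	} catch (...) {
		return false;
	}
}

// Stand-in for the new path: validate the raw bytes in place, with no
// intermediate value object and no allocation.
static bool NewStyleCheck(const char *data, size_t size) {
	long long out;
	auto result = std::from_chars(data, data + size, out);
	return result.ec == std::errc() && result.ptr == data + size;
}

int main() {
	const char cell[] = "12345";
	std::cout << OldStyleCheck(cell, 5) << ' ' << NewStyleCheck(cell, 5) << '\n'; // prints: 1 1
}

Skipping the per-cell Value construction (and its allocation) matters most when many small files are sniffed back to back, which is presumably why the multiple_small_files benchmark below was added alongside the change.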
3 changes: 2 additions & 1 deletion .github/regression/csv.csv
@@ -5,4 +5,5 @@ benchmark/micro/csv/null_padding.benchmark
benchmark/micro/csv/projection_pushdown.benchmark
benchmark/micro/csv/1_byte_values.benchmark
benchmark/micro/csv/16_byte_values.benchmark
-benchmark/micro/csv/time_type.benchmark
+benchmark/micro/csv/multiple_small_files.benchmark
+benchmark/micro/csv/time_type.benchmark
1 change: 1 addition & 0 deletions .github/regression/micro_extended.csv
@@ -79,6 +79,7 @@ benchmark/micro/copy/to_parquet_partition_by_many.benchmark
benchmark/micro/csv/16_byte_values.benchmark
benchmark/micro/csv/1_byte_values.benchmark
benchmark/micro/csv/multiple_read.benchmark
+benchmark/micro/csv/multiple_small_files.benchmark
benchmark/micro/csv/multiple_small_read_csv.benchmark
benchmark/micro/csv/null_padding.benchmark
benchmark/micro/csv/projection_pushdown.benchmark
46 changes: 46 additions & 0 deletions benchmark/micro/csv/multiple_small_files.benchmark
@@ -0,0 +1,46 @@
# name: benchmark/micro/csv/multiple_small_files.benchmark
# description: Run CSV scan over multiple small (2048 rows) files
# group: [csv]

name CSV Read Benchmark over multiple small files
group csv

load
CREATE TABLE t1 AS select i,i,i,i,i,i,i,i,i,i from range(0,2048) tbl(i);
COPY t1 TO '${BENCHMARK_DIR}/small_csv.csv' (FORMAT CSV, HEADER 1);

run
SELECT * from
read_csv(['${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv',
'${BENCHMARK_DIR}/small_csv.csv','${BENCHMARK_DIR}/small_csv.csv'],delim= ',', header = 0)
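
Usage note, assuming a standard DuckDB source build: benchmarks are compiled in with the BUILD_BENCHMARK flag, after which an individual .benchmark file can be handed to the benchmark runner, roughly:

BUILD_BENCHMARK=1 make
build/release/benchmark/benchmark_runner benchmark/micro/csv/multiple_small_files.benchmark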
src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -276,17 +276,6 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
	chunk_col_id++;
}

-Value StringValueResult::GetValue(idx_t row_idx, idx_t col_idx) {
-	if (validity_mask[col_idx]->AllValid()) {
-		return Value(static_cast<string_t *>(vector_ptr[col_idx])[row_idx]);
-	} else {
-		if (validity_mask[col_idx]->RowIsValid(row_idx)) {
-			return Value(static_cast<string_t *>(vector_ptr[col_idx])[row_idx]);
-		} else {
-			return Value();
-		}
-	}
-}
DataChunk &StringValueResult::ToChunk() {
	parse_chunk.SetCardinality(number_of_rows);
	return parse_chunk;
20 changes: 10 additions & 10 deletions src/execution/operator/csv_scanner/sniffer/header_detection.cpp
@@ -1,6 +1,8 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
#include "duckdb/common/types/value.hpp"

#include "utf8proc.hpp"

namespace duckdb {
@@ -110,7 +112,7 @@ bool CSVSniffer::DetectHeaderWithSetColumn() {
		if (best_header_row[i].IsNull()) {
			return false;
		}
-		if (best_header_row[i] != (*set_columns.names)[i]) {
+		if (best_header_row[i].value.GetString() != (*set_columns.names)[i]) {
			has_header = false;
			break;
		}
@@ -119,12 +121,12 @@ bool CSVSniffer::DetectHeaderWithSetColumn() {
	if (!has_header) {
		// We verify if the types are consistent
		for (idx_t col = 0; col < set_columns.Size(); col++) {
-			auto dummy_val = best_header_row[col];
			// try cast to sql_type of column
			const auto &sql_type = (*set_columns.types)[col];
			if (sql_type != LogicalType::VARCHAR) {
				all_varchar = false;
-				if (!TryCastValue(options.dialect_options, options.decimal_separator, dummy_val, sql_type)) {
+				if (!CanYouCastIt(best_header_row[col].value, sql_type, options.dialect_options,
+				                  best_header_row[col].IsNull())) {
					first_row_consistent = false;
				}
			}
@@ -168,16 +170,15 @@ void CSVSniffer::DetectHeader() {
		has_header = DetectHeaderWithSetColumn();
	} else {
		for (idx_t col = 0; col < best_header_row.size(); col++) {
-			auto dummy_val = best_header_row[col];
-			if (!dummy_val.IsNull()) {
+			if (!best_header_row[col].IsNull()) {
				first_row_nulls = false;
			}
			// try cast to sql_type of column
			const auto &sql_type = best_sql_types_candidates_per_column_idx[col].back();
			if (sql_type != LogicalType::VARCHAR) {
				all_varchar = false;
-				if (!TryCastValue(sniffer_state_machine.dialect_options,
-				                  sniffer_state_machine.options.decimal_separator, dummy_val, sql_type)) {
+				if (!CanYouCastIt(best_header_row[col].value, sql_type, sniffer_state_machine.dialect_options,
+				                  best_header_row[col].IsNull())) {
					first_row_consistent = false;
				}
			}
@@ -208,11 +209,10 @@ void CSVSniffer::DetectHeader() {

	// get header names from CSV
	for (idx_t col = 0; col < best_header_row.size(); col++) {
-		const auto &val = best_header_row[col];
-		string col_name = val.ToString();
+		string col_name = best_header_row[col].value.GetString();

		// generate name if field is empty
-		if (col_name.empty() || val.IsNull()) {
+		if (col_name.empty() || best_header_row[col].IsNull()) {
			col_name = GenerateColumnName(sniffer_state_machine.dialect_options.num_cols, col);
		}

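From the call sites above, the replacement helper receives the raw parsed string plus an explicit null flag rather than a Value, so its declaration is presumably along these lines (inferred from usage, not copied from the headers):

// Inferred shape only; the actual declaration lives in the sniffer sources.
bool CanYouCastIt(const string_t value, const LogicalType &sql_type,
                  const DialectOptions &dialect_options, const bool is_null);

Passing is_null explicitly lets a NULL cell count as castable to any candidate type without ever materializing a Value, and the old decimal_separator parameter no longer appears at the call sites.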