duckdb · hannes · Aug 26, 2020 · Aug 14, 2020 · Aug 14, 2020 · Aug 14, 2020
diff --git a/Makefile b/Makefile
@@ -19,6 +19,9 @@ endif
 ifeq (${DISABLE_UNITY}, 1)
 	DISABLE_UNITY_FLAG=-DDISABLE_UNITY=1
 endif
+ifeq (${DISABLE_SANITIZER}, 1) # opt-out knob: when set to 1, the cmake configure steps get -DENABLE_SANITIZER=FALSE
+	DISABLE_SANITIZER_FLAG=-DENABLE_SANITIZER=FALSE
+endif
 EXTENSIONS=-DBUILD_PARQUET_EXTENSION=TRUE
 ifeq (${BUILD_BENCHMARK}, 1)
 	EXTENSIONS:=${EXTENSIONS} -DBUILD_BENCHMARKS=1
@@ -42,13 +45,13 @@ clean:
 debug:
 	mkdir -p build/debug && \
 	cd build/debug && \
-	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
+	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
 	cmake --build .
 
 release_expanded:
 	mkdir -p build/release_expanded && \
 	cd build/release_expanded && \
-	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
+	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
 	cmake --build .
 
 unittest: debug
@@ -69,13 +72,13 @@ release:
 	mkdir -p build/release && \
 	python scripts/amalgamation.py && \
 	cd build/release && \
-	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
+	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
 	cmake --build .
 
 reldebug:
 	mkdir -p build/reldebug && \
 	cd build/reldebug && \
-	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
+	cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
 	cmake --build .
 
 amaldebug:

diff --git a/scripts/amalgamation.py b/scripts/amalgamation.py
@@ -26,20 +26,21 @@
 
 # files included in the amalgamated "duckdb.hpp" file
 main_header_files = [os.path.join(include_dir, 'duckdb.hpp'),
-	os.path.join(include_dir, 'duckdb.h'),
-	os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
-	os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_serializer.hpp'),
-	os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
-	os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
-	os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
-	os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
-	os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
-	os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp')]
+    os.path.join(include_dir, 'duckdb.h'),
+    os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'arrow.hpp'),  # new entry; presumably declares ArrowArray (used by DataChunk::ToArrowArray) - verify
+    os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
+    os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_serializer.hpp'),
+    os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
+    os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
+    os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
+    os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
+    os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
+    os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp')]
 
 # include paths for where to search for include files during amalgamation
 include_paths = [include_dir, fmt_include_dir, re2_dir, miniz_dir, utf8proc_include_dir, utf8proc_dir, pg_query_include_dir, pg_query_dir, moodycamel_include_dir]

diff --git a/src/common/types/data_chunk.cpp b/src/common/types/data_chunk.cpp
@@ -8,8 +8,8 @@
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/common/unordered_map.hpp"
 #include "duckdb/common/types/sel_cache.hpp"
-
-using namespace std;
+#include "duckdb/common/arrow.hpp"
+#include "duckdb/common/vector.hpp"
 
 namespace duckdb {
 
@@ -191,4 +191,161 @@ void DataChunk::Print() {
 	Printer::Print(ToString());
 }
 
+// Owns the memory behind one exported ArrowArray. An instance is stored in ArrowArray::private_data and is
+// deleted by release_duckdb_arrow_array.
+struct DuckDBArrowArrayHolder {
+	// the exported struct of a column; the root array lives in the caller's struct instead
+	ArrowArray array;
+	const void *buffers[3];              // need max three pointers for strings
+	unique_ptr<ArrowArray *[]> children; // just space for the *pointers* to children, not the children themselves
+
+	// reference to the exported column
+	Vector vector;
+	// VARCHAR columns only: the offsets buffer and the concatenated string bytes built by ToArrowArray
+	unique_ptr<data_t[]> string_offsets;
+	unique_ptr<data_t[]> string_data;
+};
+
+// Release callback installed on every ArrowArray filled in by DataChunk::ToArrowArray. Calling it on a nullptr or
+// on an already released array is a no-op. The children are released first: in the Arrow C data interface that
+// is the job of the producer's callback, consumers must not release children themselves.
+static void release_duckdb_arrow_array(ArrowArray *array) {
+	if (!array || !array->release) {
+		return;
+	}
+	for (int64_t child_idx = 0; child_idx < array->n_children; child_idx++) {
+		auto child = array->children[child_idx];
+		// skip empty slots (export failed before reaching them) and children already released or moved out
+		if (child && child->release) {
+			child->release(child);
+		}
+	}
+	array->release = nullptr;
+	auto holder = (DuckDBArrowArrayHolder *)array->private_data;
+	delete holder;
+}
+
+// Exports the chunk into out_array: the root carries no data of its own (its single buffer slot is nullptr) and
+// gets one child array per column. Only flat vectors of the types listed in the switch below are handled;
+// anything else throws. When an exception leaves this function everything built so far has been freed and
+// out_array is marked released (release == nullptr); otherwise the caller invokes out_array->release when done.
+void DataChunk::ToArrowArray(ArrowArray *out_array) {
+	assert(out_array);
+
+	auto root_holder = new DuckDBArrowArrayHolder();
+	out_array->private_data = root_holder;
+	out_array->release = release_duckdb_arrow_array;
+	out_array->n_children = 0; // nothing attached yet, keeps an early release away from the child slots
+
+	// releases out_array again if we leave through an exception, so no holder is leaked on failure
+	struct ReleaseOnFailure {
+		ArrowArray *array;
+		~ReleaseOnFailure() {
+			release_duckdb_arrow_array(array);
+		}
+	} guard{out_array};
+
+	// value-initialised so that the slots of columns we never get to stay nullptr
+	root_holder->children = unique_ptr<ArrowArray *[]>(new ArrowArray *[column_count()]());
+	out_array->children = root_holder->children.get();
+	out_array->length = size();
+	out_array->n_children = column_count();
+	out_array->n_buffers = 1;
+	out_array->buffers = root_holder->buffers;
+	out_array->buffers[0] = nullptr; // there is no actual buffer there since we don't have NULLs
+	out_array->offset = 0;
+	out_array->null_count = 0; // needs to be 0
+	out_array->dictionary = nullptr;
+
+	for (idx_t col_idx = 0; col_idx < column_count(); col_idx++) {
+		// owned locally until the column is complete, a throw below then frees it instead of leaking it
+		unique_ptr<DuckDBArrowArrayHolder> holder(new DuckDBArrowArrayHolder());
+		holder->vector.Reference(data[col_idx]);
+		auto &child = holder->array;
+		auto &vector = holder->vector;
+		child.private_data = holder.get();
+		child.release = release_duckdb_arrow_array;
+
+		child.n_children = 0;
+		child.null_count = -1; // unknown
+		child.offset = 0;
+		child.dictionary = nullptr;
+		child.buffers = holder->buffers;
+
+		child.length = size();
+
+		switch (vector.vector_type) {
+			// TODO support other vector types
+		case VectorType::FLAT_VECTOR:
+
+			switch (GetTypes()[col_idx].id()) {
+				// TODO support other data types
+			case LogicalTypeId::BOOLEAN:
+			case LogicalTypeId::TINYINT:
+			case LogicalTypeId::SMALLINT:
+			case LogicalTypeId::INTEGER:
+			case LogicalTypeId::BIGINT:
+			case LogicalTypeId::FLOAT:
+			case LogicalTypeId::DOUBLE:
+			case LogicalTypeId::HUGEINT:
+				// no copy for these types: the data buffer points straight at the vector's data
+				child.n_buffers = 2;
+				child.buffers[1] = (void *)FlatVector::GetData(vector);
+				break;
+
+			case LogicalTypeId::VARCHAR: {
+				// strings are copied into two fresh buffers: size() + 1 32-bit offsets and the concatenated bytes
+				child.n_buffers = 3;
+				holder->string_offsets = unique_ptr<data_t[]>(new data_t[sizeof(uint32_t) * (size() + 1)]);
+				child.buffers[1] = holder->string_offsets.get();
+				assert(child.buffers[1]);
+				// step 1: figure out total string length:
+				idx_t total_string_length = 0;
+				auto string_t_ptr = FlatVector::GetData<string_t>(vector);
+				auto is_null = FlatVector::Nullmask(vector);
+				for (idx_t row_idx = 0; row_idx < size(); row_idx++) {
+					if (is_null[row_idx]) {
+						continue;
+					}
+					total_string_length += string_t_ptr[row_idx].GetSize();
+				}
+				// step 2: allocate this much
+				holder->string_data = unique_ptr<data_t[]>(new data_t[total_string_length]);
+				child.buffers[2] = holder->string_data.get();
+				assert(child.buffers[2]);
+				// step 3: assign buffers
+				idx_t current_heap_offset = 0;
+				auto target_ptr = (uint32_t *)child.buffers[1];
+
+				for (idx_t row_idx = 0; row_idx < size(); row_idx++) {
+					target_ptr[row_idx] = current_heap_offset;
+					if (is_null[row_idx]) {
+						continue; // a NULL row occupies no bytes, its offset equals the next row's
+					}
+					auto &str = string_t_ptr[row_idx];
+					memcpy((void *)((uint8_t *)child.buffers[2] + current_heap_offset), str.GetData(), str.GetSize());
+					current_heap_offset += str.GetSize();
+				}
+				target_ptr[size()] = current_heap_offset; // need to terminate last string!
+				break;
+			}
+			default:
+				throw std::runtime_error("Unsupported type " + GetTypes()[col_idx].ToString());
+			}
+
+			// count the NULLs first, then invert the mask: the address of the flipped mask is the validity buffer.
+			// NOTE(review): flip() looks like an in-place inversion; confirm the mask is not shared with data[col_idx]
+			child.null_count = FlatVector::Nullmask(vector).count();
+			child.buffers[0] = (void *)&FlatVector::Nullmask(vector).flip();
+
+			break;
+		default:
+			throw NotImplementedException(VectorTypeToString(vector.vector_type));
+		}
+		// complete: hand the holder over to the child's release callback and attach the child to the root
+		out_array->children[col_idx] = &holder.release()->array;
+	}
+	guard.array = nullptr; // success, the caller owns out_array from here on
+}
+
 } // namespace duckdb
diff --git a/src/common/types/value.cpp b/src/common/types/value.cpp
@@ -332,6 +332,7 @@ template <class T> T Value::GetValueInternal() const {
 		return Cast::Operation<float, T>(value_.float_);
 	case PhysicalType::DOUBLE:
 		return Cast::Operation<double, T>(value_.double_);
+
 	case PhysicalType::VARCHAR:
 		return Cast::Operation<string_t, T>(str_value.c_str());
 	default:
@@ -366,6 +367,10 @@ template <> float Value::GetValue() const {
 template <> double Value::GetValue() const {
 	return GetValueInternal<double>();
 }
+template <> uintptr_t Value::GetValue() const {
+	assert(type() == LogicalType::POINTER); // read directly from value_, no cast: only valid for POINTER values
+	return value_.pointer;
+}
 Value Value::Numeric(LogicalType type, int64_t value) {
 	switch (type.id()) {
 	case LogicalTypeId::TINYINT:

diff --git a/src/function/function.cpp b/src/function/function.cpp
@@ -25,6 +25,7 @@ void BuiltinFunctions::Initialize() {
 	RegisterSQLiteFunctions();
 	RegisterReadFunctions();
 	RegisterTableFunctions();
+	RegisterArrowFunctions();
 
 	RegisterAlgebraicAggregates();
 	RegisterDistributiveAggregates();

diff --git a/src/function/table/CMakeLists.txt b/src/function/table/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(sqlite)
 add_library_unity(
   duckdb_func_table
   OBJECT
+  arrow.cpp
   range.cpp
   repeat.cpp
   copy_csv.cpp