Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial Arrow support #866

Merged
merged 39 commits into from
Aug 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
638acef
no debug symbols in python packages by default
hannes Aug 14, 2020
a967fb6
Merge branch 'master' of github.com:cwida/duckdb
hannes Aug 14, 2020
fbf031f
first baby steps
hannes Aug 14, 2020
2c0079d
zero-copy arrow scans working
hannes Aug 14, 2020
d8721cd
basic string support
hannes Aug 15, 2020
bd88718
Merge remote-tracking branch 'origin/master' into arrow
hannes Aug 17, 2020
dd3337b
some moar stuff
hannes Aug 18, 2020
83c1b87
more generic
hannes Aug 19, 2020
691a192
Merge remote-tracking branch 'origin/master' into arrow
hannes Aug 19, 2020
5914fbc
grr
hannes Aug 21, 2020
699e40b
import order setup.py
hannes Aug 22, 2020
8c7f95f
Merge branch 'arrow' of github.com:cwida/duckdb into arrow
hannes Aug 22, 2020
ac8bd90
import side working sort of
hannes Aug 22, 2020
5b25170
first part of fetch to arrow
hannes Aug 22, 2020
1b1d407
hacky version of fetch_arrow_table implemented but not working yet
hannes Aug 22, 2020
a2802ae
got result set fetching work with serious memory ownership issues
hannes Aug 23, 2020
1d80c82
first part of sane memory management for arrow
hannes Aug 23, 2020
af366d7
mc
hannes Aug 23, 2020
5b5b1b0
moving data chunk to arrow conversion into duckdb core
hannes Aug 24, 2020
de43bb1
string support for arrow
hannes Aug 24, 2020
442016b
added round trip test for basic types
hannes Aug 25, 2020
347e10c
round trip from python basically working
hannes Aug 25, 2020
09993b1
some nullmask stuff still to fix
hannes Aug 25, 2020
551141e
bitmasks work
hannes Aug 25, 2020
9c24e4d
test case
hannes Aug 25, 2020
66d975f
pyarrow for tests
hannes Aug 25, 2020
b71d5bf
extra !
hannes Aug 25, 2020
ff66f70
getting rid of malloc() and free()
hannes Aug 25, 2020
f141721
working!
hannes Aug 25, 2020
02c2da6
removing extra file
hannes Aug 25, 2020
d3005a0
making some tests pass
hannes Aug 25, 2020
a9aa504
extra schema child cleanup
hannes Aug 25, 2020
13bf2e5
switching to ArrowArrayStream
hannes Aug 26, 2020
215cfd7
fixing empty results to arrow in python
hannes Aug 26, 2020
5aac23d
arrow cant install from source on buildwheel and has no binaries for …
hannes Aug 26, 2020
34d8868
some cleanup
hannes Aug 26, 2020
b8727eb
more release
hannes Aug 26, 2020
07f7632
some minor changes
hannes Aug 26, 2020
d6ff2c6
fixing issue for python2
hannes Aug 26, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 7 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ endif
ifeq (${DISABLE_UNITY}, 1)
DISABLE_UNITY_FLAG=-DDISABLE_UNITY=1
endif
ifeq (${DISABLE_SANITIZER}, 1)
DISABLE_SANITIZER_FLAG=-DENABLE_SANITIZER=FALSE
endif
EXTENSIONS=-DBUILD_PARQUET_EXTENSION=TRUE
ifeq (${BUILD_BENCHMARK}, 1)
EXTENSIONS:=${EXTENSIONS} -DBUILD_BENCHMARKS=1
Expand All @@ -42,13 +45,13 @@ clean:
debug:
mkdir -p build/debug && \
cd build/debug && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Debug ../.. && \
cmake --build .

release_expanded:
mkdir -p build/release_expanded && \
cd build/release_expanded && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release ../.. && \
cmake --build .

unittest: debug
Expand All @@ -69,13 +72,13 @@ release:
mkdir -p build/release && \
python scripts/amalgamation.py && \
cd build/release && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=Release -DAMALGAMATION_BUILD=1 ../.. && \
cmake --build .

reldebug:
mkdir -p build/reldebug && \
cd build/reldebug && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
cmake $(GENERATOR) $(FORCE_COLOR) ${WARNINGS_AS_ERRORS} ${DISABLE_UNITY_FLAG} ${DISABLE_SANITIZER_FLAG} ${EXTENSIONS} -DCMAKE_BUILD_TYPE=RelWithDebInfo ../.. && \
cmake --build .

amaldebug:
Expand Down
29 changes: 15 additions & 14 deletions scripts/amalgamation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,21 @@

# files included in the amalgamated "duckdb.hpp" file
main_header_files = [os.path.join(include_dir, 'duckdb.hpp'),
os.path.join(include_dir, 'duckdb.h'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_serializer.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp')]
os.path.join(include_dir, 'duckdb.h'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'date.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'arrow.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'hugeint.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'interval.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'timestamp.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'types', 'time.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_file_writer.hpp'),
os.path.join(include_dir, 'duckdb', 'common', 'serializer', 'buffered_serializer.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'appender.hpp'),
os.path.join(include_dir, 'duckdb', 'main', 'client_context.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'function.hpp'),
os.path.join(include_dir, 'duckdb', 'function', 'table_function.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_table_function_info.hpp'),
os.path.join(include_dir, 'duckdb', 'parser', 'parsed_data', 'create_copy_function_info.hpp')]

# include paths for where to search for include files during amalgamation
include_paths = [include_dir, fmt_include_dir, re2_dir, miniz_dir, utf8proc_include_dir, utf8proc_dir, pg_query_include_dir, pg_query_dir, moodycamel_include_dir]
Expand Down
125 changes: 123 additions & 2 deletions src/common/types/data_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "duckdb/common/unordered_map.hpp"
#include "duckdb/common/types/sel_cache.hpp"

using namespace std;
#include "duckdb/common/arrow.hpp"
#include "duckdb/common/vector.hpp"

namespace duckdb {

Expand Down Expand Up @@ -191,4 +191,125 @@ void DataChunk::Print() {
Printer::Print(ToString());
}

struct DuckDBArrowArrayHolder {
ArrowArray array;
const void *buffers[3]; // need max three pointers for strings
unique_ptr<ArrowArray *[]> children; // just space for the *pointers* to children, not the children themselves

Vector vector;
unique_ptr<data_t[]> string_offsets;
unique_ptr<data_t[]> string_data;
};

static void release_duckdb_arrow_array(ArrowArray *array) {
if (!array || !array->release) {
return;
}
array->release = nullptr;
auto holder = (DuckDBArrowArrayHolder *)array->private_data;
delete holder;
}

void DataChunk::ToArrowArray(ArrowArray *out_array) {
assert(out_array);

auto root_holder = new DuckDBArrowArrayHolder();
root_holder->children = unique_ptr<ArrowArray *[]>(new ArrowArray *[column_count()]);
out_array->private_data = root_holder;
out_array->release = release_duckdb_arrow_array;

out_array->children = root_holder->children.get();
out_array->length = size();
out_array->n_children = column_count();
out_array->n_buffers = 1;
out_array->buffers = root_holder->buffers;
out_array->buffers[0] = nullptr; // there is no actual buffer there since we don't have NULLs
out_array->offset = 0;
out_array->null_count = 0; // needs to be 0
out_array->dictionary = nullptr;

for (idx_t col_idx = 0; col_idx < column_count(); col_idx++) {
auto holder = new DuckDBArrowArrayHolder();
holder->vector.Reference(data[col_idx]);
auto &child = holder->array;
auto &vector = holder->vector;
child.private_data = holder;
child.release = release_duckdb_arrow_array;

child.n_children = 0;
child.null_count = -1; // unknown
child.offset = 0;
child.dictionary = nullptr;
child.buffers = holder->buffers;

child.length = size();

switch (vector.vector_type) {
// TODO support other vector types
case VectorType::FLAT_VECTOR:

switch (GetTypes()[col_idx].id()) {
// TODO support other data types
case LogicalTypeId::BOOLEAN:
case LogicalTypeId::TINYINT:
case LogicalTypeId::SMALLINT:
case LogicalTypeId::INTEGER:
case LogicalTypeId::BIGINT:
case LogicalTypeId::FLOAT:
case LogicalTypeId::DOUBLE:
case LogicalTypeId::HUGEINT:
child.n_buffers = 2;
child.buffers[1] = (void *)FlatVector::GetData(vector);
break;

case LogicalTypeId::VARCHAR: {
child.n_buffers = 3;
holder->string_offsets = unique_ptr<data_t[]>(new data_t[sizeof(uint32_t) * (size() + 1)]);
child.buffers[1] = holder->string_offsets.get();
assert(child.buffers[1]);
// step 1: figure out total string length:
idx_t total_string_length = 0;
auto string_t_ptr = FlatVector::GetData<string_t>(vector);
auto is_null = FlatVector::Nullmask(vector);
for (idx_t row_idx = 0; row_idx < size(); row_idx++) {
if (is_null[row_idx]) {
continue;
}
total_string_length += string_t_ptr[row_idx].GetSize();
}
// step 2: allocate this much
holder->string_data = unique_ptr<data_t[]>(new data_t[total_string_length]);
child.buffers[2] = holder->string_data.get();
assert(child.buffers[2]);
// step 3: assign buffers
idx_t current_heap_offset = 0;
auto target_ptr = (uint32_t *)child.buffers[1];

for (idx_t row_idx = 0; row_idx < size(); row_idx++) {
target_ptr[row_idx] = current_heap_offset;
if (is_null[row_idx]) {
continue;
}
auto &str = string_t_ptr[row_idx];
memcpy((void *)((uint8_t *)child.buffers[2] + current_heap_offset), str.GetData(), str.GetSize());
current_heap_offset += str.GetSize();
}
target_ptr[size()] = current_heap_offset; // need to terminate last string!
break;
}
default:
throw runtime_error("Unsupported type " + GetTypes()[col_idx].ToString());
}

child.null_count = FlatVector::Nullmask(vector).count();
child.buffers[0] = (void *)&FlatVector::Nullmask(vector).flip();

break;
default:
throw NotImplementedException(VectorTypeToString(vector.vector_type));
}
out_array->children[col_idx] = &child;
}
}

} // namespace duckdb
5 changes: 5 additions & 0 deletions src/common/types/value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ template <class T> T Value::GetValueInternal() const {
return Cast::Operation<float, T>(value_.float_);
case PhysicalType::DOUBLE:
return Cast::Operation<double, T>(value_.double_);

case PhysicalType::VARCHAR:
return Cast::Operation<string_t, T>(str_value.c_str());
default:
Expand Down Expand Up @@ -366,6 +367,10 @@ template <> float Value::GetValue() const {
template <> double Value::GetValue() const {
return GetValueInternal<double>();
}
template <> uintptr_t Value::GetValue() const {
assert(type()== LogicalType::POINTER);
return value_.pointer;
}
Value Value::Numeric(LogicalType type, int64_t value) {
switch (type.id()) {
case LogicalTypeId::TINYINT:
Expand Down
1 change: 1 addition & 0 deletions src/function/function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ void BuiltinFunctions::Initialize() {
RegisterSQLiteFunctions();
RegisterReadFunctions();
RegisterTableFunctions();
RegisterArrowFunctions();

RegisterAlgebraicAggregates();
RegisterDistributiveAggregates();
Expand Down
1 change: 1 addition & 0 deletions src/function/table/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ add_subdirectory(sqlite)
add_library_unity(
duckdb_func_table
OBJECT
arrow.cpp
range.cpp
repeat.cpp
copy_csv.cpp
Expand Down