Skip to content

Commit

Permalink
Merge pull request #53 from duckdb/master
Browse files Browse the repository at this point in the history
Merge as of 2021-04-22
  • Loading branch information
hawkfish committed Apr 22, 2021
2 parents bf12e58 + fb0b2ae commit 5d3f17b
Show file tree
Hide file tree
Showing 226 changed files with 11,347 additions and 9,217 deletions.
28 changes: 28 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**What happens?**
A clear and concise description of what the bug is.

**What should happen?**
Describe what you expect DuckDB to do instead.

**To Reproduce**
Steps to reproduce the behavior. Bonus points if those are only SQL queries.
1.
2.
3.

**Environment (please complete the following information):**
- OS: [e.g. iOS]
- DuckDB Version: [e.g. 22]

**Before submitting**
- [ ] Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?
- [ ] Have you tried this on the latest `master` branch? In case you cannot compile, you may find some binaries here: https://github.com/duckdb/duckdb/releases/tag/master-builds
23 changes: 9 additions & 14 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,7 @@ jobs:
run: |
python scripts/windows_ci.py
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_GENERATOR_PLATFORM=x64 -DBUILD_ICU_EXTENSION=1 -DBUILD_PARQUET_EXTENSION=1 -DBUILD_TPCH_EXTENSION=1 -DBUILD_FTS_EXTENSION=1 -DBUILD_REST=1 -DJDBC_DRIVER=1
cmake --build . --target duckdb --config Release
cmake --build . --target unittest --config Release
cmake --build . --target shell --config Release
cmake --build . --target jdbc --config Release
cmake --build . --config Release
- name: Test
run: test/Release/unittest.exe
Expand Down Expand Up @@ -150,10 +147,7 @@ jobs:
- name: Build
run: |
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_GENERATOR_PLATFORM=Win32 -DBUILD_ICU_EXTENSION=1 -DBUILD_PARQUET_EXTENSION=1 -DBUILD_TPCH_EXTENSION=1 -DBUILD_FTS_EXTENSION=1 -DJDBC_DRIVER=1
cmake --build . --target duckdb --config Release
cmake --build . --target unittest --config Release
cmake --build . --target shell --config Release
cmake --build . --target jdbc --config Release
cmake --build . --config Release
- name: Test
run: test/Release/unittest.exe
Expand Down Expand Up @@ -255,9 +249,10 @@ jobs:

- name: Amalgamation
run: |
python scripts/amalgamation.py
python scripts/amalgamation.py --extended
python scripts/parquet_amalgamation.py
cd src/amalgamation
clang++ -std=c++11 -emit-llvm -S duckdb.cpp
clang++ -std=c++11 -emit-llvm -S duckdb.cpp parquet-amalgamation.cpp
linux-release-64:
Expand Down Expand Up @@ -687,7 +682,7 @@ jobs:
CIBW_BUILD: 'cp36-* cp37-* cp38-* cp39-*'
CIBW_BEFORE_BUILD: 'pip install --prefer-binary "pandas>=0.24" "pytest>=4.3"'
CIBW_TEST_REQUIRES: 'pytest'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24"'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24" && (pip install --prefer-binary "pyarrow>=3.0.0" || true)'
CIBW_TEST_COMMAND: 'python -m pytest {project}/tests'
SETUPTOOLS_SCM_NO_LOCAL: 'yes'
TWINE_USERNAME: 'hfmuehleisen'
Expand Down Expand Up @@ -728,7 +723,7 @@ jobs:
CIBW_BUILD: 'cp27-*'
CIBW_BEFORE_BUILD: 'pip install --prefer-binary "pandas>=0.24" "pytest>=4.3"'
CIBW_TEST_REQUIRES: 'pytest'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24"'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24" '
CIBW_TEST_COMMAND: 'python -m pytest {project}/tests'
SETUPTOOLS_SCM_NO_LOCAL: 'yes'
TWINE_USERNAME: 'hfmuehleisen'
Expand Down Expand Up @@ -762,7 +757,7 @@ jobs:
CIBW_BUILD: 'cp36-* cp37-* cp38-* cp39-*'
CIBW_BEFORE_BUILD: 'pip install --prefer-binary "pandas>=0.24" "pytest>=4.3"'
CIBW_TEST_REQUIRES: 'pytest'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24"'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24" "pyarrow>=3.0.0"'
CIBW_TEST_COMMAND: 'python -m pytest {project}/tests'
SETUPTOOLS_SCM_NO_LOCAL: 'yes'
TWINE_USERNAME: 'hfmuehleisen'
Expand Down Expand Up @@ -801,7 +796,7 @@ jobs:
CIBW_BUILD: 'cp36-* cp37-* cp38-* cp39-*'
CIBW_BEFORE_BUILD: 'pip install --prefer-binary "pandas>=0.24" "pytest>=4.3"'
CIBW_TEST_REQUIRES: 'pytest'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24"'
CIBW_BEFORE_TEST: 'pip install --prefer-binary "pandas>=0.24" '
CIBW_TEST_COMMAND: 'python -m pytest {project}/tests'
SETUPTOOLS_SCM_NO_LOCAL: 'yes'
TWINE_USERNAME: 'hfmuehleisen'
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ m4/lt~obsolete.m4
#==============================================================================#
#m4/
build/
duckdb_benchmark_data/
#*.m4
*.o
*.lo
Expand Down
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ endif()
option(ENABLE_SANITIZER "Enable address sanitizer." TRUE)
option(ENABLE_THREAD_SANITIZER "Enable thread sanitizer." FALSE)
option(ENABLE_UBSAN "Enable undefined behavior sanitizer." TRUE)
option(DISABLE_VPTR_SANITIZER "Disable vptr sanitizer; work-around for sanitizer false positive on Macbook M1" FALSE)
option(
FORCE_SANITIZER
"Forces building with sanitizers even if the Python and R modules are enabled."
Expand Down Expand Up @@ -113,8 +114,14 @@ endif()
if(${ENABLE_UBSAN})
if(FORCE_ASSERT)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
if (${DISABLE_VPTR_SANITIZER})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-sanitize=vptr")
endif()
else()
set(CXX_EXTRA_DEBUG "${CXX_EXTRA_DEBUG} -fsanitize=undefined")
if (${DISABLE_VPTR_SANITIZER})
set(CXX_EXTRA_DEBUG "${CXX_EXTRA_DEBUG} -fno-sanitize=vptr")
endif()
endif()
endif()

Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ endif
ifeq (${DISABLE_SANITIZER}, 1)
DISABLE_SANITIZER_FLAG=-DENABLE_SANITIZER=FALSE -DENABLE_UBSAN=0
endif
ifeq (${DISABLE_VPTR_SANITIZER}, 1)
DISABLE_SANITIZER_FLAG:=${DISABLE_SANITIZER_FLAG} -DDISABLE_VPTR_SANITIZER=1
endif
ifeq (${FORCE_SANITIZER}, 1)
DISABLE_SANITIZER_FLAG:=${DISABLE_SANITIZER_FLAG} -DFORCE_SANITIZER=1
endif
Expand Down Expand Up @@ -58,7 +61,7 @@ ifeq (${BUILD_JDBC}, 1)
EXTENSIONS:=${EXTENSIONS} -DJDBC_DRIVER=1
endif
ifeq (${BUILD_PYTHON}, 1)
EXTENSIONS:=${EXTENSIONS} -DBUILD_PYTHON=1
EXTENSIONS:=${EXTENSIONS} -DBUILD_PYTHON=1 -DBUILD_FTS_EXTENSION=1
endif
ifeq (${BUILD_R}, 1)
EXTENSIONS:=${EXTENSIONS} -DBUILD_R=1
Expand Down
9 changes: 8 additions & 1 deletion extension/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ project(ParquetExtension)

include_directories(
include ../../third_party/parquet ../../third_party/snappy
../../third_party/miniz ../../third_party/thrift ../../third_party/zstd)
../../third_party/miniz ../../third_party/thrift
../../third_party/zstd/include)

set(PARQUET_EXTENSION_FILES
parquet-extension.cpp parquet_reader.cpp parquet_timestamp.cpp
Expand Down Expand Up @@ -44,3 +45,9 @@ if(NOT CLANG_TIDY)
endif()

add_library(parquet_extension STATIC ${PARQUET_EXTENSION_FILES})

if(NOT CLANG_TIDY)
add_executable(parquetcli parquetcli.cpp)
target_link_libraries(parquetcli parquet_extension)
target_link_libraries(parquetcli duckdb_static)
endif()
19 changes: 12 additions & 7 deletions extension/parquet/column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
#include "zstd.h"
#include <iostream>

#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/common/types/chunk_collection.hpp"
#endif

namespace duckdb {

Expand Down Expand Up @@ -131,10 +134,10 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
}

void ColumnReader::PreparePage(idx_t compressed_page_size, idx_t uncompressed_page_size) {
auto trans = (ThriftFileTransport *)protocol->getTransport().get();
auto &trans = (ThriftFileTransport &)*protocol->getTransport();

block = make_shared<ResizeableBuffer>(compressed_page_size + 1);
trans->read((uint8_t *)block->ptr, compressed_page_size);
trans.read((uint8_t *)block->ptr, compressed_page_size);

shared_ptr<ResizeableBuffer> unpacked_block;
if (chunk->meta_data.codec != CompressionCodec::UNCOMPRESSED) {
Expand Down Expand Up @@ -248,8 +251,8 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, uint8_t *define_out, uint8_t *repeat_out,
Vector &result) {
// we need to reset the location because multiple column readers share the same protocol
auto trans = (ThriftFileTransport *)protocol->getTransport().get();
trans->SetLocation(chunk_read_offset);
auto &trans = (ThriftFileTransport &)*protocol->getTransport();
trans.SetLocation(chunk_read_offset);

idx_t result_offset = 0;
auto to_read = num_values;
Expand Down Expand Up @@ -299,7 +302,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, uint8_t
to_read -= read_now;
}
group_rows_available -= num_values;
chunk_read_offset = trans->GetLocation();
chunk_read_offset = trans.GetLocation();

return num_values;
}
Expand Down Expand Up @@ -364,7 +367,8 @@ string_t StringParquetValueConversion::DictRead(ByteBuffer &dict, uint32_t &offs
}

string_t StringParquetValueConversion::PlainRead(ByteBuffer &plain_data, ColumnReader &reader) {
uint32_t str_len = plain_data.read<uint32_t>();
auto &scr = ((StringColumnReader &)reader);
uint32_t str_len = scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
plain_data.available(str_len);
((StringColumnReader &)reader).VerifyString(plain_data.ptr, str_len);
auto ret_str = string_t(plain_data.ptr, str_len);
Expand All @@ -373,7 +377,8 @@ string_t StringParquetValueConversion::PlainRead(ByteBuffer &plain_data, ColumnR
}

void StringParquetValueConversion::PlainSkip(ByteBuffer &plain_data, ColumnReader &reader) {
uint32_t str_len = plain_data.read<uint32_t>();
auto &scr = ((StringColumnReader &)reader);
uint32_t str_len = scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
plain_data.available(str_len);
plain_data.inc(str_len);
}
Expand Down
13 changes: 12 additions & 1 deletion extension/parquet/include/column_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_statistics.hpp"

#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/storage/statistics/string_statistics.hpp"
#include "duckdb/storage/statistics/numeric_statistics.hpp"
#include "duckdb/common/types/vector.hpp"
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/types/chunk_collection.hpp"
#include "duckdb/common/operator/cast_operators.hpp"
#endif

namespace duckdb {

Expand All @@ -22,6 +25,7 @@ using parquet::format::ColumnChunk;
using parquet::format::FieldRepetitionType;
using parquet::format::PageHeader;
using parquet::format::SchemaElement;
using parquet::format::Type;

typedef std::bitset<STANDARD_VECTOR_SIZE> parquet_filter_t;

Expand Down Expand Up @@ -242,12 +246,19 @@ class StringColumnReader : public TemplatedColumnReader<string_t, StringParquetV
StringColumnReader(LogicalType type_p, const SchemaElement &schema_p, idx_t schema_idx_p, idx_t max_define_p,
idx_t max_repeat_p)
: TemplatedColumnReader<string_t, StringParquetValueConversion>(type_p, schema_p, schema_idx_p, max_define_p,
max_repeat_p) {};
max_repeat_p) {
fixed_width_string_length = 0;
if (schema_p.type == Type::FIXED_LEN_BYTE_ARRAY) {
D_ASSERT(schema_p.__isset.type_length);
fixed_width_string_length = schema_p.type_length;
}
};

void Dictionary(shared_ptr<ByteBuffer> dictionary_data, idx_t num_entries) override;

unique_ptr<string_t[]> dict_strings;
void VerifyString(const char *str_data, idx_t str_len);
idx_t fixed_width_string_length;

protected:
void DictReference(Vector &result) override;
Expand Down
7 changes: 5 additions & 2 deletions extension/parquet/include/parquet_file_metadata_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
//===----------------------------------------------------------------------===//
#pragma once

#include "duckdb/storage/object_cache.hpp" // ObjectCache
#include "parquet_types.h" // parquet::format::FileMetaData
#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/storage/object_cache.hpp"
#endif
#include "parquet_types.h"

namespace duckdb {

Expand Down
22 changes: 17 additions & 5 deletions extension/parquet/include/parquet_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@

#pragma once

#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/common/common.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/common/types/data_chunk.hpp"
#endif
#include "resizable_buffer.hpp"
#include "column_reader.hpp"

Expand All @@ -38,6 +41,7 @@ struct ParquetReaderScanState {
int64_t current_group;
vector<column_t> column_ids;
idx_t group_offset;
unique_ptr<FileHandle> file_handle;
unique_ptr<ColumnReader> root_reader;
unique_ptr<apache::thrift::protocol::TProtocol> thrift_file_proto;

Expand All @@ -51,9 +55,16 @@ struct ParquetReaderScanState {

class ParquetReader {
public:
ParquetReader(ClientContext &context, string file_name, vector<LogicalType> expected_types,
ParquetReader(unique_ptr<FileHandle> file_handle_p, const vector<LogicalType> &expected_types_p,
const string &initial_filename_p = string());
ParquetReader(unique_ptr<FileHandle> file_handle_p)
: ParquetReader(move(file_handle_p), vector<LogicalType>(), string()) {
}

ParquetReader(ClientContext &context, string file_name, const vector<LogicalType> &expected_types_p,
const string &initial_filename = string());
ParquetReader(ClientContext &context, string file_name) : ParquetReader(context, file_name, vector<LogicalType>()) {
ParquetReader(ClientContext &context, string file_name)
: ParquetReader(context, move(file_name), vector<LogicalType>()) {
}
~ParquetReader();

Expand All @@ -63,8 +74,8 @@ class ParquetReader {
shared_ptr<ParquetFileMetadataCache> metadata;

public:
void Initialize(ParquetReaderScanState &state, vector<column_t> column_ids, vector<idx_t> groups_to_read,
TableFilterSet *table_filters);
void InitializeScan(ParquetReaderScanState &state, vector<column_t> column_ids, vector<idx_t> groups_to_read,
TableFilterSet *table_filters);
void Scan(ParquetReaderScanState &state, DataChunk &output);

idx_t NumRows();
Expand All @@ -76,6 +87,7 @@ class ParquetReader {
const parquet::format::FileMetaData *file_meta_data);

private:
void InitializeSchema(const vector<LogicalType> &expected_types_p, const string &initial_filename_p);
bool ScanInternal(ParquetReaderScanState &state, DataChunk &output);

const parquet::format::RowGroup &GetGroup(ParquetReaderScanState &state);
Expand All @@ -88,7 +100,7 @@ class ParquetReader {
}

private:
ClientContext &context;
unique_ptr<FileHandle> file_handle;
};

} // namespace duckdb
3 changes: 3 additions & 0 deletions extension/parquet/include/parquet_statistics.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#pragma once

#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/storage/statistics/base_statistics.hpp"
#endif
#include "parquet_types.h"

namespace duckdb {
Expand Down
2 changes: 1 addition & 1 deletion extension/parquet/include/parquet_timestamp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include "duckdb/common/common.hpp"
#include "duckdb.hpp"

namespace duckdb {

Expand Down
3 changes: 3 additions & 0 deletions extension/parquet/include/parquet_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@

#pragma once

#include "duckdb.hpp"
#ifndef DUCKDB_AMALGAMATION
#include "duckdb/common/common.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/mutex.hpp"
#include "duckdb/common/serializer/buffered_file_writer.hpp"
#include "duckdb/common/types/chunk_collection.hpp"
#endif

#include "parquet_types.h"
#include "thrift/protocol/TCompactProtocol.h"
Expand Down
Loading

0 comments on commit 5d3f17b

Please sign in to comment.