Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Use arrow C++ for CSV parsing in JS
  • Loading branch information
texodus committed Oct 4, 2020
1 parent 5757ad1 commit ec09c74
Show file tree
Hide file tree
Showing 24 changed files with 1,269 additions and 263 deletions.
2 changes: 1 addition & 1 deletion cmake/arrow.txt.in
Expand Up @@ -5,7 +5,7 @@ project(arrow-download NONE)
include(ExternalProject)
ExternalProject_Add(apachearrow
GIT_REPOSITORY https://github.com/apache/arrow.git
GIT_TAG apache-arrow-0.16.0
GIT_TAG apache-arrow-1.0.1
SOURCE_DIR "${CMAKE_BINARY_DIR}/arrow-src"
BINARY_DIR "${CMAKE_BINARY_DIR}/arrow-build"
CONFIGURE_COMMAND ""
Expand Down
31 changes: 25 additions & 6 deletions cmake/arrow/CMakeLists.txt
@@ -1,9 +1,14 @@
set(CMAKE_SHARED_LIBRARY_SUFFIX .so)

set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/builder.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/pretty_print.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_base.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_binary.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_decimal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_dict.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_nested.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_primitive.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_adaptive.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_base.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_binary.cc
Expand All @@ -13,11 +18,14 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_primitive.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_union.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/concatenate.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/dict_internal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/data.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/diff.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/validate.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/buffer.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/chunked_array.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compare.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/device.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/extension_type.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/memory_pool.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/pretty_print.cc
Expand All @@ -34,9 +42,9 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/converter.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/chunker.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_builder.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_decoder.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/options.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/parser.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/reader.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/filesystem.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/localfs.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/mockfs.cc
Expand All @@ -50,17 +58,22 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/json/reader.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/buffered.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/compressed.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/file.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/interfaces.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/memory.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/testing/util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/basic_decimal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_block_counter.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_builders.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_ops.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/compression.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/cpu_info.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/decimal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/future.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/delimiting.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/int_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/io_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/iterator.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/logging.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/key_value_metadata.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/memory.cc
Expand All @@ -70,11 +83,17 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/thread_pool.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/trie.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/utf8.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/value_parsing.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/double-conversion.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/cached-powers.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/diy-fp.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/bignum.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/strtod.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/datetime/tz.cpp
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/dictionary.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/json_integration.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/json_internal.cc
# ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/json_integration.cc
# ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/json_internal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/json_simple.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/message.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/metadata_internal.cc
Expand Down
6 changes: 3 additions & 3 deletions cmake/arrow/config.h
Expand Up @@ -15,9 +15,9 @@
// specific language governing permissions and limitations
// under the License.

#define ARROW_VERSION_MAJOR 0
#define ARROW_VERSION_MINOR 16
#define ARROW_VERSION_PATCH 0
#define ARROW_VERSION_MAJOR 1
#define ARROW_VERSION_MINOR 0
#define ARROW_VERSION_PATCH 1
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH

/* #undef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY */
Expand Down
1 change: 0 additions & 1 deletion cmake/modules/FindFlatbuffers.cmake
Expand Up @@ -42,7 +42,6 @@ if(NOT ${FLATBUFFERS_INCLUDE_DIR})
set(FLATBUFFERS_INCLUDE_DIR /usr/local/include)
endif()

message("${FLATBUFFERS_COMPILER}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FLATBUFFERS REQUIRED_VARS
FLATBUFFERS_INCLUDE_DIR FLATBUFFERS_COMPILER)
29 changes: 22 additions & 7 deletions cmake/modules/FindPyArrow.cmake
Expand Up @@ -24,7 +24,7 @@ execute_process(
"from __future__ import print_function\ntry: import pyarrow; print(' '.join(pyarrow.get_libraries()), end='')\nexcept:pass"
OUTPUT_VARIABLE __pyarrow_libraries)

# And the version
# And the version
execute_process(
COMMAND "${Python_EXECUTABLE}" -c
"from __future__ import print_function\ntry: import pyarrow; print(pyarrow.__version__, end='')\nexcept:pass"
Expand All @@ -45,23 +45,38 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
# windows it's just "arrow.dll"
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY "arrow_python")
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY "arrow")
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND ${PYARROW_VERSION_MAJOR} EQUAL "1")
# Link against pre-built libarrow on MacOS
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python.100.dylib)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow.100.dylib)
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
# Link against pre-built libarrow on MacOS
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python.${PYARROW_VERSION_MINOR}.dylib)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow.${PYARROW_VERSION_MINOR}.dylib)
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
else()
elseif (${PYARROW_VERSION_MAJOR} EQUAL "1")
# linux
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.100)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.100)
else()
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
endif()

set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})

if(PYTHON_PYARROW_INCLUDE_DIR AND PYTHON_PYARROW_LIBRARIES)
set(PYTHON_PYARROW_FOUND 1 CACHE INTERNAL "Python pyarrow found")
endif()


# set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
# else()
# # linux
# set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
# set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})



include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PyArrow REQUIRED_VARS PYTHON_PYARROW_INCLUDE_DIR PYTHON_PYARROW_LIBRARIES PYTHON_PYARROW_LIBRARY_DIR
VERSION_VAR __pyarrow_version)
12 changes: 9 additions & 3 deletions cpp/perspective/CMakeLists.txt
Expand Up @@ -548,7 +548,13 @@ set (SOURCE_FILES

set(PYTHON_SOURCE_FILES ${SOURCE_FILES}
${PSP_PYTHON_SRC}/src/column.cpp
)
)

set(WASM_SOURCE_FILES ${SOURCE_FILES}
${PSP_CPP_SRC}/src/cpp/arrow_csv.cpp
${PSP_CPP_SRC}/src/cpp/vendor/arrow_single_threaded_reader.cpp
)


set (PYTHON_BINDING_SOURCE_FILES
${PSP_PYTHON_SRC}/src/accessor.cpp
Expand All @@ -570,7 +576,7 @@ else()
endif()

if (PSP_WASM_BUILD)
add_library(psp ${SOURCE_FILES})
add_library(psp ${WASM_SOURCE_FILES})
target_compile_definitions(psp PRIVATE PSP_ENABLE_WASM=1)
set_target_properties(psp PROPERTIES COMPILE_FLAGS "${ASYNC_MODE_FLAGS}")
target_link_libraries(psp arrow)
Expand Down Expand Up @@ -663,7 +669,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
endif()
########################
else()
add_library(psp SHARED ${SOURCE_FILES})
add_library(psp SHARED ${WASM_SOURCE_FILES})

# Link perspective against custom-built minimal arrow
target_link_libraries(psp arrow)
Expand Down
60 changes: 60 additions & 0 deletions cpp/perspective/src/cpp/arrow_csv.cpp
@@ -0,0 +1,60 @@
/******************************************************************************
*
* Copyright (c) 2019, the Perspective Authors.
*
* This file is part of the Perspective library, distributed under the terms of
* the Apache License 2.0. The full license can be found in the LICENSE file.
*
*/

#include <perspective/base.h>
#include <perspective/arrow_csv.h>
#include <arrow/util/value_parsing.h>
#include <arrow/io/memory.h>

// This causes build warnings
// https://github.com/emscripten-core/emscripten/issues/8574
#include <perspective/vendor/arrow_single_threaded_reader.h>

namespace perspective {
namespace apachearrow {

/**
 * @brief Parse an in-memory CSV document into an Arrow table.
 *
 * The CSV text is wrapped in an `arrow::io::BufferReader` and read with
 * threading disabled (`read_options.use_threads = false`).
 *
 * @param csv        The CSV text to parse. The reader is constructed directly
 *                   over this string, so it must stay alive for the duration
 *                   of the call.
 * @param is_update  When true, `schema` is installed as the converter's
 *                   explicit column types; when false, `schema` is not
 *                   touched.
 * @param schema     Column name -> Arrow data type. NOTE: when `is_update` is
 *                   true this map is moved from, so the caller's map is left
 *                   in a moved-from state after the call.
 * @return The parsed table. Failure to construct the reader or to read the
 *         table is reported through `PSP_COMPLAIN_AND_ABORT` with the Arrow
 *         status message.
 */
std::shared_ptr<::arrow::Table>
csvToTable(std::string& csv, bool is_update,
    std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>&
        schema) {
    arrow::MemoryPool* pool = arrow::default_memory_pool();
    auto input = std::make_shared<arrow::io::BufferReader>(csv);
    auto read_options = arrow::csv::ReadOptions::Defaults();
    auto parse_options = arrow::csv::ParseOptions::Defaults();
    auto convert_options = arrow::csv::ConvertOptions::Defaults();

    read_options.use_threads = false;

    // Timestamp parsers handed to the converter: ISO-8601 plus a handful of
    // strptime formats.
    // NOTE(review): `\\D` in the second format does not look like a standard
    // strptime directive - confirm this pattern matches the intended input.
    convert_options.timestamp_parsers
        = std::vector<std::shared_ptr<arrow::TimestampParser>>{
            arrow::TimestampParser::MakeISO8601(),
            arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
            arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
            arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
            arrow::TimestampParser::MakeStrptime("%d %m %Y"),
            arrow::TimestampParser::MakeStrptime("%H:%M:%S.%f"),
        };

    if (is_update) {
        // Updates use the supplied column types. This moves out of the
        // caller's map (see the @param note above).
        convert_options.column_types = std::move(schema);
    }

    auto maybe_reader = arrow::csv::TableReader::Make(
        pool, input, read_options, parse_options, convert_options);

    // Check the reader result before dereferencing it, mirroring the handling
    // of the Read() result below, so a construction failure is reported with
    // its status message through the same path.
    if (!maybe_reader.ok()) {
        PSP_COMPLAIN_AND_ABORT(maybe_reader.status().ToString());
    }

    std::shared_ptr<arrow::csv::TableReader> reader = *maybe_reader;

    auto maybe_table = reader->Read();
    if (!maybe_table.ok()) {
        PSP_COMPLAIN_AND_ABORT(maybe_table.status().ToString());
    }
    return *maybe_table;
}

} // namespace apachearrow
} // namespace perspective

0 comments on commit ec09c74

Please sign in to comment.