#173: Added chunked Parquet reader (#174)
Co-authored-by: jakobbraun <jakob.braun@posteo.de>
morazow and jakobbraun committed Sep 23, 2021
1 parent 5491772 commit 6adaf83
Showing 44 changed files with 520 additions and 383 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
@@ -12,7 +12,7 @@ max_line_length = 120
trim_trailing_whitespace = true

[*.md]
-max_line_length = 120
+max_line_length = 80
trim_trailing_whitespace = false

[Makefile]
17 changes: 0 additions & 17 deletions .github/workflows/broken_link_checker.yml

This file was deleted.

17 changes: 17 additions & 0 deletions .github/workflows/broken_links_checker.yml
@@ -0,0 +1,17 @@
name: Broken Links Checker

on:
  schedule:
    - cron: "0 5 * * *"
  push:

jobs:
  linkChecker:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: gaurav-nelson/github-action-markdown-link-check@v1
        with:
          use-quiet-mode: 'yes'
          use-verbose-mode: 'yes'
          config-file: '.github/workflows/mlc_config.json'
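
The `cron: "0 5 * * *"` schedule runs the check daily at 05:00 UTC, in addition to the run triggered on every push.
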
23 changes: 11 additions & 12 deletions .github/workflows/ci-build.yml
@@ -10,13 +10,10 @@ jobs:
strategy:
fail-fast: false
matrix:
-  include:
-    - scala: 2.12.14
-      exasol-docker-version: 6.2.15-d1
-      sonar-run: false
-    - scala: 2.12.14
-      exasol-docker-version: 7.0.11
-      sonar-run: true
+  scala: [ 2.12.15 ]
+  exasol-docker-version: [ "6.2.16-d1", "7.0.12", "7.1.1" ]
+env:
+  DEFAULT_DOCKER_DB_VERSION: "7.1.1"

steps:
- name: Checkout the Repository
@@ -33,7 +30,7 @@ jobs:
run: |
docker pull exasol/docker-db:${{ matrix.exasol-docker-version }}
docker pull localstack/localstack:0.12.16
-docker pull alluxio/alluxio:2.6.1
+docker pull alluxio/alluxio:2.6.2
- name: Cache Local SBT Dependencies
uses: actions/cache@v2
@@ -57,18 +54,20 @@ jobs:
EXASOL_DOCKER_VERSION: ${{ matrix.exasol-docker-version }}

- name: Upload Coverage Results to Coveralls
-run: sbt coveralls
+if: ${{ env.DEFAULT_DOCKER_DB_VERSION == matrix.exasol-docker-version }}
+run: sbt ++${{ matrix.scala }} coveralls
env:
COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# This required because of the sonarcloud-github-action docker volume mapping.
- name: Prepare for Sonar Cloud Scan
-if: matrix.sonar-run
+if: ${{ env.DEFAULT_DOCKER_DB_VERSION == matrix.exasol-docker-version }}
run: |
-find . -name scoverage.xml -exec sed -i 's#/home/runner/work/cloud-storage-extension/cloud-storage-extension#/github/workspace#g' {} +
+find . -name scoverage.xml -exec sed -i \
+  's#/home/runner/work/cloud-storage-extension/cloud-storage-extension#/github/workspace#g' {} +
- name: Sonar Cloud Scan
-if: matrix.sonar-run
+if: ${{ env.DEFAULT_DOCKER_DB_VERSION == matrix.exasol-docker-version }}
uses: sonarsource/sonarcloud-github-action@master
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3 changes: 3 additions & 0 deletions .github/workflows/mlc_config.json
@@ -0,0 +1,3 @@
{
"aliveStatusCodes": [200, 429]
}
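
Including `429` (Too Many Requests) in `aliveStatusCodes` tells the link checker to treat rate-limited responses as alive, so valid but throttled links do not fail the scheduled check.
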
1 change: 0 additions & 1 deletion CHANGES.md

This file was deleted.

6 changes: 3 additions & 3 deletions build.sbt
@@ -15,15 +15,15 @@ lazy val orgSettings = Seq(
)

lazy val buildSettings = Seq(
-scalaVersion := "2.12.14",
-crossScalaVersions := Seq("2.11.12", "2.12.14")
+scalaVersion := "2.12.15",
+crossScalaVersions := Seq("2.11.12", "2.12.15")
)

lazy val root =
project
.in(file("."))
.settings(moduleName := "exasol-cloud-storage-extension")
-.settings(version := "1.3.1")
+.settings(version := "2.0.0")
.settings(orgSettings)
.settings(buildSettings)
.settings(Settings.projectSettings(scalaVersion))
2 changes: 1 addition & 1 deletion doc/changes/changelog.md
@@ -1,6 +1,6 @@
# Releases

-* [1.3.2](changes_1.3.2.md)
+* [2.0.0](changes_2.0.0.md)
* [1.3.1](changes_1.3.1.md)
* [1.3.0](changes_1.3.0.md)
* [1.2.0](changes_1.2.0.md)
19 changes: 0 additions & 19 deletions doc/changes/changes_1.3.2.md

This file was deleted.

41 changes: 41 additions & 0 deletions doc/changes/changes_2.0.0.md
@@ -0,0 +1,41 @@
# Cloud Storage Extension 2.0.0, released 2021-09-23

Code name: Improved Parquet Reader

## Summary

In this release we add an optimized Parquet file importer. Previous versions read each Parquet file in a single importer process; in this version we improve on that by virtually splitting files into fixed-size chunks that can then be imported by many parallel processes. To use this feature, please do not forget to update to the latest deployment scripts in the user guide.
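
A minimal sketch of the idea (hypothetical code, not the extension's actual API): a file of `fileSize` bytes is mapped to fixed-size `[start, end)` byte intervals, and each interval becomes an independent unit of parallel work.

```scala
object ChunkedReadSketch extends App {
  // Split a file of `fileSize` bytes into fixed-size [start, end) byte
  // intervals that independent importer processes can read in parallel.
  // Illustrative only; this is not code from the extension itself.
  def intervals(fileSize: Long, chunkSize: Long): Seq[(Long, Long)] =
    (0L until fileSize by chunkSize)
      .map(start => (start, math.min(start + chunkSize, fileSize)))

  // A 10-byte file with 4-byte chunks yields three intervals.
  intervals(10L, 4L).foreach(println) // (0,4), (4,8), (8,10)
}
```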

In addition, we added support for connections behind proxies when accessing cloud storage systems and added unified error codes.

## Features

* #173: Added improved chunked Parquet reader
* Added support for connections behind a proxy (PR #172)

## Refactoring

* #113: Added unified error codes

## Dependency Updates

### Compile Dependency Updates

* Added `com.exasol:error-reporting-java:0.4.0`
* Updated `com.exasol:parquet-io-java:1.0.3` to `1.1.0`
* Updated `org.apache.orc:orc-core:1.6.9` to `1.7.0`
* Updated `org.alluxio:alluxio-core-client-hdfs:2.6.1` to `2.6.2`
* Updated `io.grpc:grpc-netty:1.39.0` to `1.41.0`

### Test Dependency Updates

* Updated `com.dimafeng:testcontainers-scala-scalatest:0.39.5` to `0.39.8`
* Updated `com.exasol:exasol-testcontainers:4.0.0` to `5.0.0`
* Updated `org.mockito:mockito-core:3.11.2` to `3.12.4`
* Updated `org.scalatest:scalatest:3.2.9` to `3.2.10`

### Plugin Updates

* Updated `com.eed3si9n:sbt-assembly:1.0.0` to `1.1.0`
* Updated `net.bzzt:sbt-reproducible-builds:0.28` to `0.30`
* Updated `org.scoverage:sbt-scoverage:1.8.2` to `1.9.0`
32 changes: 26 additions & 6 deletions doc/user_guide/user_guide.md
@@ -224,8 +224,12 @@ CREATE OR REPLACE JAVA SET SCRIPT IMPORT_PATH(...) EMITS (...) AS
%jar /buckets/bfsdefault/<BUCKET>/exasol-cloud-storage-extension-<VERSION>.jar;
/

-CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...)
-  EMITS (filename VARCHAR(2000), partition_index VARCHAR(100)) AS
+CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...) EMITS (
+  filename VARCHAR(2000),
+  partition_index VARCHAR(100),
+  start_index DECIMAL(36, 0),
+  end_index DECIMAL(36, 0)
+) AS
%scriptclass com.exasol.cloudetl.scriptclasses.FilesMetadataReader;
%jar /buckets/bfsdefault/<BUCKET>/exasol-cloud-storage-extension-<VERSION>.jar;
/
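
The `start_index` and `end_index` columns are new in this release; they carry the boundaries of the file chunk that each parallel importer process reads (see the `CHUNK_SIZE` parameter below).
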
@@ -255,8 +259,12 @@ CREATE OR REPLACE JAVA SET SCRIPT IMPORT_PATH(...) EMITS (...) AS
%jar /buckets/bfsdefault/<BUCKET>/exasol-cloud-storage-extension-<VERSION>.jar;
/

-CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...)
-  EMITS (filename VARCHAR(2000), partition_index VARCHAR(100)) AS
+CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...) EMITS (
+  filename VARCHAR(2000),
+  partition_index VARCHAR(100),
+  start_index DECIMAL(36, 0),
+  end_index DECIMAL(36, 0)
+) AS
%scriptclass com.exasol.cloudetl.scriptclasses.DockerFilesMetadataReader;
%jar /buckets/bfsdefault/<BUCKET>/exasol-cloud-storage-extension-<VERSION>.jar;
/
@@ -369,6 +377,14 @@ These are optional parameters that have default values.
in the Import SQL statement. Likewise, the default value is `iproc()` in the
Export SQL statement.

#### Import Optional Parameters

The following are optional parameters for import statements.

* ``CHUNK_SIZE`` - It specifies the file chunk size in bytes. The importer
  virtually splits each file into chunks of the specified size and imports
  the chunks in parallel, as the worked example below shows. The default
  value is `67108864` (64 MB).
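
As a worked example of the default setting (a hypothetical snippet, not code from the extension): a 200 MiB file spans four chunks, because the chunk count is the file size divided by `CHUNK_SIZE`, rounded up.

```scala
object ChunkCount extends App {
  val chunkSize = 67108864L                              // default CHUNK_SIZE (64 MB)
  val fileSize  = 209715200L                             // a 200 MiB Parquet file
  val numChunks = (fileSize + chunkSize - 1) / chunkSize // ceiling division
  println(s"$numChunks chunks")                          // prints: 4 chunks
}
```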

#### Export Optional Parameters

These optional parameters only apply to the data export statements.
@@ -415,8 +431,12 @@ scripts.
For example:

```sql
-CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...)
-  EMITS (filename VARCHAR(2000), partition_index VARCHAR(100)) AS
+CREATE OR REPLACE JAVA SCALAR SCRIPT IMPORT_METADATA(...) EMITS (
+  filename VARCHAR(2000),
+  partition_index VARCHAR(100),
+  start_index DECIMAL(36, 0),
+  end_index DECIMAL(36, 0)
+) AS
%jvmoption -DHTTPS_PROXY=http://username:password@10.10.1.10:1180
%scriptclass com.exasol.cloudetl.scriptclasses.FilesMetadataReader;
%jar /buckets/bfsdefault/<BUCKET>/exasol-cloud-storage-extension-<VERSION>.jar;
2 changes: 1 addition & 1 deletion error_code_config.yml
@@ -2,4 +2,4 @@ error-tags:
CSE:
packages:
- com.exasol.cloudetl
-highest-index: 26
+highest-index: 27
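
Bumping `highest-index` from 26 to 27 records that this commit introduces one new `CSE` error code.
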
1 change: 1 addition & 0 deletions project/Compilation.scala
@@ -109,6 +109,7 @@ object Compilation {
val WartremoverFlags: Seq[Wart] = ExtraWartremoverFlags ++ Warts.allBut(
Wart.Any,
Wart.AsInstanceOf,
+Wart.DefaultArguments,
Wart.Equals,
Wart.IsInstanceOf,
Wart.Null,
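
Since these warts are passed to `Warts.allBut`, adding `Wart.DefaultArguments` to the list disables that check, allowing Scala default parameter values in the codebase.
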
16 changes: 8 additions & 8 deletions project/Dependencies.scala
@@ -8,24 +8,24 @@ object Dependencies {

// Runtime dependencies versions
private val ImportExportUDFVersion = "0.2.0"
-private val ParquetIOVersion = "1.0.3"
+private val ParquetIOVersion = "1.1.0"
private val HadoopVersion = "3.3.1"
private val DeltaVersion = "0.7.0"
-private val OrcVersion = "1.6.10"
+private val OrcVersion = "1.7.0"
private val GoogleStorageVersion = "1.9.4-hadoop3"
private val SparkSQLVersion = "3.0.1"
-private val AlluxioCoreHDFSVersion = "2.6.1"
+private val AlluxioCoreHDFSVersion = "2.6.2"

// Test dependencies versions
-private val ScalaTestVersion = "3.2.9"
+private val ScalaTestVersion = "3.2.10"
private val ScalaTestPlusVersion = "1.0.0-M2"
-private val MockitoCoreVersion = "3.11.2"
+private val MockitoCoreVersion = "3.12.4"
private val HamcrestVersion = "2.2"
private val ExasolHamcrestMatcherVersion = "1.4.1"
private val ExasolTestDBBuilderVersion = "3.2.1"
-private val ExasolTestContainersVersion = "4.0.0"
+private val ExasolTestContainersVersion = "5.0.0"
private val TestContainersLocalstackVersion = "1.16.0"
-private val TestContainersScalaVersion = "0.39.5"
+private val TestContainersScalaVersion = "0.39.8"

val Resolvers: Seq[Resolver] = Seq(
"Exasol Releases" at "https://maven.exasol.com/artifactory/exasol-releases"
@@ -65,7 +65,7 @@ object Dependencies {
exclude ("org.apache.logging.log4j", "log4j-slf4j-impl")
exclude ("org.apache.commons", "commons-lang3")
exclude ("org.apache.hadoop", "hadoop-client"),
"io.grpc" % "grpc-netty" % "1.39.0",
"io.grpc" % "grpc-netty" % "1.41.0",
"com.google.cloud.bigdataoss" % "gcs-connector" % GoogleStorageVersion
exclude ("com.google.guava", "guava")
exclude ("org.apache.httpcomponents", "httpclient"),
6 changes: 3 additions & 3 deletions project/plugins.sbt
@@ -13,15 +13,15 @@ addSbtPlugin("org.wartremover" % "sbt-wartremover-contrib" % "1.3.12")
// Adds a `assembly` task to create a fat JAR with all of its
// dependencies
// https://github.com/sbt/sbt-assembly
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.0.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.1.0")

// Adds most common doc api mappings
// https://github.com/ThoughtWorksInc/sbt-api-mappings
addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.0")

// Adds Scala Code Coverage (Scoverage) used during unit tests
// http://github.com/scoverage/sbt-scoverage
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.8.2")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.0")

// Adds SBT Coveralls plugin for uploading Scala code coverage to
// https://coveralls.io
@@ -47,4 +47,4 @@ addSbtPlugin("com.github.cb372" % "sbt-explicit-dependencies" % "0.2.16")

// Adds a `sbt-reproducible-builds` plugin
// https://github.com/raboof/sbt-reproducible-builds
addSbtPlugin("net.bzzt" % "sbt-reproducible-builds" % "0.28")
addSbtPlugin("net.bzzt" % "sbt-reproducible-builds" % "0.30")
2 changes: 1 addition & 1 deletion scripts/ci.sh
@@ -6,7 +6,7 @@ set -o errtrace -o nounset -o pipefail -o errexit
BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )"
cd "$BASE_DIR"

-DEFAULT_SCALA_VERSION=2.12.14
+DEFAULT_SCALA_VERSION=2.12.15

if [[ -z "${SCALA_VERSION:-}" ]]; then
echo "Environment variable SCALA_VERSION is not set."
16 changes: 6 additions & 10 deletions src/it/scala/com/exasol/cloudetl/BaseIntegrationTest.scala
@@ -16,7 +16,7 @@ import org.scalatest.funsuite.AnyFunSuite
trait BaseIntegrationTest extends AnyFunSuite with BeforeAndAfterAll with LazyLogging {
private[this] val JAR_DIRECTORY_PATTERN = "scala-"
private[this] val JAR_NAME_PATTERN = "cloud-storage-extension-"
-private[this] val DEFAULT_EXASOL_DOCKER_IMAGE = "7.0.11"
+private[this] val DEFAULT_EXASOL_DOCKER_IMAGE = "7.1.1"

val network = DockerNamedNetwork("it-tests", true)
val exasolContainer = {
@@ -76,18 +76,17 @@ trait BaseIntegrationTest extends AnyFunSuite with BeforeAndAfterAll with LazyLo
.language(UdfScript.Language.JAVA)
.inputType(UdfScript.InputType.SET)
.emits()
-.bucketFsContent(
-  "com.exasol.cloudetl.scriptclasses.DockerFilesImportQueryGenerator",
-  jarPath
-)
+.bucketFsContent("com.exasol.cloudetl.scriptclasses.DockerFilesImportQueryGenerator", jarPath)
.build()
schema
.createUdfBuilder("IMPORT_METADATA")
.language(UdfScript.Language.JAVA)
.inputType(UdfScript.InputType.SCALAR)
.emits(
new Column("filename", "VARCHAR(2000)"),
new Column("partition_index", "VARCHAR(100)")
new Column("partition_index", "VARCHAR(100)"),
new Column("start_index", "DECIMAL(36, 0)"),
new Column("end_index", "DECIMAL(36, 0)")
)
.bucketFsContent("com.exasol.cloudetl.scriptclasses.DockerFilesMetadataReader", jarPath)
.build()
@@ -108,10 +107,7 @@ trait BaseIntegrationTest extends AnyFunSuite with BeforeAndAfterAll with LazyLo
.language(UdfScript.Language.JAVA)
.inputType(UdfScript.InputType.SET)
.emits()
-.bucketFsContent(
-  "com.exasol.cloudetl.scriptclasses.DockerTableExportQueryGenerator",
-  jarPath
-)
+.bucketFsContent("com.exasol.cloudetl.scriptclasses.DockerTableExportQueryGenerator", jarPath)
.build()
schema
.createUdfBuilder("EXPORT_TABLE")