From a70ec19cc4003dc22162e677e31a625f522523af Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 10:15:31 +0000 Subject: [PATCH 1/9] Feat(trino): first version Works locally if not file descriptor limit is applyed. Stucked with file descriptor limit of lambda (1024). --- cli/core.py | 2 +- cli/plugins/trino.py | 23 +++++ infra/runtime/trino/.terraform.lock.hcl | 22 +++++ infra/runtime/trino/build/Dockerfile | 71 ++++++++++++++ infra/runtime/trino/build/README.md | 3 + infra/runtime/trino/build/docker-compose.yaml | 33 +++++++ infra/runtime/trino/build/lambda-handler.py | 93 +++++++++++++++++++ infra/runtime/trino/build/metastore-site.xml | 24 +++++ .../build/trino-etc/catalog/hive.properties | 5 + .../build/trino-etc/catalog/jmx.properties | 1 + .../trino-etc/catalog/localfile.properties | 3 + .../build/trino-etc/catalog/memory.properties | 1 + .../trino/build/trino-etc/config.properties | 5 + .../runtime/trino/build/trino-etc/jvm.config | 14 +++ .../trino/build/trino-etc/node.properties | 2 + infra/runtime/trino/main.tf | 58 ++++++++++++ infra/runtime/trino/terragrunt.hcl | 41 ++++++++ 17 files changed, 400 insertions(+), 1 deletion(-) create mode 100644 cli/plugins/trino.py create mode 100644 infra/runtime/trino/.terraform.lock.hcl create mode 100644 infra/runtime/trino/build/Dockerfile create mode 100644 infra/runtime/trino/build/README.md create mode 100644 infra/runtime/trino/build/docker-compose.yaml create mode 100644 infra/runtime/trino/build/lambda-handler.py create mode 100644 infra/runtime/trino/build/metastore-site.xml create mode 100644 infra/runtime/trino/build/trino-etc/catalog/hive.properties create mode 100644 infra/runtime/trino/build/trino-etc/catalog/jmx.properties create mode 100644 infra/runtime/trino/build/trino-etc/catalog/localfile.properties create mode 100644 infra/runtime/trino/build/trino-etc/catalog/memory.properties create mode 100644 infra/runtime/trino/build/trino-etc/config.properties create mode 100644 
infra/runtime/trino/build/trino-etc/jvm.config create mode 100644 infra/runtime/trino/build/trino-etc/node.properties create mode 100644 infra/runtime/trino/main.tf create mode 100644 infra/runtime/trino/terragrunt.hcl diff --git a/cli/core.py b/cli/core.py index 322226c..c8cc61b 100644 --- a/cli/core.py +++ b/cli/core.py @@ -296,4 +296,4 @@ def dockerized(c, engine): compose = f"docker compose -f {RUNTIME_TFDIR}/{engine}/build/docker-compose.yaml" c.run(f"{compose} down -v") c.run(f"{compose} build") - c.run(f"DATA_BUCKET_NAME={bucket_name(c)} {compose} up") + c.run(f"DATA_BUCKET_NAME={bucket_name(c)} {compose} run {engine}") diff --git a/cli/plugins/trino.py b/cli/plugins/trino.py new file mode 100644 index 0000000..822640d --- /dev/null +++ b/cli/plugins/trino.py @@ -0,0 +1,23 @@ +"""Trino on AWS Lambda""" + +from invoke import task +import core + + +@task(autoprint=True) +def lambda_example(c, json_output=False, month="01"): + """SUM(trip_distance) GROUP_BY payment_type with preliminary CREATE EXTERNAL TABLE""" + sql = f""" +CREATE TABLE hive.default.taxi2019{month} (trip_distance REAL, payment_type VARCHAR) +WITH ( + external_location = 's3a://{core.bucket_name(c)}/nyc-taxi/2019/{month}/', + format = 'PARQUET' +); + +SELECT payment_type, SUM(trip_distance) +FROM hive.default.taxi2019{month} +GROUP BY payment_type; +""" + if not json_output: + print(sql) + return core.run_lambda(c, "trino", sql, json_output=json_output) diff --git a/infra/runtime/trino/.terraform.lock.hcl b/infra/runtime/trino/.terraform.lock.hcl new file mode 100644 index 0000000..c51eb53 --- /dev/null +++ b/infra/runtime/trino/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "3.75.2" + constraints = "~> 3.0" + hashes = [ + "h1:x0gluX9ZKEmz+JJW3Ut5GgWDFOq/lhs2vkqJ+xt57zs=", + "zh:0e75fb14ec42d69bc46461dd54016bb2487d38da324222cec20863918b8954c4", + "zh:30831a1fe29f005d8b809250b43d09522288db45d474c9d238b26f40bdca2388", + "zh:36163d625ab2999c9cd31ef2475d978f9f033a8dfa0d585f1665f2d6492fac4b", + "zh:48ec39685541e4ddd8ddd196e2cfb72516b87f471d86ac3892bc11f83c573199", + "zh:707b9c8775efd6962b6226d914ab25f308013bba1f68953daa77adca99ff6807", + "zh:72bd9f4609a827afa366c6f119c7dec7d73a35d712dad1457c0497d87bf8d160", + "zh:930e3ae3d0cb152e17ee5a8aee5cb47f7613d6421bc7c22e7f50c19da484a100", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a19bf49b80101a0f0272b994153eeff8f8c206ecc592707bfbce7563355b6882", + "zh:a34b5d2bbaf52285b0c9a8df6258f4789f4d927ff777e126bdc77e7887abbeaa", + "zh:caad6fd5e79eae33e6d74e38c3b15c28a5482f2a1a8ca46cc1ee70089de61adb", + "zh:f2eae988635030de9a088f8058fbcd91e2014a8312a48b16bfd09a9d69d9d6f7", + ] +} diff --git a/infra/runtime/trino/build/Dockerfile b/infra/runtime/trino/build/Dockerfile new file mode 100644 index 0000000..58c34c0 --- /dev/null +++ b/infra/runtime/trino/build/Dockerfile @@ -0,0 +1,71 @@ +ARG FUNCTION_DIR="/function" +ARG HADOOP_VERSION=3.2.0 +ARG METASTORE_VERSION=3.0.0 + + +FROM ubuntu:20.04 as ric-dependency + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y \ + g++ \ + make \ + cmake \ + unzip \ + python3 \ + python3-pip \ + libcurl4-openssl-dev +ARG FUNCTION_DIR +RUN mkdir -p ${FUNCTION_DIR} +RUN pip3 install \ + --target ${FUNCTION_DIR} \ + awslambdaric +COPY lambda-handler.py ${FUNCTION_DIR} + + +FROM ubuntu:20.04 +ARG HADOOP_VERSION +ARG METASTORE_VERSION + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + curl \ + less \ + openjdk-11-jdk \ + python3 \ + && rm -rf /var/lib/apt/lists/* +ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/ +RUN ln 
-s /usr/bin/python3 /usr/bin/python + +# HIVE METASTORE + +WORKDIR /opt + +ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} +ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin +ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar + +RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \ + curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - +ENV PATH="${HIVE_HOME}/bin:${PATH}" +COPY metastore-site.xml ${HIVE_HOME}/conf + +# TRINO + +ENV TRINO_VERSION=378 +ENV TRINO_HOME=/opt/trino-server-${TRINO_VERSION} +RUN curl -L https://repo1.maven.org/maven2/io/trino/trino-server/${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - +RUN curl -L https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar -o ${TRINO_HOME}/bin/trino && \ + chmod +x ${TRINO_HOME}/bin/trino +ENV PATH="${TRINO_HOME}/bin:${PATH}" +COPY trino-etc ${TRINO_HOME}/etc + +# LAMBDA ENTRYPOINT + +ARG FUNCTION_DIR +COPY --from=ric-dependency ${FUNCTION_DIR} ${FUNCTION_DIR} +WORKDIR ${FUNCTION_DIR} +ENTRYPOINT [ "python3", "-m", "awslambdaric" ] +CMD [ "lambda-handler.handler" ] diff --git a/infra/runtime/trino/build/README.md b/infra/runtime/trino/build/README.md new file mode 100644 index 0000000..e33d3a6 --- /dev/null +++ b/infra/runtime/trino/build/README.md @@ -0,0 +1,3 @@ +# Trino lambdatization tricks + +## List of tricks diff --git a/infra/runtime/trino/build/docker-compose.yaml b/infra/runtime/trino/build/docker-compose.yaml new file mode 100644 index 0000000..3fa86c3 --- /dev/null +++ b/infra/runtime/trino/build/docker-compose.yaml @@ -0,0 +1,33 @@ +version: "3.9" +services: + trino: + build: . 
+ image: cloudfuse-io/l12n:trino + cap_drop: + - ALL + read_only: true + volumes: + - trino-tmp:/tmp + entrypoint: + # - bash + - python3 + - lambda-handler.py + environment: + - AWS_ACCESS_KEY_ID=$LAMBDA_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY=$LAMBDA_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN=$LAMBDA_SESSION_TOKEN + - AWS_REGION=$L12N_AWS_REGION + - DATA_BUCKET_NAME + networks: + - tmpengine + ulimits: + nofile: + soft: 1024 + hard: 1024 + +volumes: + trino-tmp: + + +networks: + tmpengine: diff --git a/infra/runtime/trino/build/lambda-handler.py b/infra/runtime/trino/build/lambda-handler.py new file mode 100644 index 0000000..8ec5702 --- /dev/null +++ b/infra/runtime/trino/build/lambda-handler.py @@ -0,0 +1,93 @@ +import logging +import base64 +import os +import tempfile +import time +import sys +import subprocess +from typing import Tuple + +logging.getLogger().setLevel(logging.INFO) + +IS_COLD_START = True + + +def init(): + trino_proc = subprocess.Popen( + ["launcher", "run"], + stdout=sys.stdout, + stderr=subprocess.PIPE, + ) + + subprocess.check_output( + ["schematool", "-initSchema", "-dbType", "derby"], stderr=sys.stderr, cwd="/tmp" + ) + subprocess.Popen( + ["start-metastore"], stdout=sys.stdout, stderr=sys.stderr, cwd="/tmp" + ) + + while True: + log_line = trino_proc.stderr.readline().decode() + print(log_line, flush=True, file=sys.stderr, end="") + if "======== SERVER STARTED ========" in log_line: + return + + +def query(sql: str) -> Tuple[str, str]: + """Run a single SQL query using Trino cli""" + with tempfile.NamedTemporaryFile(prefix="query", delete=False) as tmp: + query_file = tmp.name + tmp.write(sql.encode()) + + cli_proc = subprocess.run( + ["trino", f"--file={query_file}"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + if cli_proc.returncode != 0: + raise Exception(f"Query failed: {cli_proc.stderr.decode()}") + else: + return (cli_proc.stdout.decode(), cli_proc.stderr.decode()) + + +def handler(event, context): + """An AWS Lambda 
handler that runs the provided command with bash and returns the standard output""" + start = time.time() + global IS_COLD_START + is_cold_start = IS_COLD_START + IS_COLD_START = False + if is_cold_start: + init() + src_command = base64.b64decode(event["query"]).decode("utf-8") + + (resp_stdout, resp_stderr) = query(src_command) + + result = { + "resp": resp_stdout, + "logs": resp_stderr, + "parsed_queries": [src_command], + "context": { + "cold_start": is_cold_start, + "handler_duration_sec": time.time() - start, + }, + } + return result + + +if __name__ == "__main__": + query_str = f""" +CREATE TABLE hive.default.taxi201901 (trip_distance REAL, payment_type VARCHAR) +WITH ( + external_location = 's3a://{os.getenv("DATA_BUCKET_NAME")}/nyc-taxi/2019/01/', + format = 'PARQUET' +); + +SELECT payment_type, SUM(trip_distance) +FROM hive.default.taxi201901 +GROUP BY payment_type; +""" + res = handler( + {"query": base64.b64encode(query_str.encode("utf-8"))}, + {}, + ) + print(res) diff --git a/infra/runtime/trino/build/metastore-site.xml b/infra/runtime/trino/build/metastore-site.xml new file mode 100644 index 0000000..6da0781 --- /dev/null +++ b/infra/runtime/trino/build/metastore-site.xml @@ -0,0 +1,24 @@ + + + metastore.thrift.uris + thrift://0.0.0.0:9083 + Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. 
+ + + metastore.task.threads.always + org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask + + + metastore.expression.proxy + org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy + + + metastore.storage.schema.reader.impl + org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader + + + datanucleus.autoCreateSchema + false + Creates necessary schema on a startup, we use init script instead + + diff --git a/infra/runtime/trino/build/trino-etc/catalog/hive.properties b/infra/runtime/trino/build/trino-etc/catalog/hive.properties new file mode 100644 index 0000000..0a60f08 --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/catalog/hive.properties @@ -0,0 +1,5 @@ +connector.name=hive +hive.metastore.uri=thrift://localhost:9083 +hive.storage-format=ORC + +hive.allow-drop-table=true diff --git a/infra/runtime/trino/build/trino-etc/catalog/jmx.properties b/infra/runtime/trino/build/trino-etc/catalog/jmx.properties new file mode 100644 index 0000000..b6e0372 --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/catalog/jmx.properties @@ -0,0 +1 @@ +connector.name=jmx diff --git a/infra/runtime/trino/build/trino-etc/catalog/localfile.properties b/infra/runtime/trino/build/trino-etc/catalog/localfile.properties new file mode 100644 index 0000000..99edd08 --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/catalog/localfile.properties @@ -0,0 +1,3 @@ +connector.name=localfile +trino-logs.http-request-log.location=/tmp/trino/var/log +trino-logs.http-request-log.pattern=http-request.log diff --git a/infra/runtime/trino/build/trino-etc/catalog/memory.properties b/infra/runtime/trino/build/trino-etc/catalog/memory.properties new file mode 100644 index 0000000..833abd3 --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/catalog/memory.properties @@ -0,0 +1 @@ +connector.name=memory diff --git a/infra/runtime/trino/build/trino-etc/config.properties 
b/infra/runtime/trino/build/trino-etc/config.properties new file mode 100644 index 0000000..a11cba3 --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/config.properties @@ -0,0 +1,5 @@ +#single node install config +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +discovery.uri=http://localhost:8080 diff --git a/infra/runtime/trino/build/trino-etc/jvm.config b/infra/runtime/trino/build/trino-etc/jvm.config new file mode 100644 index 0000000..c675bde --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/jvm.config @@ -0,0 +1,14 @@ +-server +-Xmx1G +-XX:-UseBiasedLocking +-XX:+UseG1GC +-XX:G1HeapRegionSize=32M +-XX:+ExplicitGCInvokesConcurrent +-XX:+HeapDumpOnOutOfMemoryError +-XX:+ExitOnOutOfMemoryError +-XX:-OmitStackTraceInFastThrow +-XX:ReservedCodeCacheSize=256M +-XX:PerMethodRecompilationCutoff=10000 +-XX:PerBytecodeRecompilationCutoff=10000 +-Djdk.attach.allowAttachSelf=true +-Djdk.nio.maxCachedBufferSize=2000000 diff --git a/infra/runtime/trino/build/trino-etc/node.properties b/infra/runtime/trino/build/trino-etc/node.properties new file mode 100644 index 0000000..723662f --- /dev/null +++ b/infra/runtime/trino/build/trino-etc/node.properties @@ -0,0 +1,2 @@ +node.environment=docker +node.data-dir=/tmp/trino diff --git a/infra/runtime/trino/main.tf b/infra/runtime/trino/main.tf new file mode 100644 index 0000000..4ef4ced --- /dev/null +++ b/infra/runtime/trino/main.tf @@ -0,0 +1,58 @@ +variable "region_name" {} + +variable "trino_image" {} + +variable "bucket_arn" {} + +module "env" { + source = "../../common/env" +} + +provider "aws" { + region = var.region_name + default_tags { + tags = module.env.default_tags + } +} + +resource "aws_iam_policy" "s3_access" { + name = "${module.env.module_name}-trino-s3-access-${var.region_name}-${module.env.stage}" + + policy = < images.generated.tfvars +EOT + ] + } + + extra_arguments "image_vars" { + commands = ["apply"] + arguments = 
["-var-file=${get_terragrunt_dir()}/images.generated.tfvars"] + } + +} + +inputs = { + region_name = local.region_name + trino_image = "dummy_overriden_by_before_hook" + bucket_arn = dependency.core.outputs.bucket_arn +} From 2f126d3406dd5400adee324a2fefd2294cfd6876 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 10:15:32 +0000 Subject: [PATCH 2/9] fix(ci): black triggered on .properties files --- .github/workflows/style-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/style-check.yaml b/.github/workflows/style-check.yaml index 3591fba..548441e 100644 --- a/.github/workflows/style-check.yaml +++ b/.github/workflows/style-check.yaml @@ -21,7 +21,7 @@ jobs: pip install --upgrade black # Note: black fails when it doesn't have to do anything. git diff --name-only --no-color --diff-filter=ACM $(git merge-base origin/main HEAD) | - grep -v '\(\.json\|\.csv\|\.ipynb\|\.hpp\.in\|\.ref\|\.example\|\.txt\|\.lock\|\.js\)$' | + grep -v '\(\.json\|\.csv\|\.ipynb\|\.hpp\.in\|\.ref\|\.example\|\.txt\|\.lock\|\.js\|\.properties\)$' | 2>/dev/null xargs black || true git diff --exit-code From a9ee40bdedad8f86a5529b9e559ba620795f1c08 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 10:15:32 +0000 Subject: [PATCH 3/9] fix(trino): use custom trino build --- infra/runtime/trino/build/Dockerfile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/infra/runtime/trino/build/Dockerfile b/infra/runtime/trino/build/Dockerfile index 58c34c0..023f442 100644 --- a/infra/runtime/trino/build/Dockerfile +++ b/infra/runtime/trino/build/Dockerfile @@ -1,6 +1,10 @@ ARG FUNCTION_DIR="/function" ARG HADOOP_VERSION=3.2.0 +# The SDK version must be the one in the Hadoop package +ARG AWS_JAVA_SDK_VERSION=1.11.375 ARG METASTORE_VERSION=3.0.0 +# We use custom builds of trino-server +ARG TRINO_VERSION=378 FROM ubuntu:20.04 as ric-dependency @@ -27,6 +31,8 @@ COPY lambda-handler.py ${FUNCTION_DIR} FROM 
ubuntu:20.04 ARG HADOOP_VERSION ARG METASTORE_VERSION +ARG TRINO_VERSION +ARG AWS_JAVA_SDK_VERSION ENV DEBIAN_FRONTEND=noninteractive @@ -45,7 +51,7 @@ WORKDIR /opt ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin -ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar +ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \ curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - @@ -54,9 +60,8 @@ COPY metastore-site.xml ${HIVE_HOME}/conf # TRINO -ENV TRINO_VERSION=378 ENV TRINO_HOME=/opt/trino-server-${TRINO_VERSION} -RUN curl -L https://repo1.maven.org/maven2/io/trino/trino-server/${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - +RUN curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/trino-server-${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - RUN curl -L https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar -o ${TRINO_HOME}/bin/trino && \ chmod +x ${TRINO_HOME}/bin/trino ENV PATH="${TRINO_HOME}/bin:${PATH}" From d1f8b06fe01f5d8c24340fd249e20cb7a79b325a Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 10:15:33 +0000 Subject: [PATCH 4/9] fix(trino): use iterator instead of readline on trino stderr --- infra/runtime/trino/build/lambda-handler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/infra/runtime/trino/build/lambda-handler.py b/infra/runtime/trino/build/lambda-handler.py index 8ec5702..ec8ea91 
100644 --- a/infra/runtime/trino/build/lambda-handler.py +++ b/infra/runtime/trino/build/lambda-handler.py @@ -26,11 +26,12 @@ def init(): ["start-metastore"], stdout=sys.stdout, stderr=sys.stderr, cwd="/tmp" ) - while True: - log_line = trino_proc.stderr.readline().decode() + for line_bytes in trino_proc.stderr: + log_line = line_bytes.decode() print(log_line, flush=True, file=sys.stderr, end="") if "======== SERVER STARTED ========" in log_line: return + raise Exception("Trino server didn't start successfully") def query(sql: str) -> Tuple[str, str]: From 283be9183ae0bdfbbaeaead77ce273d9011b01f8 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 13:15:32 +0000 Subject: [PATCH 5/9] Fix(trino): remove un-necessary plugins --- .github/workflows/trino.yaml | 44 ++++++++++++++++++- infra/runtime/trino/build/Dockerfile | 4 +- infra/runtime/trino/build/README.md | 19 ++++++++ .../build/trino-etc/catalog/jmx.properties | 1 - .../trino-etc/catalog/localfile.properties | 3 -- .../build/trino-etc/catalog/memory.properties | 1 - 6 files changed, 64 insertions(+), 8 deletions(-) delete mode 100644 infra/runtime/trino/build/trino-etc/catalog/jmx.properties delete mode 100644 infra/runtime/trino/build/trino-etc/catalog/localfile.properties delete mode 100644 infra/runtime/trino/build/trino-etc/catalog/memory.properties diff --git a/.github/workflows/trino.yaml b/.github/workflows/trino.yaml index c77e4a6..74c9e80 100644 --- a/.github/workflows/trino.yaml +++ b/.github/workflows/trino.yaml @@ -23,9 +23,51 @@ jobs: run: | sed -i '/verifyJvmRequirements()/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java + cat <> core/trino-server/src/main/provisio/trino.xml + + + + + + + + NOTICE + README.txt + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EOT - name: Build working-directory: ./trino 
- run: ./mvnw -pl core/trino-server clean install -DskipTests + run: ./mvnw -pl core/trino-main,core/trino-server clean install -DskipTests - name: Release uses: softprops/action-gh-release@v1 with: diff --git a/infra/runtime/trino/build/Dockerfile b/infra/runtime/trino/build/Dockerfile index 023f442..b4feb46 100644 --- a/infra/runtime/trino/build/Dockerfile +++ b/infra/runtime/trino/build/Dockerfile @@ -61,8 +61,8 @@ COPY metastore-site.xml ${HIVE_HOME}/conf # TRINO ENV TRINO_HOME=/opt/trino-server-${TRINO_VERSION} -RUN curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/trino-server-${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - -RUN curl -L https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar -o ${TRINO_HOME}/bin/trino && \ +RUN curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/trino-server-${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - && \ + curl -L https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar -o ${TRINO_HOME}/bin/trino && \ chmod +x ${TRINO_HOME}/bin/trino ENV PATH="${TRINO_HOME}/bin:${PATH}" COPY trino-etc ${TRINO_HOME}/etc diff --git a/infra/runtime/trino/build/README.md b/infra/runtime/trino/build/README.md index e33d3a6..5107cbc 100644 --- a/infra/runtime/trino/build/README.md +++ b/infra/runtime/trino/build/README.md @@ -1,3 +1,22 @@ # Trino lambdatization tricks ## List of tricks + +- Trino loads many plugins by default, which implies opening many jar files in + parallel. To make sure this process doesn't exceed the system's maximum number + of file descriptors, it has a check on the ulimit for file descriptor that + cannot be disabled through configuration. The minimum set is 4096 but we have + a hard limit on AWS Lambda at 1024. 
We had to + [rebuild](https://github.com/cloudfuse-io/lambdatization/actions/workflows/trino.yaml) + Trino with a patch that: + - loads less plugins + - removes the check on fileno +- It seems you cannot query S3 without using the Hive metastore, so we had to + install a local version of it running on Derby which adds to the init time. +- The container image is huge (>2GB): + - we are pulling in a full Hadoop distribution, in which most files won't be + used. We could reduce the image size by at least 500MB by cherry picking the + dependencies from it + - we could also use a remote Hive metastore (like Glue) instead of installing + a local one + - obviously, we could use a smaller base image diff --git a/infra/runtime/trino/build/trino-etc/catalog/jmx.properties b/infra/runtime/trino/build/trino-etc/catalog/jmx.properties deleted file mode 100644 index b6e0372..0000000 --- a/infra/runtime/trino/build/trino-etc/catalog/jmx.properties +++ /dev/null @@ -1 +0,0 @@ -connector.name=jmx diff --git a/infra/runtime/trino/build/trino-etc/catalog/localfile.properties b/infra/runtime/trino/build/trino-etc/catalog/localfile.properties deleted file mode 100644 index 99edd08..0000000 --- a/infra/runtime/trino/build/trino-etc/catalog/localfile.properties +++ /dev/null @@ -1,3 +0,0 @@ -connector.name=localfile -trino-logs.http-request-log.location=/tmp/trino/var/log -trino-logs.http-request-log.pattern=http-request.log diff --git a/infra/runtime/trino/build/trino-etc/catalog/memory.properties b/infra/runtime/trino/build/trino-etc/catalog/memory.properties deleted file mode 100644 index 833abd3..0000000 --- a/infra/runtime/trino/build/trino-etc/catalog/memory.properties +++ /dev/null @@ -1 +0,0 @@ -connector.name=memory From 4918ac3e525b1f156490aa29189e58bdbcdbb9c6 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 13:22:37 +0000 Subject: [PATCH 6/9] Fix(ci): use checkout action instead of clone --- .github/workflows/trino.yaml | 15 ++++++++------- 1 file changed, 8 
insertions(+), 7 deletions(-) diff --git a/.github/workflows/trino.yaml b/.github/workflows/trino.yaml index 74c9e80..24d60c7 100644 --- a/.github/workflows/trino.yaml +++ b/.github/workflows/trino.yaml @@ -12,8 +12,10 @@ jobs: name: Trino Build runs-on: ubuntu-20.04 steps: - - name: Clone - run: git clone --depth 1 --branch ${{ github.event.inputs.trino-version }} https://github.com/trinodb/trino.git + - uses: actions/checkout@v3 + with: + repository: 'trinodb/trino' + ref: ${{ github.event.inputs.trino-version }} - uses: actions/setup-java@v3 with: distribution: 'zulu' @@ -21,9 +23,9 @@ jobs: cache: 'maven' - name: Patch run: | - sed -i '/verifyJvmRequirements()/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java - sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java - cat <> core/trino-server/src/main/provisio/trino.xml + sed -i '/verifyJvmRequirements()/d' core/trino-main/src/main/java/io/trino/server/Server.java + sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' core/trino-main/src/main/java/io/trino/server/Server.java + cat < core/trino-server/src/main/provisio/trino.xml @@ -66,11 +68,10 @@ jobs: EOT - name: Build - working-directory: ./trino run: ./mvnw -pl core/trino-main,core/trino-server clean install -DskipTests - name: Release uses: softprops/action-gh-release@v1 with: tag_name: trino-server-${{ github.event.inputs.trino-version }} body: Custom build of Trino Server version ${{ github.event.inputs.trino-version }} disabling file descriptor checks - files: trino/core/trino-server/target/trino-server-${{ github.event.inputs.trino-version }}.tar.gz + files: core/trino-server/target/trino-server-${{ github.event.inputs.trino-version }}.tar.gz From 11e678f62213022051c8bfe62706e1e8634db308 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Thu, 3 Nov 2022 13:34:02 +0000 Subject: [PATCH 7/9] Fix(ci): heredoc 
substitution --- .github/workflows/trino.yaml | 2 +- infra/runtime/trino/build/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/trino.yaml b/.github/workflows/trino.yaml index 24d60c7..35a995d 100644 --- a/.github/workflows/trino.yaml +++ b/.github/workflows/trino.yaml @@ -25,7 +25,7 @@ jobs: run: | sed -i '/verifyJvmRequirements()/d' core/trino-main/src/main/java/io/trino/server/Server.java sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' core/trino-main/src/main/java/io/trino/server/Server.java - cat < core/trino-server/src/main/provisio/trino.xml + cat << 'EOT' > core/trino-server/src/main/provisio/trino.xml diff --git a/infra/runtime/trino/build/README.md b/infra/runtime/trino/build/README.md index 5107cbc..7673c80 100644 --- a/infra/runtime/trino/build/README.md +++ b/infra/runtime/trino/build/README.md @@ -4,9 +4,9 @@ - Trino loads many plugins by default, which implies opening many jar files in parallel. To make sure this process doesn't exceed the system's maximum number - of file descriptors, it has a check on the ulimit for file descriptor that - cannot be disabled through configuration. The minimum set is 4096 but we have - a hard limit on AWS Lambda at 1024. We had to + of file descriptors, it performs a check of the ulimit when starting. The + minimum required is 4096, but unfortunately we have a hard limit on AWS Lambda + at 1024. 
We had to [rebuild](https://github.com/cloudfuse-io/lambdatization/actions/workflows/trino.yaml) Trino with a patch that: - loads less plugins From 103cf374e5985c180776993ebcd3f12cb9c08d57 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Fri, 4 Nov 2022 14:52:04 +0000 Subject: [PATCH 8/9] fix(ci): build from fork --- .github/workflows/trino.yaml | 50 ++---------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/.github/workflows/trino.yaml b/.github/workflows/trino.yaml index 35a995d..e9af09f 100644 --- a/.github/workflows/trino.yaml +++ b/.github/workflows/trino.yaml @@ -14,59 +14,13 @@ jobs: steps: - uses: actions/checkout@v3 with: - repository: 'trinodb/trino' - ref: ${{ github.event.inputs.trino-version }} + repository: 'cloudfuse-io/trino' + ref: ${{ github.event.inputs.trino-version }}-patch - uses: actions/setup-java@v3 with: distribution: 'zulu' java-version: 17 cache: 'maven' - - name: Patch - run: | - sed -i '/verifyJvmRequirements()/d' core/trino-main/src/main/java/io/trino/server/Server.java - sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' core/trino-main/src/main/java/io/trino/server/Server.java - cat << 'EOT' > core/trino-server/src/main/provisio/trino.xml - - - - - - - - NOTICE - README.txt - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - EOT - name: Build run: ./mvnw -pl core/trino-main,core/trino-server clean install -DskipTests - name: Release From d3b70a3b79eac91002885be4459420984eed889c Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Fri, 4 Nov 2022 16:05:25 +0000 Subject: [PATCH 9/9] imp(trino): improved logger and some optimizations The actual fix comes from the the new build for the Trino server --- cli/plugins/monitoring.py | 3 + infra/runtime/dremio/build/README.md | 5 +- infra/runtime/trino/build/Dockerfile | 11 ++- infra/runtime/trino/build/README.md | 23 ++++- infra/runtime/trino/build/docker-compose.yaml | 1 + 
infra/runtime/trino/build/lambda-handler.py | 91 ++++++++++++++++--- infra/runtime/trino/build/metastore-site.xml | 2 +- .../trino/build/trino-etc/config.properties | 1 + 8 files changed, 120 insertions(+), 17 deletions(-) diff --git a/cli/plugins/monitoring.py b/cli/plugins/monitoring.py index 76390ea..ffd6c54 100644 --- a/cli/plugins/monitoring.py +++ b/cli/plugins/monitoring.py @@ -7,6 +7,7 @@ import plugins.spark as spark import plugins.dremio as dremio import plugins.dask as dask +import plugins.trino as trino from datetime import datetime from google.cloud import bigquery from google.oauth2 import service_account @@ -106,4 +107,6 @@ def run_and_send_twice(example): run_and_send_twice(dremio.lambda_example) if "dask" in active_plugins: run_and_send_twice(dask.lambda_example) + if "trino" in active_plugins: + run_and_send_twice(trino.lambda_example) time.sleep(300) diff --git a/infra/runtime/dremio/build/README.md b/infra/runtime/dremio/build/README.md index 5823171..5ae0ebb 100644 --- a/infra/runtime/dremio/build/README.md +++ b/infra/runtime/dremio/build/README.md @@ -4,4 +4,7 @@ - create a Dremio user and use its credentials to: - create a source - start the query - - poll for the resulut + - poll for the result +- By default Dremio tries to discover its private IP and uses that to + communicate. 
We want to loopback on `localhost` instead, hence the + configuration `registration.publish-host: "localhost"` diff --git a/infra/runtime/trino/build/Dockerfile b/infra/runtime/trino/build/Dockerfile index b4feb46..c0d8ed0 100644 --- a/infra/runtime/trino/build/Dockerfile +++ b/infra/runtime/trino/build/Dockerfile @@ -51,10 +51,19 @@ WORKDIR /opt ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin +# jars used by Trino ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \ - curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - + # Download from mirror and trim some unused libraries + curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \ + cd hadoop-${HADOOP_VERSION}/share/hadoop/ && \ + rm -r client/* && \ + rm -r hdfs/* && \ + rm -r mapreduce/* && \ + rm -r yarn/* && \ + find ./tools/lib -type f -not \( -name "aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar" -or -name "hadoop-aws-${HADOOP_VERSION}.jar" \) -delete + ENV PATH="${HIVE_HOME}/bin:${PATH}" COPY metastore-site.xml ${HIVE_HOME}/conf diff --git a/infra/runtime/trino/build/README.md b/infra/runtime/trino/build/README.md index 7673c80..32bd0ed 100644 --- a/infra/runtime/trino/build/README.md +++ b/infra/runtime/trino/build/README.md @@ -11,12 +11,31 @@ Trino with a patch that: - loads less plugins - removes the check on fileno +- Trino, like Dremio, automatically detects its private IP and tries to use it + for internal connections. 
We didn't find a knob to disable this behaviour, so + we had to hardcode it in the patch. - It seems you cannot query S3 without using the Hive metastore, so we had to install a local version of it running on Derby which adds to the init time. - The container image is huge (>2GB): - we are pulling in a full Hadoop distribution, in which most files won't be - used. We could reduce the image size by at least 500MB by cherry picking the - dependencies from it + used. We started removing some libraries from it but we could probably trim + a few more hundreds of MBs - we could also use a remote Hive metastore (like Glue) instead of installing a local one - obviously, we could use a smaller base image + +## Updating Trino version + +To change the Trino version, the patch needs to be applied to that version (xxx): +```bash +git clone cloudfuse-io/trino +cd trino +git checkout 378-patch +git checkout -b xxx-patch +git rebase xxx +git push +``` + +Then run the build in the [Trino +workflow](https://github.com/cloudfuse-io/lambdatization/actions/workflows/trino.yaml) +with your new Trino version number xxx diff --git a/infra/runtime/trino/build/docker-compose.yaml b/infra/runtime/trino/build/docker-compose.yaml index 3fa86c3..fe1a478 100644 --- a/infra/runtime/trino/build/docker-compose.yaml +++ b/infra/runtime/trino/build/docker-compose.yaml @@ -8,6 +8,7 @@ services: read_only: true volumes: - trino-tmp:/tmp + user: nobody entrypoint: # - bash - python3 diff --git a/infra/runtime/trino/build/lambda-handler.py b/infra/runtime/trino/build/lambda-handler.py index ec8ea91..ef80a59 100644 --- a/infra/runtime/trino/build/lambda-handler.py +++ b/infra/runtime/trino/build/lambda-handler.py @@ -4,31 +4,97 @@ import tempfile import time import sys +import threading +import selectors import subprocess from typing import Tuple logging.getLogger().setLevel(logging.INFO) + +class StdLogger: + """A class that efficiently multiplexes std streams into logs""" + + def _start_logging(self): +
while True: + for key, _ in self.selector.select(): + # read1 instead of read to avoid blocking + data = key.fileobj.read1() + if key.fileobj not in self.files: + raise Exception("Unexpected file desc in selector") + with self.lock: + name = self.files[key.fileobj] + if not data: + print(f"{name} - EOS", flush=True, file=sys.stderr) + self.selector.unregister(key.fileobj) + with self.lock: + del self.files[key.fileobj] + else: + lines = data.decode().splitlines() + for line in lines: + print(f"{name} - {line}", flush=True, file=sys.stderr) + with self.lock: + self.logs[name].extend(lines) + + def __init__(self): + self.lock = threading.Lock() + self.files = {} + self.logs = {} + self.selector = selectors.DefaultSelector() + self.thread = threading.Thread(target=self._start_logging, daemon=True) + + def start(self): + """Start consuming registered streams (if any) and logging them""" + self.thread.start() + + def add(self, name: str, file): + """Add a new stream with the given name""" + with self.lock: + self.files[file] = name + self.logs[name] = [] + self.selector.register(file, selectors.EVENT_READ) + + def get(self, name: str) -> str: + """Get the history of the stream for the given name""" + with self.lock: + return "\n".join(self.logs[name]) + + IS_COLD_START = True +STD_LOGGER: StdLogger = None def init(): + global STD_LOGGER + STD_LOGGER = StdLogger() + STD_LOGGER.start() + trino_proc = subprocess.Popen( ["launcher", "run"], - stdout=sys.stdout, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) + STD_LOGGER.add("trino-srv|stdout", trino_proc.stdout) - subprocess.check_output( - ["schematool", "-initSchema", "-dbType", "derby"], stderr=sys.stderr, cwd="/tmp" + schematool_proc = subprocess.Popen( + ["schematool", "-initSchema", "-dbType", "derby"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd="/tmp", ) - subprocess.Popen( - ["start-metastore"], stdout=sys.stdout, stderr=sys.stderr, cwd="/tmp" + STD_LOGGER.add("schematool|stdout",
schematool_proc.stdout) + STD_LOGGER.add("schematool|stderr", schematool_proc.stderr) + if schematool_proc.wait() != 0: + raise Exception("Hive schema seeding failed") + hive_proc = subprocess.Popen( + ["start-metastore"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd="/tmp" ) + STD_LOGGER.add("hive-srv|stdout", hive_proc.stdout) + STD_LOGGER.add("hive-srv|stderr", hive_proc.stderr) for line_bytes in trino_proc.stderr: log_line = line_bytes.decode() - print(log_line, flush=True, file=sys.stderr, end="") + print(f"trino-srv|stderr - {log_line}", flush=True, file=sys.stderr, end="") if "======== SERVER STARTED ========" in log_line: return raise Exception("Trino server didn't start successfully") @@ -40,15 +106,16 @@ def query(sql: str) -> Tuple[str, str]: query_file = tmp.name tmp.write(sql.encode()) - cli_proc = subprocess.run( - ["trino", f"--file={query_file}"], + cli_proc = subprocess.Popen( + ["trino", f"--file={query_file}", "--progress"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - if cli_proc.returncode != 0: - raise Exception(f"Query failed: {cli_proc.stderr.decode()}") - else: - return (cli_proc.stdout.decode(), cli_proc.stderr.decode()) + STD_LOGGER.add("trino-cli|stdout", cli_proc.stdout) + STD_LOGGER.add("trino-cli|stderr", cli_proc.stderr) + if cli_proc.wait() != 0: + raise Exception(f"Query failed: {STD_LOGGER.get('trino-cli|stderr')}") + return (STD_LOGGER.get("trino-cli|stdout"), STD_LOGGER.get("trino-cli|stderr")) def handler(event, context): diff --git a/infra/runtime/trino/build/metastore-site.xml b/infra/runtime/trino/build/metastore-site.xml index 6da0781..a359112 100644 --- a/infra/runtime/trino/build/metastore-site.xml +++ b/infra/runtime/trino/build/metastore-site.xml @@ -1,7 +1,7 @@ metastore.thrift.uris - thrift://0.0.0.0:9083 + thrift://localhost:9083 Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. 
diff --git a/infra/runtime/trino/build/trino-etc/config.properties b/infra/runtime/trino/build/trino-etc/config.properties index a11cba3..6f93799 100644 --- a/infra/runtime/trino/build/trino-etc/config.properties +++ b/infra/runtime/trino/build/trino-etc/config.properties @@ -3,3 +3,4 @@ coordinator=true node-scheduler.include-coordinator=true http-server.http.port=8080 discovery.uri=http://localhost:8080 +log.path=/tmp/trino/var/log/server.log