2 changes: 1 addition & 1 deletion .github/workflows/style-check.yaml
@@ -21,7 +21,7 @@ jobs:
pip install --upgrade black
# Note: black fails when it doesn't have to do anything.
git diff --name-only --no-color --diff-filter=ACM $(git merge-base origin/main HEAD) |
grep -v '\(\.json\|\.csv\|\.ipynb\|\.hpp\.in\|\.ref\|\.example\|\.txt\|\.lock\|\.js\)$' |
grep -v '\(\.json\|\.csv\|\.ipynb\|\.hpp\.in\|\.ref\|\.example\|\.txt\|\.lock\|\.js\|\.properties\)$' |
2>/dev/null xargs black || true
git diff --exit-code

15 changes: 6 additions & 9 deletions .github/workflows/trino.yaml
@@ -12,23 +12,20 @@ jobs:
name: Trino Build
runs-on: ubuntu-20.04
steps:
- name: Clone
run: git clone --depth 1 --branch ${{ github.event.inputs.trino-version }} https://github.com/trinodb/trino.git
- uses: actions/checkout@v3
with:
repository: 'cloudfuse-io/trino'
ref: ${{ github.event.inputs.trino-version }}-patch
- uses: actions/setup-java@v3
with:
distribution: 'zulu'
java-version: 17
cache: 'maven'
- name: Patch
run: |
sed -i '/verifyJvmRequirements()/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java
sed -i '/import static io.trino.server.TrinoSystemRequirements.verifyJvmRequirements;/d' trino/core/trino-main/src/main/java/io/trino/server/Server.java
- name: Build
working-directory: ./trino
run: ./mvnw -pl core/trino-server clean install -DskipTests
run: ./mvnw -pl core/trino-main,core/trino-server clean install -DskipTests
- name: Release
uses: softprops/action-gh-release@v1
with:
tag_name: trino-server-${{ github.event.inputs.trino-version }}
body: Custom build of Trino Server version ${{ github.event.inputs.trino-version }} disabling file descriptor checks
files: trino/core/trino-server/target/trino-server-${{ github.event.inputs.trino-version }}.tar.gz
files: core/trino-server/target/trino-server-${{ github.event.inputs.trino-version }}.tar.gz
2 changes: 1 addition & 1 deletion cli/core.py
@@ -296,4 +296,4 @@ def dockerized(c, engine):
compose = f"docker compose -f {RUNTIME_TFDIR}/{engine}/build/docker-compose.yaml"
c.run(f"{compose} down -v")
c.run(f"{compose} build")
c.run(f"DATA_BUCKET_NAME={bucket_name(c)} {compose} up")
c.run(f"DATA_BUCKET_NAME={bucket_name(c)} {compose} run {engine}")
3 changes: 3 additions & 0 deletions cli/plugins/monitoring.py
@@ -7,6 +7,7 @@
import plugins.spark as spark
import plugins.dremio as dremio
import plugins.dask as dask
import plugins.trino as trino
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
@@ -106,4 +107,6 @@ def run_and_send_twice(example):
run_and_send_twice(dremio.lambda_example)
if "dask" in active_plugins:
run_and_send_twice(dask.lambda_example)
if "trino" in active_plugins:
run_and_send_twice(trino.lambda_example)
time.sleep(300)
23 changes: 23 additions & 0 deletions cli/plugins/trino.py
@@ -0,0 +1,23 @@
"""Trino on AWS Lambda"""

from invoke import task
import core


@task(autoprint=True)
def lambda_example(c, json_output=False, month="01"):
"""SUM(trip_distance) GROUP_BY payment_type with preliminary CREATE EXTERNAL TABLE"""
sql = f"""
CREATE TABLE hive.default.taxi2019{month} (trip_distance REAL, payment_type VARCHAR)
WITH (
external_location = 's3a://{core.bucket_name(c)}/nyc-taxi/2019/{month}/',
format = 'PARQUET'
);

SELECT payment_type, SUM(trip_distance)
FROM hive.default.taxi2019{month}
GROUP BY payment_type;
"""
if not json_output:
print(sql)
return core.run_lambda(c, "trino", sql, json_output=json_output)
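
For reference, here is a minimal sketch of running the same two statements against a local Trino with the `trino` Python client (`pip install trino`); the coordinator address, user name, and bucket name are assumptions, and the actual Lambda handler may execute the SQL differently:

```python
import trino  # pip install trino

BUCKET = "my-data-bucket"  # hypothetical; the CLI derives the real name from core.bucket_name(c)
MONTH = "01"

statements = [
    f"CREATE TABLE hive.default.taxi2019{MONTH} (trip_distance REAL, payment_type VARCHAR) "
    f"WITH (external_location = 's3a://{BUCKET}/nyc-taxi/2019/{MONTH}/', format = 'PARQUET')",
    f"SELECT payment_type, SUM(trip_distance) FROM hive.default.taxi2019{MONTH} GROUP BY payment_type",
]

conn = trino.dbapi.connect(
    host="localhost", port=8080, user="l12n", catalog="hive", schema="default"
)
cur = conn.cursor()
for stmt in statements:
    cur.execute(stmt)      # Trino runs one statement per execute()
    rows = cur.fetchall()  # drain the result so the statement completes
print(rows)                # one row per payment_type with the summed trip_distance
```
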
5 changes: 4 additions & 1 deletion infra/runtime/dremio/build/README.md
@@ -4,4 +4,7 @@
- create a Dremio user and use its credentials to:
- create a source
- start the query
- poll for the resulut
- poll for the result
- By default Dremio tries to discover its private IP and uses it to
  communicate. We want it to loop back on `localhost` instead, hence the
  `registration.publish-host: "localhost"` configuration.
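
As an illustration of that flow, here is a minimal sketch using Dremio's REST API from Python (host, port, credentials, and the trivial query are assumptions; user and source creation are omitted, and the repo's actual handler may differ):

```python
import time

import requests  # pip install requests

BASE = "http://localhost:9047"  # assumed local Dremio coordinator

# Log in with the previously created user and build the auth header
token = requests.post(
    f"{BASE}/apiv2/login", json={"userName": "l12n", "password": "l12n-password"}
).json()["token"]
headers = {"Authorization": f"_dremio{token}"}

# Start the query (the source it reads from must have been created beforehand)
job_id = requests.post(
    f"{BASE}/api/v3/sql", json={"sql": "SELECT 1"}, headers=headers
).json()["id"]

# Poll for the result
while True:
    state = requests.get(f"{BASE}/api/v3/job/{job_id}", headers=headers).json()["jobState"]
    if state in ("COMPLETED", "FAILED", "CANCELED"):
        break
    time.sleep(1)
print(requests.get(f"{BASE}/api/v3/job/{job_id}/results", headers=headers).json())
```
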
22 changes: 22 additions & 0 deletions infra/runtime/trino/.terraform.lock.hcl

Generated file; diff not rendered.

85 changes: 85 additions & 0 deletions infra/runtime/trino/build/Dockerfile
@@ -0,0 +1,85 @@
ARG FUNCTION_DIR="/function"
ARG HADOOP_VERSION=3.2.0
# The SDK version must be the one in the Hadoop package
ARG AWS_JAVA_SDK_VERSION=1.11.375
ARG METASTORE_VERSION=3.0.0
# We use custom builds of trino-server
ARG TRINO_VERSION=378


FROM ubuntu:20.04 as ric-dependency

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
apt-get install -y \
g++ \
make \
cmake \
unzip \
python3 \
python3-pip \
libcurl4-openssl-dev
ARG FUNCTION_DIR
RUN mkdir -p ${FUNCTION_DIR}
RUN pip3 install \
--target ${FUNCTION_DIR} \
awslambdaric
COPY lambda-handler.py ${FUNCTION_DIR}


FROM ubuntu:20.04
ARG HADOOP_VERSION
ARG METASTORE_VERSION
ARG TRINO_VERSION
ARG AWS_JAVA_SDK_VERSION

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
curl \
less \
openjdk-11-jdk \
python3 \
&& rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/
RUN ln -s /usr/bin/python3 /usr/bin/python

# HIVE METASTORE

WORKDIR /opt

ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin
# jars used by Trino
ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar

RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \
# Download from mirror and trim some unused libraries
curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
cd hadoop-${HADOOP_VERSION}/share/hadoop/ && \
rm -r client/* && \
rm -r hdfs/* && \
rm -r mapreduce/* && \
rm -r yarn/* && \
find ./tools/lib -type f -not \( -name "aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar" -or -name "hadoop-aws-${HADOOP_VERSION}.jar" \) -delete

ENV PATH="${HIVE_HOME}/bin:${PATH}"
COPY metastore-site.xml ${HIVE_HOME}/conf

# TRINO

ENV TRINO_HOME=/opt/trino-server-${TRINO_VERSION}
RUN curl -L https://github.com/cloudfuse-io/lambdatization/releases/download/trino-server-${TRINO_VERSION}/trino-server-${TRINO_VERSION}.tar.gz | tar zxf - && \
curl -L https://repo1.maven.org/maven2/io/trino/trino-cli/${TRINO_VERSION}/trino-cli-${TRINO_VERSION}-executable.jar -o ${TRINO_HOME}/bin/trino && \
chmod +x ${TRINO_HOME}/bin/trino
ENV PATH="${TRINO_HOME}/bin:${PATH}"
COPY trino-etc ${TRINO_HOME}/etc

# LAMBDA ENTRYPOINT

ARG FUNCTION_DIR
COPY --from=ric-dependency ${FUNCTION_DIR} ${FUNCTION_DIR}
WORKDIR ${FUNCTION_DIR}
ENTRYPOINT [ "python3", "-m", "awslambdaric" ]
CMD [ "lambda-handler.handler" ]
41 changes: 41 additions & 0 deletions infra/runtime/trino/build/README.md
@@ -0,0 +1,41 @@
# Trino lambdatization tricks

## List of tricks

- Trino loads many plugins by default, which implies opening many jar files in
  parallel. To make sure this process doesn't exceed the system's maximum number
  of file descriptors, it checks the ulimit at startup. The required minimum is
  4096, but AWS Lambda imposes a hard limit of 1024. We had to
  [rebuild](https://github.com/cloudfuse-io/lambdatization/actions/workflows/trino.yaml)
  Trino with a patch that:
  - loads fewer plugins
  - removes the file descriptor check
- Trino, like Dremio, automatically detects its private IP and tries to use it
  for internal connections. We didn't find a knob to disable this behaviour, so
  we had to hardcode it in the patch.
- It seems you cannot query S3 without using the Hive metastore, so we had to
  install a local instance backed by Derby, which adds to the init time (see the
  catalog sketch after this list).
- The container image is huge (>2GB):
  - we are pulling in a full Hadoop distribution, most of which is never used.
    We started removing some libraries from it, but we could probably trim a few
    hundred more MB
  - we could also use a remote Hive metastore (like Glue) instead of installing
    a local one
  - obviously, we could use a smaller base image
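
As an illustration of the metastore wiring, here is a sketch of what the Hive catalog file under `trino-etc/catalog` could look like (`connector.name` and `hive.metastore.uri` are standard Trino Hive connector properties, but the exact file shipped in this image may differ):

```properties
# trino-etc/catalog/hive.properties (illustrative sketch, not the exact file)
connector.name=hive
# standalone metastore started in the same container
hive.metastore.uri=thrift://localhost:9083
# S3 credentials are picked up from the AWS_* environment variables of the Lambda
```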

## Updating Trino version

To change the Trino version, the patch first needs to be rebased onto that version (call it `xxx`):
```bash
git clone https://github.com/cloudfuse-io/trino.git
cd trino
git checkout 378-patch
git checkout -b xxx-patch
git rebase xxx
git push origin xxx-patch
```

Then run the build in the [Trino
workflow](https://github.com/cloudfuse-io/lambdatization/actions/workflows/trino.yaml)
with your new Trino version number `xxx`.
34 changes: 34 additions & 0 deletions infra/runtime/trino/build/docker-compose.yaml
@@ -0,0 +1,34 @@
version: "3.9"
services:
trino:
build: .
image: cloudfuse-io/l12n:trino
cap_drop:
- ALL
read_only: true
volumes:
- trino-tmp:/tmp
user: nobody
entrypoint:
# - bash
- python3
- lambda-handler.py
environment:
- AWS_ACCESS_KEY_ID=$LAMBDA_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY=$LAMBDA_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN=$LAMBDA_SESSION_TOKEN
- AWS_REGION=$L12N_AWS_REGION
- DATA_BUCKET_NAME
networks:
- tmpengine
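# Mirror the 1024 open-file hard limit that AWS Lambda imposes (see README.md)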
ulimits:
nofile:
soft: 1024
hard: 1024

volumes:
trino-tmp:


networks:
tmpengine: