# Dockerfile: Apache Flink 1.18.1 image with extra CLI tools and the Parquet, Hive, Iceberg and S3/Hadoop JARs added.
# Base image: Apache Flink 1.18.1 (Scala 2.12 / Java 11 variant, per the tag).
FROM apache/flink:1.18.1-scala_2.12-java11
# Run all shell-form instructions through bash rather than the default /bin/sh;
# the JAR-install steps further down use the bash builtins pushd/popd.
SHELL ["/bin/bash", "-c"]
# Install some useful tools.
# Installation and cleanup happen in ONE RUN on purpose: files removed in a later
# layer still take up space in the layer that created them, so the apt lists and
# package cache must be deleted in the same layer that downloaded them.
# --no-install-recommends keeps optional "recommended" packages out of the image.
# No build-only packages are installed here, so there is nothing to purge by name;
# autoremove only drops dependencies that are no longer needed.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        lnav \
        openjdk-11-jdk \
        sudo \
        tree \
        unzip \
        vim && \
    apt-get autoremove -y --purge && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Install the DuckDB CLI (v0.10.0) into /usr/local/bin.
# The release archive is chosen from the CPU architecture reported by `uname -m`,
# so the binary matches the image being built instead of always being amd64.
# An unrecognised architecture aborts the build rather than installing a binary
# that cannot run.
# NOTE(review): assumes the v0.10.0 release publishes duckdb_cli-linux-aarch64.zip
# alongside duckdb_cli-linux-amd64.zip — confirm against the release page.
RUN case "$(uname -m)" in \
        x86_64) duckdb_arch=amd64 ;; \
        aarch64 | arm64) duckdb_arch=aarch64 ;; \
        *) echo "Unsupported architecture for DuckDB CLI: $(uname -m)" >&2; exit 1 ;; \
    esac && \
    duckdb_zip="duckdb_cli-linux-${duckdb_arch}.zip" && \
    wget -nv "https://github.com/duckdb/duckdb/releases/download/v0.10.0/${duckdb_zip}" && \
    unzip "${duckdb_zip}" -d /usr/local/bin && \
    rm "${duckdb_zip}"
# From here on, instructions run as the flink user with /opt/flink as the working directory.
USER flink
WORKDIR /opt/flink

# Hive configuration, copied in with flink as the owner
COPY --chown=flink conf/hive-site.xml ./conf/hive-site.xml

# Ship a pre-seeded SQL history file
COPY --chown=flink flink-sql-history .flink-sql-history

# Rewrite jobmanager.rpc.address from localhost to flink-jobmanager so that the
# SQL Client can find the job manager when it is run from this image
RUN sed --in-place \
        --expression='s/jobmanager.rpc.address: localhost/jobmanager.rpc.address: flink-jobmanager/g' \
        ./conf/flink-conf.yaml
# Install JARs
# The S3 filesystem JAR is copied from ./opt into its own subdirectory under ./plugins.
RUN echo "Add Flink S3 Plugin" && \
    s3_plugin_dir=./plugins/s3-fs-hadoop && \
    mkdir "${s3_plugin_dir}" && \
    cp ./opt/flink-s3-fs-hadoop-1.18.1.jar "${s3_plugin_dir}/"
# Download the Flink SQL Parquet format JAR into ./lib/formats.
# --fail makes curl exit non-zero on an HTTP error (e.g. 404) so the build stops,
# instead of silently saving the server's error page under the .jar file name.
# --location follows redirects; --silent --show-error hides the progress meter but keeps errors.
RUN echo "-> Install JARs: Flink Parquet support" && \
    mkdir -p ./lib/formats && pushd ./lib/formats && \
    curl --fail --silent --show-error --location --remote-name \
        https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-parquet/1.18.1/flink-sql-parquet-1.18.1.jar && \
    popd
# Download Flink's Hive SQL connector JAR into ./lib/hive.
# --fail makes curl exit non-zero on an HTTP error (e.g. 404) so the build stops,
# instead of silently saving the server's error page under the .jar file name.
RUN echo "-> Install JARs: Flink's Hive connector" && \
    mkdir -p ./lib/hive && pushd ./lib/hive && \
    curl --fail --silent --show-error --location --remote-name \
        https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.3_2.12/1.18.1/flink-sql-connector-hive-3.1.3_2.12-1.18.1.jar && \
    popd
# Download the Iceberg Flink runtime JAR into ./lib/iceberg.
# --fail makes curl exit non-zero on an HTTP error (e.g. 404) so the build stops,
# instead of silently saving the server's error page under the .jar file name.
RUN echo "-> Install JARs: Dependencies for Iceberg" && \
    mkdir -p ./lib/iceberg && pushd ./lib/iceberg && \
    curl --fail --silent --show-error --location --remote-name \
        https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.18/1.5.0/iceberg-flink-runtime-1.18-1.5.0.jar && \
    popd
# Download the hadoop-aws and AWS SDK bundle JARs into ./lib/aws.
# Each JAR is fetched by its own curl call so a failed download stops the && chain;
# --fail makes curl exit non-zero on an HTTP error (e.g. 404) instead of silently
# saving the server's error page under the .jar file name.
RUN echo "-> Install JARs: AWS / Hadoop S3" && \
    mkdir -p ./lib/aws && pushd ./lib/aws && \
    curl --fail --silent --show-error --location --remote-name \
        https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
    curl --fail --silent --show-error --location --remote-name \
        https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.648/aws-java-sdk-bundle-1.12.648.jar && \
    popd
# Download the Hadoop JARs and their supporting libraries into ./lib/hadoop.
# The loop walks a list of paths relative to the Maven repository root and fetches
# each with a separate curl call; `|| exit 1` aborts the build on the first failure.
# --fail makes curl exit non-zero on an HTTP error (e.g. 404) instead of silently
# saving the server's error page under the .jar file name.
RUN echo "-> Install JARs: Hadoop" && \
    mkdir -p ./lib/hadoop && pushd ./lib/hadoop && \
    for jar_path in \
        org/apache/commons/commons-configuration2/2.1.1/commons-configuration2-2.1.1.jar \
        commons-logging/commons-logging/1.1.3/commons-logging-1.1.3.jar \
        org/apache/hadoop/hadoop-auth/3.3.4/hadoop-auth-3.3.4.jar \
        org/apache/hadoop/hadoop-common/3.3.4/hadoop-common-3.3.4.jar \
        org/apache/hadoop/thirdparty/hadoop-shaded-guava/1.1.1/hadoop-shaded-guava-1.1.1.jar \
        org/codehaus/woodstox/stax2-api/4.2.1/stax2-api-4.2.1.jar \
        com/fasterxml/woodstox/woodstox-core/5.3.0/woodstox-core-5.3.0.jar \
        org/apache/hadoop/hadoop-hdfs-client/3.3.4/hadoop-hdfs-client-3.3.4.jar \
        org/apache/hadoop/hadoop-mapreduce-client-core/3.3.4/hadoop-mapreduce-client-core-3.3.4.jar \
    ; do \
        curl --fail --silent --show-error --location --remote-name \
            "https://repo1.maven.org/maven2/${jar_path}" || exit 1; \
    done && \
    popd
# Launch command, written in exec (JSON) form with the shell spelled out:
# run start-cluster.sh and, if it succeeds, sleep indefinitely
# (presumably to keep the container alive once the script has returned).
CMD ["/bin/bash", "-c", "./bin/start-cluster.sh && sleep infinity"]