Dockerfile maintenance for the corretto-emr-dbs-universal-* images.

 * base: comment-only changes (the AWS EMR default Python is now documented
   as 3.7.16; the "Convention for the tags" line change is presumably
   whitespace-only).
 * pyspark: corrects the Source URL, purges /var/cache/yum after the OS
   update, removes the Python build tree once `make altinstall` is done, and
   turns /usr/bin/python3 into a symlink to python${PYTHON_MINOR_VERSION}
   instead of a copy.
 * spark-scala: corrects the Source URL, extends the authors label, purges
   /var/cache/yum after the OS update, and rewrites the sbt installation as
   an &&-chained RUN ending with an `sbt -version` sanity check.

Review notes on the sbt installation step:
 * the repository file is fetched with `curl -fsSL` rather than `curl -kLs`,
   so TLS certificates are verified and an HTTP error fails the build
   instead of being written into /etc/yum.repos.d/;
 * the yum cache is purged in the same layer that installs sbt, matching the
   clean-up this patch adds to the OS update steps.

diff --git a/corretto-emr-dbs-universal-base/Dockerfile b/corretto-emr-dbs-universal-base/Dockerfile
index d27abcf..126a5af 100644
--- a/corretto-emr-dbs-universal-base/Dockerfile
+++ b/corretto-emr-dbs-universal-base/Dockerfile
@@ -1,7 +1,7 @@
 #
 # Source: https://github.com/data-engineering-helpers/dpp-images/tree/main/pyspark-coretto-emr-dbs-universal-base/Dockerfile
 # On Docker Hub: https://hub.docker.com/repository/docker/infrahelpers/dpp/general
-# Convention for the tags of the generated images:
+# Convention for the tags of the generated images:
 # * infrahelpers/dpp:jdk{JDK_VERSION} e.g.:
 # * infrahelpers/dpp:jdk8
 # * infrahelpers/dpp:jdk11
@@ -17,7 +17,7 @@
 # (see the pyspark-py3X/ directories), with specific versions.
 # Note that:
 # * DataBricks uses Python 3.8 internally by default
-# * AWS EMR uses Python 3.7.10 by default
+# * AWS EMR uses Python 3.7.16 by default
 #
 # AWS Corretto / EMR
 # ==================
diff --git a/corretto-emr-dbs-universal-pyspark/Dockerfile b/corretto-emr-dbs-universal-pyspark/Dockerfile
index 9d3fefc..d892c77 100644
--- a/corretto-emr-dbs-universal-pyspark/Dockerfile
+++ b/corretto-emr-dbs-universal-pyspark/Dockerfile
@@ -1,11 +1,11 @@
 #
-# Source: https://github.com/data-engineering-helpers/dpp-images/tree/main/pyspark-py310/Dockerfile
+# Source: https://github.com/data-engineering-helpers/dpp-images/tree/main/corretto-emr-dbs-universal-pyspark/Dockerfile
 # On Docker Hub: https://hub.docker.com/repository/docker/infrahelpers/dpp/general
 # Usual Docker tags:
 # * infrahelpers/dpp:jdk{JDK_VERSION}-python{PYTHON_MINOR_VERSION}
 # * infrahelpers/dpp:jdk{JDK_VERSION}-python{PYTHON_MICRO_VERSION}
 #
-# Image containing python installation, to be accessed by EMR and Databricks (for pyspark)
+# Image containing python installation, to be accessed by EMR and Databricks (for PySpark)
 # See https://github.com/data-engineering-helpers/dpp-images/tree/main/coretto-emr-dbs-universal-base/Dockerfile
 # for more details about the base image (tag: infrahelpers/dpp:jdk{JDK_VERSION})
 #
@@ -28,7 +28,7 @@ ENV PYTHON_MICRO_VERSION=$PYTHON_MICRO_VERSION
 ENV PYSPARK_PYTHON="/databricks/python3/bin/python3"
 
 # Update the OS
-RUN yum -y update && yum clean all
+RUN yum -y update && yum clean all && rm -rf /var/cache/yum
 
 # Install the PYTHON_MICRO_VERSION version of Python
 RUN curl -kLs \
@@ -38,13 +38,14 @@ RUN curl -kLs \
     rm -f Python-${PYTHON_MICRO_VERSION}.tgz && \
     cd Python-${PYTHON_MICRO_VERSION} && \
     ./configure --prefix=/usr --enable-optimizations && \
-    make && \
-    make altinstall
+    make && make altinstall && \
+    cd .. && rm -rf Python-${PYTHON_MICRO_VERSION}
 
 # Set the PYTHON_MICRO_VERSION version of Python as system Python
 # This is what is used by AWS EMR
-RUN cp -f /usr/bin/python${PYTHON_MINOR_VERSION} /usr/bin/python3 && \
-    python3 --version
+RUN cd /usr/bin && \
+    rm -f /usr/bin/python3 && ln -s python${PYTHON_MINOR_VERSION} python3 \
+    && cd .. && python -V && python3 -V
 
 # Install a virtual environment in /databricks/python3
 RUN python3 -mpip install -U pip && python3 -mpip install virtualenv && \
diff --git a/corretto-emr-dbs-universal-spark-scala/Dockerfile b/corretto-emr-dbs-universal-spark-scala/Dockerfile
index 5e77f7f..d97e54f 100644
--- a/corretto-emr-dbs-universal-spark-scala/Dockerfile
+++ b/corretto-emr-dbs-universal-spark-scala/Dockerfile
@@ -1,10 +1,10 @@
 #
-# Source: https://github.com/data-engineering-helpers/dpp-images/tree/main/pyspark-py310/Dockerfile
+# Source: https://github.com/data-engineering-helpers/dpp-images/tree/main/corretto-emr-dbs-universal-spark-scala/Dockerfile
 # On Docker Hub: https://hub.docker.com/repository/docker/infrahelpers/dpp/general
 # Usual Docker tags:
 # * infrahelpers/dpp:jdk8-sbt{SBT_VERSION}
 #
-# Image containing python installation, to be accessed by EMR and Databricks (for spark scala)
+# Image containing python installation, to be accessed by EMR and Databricks (for Spark Scala)
 # See https://github.com/data-engineering-helpers/dpp-images/tree/main/coretto-emr-dbs-universal-base/Dockerfile
 # for more details about the base image (tag: infrahelpers/dpp:jdk{JDK_VERSION})
 #
@@ -12,10 +12,15 @@ FROM infrahelpers/dpp:jdk8
 
 ARG SBT_VERSION
 
-LABEL authors="Antoine Chenon"
+LABEL authors="Antoine Chenon, Denis Arnaud "
 
 # Update the OS
-RUN yum -y update && yum clean all
+RUN yum -y update && yum clean all && rm -rf /var/cache/yum
 
 # Install sbt
-RUN rm -f /etc/yum.repos.d/bintray-rpm.repo; curl -L https://www.scala-sbt.org/sbt-rpm.repo > sbt-rpm.repo; mv sbt-rpm.repo /etc/yum.repos.d/; yum -y install sbt-${SBT_VERSION}-0
\ No newline at end of file
+RUN rm -f /etc/yum.repos.d/bintray-rpm.repo && \
+    curl -fsSL https://www.scala-sbt.org/sbt-rpm.repo -o /etc/yum.repos.d/sbt-rpm.repo && \
+    yum -y install sbt-${SBT_VERSION}-0 && \
+    yum clean all && rm -rf /var/cache/yum && \
+    sbt -version
+