Skip to content
This repository has been archived by the owner on Jan 8, 2024. It is now read-only.

Commit

Permalink
Add support for building NVIDIA UMD/KMD disk
Browse files Browse the repository at this point in the history
Add support to the kokoro job script to generate
a disk that contains the NVIDIA user-mode driver (UMD)
and kernel-mode driver (KMD) modules, together with the
support files required for instances that use an NVIDIA GPU.

The disk is generated from a preprocessed package,
stored in GCS, that contains the NVIDIA kernel headers
and the UMD.

To create an NVIDIA disk the script requires setting
the NVIDIA_DRIVER_VERSION env var. If the var is not
set it will generate a new kernel image with the AMD
modules.

The disk is not self-contained and must be mounted
together with the corresponding kernel disk.

Bug: 231133651
Change-Id: I8547c908bc68fe86877139da962008a7a93c3f9a
  • Loading branch information
npcdoom committed May 17, 2022
1 parent ece8b1b commit a0f1ff9
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 16 deletions.
131 changes: 128 additions & 3 deletions kokoro/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,37 @@ function build_amdgpu_external_module() {
popd
}

#######################################
# Unpacks the NVIDIA driver package and builds/installs its out-of-tree
# kernel modules against the kernel tree in SRC_DIR.
# Globals:
#   KBUILD_OUTPUT, DOCKER_GFILE_DIR, SRC_DIR, MOD_INSTALL_DIR, KERNELRELEASE,
#   MAKE_ARGS (read)
#   NVIDIA_DRIVER_DIR (set, readonly; also read by build_nvidia_modules_disk)
# Arguments:
#   None
# Returns:
#   Non-zero if the kernel source directory cannot be entered; otherwise the
#   status of the final popd.
#######################################
function build_nvidia_external_module() {
  readonly NVIDIA_DRIVER_DIR="${KBUILD_OUTPUT}/nvidia-drivers"
  rm -rf "${NVIDIA_DRIVER_DIR}"
  mkdir -p "${NVIDIA_DRIVER_DIR}"

  tar -xf "${DOCKER_GFILE_DIR}/nvidia-drivers.tar.gz" \
    -C "${NVIDIA_DRIVER_DIR}"/

  local -r ext_kbuild="${KBUILD_OUTPUT}"/nvidia-kernel
  rm -rf "${ext_kbuild}"
  mkdir -p "${ext_kbuild}"

  # Copy kernel module source files. The trailing slash on the source makes
  # rsync copy the directory's contents rather than the directory itself.
  # Both paths are quoted so unusual characters in KBUILD_OUTPUT cannot split
  # them into several arguments.
  rsync -a "${NVIDIA_DRIVER_DIR}/kernel/" "${ext_kbuild}"

  # Do not run make from an unexpected directory if SRC_DIR is unusable.
  pushd "${SRC_DIR}" || return 1
  make -j "$(nproc)" \
    NV_KERNEL_SOURCES="${SRC_DIR}" \
    NV_KERNEL_OUTPUT="${KBUILD_OUTPUT}" \
    NV_KERNEL_MODULES="nvidia nvidia-uvm nvidia-modeset nvidia-drm" \
    M="${ext_kbuild}" "${MAKE_ARGS[@]}"
  local -r mod_install_usr_dir="${MOD_INSTALL_DIR}/usr"
  # NOTE(review): listing the existing module directory looks like a sanity
  # check / debug aid before installing into it -- confirm it is still wanted.
  ls "${mod_install_usr_dir}/lib/modules/${KERNELRELEASE}"
  make -j "$(nproc)" \
    M="${ext_kbuild}" \
    modules_install "${MAKE_ARGS[@]}" \
    INSTALL_MOD_PATH="${mod_install_usr_dir}" \
    INSTALL_MOD_STRIP=1
  popd
}

function build_kernel_rootfs() {
# LINT.IfChange
readonly KROOTFS="${INITRAMFS_INSTALL_DIR}/krootfs.squashfs"
Expand Down Expand Up @@ -297,6 +328,69 @@ pkgdef mpm = {
EOL
}

#######################################
# Assembles the NVIDIA root filesystem (Vulkan ICD manifest, user-space
# libraries plus their symlinks, static assets and the built kernel modules)
# and packs it into a squashfs image.
# Globals:
#   NVIDIA_DRIVER_VERSION, KERNELRELEASE, ARCH, KBUILD_OUTPUT, SRC_DIR,
#   NVIDIA_DRIVER_DIR, MOD_INSTALL_DIR (read)
#   NVIDIA_DISK_NAME, NVIDIA_DISK (set, readonly; NVIDIA_DISK is read by
#   stage_nvidia_artifacts)
# Arguments:
#   None
# Returns:
#   Non-zero if the library directory cannot be entered; otherwise the status
#   of mksquashfs.
#######################################
function build_nvidia_modules_disk() {
  readonly NVIDIA_DISK_NAME="nvidia-disk-${NVIDIA_DRIVER_VERSION}-${KERNELRELEASE}-${ARCH}.squashfs"
  readonly NVIDIA_DISK="${KBUILD_OUTPUT}/${NVIDIA_DISK_NAME}"

  local -r nvidiafs_assets="${SRC_DIR}/stadia/nvrootfs"
  local -r nvidiafs_install_dir="${KBUILD_OUTPUT}/nvidiafs-install"
  local -r nvidiafs_lib_dir="${nvidiafs_install_dir}/usr/lib/x86_64-linux-gnu"
  rm -rf "${NVIDIA_DISK}" "${nvidiafs_install_dir}"
  mkdir "${nvidiafs_install_dir}"
  mkdir "${nvidiafs_install_dir}/etc"
  mkdir -p "${nvidiafs_install_dir}/usr/lib/modules/${KERNELRELEASE}/extra/nvidia"
  mkdir -p "${nvidiafs_lib_dir}"

  # Copy manifest to /etc
  install -D -m "u=rw,go=r,a-s" "${NVIDIA_DRIVER_DIR}/nvidia_icd.json" \
    "${nvidiafs_install_dir}/etc/vulkan/icd.d/nvidia_icd.json"

  # Copy libraries to /usr/lib/x86_64-linux-gnu
  install -D "${NVIDIA_DRIVER_DIR}/lib"* "${nvidiafs_lib_dir}"

  # Create symlinks for all the libraries
  #  - Always create a <name>.so link for each library
  #  - For libraries that contain the NVIDIA_DRIVER_VERSION number
  #    create a symlink that ends in <name>.so.1
  #  - For the rest of the libraries with the pattern .so.<major>.<minor>.<patch>
  #    create a symlink <name>.so.<major> if the `major` > 0.
  pushd "${nvidiafs_lib_dir}" || return 1
  # Snapshot the library list with a glob (never by parsing `ls`) before any
  # link is created, so the links made below are not themselves iterated.
  local -a libs=(lib*)
  local lib stem major
  for lib in "${libs[@]}"; do
    # An unmatched glob stays literal; skip it instead of linking to nothing.
    [[ -e "${lib}" ]] || continue
    stem="${lib%%.*}"

    ln -s "${lib}" "${stem}.so"

    # Literal substring match: the dots in the driver version must not act as
    # regex wildcards.
    if [[ "${lib}" == *"${NVIDIA_DRIVER_VERSION}"* ]]; then
      ln -s "${lib}" "${stem}.so.1"
    else
      # Third dot-separated field, i.e. <major> in <name>.so.<major>.<minor>...
      major=""
      IFS=. read -r _ _ major _ <<<"${lib}"
      # Only purely numeric majors are considered; 10# forces base 10 so a
      # leading zero is not parsed as octal.
      if [[ "${major}" =~ ^[0-9]+$ ]] && (( 10#${major} > 0 )); then
        ln -s "${lib}" "${stem}.so.${major}"
      fi
    fi
  done
  popd

  rsync -a \
    "${nvidiafs_assets}"/ \
    "${nvidiafs_install_dir}"/

  rsync -a \
    "${MOD_INSTALL_DIR}/usr/lib/modules/${KERNELRELEASE}/extra"/ \
    "${nvidiafs_install_dir}/usr/lib/modules/${KERNELRELEASE}/extra/nvidia"/

  # chown everything under the install dir to root:root, then refine as needed.
  # Do not follow symlinks (-h).
  chown -hR 0:0 "${nvidiafs_install_dir}"

  chmod 00755 "${nvidiafs_install_dir}"

  # `+` batches many directories into each chmod invocation.
  find "${nvidiafs_install_dir}" -type d -exec chmod 00755 {} +
  mksquashfs "${nvidiafs_install_dir}" "${NVIDIA_DISK}" \
    -comp xz -no-exports -no-progress -no-recovery -Xbcj x86
}

function build_perf() {
pushd "${SRC_DIR}"/tools/perf
local -r perf_objs="${KBUILD_OUTPUT}"/tools/perf
Expand Down Expand Up @@ -334,7 +428,7 @@ function build_perf_tar_xz() {
"$(dirname "${PERF_TAR_XZ}")/perf-latest.tar.xz"
}

function stage_artifacts() {
function stage_kernel_artifacts() {
readonly artifacts_dir="${DOCKER_ARTIFACTS_DIR}"
readonly mpm_dir="${artifacts_dir}/mpm"

Expand All @@ -354,7 +448,16 @@ function stage_artifacts() {
cp -a "${BOOT_DISK}" "${kernel_disk_mpm_dir}/disk.raw"
}

function build() {
# Publishes the NVIDIA disk image into the artifacts directory twice: once
# under its own file name and once as disk/disk.raw.
# Layout reference:
# https://g3doc.corp.google.com/cloud/network/edge/g3doc/vm_disk.md#creating-an-mpm-disk-package
# Globals:
#   DOCKER_ARTIFACTS_DIR, NVIDIA_DISK (read)
#   artifacts_dir, nvidia_disk_dir (set, readonly)
function stage_nvidia_artifacts() {
  readonly artifacts_dir="${DOCKER_ARTIFACTS_DIR}"
  readonly nvidia_disk_dir="${artifacts_dir}/disk"
  mkdir -p "${nvidia_disk_dir}"

  # First destination keeps the image's original name, second is the
  # canonical disk.raw copy.
  local destination
  for destination in "${artifacts_dir}/" "${nvidia_disk_dir}/disk.raw"; do
    cp -a "${NVIDIA_DISK}" "${destination}"
  done
}

function build_kernel_disk() {
check_env
set_LOCALVERSION_from_buildstamp
download_initramfs_artifacts
Expand All @@ -371,7 +474,29 @@ function build() {
build_linux_tar_xz
build_perf_tar_xz
build_boot_disk
stage_artifacts
stage_kernel_artifacts
}

# Drives the NVIDIA disk build: prepares and builds the kernel, builds the
# NVIDIA external modules, packs the modules disk and stages the artifacts.
# The stages run strictly in the order listed.
function build_nvidia_disk() {
  local -a nvidia_disk_steps=(
    check_env
    set_LOCALVERSION_from_buildstamp
    create_kbuild_output
    finalize_config
    build_bzimage_and_headers
    build_modules
    build_nvidia_external_module
    build_nvidia_modules_disk
    stage_nvidia_artifacts
  )

  local nvidia_disk_step
  for nvidia_disk_step in "${nvidia_disk_steps[@]}"; do
    "${nvidia_disk_step}"
  done
}

# Selects which disk to build from the environment:
#   - NVIDIA_DRIVER_VERSION set and non-empty: build the NVIDIA modules disk.
#   - NVIDIA_DRIVER_VERSION unset or empty: build the kernel disk with the
#     amdgpu modules.
# The `:-` default keeps the unset case working even when the script runs
# with `set -u`.
function build() {
  if [[ -n "${NVIDIA_DRIVER_VERSION:-}" ]]; then
    echo "Building NVIDIA userspace modules and kernel modules disk"
    build_nvidia_disk
  else
    echo "Building kernel disk with amdgpu modules"
    build_kernel_disk
  fi
}

# Script entry point: run the build() dispatcher defined above.
build
27 changes: 18 additions & 9 deletions kokoro/docker/glinux_launcher.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,24 @@ else
exit 1
fi

# Download the amdgpu firmware. Set "AMDGPU_FIRMWARE_URL" in the environment to
# override the default package.
readonly DEFAULT_AMDGPU_FIRMWARE_URL="https://storage.googleapis.com/stadia_kernel_public/amdgpu-firmware/amdgpu-firmware-19.50.tar.gz"
readonly DEFAULT_AMDGPU_FIRMWARE_SHA256="89785ad581781bbdb98902ab82bce95bf12a861b81b84731020d7ba63a1a1533"
if [[ -z "${AMDGPU_FIRMWARE_URL}" ]]; then
AMDGPU_FIRMWARE_URL="${DEFAULT_AMDGPU_FIRMWARE_URL}"
AMDGPU_FIRMWARE_SHA256="${DEFAULT_AMDGPU_FIRMWARE_SHA256}"
# Stage the GPU driver input for the container build.
#   - NVIDIA_DRIVER_PATH set: copy that driver package into the gfile
#     directory and derive NVIDIA_DRIVER_VERSION from its file name.
#   - otherwise: download the amdgpu firmware package.
if [[ -n "${NVIDIA_DRIVER_PATH:-}" ]]; then
  echo "Copying NVIDIA driver package from ${NVIDIA_DRIVER_PATH}."
  cp -- "${NVIDIA_DRIVER_PATH}" "${BUILD_DIR}/gfile/nvidia-drivers.tar.gz"

  # Assign first and mark readonly afterwards so a failing command
  # substitution is not masked by `readonly`.
  nvidia_filename="$(basename -- "${NVIDIA_DRIVER_PATH}")"
  readonly nvidia_filename

  # The version is the <digits>.<digits> run found in the package file name.
  # `|| true` defers the no-match case to the explicit check below.
  NVIDIA_DRIVER_VERSION="$(echo "${nvidia_filename}" \
    | grep -Eo '[0-9]+[.]+[0-9]+' || true)"

  # The version is passed on to the container as NVIDIA_DRIVER_VERSION, and
  # the build script builds the amdgpu kernel disk when that variable is
  # empty. Fail here instead of silently building the wrong disk.
  if [[ -z "${NVIDIA_DRIVER_VERSION}" ]]; then
    echo "Could not determine the NVIDIA driver version from '${nvidia_filename}'." >&2
    exit 1
  fi
  readonly NVIDIA_DRIVER_VERSION
else
  # Download the amdgpu firmware. Set "AMDGPU_FIRMWARE_URL" in the environment
  # to override the default package.
  readonly DEFAULT_AMDGPU_FIRMWARE_URL="https://storage.googleapis.com/stadia_kernel_public/amdgpu-firmware/amdgpu-firmware-19.50.tar.gz"
  readonly DEFAULT_AMDGPU_FIRMWARE_SHA256="89785ad581781bbdb98902ab82bce95bf12a861b81b84731020d7ba63a1a1533"
  if [[ -z "${AMDGPU_FIRMWARE_URL:-}" ]]; then
    AMDGPU_FIRMWARE_URL="${DEFAULT_AMDGPU_FIRMWARE_URL}"
    AMDGPU_FIRMWARE_SHA256="${DEFAULT_AMDGPU_FIRMWARE_SHA256}"
  fi
  download_wget "${AMDGPU_FIRMWARE_URL}" \
    "${BUILD_DIR}/gfile/amdgpu-firmware.tar.gz" "${AMDGPU_FIRMWARE_SHA256}"
fi
download_wget "${AMDGPU_FIRMWARE_URL}" \
"${BUILD_DIR}/gfile/amdgpu-firmware.tar.gz" "${AMDGPU_FIRMWARE_SHA256}"

${ENGINE_BIN} pull gcr.io/stadia-open-source/kernel/debian9:latest
${ENGINE_BIN} run \
Expand All @@ -70,6 +78,7 @@ ${ENGINE_BIN} run \
--env "DOCKER_GFILE_DIR=/workspace/gfile" \
--env "DOCKER_TMP_DIR=/workspace/tmp" \
--env "DOCKER_SRC_DIR=/workspace/src/kernel" \
--env "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \
--net=host \
--privileged=true \
--tty \
Expand Down
18 changes: 14 additions & 4 deletions kokoro/docker/kokoro_launcher.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,24 @@ mv "${KOKORO_GFILE_DIR}" "${KOKORO_NONBUILD_ARTIFACTS}/"

rm -rf "${KOKORO_ARTIFACTS_DIR}"/*

# Move the amdgpu firmware file to its canonical name for the build script.
mv "${KOKORO_NONBUILD_ARTIFACTS}/gfile/amdgpu-firmware-2020.2.tar.gz" \
"${KOKORO_NONBUILD_ARTIFACTS}/gfile/amdgpu-firmware.tar.gz"

# Authenticate to Google Cloud with the Kokoro service-account key stored in
# the keystore under KOKORO_NONBUILD_ARTIFACTS, then run
# `gcloud auth configure-docker`.
# NOTE(review): presumably the second command is what lets the later
# `docker pull` from gcr.io authenticate -- confirm.
gcloud auth activate-service-account --key-file \
"${KOKORO_NONBUILD_ARTIFACTS}/keystore/71274_kokoro_service_key_json"
gcloud auth configure-docker

# Prepare the GPU driver input that matches the requested build:
#   - NVIDIA_DRIVER_VERSION empty or unset: kernel disk with the amdgpu
#     modules; rename the amdgpu firmware file to the canonical name the
#     build script expects.
#   - NVIDIA_DRIVER_VERSION set: NVIDIA driver disk; fetch the matching
#     driver package from GCS into the gfile directory.
if [[ -z ${NVIDIA_DRIVER_VERSION} ]]; then
  mv "${KOKORO_NONBUILD_ARTIFACTS}/gfile/amdgpu-firmware-"*.tar.gz \
    "${KOKORO_NONBUILD_ARTIFACTS}/gfile/amdgpu-firmware.tar.gz"
else
  readonly nvidia_driver_url="gs://yeti_graphics_drivers/nvidia/nvidia-${NVIDIA_DRIVER_VERSION}.tar.gz"
  gsutil cp \
    "${nvidia_driver_url}" \
    "${KOKORO_NONBUILD_ARTIFACTS}/gfile/nvidia-drivers.tar.gz"
fi

docker pull gcr.io/stadia-open-source/kernel/debian9:latest
docker run \
--volume "${TMPDIR}":/workspace/tmp \
Expand All @@ -53,6 +62,7 @@ docker run \
--env "KOKORO_BUILD_NUMBER=${KOKORO_BUILD_NUMBER}" \
--env "KOKORO_JOB_NAME=${KOKORO_JOB_NAME}" \
--env "KOKORO_JOB_TYPE=${KOKORO_JOB_TYPE}" \
--env "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \
--env "RAPID_CANDIDATE_NAME=${RAPID_CANDIDATE_NAME}" \
--env "TMP=/workspace/tmp" \
--net=host \
Expand Down
15 changes: 15 additions & 0 deletions stadia/nvrootfs/usr/lib/systemd/system/modprobe-nvidiagpu.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Unit that runs /usr/libexec/modprobe-nvidiagpu.sh once (Type=oneshot) and
# stays "active" afterwards (RemainAfterExit=yes).
[Unit]
Description=Load nvidiagpu
# Default dependencies are disabled, so ordering is spelled out explicitly
# below: after local-fs.target; before sysinit.target, shutdown.target and
# systemd-modules-load.service.
DefaultDependencies=no
Conflicts=shutdown.target
Before=sysinit.target shutdown.target
Before=systemd-modules-load.service
After=local-fs.target
# Conditions checked before the unit starts: the CAP_SYS_MODULE capability
# and the "nogpu" kernel command line word.
# NOTE(review): "|!nogpu" presumably skips this unit when "nogpu" is present
# on the kernel command line -- confirm against systemd.unit(5).
ConditionCapability=CAP_SYS_MODULE
ConditionKernelCommandLine=|!nogpu

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/libexec/modprobe-nvidiagpu.sh
# Upper bound for the script's run time.
TimeoutSec=90s
19 changes: 19 additions & 0 deletions stadia/nvrootfs/usr/libexec/modprobe-nvidiagpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run depmod, then load the NVIDIA kernel modules in this order:
# nvidia-drm, nvidia-uvm, nvidia. The script's exit status is that of the
# last modprobe.
depmod
for nvidia_module in nvidia-drm nvidia-uvm nvidia; do
  modprobe "${nvidia_module}"
done

0 comments on commit a0f1ff9

Please sign in to comment.