Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions accelerator/accelerator.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,7 +20,7 @@
connection: local
roles:
- accelerator_validation
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Update Repositories/Registries on nodes
ansible.builtin.import_playbook: ../utils/update_user_repo.yml
Expand All @@ -37,7 +38,7 @@
gather_facts: true
roles:
- common
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Perform GPU driver and ROCm installation for AMD Accelerators
hosts: all
Expand Down Expand Up @@ -66,3 +67,11 @@
# - name: Reboot node
# ansible.builtin.reboot:
# tags: nvidia

# Apply the intel role to every host in the inventory.
# Selected with the "intel" tag; an error on any host is fatal for the play.
- name: Install Habana drivers on nodes
  hosts: all
  tags: intel
  any_errors_fatal: true
  gather_facts: true
  roles:
    - role: intel
6 changes: 6 additions & 0 deletions accelerator/roles/accelerator_validation/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@
ansible.builtin.set_fact:
control_plane_os: "{{ ansible_distribution | lower }}"

# Record the lower-cased distribution version as control_plane_os_version.
- name: Saving distribution version of os
ansible.builtin.set_fact:
control_plane_os_version: "{{ ansible_distribution_version | lower }}"

# Run the tasks in include_local_repo_config.yml.
- name: Include local_repo variables
ansible.builtin.include_tasks: include_local_repo_config.yml

# TODO: need a way to differentiate platforms, then run different validations
# Only the AMD validation is included; the Intel Gaudi include below is
# commented out, so validate_intel_gaudi.yml is not run from here.
# NOTE(review): the task name mentions xcat although the task includes
# validate_amd.yml - confirm the name is intended.
- name: Check xcat installation status
ansible.builtin.include_tasks: validate_amd.yml
# ansible.builtin.include_tasks: validate_intel_gaudi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Start from a pessimistic state; later tasks flip these flags to true.
- name: Set default intel gaudi status
  ansible.builtin.set_fact:
    intel_gaudi_input_status: false
    intel_gaudi_config_status: false

# Expose the contents of the software config file as the user_config variable.
- name: Load software_config.json
  ansible.builtin.include_vars:
    name: user_config
    file: "{{ software_config_json_file }}"

# Pull in this role's variable file named after the control plane OS.
- name: Include vars for {{ control_plane_os }}
  ansible.builtin.include_vars:
    file: "{{ role_path }}/vars/{{ control_plane_os }}.yml"

# Flag whether software_config.json requests the Intel Gaudi stack.
# The name is compared for equality, the same way the "Fetch
# intelgaudi_version" task looks the entry up, so an entry that sets this flag
# is always one that lookup can find. Entries without a "name" key are
# skipped instead of raising an undefined-variable error.
- name: Get Intel Gaudi status
  ansible.builtin.set_fact:
    intel_gaudi_input_status: true
  loop: "{{ user_config.softwares | default([]) }}"
  when: item.name | default('') == 'intelgaudi'

# Stop when no earlier task set intel_gaudi_input_status to true.
- name: Failed, Intel Gaudi software stack not present in software_config.json
  when: not intel_gaudi_input_status
  ansible.builtin.fail:
    msg: "{{ intel_gaudi_input_fail_msg }}"

# Confirm that the Gaudi driver packages are present in the offline repository
# before marking the configuration as valid.
- name: Set intel_gaudi_config_status
  when: intel_gaudi_input_status
  block:
    - name: Fetch intelgaudi_version
      ansible.builtin.set_fact:
        intelgaudi_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}"

    - name: Check driver packages inside offline_gaudi_directory
      ansible.builtin.find:
        paths: "{{ offline_gaudi_directory }}"
        patterns: "{{ gaudi_search_pattern }}"
      register: check_driver_packages

    # A find that matches nothing still succeeds, so a rescue section is never
    # entered for it; the missing-package case has to be failed explicitly.
    - name: Failed, Intel Gaudi driver packages not found
      ansible.builtin.fail:
        msg: "{{ intel_gaudi_repo_fail_msg }}"
      when: check_driver_packages.matched == 0

    # Only reached when at least one driver package was found.
    - name: Set intel_gaudi_config_status to true
      ansible.builtin.set_fact:
        intel_gaudi_config_status: true
4 changes: 4 additions & 0 deletions accelerator/roles/accelerator_validation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ rocm_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading ROCM
# Usage: include_local_repo_config.yml
local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."

# Usage: validate_intel_gaudi.yml
# Failure text used when intel_gaudi_input_status was never set to true.
intel_gaudi_input_fail_msg: "Failed, software_config.json does not have the intelgaudi software stack."
# Failure text used when the driver package search reports zero matches.
intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."
4 changes: 4 additions & 0 deletions accelerator/roles/accelerator_validation/vars/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@

# Usage: validate_amd.yml
offline_rocm_directory: "{{ repo_store_path }}/cluster/apt"

# Usage: validate_intel_gaudi.yml
# Directory passed as "paths" to the driver package search.
offline_gaudi_directory: "{{ repo_store_path }}/cluster/{{ control_plane_os }}/{{ control_plane_os_version }}/deb"
# File name pattern passed as "patterns" to the driver package search.
gaudi_search_pattern: "habanalabs*.deb"
56 changes: 56 additions & 0 deletions accelerator/roles/intel/tasks/install_ubuntu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Sets the node_has_accelerator fact consumed by the block below.
- name: Check if accelerator is present on node
  ansible.builtin.include_tasks: verify_has_accelerators.yml

# Install the Habana driver stack on nodes where a Gaudi device was detected.
# The pre-flight checks (kernel version, Secure Boot) run before any package
# is installed, so a node that fails them is rejected without being modified.
- name: Install drivers on Gaudi nodes
  when: node_has_accelerator
  block:
    - name: Gather package facts
      ansible.builtin.package_facts:
        manager: auto

    - name: Check if kernel supported
      ansible.builtin.fail:
        msg: "Kernel not supported"
      when: ansible_kernel is version('5.4.0', '<')

    # The exit status is ignored on purpose: only the captured output decides
    # the outcome, so a node where the command prints nothing is let through.
    - name: Get Secure Boot Status
      ansible.builtin.shell: |
        set -o pipefail
        mokutil --sb-state | grep SecureBoot
      args:
        executable: /bin/bash
      register: sb_out
      changed_when: false
      failed_when: false

    # Explicit failure so the operator sees why the node was rejected.
    - name: Fail when Secure Boot is enabled
      ansible.builtin.fail:
        msg: "Secure Boot is enabled on this node. Disable Secure Boot before installing the Habana drivers."
      when: "'enabled' in (sb_out.stdout | default(''))"

    - name: Base dependencies
      ansible.builtin.apt:
        name: "{{ intel_apt_base_packages | list }}"
        state: latest
        update_cache: true

    - name: Update apt and install habanalabs dependencies
      ansible.builtin.apt:
        name: "{{ intel_habana_packages | list }}"
        update_cache: true

    - name: Add Gaudi kernel modules
      community.general.modprobe:
        name: "{{ item }}"
        state: present
      loop: "{{ intel_gaudi_kernel_module_to_load }}"
20 changes: 20 additions & 0 deletions accelerator/roles/intel/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Run the Ubuntu installer only on compute nodes reporting Ubuntu 22.04.
- name: Install for ubuntu
  when: compute_os == "ubuntu" and compute_os_version == "22.04"
  ansible.builtin.include_tasks: install_ubuntu.yml
33 changes: 33 additions & 0 deletions accelerator/roles/intel/tasks/verify_has_accelerators.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Publish node_has_accelerator: false by default, true when lspci lists a
# device matching intel_gaudi_device_pattern.
- name: Initialise node accelerator status
  ansible.builtin.set_fact:
    node_has_accelerator: false

# The task never fails and never reports a change; only the captured output
# is inspected by the next task.
- name: Check node accelerator status
  args:
    executable: /bin/bash
  ansible.builtin.shell: |
    set -o pipefail
    lspci | grep -i "{{ intel_gaudi_device_pattern }}"
  failed_when: false
  changed_when: false
  register: lspci_output

# Any non-empty output means at least one matching device line was printed.
- name: Update node accelerator status
  when: lspci_output.stdout | length > 0
  ansible.builtin.set_fact:
    node_has_accelerator: true
53 changes: 53 additions & 0 deletions accelerator/roles/intel/vars/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Text searched for (case-insensitively) in lspci output to detect a Gaudi device.
intel_gaudi_device_pattern: "Processing accelerators: Habana Labs Ltd."

# Kernel modules loaded with modprobe by the Ubuntu install tasks.
intel_gaudi_kernel_module_to_load:
- habanalabs
- habanalabs_cn
- habanalabs_en

# TODO: move to a central config file
# Habana packages installed through apt by the Ubuntu install tasks.
intel_habana_packages:
- habanalabs-dkms
- habanalabs-firmware
- habanalabs-firmware-tools
- habanalabs-graph
- habanalabs-qual
- habanalabs-rdma-core
- habanalabs-thunk
- habanatools

# TODO: move to a central config file
# Base apt packages installed before the Habana packages. The two kernel
# specific entries are templated with the running kernel release and quoted
# so the Jinja expression is always read as a plain string.
intel_apt_base_packages:
  - cmake
  - curl
  - dkms
  - ethtool
  - gcc
  - iproute2
  - libbz2-dev
  - libelf-dev
  - libibverbs-dev
  - liblzma-dev
  - librdmacm-dev
  - "linux-headers-{{ ansible_kernel }}"
  - "linux-modules-extra-{{ ansible_kernel }}"
  - lsof
  - moreutils
  - numactl
  - unzip
  - wget
- wget
67 changes: 67 additions & 0 deletions accelerator/tests/test_Gaudi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Verify Gaudi installation

# Runs test_Gaudi_validation.yml against the inventory named in the test vars.
# The play carries every tag it forwards to the validation playbook, so
# selecting any one of TC_001..TC_004 (including TC_002) runs this play.
- name: OMNIA_Gaudi_TC_001
  tags: TC_001, TC_002, TC_003, TC_004
  hosts: localhost
  connection: local
  vars_files:
    - 'test_vars/test_Gaudi_vars.yml'
  tasks:
    - name: Validate Gaudi Driver Version
      ansible.builtin.command:
        argv:
          - "ansible-playbook"
          - "test_Gaudi_validation.yml"
          - "-i"
          - "{{ inventory }}"
          - "-t"
          - "TC_001,TC_002,TC_003,TC_004"

# Verify hl-qual

# Runs test_Gaudi_hlqual_validation.yml against the inventory named in the
# test vars.
- name: OMNIA_Gaudi_TC_007
  hosts: localhost
  connection: local
  tags: TC_007
  vars_files:
    - "test_vars/test_Gaudi_vars.yml"
  tasks:
    - name: Verify hl-qual
      ansible.builtin.command:
        argv: ["ansible-playbook", "test_Gaudi_hlqual_validation.yml", "-i", "{{ inventory }}"]

# Verify HCCL

# Runs test_Gaudi_hccl_validation.yml against the inventory named in the
# test vars.
- name: OMNIA_Gaudi_TC_008
  hosts: localhost
  connection: local
  tags: TC_008
  vars_files:
    - "test_vars/test_Gaudi_vars.yml"
  tasks:
    - name: Verify HCCL
      ansible.builtin.command:
        argv: ["ansible-playbook", "test_Gaudi_hccl_validation.yml", "-i", "{{ inventory }}"]
Loading