In [2]:
from ls.engine.utils import get_device

In [7]:
# Call the imported helper, requesting the CUDA device at index 7.
get_device(device_id=7)

7 8
[Device] Using CUDA:7 -> NVIDIA A100-SXM4-40GB


device(type='cuda', index=7)

In [8]:
from ls.config.loader import load_config

# Load the configuration from ../configs/config.yaml into `cfg`.
cfg = load_config("../configs/config.yaml")

In [9]:
# Display the hardware section of the loaded config.
cfg.training.hardware

Box({'device_id': 0, 'use_dataparallel': True, 'gpus': '2, 3, 4'})

In [11]:
import os
import torch
import torch.nn as nn


def set_visible_gpus(gpus: str, verbose: bool = True):
    """
    Restrict which GPUs PyTorch can see by setting CUDA_VISIBLE_DEVICES.

    Call this before the first CUDA operation in the process. The variable is
    read when CUDA initializes, so changing it afterwards does not change the
    set of devices an already-initialized process can see; a RuntimeWarning is
    emitted in that case.

    Args:
        gpus (str): Comma-separated GPU indices, e.g., "0,1,2,3". Whitespace
            around entries and empty entries are dropped, so "2, 3, 4" is
            stored as "2,3,4".
        verbose (bool): If True, print the selection info.
    """
    import warnings

    # Normalize the list so values such as "2, 3, 4" (as they may appear in a
    # YAML config) are written without embedded spaces; this avoids relying on
    # how the CUDA driver parses whitespace inside the variable.
    normalized = ",".join(tok.strip() for tok in gpus.split(",") if tok.strip())

    if torch.cuda.is_initialized():
        # Too late for the new value to take effect in this process.
        warnings.warn(
            "CUDA is already initialized; changing CUDA_VISIBLE_DEVICES now "
            "will not affect the devices visible to this process. Call "
            "set_visible_gpus() before any other CUDA call.",
            RuntimeWarning,
            stacklevel=2,
        )

    os.environ["CUDA_VISIBLE_DEVICES"] = normalized
    if verbose:
        print(f"[CUDA] Visible devices set to: {normalized}")

    # Query the device count as a sanity check. Note that this does NOT
    # re-initialize CUDA; it only asks PyTorch how many devices it sees.
    torch.cuda.device_count()

In [12]:
def get_device(device_id: int = 0, verbose: bool = True) -> torch.device:
    """
    Returns the best available device among CUDA, MPS, and CPU.
    Automatically detects hardware availability.

    Preference order is CUDA, then Apple MPS, then CPU. `device_id` is only
    consulted when CUDA is available.

    Args:
        device_id (int): Index of visible CUDA device to use.
        verbose (bool): If True, print chosen device.

    Returns:
        torch.device: torch.device("cuda:<device_id>"|"mps"|"cpu")

    Raises:
        ValueError: If CUDA is available and `device_id` is negative or not
            smaller than the number of visible CUDA devices.
    """
    if torch.cuda.is_available():
        num_devices = torch.cuda.device_count()
        # Reject negative indices here so every out-of-range request fails
        # with the same exception type instead of an error from torch.device.
        if device_id < 0:
            raise ValueError(f"Requested CUDA device {device_id}, but the index must be non-negative.")
        if device_id >= num_devices:
            raise ValueError(f"Requested CUDA device {device_id}, but only {num_devices} available.")
        device = torch.device(f"cuda:{device_id}")
        if verbose:
            print(f"[Device] Using CUDA:{device_id} → {torch.cuda.get_device_name(device_id)}")
        return device

    # torch.backends.mps is missing on torch builds without MPS support;
    # look it up defensively instead of assuming the attribute exists.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
        device = torch.device("mps")
        if verbose:
            print("[Device] Using Apple Metal (MPS) acceleration")
        return device

    device = torch.device("cpu")
    if verbose:
        print("[Device] Using CPU (no GPU backend found)")
    return device

In [13]:
# Resolve the configured device_id to a torch.device and store it on the
# config object, then display it.
# NOTE(review): set_visible_gpus() is not called before this cell, so the
# configured `gpus` value does not appear to be applied here — confirm intent.
cfg.training.hardware.device = get_device(device_id=cfg.training.hardware.device_id)
cfg.training.hardware.device

[Device] Using CUDA:0 → NVIDIA A100-SXM4-40GB


device(type='cuda', index=0)

In [None]:
def setup_model_device(model: nn.Module, device: torch.device, use_dataparallel: bool = False) -> nn.Module:
    """
    Moves the model to the selected device.
    Optionally wraps it with DataParallel for multi-GPU use.

    The model is moved to `device` first. It is wrapped in DataParallel only
    when `device` is a CUDA device and more than one GPU is visible; in that
    case `device` becomes the primary (first) entry of `device_ids` and the
    output device, so the parameters sit on the device DataParallel expects.

    Args:
        model (nn.Module): Model to place on device.
        device (torch.device): Device from get_device().
        use_dataparallel (bool): If True and multiple GPUs visible, wrap model in DataParallel.

    Returns:
        nn.Module: Model ready for training on chosen device(s).
    """
    target = torch.device(device)
    model = model.to(target)

    num_gpus = torch.cuda.device_count()
    # DataParallel only makes sense for a CUDA target; never wrap a CPU/MPS model
    # just because several GPUs happen to be visible.
    if use_dataparallel and target.type == "cuda" and num_gpus > 1:
        # A bare "cuda" device has no index; fall back to the current device.
        primary = target.index if target.index is not None else torch.cuda.current_device()
        # DataParallel requires the module's parameters on device_ids[0], so the
        # requested device must come first.
        device_ids = [primary] + [idx for idx in range(num_gpus) if idx != primary]
        print(f"[Model] Using {num_gpus} GPUs via DataParallel")
        return nn.DataParallel(model, device_ids=device_ids, output_device=primary)

    print(f"[Model] Using single device: {device}")
    return model