# Train with PyTorch Lightning

Description: Train a PyTorch Lightning model on multiple nodes with multiple GPUs per node using Azure Machine Learning.

In [None]:
from azureml.core import Workspace

# Load the Azure ML workspace via Workspace.from_config(); `ws` is reused
# later in the notebook when the experiment is submitted.
ws = Workspace.from_config()
# Bare trailing expression so the notebook displays the workspace object.
ws

In [None]:
# Training code: the directory and entry script handed to ScriptRunConfig.
source_dir, script_name = ".", "train.py"

# Conda specification file the run environment is created from.
environment_file = "environment.yml"

# Azure ML names: environment, experiment, and compute target.
environment_name, experiment_name, compute_name = (
    "pt-lightning",
    "pt-lightning-tutorial",
    "two-gpu-cluster",
)

In [None]:
%%writefile environment.yml

# Conda channel(s) that conda packages are resolved from.
channels:
  - conda-forge
dependencies:
  # Python interpreter version for the environment.
  - python=3.7
  - pip
  # Everything under this key is installed with pip rather than conda.
  - pip:
    - azureml-defaults
    # Pinned torch / torchvision / pytorch-lightning versions.
    # NOTE(review): presumably chosen as a mutually compatible set for train.py — confirm before bumping any one of them.
    - torch==1.10
    - torchvision==0.11.1
    - pytorch-lightning==1.5.4
    # NOTE(review): presumably used by train.py for metric logging — confirm.
    - mlflow
    - azureml-mlflow

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DockerConfiguration

# Create the Azure ML environment named `environment_name` from the conda
# specification file.
env = Environment.from_conda_specification(
    name=environment_name,
    file_path=environment_file,
)

# Docker base image the environment is layered on.
env.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04"

# Runtime Docker settings, passed to ScriptRunConfig when the run is configured.
docker_config = DockerConfiguration(use_docker=True)

### Multi-node multi-GPU training



In [None]:
import os
from azureml.core import ScriptRunConfig, Experiment

from azureml.core.runconfig import  MpiConfiguration

# Cluster shape for this run. Defined once so the MPI job configuration and
# the arguments passed to the training script cannot drift apart.
node_count = 2
gpus_per_node = 2

# MPI job configuration spanning `node_count` nodes.
distr_config = MpiConfiguration(node_count=node_count)

# Configure the run: execute `script_name` from `source_dir` on the compute
# target, using the `env` and `docker_config` objects created earlier.
# NOTE(review): the flag names are presumably parsed by train.py as PyTorch
# Lightning Trainer options — confirm against the script before renaming any.
src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=[
        "--max_epochs", 25,
        "--gpus", gpus_per_node,
        "--accelerator", "ddp",
        "--num_nodes", node_count,
    ],
    compute_target=compute_name,
    environment=env,
    distributed_job_config=distr_config,
    docker_runtime_config=docker_config,
)

# Submit the configured run under `experiment_name`; the bare trailing
# expression makes the notebook display the returned run object.
run = Experiment(ws, experiment_name).submit(src)
run

In [None]:
from azureml.widgets import RunDetails

# Build the run-details widget for the submitted run, then render it.
run_details = RunDetails(run)
run_details.show()

In [None]:
# Wait for the submitted run to complete; show_output=True asks for the
# run's output to be shown while waiting.
run.wait_for_completion(show_output=True)