In [None]:
!pip install ignite numpy

In [1]:
import torch
import ignite.distributed as idist

def train_fn(local_rank, matrix_size=100):
    """Broadcast a column matrix from rank 0, perturb one slice per rank, and regather it.

    Rank 0 draws a random ``(matrix_size, 1)`` tensor which is broadcast to
    every process. Each rank then takes a contiguous slice of rows, adds a
    single random offset to it, and all slices are gathered back. Rank 0
    prints the reassembled ``(matrix_size, 1)`` result.

    Args:
        local_rank: Process index on the local node, supplied by
            ``idist.Parallel.run``. Not used directly; the global rank from
            ``idist.get_rank()`` decides which slice this process handles.
        matrix_size: Number of rows in the matrix. It does not have to be
            divisible by the world size.

    Returns:
        None. The result is only printed, and only on rank 0.
    """
    device = idist.device()
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    # Only rank 0 holds real values; the other ranks allocate a same-shaped
    # buffer that the broadcast overwrites.
    if rank == 0:
        data = torch.randn(matrix_size, 1, device=device)
    else:
        data = torch.zeros(matrix_size, 1, device=device)

    # Send the full matrix from rank 0 to every process.
    data = idist.broadcast(data, src=0)

    # Ceil division so the trailing `matrix_size % world_size` rows are
    # assigned to a rank instead of being silently dropped.
    chunk_size = (matrix_size + world_size - 1) // world_size
    start_idx = min(rank * chunk_size, matrix_size)
    end_idx = min(start_idx + chunk_size, matrix_size)
    local_data = data[start_idx:end_idx].clone()
    local_data += torch.rand(1, device=device)  # one random offset per rank

    # Zero-pad short (or empty) trailing slices so every rank contributes a
    # tensor of identical shape to the gather. The padding always lands after
    # the last real row, so it is removed by the final trim.
    missing_rows = chunk_size - local_data.shape[0]
    if missing_rows > 0:
        padding = local_data.new_zeros(missing_rows, local_data.shape[1])
        local_data = torch.cat([local_data, padding])

    # For tensor input, idist.all_gather returns ONE tensor already
    # concatenated along dim 0 (world_size * chunk_size rows), not a list,
    # so no further torch.cat is needed (or valid) here.
    gathered = idist.all_gather(local_data)

    if rank == 0:
        result = gathered[:matrix_size]  # drop the padding rows
        print("Final result:\n", result)

# Launcher settings forwarded as keyword arguments to idist.Parallel.
# Every machine runs this notebook with its own copy of these values;
# node_rank is the entry that differs between machines.
dist_config = dict(
    nproc_per_node=1,             # processes started on this machine
    nnodes=2,                     # number of machines taking part
    node_rank=0,                  # 0 on the master machine, 1 on the worker
    master_addr="192.168.1.191",  # IP address of the master machine
    master_port=29500,            # port that must be open on the master
)

# Launch only from the main module: the launcher spawns worker processes, and
# process-spawning code must sit behind the __main__ guard so that a child
# re-importing this module does not start another launch.
if __name__ == '__main__':
    # freeze_support() was previously called without being imported (NameError)
    # and only after the launch had already run. It has to be the first
    # statement under the guard; it matters for frozen Windows executables and
    # is a no-op otherwise.
    from multiprocessing import freeze_support
    freeze_support()

    # NOTE(review): with nnodes=2 this call presumably blocks until the process
    # configured with node_rank=1 connects to master_addr:master_port. The
    # recorded run stalled after "Spawn function ..." until it was interrupted,
    # so confirm the worker node is started with the same settings and
    # node_rank=1.
    with idist.Parallel(backend="gloo", **dist_config) as parallel:
        parallel.run(train_fn)

2025-06-25 13:42:04,339 ignite.distributed.launcher.Parallel INFO: Initialized distributed launcher with backend: 'gloo'
2025-06-25 13:42:04,340 ignite.distributed.launcher.Parallel INFO: - Parameters to spawn processes: 
	nproc_per_node: 1
	nnodes: 2
	node_rank: 0
	master_addr: 192.168.1.191
	master_port: 29500
2025-06-25 13:42:04,340 ignite.distributed.launcher.Parallel INFO: Spawn function '<function train_fn at 0x740fb16a3ce0>' in 1 processes


KeyboardInterrupt: 