In [None]:
#!pip install matplotlib 

In [1]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.multiprocessing import Process
import torch.nn as nn
import torch.optim as optim
import random
import time
import numpy as np
from torch.nn.parallel import DistributedDataParallel as DDP

In [2]:
# Select the compute device. The plain string 'cpu' is the fallback; when CUDA
# is usable, switch to a CUDA device object and run a tiny addition on it as a
# smoke test.
device = 'cpu'
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Draw the operand on the host, then move it to the GPU.
    x = torch.randn(4, 4).to(device)
    # Allocate the second operand on the GPU directly.
    y = torch.ones_like(x, device=device)
    z = x + y
    print(z)
    # Show the same result copied back to the host as float64.
    print(z.to("cpu", torch.double))

In [3]:
def init_process(rank, size,fn, backend='gloo'):
    """Join the default process group, run ``fn``, then leave the group.

    Args:
        rank: Index of the calling process within the group.
        size: Total number of processes in the group (the world size).
        fn: Callable invoked as ``fn(rank, size)`` once the group is ready.
        backend: Name of the torch.distributed backend to use.

    The rendezvous address is fixed to 127.0.0.1:29500, so every participant
    has to run on the same host.
    """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn(rank, size)
    finally:
        # Release the group's resources even when fn raises, instead of
        # relying on interpreter shutdown to clean up.
        dist.destroy_process_group()

In [4]:
class Net(nn.Module):
    """Fully connected regressor from one scalar input to one scalar output.

    Layer widths are 1 -> 80 -> 80 -> 40 -> 1. A ReLU is applied to the output
    of every layer except the last.
    """

    def __init__(self):
        super().__init__()
        # The attribute names are the prefixes of the state_dict keys.
        self.il = nn.Linear(1, 80)
        self.mi = nn.Linear(80, 80)
        self.mi2 = nn.Linear(80, 40)
        self.ol = nn.Linear(40, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of inputs with a trailing dimension of 1 to predictions."""
        activation = self.il(x)
        # Each remaining layer consumes the rectified output of the previous one.
        for layer in (self.mi, self.mi2, self.ol):
            activation = layer(self.relu(activation))
        return activation


In [5]:
# Experiment configuration.
N = 80_000       # number of training samples generated
M = 4            # number of worker processes the data is divided between
BS = 1_000       # samples per minibatch
epocs = 10_000   # bound on the epoch counter of the training loop

In [6]:
import matplotlib.pyplot as plt

In [7]:
def show_fit(m):
    """Plot a model's predictions against the target curve.

    The model is evaluated on the 40 points 0, 0.5, ..., 19.5 and drawn as a
    marked line; the target function is drawn on the same axes.

    Args:
        m: Callable taking a (40, 1) float tensor that lives on the CPU and
            returning one prediction per row (for example a ``Net``).
    """
    x_in2 = [0.5*i for i in range(40)]
    def yfun(i):
        # 3.14 (rather than pi) is kept so the curve equals the function the
        # training targets were generated from: 4*3.14*i/20 == 3.14*i/5.
        return np.sqrt(i)*np.sin(4*3.14*i/20.0)
    y_vals2 = [yfun(i) for i in x_in2]
    inputs2 = torch.tensor([x_in2]).T
    # Plotting needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        yy = m(inputs2).detach().numpy()
    plt.plot(x_in2, yy, 'o-', label='model')
    plt.plot(x_in2, y_vals2, label='target')
    plt.legend()

In [8]:
def sync_initial_weights(model, rank, world_size):
    """Overwrite every rank's parameters with the values held by rank 0.

    Must be called by all ranks of the default process group. Afterwards every
    rank's ``model`` holds rank 0's parameter values.

    Args:
        model: Module whose parameters are synchronised in place.
        rank: Rank of the calling process. Kept for interface compatibility;
            the collective determines the caller's role itself.
        world_size: Number of participating processes. Kept for interface
            compatibility.
    """
    for param in model.parameters():
        # One collective per parameter replaces a loop of point-to-point
        # sends that was serialised through rank 0.
        dist.broadcast(param.data, src=0)

In [9]:
def sync_gradients(model, rank, world_size):
    """Replace each parameter's gradient with its mean over all ranks.

    Every rank must call this with the same model structure; each parameter
    is expected to already carry a gradient.

    Args:
        model: Module whose ``.grad`` tensors are averaged.
        rank: Rank of the calling process (not used by the computation).
        world_size: Number of ranks; the summed gradient is divided by it.
    """
    for param in model.parameters():
        grad = param.grad
        # Sum across ranks in place, then turn the sum into a mean.
        dist.all_reduce(grad.data, op=dist.ReduceOp.SUM)
        mean_grad = grad.data / world_size
        grad.data = mean_grad

In [10]:
def show_model(model, rank, world_size):
    """Average the model across ranks, printing before/after values on rank 0.

    Despite the name this modifies ``model``: every parameter is replaced by
    its mean over all ranks. Every rank must call it; only rank 0 prints the
    parameters.

    Args:
        model: Module whose parameters are averaged in place.
        rank: Rank of the calling process.
        world_size: Number of ranks; the summed parameter is divided by it.
    """
    verbose = rank == 0
    print("model for rank ", rank)
    for weights in model.parameters():
        if verbose:
            print(weights, weights.data, sep="\n")
        dist.all_reduce(weights.data, op=dist.ReduceOp.SUM)
        weights.data = weights.data/world_size
        if verbose:
            print(' now after reduce', weights, weights.data, sep="\n")

In [11]:
def average_model(model, rank, world_size):
    """Replace every parameter with its mean over all ranks.

    Every rank must call this with the same model structure.

    Args:
        model: Module whose parameters are averaged in place.
        rank: Rank of the calling process (not used by the computation).
        world_size: Number of ranks; the summed parameter is divided by it.
    """
    for weights in model.parameters():
        values = weights.data
        # After the reduction `values` holds the element-wise sum over ranks.
        dist.all_reduce(values, op=dist.ReduceOp.SUM)
        weights.data = values / world_size

In [12]:

def yfun(x):
    """Target curve: sqrt(x) * sin(3.14 * x / 5)."""
    return np.sin(3.14*x/5.0) * np.sqrt(x)

# Training inputs: N draws (with replacement) from the grid 0, 20/N, ..., 20.
x_in = [random.randint(0, N) * (20.0/N) for _ in range(N)]
# Matching training targets.
y_vals = list(map(yfun, x_in))

def mk_minibatch(i, size):
    """Return the i-th consecutive chunk of the training data.

    Args:
        i: Zero-based chunk index.
        size: Chunk length; converted with ``int``.

    Returns:
        A tuple ``(inputs, targets)`` of two lists sliced from the
        module-level ``x_in`` and ``y_vals``.
    """
    width = int(size)
    window = slice(width * i, width * (i + 1))
    return (x_in[window], y_vals[window])

# Cut the training data into consecutive minibatches of BS samples each.
batches = [mk_minibatch(i, BS) for i in range(N // BS)]
k = len(batches)
print(k, M)
# Hand the minibatches out in contiguous runs of equal length, one run per
# worker: batch[r] is the list of minibatches for rank r. Minibatches left
# over when k is not a multiple of M are not assigned to any worker.
share = k // M
batch = [batches[r * share:(r + 1) * share] for r in range(M)]
    
def batchtodev(rank, device):
    """Build the tensors for one rank's minibatches on the given device.

    Args:
        rank: Index into the module-level ``batch`` list.
        device: Device passed to ``torch.tensor`` for every tensor created.

    Returns:
        A list of ``(inputs, targets)`` tensor pairs, one per minibatch, each
        tensor having one sample per row.
    """
    def as_column(values):
        # Wrap in a list to get a single-row matrix, then transpose it.
        return torch.tensor([values], device=device).T

    return [(as_column(pair[0]), as_column(pair[1])) for pair in batch[rank]]

80 4


In [13]:
def run(rank, size):
    """Train a private ``Net`` on this rank's share of the data.

    Each rank starts from rank 0's initial weights and runs SGD on its own
    minibatches for ``epocs`` epochs (read from module scope). Ranks exchange
    information on a fixed schedule:

    * every 10th epoch, gradients are averaged across ranks before each
      optimizer step of that epoch;
    * every 200th epoch, the weights themselves are averaged across ranks;
    * every 1000th epoch, each rank prints its latest minibatch loss and the
      wall-clock time since the previous report.

    When training ends, rank 0 writes its ``state_dict`` to ``/tmp/model``.

    Args:
        rank: Rank of the calling process; selects its share of the data.
        size: Number of participating processes.
    """
    device="cpu"
    model = Net().to(device)
    print('my rank is ', rank, ' out of ', size)
    sync_initial_weights(model, rank, size)

    btch = batchtodev(rank, device)
    print('batch has ', len(btch), ' elements')
    print("len of btch[0][0]=", len(btch[0][0]))
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.005)
    ta = time.time()
    # Epochs are numbered 1..epocs inclusive. Stopping at epocs - 1 would skip
    # the last epoch together with its progress report and averaging step.
    for epoc in range(1, epocs + 1):
        for b in btch:
            optimizer.zero_grad()
            outputs = model(b[0])
            loss = loss_fn(outputs, b[1])
            loss.backward()
            # All ranks see the same epoch number, so they all enter (or all
            # skip) the collective together.
            if epoc % 10  == 0:
                sync_gradients(model, rank, size)
            optimizer.step()
        if epoc % 200 == 0:
            average_model(model,rank, size)

        if epoc % 1000 == 0:
            tb = time.time()
            elapse = tb-ta
            ta = time.time()
            print('epoc %d loss %f elapse %f'%(epoc, float(loss), elapse))

    # Only rank 0 persists its weights. They equal the cross-rank average when
    # epocs is a multiple of 200, because the final epoch then ends with an
    # averaging step.
    if rank == 0:
        torch.save(model.state_dict(), "/tmp/model")

In [14]:
# Launch one worker process per rank, wait for all of them to finish, and
# report the total wall-clock time.
t0 = time.time()
processes = [Process(target=init_process, args=(rank, M, run)) for rank in range(M)]
for p in processes:
    p.start()
for p in processes:
    p.join()
print("elapse = ", time.time()-t0)

my rank is  1  out of  4
my rank is  0  out of  4
my rank is  3  out of  4
my rank is  2  out of  4
batch has  20  elements
batch has  20  elements
len of btch[0][0]= 1000
batch has  20  elements
len of btch[0][0]= 1000
len of btch[0][0]= 1000
batch has  20  elements
len of btch[0][0]= 1000
epoc 1000 loss 1.580778 elapse 83.785775
epoc 1000 loss 1.112996 elapse 83.795451
epoc 1000 loss 0.772435 elapse 83.763190
epoc 1000 loss 1.183104 elapse 83.784999
epoc 2000 loss 0.092150 elapse 83.466444
epoc 2000 loss 0.816509 elapse 83.466567
epoc 2000 loss 0.093433 elapse 83.466534
epoc 2000 loss 0.068608 elapse 83.466437
epoc 3000 loss 0.130762 elapse 86.214714
epoc 3000 loss 0.150380 elapse 86.217446
epoc 3000 loss 0.142085 elapse 86.217384
epoc 3000 loss 0.091100 elapse 86.216580
epoc 4000 loss 0.298993 elapse 93.983464
epoc 4000 loss 0.052613 elapse 93.982752
epoc 4000 loss 0.025358 elapse 93.983884
epoc 4000 loss 0.024898 elapse 93.983337
epoc 5000 loss 0.217339 elapse 99.549428
epoc 5000 l

In [None]:
# Rebuild the architecture, then restore the weights that rank 0 saved at the
# end of training.
m0 = Net()
saved_state = torch.load("/tmp/model")
m0.load_state_dict(saved_state)

In [None]:
# Plot the restored model's predictions together with the target curve.
show_fit(m0)

In [None]:
# Score the restored model on 800 freshly drawn points from the grid
# 0, 0.025, ..., 20.
# Evaluation-specific names are used so that this cell does not overwrite the
# training globals x_in / y_vals (which mk_minibatch reads) or redefine yfun.
x_eval = [0.025*random.randint(0,800) for i in range(800)]
# Same target curve as the training data: 4*3.14*x/20 == 3.14*x/5.
y_eval = [np.sqrt(x)*np.sin(4*3.14*x/20.0) for x in x_eval]
inputs = torch.tensor([x_eval]).T
targets = torch.tensor([y_eval]).T
loss_fn = nn.MSELoss()

mouts = m0(inputs)
err = loss_fn(mouts, targets)
print("err =", err)

In [None]:
# NOTE(review): scratch arithmetic with no visible link to the cells above;
# confirm what the two numbers are, or remove this cell.
134/39