In [1]:
import torch
import torchvision
from torchvision import models
import time
import networkx as nx
from torch import optim, nn
from importlib import reload
import numpy as np
import pickle

import GPUtil

import sys


## Copy of Inceptionv3, slightly modified for recording intermeridates
sys.path.append('/home/cshetty2/sct/pytorch')
import reformated_models.inception_modified as inception_modified

## Modified Alexnet, with a'factor' by which it can be made 'fat' 
import simple_model as sm
import dummyModels as dm


######## For profiler (some experiments. Not required) #################
from torch.profiler import profile, record_function, ProfilerActivity


## Placer libs of baechi
sys.path.append('/home/cshetty2/sct')
from placer.placer_lib import *

import matplotlib.pyplot as plt

######## For debug purposes ONLY ########
import ctypes, gc
import psutil, os

### From https://discuss.pytorch.org/t/how-pytorch-releases-variable-garbage/7277
def memReport():
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            print(type(obj), obj.size())
    
def cpuStats():
        print(sys.version)
        print(psutil.cpu_percent())
        print(psutil.virtual_memory())  # physical memory usage
        pid = os.getpid()
        py = psutil.Process(pid)
        memoryUse = py.memory_info()[0] / 2. ** 30  # memory use in GB...I think
        print('memory GB:', memoryUse)
#########################################

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
## Print memory of all available GPU's
def print_gpu_memory():
    for i in range(torch.cuda.device_count()):
        #print(torch.cuda.get_device_name(i))
        print("GPU:", i)
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,8), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,8), 'GB')
        #print("-----------------")
        #GPUtil.showUtilization()
        print("-----------")

In [3]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


## Setup

In [18]:
model_name = "ParallelModelThreeLayerSplit"
batch_size = 32
fct = 6

Nrun = 3
run_type = "training" 
repetable = 0

In [14]:
if model_name == "ParallelModel":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 0
    model = dm.parallelModel(fct).to(single_run_gpu)
    opt_size = 512*fct
    
if model_name == "ParallelModelSplit":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 3
    model = dm.parallelModelSplit(fct,[single_run_gpu,0], repetable)
    opt_size = 512*fct

if model_name == "ParallelModelThreeLayer":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 0
    model = dm.parallelModelThreeLayer(fct).to(single_run_gpu)
    opt_size = 512*fct
    
if model_name == "ParallelModelThreeLayerSplit":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 2
    model = dm.parallelModelThreeLayerSplit(fct,[single_run_gpu,1], repetable)
    opt_size = 512*fct


In [15]:
inp_size = (batch_size,) + inp_size_single

In [19]:
if run_type == "forward":
    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(Nrun):
            #torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
            if repetable == 1:
                inp   = torch.ones(inp_size)
            else:
                inp   = torch.rand(inp_size)
            start = time.time()
            inp = inp.to(single_run_gpu)
            output = model(inp)
            #torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()

In [20]:
inp   = torch.ones(inp_size).to(single_run_gpu)
output = model(inp)
last_gpu = output.get_device()

if run_type == "training":

    optimizer = optim.SGD(model.parameters(), lr = 0.0001); 
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    #if 1:
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            labels = torch.randn((batch_size, opt_size)).to(last_gpu)
            start = time.time()
            inp = inp.to(single_run_gpu); 
            optimizer.zero_grad()
            output = model(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss = criterion(output, labels )
            ##################################################################################
            loss.backward(loss)
            optimizer.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    prof.export_chrome_trace("trace_split2.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()


Mean time taken: nan



In [12]:
print_gpu_memory()
del model
del inp
del output
try:
    del labels
    del optimizer
    del loss
except: pass
gc.collect()
torch.cuda.empty_cache()
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 3.3061986 GB
Cached:    3.86328125 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


## Multiple Models

In [10]:

batch_size = 32
fct = 6

Nrun = 30 
run_type = "training" 
repetable = 0

In [11]:
inp_size_single = (1, 512*fct)
inp_size = (batch_size,) + inp_size_single
opt_size = 512*fct

model1 = dm.parallelModelThreeLayerSplit(fct,[1,1], 0)
model2 = dm.parallelModelThreeLayerSplit(fct,[2,2], 0)
model3 = dm.parallelModelThreeLayerSplit(fct,[2,1], 0)


In [12]:
last_gpu1 = 1
last_gpu2 = 2
last_gpu3 = 2

if run_type == "training":

    optimizer1 = optim.SGD(model1.parameters(), lr = 0.0001); 
    optimizer2 = optim.SGD(model2.parameters(), lr = 0.0001); 
    optimizer3 = optim.SGD(model3.parameters(), lr = 0.0001); 
    
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            
            labels1 = torch.randn((batch_size, opt_size)).to(last_gpu1)
            labels2 = torch.randn((batch_size, opt_size)).to(last_gpu2)
            labels3 = torch.randn((batch_size, opt_size)).to(last_gpu3)
            
            start = time.time()
            optimizer1.zero_grad();optimizer2.zero_grad();optimizer3.zero_grad()
            
            output1 = model1(inp)
            output2 = model2(inp)
            output3 = model3(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss1 = criterion(output1, labels1 )
            loss2 = criterion(output2, labels2 )
            loss3 = criterion(output3, labels3 )
            ##################################################################################
            loss1.backward(loss1)
            loss2.backward(loss2)
            loss3.backward(loss3)
            
            optimizer1.step()
            optimizer2.step()
            optimizer3.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()


Mean time taken: 86.74815893173218



In [13]:
print_gpu_memory()
del model1, model2, model3
del inp
del output1, output2, output3
try:
    del labels1, labels2, labels3
    del optimizer1, optimizer2, optimizer3
    del loss1, loss2, loss3
except: pass
gc.collect()
torch.cuda.empty_cache()
print_gpu_memory()


GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 4.14967394 GB
Cached:    4.703125 GB
-----------
GPU: 2
Memory Usage:
Allocated: 5.76782322 GB
Cached:    6.32421875 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


# With Threads

In [14]:
import threading

In [15]:
def run_train(model_split, fct, batch_size, Nrun, done_flag ): 
    inp_size_single = (1, 512*fct)
    model = dm.parallelModelThreeLayerSplit(fct,model_split, 0)
    
    inp_size = (batch_size,) + inp_size_single
    inp   = torch.ones(inp_size)
    output = model(inp)
    last_gpu = output.get_device()
    opt_size = tuple(output.size())[1]

    optimizer = optim.SGD(model.parameters(), lr = 0.0001); 
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            labels = torch.randn((batch_size, opt_size)).to(last_gpu)
            start = time.time()
            optimizer.zero_grad()
            output = model(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss = criterion(output, labels )
            ##################################################################################
            loss.backward(loss)
            optimizer.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()

    del model
    del inp
    del output
    try:
        del labels
        del optimizer
        del loss
    except: pass
    gc.collect()
    torch.cuda.empty_cache()
    print_gpu_memory()
    done_flag[0] = 1

    return 0


In [22]:
batch_size = 32
fct = 6
Nrun = 50 

done_flag1 = [0]
done_flag2 = [0]
done_flag3 = [0]

In [28]:
run1 = threading.Thread(target=run_train, args=([1,1], fct, batch_size, Nrun,done_flag1,))
run2 = threading.Thread(target=run_train, args=([2,2], fct, batch_size, Nrun,done_flag2,))
run3 = threading.Thread(target=run_train, args=([1,2], fct, batch_size, Nrun,done_flag3,))

In [29]:
run1.start(); time.sleep(2)

In [30]:
run2.start(); time.sleep(2)

In [31]:
run3.start()

In [None]:
for _ in range(50):
    print(done_flag1, done_flag2, done_flag3)
    time.sleep(0.5)

[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
[1] [1] [1]
Mean time taken: 88.66408467292786

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 2.46199083 GB
Cached:    2.64453125 GB
-----------
GPU: 2
Memory Usage:
Allocated: 4.14967394 GB
Cached:    4.3671875 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
[1] [1] [1]
Mean time taken: 67.50410199165344

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 2.46199083 GB
Cached:    2.64453125 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.84384155 GB
Cached:    1.05859375 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
[1] [1] [1]
Mean time taken: 55.809569358825684

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached: 