In [3]:
import torch
import torchvision
from torchvision import models
import time
import networkx as nx
from torch import optim, nn
from importlib import reload
import numpy as np
import pickle

import GPUtil

import sys


## Copy of Inceptionv3, slightly modified for recording intermeridates
sys.path.append('/home/cshetty2/sct/pytorch')
import reformated_models.inception_modified as inception_modified

## Modified Alexnet, with a'factor' by which it can be made 'fat' 
import dummyModels as dm


######## For profiler (some experiments. Not required) #################
from torch.profiler import profile, record_function, ProfilerActivity


## Placer libs of baechi
sys.path.append('/home/cshetty2/sct')
from placer.placer_lib import *

import matplotlib.pyplot as plt

######## For debug purposes ONLY ########
import ctypes, gc
import psutil, os

### From https://discuss.pytorch.org/t/how-pytorch-releases-variable-garbage/7277
def memReport():
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            print(type(obj), obj.size())
    
def cpuStats():
        print(sys.version)
        print(psutil.cpu_percent())
        print(psutil.virtual_memory())  # physical memory usage
        pid = os.getpid()
        py = psutil.Process(pid)
        memoryUse = py.memory_info()[0] / 2. ** 30  # memory use in GB...I think
        print('memory GB:', memoryUse)
#########################################

def b2mb(x):
    return round(x/1024**2,8)

def b2gb(x):
    return round(x/1024**3,8)

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [4]:
from baechi_units_bigbrain import *

In [5]:
## Print memory of all available GPU's
def print_gpu_memory():
    for i in range(torch.cuda.device_count()):
        #print(torch.cuda.get_device_name(i))
        print("GPU:", i)
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,8), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,8), 'GB')
        #print("-----------------")
        #GPUtil.showUtilization()
        print("-----------")

In [6]:
##################################################################################
def b2gb(x): return round(x/2**30,8)
class TorchTracemalloc():

    def __enter__(self):
        self.begin = torch.cuda.memory_allocated()
        torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
        return self

    def __exit__(self, *exc):
        self.end  = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used   = b2gb(self.end-self.begin)
        self.peaked = b2gb(self.peak-self.begin)
        print(f"delta used/peak {self.used}/{self.peaked}")

###################################################################################

In [7]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


## Setup

In [6]:
model_name = "ParallelModelThreeLayerSplit"
batch_size = 32
fct = 6

Nrun = 3
run_type = "training" 
repetable = 0

In [7]:
if model_name == "ParallelModel":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 0
    model = dm.parallelModel(fct).to(single_run_gpu)
    opt_size = 512*fct
    
if model_name == "ParallelModelSplit":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 3
    model = dm.parallelModelSplit(fct,[single_run_gpu,0], repetable)
    opt_size = 512*fct

if model_name == "ParallelModelThreeLayer":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 0
    model = dm.parallelModelThreeLayer(fct).to(single_run_gpu)
    opt_size = 512*fct
    
if model_name == "ParallelModelThreeLayerSplit":
    inp_size_single = (1, 512*fct)
    single_run_gpu = 2
    model = dm.parallelModelThreeLayerSplit(fct,[single_run_gpu,1], repetable)
    opt_size = 512*fct


In [8]:
inp_size = (batch_size,) + inp_size_single

In [9]:
if run_type == "forward":
    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(Nrun):
            #torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
            if repetable == 1:
                inp   = torch.ones(inp_size)
            else:
                inp   = torch.rand(inp_size)
            start = time.time()
            inp = inp.to(single_run_gpu)
            output = model(inp)
            #torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()

In [10]:
inp   = torch.ones(inp_size).to(single_run_gpu)
output = model(inp)
last_gpu = output.get_device()

if run_type == "training":

    optimizer = optim.SGD(model.parameters(), lr = 0.0001); 
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    #if 1:
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            labels = torch.randn((batch_size, opt_size)).to(last_gpu)
            start = time.time()
            inp = inp.to(single_run_gpu); 
            optimizer.zero_grad()
            output = model(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss = criterion(output, labels )
            ##################################################################################
            loss.backward(loss)
            optimizer.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    prof.export_chrome_trace("trace_split2.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()


Mean time taken: nan



  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [11]:
print_gpu_memory()
del model
del inp
del output
try:
    del labels
    del optimizer
    del loss
except: pass
gc.collect()
torch.cuda.empty_cache()
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.84384155 GB
Cached:    1.2890625 GB
-----------
GPU: 2
Memory Usage:
Allocated: 2.46235704 GB
Cached:    3.01757812 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


## Multiple Models

In [12]:

batch_size = 32
fct = 6

Nrun = 30 
run_type = "training" 
repetable = 0

In [13]:
inp_size_single = (1, 512*fct)
inp_size = (batch_size,) + inp_size_single
opt_size = 512*fct

model1 = dm.parallelModelThreeLayerSplit(fct,[1,1], 0)
model2 = dm.parallelModelThreeLayerSplit(fct,[2,2], 0)
model3 = dm.parallelModelThreeLayerSplit(fct,[2,1], 0)


In [14]:
last_gpu1 = 1
last_gpu2 = 2
last_gpu3 = 2

if run_type == "training":

    optimizer1 = optim.SGD(model1.parameters(), lr = 0.0001); 
    optimizer2 = optim.SGD(model2.parameters(), lr = 0.0001); 
    optimizer3 = optim.SGD(model3.parameters(), lr = 0.0001); 
    
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            
            labels1 = torch.randn((batch_size, opt_size)).to(last_gpu1)
            labels2 = torch.randn((batch_size, opt_size)).to(last_gpu2)
            labels3 = torch.randn((batch_size, opt_size)).to(last_gpu3)
            
            start = time.time()
            optimizer1.zero_grad();optimizer2.zero_grad();optimizer3.zero_grad()
            
            output1 = model1(inp)
            output2 = model2(inp)
            output3 = model3(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss1 = criterion(output1, labels1 )
            loss2 = criterion(output2, labels2 )
            loss3 = criterion(output3, labels3 )
            ##################################################################################
            loss1.backward(loss1)
            loss2.backward(loss2)
            loss3.backward(loss3)
            
            optimizer1.step()
            optimizer2.step()
            optimizer3.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()


Mean time taken: 86.8112564086914



In [15]:
print_gpu_memory()
del model1, model2, model3
del inp
del output1, output2, output3
try:
    del labels1, labels2, labels3
    del optimizer1, optimizer2, optimizer3
    del loss1, loss2, loss3
except: pass
gc.collect()
torch.cuda.empty_cache()
print_gpu_memory()


GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 4.14967394 GB
Cached:    4.703125 GB
-----------
GPU: 2
Memory Usage:
Allocated: 5.76782322 GB
Cached:    6.32421875 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


# With Threads

In [16]:
import threading

In [17]:
def run_train(model_split, fct, batch_size, Nrun, done_flag ): 
    inp_size_single = (1, 512*fct)
    model = dm.parallelModelThreeLayerSplit(fct,model_split, 0)
    
    inp_size = (batch_size,) + inp_size_single
    inp   = torch.ones(inp_size)
    output = model(inp)
    last_gpu = output.get_device()
    opt_size = tuple(output.size())[1]

    optimizer = optim.SGD(model.parameters(), lr = 0.0001); 
    criterion = nn.MSELoss()
    dataset = torchvision.datasets.FakeData(
        size= Nrun * batch_size,
        image_size=inp_size_single,
        num_classes=opt_size,
        transform=torchvision.transforms.ToTensor())
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    result = []


    times = []
    if 1:
    #with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for batch_idx, (inp, oup) in enumerate(data_loader):
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            labels = torch.randn((batch_size, opt_size)).to(last_gpu)
            start = time.time()
            optimizer.zero_grad()
            output = model(inp)
            #torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            ######################### loss compute ################################################
            loss = criterion(output, labels )
            ##################################################################################
            loss.backward(loss)
            optimizer.step()
            torch.cuda.synchronize(0);torch.cuda.synchronize(1);torch.cuda.synchronize(2);torch.cuda.synchronize(3)
            end = time.time()
            times.append(1000*(end-start))
    #prof.export_chrome_trace("trace_singlegpu.json")
    gpu_time = np.mean(times[10:])
    print("Mean time taken:", gpu_time)
    print()

    del model
    del inp
    del output
    try:
        del labels
        del optimizer
        del loss
    except: pass
    gc.collect()
    torch.cuda.empty_cache()
    print_gpu_memory()
    done_flag[0] = 1

    return 0


In [18]:
batch_size = 32
fct = 6
Nrun = 50 

done_flag1 = [0]
done_flag2 = [0]
done_flag3 = [0]

In [19]:
run1 = threading.Thread(target=run_train, args=([1,1], fct, batch_size, Nrun,done_flag1,))
run2 = threading.Thread(target=run_train, args=([2,2], fct, batch_size, Nrun,done_flag2,))
run3 = threading.Thread(target=run_train, args=([1,2], fct, batch_size, Nrun,done_flag3,))

In [20]:
run1.start(); time.sleep(2)

In [21]:
run2.start(); time.sleep(2)

In [22]:
run3.start()

In [23]:
for _ in range(50):
    print(done_flag1, done_flag2, done_flag3)
    time.sleep(0.5)

[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
[0] [0] [0]
Mean time taken: 73.59336614608765

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 2.46528673 GB
Cached:    2.59375 GB
-----------
GPU: 2
Memory Usage:
Allocated: 4.15443468 GB
Cached:    4.296875 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
[1] [0] [0]
Mean time taken: 72.16576337814331

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 2.68098497 GB
Cached:    2.8046875 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.98812866 GB
Cached:    1.13085938 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
Mean time taken: 65.5152976512909

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Me

KeyboardInterrupt: 

## Memory measurement

In [None]:
batch_size = 64
fct = 6

Nrun = 3
run_type = "training" 
repetable = 0


inp_size_single = (1, 512*fct)
single_run_gpu = 0
model = dm.parallelModelThreeLayer(fct).to(single_run_gpu)
opt_size = 512*fct
inp_size = (batch_size,) + inp_size_single
out_size = (batch_size,opt_size) 

inp   = torch.rand(inp_size)
    

In [None]:
with TorchTracemalloc() as tt:
    with torch.no_grad():
    #if 1:
        inp = inp.to(single_run_gpu)
        out =  model(inp)
print(tt.used)
print(tt.peaked)

In [None]:
del model
del inp
del out

In [None]:
from baechi_units_bigbrain import *

batch_size = 128
fct = 1

Nrun = 3
run_type = "training" 
repetable = 0


inp_size_single = (1, 512*fct)
single_run_gpu = 0
model = dm.parallelModelThreeLayer(fct)
opt_size = 512*fct
inp_size = (batch_size,) + inp_size_single
out_size = (batch_size,opt_size) 


    

In [None]:
tester = Profiling(model,batch_size, single_run_gpu, 40, input_size = inp_size_single)
final_output = tester.run()

In [None]:
net_res = 0
for node_id in tester.sub_module_nodes:
    node = tester.sub_module_nodes[node_id]
    print(node.input_memory)
    print(node.persistent_memory)
    print(node.temporary_memory)
    
    #curr_res_usage =  node.persistent_memory + node.temporary_memory \
    #                + node.output_memory 
    
    curr_res_usage =  node.persistent_memory
    net_res += curr_res_usage
    print("layer:", node.module)
    print("resource:", curr_res_usage)
    print('-'*20)
print(net_res)

In [None]:
batch_size = 128
fct = 1
single_run_gpu = 0
model = dm.parallelModelThreeLayer(fct)


In [None]:
estimate_model_size(model, 'MB')

In [None]:
junk   = torch.rand((1,1)).to(single_run_gpu)

In [None]:
model = dm.parallelModelThreeLayer(1).to(0)

In [None]:
1147-1085

In [None]:
model2 = dm.parallelModelThreeLayer(fct).to(single_run_gpu)

In [None]:
1189-1147

In [None]:
from baechi_units_bigbrain import *

batch_size = 32
fct = 6

Nrun = 3
run_type = "training" 
repetable = 0


inp_size_single = (1, 512*fct)
single_run_gpu = 0

opt_size = 512*fct
inp_size = (batch_size,) + inp_size_single
out_size = (batch_size,opt_size) 


In [None]:
if 1:
#with TorchTracemalloc() as tt:
    model = dm.parallelModelThreeLayer(fct).to(single_run_gpu)
    inp   = torch.rand(inp_size).to(single_run_gpu)
    labels = torch.randn(out_size).to(single_run_gpu)

    inp.requires_grad = True
    #optimizer = optim.SGD(model.parameters(), lr = 0.0001); optimizer.zero_grad()
    #criterion = nn.MSELoss()

    output = model(inp)
    #loss = criterion(output, labels)
    #loss.backward(loss)

In [None]:
163-85


In [6]:
import baechiTest_dummyModels as dm
factor = 1
inp_size_single = (1, int(512*factor))
model = dm.parallelThreeLayer(factor, 1)
opt_size = 512*factor

In [None]:
forward_original_methods={}
def recur_function(module):
    sub_modules = module.__dict__['_modules']
    for name, sub_module in sub_modules.items():
        # sub modules of sub_module, if there are more than 1, we need further recursion
        sub_sub_modules = sub_module.__dict__['_modules']
        if len(sub_sub_modules) > 0:
            recur_function(sub_module)
            continue

        def _calculate_time_and_memory(function, *input):
            with TorchTracemalloc() as tt:
                torch.cuda.synchronize(0)
                start_time = time.time()
                result = function(*input)
                torch.cuda.synchronize(0)
                stop_time = time.time()
            return (stop_time - start_time) * 1000, tt.used, tt.peaked , result

        def forward_wrapper(cur_module, *input):
            """
            use this wrapper to replace the original forward function in submodules
            :param cur_module: the input submodule
            """

            ## collect relevant information of cur module
            function = forward_original_methods[cur_module]
            forward_time, used_mem, peak_mem, result = _calculate_time_and_memory(function, *input)

            ## Input size in bytes
            input_size = 0
            for inp in input:
                input_size = input_size + estimate_tensor_size(inp, 'B')

            ## Model size in bytes
            persistent_memory = estimate_model_size(cur_module,'B', False)

            output_memory = estimate_tensor_size(result, 'B')
            
            temporary_memory = peak_mem - used_mem
            
            print("Module:", cur_module)
            print("Input memory:", b2mb(input_size))
            print("Persistent memory:", b2mb(persistent_memory) )
            print("Temporary memory:", b2mb(temporary_memory) )
            print("*"*20)

            return result
        if sub_module in forward_original_methods:
                # only record the original forward functions once
                continue

        forward_original_methods[sub_module] = sub_module.forward
        sub_module.forward = forward_wrapper.__get__(sub_module, sub_module.__class__) 

In [None]:
recur_function(model)

In [None]:
batch_size = 2
inp_size = (batch_size,) +  inp_size_single
inp = torch.randn(inp_size)* (0.000001); 

output = model(inp)

In [None]:
model5 = dm.parallelThreeLayer(1, 1)

In [None]:
model5 = model5.to(3)

In [None]:
1111-1085

In [None]:
1135-1111

In [None]:
1159-1135

In [6]:
import baechiTest_dummyModels as dm
factor = 1
inp_size_single = (1, int(512*factor))
model = dm.parallelThreeLayer(factor, 1)
opt_size = 512*factor
batch_size = 32

print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [7]:
return_graph, tester = build_graph(model, batch_size,args.prof_gpu_id, args.prof_rounds, inp_size = inp_size_single)

  return F.mse_loss(input, target, reduction=self.reduction)


In [8]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.04692078 GB
Cached:    0.0703125 GB
-----------


In [9]:
placed_op_graph = m_sct(return_graph, DEVICE_GRAPH_MULTIPLE)
copy_p(return_graph, tester)

2021-12-15 09:24:32,096 - m_sct_v1:157 - INFO - Start LP solver.


Problem
  Name                   :                 
  Objective sense        : min             
  Type                   : LO (linear optimization problem)
  Constraints            : 78              
  Cones                  : 0               
  Scalar variables       : 24              
  Matrix variables       : 0               
  Integer variables      : 0               

Optimizer started.
Presolve started.
Linear dependency checker started.
Linear dependency checker terminated.
Eliminator started.
Freed constraints in eliminator : 5
Eliminator terminated.
Eliminator - tries                  : 1                 time                   : 0.00            
Lin. dep.  - tries                  : 1                 time                   : 0.00            
Lin. dep.  - number                 : 0               
Presolve terminated. Time: 0.01    
Problem
  Name                   :                 
  Objective sense        : min             
  Type                   : LO (linear optimization 

2021-12-15 09:24:32,171 - m_sct_v1:162 - INFO - LP solver finished. Relaxed makespan soultion: 0.601608
2021-12-15 09:24:32,173 - m_sct_v1:140 - INFO - Favorite child round threshold: 0.5
2021-12-15 09:24:32,175 - m_sct:143 - INFO - # favorite child: 7
2021-12-15 09:24:32,178 - m_sct:144 - INFO - # favorite child changes: 0
2021-12-15 09:24:32,184 - m_sct:172 - INFO - SCT estimated runtime: 0.000001


In [10]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.04692078 GB
Cached:    0.0703125 GB
-----------


In [11]:
del model
del return_graph
del placed_op_graph
del tester
gc.collect()              ## To clean any circular references
torch.cuda.empty_cache()

In [12]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [13]:
memReport()



In [10]:
import baechiTest_dummyModels as dm
factor = 6
inp_size_single = (1, int(512*factor))
model = dm.tallParallelModel(factor)
opt_size = 512*factor
batch_size = 32

In [11]:
return_graph, tester = build_graph(model, batch_size,args.prof_gpu_id, args.prof_rounds, inp_size = inp_size_single)