# Temporary

In [1]:
import torch
import torchvision
from torchvision import models
import time
import networkx as nx
from torch import optim, nn
from importlib import reload
import numpy as np
import pickle
import torch.nn.functional as F

import GPUtil

import sys


## Copy of Inceptionv3, slightly modified for recording intermeridates
sys.path.append('/home/cshetty2/sct/pytorch')
import reformated_models.pytorch_modified_inception as pytorch_modified_inception

## Modified Alexnet, with a'factor' by which it can be made 'fat' 
import simple_model as sm


## Placer libs of baechi
sys.path.append('/home/cshetty2/sct')
from placer.placer_lib import *

import matplotlib.pyplot as plt

######## For profiler #################
from torch.profiler import profile, record_function, ProfilerActivity

%matplotlib inline
%load_ext autoreload
%autoreload 2


import ctypes, gc
import psutil, os

## Defined in this round about way (instead of just directly assigning) to keep it compatibble with summarize.py
class Args:
     def __init__(self,itype, prof_rounds, prof_gpu_id, batch_size, gpu_num, sch):
         self.type = itype
         self.prof_rounds = prof_rounds
         self.prof_gpu_id = prof_gpu_id
         self.batch_size = batch_size
         self.gpu_num = gpu_num
         self.sch = sch
            
itype       = 'all'  # help: forward/all -> Conside forward path only or both
prof_rounds = 4      # help: 'rounds for profiler'
prof_gpu_id = 0      # help: 'which gpu to place the profiler'
batch_size  = '32'   # help: 'batch_size'
gpu_num     = 4      # help: 'number of gpu to use'
sch         = 'sct'  # help: 'sct/etf/topo'

args = Args(itype, prof_rounds, prof_gpu_id, batch_size, gpu_num, sch)



"""
    Function: placer_lib.create_device_graph
    -> Creates a graph with devices as nodes and unit weight edges between them
    -> Each node: graph.add_node(device_id,
                                 id=device_id,
                                 name=device_info["name"],
                                 size=0,
                                 memory_limit=device_info["memory_size"])
"""
DEVICE_GRAPH_SINGLE = create_device_graph({0: {'name': '/job:localhost/replica:0/task:0/device:XLA_GPU:0', 'memory_size': 17179869184, 'type': ''}})
DEVICE_GRAPH_MULTIPLE = create_device_graph({0: {'name': '/job:localhost/replica:0/task:0/device:XLA_GPU:0', 'memory_size': 8000000000, 'type': ''}, 
                                             1: {'name': '/job:localhost/replica:0/task:0/device:XLA_GPU:1', 'memory_size': 8000000000, 'type': ''}, 
                                             2: {'name': '/job:localhost/replica:0/task:0/device:XLA_GPU:2', 'memory_size': 8000000000, 'type': ''}, 
                                             3: {'name': '/job:localhost/replica:0/task:0/device:XLA_GPU:3', 'memory_size': 8000000000, 'type': ''}})


"""
    we are going to use streams to allow parallel processing
"""
COMPUTE0 = torch.cuda.Stream(device=0)
COMPUTE1 = torch.cuda.Stream(device=1)
COMPUTE2 = torch.cuda.Stream(device=2)
COMPUTE3 = torch.cuda.Stream(device=3)
COMPUTE_STREAM = {0:COMPUTE0,1:COMPUTE1,2:COMPUTE2,3:COMPUTE3}


## A global variable can be directly called form inside a function
## But to change it, use the 'global' keyword
## Source: https://stackoverflow.com/questions/10588317/python-function-global-variables
def del_all():
    ## Clear the GPU
    try:
        global model
        del model
    except:
        print("No model")
    try:
        global inp
        del inp
    except:
        print("No inp")
    try:
        global labels
        del labels
    except:
        print("No Labels")
    try:
        global output
        del output
    except:
        print("No Output")
    try:
        global loss
        del loss
    except:
        print("No Loss")
    try:
        global optimizer
        del optimizer
    except:
        print("No optimizer")
    print("Emptying cache")
    torch.cuda.empty_cache()
    print_mem(args.prof_gpu_id)
    
# Get the leaf operations in a model. model.modules() gives not just the leaves, bbut higher levels as well
# Ref: https://stackoverflow.com/questions/54846905/pytorch-get-all-layers-of-model
# More explanation: https://discuss.pytorch.org/t/module-children-vs-module-modules/4551/4
def get_children(model: torch.nn.Module):
    # get children form model!
    children = list(model.children())
    flatt_children = {}
    if children == []:
        # if model has no children; model is last child! :O
        return {id(model): model}
    else:
       # look for children from children... to the last child!
       for child in children:
            try:
                flatt_children.update(get_children(child))
            except TypeError:
                flatt_children.update(get_children(child))
    return flatt_children


## Print memory of all available GPU's
def print_gpu_memory():
    for i in range(torch.cuda.device_count()):
        #print(torch.cuda.get_device_name(i))
        print("GPU:", i)
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,8), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,8), 'GB')
        #print("-----------------")
        #GPUtil.showUtilization()
        print("-----------")

# print memory of given GPU. ex: gpu_no = 0
def print_mem(gpu_id, cached=2):
    mem_allocated = round(torch.cuda.memory_allocated(gpu_id)/1024**3,8)
    mem_cached    = round(torch.cuda.memory_reserved(gpu_id)/1024**3,8)
    if cached>0:
        print('Allocated:', mem_allocated , 'GB')
    if cached>1:
        print('Cached:   ', mem_cached    , 'GB')
    return mem_allocated, mem_cached

#### Estimate size of the model (in GB or MB)

def estimate_model_size(model, unit='MB'): 
    persistent_memory = 0
    for name, param in model.named_parameters():
        persistent_memory += param.element_size() * param.nelement()
    if unit == 'GB':
        gb_mem = round(persistent_memory/1024**3,8)
        print("Estimated Model Memory:",gb_mem, "GB")
        return gb_mem
    elif unit == 'B':
        gb_mem = persistent_memory
        print("Estimated Model Memory:",gb_mem, "Bytes")
        return gb_mem
    else:
        mb_mem = round(persistent_memory/1024**2,8)
        print("Estimated Model Memory:", mb_mem, "MB")
        return mb_mem
    
def estimate_input_size(inp, unit='MB'):
    input_size = 0
    if isinstance(inp, torch.Tensor): 
        input_size += float(torch.prod(torch.tensor(inp.size())))
    if isinstance(inp, list): 
        for sub_inp in inp:
            if isinstance(sub_inp, torch.Tensor): input_size += float(torch.prod(torch.tensor(sub_inp.size())))

    input_size = input_size*torch.rand((1,1)).element_size() # multiply by 4
    if unit == 'GB':
        gb_mem = round(input_size/1024**3,8)
        print("Estimated Input Memory:",gb_mem, "GB")
        return gb_mem
    if unit == 'B':
        gb_mem = input_size
        print("Estimated Input Memory:",gb_mem, "B")
        return gb_mem
    else:
        mb_mem = round(input_size/1024**2,8)
        print("Estimated Input Memory:", mb_mem, "MB")
        return mb_mem

    
def b2gb(x): return round(x/2**30,8)
class TorchTracemalloc():

    def __enter__(self):
        self.begin = torch.cuda.memory_allocated()
        torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
        return self

    def __exit__(self, *exc):
        self.end  = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used   = (self.end-self.begin)
        self.peaked = (self.peak-self.begin)
        print(f"delta used/peak {self.used}/{self.peaked}")

print_gpu_memory()


GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


GPU: 0
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [2]:
class TorchTracemalloc():
    def __enter__(self):
        self.begin = torch.cuda.memory_allocated()
        torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
        return self

    def __exit__(self, *exc):
        self.end  = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used   = (self.end-self.begin)
        self.peaked = (self.peak-self.begin)

def _calculate_time_and_memory(function, *input):
    """
    - Helper function in forward wrapper
    - Calculates forward runtime, peak memory used and static memory used
    - Verified: Memory measurement context doesn't add overhead to
      time measurement
    """
    with TorchTracemalloc() as tt:
        torch.cuda.synchronize('cuda:0')
        start_time = time.time()
        result = function(*input)
        torch.cuda.synchronize('cuda:0')
        stop_time = time.time()
    return (stop_time - start_time) * 1000, tt.used, tt.peaked , result

In [3]:
inp_size = (32, 3, 299, 299)
inp = torch.rand(inp_size).to('cuda:0')
convLayer1 = nn.Conv2d(3, 192, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2)).to('cuda:0')
_calculate_time_and_memory(convLayer1.forward, inp)



(7.112741470336914,
 134578176,
 134611456,
 tensor([[[[ 4.4548e-01,  7.1091e-02,  4.4385e-01,  ..., -5.1342e-02,
             3.1152e-01, -8.9722e-02],
           [ 2.8740e-01,  1.3975e-01,  3.8090e-01,  ...,  2.8432e-01,
             3.5253e-01, -2.1155e-01],
           [ 3.5314e-01,  1.8756e-01,  9.2431e-02,  ..., -6.6950e-02,
             1.8295e-01,  1.6360e-02],
           ...,
           [ 3.4881e-02,  1.9179e-01,  2.3429e-01,  ...,  9.1239e-02,
            -3.9884e-02, -1.5244e-03],
           [ 1.3473e-01,  1.1092e-01,  3.0242e-01,  ...,  9.3249e-02,
             2.0884e-01,  1.5638e-01],
           [ 3.2495e-01,  1.0392e-01,  3.6976e-02,  ...,  2.6723e-01,
            -1.8213e-01,  1.2905e-01]],
 
          [[-2.1707e-01,  3.9914e-01,  2.2452e-01,  ...,  2.1434e-01,
             2.2052e-01,  1.4559e-01],
           [ 2.8395e-02, -2.0778e-01,  3.9858e-01,  ...,  1.0712e-01,
            -5.1491e-02, -4.2790e-01],
           [-5.5490e-01,  1.7486e-01, -8.6600e-02,  ..., -1.6715e

In [None]:
del 

In [None]:
inp = torch.rand((1,1)).to(0)

In [None]:

inp = torch.rand((1,1)).to(0)
inp.dtype


In [None]:
inp = torch.rand((1,129)).to(0)
torch.cuda.memory_allocated('cuda:0')

In [None]:
try:
    del fc1
    del fc2
    del fc3
    del inp
    del out
    del out1
    
    del adn
    del y
except:
    pass

In [None]:
inp_size = (32, 3, 299, 299)
inp = torch.rand(inp_size).to('cuda:0')
convLayer1 = nn.Conv2d(3, 64*8, 20).to('cuda:0')
with TorchTracemalloc() as tt:
    out = convLayer1(inp)
print(tt.peak-tt.used)   

In [None]:
N   = 50
with TorchTracemalloc() as tt:
    fc1 = nn.Linear(N, 1000).to('cuda:0')
    fc2 = nn.Linear(1000, 1000).to('cuda:0')
    fc3 = nn.Linear(1000, 1000).to('cuda:0')
print(tt.peak-tt.used)   

In [None]:
inp_size = (1, N)
with TorchTracemalloc() as tt:
    inp = torch.rand(inp_size).to('cuda:0')
print(tt.peak-tt.used)   

In [None]:
with TorchTracemalloc() as tt:
    #with torch.no_grad():
    if 1:
        out = fc3(fc2(fc1(inp)))
print(tt.peak-tt.used)   

In [None]:
with TorchTracemalloc() as tt:
    target = torch.rand((1,1000)).to('cuda:0')
print(tt.peak-tt.used)   

In [None]:
with TorchTracemalloc() as tt:
    loss_fn = nn.MSELoss()  # LogSoftmax + ClassNLL Loss
    err = loss_fn(out, target)
print(tt.peak-tt.used)   

In [None]:
with TorchTracemalloc() as tt:
    err.backward()
print(tt.peak-tt.used)   

In [None]:
with TorchTracemalloc() as tt:
    fc1.zero_grad(set_to_none=True)
print(tt.peak-tt.used)   

In [None]:
N = 50
fc1 = nn.Linear(N, 1000).to('cuda:0')

In [None]:
torch.cuda.memory_allocated('cuda:0')

In [None]:
def _calculate(function, *input):

    torch.cuda.synchronize('cuda:0')
    with TorchTracemalloc() as tt:
    #if 1:
        start_time = time.time()
        result = function(*input)
    torch.cuda.synchronize('cuda:0')
    stop_time = time.time()
    return (stop_time - start_time) * 1000, result

In [None]:
def fwd_wrapper(module, *input):
    old_fwd = module.forward
    t, res = _calculate(old_fwd, *input)
    print("Time: ", t)
    return res

In [None]:
def dummy(module, *input):
    print("I am the dummy forward")

In [None]:
fc1.forward = dummy

In [None]:
inp = torch.rand((1,N)).to('cuda:0')

In [None]:
fc1.forward = dummy.__get__(fc1, fc1.__class__)

In [None]:
fc1(inp)

In [None]:
fc1.forward = fwd_wrapper.__get__(fc1, fc1.__class__)

In [None]:
res = fc1(inp)

In [None]:
def _calculate(function, *input):
    torch.cuda.synchronize('cuda:0')
    with TorchTracemalloc() as tt:
    #if 1:
        start_time = time.time()
        result = function(*input)
        torch.cuda.synchronize('cuda:0')
        stop_time = time.time()
    return (stop_time - start_time) * 1000, result

def fwd_wrapper(module, *input):
    t, res = _calculate(old_fwd, *input)
    print("Time: ", t)
    return res

In [None]:
conv1 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2).to('cuda:0')
old_fwd = conv1.forward

In [None]:
inp = torch.rand((1, 3, 299, 299)).to('cuda:0')

In [None]:
conv1.forward = fwd_wrapper.__get__(conv1, conv1.__class__)

In [None]:
for _ in range(20):
    res = conv1(inp)

In [None]:
### Autograd testing

In [None]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

In [None]:
a1 = 3*a**3
a1.retain_grad()

In [None]:
b1 = b**2
b1.retain_grad()

In [None]:
Q = 2*a1 - 5*b1

In [None]:
print(Q.grad_fn)

In [None]:
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

In [None]:
old_bwd = Q.grad_fn

In [None]:
def modified_bwd(ctx, grad_output):
    print("I am inside bwd!")
    out = old_bwd(ctx, grad_output)
    return out

In [None]:
## Demo: using hooks vs modifying the forward function

############### Using forward Wrapper #################

conv2 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2).to('cuda:0')
inp = torch.rand((1, 3, 299, 299)).to('cuda:0')
 
start_time = 0
stop_time  = 0
times = [start_time, stop_time]

## save original forward function
original_forward = conv2.forward

## define a wrapper around the original forward function
def modified_forward(module, *input):
    #print("I am about to start the forward run!")
    torch.cuda.synchronize('cuda:0')
    times[0] = time.time()
    out = original_forward(module,*input)
    torch.cuda.synchronize('cuda:0')
    times[1] = time.time()
    #print("I am done with the forward")
    return out

## set wrapper as the new forward 
conv2.forward = modified_forward

## forward run
metrics = []
for _ in range(500):
    out2 = conv2(inp)
    t = (times[1] - times[0])*1000.0
    #print(t)
    metrics.append(t)
print("*****************************************")
wrapper_mean = np.mean(metrics)
print(wrapper_mean)
print(np.std(metrics))

In [None]:

################### Using forward hooks #########################

## Setup
conv1 = nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2).to('cuda:0')

start_time = 0
stop_time  = 0
times = [start_time, stop_time]

## pre-forward hook
def print_pre(module, input):
    #print("I am about to start the forward run!")
    torch.cuda.synchronize('cuda:0')
    times[0] = time.time()

# # post-forward hook  
def print_post(module ,input, output):
    #print("I am done with the forward")
    torch.cuda.synchronize('cuda:0')
    times[1] = time.time()

## Register the hooks
conv1.register_forward_pre_hook(print_pre)
conv1.register_forward_hook(print_post)


metrics = []
for _ in range(500):
    out1 = conv1(inp)
    t = (times[1] - times[0])*1000.0
    #print(t)
    metrics.append(t)
print("*****************************************")
hook_mean = np.mean(metrics)
print(hook_mean)
print(np.std(metrics))

In [None]:
100*(wrapper_mean - hook_mean)/hook_mean

In [None]:
print_gpu_memory()

In [None]:

m0, _  = print_mem(0,0)
conv2 = nn.Conv2d(3, 640, kernel_size=11, stride=4, padding=2).to('cuda:0')
m1, _  = print_mem(0,0)

print("Actual model mem: ", m1-m0)
estimate_model_size(conv2, 'GB')

inp = torch.rand((1, 3, 2990, 2990)).to('cuda:0')
m2, _  = print_mem(0,0)

print("Actual input mem: ", m2-m1)
estimate_input_size(inp, 'GB')

with TorchTracemalloc() as tt:
    output = conv2(inp)
print(tt.used, tt.peaked)
out_size = estimate_input_size(output, 'B')
assert (tt.used==512*np.ceil(out_size/512)) # since memory is allotted in blocks of 512B
print("Peak and used difference:", tt.peaked - tt.used)

In [None]:
del output
del conv2
del inp
print_gpu_memory()

In [None]:

m0, _  = print_mem(0,0)
N = 50000
fc1 = nn.Linear(N, 1000).to('cuda:0')
m1, _  = print_mem(0,0)

print("Actual model mem: ", m1-m0)
estimate_model_size(fc1, 'GB\n')

inp = torch.rand((1,N)).to('cuda:0')
m2, _  = print_mem(0,0)

print("Actual input mem: ", m2-m1)
estimate_input_size(inp, 'GB')

with TorchTracemalloc() as tt:
    output = fc1(inp)
print(tt.used, tt.peaked)
out_size = estimate_input_size(output, 'B')
assert (tt.used==512*np.ceil(out_size/512)) # since memory is allotted in blocks of 512B
print("Peak and used difference:", tt.peaked - tt.used)

In [None]:
del output
del fc1
del inp
print_gpu_memory()

In [None]:
print('*'*20)

In [None]:
a= set()

In [None]:
a.add('r')

In [None]:
b=set()

In [None]:
b.add('0')
b.add('9')

In [None]:
a

In [None]:
b

In [None]:
a.union(b)

In [None]:
class LinearModel(nn.Module):

    def __init__(self, num_classes: int = 1000, factor: int = 1) -> None:
        super(LinearModel, self).__init__()
        self.factor = factor
        self.linear1N = 4096
        self.linear2N = 4096
        self.linear3N = 4096


        self.fc1 = nn.Linear(10000,  self.linear1N)
        self.rl1 = nn.ReLU()
        self.fc2 = nn.Linear(self.linear1N, self.linear2N)
        self.rl2 = nn.ReLU()
        self.fc3 = nn.Linear(self.linear2N, self.linear3N)
        self.rl3 = nn.ReLU()
        self.fc4 = nn.Linear(self.linear3N, num_classes)

    def forward(self, x):

        x = self.fc1(x)
        x = self.rl1(x)

        x = self.fc2(x)
        x = self.rl2(x)

        x = self.fc3(x)
        x = self.rl3(x)

        x = self.fc4(x)

        return x

In [None]:
'''
make_dot is modified to add nodes for only the autograd corresponding to layers
'''

def make_dot(var, cur_model):
    """
    this function build a DiGraph for the model, by tracing the grad function of each layer's output
    :return: the DiGraph
    """
    dot = nx.DiGraph()
    seen = set()
    output_nodes = (var.grad_fn,) if not isinstance(var, tuple) else tuple(v.grad_fn for v in var)

    def add_nodes(var):
        if var not in seen:
            cur_id = None
            if var.metadata != {}:
                if ('module' in var.metadata):
                    # this submodule has a forward function, so it's information is previously recorded in Profiling
                    cur_id = id(var.metadata['module'])
                    # retrieve the node representing this submodule
                    cur_node = cur_model.sub_module_nodes[id(var.metadata['module'])]
                    dot.add_node(id(var.metadata['module']), 
                                 model = str(cur_node.module), 
                                 name = str(cur_node.name), 
                                 weight=cur_node.weight_forward,
                                 reverse_weight=cur_node.weight_backward,
                                 id=id(var.metadata['module']), 
                                 topo_order=id(var.metadata['module']), 
                                 temporary_memory=cur_node.temporary_memory, 
                                 persistent_memory=cur_node.persistent_memory,
                                 output_memory=[cur_node.output_memory], 
                                 output_tensors=cur_node.output_memory, 
                                 colocation_group="")
                    
                    if hasattr(var, 'next_functions'):
                        for u in var.next_functions:
                            if u[0] is not None and torch.is_tensor(u[0]) is False and hasattr(u[0], 'variable') is False:
                                if u[0].metadata != {}:
                                    if ('module' in u[0].metadata):
                                        next_id = id(u[0].metadata['module'])
                                        cur_model.sub_module_nodes[next_id].children.add(cur_id)
                                        cur_model.sub_module_nodes[cur_id].parent.add(next_id)
                                    elif ('parent' in u[0].metadata):
                                        u[0].metadata['parent'].add(cur_id)
                                    else:
                                        print("Error:", u[0], " has metadata that is neither module nor parent!")
                                        return 0
                                else:
                                    u[0].metadata['parent'] = set()
                                    u[0].metadata['parent'].add(cur_id)
                                    
                                add_nodes(u[0])
                                
                elif ('parent' in var.metadata):
                    cur_id_list = []
                    for parent in var.metadata['parent']:
                        cur_id_list.append(parent)
                    if hasattr(var, 'next_functions'):
                        for u in var.next_functions:
                            if u[0] is not None and torch.is_tensor(u[0]) is False and hasattr(u[0], 'variable') is False:
                                if u[0].metadata != {}:
                                    if ('module' in u[0].metadata):
                                        next_id = id(u[0].metadata['module'])
                                        for cur_id in cur_id_list:
                                            cur_model.sub_module_nodes[next_id].children.add(cur_id)
                                            cur_model.sub_module_nodes[cur_id].parent.add(next_id)
                                    elif ('parent' in u[0].metadata):
                                        for cur_id in cur_id_list:
                                            u[0].metadata['parent'].add(cur_id)
                                    else:
                                        print("Error:", u[0], " has metadata that is neither module nor parent!")
                                        return 0
                                else:
                                    u[0].metadata['parent'] = set()
                                    for cur_id in cur_id_list:
                                        u[0].metadata['parent'].add(cur_id)
                                add_nodes(u[0])
                
            else:
                ## All functions will have either 'module' or 'parent' metadata
                print("Error:", var, " does not have any metadata!")
                return 0

            seen.add(var)

    if isinstance(var, tuple):
        # handle multiple outputs
        for v in var:
            add_nodes(v.grad_fn)
    else:
        add_nodes(var.grad_fn)
    
    return dot

In [2]:
def test_func(*inputs):
    input_list = list(inputs)
    print(input_list)

In [6]:
test_func(1,[2])

[1, [2]]


In [16]:
c1 = nn.Conv2d(3, 2, kernel_size=2)
c2 = nn.Conv2d(2, 2, kernel_size=2)

In [27]:
inp = torch.rand((1, 3, 100, 100))

In [28]:
out1 = c1(inp)
out2 = c2(out1)

In [29]:
out1.shape

torch.Size([1, 2, 99, 99])

In [30]:
out2.shape

torch.Size([1, 2, 98, 98])

In [31]:
out3 = torch.flatten(out2,1)

In [38]:
out3.shape

torch.Size([1, 2, 3, 3])

In [36]:
ap = nn.AdaptiveAvgPool2d((3,3))

In [37]:
out3 = ap(out2)

In [None]:
reset_forward_functions(model)
del original_forwards
del gpu_assignment

In [None]:
with torch.no_grad():
    output = model(inp)

In [21]:
class _concatenateLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, *x):
        return torch.cat(x, 1)

In [120]:
class TwoLayerLinearModel(nn.Module):

    def __init__(self, factor: int = 1) -> None:
        super(TwoLayerLinearModel, self).__init__()
        self.factor = factor
        self.linear1N = 512*self.factor
        self.linear2N = 2048*self.factor
        self.linear3N = 1024*self.factor
        self.linear4N = 2*self.linear3N
        self.linear5N = 512*self.factor


        self.fc1 = nn.Linear(self.linear1N, self.linear2N).to('cuda:0')
        self.fc2a = nn.Linear(self.linear2N, self.linear3N).to('cuda:1')
        self.fc2b = nn.Linear(self.linear2N, self.linear3N).to('cuda:0')
        self.concatenate = _concatenateLayer().to('cuda:0')
        self.fc3 = nn.Linear(self.linear4N, self.linear5N).to('cuda:0')
        self.fc4 = nn.Linear(self.linear5N, self.linear5N).to('cuda:0')
        

    def forward(self, x):
        x = x.to('cuda:0')
        x = self.fc1(x)
        

        x1 = x.to('cuda:1')
        xb = self.fc2b(x)
        xa = self.fc2a(x1)
        
        xa = xa.to('cuda:0')
        
        y = self.concatenate(xa,xb)
        y = self.fc3(y)
        y = self.fc4(y)
        return y


In [121]:
class TwoLayerLinearModel_stream(nn.Module):

    def __init__(self, factor: int = 1) -> None:
        super(TwoLayerLinearModel_stream, self).__init__()
        self.factor = factor
        self.linear1N = 512*self.factor
        self.linear2N = 2048*self.factor
        self.linear3N = 1024*self.factor
        self.linear4N = 2*self.linear3N
        self.linear5N = 512*self.factor


        self.fc1 = nn.Linear(self.linear1N, self.linear2N).to('cuda:0')
        self.fc2a = nn.Linear(self.linear2N, self.linear3N).to('cuda:1')
        self.fc2b = nn.Linear(self.linear2N, self.linear3N).to('cuda:0')
        self.concatenate = _concatenateLayer().to('cuda:0')
        self.fc3 = nn.Linear(self.linear4N, self.linear5N).to('cuda:0')
        self.fc4 = nn.Linear(self.linear5N, self.linear5N).to('cuda:0')
        

    def forward(self, x):
        x = x.to('cuda:0')
        x = self.fc1(x)
        xb = self.fc2b(x)

        with torch.cuda.stream(COMPUTE_STREAM[1]):
        #if 1:
            with torch.cuda.stream(COMPUTE_STREAM[0]):
            #if 1:
                x = x.to('cuda:1')

        xa = self.fc2a(x)
        
        ## Order of defining streams (0 first and then 1) matters a little
        #with torch.cuda.stream(COMPUTE_STREAM[0]):
        if 1:
            #with torch.cuda.stream(COMPUTE_STREAM[1]):
            if 1:
                xa = xa.to('cuda:0')
        
        y = self.concatenate(xa,xb)
        y = self.fc3(y)
        y = self.fc4(y)
        return y


In [122]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.43326044 GB
Cached:    0.89257812 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.20395136 GB
Cached:    0.43359375 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [123]:
factor = 5
model = TwoLayerLinearModel_stream(factor)

In [124]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.84935236 GB
Cached:    0.89257812 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.39928293 GB
Cached:    0.43359375 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [125]:
inp_size = (128, 512*factor)
inp   = torch.rand(inp_size)

In [126]:
times = []
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(20):
        torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
        start = time.time()
        output = model(inp)
        torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
        end = time.time()
        times.append(1000*(end-start))
prof.export_chrome_trace("trace.json")
    
print("Mean time taken:", np.mean(times[10:]))

Mean time taken: 7.3741912841796875


In [127]:
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.43326044 GB
Cached:    0.89257812 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.20395136 GB
Cached:    0.43359375 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [128]:
del model
del inp
del output
print_gpu_memory()

GPU: 0
Memory Usage:
Allocated: 0.00374079 GB
Cached:    0.89257812 GB
-----------
GPU: 1
Memory Usage:
Allocated: 0.00373697 GB
Cached:    0.43359375 GB
-----------
GPU: 2
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------
GPU: 3
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
-----------


In [39]:
a = torch.zeros((1000, 1000)).to(0)
b = torch.ones((1000, 1000)).to(0)

start = time.time()
        
#with torch.cuda.stream(COMPUTE_STREAM[0]):
if 1:
    c = 0
    for i in range(10):
        c = c + (torch.sum(torch.count_nonzero(a)))
        time.sleep(0.005)

for i in range(50):
        a = a+b
        time.sleep(0.005)
print(c)
print(time.time() - start)


tensor(0, device='cuda:0')
0.31549930572509766


In [36]:
del a
del b
del c

NameError: name 'a' is not defined

In [129]:
fc0 = nn.Linear(1000, 1000).to(0)
fc1 = nn.Linear(1000, 1000).to(1)

def junk1(x):
    x = x.to(0)
    x = fc0(x)
    x = x.to(1)
    x = fc1(x)
    x = x.to(0)
    return x
    
def junk2(x):
    x = x.to(0)
    x = fc0(x)
    with torch.cuda.stream(COMPUTE_STREAM[1]):
            with torch.cuda.stream(COMPUTE_STREAM[0]):
                x = x.to(1)
    x = fc1(x)
    x = x.to(0)
    return x
    
       

In [131]:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(50):
        torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
        inp  = torch.ones((1,1000)).to(0)
        out1 =  junk1(inp)
        out2 =  junk2(inp)
        #print(out1==out2)
        torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2)
prof.export_chrome_trace("trace.json")
print(out1[0][0])

tensor(0.0983, device='cuda:0', grad_fn=<SelectBackward>)


In [117]:
del a
del fc
del a1
del out
del fc1

In [21]:
import random
import time
nums = tuple(random.random() for _ in range(20))

In [22]:
start = time.time()
for i in range(2000):
    inp_list = list(nums)
    out = tuple(inp_list)
print((time.time()-start)*1000)

2.909421920776367


In [10]:
inp  = torch.ones((1000,1000)).to(0)

In [11]:
start = time.time()
for i in range(2000):
    inp2 = inp.to(0)
print((time.time()-start)*1000/2000)

0.0005660057067871094


In [2]:
a = torch.rand((1,2))
b = torch.rand((1,2))
a+b

tensor([[1.8403, 0.5308]])

In [58]:
l = torch.rand((100000,10000)).to(0)

In [59]:
b = l.to(1)


with torch.cuda.stream(COMPUTE_STREAM[1]):
    c = torch.sum(b)


In [60]:
torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2); torch.cuda.synchronize(3)

d = torch.sum(b)
print(c-d)

tensor(-1632., device='cuda:1')


In [61]:
del l
del b, c
del d



In [327]:
#l = torch.rand((1000,1000), dtype = torch.double) # No problem with this
l = torch.rand((1000,1000), dtype = torch.double).to(0)
actual_sum = torch.sum(l, dtype = torch.double)

b = l.to(1)
    
default_sum = torch.sum(b, dtype = torch.double)

# No sum diff if this is here
# torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2); torch.cuda.synchronize(3)

with torch.cuda.stream(COMPUTE_STREAM[1]):
    stream_sum = torch.sum(b, dtype = torch.double)

print(stream_sum)
print(default_sum)
print(actual_sum)
print(stream_sum-default_sum)

del l
del b, stream_sum
del default_sum
del actual_sum
gc.collect()
#torch.cuda.empty_cache() #IF this is there the  the sum diff is always negative
torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2); torch.cuda.synchronize(3)


tensor(500100.9181, device='cuda:1', dtype=torch.float64)
tensor(500100.9181, device='cuda:1', dtype=torch.float64)
tensor(500100.9181, device='cuda:0', dtype=torch.float64)
tensor(0., device='cuda:1', dtype=torch.float64)


In [338]:
#l = torch.rand((1000,1000), dtype = torch.double) # No problem with this
l = torch.rand((1000,1000), dtype = torch.double).to(0)
actual_sum = torch.sum(l, dtype = torch.double)

with torch.cuda.stream(COMPUTE_STREAM[0]):
    with torch.cuda.stream(COMPUTE_STREAM[1]):
        b = l.to(1)
    
with torch.cuda.stream(COMPUTE_STREAM[1]):
    stream_sum = torch.sum(b, dtype = torch.double)
    
default_sum = torch.sum(b, dtype = torch.double)

print(stream_sum)
print(default_sum)
print(actual_sum)
print(stream_sum-default_sum)

## Here stream sum is equal to actual sum!!!

del l
del b, stream_sum
del default_sum
del actual_sum
gc.collect()
#torch.cuda.empty_cache() #IF this is there the  the sum diff is always negative
torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2); torch.cuda.synchronize(3)


tensor(500228.1464, device='cuda:1', dtype=torch.float64)
tensor(499980.1120, device='cuda:1', dtype=torch.float64)
tensor(500228.1464, device='cuda:0', dtype=torch.float64)
tensor(248.0344, device='cuda:1', dtype=torch.float64)


In [428]:
n = 0

In [489]:
l = (n*torch.ones((10000,10000), dtype = torch.double)).to(0)
actual_sum = torch.sum(l, dtype = torch.double)

with torch.cuda.stream(COMPUTE_STREAM[0]):   # No probblem if there's only one of the two streams
    with torch.cuda.stream(COMPUTE_STREAM[1]):
        b = l.to(1)

default_sum = torch.sum(b, dtype = torch.double)

print(default_sum)
print(actual_sum)
print(default_sum-actual_sum.to(1))

del l
del b,
del default_sum
del actual_sum
gc.collect()
#torch.cuda.empty_cache() #If this is there the  the sum diff is always negative
torch.cuda.synchronize(0); torch.cuda.synchronize(1); torch.cuda.synchronize(2); torch.cuda.synchronize(3)
n=1-n

tensor(65622272., device='cuda:1', dtype=torch.float64)
tensor(0., device='cuda:0', dtype=torch.float64)
tensor(65622272., device='cuda:1', dtype=torch.float64)


In [1]:
from baechi_units import *

In [85]:
class SubModuleNode:
    """
    This class represents a submodel (ex. conv2d layer) in the given model (ex. inception_v3). 
    It is represented as a node in the return graph
    """
    def __init__(self):
        # store the entire submodel
        self.module = None
        # submodel name
        self.name = None

        # nodes that must finish processing before this node (direct dependencies)
        self.parent = set()
        # nodes that depends on this node
        self.children = set()

        # forward function's estimated runtime
        self.weight_forward = 0
        # backward function's estimated runtime
        self.weight_backward = 0
        # id represented by the model's location (python's id function)
        self.id_hash = None
        # sudo id used, for one model, this sudo id starts from 0 and add 1 for each new node
        # -- self.id = None
        # storage used by submodel's parameters (weight, bias)
        self.persistent_memory = 0
        # submodel's input's size
        self.input_memory = 0
        # submodel's output's size
        self.output_memory = 0
        # temporary memory used in forward run
        self.temporary_memory = 0
        
        # gpu assigned to the submodule
        self.p = None
        
########################################################################

class Profiling:
    """
    This class produce the profile, this class referenced "https://github.com/msr-fiddle/pipedream"
    """
    def __init__(self, model, gpu=0, rounds=20, input_size=(50, 10)):
        """
        model: ex. inception_v3 model, alexnet model, etc
        gpu: choose in between {0,1,2,3}
        rounds: number of rounds to run the profiling
        """
        self.gpu = gpu
        self.model = model.to(self.gpu)
        self.input_size = input_size

        self.rounds = rounds
        # first few rounds are inaccurate, so I choose to discard the results from the first 1/4 rounds
        self.ignore_rounds = int(self.rounds/4)
        # counting variable, runs from 0 - self.rounds
        self.cur_round = 0

        # used to calculate backward runtime for each submodule
        self.back_record = []
        # all submodules record of the form {id of the layer(submodule) : SubModuleNode created out of tha layer}
        self.sub_module_nodes = {}
        # use id_hash to record the order of submodules's execution
        self.submodule_order = []

        # internal use only, record the original forward functions for submodules
        self.forward_original_methods = {}
        # internal use only, switch back to the original forward functions after profiling
        self.detach_record = set()
        # Collect handles to all hooks added, so as to remove them in detach()
        self.hook_handles = []


    def recur_function(self, module):
        """
        modify self.model: adding forward timing, backward timing, input output sizes, etc
        :param module: the model to recursively add forward/backward wrappers to
        """
        this_profiler = self
        sub_modules = module.__dict__['_modules']
        for name, sub_module in sub_modules.items():
            # sub modules of sub_module, if there are more than 1, we need further recursion
            sub_sub_modules = sub_module.__dict__['_modules']
            if len(sub_sub_modules) > 0:
                self.recur_function(sub_module)
                continue
            
            def _calculate_time_and_memory(function, *input):
                """
                - Helper function in forward wrapper
                - Calculates forward runtime, peak memory used and static memory used
                - Verified: Memory measurement context doesn't add overhead to
                  time measurement
                """
                with TorchTracemalloc(self.gpu) as tt:
                    torch.cuda.synchronize(self.gpu)
                    start_time = time.time()
                    result = function(*input)
                    torch.cuda.synchronize(self.gpu)
                    stop_time = time.time()
                return (stop_time - start_time) * 1000, tt.used, tt.peaked , result

            def forward_wrapper(cur_module, *input):
                """
                use this wrapper to replace the original forward function in submodules
                :param cur_module: the input submodule
                """
                # original forward function
                
                function = this_profiler.forward_original_methods[cur_module]
                if this_profiler.cur_round < this_profiler.ignore_rounds:
                    if this_profiler.cur_round == 0:
                        # record submodule execution order only in the first round
                        print('-->', "Module name: ",cur_module)
                        this_profiler.submodule_order.append(id(cur_module))
                    # do not record first few rounds
                    result = function(*input)
                    return result
                
                ## collect relevant information of cur module
                forward_time, used_mem, peak_mem, result = _calculate_time_and_memory(function, *input)
                
                ## Input size in bytes
                input_size = 0
                for inp in input:
                    input_size = input_size + estimate_tensor_size(inp, 'B')
                
                ## Model size in bytes
                persistent_memory = estimate_model_size(cur_module,'B', False)

                output_memory = estimate_tensor_size(result, 'B')
                
                '''
                if not(used_mem==512*np.ceil(output_memory/512)):
                    print('*'*50)
                    print("In sumodule ", cur_module , ':' )
                    print("Output memory is: ", output_memory)
                    print("But used memory is: ", used_mem)
                    print("They dont match upto a factor of 512 (since mem bolcks are alotted in 512 byte locks) as expected")
                    print('*'*50)
                '''
                    
                temporary_memory = peak_mem - used_mem

                # record a SubModuleNode for each model layer
                if id(cur_module) not in this_profiler.sub_module_nodes:
                    cur_node = SubModuleNode()
                    cur_node.id_hash = id(cur_module)
                    cur_node.module = cur_module
                    cur_node.name = cur_module.__class__.__name__
                    
                    #***********?????????????????????????????????????????***************************
                    ########## REMOVE THIS ######################
                    cur_node.persistent_memory = persistent_memory
                    cur_node.temporary_memory = temporary_memory
                    cur_node.output_memory = output_memory
                    cur_node.input_memory = input_size
                    #############################################
                    #***********?????????????????????????????????????????***************************
                    
                    ### And Uncomment this
                    #cur_node.persistent_memory = persistent_memory
                    #cur_node.temporary_memory = temporary_memory
                    #cur_node.output_memory = output_memory
                    #cur_node.input_memory = input_size
                    
                    print("Module name: ", cur_node.name)
                    print("Persistent Mem:", cur_node.persistent_memory)
                    print("Temporary Mem:", cur_node.temporary_memory )
                    print("Output Mem:", cur_node.output_memory)
                    
                else:
                    cur_node = this_profiler.sub_module_nodes[id(cur_module)]
                # we want weight_forward as the average forward runtime of the relevent rounds
                cur_node.weight_forward += forward_time / (this_profiler.rounds - this_profiler.ignore_rounds)
                this_profiler.sub_module_nodes[id(cur_module)] = cur_node

                return result

            def hook(cur_module, inputs, output):
                # this is for retriving the module inside make dot function
                if isinstance(output, tuple):
                    for i in range(len(output)):
                        print(output[i]
                        
                        #otp.grad_fn.metadata['module'] = cur_module
                else:
                    output.grad_fn.metadata['module'] = cur_module
                print("*"*50)

            def backward_post_hook(cur_module, input, output):
                """
                add backward hook to record backward runtime
                :param cur_module: the input submodule
                """
                if this_profiler.cur_round < this_profiler.ignore_rounds:
                    # do not record first few rounds
                    return
                torch.cuda.synchronize(0)
                cur_time = time.time() * 1000
                this_profiler.back_record.append((id(cur_module), cur_time))

            if sub_module in self.forward_original_methods:
                # only record the original forward functions once
                continue

            self.forward_original_methods[sub_module] = sub_module.forward
            sub_module.forward = forward_wrapper.__get__(sub_module, sub_module.__class__)
            fhook_handle = sub_module.register_forward_hook(hook)
            bhook_handle =  sub_module.register_backward_hook(backward_post_hook)
            this_profiler.hook_handles.append(fhook_handle)
            this_profiler.hook_handles.append(bhook_handle)
            
            
    def detach(self, module):
        """
        use this helper function to detach all forward wrappers
        """
        this_profiler = self
        sub_modules = module.__dict__['_modules']
        for name, sub_module in sub_modules.items():
            sub_sub_modules = sub_module.__dict__['_modules']
            if len(sub_sub_modules) > 0:
                self.detach(sub_module)
                continue
            if sub_module in self.detach_record:
                continue

            self.detach_record.add(sub_module)
            sub_module.forward = self.forward_original_methods[sub_module]
        ## Remove all the hooks that were added
        for handle in this_profiler.hook_handles:
            handle.remove()

    def run(self):
        """
        :return: the model's output of the final round
        """
        self.sub_module_nodes = {}
        self.recur_function(self.model)

        dataset = torch.randint(self.input_size[0], (self.rounds * int(args.batch_size), 2), dtype=torch.long).to(self.gpu)

        for batch_idx in range(self.rounds):
            inp = dataset[ int(args.batch_size)*(self.rounds-1):int(args.batch_size)*(self.rounds) ]
            self.cur_round = batch_idx
            
            hidden = model.init_hidden(int(args.batch_size))

            torch.cuda.synchronize(self.gpu)
            output = self.model(inp, hidden)
            torch.cuda.synchronize(self.gpu)

        self.detach(self.model)
        return output

########################################################################


In [86]:
import torch.nn as nn
import torch.nn.init as init

class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        init.normal_(self.embedding.weight, 0.0, 0.2)

        self.lstm = nn.LSTM(
            hidden_size,
            int(hidden_size/2),  # Bi-directional processing will ouput vectors of double size, therefore I reduced output dimensionality
            num_layers=n_layers,
            batch_first=True,  # First dimension of input tensor will be treated as a batch dimension
            bidirectional=True
        )

    # word_inputs: (batch_size, seq_length), h: (h_or_c, layer_n_direction, batch, seq_length)
    def forward(self, word_inputs, hidden_in):         
        # embedded (batch_size, seq_length, hidden_size)
        gpuid = word_inputs.get_device()

        hidden = (hidden_in[0].to(gpuid), hidden_in[1].to(gpuid))
        
        embedded = self.embedding(word_inputs)
        # output (batch_size, seq_length, hidden_size*directions)
        # hidden (h: (num_layers*directions, batch_size, hidden_size),
        #         c: (num_layers*directions, batch_size, hidden_size))
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def init_hidden(self, batches):
        #hidden = torch.zeros(2, self.n_layers*2, batches, int(self.hidden_size/2))
        h_s = torch.zeros(self.n_layers*2, batches, int(self.hidden_size/2))
        c_s = torch.zeros(self.n_layers*2, batches, int(self.hidden_size/2))
        hidden = (h_s, c_s)
        return hidden

In [87]:
vocab_size = 50
hidden_dim = 10
n_layers = 2

model = EncoderRNN(vocab_size, hidden_dim, n_layers).to(0)

In [88]:
inp_size = (vocab_size, hidden_dim)
tester = Profiling(model, args.prof_gpu_id, args.prof_rounds, input_size = inp_size)

In [89]:
tester.run()

--> Module name:  Embedding(50, 10)
**************************************************
--> Module name:  LSTM(10, 5, num_layers=2, batch_first=True, bidirectional=True)
torch.Size([32, 2, 10])


AttributeError: 'tuple' object has no attribute 'size'

In [38]:
vocab_size = 50
hidden_dim = 10
n_layers = 2

model = EncoderRNN(vocab_size, hidden_dim, n_layers)


EncoderRNN(
  (embedding): Embedding(50, 10)
  (lstm): LSTM(10, 5, num_layers=2, batch_first=True, bidirectional=True)
)


In [49]:
word_input = torch.LongTensor([[1, 2, 30, 4, 5]])

In [50]:
module_hidden = model.init_hidden(1)

In [51]:
module_outputs, module_hidden = moduleRNN(word_input, module_hidden)

In [52]:
module_outputs.size()

torch.Size([1, 5, 10])

In [62]:
inp = torch.randint(vocab_size, (20, 7), dtype=torch.long)
otp = torch.randint(vocab_size, (20, 10), dtype=torch.long)

In [None]:
tester = Profiling(model, gpu, rounds, input_size = inp_size)

In [2]:
hidden_size = 5
n_layers = 2

ls = nn.LSTM(
            hidden_size,
            int(hidden_size/2),  # Bi-directional processing will ouput vectors of double size, therefore I reduced output dimensionality
            num_layers=n_layers,
            batch_first=True,  # First dimension of input tensor will be treated as a batch dimension
            bidirectional=True
        )


In [7]:
a = torch.randn(3, 1)
b = torch.randn(4, 1)
print(b)
print(a[:,None,:])
print(b[None,:,:])

a[:,None,:]-b[None,:,:]

tensor([[ 0.2240],
        [-0.4726],
        [ 1.0271],
        [-1.0378]])
tensor([[[ 0.9209]],

        [[-1.2188]],

        [[ 0.0925]]])
tensor([[[ 0.2240],
         [-0.4726],
         [ 1.0271],
         [-1.0378]]])


tensor([[[ 0.6969],
         [ 1.3935],
         [-0.1062],
         [ 1.9587]],

        [[-1.4428],
         [-0.7462],
         [-2.2459],
         [-0.1810]],

        [[-0.1315],
         [ 0.5651],
         [-0.9346],
         [ 1.1303]]])