In [1]:
# MTL
import torch
import torch.nn as nn
import numpy

In [2]:
class color:
    """ANSI escape sequences used to style text printed by the notebook.

    Wrap a string as ``color.BOLD + text + color.END`` to style it and then
    reset the terminal attributes afterwards.
    """

    # Colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every attribute set above
    END = '\033[0m'

In [3]:
def print_grad_tree(fn, prefix = ''):
    """Recursively print an autograd graph, one node per line.

    Args:
        fn: Node to start from, typically ``tensor.grad_fn``. May be ``None``
            (e.g. the ``grad_fn`` of a leaf tensor, or a graph input that
            needs no gradient); it is then printed but not descended into.
        prefix: Text printed before every node. Four spaces are appended to
            it for each level of depth, which produces the indentation.
    """
    print(prefix, fn)
    if fn is None:
        # A missing node has no ``next_functions``; stop here instead of
        # raising AttributeError.
        return
    for entry in fn.next_functions:
        # Each entry is indexable and its first element is the child node.
        print_grad_tree(entry[0], prefix + '    ')

In [4]:
class MTL(nn.Module):
    """Toy multi-task model: one shared embedding feeding two linear heads.

    Every parameter is set to a fixed constant, so each new instance starts
    from exactly the same weights. The head weights are stored as 1-D tensors
    of shape ``(2,)`` rather than the ``(out_features, in_features)`` layout
    ``nn.Linear`` creates by default.
    """

    def __init__(self):
        super().__init__()

        # Shared encoder: 4 indices, each mapped to a 2-dimensional vector.
        embedding_table = torch.tensor(
            [[0.2, 0.1],
             [0.3, 0.4],
             [0.5, 0.6],
             [0.7, 0.8]], requires_grad=True)
        self.shared = nn.Embedding(4, 2, _weight=embedding_table)

        # Task-specific heads, no bias.
        self.t1 = nn.Linear(2, 1, bias=False)
        self.t2 = nn.Linear(2, 1, bias=False)

        # Overwrite the default initialisation with constant weights.
        for head, value in ((self.t1, 0.5), (self.t2, 0.2)):
            head.weight.data = torch.tensor(
                [value, value], dtype=torch.float32, requires_grad=True)

    def forward(self, x):
        """Return ``(shared encoding, task-1 output, task-2 output)`` for ``x``."""
        shared_repr = self.shared(x)
        return shared_repr, self.t1(shared_repr), self.t2(shared_repr)

    def print(self):
        """Print each sub-module's name followed by its weight parameter."""
        for label in ('shared', 't1', 't2'):
            print(label)
            print(getattr(self, label).weight)

In [5]:
# Build a model with the fixed, hand-set weights and print its parameters.
m = MTL()
m.print()

shared
Parameter containing:
tensor([[0.2000, 0.1000],
        [0.3000, 0.4000],
        [0.5000, 0.6000],
        [0.7000, 0.8000]], requires_grad=True)
t1
Parameter containing:
tensor([0.5000, 0.5000], requires_grad=True)
t2
Parameter containing:
tensor([0.2000, 0.2000], requires_grad=True)


In [6]:
# Draw one embedding index in [0, 4). A locally seeded generator makes the
# draw identical on every "Restart & Run All" without touching the global
# RNG state.
x = torch.randint(0, 4, (1,), generator=torch.Generator().manual_seed(0))
# Forward pass: shared encoding plus the two task-head outputs.
enc, y1, y2 = m(x)

# Grad Trees

In [7]:
# Print the autograd graph behind each forward output (both task heads and
# the shared encoding), framed by a bold banner above and below.
for title, label, output in (
    ('y1 tree', 'y1', y1),
    ('y2 tree', 'y2', y2),
    ('shared tree', 'shared', enc),
):
    banner = color.BOLD + title + color.END
    print(banner)
    print_grad_tree(output.grad_fn, color.BOLD + label + color.END)
    print(banner)

[1my1 tree[0m
[1my1[0m <MvBackward0 object at 0x7fbf7be352b0>
[1my1[0m     <EmbeddingBackward0 object at 0x7fbf7be35640>
[1my1[0m         <AccumulateGrad object at 0x7fbf7be35880>
[1my1[0m     <TBackward0 object at 0x7fbf7be356a0>
[1my1[0m         <AccumulateGrad object at 0x7fbf7be35bb0>
[1my1 tree[0m
[1my2 tree[0m
[1my2[0m <MvBackward0 object at 0x7fbf7be35bb0>
[1my2[0m     <EmbeddingBackward0 object at 0x7fbf7be35fa0>
[1my2[0m         <AccumulateGrad object at 0x7fbf7be32190>
[1my2[0m     <TBackward0 object at 0x7fbf7be35fd0>
[1my2[0m         <AccumulateGrad object at 0x7fbf7be32490>
[1my2 tree[0m
[1mshared tree[0m
[1mshared[0m <EmbeddingBackward0 object at 0x7fbf7be32490>
[1mshared[0m     <AccumulateGrad object at 0x7fbf7be32880>
[1mshared tree[0m


# Check if model is deterministic

In [8]:
#### Round 1
# Build a fresh model, show its parameters, and run it on the fixed index 3.
m1 = MTL()
m1.print()
x = torch.tensor([3])
enc, y1, y2 = m1(x)
print('\n\n\nOUTPUTS:')
print(enc, y1, y2)

shared
Parameter containing:
tensor([[0.2000, 0.1000],
        [0.3000, 0.4000],
        [0.5000, 0.6000],
        [0.7000, 0.8000]], requires_grad=True)
t1
Parameter containing:
tensor([0.5000, 0.5000], requires_grad=True)
t2
Parameter containing:
tensor([0.2000, 0.2000], requires_grad=True)



OUTPUTS:
tensor([[0.7000, 0.8000]], grad_fn=<EmbeddingBackward0>) tensor([0.7500], grad_fn=<MvBackward0>) tensor([0.3000], grad_fn=<MvBackward0>)


In [9]:
#### Round 2
# Repeat with another freshly built model and the same index 3, so the
# printed parameters and outputs can be compared with Round 1.
m1 = MTL()
m1.print()
x = torch.tensor([3])
enc, y1, y2 = m1(x)
print('\n\n\nOUTPUTS:')
print(enc, y1, y2)

shared
Parameter containing:
tensor([[0.2000, 0.1000],
        [0.3000, 0.4000],
        [0.5000, 0.6000],
        [0.7000, 0.8000]], requires_grad=True)
t1
Parameter containing:
tensor([0.5000, 0.5000], requires_grad=True)
t2
Parameter containing:
tensor([0.2000, 0.2000], requires_grad=True)



OUTPUTS:
tensor([[0.7000, 0.8000]], grad_fn=<EmbeddingBackward0>) tensor([0.7500], grad_fn=<MvBackward0>) tensor([0.3000], grad_fn=<MvBackward0>)


#### As you can see, the model is deterministic: every freshly built `MTL` starts from the same fixed weights, so the same input always produces the same output.

# Back Prop Effect

In [10]:
# Two independent but identically initialised models: m1 is later
# backpropagated with the task-2 loss only, m2 with the sum of both losses.
m1 = MTL()
m2 = MTL()

# Same input index for both models.
x1 = torch.tensor([3])
x2 = torch.tensor([3])

# Targets for m1 (task 1, task 2).
y1gold_1 = torch.tensor([10.0,])
y2gold_1 = torch.tensor([5.0,])

# Identical targets for m2.
y1gold_2 = torch.tensor([10.0,])
y2gold_2 = torch.tensor([5.0,])

# Loss function shared by both tasks and both models.
mse = nn.MSELoss()


In [11]:
# Forward pass of each model on its (identical) input.
enc_1, y1_1, y2_1 = m1(x1)
enc_2, y1_2, y2_2 = m2(x2)


# Display the four head outputs side by side.
y1_1, y2_1, y1_2, y2_2

(tensor([0.7500], grad_fn=<MvBackward0>),
 tensor([0.3000], grad_fn=<MvBackward0>),
 tensor([0.7500], grad_fn=<MvBackward0>),
 tensor([0.3000], grad_fn=<MvBackward0>))

In [13]:
# Per-task MSE losses for m1. Only the task-2 loss (l2_1) is backpropagated
# further down, to see which parameters receive a gradient from it alone.
l1_1 = mse(y1_1, y1gold_1)
l2_1 = mse(y2_1, y2gold_1)
print(l1_1, l2_1)

tensor(85.5625, grad_fn=<MseLossBackward0>) tensor(22.0900, grad_fn=<MseLossBackward0>)


In [15]:
# Per-task MSE losses for m2.
l1_2 = mse(y1_2, y1gold_2)
l2_2 = mse(y2_2, y2gold_2)
print(l1_2, l2_2)

tensor(85.5625, grad_fn=<MseLossBackward0>) tensor(22.0900, grad_fn=<MseLossBackward0>)


In [16]:

# Backpropagate only the task-2 loss through m1, then display the gradients
# on the task-1 head, the task-2 head, and the shared embedding.
l2_1.backward()
m1.t1.weight.grad, m1.t2.weight.grad, m1.shared.weight.grad

(None,
 tensor([-6.5800, -7.5200]),
 tensor([[ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [-1.8800, -1.8800]]))

In [17]:


# Joint objective for m2: the sum of both task losses.
l = l1_2 + l2_2
# Backpropagate the joint loss, then display the same three gradients for m2.
l.backward()
m2.t1.weight.grad, m2.t2.weight.grad, m2.shared.weight.grad

(tensor([-12.9500, -14.8000]),
 tensor([-6.5800, -7.5200]),
 tensor([[  0.0000,   0.0000],
         [  0.0000,   0.0000],
         [  0.0000,   0.0000],
         [-11.1300, -11.1300]]))

In [None]:
# Children of the second input node of y1's grad_fn.
y1.grad_fn.next_functions[1][0].next_functions