In [1]:
from IPython.core.interactiveshell import InteractiveShell

# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# External Lib imports
import collections
import html
import os
import pickle
import re
from functools import partial
from pathlib import Path
from typing import Callable

import pandas as pd
import sklearn
from tqdm import tqdm

os.environ['QT_QPA_PLATFORM'] = 'offscreen'

# FastAI Imports
from fastai import text, core, lm_rnn

# Torch imports
import torch.nn as nn
import torch.tensor as T
import torch.nn.functional as F

# Mytorch imports
from mytorch import loops as mtlp
from mytorch.utils.goodies import *
from mytorch import lriters as mtlr

import utils
from options import Options as params

# Macros

In [3]:
device = torch.device('cpu')
np.random.seed(42)
torch.manual_seed(42)

DEBUG = True
TRIM=False

# Path fields
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

WIKI_DATA_PATH = Path('raw/wikitext/wikitext-103/')
WIKI_DATA_PATH.mkdir(exist_ok=True)
IMDB_DATA_PATH = Path('raw/imdb/aclImdb/')
IMDB_DATA_PATH.mkdir(exist_ok=True)
PATH = Path('resources/proc/imdb')
DATA_PROC_PATH = PATH / 'data'
DATA_LM_PATH = PATH / 'datalm'

LM_PATH = Path('resources/models')
LM_PATH.mkdir(exist_ok=True)
PRE_PATH = LM_PATH / 'wt103'
PRE_LM_PATH = PRE_PATH / 'fwd_wt103.h5'
CLASSES = ['neg', 'pos', 'unsup']
WIKI_CLASSES = ['wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens']

<torch._C.Generator at 0x7f3c051befb0>

# Network

Three part network,
 - a feature extractor
 - a label predictor
 - a domain classifier
 
 Ref:
 bs = 10
 sl = 5
 hiddim = 20

In [4]:
bs = 10
sl = 5
hdim = 20

np_x = np.random.randint(0, 1000, (sl, bs))
np_y = np.random.randint(0, 3, (bs))
np_d = np.random.randint(0, 2, (bs))

In [5]:
class FeatExtractor(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(1000, 20)
        self.rnn = nn.LSTM(20, 20)
        
    def init_hidden(self, bs):
        return (torch.zeros((1, bs, 20)),
                torch.zeros((1, bs, 20)))
        
    def forward(self, x, h):
        x_emb = self.emb(x)
        print(x_emb.shape, h[0].shape, h[1].shape)
        x, h = self.rnn(x_emb, h)
        return x, h
    
    @property
    def layers(self):
        return torch.nn.ModuleList([
            self.emb, self.rnn
        ])    
class LabelPredictor(nn.Module):
    
    def __init__(self, out):
        super().__init__()
        self.clf = nn.Linear(sl*hdim, out)
        
    @property
    def layers(self):
        return torch.nn.ModuleList([self.clf])
        
    def forward(self, x):
        
        return x, self.clf(x)   
class DomainClassifier(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.clf = nn.Linear(sl*hdim, out)    
class GradReverse(Function):
    """
        Torch function used to invert the sign of gradients (to be used for argmax instead of argmin)
        Usage:
            x = GradReverse.apply(x) where x is a tensor with grads.
    """
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg()  
class ZeNetwork(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.f = FeatExtractor()
        self.y = LabelPredictor(3)
        self.d = LabelPredictor(2)
        
    @property
    def layers(self):
        return self.f.layers.extend(self.y.layers.extend(self.d.layers))
    
    def domain_(self, x):
        h = self.f.init_hidden(x.shape[1])
        x, h = self.f(x, h)
        
        x = x.transpose(1, 0).reshape(h[0].shape[1], -1)
        
        x, d = self.d(x)
        return x, d
        
    def forward(self, x):
        h = self.f.init_hidden(x.shape[1])
        x, h = self.f(x, h)
        
        x = x.transpose(1, 0).reshape(h[0].shape[1], -1)
        
        x, y = self.y(x)
#         print(y.shape)
        
        return x, y
    
    def domain(self, x):
        x = GradReverse.apply(x)
        x, y = self.d(x)
        return x, y

In [6]:
# flow = 'dann'
# assert flow  in ['main', 'dann', 'both']

model = ZeNetwork()
model.layers
lfn = nn.CrossEntropyLoss()
_x = torch.tensor(np_x)
_y = torch.tensor(np_y)
_d = torch.tensor(np_d)

ModuleList(
  (0): Embedding(1000, 20)
  (1): LSTM(20, 20)
  (2): Linear(in_features=100, out_features=3, bias=True)
  (3): Linear(in_features=100, out_features=2, bias=True)
)

In [8]:
_y, _d

(tensor([1, 2, 2, 0, 2, 0, 2, 2, 0, 0]),
 tensor([0, 1, 1, 0, 1, 1, 1, 1, 0, 1]))

In [9]:
model.zero_grad()
x, y = model(_x)
l_main = lfn(y, _y)
l_main.backward()
grads_main_f = [param.grad.clone() for param in model.f.rnn.parameters()]
print(l_main)

torch.Size([5, 10, 20]) torch.Size([1, 10, 20]) torch.Size([1, 10, 20])
tensor(1.1052, grad_fn=<NllLossBackward>)


In [12]:
model.zero_grad()
x, d = model.domain_(_x)
l_dann = lfn(d, _d)
l_dann.backward()
grads_dann_f = [param.grad.clone() for param in model.f.rnn.parameters()]
print(l_dann)

torch.Size([5, 10, 20]) torch.Size([1, 10, 20]) torch.Size([1, 10, 20])
tensor(0.6961, grad_fn=<NllLossBackward>)


In [13]:
model.zero_grad()
_x = torch.tensor(np_x)
_y = torch.tensor(np_y)
_d = torch.tensor(np_d)

x, y = model(_x)
l_main = lfn(y, _y)
# l_main.backward(retain_graph=True)

x, d = model.domain(x)
l_dann = lfn(d, _d)
l = l_main - ( 0.7 * l_dann)
l.backward()

torch.Size([5, 10, 20]) torch.Size([1, 10, 20]) torch.Size([1, 10, 20])


In [16]:
grads_main_f[0] + (0.7*grads_dann_f[0])

tensor([[-0.0024,  0.0023, -0.0031,  ..., -0.0015, -0.0019, -0.0030],
        [ 0.0003, -0.0003,  0.0010,  ...,  0.0008, -0.0002, -0.0016],
        [ 0.0030,  0.0003, -0.0005,  ..., -0.0026,  0.0018,  0.0007],
        ...,
        [-0.0010,  0.0022, -0.0008,  ...,  0.0027, -0.0015, -0.0003],
        [ 0.0002,  0.0017,  0.0012,  ...,  0.0007,  0.0012,  0.0007],
        [ 0.0015,  0.0037,  0.0017,  ..., -0.0001, -0.0002,  0.0006]])

In [14]:
[param.grad for param in model.f.rnn.parameters()][0]

tensor([[-0.0024,  0.0023, -0.0031,  ..., -0.0015, -0.0019, -0.0030],
        [ 0.0003, -0.0003,  0.0010,  ...,  0.0008, -0.0002, -0.0016],
        [ 0.0030,  0.0003, -0.0005,  ..., -0.0026,  0.0018,  0.0007],
        ...,
        [-0.0010,  0.0022, -0.0008,  ...,  0.0027, -0.0015, -0.0003],
        [ 0.0002,  0.0017,  0.0012,  ...,  0.0007,  0.0012,  0.0007],
        [ 0.0015,  0.0037,  0.0017,  ..., -0.0001, -0.0002,  0.0006]])

# Conclusion!

What I find is, that the best way to do the DANN updates, i.e.

$ \theta_f \leftarrow \theta_f - \mu \left ( \frac{d\mathcal{L}_y}{d\theta_f} - \lambda \frac{d\mathcal{L}_d}{d\theta_f} \right ) $

$\theta_y \leftarrow \theta_y - \mu\frac{d\mathcal{L}_y}{d\theta_y}$

$\theta_d \leftarrow \theta_d - \mu\lambda\frac{d\mathcal{L}_d}{d\theta_d}$

is to simply compute loss 1, without backward compute loss 2,
add losses with lamda scaled loss 2
and do a simple backward

**CODE**:

```
x, y = model(_x)
l_main = lfn(y, _y)

x, d = model.domain(x)
l_dann = lfn(d, _d)
l = l_main - ( 0.7 * l_dann)
l.backward()
```