In [1]:
#hide
#skip
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab

In [2]:
#export 
from fastai.data.all import *
from fastai.text.models.core import *
from fastai.text.models.awdlstm import *
from xcube.layers import *

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#default_exp text.models.core

In [5]:
#hide
%load_ext autoreload
%autoreload 2

# Core text modules

> Contain the modules common between different architectures and the generic functions to get models

In [6]:
#export
# Per-architecture metadata, keyed by the architecture class.
# `get_text_classifier` looks up 'hid_name' and 'config_clas' from the entry of the requested `arch`.
_model_meta = {
    AWD_LSTM: {
        'hid_name': 'emb_sz',
        'url': URLs.WT103_FWD,
        'url_bwd': URLs.WT103_BWD,
        'config_lm': awd_lstm_lm_config,
        'split_lm': awd_lstm_lm_split,
        'config_clas': awd_lstm_clas_config,
        'split_clas': awd_lstm_clas_split,
    },
}

## Basic Models

In [6]:
#export
class SequentialRNN(nn.Sequential):
    "An `nn.Sequential` that forwards a `reset()` call to each direct child."
    def reset(self):
        "Call `reset` on every direct child; children without a `reset` attribute get `noop` instead."
        for child in self.children():
            reset_fn = getattr(child, 'reset', noop)
            reset_fn()

## Classification Models

In [7]:
#export
def _pad_tensor(t, bs):
    if t.size(0) < bs: return torch.cat([t, t.new_zeros(bs-t.size(0), *t.shape[1:])])
    return t

In [8]:
#export
class SentenceEncoder(Module):
    "Create an encoder over `module` that can process a full sentence."
    # Keeps the four constructor arguments via `store_attr`; `forward` reads them back as
    # `self.bptt`, `self.module`, `self.pad_idx` and `self.max_len`.
    def __init__(self, bptt, module, pad_idx=1, max_len=None): store_attr('bptt,module,pad_idx,max_len')
    # Forward `reset` to the wrapped module when it defines one; otherwise `noop` is called.
    def reset(self): getattr(self.module, 'reset', noop)()

    def forward(self, input):
        "Run `module` over `input` in chunks of `bptt` columns; return `(outs, mask)`, `mask` being True where `input == pad_idx`."
        # `input` is 2-D: `bs` rows (batch) by `sl` columns (sequence positions).
        bs,sl = input.size()
        # Reset the wrapped module before processing a new batch.
        self.reset()
        mask = input == self.pad_idx
        outs,masks = [],[]
        for i in range(0, sl, self.bptt):
            #Note: this expects that sequence really begins on a round multiple of bptt
            # Count the rows whose token at column `i` is not padding ...
            real_bs = (input[:,i] != self.pad_idx).long().sum()
            # ... and feed only the first `real_bs` rows of this chunk to the module.
            # NOTE(review): this is only correct if those rows are the leading rows of the
            # batch (presumably the batch is sorted by length) — verify against the data loader.
            o = self.module(input[:real_bs,i: min(i+self.bptt, sl)])
            # Every chunk is run through the module, but its output is kept only when the
            # chunk starts within the last `max_len` columns (or when `max_len` is None).
            if self.max_len is None or sl-i <= self.max_len:
                outs.append(o)
                masks.append(mask[:,i: min(i+self.bptt, sl)])
        # Chunks computed on fewer than `bs` rows are zero-padded back to `bs` rows, then all
        # kept chunks are concatenated along the sequence dimension.
        outs = torch.cat([_pad_tensor(o, bs) for o in outs], dim=1)
        mask = torch.cat(masks, dim=1)
        return outs,mask

Examples:

In [9]:
config = awd_lstm_clas_config.copy()
del config['output_p']
config

{'emb_sz': 400,
 'n_hid': 1152,
 'n_layers': 3,
 'pad_token': 1,
 'bidir': False,
 'hidden_p': 0.3,
 'input_p': 0.4,
 'embed_p': 0.05,
 'weight_p': 0.5}

In [12]:
# Wrap an AWD_LSTM in a SentenceEncoder that feeds it 72-column chunks and keeps only the
# outputs of chunks starting within the last 72*20 columns. `.cuda()` means this cell needs a GPU.
encoder = SentenceEncoder(72, AWD_LSTM(vocab_sz=60008, **config), pad_idx=1, max_len=72*20).cuda()
encoder

SentenceEncoder(
  (module): AWD_LSTM(
    (encoder): Embedding(60008, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60008, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
)

> Warning: This module expects the inputs padded with most of the padding first, with the sequence beginning at a round multiple of bptt (and the rest of the padding at the end). Use `pad_input_chunk` to get your data in a suitable format.

In [9]:
#export
def masked_concat_pool(output, mask, bptt):
    "Pool the encoder `output` into one vector per row: [last_hidden, max_pool, avg_pool], skipping positions where `mask` is True."
    pad = mask[:, :, None]  # broadcast the mask over the feature dimension
    # Number of unmasked positions in each row.
    n_valid = output.shape[1] - mask.long().sum(dim=1)
    # Number of masked positions among the final `bptt` columns of each row.
    n_tail_pad = mask[:, -bptt:].long().sum(dim=1)

    # Mean over unmasked positions: zero the masked ones, sum, divide by the valid count.
    avg_pool = output.masked_fill(pad, 0).sum(dim=1)
    avg_pool.div_(n_valid.type(avg_pool.dtype)[:, None])

    # Max over unmasked positions: masked ones are set to -inf so they never win.
    max_pool = output.masked_fill(pad, -float('inf')).max(dim=1)[0]

    # Step back from the end of each row by its count of masked tail positions.
    rows = torch.arange(0, output.size(0))
    last_hidden = output[rows, -n_tail_pad - 1]

    return torch.cat([last_hidden, max_pool, avg_pool], 1)  # Concat pooling.

In [14]:
# Random token ids in [0, 60008) standing in for a batch of 128 sequences of 18398 tokens each.
# `to_device` is presumably fastai's helper that moves the tensor to the default device — confirm.
x = to_device(torch.randint(low=0, high=60008, size=(128, 18398)))

In [15]:
out, mask = encoder(x)

In [16]:
out.shape, mask.shape

(torch.Size([128, 1406, 400]), torch.Size([128, 1406]))

In [10]:
#export
class PoolingLinearClassifier(Module):
    "Linear classifier head applied to the concat-pooled encoder output."
    def __init__(self, dims, ps, bptt, y_range=None):
        # `dims` lists the layer widths, so there is one linear block (and one dropout) per consecutive pair.
        if len(ps) != len(dims) - 1:
            raise ValueError("Number of layers and dropout values do not match.")
        # The same ReLU instance follows every block except the last, which has no activation.
        acts = [nn.ReLU(inplace=True)] * (len(dims) - 2) + [None]
        blocks = []
        for n_in, n_out, p, act in zip(dims[:-1], dims[1:], ps, acts):
            blocks.append(LinBnDrop(n_in, n_out, p=p, act=act))
        # Optionally add a `SigmoidRange` layer built from `y_range` at the end.
        if y_range is not None:
            blocks.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*blocks)
        self.bptt = bptt

    def forward(self, input):
        "Take the encoder's `(out, mask)` pair and return `(predictions, out, out)`."
        out, mask = input
        pooled = masked_concat_pool(out, mask, self.bptt)
        preds = self.layers(pooled)
        return preds, out, out

In [18]:
x = masked_concat_pool(out, mask, bptt=72)
x.shape

torch.Size([128, 1200])

The output of `masked_concat_pool` is fed into the decoder. Let's now check out the decoder, which compresses the incoming features (1200 in this case) to 50 linear features and then outputs one value per class (6594 classes in this example).

In [19]:
# Layer widths: 1200 pooled features -> 50 hidden features -> 6594 classes,
# with one dropout probability per linear block.
layers = [1200, 50, 6594]
ps = [0.04, 0.1]
decoder = PoolingLinearClassifier(layers, ps, bptt=72).cuda()
decoder

PoolingLinearClassifier(
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.04, inplace=False)
      (2): Linear(in_features=1200, out_features=50, bias=False)
      (3): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.1, inplace=False)
      (2): Linear(in_features=50, out_features=6594, bias=False)
    )
  )
)

In [20]:
preds, *_ = decoder((out, mask))

In [21]:
preds.shape

torch.Size([128, 6594])

---

Breaking down the `PoolingLinearClassifier.__init__`:

Note that, when `PoolingLinearClassifier` is created, the `dims` argument of its `__init__` is the `layers` list defined above.

In [None]:
dims = layers
print(f"{dims = }")

dims = [1200, 50, 6594]


In [None]:
print(f"{ps = }")

ps = [0.04000000000000001, 0.1]


Also note that `bptt` is the `seq_len` that `get_text_classifier` passes to the classifier.

In [None]:
# `seq_len` is not defined anywhere earlier in the notebook, so it is set here to the
# chunk width (72) used for `SentenceEncoder` and `PoolingLinearClassifier` in the examples above;
# without it this cell fails on a fresh kernel.
seq_len = 72
bptt = seq_len
print(f"{bptt = }")

bptt = 72


In [None]:
y_range = None

In [None]:
# Same validation as in `PoolingLinearClassifier.__init__`: one dropout value per linear block.
if len(ps) != len(dims) - 1: raise ValueError("Number of layers and dropout values do not match.")

In [None]:
acts = [nn.ReLU(inplace=True)] * (len(dims) - 2) + [None]
acts

[ReLU(inplace=True), None]

In [None]:
for i, o, p, a in zip(dims[:-1], dims[1:], ps, acts):
    print(f"{i = }, {o = }, {p = }, {a = }")

i = 1200, o = 50, p = 0.04000000000000001, a = ReLU(inplace=True)
i = 50, o = 6594, p = 0.1, a = None


In [None]:
layers = [LinBnDrop(i, o, p=p, act=a) for i, o, p, a in zip(dims[:-1], dims[1:], ps, acts)]
layers

[LinBnDrop(
   (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (1): Dropout(p=0.04000000000000001, inplace=False)
   (2): Linear(in_features=1200, out_features=50, bias=False)
   (3): ReLU(inplace=True)
 ),
 LinBnDrop(
   (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (1): Dropout(p=0.1, inplace=False)
   (2): Linear(in_features=50, out_features=6594, bias=False)
 )]

---

In [11]:
#export
class OurPoolingLinearClassifier(Module):
    "Pooling classifier with a single `LinBnDrop` block mapping `dims[0]` pooled features straight to `dims[1]` outputs."
    def __init__(self, dims, ps, bptt, y_range=None):
        # Unlike `PoolingLinearClassifier`, `ps` is a single dropout probability here (not a list),
        # because there is only one block.
        self.layer = LinBnDrop(dims[0], dims[1], p=ps, act=None)
        self.bptt = bptt
        # Honour `y_range` the same way `PoolingLinearClassifier` does (a trailing `SigmoidRange`);
        # previously the argument was accepted but silently ignored. It is kept outside `self.layer`
        # so that module keeps its original structure.
        self.out_range = None if y_range is None else SigmoidRange(*y_range)

    def forward(self, input):
        "Take the encoder's `(out, mask)` pair and return `(predictions, out, out)`."
        out, mask = input
        x = masked_concat_pool(out, mask, self.bptt)
        x = self.layer(x)
        if self.out_range is not None: x = self.out_range(x)
        return x, out, out

Note that `OurPoolingLinearClassifier` is exactly the same as fastai's `PoolingLinearClassifier`, except that it skips the feature compression from 1200 to 50 linear features.

In [23]:
decoder = OurPoolingLinearClassifier(dims=[1200, 6594], ps=0.04, bptt=72)

In [24]:
decoder

OurPoolingLinearClassifier(
  (layer): LinBnDrop(
    (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.04, inplace=False)
    (2): Linear(in_features=1200, out_features=6594, bias=False)
  )
)

Note: Also try `OurPoolingLinearClassifier` without dropout and batch normalization. (This still needs to be verified, but as far as I have found, it does not work as well as the version with batch normalization.)

In [12]:
#export
class LabelAttentionClassifier(Module):
    "Classifier head that builds one attention-pooled context vector per label and scores it with that label's weight vector."
    def __init__(self, dims, ps, bptt, y_range=None):
        self.lbs = dims[-1]    # number of labels
        # `dims[0]` follows the concat-pooling convention (3x the encoder width), so divide by 3
        # to recover the per-token feature size.
        self.fts = dims[0]//3
        # With `ln=False` this block is BatchNorm1d + Dropout only (no linear layer).
        self.layers = LinBnDrop(self.lbs, ln=False, p=ps, act=None)
        self.bptt = bptt       # stored for interface parity with the pooling classifiers; unused in `forward`
        self.emb_label = Embedding(self.lbs, self.fts)   # one learned vector per label
        # Only `final_lin.weight` is used in `forward`.
        # NOTE(review): `final_lin.bias` is allocated but never added — confirm whether that is intended.
        self.final_lin = nn.Linear(self.fts, self.lbs)
        # Honour `y_range` like `PoolingLinearClassifier` does; it used to be silently ignored.
        self.out_range = None if y_range is None else SigmoidRange(*y_range)

    def forward(self, input):
        "Take the encoder's `(out, mask)` pair and return `(predictions, out, out)`."
        # NOTE(review): the padding mask is not used, so padded positions take part in the softmax below.
        out, _ = input
        # Token/label affinity scores: (bs, sl, fts) @ (fts, lbs) -> (bs, sl, lbs).
        attn_wgts = out @ self.emb_label.weight.transpose(0, 1)
        # Normalise over the sequence dimension, so each label's weights sum to 1 across tokens.
        attn_wgts = F.softmax(attn_wgts, 1)
        # Attention-weighted sum of token features per label: (bs, lbs, sl) @ (bs, sl, fts) -> (bs, lbs, fts).
        ctx = attn_wgts.transpose(1,2) @ out

        x = self.layers(ctx)
        # Per-label dot product between each context vector and that label's row of `final_lin.weight`.
        x = (self.final_lin.weight * x).sum(dim=2)
        if self.out_range is not None: x = self.out_range(x)
        return x, out, out

Example

In [70]:
decoder = LabelAttentionClassifier([1200, 6594], ps=0.04, bptt=72).cuda()
decoder

LabelAttentionClassifier(
  (layers): LinBnDrop(
    (0): BatchNorm1d(6594, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.1, inplace=False)
  )
  (emb_label): Embedding(6594, 400)
  (final_lin): Linear(in_features=400, out_features=6594, bias=True)
)

In [29]:
preds, *_ = decoder((out, None))
preds.shape

torch.Size([128, 6594])

---

Breaking down `LabelAttentionClassifier`:

In [28]:
decoder.emb_label.weight.shape

torch.Size([6594, 400])

In [29]:
out.shape, out.device

(torch.Size([128, 1406, 400]), device(type='cuda', index=0))

In [32]:
attn_wgts = out @ decoder.emb_label.weight.transpose(0,1)
attn_wgts.shape, attn_wgts.device

(torch.Size([128, 1406, 6594]), device(type='cuda', index=0))

In [34]:
# Softmax over dim 1 (the 1406 sequence positions), so each label's weights sum to 1 across tokens.
# Note: this overwrites `attn_wgts`, so re-running the cell applies softmax a second time.
attn_wgts = F.softmax(attn_wgts, 1)

In [33]:
# attn_wgts = None
# import gc
# gc.collect()
# torch.cuda.empty_cache()

In [None]:
#out[:, :, None].shape

In [35]:
attn_wgts.transpose(1,2).shape

torch.Size([128, 6594, 1406])

In [36]:
ctx = attn_wgts.transpose(1,2) @ out
ctx.shape

torch.Size([128, 6594, 400])

In [74]:
a = torch.arange(10).reshape(5,2)

In [76]:
a, a.shape

(tensor([[0, 1],
         [2, 3],
         [4, 5],
         [6, 7],
         [8, 9]]),
 torch.Size([5, 2]))

In [82]:
for a_split in torch.split(a, 2): print(a_split, a_split.shape, end='\n****\n')

tensor([[0, 1],
        [2, 3]]) torch.Size([2, 2])
****
tensor([[4, 5],
        [6, 7]]) torch.Size([2, 2])
****
tensor([[8, 9]]) torch.Size([1, 2])
****


---

In [6]:
#export
def get_text_classifier(arch, vocab_sz, n_class, seq_len=72, config=None, drop_mult=1., lin_ftrs=None,
                       ps=None, pad_idx=1, max_len=72*20, y_range=None):
    """Create a text classifier from `arch` and its `config`.

    The model is `SequentialRNN(SentenceEncoder(...), OurPoolingLinearClassifier(...))`.

    - `arch`: architecture class; must be a key of `_model_meta`.
    - `vocab_sz`: vocabulary size, passed to `arch`.
    - `n_class`: number of outputs of the classifier head.
    - `seq_len`: chunk width (`bptt`) used by both the encoder and the head.
    - `config`: architecture config; defaults to the `'config_clas'` entry for `arch`. A shallow copy is
      modified, not the caller's dict. It must contain `'output_p'`, which becomes the head's dropout.
    - `drop_mult`: factor applied to every config entry whose key ends in `_p`.
    - `lin_ftrs`, `ps`: kept for signature compatibility with fastai's function of the same name. The head
      has a single layer, so both are ignored; a warning is emitted if either is passed.
    - `pad_idx`, `max_len`: forwarded to `SentenceEncoder`.
    - `y_range`: forwarded to the head.

    Raises `KeyError` if `arch` is not in `_model_meta` or `config` has no `'output_p'` entry.
    """
    if lin_ftrs is not None or ps is not None:
        # These used to be discarded silently (`ps` was overwritten below); tell the caller instead.
        import warnings
        warnings.warn("`lin_ftrs` and `ps` are ignored: the classifier head is a single layer whose "
                      "dropout comes from `config['output_p']`.", stacklevel=2)
    meta = _model_meta[arch]
    config = ifnone(config, meta['config_clas']).copy()
    for k in config.keys():
        if k.endswith('_p'): config[k] *= drop_mult
    # Concat pooling yields [last_hidden, max_pool, avg_pool], hence 3x the encoder width as input size.
    layers = [config[meta['hid_name']] * 3, n_class]
    # `output_p` and `init` are popped because the remaining config is passed to `arch` as keyword arguments.
    output_p = config.pop('output_p')
    init = config.pop('init', None)
    encoder = SentenceEncoder(seq_len, arch(vocab_sz, **config), pad_idx=pad_idx, max_len=max_len)
    # To experiment with label attention, swap in:
    #   LabelAttentionClassifier(layers, output_p, bptt=seq_len, y_range=y_range)
    decoder = OurPoolingLinearClassifier(layers, output_p, bptt=seq_len, y_range=y_range)
    model = SequentialRNN(encoder, decoder)
    return model if init is None else model.apply(init)

In [10]:
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted 01_layers.ipynb.
Converted 02_text.models.core.ipynb.
Converted 03_text.learner.ipynb.
Converted 04_metrics.ipynb.
Converted index.ipynb.
