In [58]:
import torch
import torch.nn as nn

import IPython

from transformers import AutoTokenizer
from transformers.configuration_utils import PretrainedConfig
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel, DistilBertModel

In [4]:
class IMBDModel(DistilBertPreTrainedModel):

    def __init__(self, config : PretrainedConfig):
        super(IMBDModel, self).__init__(config)

        self.distilbert = DistilBertModel(config)

        # # freeze whole model
        # for params in self.distilbert.parameters():
        #     params.requires_grad = False

        # layers = self.distilbert.transformer.layer
        # print("Total Layers:", len(layers))

        # # Enable trainable few layers.
        # for layer_num in [5]:
        #     for params in layers[layer_num].parameters():
        #         params.requires_grad = True

        self.fc = nn.Linear(config.dim, 1)

        self.post_init()

    def forward(self, x):

        output = self.distilbert(**x)
        
        pooled_output = output.last_hidden_state[:, 0]

        x = self.fc(pooled_output)

        return x, output.attentions

In [5]:
infer_path = "model3/0/fold0_epoch01_loss0.1403_val_loss0.1994_roc_auc0.9779"

pretrained_model = IMBDModel.from_pretrained(infer_path, output_attentions=True)
pretrained_model.eval()

pretrained_tokenizer = AutoTokenizer.from_pretrained(infer_path)

In [82]:
def get_attentions(attentions):
    # REF: https://github.com/hsm207/bert_attn_viz
    
    # last layer attentions
    layer_layer_att = attentions[-1] # [batch, heads, seq_len, seq_len]
    cls_att = layer_layer_att[:,:,0,:] # attentions of [CLS] token
    cls_att_mean = cls_att.mean(dim=1) # mean over heads
    
    cls_att_mean = cls_att_mean[0]
    
    # min-max scaled because we are using for opicity (0 - 1)
    cls_att_mean = (cls_att_mean - cls_att_mean.min()) / (cls_att_mean.max() - cls_att_mean.min())
    
    return cls_att_mean

In [83]:
def prediction(text):
    
    tokens = pretrained_tokenizer(text, truncation=True, max_length=512)
    tokens = {k:torch.tensor([v]) for k, v in tokens.items()}
    word_tokens= pretrained_tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    
    scores, attentions = pretrained_model(tokens)
    scores = torch.sigmoid(scores)
    
    return scores, word_tokens, attentions

In [84]:
text = """
When it comes to ranking the Marvel superhero(live action) films, 
Iron Man is for me up there with the better ones. It looks fabulous,
the whole film is very slickly made with top-notch special 
effects(which look like time and effort was really put into it),
awesome-looking futuristic gadgets and Iron Man's suit will be a
guaranteed delight to anybody who is familiar or loves anything to do with the superhero.
"""

In [85]:
scores, word_tokens, attentions = prediction(text)
scores

tensor([[0.9821]], grad_fn=<SigmoidBackward0>)

In [86]:
len(attentions)

6

In [87]:
att_op = get_attentions(attentions)

In [88]:
att_op.shape

torch.Size([91])

In [89]:
def wrap_text(word, score):
    return f"<span style='background-color:rgba(0, 0, 255, {score:.2f});padding:2px;'>{word}</span>"

In [90]:
html = "".join([wrap_text(w,s) for w,s in zip(word_tokens, att_op)])
display(IPython.display.HTML(html))

In [91]:
att_op.max(), att_op.min()

(tensor(1., grad_fn=<MaxBackward1>), tensor(0., grad_fn=<MinBackward1>))