### 11/02: Mozafari et al., 2019 (https://arxiv.org/pdf/1910.12574.pdf) の(d)を実装するnotebook

In [251]:
import torch
import torch.nn as nn
from transformers import AutoModel, T5Tokenizer
from torchinfo import summary

# Checkpoint shared by the tokenizer and by the encoder built further down.
model_name = "rinna/japanese-roberta-base"
#model_name = "rinna/japanese-roberta-large"
# Dummy input shape and dtypes, read by the torchinfo `summary` call later in the notebook.
input_size = (32, 128)
dtypes = [torch.int, torch.long]

# NOTE(review): the checkpoint's model card reportedly asks for a leading "[CLS]"
# token and `tokenizer.do_lower_case = True` — TODO confirm before real use.
tokenizer = T5Tokenizer.from_pretrained(model_name)
token = tokenizer.encode_plus("例えば君がいるだけで心が強くなれるよ。そうはいっても人生楽ありゃ雲あるそ")
# Build integer tensors directly: torch.Tensor(...) would first create float32
# values and only afterwards cast them to long. unsqueeze(0) adds the batch axis.
input_ids = torch.tensor(token["input_ids"], dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor(token["attention_mask"], dtype=torch.long).unsqueeze(0)

OSError: rinna/japanese-roberta-large is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
from tokenize import Single
from transformers import AutoConfig
from torch.nn import functional as F


class BertClassificationMaxPoolingHeader(nn.Module):
    """Classification head: max over axis 1 of the last hidden state, then a linear layer.

    Args:
        hidden_size: Input feature size of the linear layer.
        num_classes: Output feature size of the linear layer.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # Linear classifier applied to the pooled vector.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, base_output):
        """Pool the last entry of ``base_output["hidden_states"]`` and classify it.

        Args:
            base_output: Mapping-like encoder output holding a "hidden_states" sequence.

        Returns:
            Output of the linear layer for the pooled tensor.
        """
        final_layer = base_output["hidden_states"][-1]
        # assumes final_layer is [batch, seq_len, hidden_size], as printed for the
        # encoder output in this notebook; axis 1 is reduced and the indices dropped.
        pooled, _ = final_layer.max(dim=1)
        return self.fc(pooled)


class BertClassificationConvolutionHeader(nn.Module):
    """Classification head: two 1-D convolutions followed by a max over the last axis.

    Args:
        hidden_size: Number of input channels of the first convolution.
        num_classes: Number of output channels of the second convolution.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # hidden_size -> 256 -> num_classes channels, both with kernel_size=2 and padding=1.
        self.cnn1 = nn.Conv1d(hidden_size, 256, kernel_size=2, padding=1)
        self.cnn2 = nn.Conv1d(256, num_classes, kernel_size=2, padding=1)

    def forward(self, base_output):
        """Run the conv stack on the last entry of ``base_output["hidden_states"]``.

        Args:
            base_output: Mapping-like encoder output holding a "hidden_states" sequence.

        Returns:
            Per-channel maximum of the second convolution's output over axis 2.
        """
        # Swap the last two axes so the feature axis becomes Conv1d's channel axis.
        channels_first = base_output["hidden_states"][-1].permute(0, 2, 1)
        activated = F.relu(self.cnn1(channels_first))
        class_maps = self.cnn2(activated)
        return class_maps.max(dim=2).values


class BertClassificationLSTMHeader(nn.Module):
    """Classification head: an LSTM over the last hidden state, then a linear layer.

    Args:
        hidden_size: Input and hidden size of the LSTM, and input size of the linear layer.
        num_classes: Output feature size of the linear layer.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # batch_first=True: the LSTM consumes [batch, seq, feature] tensors.
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, base_output):
        """Classify from the LSTM output at the final time step.

        Args:
            base_output: Mapping-like encoder output holding a "hidden_states" sequence.

        Returns:
            Output of the linear layer for the last time step of the LSTM output.
        """
        sequence = base_output["hidden_states"][-1]
        # No initial state is supplied; only the per-step outputs are kept.
        lstm_out, _ = self.lstm(sequence)
        # Take the last step along the time axis -> [batch_size, hidden_size].
        final_step = lstm_out[:, -1]
        return self.fc(final_step)


class BertClassificationConcatenateHeader(nn.Module):
    """Classification head: concatenate the first-position vectors of the last N layers.

    Args:
        hidden_size: Feature size of each hidden-state tensor.
        num_classes: Output feature size of the linear layer.
        use_layer_num: How many entries, counted from the end of
            ``base_output["hidden_states"]``, are concatenated (default 4).
    """

    def __init__(self, hidden_size, num_classes, use_layer_num=4):
        super(BertClassificationConcatenateHeader, self).__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.use_layer_num = use_layer_num

        # The linear layer sees use_layer_num vectors of size hidden_size side by side.
        self.fc = nn.Linear(self.hidden_size * self.use_layer_num, self.num_classes)

    def forward(self, base_output):
        """Concatenate position-0 vectors of the last ``use_layer_num`` layers and classify.

        Args:
            base_output: Mapping-like encoder output holding a "hidden_states" sequence
                with at least ``use_layer_num`` entries.

        Returns:
            Output of the linear layer for the concatenated vectors.
        """
        hidden_states = base_output["hidden_states"]
        # The layer count follows self.use_layer_num so it always matches the width
        # self.fc was built with. Order is last layer first, then the one before it, etc.
        first_tokens = [
            hidden_states[-i][:, 0, :] for i in range(1, self.use_layer_num + 1)
        ]
        out = torch.cat(first_tokens, dim=1)
        outputs = self.fc(out)
        return outputs

class BertClassificationModel(nn.Module):
    """Pretrained encoder (``l1``) followed by a selectable classification head (``l2``).

    Args:
        model_name: Checkpoint name or path handed to AutoConfig / AutoModel.
        mode: Head selector. One of "max_pooling", "conv", "lstm", "concatenate";
            any other value (including the default None) installs nn.Identity.
        num_classes: Number of output classes for the selected head (default 2).
    """

    def __init__(self, model_name, mode=None, num_classes=2):
        super(BertClassificationModel, self).__init__()
        # Only hidden_size is read from this config below. It is not passed to
        # AutoModel.from_pretrained, so its output_* flags do not configure self.l1;
        # hidden states are requested per call in forward().
        self.cfg = AutoConfig.from_pretrained(
            model_name, output_attentions=True, output_hidden_states=True
        )
        self.l1 = AutoModel.from_pretrained(model_name)
        self.num_classes = num_classes

        if mode == "max_pooling":
            self.l2 = BertClassificationMaxPoolingHeader(self.cfg.hidden_size, num_classes)
        elif mode == "conv":
            self.l2 = BertClassificationConvolutionHeader(self.cfg.hidden_size, num_classes)
        elif mode == "lstm":
            self.l2 = BertClassificationLSTMHeader(self.cfg.hidden_size, num_classes)
        elif mode == "concatenate":
            self.l2 = BertClassificationConcatenateHeader(
                self.cfg.hidden_size, num_classes, use_layer_num=4
            )
        else:
            # No head: forward() then returns the raw encoder output twice.
            self.l2 = nn.Identity()

    def forward(self, input_ids, attention_mask):
        """Run the encoder, then the head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tuple ``(out, hidden_states)``: the head's output and the encoder's
            output object (requested with output_hidden_states=True).
        """
        hidden_states = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        out = self.l2(hidden_states)
        return out, hidden_states


In [None]:
# Build the wrapper with the default mode=None, so its head `l2` is nn.Identity.
m3 = BertClassificationModel(model_name)

Some weights of the model checkpoint at rinna/japanese-roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at rinna/japanese-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

In [None]:
# With the Identity head, `out` is the encoder output object itself (same as `hidden_states`).
out, hidden_states = m3(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Shape of entry 1 of the hidden-state tuple.
hidden_states["hidden_states"][1].shape

torch.Size([1, 22, 768])

In [None]:
# Attribute access to the same hidden-state tuple; entry 0 has the same shape as entry 1.
hidden_states.hidden_states[0].shape

torch.Size([1, 22, 768])

In [None]:
# Number of entries in the hidden-state tuple (13 in the recorded run).
len(hidden_states["hidden_states"])

13

In [None]:
class BertClassificationMozafariHeader(nn.Module):
    """
    Module implementing strategy (d) of Mozafari et al., 2019.
    Expects the output of an AutoModel that carries "hidden_states".
    Implementation follows this reference:
    https://github.com/ZeroxTM/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media/blob/main/Model.py

    The placement of BatchNorm2d follows:
    https://qiita.com/cfiken/items/b477c7878828ebdb0387

    Args:
        hidden_size: Feature size of each hidden-state tensor (width of the conv kernel).
        hidden_layer_num: Number of hidden-state entries used as conv channels.
        max_length: Sequence length the final linear layer is sized for; inputs
            must have exactly this sequence length.
        num_classes: Output feature size of the final linear layer.
    """
    def __init__(self, hidden_size, hidden_layer_num, max_length, num_classes):
        super(BertClassificationMozafariHeader, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layer_num = hidden_layer_num
        self.max_length = max_length
        self.num_classes = num_classes

        # padding=1 pads BOTH spatial axes, so the kernel of width hidden_size fits
        # three times across the padded feature axis:
        # [batch, hidden_layer, max_length, hidden_size] -> [batch, hidden_layer, max_length, 3]
        self.conv = nn.Conv2d(
            in_channels=self.hidden_layer_num, out_channels=self.hidden_layer_num, kernel_size=(3, self.hidden_size), padding=1
            )
        self.batchnorm = nn.BatchNorm2d(num_features=self.hidden_layer_num)
        # 3x3 window, stride 1: [batch, hidden_layer, max_length, 3] -> [batch, hidden_layer, max_length-2, 1]
        self.pool = nn.MaxPool2d(kernel_size=3, stride=1)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten() # [batch, a, b, c...] -> [batch, a*b*c...]
        self.fc = nn.Linear(self.hidden_layer_num*(self.max_length-2), self.num_classes)


    def forward(self, base_output):
        """Apply conv -> batchnorm -> ReLU -> max-pool -> linear to the stacked layers.

        Args:
            base_output: Mapping-like encoder output holding a "hidden_states"
                sequence with at least ``hidden_layer_num`` entries.

        Returns:
            Tuple ``(x1, x2, x3, x4)``: the stacked hidden states, the activated
            conv output, the pooled tensor, and the final linear output.
        """
        # Keep the last hidden_layer_num entries. With hidden_layer_num + 1 entries
        # this is the same as dropping entry 0.
        layers = tuple(base_output["hidden_states"][-self.hidden_layer_num:])
        # Stack along a new axis 1 -> [batch, hidden_layer_num, max_length, hidden_size]
        x1 = torch.stack(layers, dim=1)
        x2 = self.relu(self.batchnorm(self.conv(x1)))
        x3 = self.pool(x2)  # "...the maximum value for each transformer encoder..."
        x4 = self.fc(self.flatten(x3))

        return x1, x2, x3, x4

In [None]:
# Reuse the encoder output from the earlier forward pass as the head's input.
base_output = hidden_states
# Positional arguments: hidden_size=768, hidden_layer_num=12, max_length=22, num_classes=2.
mozafari = BertClassificationMozafariHeader(768, 12, 22, 2)

In [246]:
# The head returns its intermediate tensors as well, so each stage can be inspected below.
x1, x2, x3, x4 = mozafari(base_output)

In [247]:
# x1: the hidden states stacked per layer.
x1.shape

torch.Size([1, 12, 22, 768])

In [248]:
# x2: after conv + batchnorm + ReLU.
x2.shape

torch.Size([1, 12, 22, 3])

In [249]:
# x3: after max pooling.
x3.shape

torch.Size([1, 12, 20, 1])

In [250]:
# Recompute by hand the input width the head's final linear layer is built with:
# hidden_layer_num * (max_length - 2). hidden_size is defined but not used in this cell.
hidden_layers = 12
max_length = 22
hidden_size = 768

hidden_layers*(max_length-2)

240

In [239]:
# x4: output of the final linear layer.
x4.shape

torch.Size([1, 2])

## notebookここまで

In [165]:
# NOTE(review): no visible cell above defines this `t`; the cell relies on leftover
# kernel state and will not survive Restart & Run All.
t.shape

torch.Size([1, 13, 16, 768])

In [175]:
# Element-wise check that index -1 along axis 1 of `t` equals the encoder's last_hidden_state.
torch.eq(t[:, -1, :, :], base_output["last_hidden_state"]).all()

tensor(True)

In [161]:
# Display the checkpoint's configuration as loaded without overrides.
AutoConfig.from_pretrained(model_name)

RobertaConfig {
  "_name_or_path": "rinna/japanese-roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 3,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

In [159]:
from torchinfo import summary
# Layer-by-layer summary of the bare encoder for a dummy input of shape `input_size`, down to depth 4.
summary(AutoModel.from_pretrained(model_name), input_size=input_size, dtypes=dtypes, depth=4)

Some weights of the model checkpoint at rinna/japanese-roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at rinna/japanese-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

Layer (type:depth-idx)                                  Output Shape              Param #
RobertaModel                                            [32, 768]                 --
├─RobertaEmbeddings: 1-1                                [32, 128, 768]            --
│    └─Embedding: 2-1                                   [32, 128, 768]            24,576,000
│    └─Embedding: 2-2                                   [32, 128, 768]            1,536
│    └─Embedding: 2-3                                   [32, 128, 768]            394,752
│    └─LayerNorm: 2-4                                   [32, 128, 768]            1,536
│    └─Dropout: 2-5                                     [32, 128, 768]            --
├─RobertaEncoder: 1-2                                   [32, 128, 768]            --
│    └─ModuleList: 2-6                                  --                        --
│    │    └─RobertaLayer: 3-1                           [32, 128, 768]            2,361,600
│    │    │    └─RobertaAttention:

In [158]:
# List the wrapper's state-dict entry names; the encoder's entries carry the `l1.` prefix.
m3.state_dict().keys()

odict_keys(['l1.embeddings.position_ids', 'l1.embeddings.word_embeddings.weight', 'l1.embeddings.position_embeddings.weight', 'l1.embeddings.token_type_embeddings.weight', 'l1.embeddings.LayerNorm.weight', 'l1.embeddings.LayerNorm.bias', 'l1.encoder.layer.0.attention.self.query.weight', 'l1.encoder.layer.0.attention.self.query.bias', 'l1.encoder.layer.0.attention.self.key.weight', 'l1.encoder.layer.0.attention.self.key.bias', 'l1.encoder.layer.0.attention.self.value.weight', 'l1.encoder.layer.0.attention.self.value.bias', 'l1.encoder.layer.0.attention.output.dense.weight', 'l1.encoder.layer.0.attention.output.dense.bias', 'l1.encoder.layer.0.attention.output.LayerNorm.weight', 'l1.encoder.layer.0.attention.output.LayerNorm.bias', 'l1.encoder.layer.0.intermediate.dense.weight', 'l1.encoder.layer.0.intermediate.dense.bias', 'l1.encoder.layer.0.output.dense.weight', 'l1.encoder.layer.0.output.dense.bias', 'l1.encoder.layer.0.output.LayerNorm.weight', 'l1.encoder.layer.0.output.LayerNorm.b

* MaxPool

In [127]:
# Scratch check of MaxPool1d: with kernel_size=2 and stride=4 the windows cover
# positions (0, 1) and (4, 5) of the six-element signal.
t = nn.MaxPool1d(kernel_size=2, stride=4)

# Same [1, 1, 6] float tensor, written with its batch and channel axes spelled out.
tt = torch.tensor([[[0.0, 5.0, 0.0, 0.0, 0.0, 4.0]]])
t(tt)

tensor([[[5., 4.]]])