### モデルの実装部分をもっときれいに書くためのnotebook

In [6]:
import torch
import torch.nn as nn
from transformers import AutoModel, T5Tokenizer
from torchinfo import summary

# Hugging Face checkpoint used for both the tokenizer and the encoder.
model_name = "rinna/japanese-roberta-base"
# NOTE(review): input_size and dtypes are not used in the visible cells;
# presumably intended for torchinfo.summary — confirm before removing.
input_size = (32, 128)
dtypes = [torch.int, torch.long]

tokenizer = T5Tokenizer.from_pretrained(model_name)
token = tokenizer.encode_plus("例えば君がいるだけで心が強くなれるよ")
# Build the integer tensors directly with torch.tensor(dtype=long) rather than
# going through torch.Tensor (float32) and casting; unsqueeze(0) adds a leading
# batch dimension of size 1.
input_ids = torch.tensor(token["input_ids"], dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor(token["attention_mask"], dtype=torch.long).unsqueeze(0)

In [7]:
from re import A
from tokenize import Single
from transformers import AutoConfig
from torch.nn import functional as F


class BertClassificationMaxPoolingHeader(nn.Module):
    """Classification head: max over dim 1 of the last hidden layer, then a linear layer.

    Args:
        hidden_size: Input feature width of the linear projection.
        num_classes: Number of output logits.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # Maps the pooled feature vector to class logits.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, base_output):
        """Compute logits from ``base_output["hidden_states"][-1]``.

        Assumes the hidden state is (batch, seq_len, hidden_size) — TODO confirm.
        No attention mask is received here, so every position along dim 1
        takes part in the max.
        """
        final_layer = base_output["hidden_states"][-1]
        # Tensor.max(dim=...) returns (values, indices); only the values are used.
        pooled, _ = final_layer.max(dim=1)
        return self.fc(pooled)


class BertClassificationConvolutionHeader(nn.Module):
    """Classification head: two 1-D convolutions followed by a max over the last axis.

    Args:
        hidden_size: Number of input channels of the first convolution.
        num_classes: Number of output channels of the second convolution,
            i.e. the number of logits returned.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # hidden_size -> 256 -> num_classes channels, both with kernel 2 / padding 1.
        self.cnn1 = nn.Conv1d(hidden_size, 256, kernel_size=2, padding=1)
        self.cnn2 = nn.Conv1d(256, num_classes, kernel_size=2, padding=1)

    def forward(self, base_output):
        """Compute logits from ``base_output["hidden_states"][-1]``.

        Assumes the hidden state is (batch, seq_len, hidden_size) — TODO confirm.
        """
        # Conv1d convolves over the last axis, so swap dims 1 and 2 first.
        channels_first = base_output["hidden_states"][-1].permute(0, 2, 1)
        feature_map = self.cnn2(F.relu(self.cnn1(channels_first)))
        # Collapse the convolved axis by taking the per-channel maximum.
        logits, _ = feature_map.max(dim=2)
        return logits


class BertClassificationLSTMHeader(nn.Module):
    """Classification head: LSTM over the last hidden layer, then a linear layer.

    Args:
        hidden_size: Input and hidden width of the LSTM, and input width of
            the linear projection.
        num_classes: Number of output logits.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # batch_first=True: the LSTM consumes (batch, seq, feature) input.
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, base_output):
        """Compute logits from ``base_output["hidden_states"][-1]``.

        No attention mask is received here, so the step used is the final
        index along dim 1 regardless of padding.
        """
        encoder_states = base_output["hidden_states"][-1]
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        per_step_out, _ = self.lstm(encoder_states, None)
        # Take the LSTM output at the last time step: [batch_size, hidden_size].
        final_step = per_step_out[:, -1, :]
        return self.fc(final_step)


class BertClassificationConcatenateHeader(nn.Module):
    """Classification head over the concatenated first-position vectors of the last layers.

    For each of the last ``use_layer_num`` entries of
    ``base_output["hidden_states"]`` the vector at index 0 along dim 1 is
    taken (presumably the [CLS]-style token — confirm against the tokenizer),
    the vectors are concatenated along the feature axis and projected to
    ``num_classes`` logits.

    Args:
        hidden_size: Feature width of a single hidden layer.
        num_classes: Number of output logits.
        use_layer_num: How many trailing hidden layers to concatenate.

    Raises:
        ValueError: If ``use_layer_num`` is smaller than 1.
    """

    def __init__(self, hidden_size, num_classes, use_layer_num=4):
        super().__init__()
        if use_layer_num < 1:
            raise ValueError(f"use_layer_num must be >= 1, got {use_layer_num}")
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.use_layer_num = use_layer_num

        # Input width grows with the number of concatenated layers.
        self.fc = nn.Linear(hidden_size * use_layer_num, num_classes)
        # Debug aid kept from the original: echoes the layer so in_features can be checked.
        print(self.fc)

    def forward(self, base_output):
        """Compute logits from the last ``use_layer_num`` hidden layers.

        An IndexError propagates if ``base_output["hidden_states"]`` holds
        fewer than ``use_layer_num`` layers.
        """
        hidden_states = base_output["hidden_states"]
        # Walk backwards from the final layer; the count must follow
        # self.use_layer_num so the width matches self.fc.
        first_position_vectors = [
            hidden_states[-offset][:, 0, :]
            for offset in range(1, self.use_layer_num + 1)
        ]
        return self.fc(torch.cat(first_position_vectors, dim=1))

class BertClassificationModel(nn.Module):
    """Pretrained encoder (``l1``) followed by a selectable classification head (``l2``).

    Args:
        model_name: Identifier passed to ``AutoConfig.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        mode: Which head to attach; one of ``SUPPORTED_MODES``.
        num_classes: Number of logits produced by the head (defaults to 2).

    Raises:
        ValueError: If ``mode`` is not one of ``SUPPORTED_MODES``.
    """

    SUPPORTED_MODES = ("max_pooling", "conv", "lstm", "concatenate")

    def __init__(self, model_name, mode="max_pooling", num_classes=2):
        super().__init__()
        # Validate up front with an explicit exception: an assert would be
        # stripped under ``python -O`` (leaving self.l2 undefined), and failing
        # here avoids loading the pretrained weights for an invalid request.
        if mode not in self.SUPPORTED_MODES:
            raise ValueError(
                f"unknown mode {mode!r}; expected one of {self.SUPPORTED_MODES}"
            )

        # The config is read for hidden_size only; it is not handed to the
        # encoder below, so hidden states are requested per call in forward().
        self.cfg = AutoConfig.from_pretrained(
            model_name, output_attentions=True, output_hidden_states=True
        )
        self.l1 = AutoModel.from_pretrained(model_name)

        hidden_size = self.cfg.hidden_size
        if mode == "max_pooling":
            self.l2 = BertClassificationMaxPoolingHeader(hidden_size, num_classes)
        elif mode == "conv":
            self.l2 = BertClassificationConvolutionHeader(hidden_size, num_classes)
        elif mode == "lstm":
            self.l2 = BertClassificationLSTMHeader(hidden_size, num_classes)
        else:  # "concatenate" — the only remaining supported mode
            self.l2 = BertClassificationConcatenateHeader(
                hidden_size, num_classes, use_layer_num=4
            )

    def forward(self, input_ids, attention_mask):
        """Run the encoder with hidden states enabled and return the head's output."""
        encoder_output = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        return self.l2(encoder_output)


### 一つのモデルの中で配置

In [9]:
# Build the full model with the concatenate head; the head prints its Linear layer on construction.
m3 = BertClassificationModel(model_name=model_name, mode="concatenate")

Some weights of the model checkpoint at rinna/japanese-roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at rinna/japanese-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Linear(in_features=3072, out_features=2, bias=True)


In [10]:
# Forward the single tokenized sentence through the model; `out` holds the head's output.
out = m3(input_ids, attention_mask)