# Warming Up - BERT 에 Custom Classifier 추가 하기




---

### 참조: 
Adding Custom Layers on Top of a Hugging Face Model
- https://towardsdatascience.com/adding-custom-layers-on-top-of-a-hugging-face-model-f1ccdfc257bd
- code
    - https://jovian.ai/rajbsangani/emotion-tuned-sarcasm

# 0. 환경 셋업

In [1]:
from transformers import AutoModel

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd


# 1. 데이터 준비

In [2]:

# Load the zipped JSON headlines, expose the target column as "label", and
# drop the article URL column.
data = (
    load_dataset(
        "json",
        data_files="download_data/news-headlines-dataset-for-sarcasm-detection.zip",
    )
    .rename_column("is_sarcastic", "label")
    .remove_columns(['article_link'])
)

# With the pandas format active, slicing the split yields a DataFrame.
data.set_format('pandas')

# Keep one row per distinct headline and retain only the two modelling columns.
data = (
    data['train'][:]
    .drop_duplicates(subset=['headline'])
    .reset_index()[['headline', 'label']]
)



Using custom data configuration default-5fa7fc59288bdff3
Reusing dataset json (/home/ec2-user/.cache/huggingface/datasets/json/default-5fa7fc59288bdff3/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

## 1.1 데이터 확인
- is_sarcastic (풍자) 에 따라 레이블이 1 과 0 임

In [3]:
data

Unnamed: 0,headline,label
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
28498,tyson holds contest to let fans submit new ide...,1
28499,increasingly cocky bernie sanders announces he...,1
28500,cash-strapped zuckerberg forced to sell 11 mil...,1
28501,grocery store bar actually has great little ha...,1


In [4]:
# Convert the de-duplicated pandas DataFrame back into a datasets.Dataset.
data=Dataset.from_pandas(data)

# 80% train, 20% held out for test + validation (fixed seed for a repeatable split)
train_testvalid = data.train_test_split(test_size=0.2,seed=15)

# Split the 20% held-out part in half: one half for test, the other for validation
# (i.e. roughly 10% of all rows each)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)



In [5]:
train_testvalid, test_valid 

(DatasetDict({
     train: Dataset({
         features: ['headline', 'label'],
         num_rows: 22802
     })
     test: Dataset({
         features: ['headline', 'label'],
         num_rows: 5701
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['headline', 'label'],
         num_rows: 2850
     })
     test: Dataset({
         features: ['headline', 'label'],
         num_rows: 2851
     })
 }))

## 1.2 최종 사전 형태의 데이터 세트 (훈련, 검증, 테스트 셋) 생성

In [6]:
# gather everyone if you want to have a single DatasetDict
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

data

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [7]:
# Peek at the first five rows of the Arrow table that backs the train split.
data["train"].data[0:5]

pyarrow.Table
headline: string
label: int64
----
headline: [["former versace store clerk sues over secret 'black code' for minority shoppers","the 'roseanne' revival catches up to our thorny political mood, for better and worse","mom starting to fear son's web series closest thing she will have to grandchild","boehner just wants wife to listen, not come up with alternative debt-reduction ideas","j.k. rowling wishes snape happy birthday in the most magical way"]]
label: [[0,0,1,1,0]]

## 1.3. 토크나이저 로딩 및 BERT 인코딩으로 변환

In [8]:
# Checkpoint used for both the tokenizer and the pre-trained encoder.
model_id = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The tokenizer keeps its length limit in `model_max_length`. The previous
# assignment to `model_max_len` only created an unrelated attribute and had
# no effect on tokenization.
tokenizer.model_max_length = 512

In [9]:
def tokenize(batch):
    """Tokenize the "headline" entries of a batch, truncating at 512 tokens.

    Relies on the notebook-level ``tokenizer``; returns whatever the
    tokenizer call returns for the batch of headlines.
    """
    headlines = batch["headline"]
    return tokenizer(headlines, max_length=512, truncation=True)


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_dataset = data.map(tokenize, batched=True)
tokenized_dataset

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2850
    })
})

## 1.4. 데이터 로더 생성

In [10]:
# Return only these three columns, as torch tensors, when rows are accessed.
tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
# Collator that pads the examples of each batch using the tokenizer's padding settings.
# NOTE(review): later cells read the target under the key "labels", so the
# collator presumably renames "label" -> "labels" — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Display the collator configuration.
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [11]:
from torch.utils.data import DataLoader

train_batch_size = 32
eval_batch_size = 4

# Training batches: shuffled, padded per batch by the collator.
train_dataloader = DataLoader(
    dataset=tokenized_dataset["train"],
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches: dataset order is kept (no shuffling requested).
eval_dataloader = DataLoader(
    dataset=tokenized_dataset["valid"],
    batch_size=eval_batch_size,
    collate_fn=data_collator,
)

In [12]:
# Fetch the first validation batch to inspect the padded tensors.
next(iter(eval_dataloader))

{'input_ids': tensor([[  101,   170,  1440,  1120,  9304,  9753,  1162, 18254,   112,  1297,
          1105,  1578,  1112,  1103,  2851,  3587,  1851,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   176, 25690,  9037,  1106,  2194,  2919,  6486,  5914,  1114,
          1714,   188,  1204,  1181,  2774, 21195,  1254,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4267,  1116,  4695,  1231, 15189,  1116,  1900, 17462,  1279,
          2560,  1179,  1112,  1226,  1104,  1419,   118,  2043,  4684,  2019,
          9712, 26348,   185,  1174,  4184, 20473,  1465,   102],
        [  101,   192,  7745,  5834,  1940,  1116, 13830, 10517,  1299,  4482,
          1158,  1146,  1113, 20188,  1104,  6831,  1111,  1981, 15841,  3842,
           102,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0

# 2.모델 정의 및 생성

## 2.1. Pre-Trained Model 로딩

In [13]:
# Load the bare pre-trained encoder (no task head) for the chosen checkpoint.
plm = AutoModel.from_pretrained(model_id)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 2.2. BERT 아키텍쳐 확인

In [14]:
def show_module(model):
    """Print the name of each direct child module of ``model``, one per line."""
    for child_name, _child in model.named_children():
        print("name :", child_name)


In [15]:
show_module(plm)

name : embeddings
name : encoder
name : pooler


In [16]:
print(plm)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## 2.3. Custom Classifier 추가 하여 Custom Model 생성 하기

In [17]:
class CustomBERTModel(nn.Module):
    """Sequence classifier: a pre-trained encoder body plus a linear head.

    The head takes the encoder's first output (the last hidden state), keeps
    the vector at sequence position 0, applies dropout, and projects it to
    ``num_labels`` logits.

    Args:
        model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments; its first output is indexed
            as ``[batch, position, hidden]`` and the returned object must
            expose ``hidden_states`` and ``attentions`` attributes.
        num_labels: Number of target classes.
    """

    # Head input width used when the encoder does not expose
    # ``config.hidden_size`` (this is the value that used to be hard-coded).
    DEFAULT_HIDDEN_SIZE = 768

    def __init__(self, model, num_labels):
        super().__init__()
        self.num_labels = num_labels

        self.model = model
        self.dropout = nn.Dropout(0.1)

        # Size the head from the encoder's own configuration so that encoders
        # of other widths work without editing this class.
        config = getattr(model, "config", None)
        hidden_size = getattr(config, "hidden_size", self.DEFAULT_HIDDEN_SIZE)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids passed through to the encoder.
            attention_mask: Attention mask passed through to the encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.

        Returns:
            A ``TokenClassifierOutput`` carrying ``loss`` (``None`` when no
            labels are given), ``logits``, and the encoder's
            ``hidden_states`` / ``attentions``.
        """
        # Extract outputs from the body.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is the last hidden state; only position 0 feeds the
        # classifier, so select it first and apply dropout to that vector only.
        first_token_state = outputs[0][:, 0, :]
        logits = self.classifier(self.dropout(first_token_state))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # TokenClassifierOutput is kept as the return container for backward
        # compatibility, although the logits here are one row per sequence.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [18]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the pre-trained encoder with a 2-class head and move everything to `device`.
custom_model=CustomBERTModel(model = plm ,num_labels=2).to(device)

## 2.4. Custom Model 아키텍쳐 확인

In [19]:
show_module(custom_model)

name : model
name : dropout
name : classifier


In [20]:
print(custom_model)

CustomBERTModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

# 3. 훈련 준비

In [21]:
from torch.optim import AdamW
from transformers import get_scheduler

# transformers' own AdamW class is deprecated in favour of the PyTorch
# implementation. eps and weight_decay are pinned to the values the
# transformers class used by default so the optimisation settings stay the same.
optimizer = AdamW(custom_model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 1
# One scheduler step is taken per training batch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule without warm-up over all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

713




In [22]:
# F1 metric object; predictions are accumulated with add_batch() and scored with compute().
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
from datasets import load_metric
metric = load_metric("f1")

# 4. 훈련 루프 실행

훈련 루프에 입력이 될 Batch 확인 함. 'eval_dataloader' 를 'train_dataloader' 로 바꾸어서 보시면 됩니다.
레코드가 많이 출력이 되어서 eval_dataloader 로 확인 함.

In [23]:
# Print only the first batch (moved to `device`) to inspect what the loops receive.
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break

{'input_ids': tensor([[  101,   170,  1440,  1120,  9304,  9753,  1162, 18254,   112,  1297,
          1105,  1578,  1112,  1103,  2851,  3587,  1851,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   176, 25690,  9037,  1106,  2194,  2919,  6486,  5914,  1114,
          1714,   188,  1204,  1181,  2774, 21195,  1254,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4267,  1116,  4695,  1231, 15189,  1116,  1900, 17462,  1279,
          2560,  1179,  1112,  1226,  1104,  1419,   118,  2043,  4684,  2019,
          9712, 26348,   185,  1174,  4184, 20473,  1465,   102],
        [  101,   192,  7745,  5834,  1940,  1116, 13830, 10517,  1299,  4482,
          1158,  1146,  1113, 20188,  1104,  6831,  1111,  1981, 15841,  3842,
           102,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## 4.1. 훈련 및 평가
- f1 : 0.9184 (아래 실행 출력 기준)

In [24]:
from tqdm.auto import tqdm

# One tick per training batch across all epochs.
progress_bar_train = tqdm(range(num_training_steps))
# One tick per validation batch across all epochs.
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


def train_loop(num_epochs, model, train_dataloader, progress_bar_train,
               eval_dataloader, progress_bar_eval, metric):
    """Fine-tune ``model`` and print the validation metric after every epoch.

    NOTE: ``optimizer`` and ``lr_scheduler`` are not parameters; they are read
    from the enclosing (notebook) namespace and must exist before the call.

    Args:
        num_epochs: Number of passes over ``train_dataloader``.
        model: Module whose call returns an object with ``loss`` and
            ``logits`` attributes.
        train_dataloader: Iterable of batch dicts (name -> tensor) for training.
        progress_bar_train: Object whose ``update(1)`` is called per training batch.
        eval_dataloader: Iterable of batch dicts for validation; each batch
            must contain a ``"labels"`` entry.
        progress_bar_eval: Object whose ``update(1)`` is called per validation batch.
        metric: Object with ``add_batch(predictions=..., references=...)``
            and ``compute()``.
    """
    # Move batches to wherever the model's weights live rather than relying
    # on a separately defined global device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for _ in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar_train.update(1)

        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                # Evaluate the model that was passed in (the original called
                # the global `custom_model` here, ignoring the argument).
                outputs = model(**batch)

            predictions = torch.argmax(outputs.logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar_eval.update(1)

        print(metric.compute())

# Run training followed by validation for `num_epochs` epochs on the custom model.
train_loop(num_epochs, custom_model, train_dataloader, progress_bar_train, \
               eval_dataloader, progress_bar_eval, metric)      

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

{'f1': 0.9184423218221895}


# 5. 테스트 데이터 로 모델 평가 
- f1: 0.9096 (아래 실행 출력 기준)

In [25]:
# Batches over the held-out test split, padded per batch by the collator.
test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=32, collate_fn=data_collator
)

def evaL_model(model, test_dataloader, metric):
    """Score ``model`` on ``test_dataloader`` and print the metric result.

    Args:
        model: Module whose call returns an object with a ``logits`` attribute.
        test_dataloader: Iterable of batch dicts (name -> tensor); each batch
            must contain a ``"labels"`` entry.
        metric: Object with ``add_batch(predictions=..., references=...)``
            and ``compute()``.

    Returns:
        None. The value of ``metric.compute()`` is printed.
    """
    model.eval()

    # Move batches to wherever the model's weights live rather than relying
    # on a separately defined global device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    print(metric.compute())
    
# Score the fine-tuned model on the held-out test split.
evaL_model(custom_model, test_dataloader, metric)    

{'f1': 0.9095550692924872}


# E. 커널 리스타팅

In [26]:
import IPython

# Shut the running kernel down with restart=True (the captured output
# reports 'restart': True); all in-memory notebook state is lost.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}