# Warming Up - PLM + 그룹 Custom Classifiers 만들기

이 노트북은 크게 2개의 모델을 생성하여 파이프라인으로 연결를 결과로 만듧니다.
- 첫 번재 모델
    - BERT Pretrained Model (PLM)
- 두 번째 모델
    - 4 개의 Classifiers 로 구성된 모델 
        - Classifiers_01, Classifiers02, Classifiers_03, Classifiers_04
- 추론
    - PLM --> Classifiers 로 이루어 지며, 최종 4개의 Classifier 의 모델 결과가 제공 됨.




---

### 참조: 
- 딥러닝으로 리뷰에서 제품 속성 정보 추출하기
    * http://blog.hwahae.co.kr/all/tech/tech-tech/5967/
- A Visual Guide to Using BERT for the First Time
    - http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
- PyTorch 101, Part 3: Going Deep with PyTorch
    - https://blog.paperspace.com/pytorch-101-advanced/
- Pytorch freeze part of the layers
    - https://jimmy-shen.medium.com/pytorch-freeze-part-of-the-layers-4554105e03a6
- BERT Fine-Tuning Tutorial with PyTorch
    - https://mccormickml.com/2019/07/22/BERT-fine-tuning/
- How many layers of my BERT model should I freeze?
    - https://raphaelb.org/posts/freezing-bert/
- Add dense layer on top of Huggingface BERT model
    - https://pyquestions.com/add-dense-layer-on-top-of-huggingface-bert-model
    

# 0. 환경 셋업

In [1]:
from transformers import AutoModel

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd
import numpy as np


# 1. 데이터 준비

In [2]:

data=load_dataset("json",data_files="download_data/news-headlines-dataset-for-sarcasm-detection.zip")
data=data.rename_column("is_sarcastic","label")

data=data.remove_columns(['article_link'])

data.set_format('pandas')
data=data['train'][:]

data.drop_duplicates(subset=['headline'],inplace=True)
data=data.reset_index()[['headline','label']]



Using custom data configuration default-5fa7fc59288bdff3
Reusing dataset json (/home/ec2-user/.cache/huggingface/datasets/json/default-5fa7fc59288bdff3/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

## 1.1 데이터 확인
- is_sarcastic (풍자) 에 따라 레이블이 1 과 0 임

In [3]:
data

Unnamed: 0,headline,label
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
28498,tyson holds contest to let fans submit new ide...,1
28499,increasingly cocky bernie sanders announces he...,1
28500,cash-strapped zuckerberg forced to sell 11 mil...,1
28501,grocery store bar actually has great little ha...,1


In [4]:
data=Dataset.from_pandas(data)

# 80% train, 20% test + validation
train_testvalid = data.train_test_split(test_size=0.2,seed=15)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)



In [5]:
train_testvalid, test_valid 

(DatasetDict({
     train: Dataset({
         features: ['headline', 'label'],
         num_rows: 22802
     })
     test: Dataset({
         features: ['headline', 'label'],
         num_rows: 5701
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['headline', 'label'],
         num_rows: 2850
     })
     test: Dataset({
         features: ['headline', 'label'],
         num_rows: 2851
     })
 }))

## 1.2 최종 사전 형태의 데이터 세트 (훈련, 검증, 테스트 셋) 생성

In [6]:
# gather everyone if you want to have a single DatasetDict
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

data

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [7]:
data["train"].data[0:5]

pyarrow.Table
headline: string
label: int64
----
headline: [["former versace store clerk sues over secret 'black code' for minority shoppers","the 'roseanne' revival catches up to our thorny political mood, for better and worse","mom starting to fear son's web series closest thing she will have to grandchild","boehner just wants wife to listen, not come up with alternative debt-reduction ideas","j.k. rowling wishes snape happy birthday in the most magical way"]]
label: [[0,0,1,1,0]]

## 1.3. 토큰나이저 로딩 및 BERT 인코딩으로 변환

In [8]:
model_id = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_len=512

In [9]:
def tokenize(batch):
  return tokenizer(batch["headline"], truncation=True,max_length=512)

tokenized_dataset = data.map(tokenize, batched=True)
#sample_dataset = data.map(tokenize, batched=True)
tokenized_dataset

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2850
    })
})

## 1.4. 데이터 로더 생성

In [10]:
tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [11]:
from torch.utils.data import DataLoader

train_batch_size = 50
eval_batch_size = 16

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=train_batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["valid"], batch_size=eval_batch_size, collate_fn=data_collator
)

In [12]:
next(iter(eval_dataloader))

{'input_ids': tensor([[  101,   170,  1440,  1120,  9304,  9753,  1162, 18254,   112,  1297,
          1105,  1578,  1112,  1103,  2851,  3587,  1851,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   176, 25690,  9037,  1106,  2194,  2919,  6486,  5914,  1114,
          1714,   188,  1204,  1181,  2774, 21195,  1254,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4267,  1116,  4695,  1231, 15189,  1116,  1900, 17462,  1279,
          2560,  1179,  1112,  1226,  1104,  1419,   118,  2043,  4684,  2019,
          9712, 26348,   185,  1174,  4184, 20473,  1465,   102],
        [  101,   192,  7745,  5834,  1940,  1116, 13830, 10517,  1299,  4482,
          1158,  1146,  1113, 20188,  1104,  6831,  1111,  1981, 15841,  3842,
           102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  5871, 24138,  1180, 12999,  1154,   112,  6866,   112,  2963,
         16

# 2.모델 정의 및 생성

## 2.1. Pre-Trained Model 로딩

In [13]:
plm = AutoModel.from_pretrained(model_id)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 2.2. BERT 아키텍쳐 확인

In [14]:
def show_module(model):
    for name, child in model.named_children():
        print("name :", name)
        #print("child: \n", child)


In [15]:
show_module(plm)

name : embeddings
name : encoder
name : pooler


In [17]:
print(plm)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## 2.3. Custom Classifier 추가 하여 Custom Model 생성 하기
- PLM + Classifier 로 구성됨.
- PLM 레이어는 훈련을 안하기 위해 파라이터 Freezing 을 함.

In [18]:
class CustomBERTModel(nn.Module):
    '''
    plm 파라미터는 freezing 하여 훈련을 하지 않음.
    '''
    def __init__(self,model ,num_labels): 
        super(CustomBERTModel,self).__init__() 
        self.num_labels = num_labels 

        self.plm = model
        # self.dropout = nn.Dropout(0.1) 
        # self.classifier = nn.Linear(768,num_labels) # load and initialize weights
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),            
            nn.Linear(768,256),        
            nn.Dropout(0.1),                        
            nn.Linear(256,num_labels) 
        )
                
        self.freeze_plm()        

    def forward(self, input_ids=None, attention_mask=None,labels=None):
        #Extract outputs from the body
        outputs = self.plm(input_ids=input_ids, attention_mask=attention_mask)
        #print("outputs shape: ", outputs[0].shape)
        
        #sequence_output = self.dropout(outputs[0]) # outputs[0]=last hidden state
        # print("sequence_output shape: ", sequence_output.shape)
        
        cls_vector = outputs[0][:,0,:].view(-1,768) # outputs[0] 은 last_hidden_state, outputs[1] 은 pooled_output_state
        # print("cls_vector shape: ", cls_vector.shape)        
        
        logits = self.classifier(cls_vector) 

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

    def freeze_plm(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.plm.parameters():
            param.requires_grad=False
        #     print(param.requires_grad)
        
    
    def unfreeze_plm(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.plm.parameters():
            param.requires_grad=True



In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model=CustomBERTModel(model = plm ,num_labels=2).to(device)

최상위 모듈(레이어)  확인

In [20]:
show_module(custom_model)

name : plm
name : classifier


파라미터 Freezing 여부 확인. plm == freezing , classifier == trainable

In [21]:
def show_trainable_layer(model):
    # requires_grad == true 만 출력
    for name, param in model.named_parameters():
        if param.requires_grad: print(name) 

show_trainable_layer(custom_model)

classifier.1.weight
classifier.1.bias
classifier.3.weight
classifier.3.bias


## 2.4. Custome Model 아키텍쳐 확인

In [22]:
print(custom_model)

CustomBERTModel(
  (plm): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

# 3. 훈련 준비

## 3.1. 모델 평가 지표 정의

In [23]:
from datasets import load_metric
# metric = load_metric("f1")
metric = load_metric("accuracy")

## 3.2. 옵티마이저, 스케줄러, 훈련 루프 정의

In [24]:
from transformers import AdamW,get_scheduler

def create_optimizer_scheduler(num_epochs, model, train_dataloader):
    # plm freezing optimizer
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
    # optimizer withoud frezzing
    # optimizer = AdamW(model.parameters(), lr=5e-5)

    
    def get_lr_scheduler(num_epochs, train_dataloader, optimizer):
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

        print(num_training_steps)

        return lr_scheduler, num_training_steps
    
    lr_scheduler, num_training_steps = get_lr_scheduler(num_epochs, train_dataloader, optimizer)
    
    return optimizer, lr_scheduler, num_training_steps


In [25]:
def train_loop(num_epochs, model, train_dataloader, progress_bar_train, \
               eval_dataloader, progress_bar_eval, metric, optimizer, lr_scheduler):
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar_train.update(1)
                
        # 모델 평가
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = custom_model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar_eval.update(1)
            

        print(metric.compute())
    
    return model



# 3.3. 훈련 루프 실행 및 평가

훈련 루프에 입력이 될 Batch 확인 함. 'eval_dataloader' 를 'train_dataloader' 로 바꾸어서 보시면 됩니다.
레코드가 많이 출력이 되어서 eval_dataloader 로 확인 함.

In [26]:
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break

{'input_ids': tensor([[  101,   170,  1440,  1120,  9304,  9753,  1162, 18254,   112,  1297,
          1105,  1578,  1112,  1103,  2851,  3587,  1851,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,   176, 25690,  9037,  1106,  2194,  2919,  6486,  5914,  1114,
          1714,   188,  1204,  1181,  2774, 21195,  1254,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4267,  1116,  4695,  1231, 15189,  1116,  1900, 17462,  1279,
          2560,  1179,  1112,  1226,  1104,  1419,   118,  2043,  4684,  2019,
          9712, 26348,   185,  1174,  4184, 20473,  1465,   102],
        [  101,   192,  7745,  5834,  1940,  1116, 13830, 10517,  1299,  4482,
          1158,  1146,  1113, 20188,  1104,  6831,  1111,  1981, 15841,  3842,
           102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  5871, 24138,  1180, 12999,  1154,   112,  6866,   112,  2963,
         16

In [27]:
from tqdm.auto import tqdm

num_epochs = 2
optimizer, lr_scheduler, num_training_steps = create_optimizer_scheduler(num_epochs, custom_model, train_dataloader)

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))

custom_model_01 = train_loop(num_epochs, custom_model, train_dataloader, progress_bar_train, \
               eval_dataloader, progress_bar_eval, metric, optimizer, lr_scheduler)      

914




  0%|          | 0/914 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

{'accuracy': 0.7410526315789474}
{'accuracy': 0.7519298245614036}


# 5. 테스트 데이터 로 모델 평가 

In [28]:
test_batch_size = 32
test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size= test_batch_size, collate_fn=data_collator
)

def evaL_model(model, test_dataloader, metric):
    model.eval()


    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    print(metric.compute())
    
evaL_model(custom_model_01, test_dataloader, metric)    

{'accuracy': 0.7649947386881796}


# 6. 모델 분리 후 추론

In [29]:
show_module(custom_model_01)

name : plm
name : classifier


## 6.1. 추론 PLM Model 생성 
- custom_model (PLM + Classifier) 에서 PLM 만을 분리 함.

In [30]:
class PLModel(nn.Module):
    def __init__(self, base_model, num_labels): 
        super(PLModel,self).__init__() 

        self.plm = base_model.plm
        

    def forward(self, input_ids=None, attention_mask=None,labels=None):
        #Extract outputs from the body

        #Add custom layers
        outputs = self.plm(input_ids=input_ids, attention_mask=attention_mask)
        
        
        cls_vector = outputs[0][:,0,:].view(-1,768)
        # print("cls_vector shape: ", cls_vector.shape)                

        return cls_vector

PL_Model=PLModel(base_model = custom_model_01 ,num_labels=2).to(device)
print("PLM: ")
show_module(PL_Model)



PLM: 
name : plm


## 6.2. 추천 이진 분류기 모델 
- custom_model (PLM + Classifier) 에서 Classifier 만을 분리 함.

In [31]:
class ClassifierModel(nn.Module):
    def __init__(self, base_model, num_labels): 
        super(ClassifierModel,self).__init__() 

        self.num_labels = num_labels 
        #self.dropout = nn.Dropout(0.1)     
        self.classifier = base_model.classifier    
        

    def forward(self, cls_vector=None, labels=None):

        #Add custom layers
        #cls_vector = self.dropout(cls_vector) #outputs[0]=last hidden state
        logits = self.classifier(cls_vector)
        
        # logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses


        return logits


classifier_01 = ClassifierModel(base_model = custom_model_01 ,num_labels=2).to(device)
print("\nClassifier: \n")
show_module(classifier_01)


Classifier: 

name : classifier


## 6.3. PLM 모델 추론
- BERT Encoding 을 입력하여 PLM 모델을 통해서 (Batch_Size, 25, 768) 벡터를 제공

In [32]:
def inference_plm(model, sample_dataloader):
    model.eval()

    output_list = []
    for i, batch in enumerate(sample_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            output_list.append(outputs)
        
        # if i == 10:
        #     break
            
    return output_list

    
plm_vector = inference_plm(PL_Model, test_dataloader)
print("batch size: " , len(plm_vector))
print("one batch shape: " , plm_vector[0].shape)

batch size:  90
one batch shape:  torch.Size([32, 768])


## 6.4. 이진 분류기 추론
PLM 모델을 통해서 (Batch_Size, 25, 768) 벡터를 입력으로 하여 Classifier 로 추론

In [33]:
from datasets import load_metric

In [34]:
def inference_classifier(model, plm_vector, test_loader):
    model.eval()

    output_list = []
    for batch, reference in zip(plm_vector, test_loader):
        # print(reference['labels'])
        # batch = batch[0].to(device)

        #print("batch shape: ", batch.shape)
        # batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(batch)
            # print("outputs: ", outputs.shape)
            predictions = torch.argmax(outputs, dim=-1)
            #print(predictions)
            metric.add_batch(predictions=predictions, references=reference["labels"])
            
            output_list.append(outputs)

        
    print(metric.compute())        
                
    return output_list

output_list = inference_classifier(classifier_01, plm_vector, test_dataloader)

{'accuracy': 0.7649947386881796}


# 7. 두번째 모델 

## 7.1. plm plus custom classifier 로 두번째 모델 생성

In [35]:
custom_model =CustomBERTModel(model = plm ,num_labels=2).to(device)

In [36]:
show_trainable_layer(custom_model)

classifier.1.weight
classifier.1.bias
classifier.3.weight
classifier.3.bias


## 7.2. 모델 훈련

In [37]:
num_epochs = 2

optimizer, lr_scheduler, num_training_steps = create_optimizer_scheduler(num_epochs, custom_model, train_dataloader)

914




In [38]:
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))

custom_model_02 = train_loop(num_epochs, custom_model, train_dataloader, progress_bar_train, \
               eval_dataloader, progress_bar_eval, metric, optimizer, lr_scheduler)      

  0%|          | 0/914 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

{'accuracy': 0.7424561403508771}
{'accuracy': 0.755438596491228}


## 7.3. 테스트 데이터로 평가

In [39]:
evaL_model(custom_model_02, test_dataloader, metric)    

{'accuracy': 0.7646439845668187}


## 7.4. 두번째 모델에서 Classifier 추출

In [40]:
classifier_02 = ClassifierModel(base_model = custom_model_02 ,num_labels=2).to(device)
show_module(classifier_02)

name : classifier


## 7.6. 이진 분류기로 추론

In [41]:
output_list = inference_classifier(classifier_02, plm_vector, test_dataloader)

{'accuracy': 0.7646439845668187}


# 8. 첫번째, 두번째의 모델을 한개의 모델로 통합
- Classifier_01 , Classifier_02 를 한개의 모델 안으로 포함 시킴 (Combine_Classifier_02)

In [42]:
class CombineClassifier(nn.Module):
    def __init__(self, base_classifier, add_classifier): 
        super(CombineClassifier,self).__init__() 

        # self.dropout = nn.Dropout(0.1)     
        
        self.base_classifier = base_classifier
        self.add_classifier = add_classifier

    def forward(self, cls_vector=None, labels=None):

        #Add custom layers
        #print("cls_vector shape: ", cls_vector.shape)
        x = cls_vector
        base_logits = self.base_classifier(x) 
        add_logits = self.add_classifier(x)


        return base_logits, add_logits


Combine_Classifier_02 = CombineClassifier(base_classifier = classifier_01 ,add_classifier = classifier_02).to(device)
show_module(Combine_Classifier_02)

name : base_classifier
name : add_classifier


In [43]:
print(Combine_Classifier_02)

CombineClassifier(
  (base_classifier): ClassifierModel(
    (classifier): Sequential(
      (0): Dropout(p=0.1, inplace=False)
      (1): Linear(in_features=768, out_features=256, bias=True)
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=256, out_features=2, bias=True)
    )
  )
  (add_classifier): ClassifierModel(
    (classifier): Sequential(
      (0): Dropout(p=0.1, inplace=False)
      (1): Linear(in_features=768, out_features=256, bias=True)
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=256, out_features=2, bias=True)
    )
  )
)


복수개의 Classifier 가 있는 Combine_Classifier 모델의 추론을 및 평가를 하는 함수 정의

In [44]:
def inference_classifier2(model, plm_vector, test_dataloader, test_batch_size, verbose=False):
    def get_depth(l):
        if isinstance(l, list):
            return 1 + max(get_depth(item) for item in l)
        elif isinstance(l, tuple):
            return 1 + max(get_depth(item) for item in l)

        else:
            return 0

    def unflatten_tuple(t, depth):
        e_list = []
        while True:
            if depth ==0:
                e_list.append(x)
                break
            x, y = t
            e_list.append(y)

            t = x
            #print("x: ", x)

            depth -= 1

        e_list.reverse()

        return e_list

    def get_num_model():
        model.eval()

        test_batch_num = len(test_dataloader)
        total_correct, correct = 0 , 0

        output_list = []
        for batch, reference in zip(plm_vector, test_dataloader):
            # print(reference['labels'])
            batch = batch[0].to(device)
            # batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                probs = model(batch)
                # print("outputs: ", probs)   
                depth = get_depth(probs)    
                probs_list = unflatten_tuple(probs, depth)            
            break
        return len(probs_list), depth
    
    def eval_model():
        #############################
        # 정확도 계산 위한 변수 정의
        #############################        
        test_batch_num = len(test_dataloader) # 총 배치 숫자
        num_models , depth = get_num_model() # 총 모델안의 분류기 수
        print("# of Moddels: ", num_models)
        
        total_correct = np.zeros((num_models,1)) # 통계를 내기 위해 각 모델마다 할당
        
        correct = 0 
        output_list = []

        model.eval()

        for batch, reference in zip(plm_vector, test_dataloader):
            # print(reference['labels'])
            # batch = batch[0].to(device)
            # batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                probs = model(batch)
                # print("outputs: ", probs)   
                #depth = get_depth(probs)    
                probs_list = unflatten_tuple(probs, depth)            
                # print("probs_list: ", probs_list)

                ground_truth = reference["labels"].to(device)
                
                if verbose:
                    print("Ground_Truth: \n", ground_truth, "\n")            
                # print("outputs: ", outputs.shape)   

                # 각 모델 마다 correct 수를 구함.
                for i, pred in enumerate(probs_list):

                    correct += (pred.argmax(1) == ground_truth).type(torch.float).sum().item()
                    total_correct[i] +=correct                
                    correct /= test_batch_size

                    correct = 0

                    if verbose:
                        print(f"From model_0{i+1} - Predicted Label:")                
                        print(pred.argmax(1))

                        print(f"From model_0{i+1} Accuracy: {(100*correct):>0.2f}% \n")


        # 전체 배치에 대한 모델 마다 정확도 구함.
        num_total_payload = test_batch_num * test_batch_size                
        for i in range(num_models):
            total_correct[i] /= num_total_payload    
            #print(" total_correct[i] : ",         total_correct[i])
            print(f"From model_0{i+1} Accuracy: {(100*total_correct[i][0]):>0.2f}% \n")    
    
    
    eval_model()
    

In [45]:
output_list = inference_classifier2(Combine_Classifier_02, plm_vector, test_dataloader, test_batch_size)

# of Moddels:  2
From model_01 Accuracy: 75.73% 

From model_02 Accuracy: 75.69% 



# 9. 세번째 모델 생성 및 통합 추론

## 9.1. 세번째 모델 생성

In [46]:
num_epochs = 2
# 모델 훈련
custom_model =CustomBERTModel(model = plm ,num_labels=2).to(device)
optimizer, lr_scheduler, num_training_steps = create_optimizer_scheduler(num_epochs, custom_model, train_dataloader)
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


def create_bert_model(num_epochs, custom_model):
    print("eval dataset accuracy in training loop")
    custom_model = train_loop(num_epochs, custom_model, train_dataloader, progress_bar_train, \
                   eval_dataloader, progress_bar_eval, metric, optimizer, lr_scheduler)      

    # 모델 평가
    print("inference accuracy for plm plus classifer")    
    evaL_model(custom_model, test_dataloader, metric)    

    # Classifier 추출
    classifier = ClassifierModel(base_model = custom_model ,num_labels=2).to(device)

    # Classifier 모델 구조 확인
    print("classifier architecture: ")        
    show_module(classifier)

    # plm vector 추출
    plm_vector = inference_plm(PL_Model, test_dataloader)
    # print("batch size: " , len(plm_vector))
    # print("one batch shape: " , plm_vector[0][0].shape)

    # Classifier 로 추론
    print("inference accuracy for classifer")
    output_list = inference_classifier(classifier, plm_vector, test_dataloader)
    
    return classifier

    
classifier_03 = create_bert_model(num_epochs, custom_model)


914




  0%|          | 0/914 [00:00<?, ?it/s]

  0%|          | 0/358 [00:00<?, ?it/s]

eval dataset accuracy in training loop
{'accuracy': 0.7421052631578947}
{'accuracy': 0.7568421052631579}
inference accuracy for plm plus classifer
{'accuracy': 0.7653454928095406}
classifier architecture: 
name : classifier
inference accuracy for classifer
{'accuracy': 0.7653454928095406}


## 9.2. 세 번째 모델(classifier_03) 을 기존의 모델 (classifier_01, classifier_02) 로 병합

In [47]:
Combine_Classifier_03 = CombineClassifier(base_classifier=Combine_Classifier_02 ,
                                          add_classifier=classifier_03).to(device)
show_module(Combine_Classifier_03)

name : base_classifier
name : add_classifier


In [48]:
print(Combine_Classifier_03)

CombineClassifier(
  (base_classifier): CombineClassifier(
    (base_classifier): ClassifierModel(
      (classifier): Sequential(
        (0): Dropout(p=0.1, inplace=False)
        (1): Linear(in_features=768, out_features=256, bias=True)
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=256, out_features=2, bias=True)
      )
    )
    (add_classifier): ClassifierModel(
      (classifier): Sequential(
        (0): Dropout(p=0.1, inplace=False)
        (1): Linear(in_features=768, out_features=256, bias=True)
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=256, out_features=2, bias=True)
      )
    )
  )
  (add_classifier): ClassifierModel(
    (classifier): Sequential(
      (0): Dropout(p=0.1, inplace=False)
      (1): Linear(in_features=768, out_features=256, bias=True)
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=256, out_features=2, bias=True)
    )
  )
)


## 9.3. 통합 모델 (classifier_01, classifier_02, classifier_03) 을 추론

In [49]:
output_list = inference_classifier2(Combine_Classifier_03, plm_vector, test_dataloader, test_batch_size, verbose=False)

# of Moddels:  3
From model_01 Accuracy: 75.73% 

From model_02 Accuracy: 75.69% 

From model_03 Accuracy: 75.76% 



# 10. 4번째 모델 생성 및 통합 모델 (classifier_01, classifier_02, classifier_03, classifier_04) 을 추론

In [50]:
print("(1) Create Bert MOdel (plm + classifier)")

num_epochs = 4
custom_model =CustomBERTModel(model = plm ,num_labels=2).to(device)
optimizer, lr_scheduler, num_training_steps = create_optimizer_scheduler(num_epochs, custom_model, train_dataloader)
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))
classifier_04 = create_bert_model(num_epochs, custom_model)

print("\n(2) Create a group of four classifiers")
Combine_Classifier_04 = CombineClassifier(base_classifier=Combine_Classifier_03 ,
                                          add_classifier=classifier_04).to(device)
print("\n(3) Look at the architecture")
show_module(Combine_Classifier_04)
print(Combine_Classifier_04)
print("\n(4) Inference the group of 4 classifier")
output_list = inference_classifier2(Combine_Classifier_04, plm_vector, test_dataloader, test_batch_size, verbose=False)

(1) Create Bert MOdel (plm + classifier)
1828




  0%|          | 0/1828 [00:00<?, ?it/s]

  0%|          | 0/716 [00:00<?, ?it/s]

eval dataset accuracy in training loop
{'accuracy': 0.7410526315789474}
{'accuracy': 0.7740350877192983}
{'accuracy': 0.771578947368421}
{'accuracy': 0.775438596491228}
inference accuracy for plm plus classifer
{'accuracy': 0.7930550683970536}
classifier architecture: 
name : classifier
inference accuracy for classifer
{'accuracy': 0.7930550683970536}

(2) Create a group of four classifiers

(3) Look at the architecture
name : base_classifier
name : add_classifier
CombineClassifier(
  (base_classifier): CombineClassifier(
    (base_classifier): CombineClassifier(
      (base_classifier): ClassifierModel(
        (classifier): Sequential(
          (0): Dropout(p=0.1, inplace=False)
          (1): Linear(in_features=768, out_features=256, bias=True)
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=256, out_features=2, bias=True)
        )
      )
      (add_classifier): ClassifierModel(
        (classifier): Sequential(
          (0): Dropout(p=0.1, inplace

# E. 커널 리스타팅

In [None]:
# import IPython

# IPython.Application.instance().kernel.do_shutdown(True)