# KoELECTRA 모델로 네이버 리뷰 감성 분석




---

### 참조: 
Adding Custom Layers on Top of a Hugging Face Model
- https://towardsdatascience.com/adding-custom-layers-on-top-of-a-hugging-face-model-f1ccdfc257bd
- code
    - https://jovian.ai/rajbsangani/emotion-tuned-sarcasm

# 1. 환경 셋업

## 1.1 변수 로딩 및 라이브러리 로딩

In [1]:
%store -r local_train_output_path
%store -r local_test_output_path

In [2]:
%load_ext autoreload
%autoreload 2

# src 폴더 경로 설정
import sys
sys.path.append('./src')
import config
from  data_util import read_nsmc_split

In [3]:
import logging

# Notebook-wide logger that writes to stdout so messages appear in cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # switch to logging.WARNING to silence info output
# Re-running this cell must not stack another handler on the same logger;
# otherwise every message is printed once per execution of the cell.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [4]:
from transformers import AutoModel

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd


## 1.2. Pre-trained model_id, tokenizer_id 지정
- [KoElectra Git](https://github.com/monologg/KoELECTRA)
- KoElectra Model
    - Small:
        - monologg/koelectra-small-v3-discriminator
    - Base: 
        - monologg/koelectra-base-v3-discriminator
        


In [5]:
# from datasets import load_dataset
from transformers import (
    ElectraModel, 
    ElectraTokenizer, 
)

# Checkpoint identifier of the pre-trained model; the tokenizer is loaded from
# the very same identifier, so it is defined once and reused.
model_id = "monologg/koelectra-small-v3-discriminator"
tokenizer_id = model_id



# 2. 데이터 준비

## 2.1 데이터 로딩

In [6]:
train_texts, train_labels = read_nsmc_split(local_train_output_path)
test_texts, test_labels = read_nsmc_split(local_test_output_path)

In [7]:
logger.info(f"len: {len(train_texts)} \nSample: {train_texts[0:5]}")
logger.info(f"len: {len(train_labels)} \nSample: {train_labels[0:5]}")

len: 149552 
Sample: ['흠   포스터보고 초딩영화줄    오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다', '교도소 이야기구먼   솔직히 재미는 없다  평점 조정', '사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '막 걸음마 뗀 세부터 초등학교 학년생인 살용영화 ㅋㅋㅋ   별반개도 아까움']
len: 149552 
Sample: [1, 0, 0, 1, 0]


## 2.2. 훈련 데이터를 분리하여 검증 데이터 세트 생성

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation set.
# A fixed random_state makes the split reproducible, so results from different
# kernel sessions are measured on the same validation examples.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

# 3. Electra Model 입력 인코딩 변환 및 torch custom Dataset 생성

## 3.1. 토크나이저 로딩

In [9]:
tokenizer = ElectraTokenizer.from_pretrained(tokenizer_id)

## 3.2. Electra Model 입력 인코딩 생성

In [10]:
%%time 

# The tokenizer was already created in the previous cell (section 3.1), so it
# is reused here instead of being loaded a second time.
# All three splits are encoded with one shared set of options:
#   - token_type_ids are not returned,
#   - truncation is enabled,
#   - padding is enabled (applied per call, i.e. separately for each split).
encode_options = dict(return_token_type_ids=False, truncation=True, padding=True)

train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

CPU times: user 42.7 s, sys: 260 ms, total: 43 s
Wall time: 43 s


## 3.3. torch custom dataset 생성

In [11]:
from data_util import NSMCDataset

train_dataset = NSMCDataset(train_encodings, train_labels)
val_dataset = NSMCDataset(val_encodings, val_labels)
test_dataset = NSMCDataset(test_encodings, test_labels)

In [12]:
logger.info(f"len(train_dataset) : {len(train_dataset)}")
logger.info(f"len(val_dataset) : {len(val_dataset)}")
logger.info(f"len(test_dataset) : {len(test_dataset)}")


len(train_dataset) : 119641
len(val_dataset) : 29911
len(test_dataset) : 49832


## 3.4. 데이터 로더 생성

### Sampler 생성

In [13]:
from torch.utils.data import DataLoader, SubsetRandomSampler


from train_util import create_random_sampler
    
subset_train_sampler = create_random_sampler(train_dataset, frac=0.01, is_shuffle=True, logger=logger)
train_sampler = create_random_sampler(train_dataset, frac=1, is_shuffle=True, logger=logger)

subset_eval_sampler = create_random_sampler(val_dataset, frac=0.001, is_shuffle=False, logger=logger)
eval_sampler = create_random_sampler(val_dataset, frac=1, is_shuffle=False, logger=logger)
test_sampler = create_random_sampler(test_dataset, frac=1, is_shuffle=False, logger=logger)



dataset size with frac: 0.01 ==> 1196
dataset size with frac: 1 ==> 119641
dataset size with frac: 0.001 ==> 29
dataset size with frac: 1 ==> 29911
dataset size with frac: 1 ==> 49832


### 데이터 로더 생성

In [14]:
# Batch sizes used for the training, validation and test loaders.
train_batch_size = 32
eval_batch_size = 4
test_batch_size = 32


def _build_loader(dataset, sampler, batch_size):
    """Create a DataLoader whose iteration order is fully decided by ``sampler``."""
    return DataLoader(
        dataset=dataset,
        shuffle=False,
        batch_size=batch_size,
        sampler=sampler,
    )


# Small subsets for quick experiments, plus the full-size loaders.
train_sample_loader = _build_loader(train_dataset, subset_train_sampler, train_batch_size)
train_dataloader = _build_loader(train_dataset, train_sampler, train_batch_size)

eval_sample_loader = _build_loader(val_dataset, subset_eval_sampler, eval_batch_size)
eval_dataloader = _build_loader(val_dataset, eval_sampler, eval_batch_size)

test_dataloader = _build_loader(test_dataset, test_sampler, test_batch_size)



# 4. 모델 정의 및 생성

## 4.1. Pre-Trained Model 로딩

In [15]:
plm = AutoModel.from_pretrained(model_id)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 4.2. Electra Model 아키텍쳐 확인

In [16]:
def show_module(model):
    """Print the name of every direct child of ``model``.

    Args:
        model: Object exposing ``named_children()``, which yields
            ``(name, child)`` pairs. Only the names are printed.
    """
    for child_name, _child in model.named_children():
        print("name :", child_name)


In [17]:
show_module(plm)

name : embeddings
name : embeddings_project
name : encoder


In [20]:
# print(plm)

## 4.3. Custom Classifier 추가 하여 Custom Model 생성 하기

In [26]:
class CustomBERTModel(nn.Module):
    """Pre-trained encoder body with a dropout + linear classification head.

    Each sequence is classified from the hidden state of its first token
    (position 0 of the encoder's last hidden state).

    Args:
        model: Encoder module. It is called as
            ``model(input_ids=..., attention_mask=...)``; the result must
            support ``[0]`` (used as the last hidden state) and expose the
            ``hidden_states`` and ``attentions`` attributes.
        num_labels: Number of target classes.
        hidden_size: Width of the encoder's hidden states. When ``None``
            (default) it is read from ``model.config.hidden_size``, so the
            classifier no longer has to be edited by hand when switching
            between checkpoints of different width (e.g. 256 vs. 768).
        dropout_prob: Dropout probability applied before the classifier.

    Raises:
        ValueError: If ``hidden_size`` is not given and cannot be read from
            ``model.config``.
    """

    def __init__(self, model, num_labels, hidden_size=None, dropout_prob=0.1):
        super().__init__()
        self.num_labels = num_labels

        if hidden_size is None:
            # Derive the width from the encoder instead of hard-coding it.
            hidden_size = getattr(getattr(model, "config", None), "hidden_size", None)
            if hidden_size is None:
                raise ValueError(
                    "hidden_size was not given and model.config.hidden_size is not available"
                )

        self.model = model
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids passed through to the encoder.
            attention_mask: Attention mask passed through to the encoder.
            labels: Optional class indices. When given, a cross-entropy loss
                over the logits is computed.

        Returns:
            TokenClassifierOutput carrying ``loss`` (``None`` without labels),
            ``logits`` (one row per sequence, ``num_labels`` columns) and the
            encoder's ``hidden_states`` / ``attentions``.
        """
        # Extract outputs from the encoder body; outputs[0] is the last hidden state.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Only the first token's state feeds the classifier, so slice it out
        # before dropout instead of applying dropout to the whole sequence.
        first_token_state = outputs[0][:, 0, :]
        logits = self.classifier(self.dropout(first_token_state))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # NOTE: the output container type is kept as-is for compatibility with
        # existing callers, even though this is sequence-level classification.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model=CustomBERTModel(model = plm ,num_labels=2).to(device) # plm + custom classifier

## 4.4. Custom Model 아키텍처 확인

In [28]:
show_module(custom_model)

name : model
name : dropout
name : classifier


In [29]:
# print(custom_model)

# 5. 모델 훈련 

## 5.1. 훈련 준비

In [30]:
from transformers import get_scheduler

# The AdamW class shipped with transformers is deprecated, so the maintained
# torch.optim.AdamW is used instead.  eps and weight_decay are passed
# explicitly to keep the hyperparameters of the previous optimizer
# (torch's own defaults differ: eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    custom_model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

num_epochs = 1
# One scheduler step is taken per training batch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear learning-rate schedule over all training steps, without warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

3739


In [31]:
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed
# version before upgrading dependencies.
from datasets import load_metric
# Accuracy metric object; predictions are accumulated with add_batch() and the
# score is produced with compute() in the training and evaluation functions.
metric = load_metric("accuracy")

## 5.2. 훈련 루프 실행

훈련 루프에 입력될 배치(batch)를 확인합니다. 훈련 배치를 보려면 아래 코드의 'eval_dataloader'를 'train_dataloader'로 바꾸면 됩니다.
train_dataloader는 출력되는 레코드가 많으므로 여기서는 eval_dataloader로 확인합니다.

In [32]:
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     print(batch)
#     break

## 5.3. 훈련 및 평가

In [33]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


def train_loop(num_epochs, model, train_dataloader, progress_bar_train,
               eval_dataloader, progress_bar_eval, metric):
    """Fine-tune ``model`` and print the validation metric after every epoch.

    This function relies on the module-level names ``device``, ``optimizer``
    and ``lr_scheduler`` being defined before it is called.

    Args:
        num_epochs: Number of passes over ``train_dataloader``.
        model: Model to train. It is called with the batch as keyword
            arguments and must return an object with ``loss`` and ``logits``.
        train_dataloader: Iterable of training batches (dicts of tensors).
        progress_bar_train: Progress bar advanced once per training batch.
        eval_dataloader: Iterable of validation batches (dicts of tensors
            containing a ``"labels"`` entry).
        progress_bar_eval: Progress bar advanced once per validation batch.
        metric: Object with ``add_batch(predictions=, references=)`` and
            ``compute()``.
    """
    for _ in range(num_epochs):
        # ---- training phase ----
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar_train.update(1)

        # ---- validation phase ----
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                # Evaluate the model that was passed in (and just trained),
                # not a global model object.
                outputs = model(**batch)

            predictions = torch.argmax(outputs.logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar_eval.update(1)
            # The whole validation set is scored: stopping after the first
            # batch would report accuracy on a single batch only.

        print(metric.compute())

train_loop(num_epochs, custom_model, train_dataloader, progress_bar_train, \
               eval_dataloader, progress_bar_eval, metric)      

  0%|          | 0/3739 [00:00<?, ?it/s]

  0%|          | 0/7478 [00:00<?, ?it/s]

logits.view(-1, self.num_labels)
 tensor([[ 0.0798, -0.0116],
        [ 0.3035,  0.1060],
        [ 0.0436,  0.0612],
        [ 0.0670,  0.0433],
        [ 0.2970, -0.1680],
        [ 0.4261,  0.1429],
        [ 0.3005, -0.1175],
        [ 0.1866, -0.1190],
        [ 0.2968, -0.0354],
        [ 0.0011, -0.0970],
        [ 0.2615, -0.0395],
        [ 0.0821, -0.0491],
        [ 0.1728, -0.0321],
        [ 0.0669, -0.0867],
        [ 0.1367,  0.0228],
        [ 0.2040, -0.0672],
        [ 0.2185,  0.0542],
        [ 0.2080,  0.0366],
        [ 0.1689, -0.3635],
        [ 0.1712, -0.1791],
        [ 0.3028, -0.0627],
        [ 0.2598, -0.0741],
        [ 0.2553, -0.0443],
        [ 0.1954,  0.0042],
        [ 0.2033, -0.1746],
        [ 0.2029, -0.1608],
        [ 0.2394, -0.1464],
        [ 0.2593, -0.2523],
        [ 0.2253,  0.0395],
        [ 0.1534, -0.2716],
        [ 0.2323, -0.0643],
        [ 0.0819, -0.1106]], device='cuda:0', grad_fn=<ViewBackward>)
labels.view(-1): 
 tensor([1

# 6. 테스트 데이터로 모델 평가

In [29]:

def evaL_model(model, test_dataloader, metric):
    """Score ``model`` on every batch of ``test_dataloader`` and print the metric.

    Uses the module-level ``device`` to place each batch.

    Args:
        model: Model called with the batch as keyword arguments; its result
            must expose ``logits``.
        test_dataloader: Iterable of batches (dicts of tensors containing a
            ``"labels"`` entry).
        metric: Object with ``add_batch(predictions=, references=)`` and
            ``compute()``.
    """
    model.eval()

    for raw_batch in test_dataloader:
        inputs = {key: tensor.to(device) for key, tensor in raw_batch.items()}

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            result = model(**inputs)

        # The predicted class is the index of the largest logit.
        predicted = torch.argmax(result.logits, dim=-1)
        metric.add_batch(predictions=predicted, references=inputs["labels"])

    print(metric.compute())
    


In [30]:
evaL_model(custom_model, test_dataloader, metric)    

{'accuracy': 0.8686185583560764}


# E. 커널 리스타팅

In [31]:
import IPython

# Shut down the running kernel; the True argument asks for a restart
# (the cell output reports 'restart': True).
# NOTE(review): presumably done to release memory held by this notebook — confirm.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}