# 네이버 영화 멀티 레이블링 (BCELoss)

참고
- transformers_multi-label_classification
    - https://www.kaggle.com/code/eggwhites2705/transformers-multi-label-classification

# 0. 설정 파일 로딩

In [1]:
# Flag selecting the Korean Naver toxic-review configuration module.
KR_NAVER_TOXIC = True

# `config` is bound only when the flag is set; the cells below read config.* attributes.
if KR_NAVER_TOXIC:
    from src import config_kr_naver_toxic as config

# 1. 환경 설정

In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
import torch
import torch.nn as nn
import pandas as pd

import numpy as np
import os

from sklearn import metrics
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel

# 2. 데이터 준비

In [4]:
# %store -r naver_toxic_train_file_path
# %store -r naver_toxic_test_file_path

In [5]:
# Build the CSV locations from the directories named in the config, then echo them.
train_file_path = os.path.join(config.train_data_dir, "train.csv")
test_file_path = os.path.join(config.test_data_dir, "test.csv")
for caption, csv_path in (
    ("train_file_path : ", train_file_path),
    ("test_file_path : ", test_file_path),
):
    print(caption, csv_path)

train_file_path :  data/naver_toxic_review/train.csv
test_file_path :  data/naver_toxic_review/test.csv


## 2.1. 데이터 로딩 및 레이블 리스트 작성

In [6]:
def read_csv_data(naver_toxic_train_file_path, labels):
    """Load a CSV file and attach a per-row list of its label values.

    Args:
        naver_toxic_train_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        labels: Names of the label columns to gather.

    Returns:
        The loaded DataFrame with an extra ``list`` column that holds, for each
        row, the values of the ``labels`` columns in the given order.
    """
    frame = pd.read_csv(naver_toxic_train_file_path)
    label_rows = frame[labels].to_numpy().tolist()
    frame['list'] = label_rows
    return frame

# Label columns come from the config, e.g. ['toxic','obscene','threat','insult','identity_hate'].
labels = config.labels
train_df, test_df = (
    read_csv_data(csv_path, labels)
    for csv_path in (train_file_path, test_file_path)
)
train_df.head()


Unnamed: 0,index,id,comment_text,toxic,obscene,threat,insult,identity_hate,list
0,0,9976970.0,아 더빙.. 진짜 짜증나네요 목소리,1,0,0,0,0,"[1, 0, 0, 0, 0]"
1,1,9045019.0,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,1,0,0,0,0,"[1, 0, 0, 0, 0]"
2,2,5403919.0,막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.,1,0,0,0,0,"[1, 0, 0, 0, 0]"
3,3,7797314.0,원작의 긴장감을 제대로 살려내지못했다.,1,0,0,0,0,"[1, 0, 0, 0, 0]"
4,4,9443947.0,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...,1,0,0,0,1,"[1, 0, 0, 0, 1]"


## 2.2. 피처와 레이블 분리

In [7]:
def get_feature_label(df):
    """Split a DataFrame into its text features and its label lists.

    Args:
        df: DataFrame with a ``comment_text`` column and a ``list`` column.

    Returns:
        A ``(texts, labels)`` tuple of Python lists taken from those two columns.
    """
    return df["comment_text"].tolist(), df["list"].tolist()
    
# Split both frames into (texts, labels) pairs.
(train_texts, train_labels), (test_texts, test_labels) = (
    get_feature_label(frame) for frame in (train_df, test_df)
)

In [8]:
# Preview the first three texts alongside their label vectors.
train_texts[:3], train_labels[:3]

(['아 더빙.. 진짜 짜증나네요 목소리',
  '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정',
  '막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.'],
 [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0]])

# 3. 토크나이저 준비 및 Electra 모델 인코딩 준비

## 3.1. Electra 모델 입력으로 변경

In [9]:
%%time 

# Load the tokenizer named in the config (an ElectraTokenizer was used here earlier).
tokenizer_id = config.tokenizer_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Both splits are encoded with the same options; no validation split is encoded.
encode_kwargs = {
    "return_token_type_ids": False,
    "truncation": True,
    "padding": True,
}
train_encodings = tokenizer(train_texts, **encode_kwargs)
test_encodings = tokenizer(test_texts, **encode_kwargs)

CPU times: user 3.07 s, sys: 1.5 s, total: 4.57 s
Wall time: 909 ms


## 3.2. 사용자 데이터 세트 생성

In [10]:
class NaverToxicDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-sample label lists."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label lists."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, labels under ``'labels'``."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



In [11]:
# Wrap each split's encodings and labels in a dataset object.
train_dataset = NaverToxicDataset(encodings=train_encodings, labels=train_labels)
test_dataset = NaverToxicDataset(encodings=test_encodings, labels=test_labels)

In [12]:
# Fetch the first item of each dataset; the notebook displays only the value
# of the last expression (the test_dataset item).
next(iter(train_dataset))
next(iter(test_dataset))

{'input_ids': tensor([    2,  6394,  4110,  2780,  4034,  9121,  9747,  2252,  4070,  6437,
          6880,  3249, 16868,  6272,    18,    18,    18,  3755,  4034,  7265,
          4172,  3755,  4325,  3771,  4820,  6394,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## 3.3. 데이터 로더 생성

In [13]:
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 16

# Training batches are drawn in a fresh random order on every pass.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions aligned with the row order of the test data and makes runs repeatable.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(train_dataset, **train_params)
testing_loader = DataLoader(test_dataset, **test_params)

In [14]:
# NOTE(review): this fetches a single item from test_dataset, not a batch from
# testing_loader — confirm which one was meant to be inspected here.
next(iter(test_dataset))

{'input_ids': tensor([    2,  6394,  4110,  2780,  4034,  9121,  9747,  2252,  4070,  6437,
          6880,  3249, 16868,  6272,    18,    18,    18,  3755,  4034,  7265,
          4172,  3755,  4325,  3771,  4820,  6394,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

# 4. 모델 생성

## 4.1. Pre-Trained Model 로딩

In [15]:
# Load the pre-trained backbone named in the config.
# Earlier variant: plm = ElectraModel.from_pretrained(model_id)
model_id = config.model_id
plm = AutoModel.from_pretrained(model_id)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 4.2. Custom Classifier를 추가하여 Custom Model 생성하기

In [16]:
class CustomTransformerModel(nn.Module):
    """Pre-trained transformer body followed by a linear multi-label head.

    Args:
        model: Backbone module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and its result is indexed with
            ``[0]`` to obtain the last hidden state.
        num_labels: Number of logits produced per sample (one per label).
        num_cls_vector: Size of the hidden vector taken at sequence position 0
            and fed to the classifier. It must match the backbone's hidden size
            (the head was previously hard-coded to 256).
    """

    def __init__(self, model, num_labels, num_cls_vector):
        super().__init__()
        self.num_labels = num_labels
        self.num_cls_vector = num_cls_vector

        self.model = model
        self.dropout = nn.Dropout(0.1)
        # Size the head from the argument so backbones with another hidden
        # size (e.g. 768) work without editing this class.
        self.classifier = nn.Linear(num_cls_vector, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return the raw classifier logits for a batch.

        ``labels`` is accepted for interface compatibility but is not used;
        no loss is computed here.
        """
        # Run the backbone; outputs[0] is its last hidden state.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])

        # Keep only the vector at sequence position 0 of every sample. The
        # slice is already 2-D, so no reshape to a fixed width is needed.
        cls_vector = sequence_output[:, 0, :]
        return self.classifier(cls_vector)



In [17]:

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

num_labels = len(labels)
num_cls_vector = config.num_cls_vector

# Pre-trained backbone plus custom classifier head, moved to the chosen device.
model = CustomTransformerModel(plm, num_labels, num_cls_vector)
model = model.to(device)

## 4.3. optimizer, loss() 함수 정의
- BCEWithLogitsLoss 로 loss() 정의
    - [BCEWithLogitsLoss](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)

In [18]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between raw logits and float targets.

    Args:
        outputs: Logits (no sigmoid applied).
        targets: Float tensor of the same shape holding the 0/1 targets.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every parameter of the model (backbone and classifier head alike).
LEARNING_RATE = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)




# 5. 훈련 실행

In [19]:
EPOCHS = 1


def train(epoch, log_interval):
    """Run one pass over ``training_loader`` with one optimizer step per batch.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        epoch: Epoch index; used only in the progress message.
        log_interval: The batch loss is printed whenever the batch index is a
            multiple of this value (so the first batch is always printed).
    """
    model.train()
    for step, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['labels'].to(device, dtype=torch.float)

        # Clear gradients from the previous step; once per step is enough.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        if step % log_interval == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        


In [20]:
%%time 
# Set the epoch count again and call train() once per epoch, printing the loss every 100 batches.
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch, log_interval=100)        

Epoch: 0, Loss:  0.7059875726699829
Epoch: 0, Loss:  0.21625137329101562
Epoch: 0, Loss:  0.18075047433376312
CPU times: user 21.6 s, sys: 2.58 s, total: 24.2 s
Wall time: 24.2 s


# 6. 테스트 데이터로 검증

In [21]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the model in eval mode.

    Uses the notebook-level ``model`` and ``device``. Gradients are disabled
    for the whole pass.

    Args:
        epoch: Accepted for symmetry with ``train``; not used.

    Returns:
        A ``(fin_outputs, fin_targets)`` tuple of nested Python lists: the
        sigmoid of the model outputs and the matching float targets, one inner
        list per sample.
    """
    model.eval()
    collected_probs = []
    collected_targets = []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.float)

            logits = model(ids, mask)
            probs = torch.sigmoid(logits)

            collected_targets += targets.detach().cpu().numpy().tolist()
            collected_probs += probs.detach().cpu().numpy().tolist()

    return collected_probs, collected_targets

In [22]:
EPOCHS = 1
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # Threshold the sigmoid scores at 0.5 to get one boolean decision per label.
    outputs = np.array(outputs) >= 0.5

    accuracy = metrics.accuracy_score(targets, outputs)
    # zero_division=0 yields the same scores as the default (0.0 for a label
    # that is never predicted) but without the undefined-metric warning.
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro', zero_division=0)
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro', zero_division=0)
    print(metrics.classification_report(targets, outputs, output_dict=False,
                                        target_names=labels, zero_division=0))

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

               precision    recall  f1-score   support

        toxic       0.99      1.00      1.00      1988
      obscene       0.00      0.00      0.00        16
       threat       0.00      0.00      0.00         3
       insult       0.00      0.00      0.00        27
identity_hate       0.00      0.00      0.00        42

    micro avg       0.99      0.96      0.98      2076
    macro avg       0.20      0.20      0.20      2076
 weighted avg       0.95      0.96      0.95      2076
  samples avg       0.99      0.97      0.98      2076

Accuracy Score = 0.9574787393696849
F1 Score (Micro) = 0.9757055214723926
F1 Score (Macro) = 0.19944820667168295


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
