# 멀티 레이블 분류 (BCEWithLogitsLoss)

참고
- transformers_multi-label_classification
    - https://www.kaggle.com/code/eggwhites2705/transformers-multi-label-classification

# 0. 설정 파일 로딩

In [1]:
# Dataset switches: the first flag that is True (checked in the order below)
# decides which project config module is imported under the name `config`.
KR_NAVER_TOXIC = False
EN_KAGGLE_TOXIC = False
KR_KAGGLE_TOXIC = True

if KR_NAVER_TOXIC:
    from src import config_kr_naver_toxic as config
elif EN_KAGGLE_TOXIC:
    from src import config_en_kaggle_toxic as config
elif KR_KAGGLE_TOXIC:
    from src import config_kr_kaggle_toxic as config
else:
    # Fail here with a clear message instead of a NameError on `config`
    # in a later cell when no dataset has been selected.
    raise ValueError(
        "No dataset selected: set one of KR_NAVER_TOXIC, EN_KAGGLE_TOXIC "
        "or KR_KAGGLE_TOXIC to True."
    )

# 1. 환경 설정

In [2]:
# Enable IPython's autoreload extension in mode 2, so edited modules
# (e.g. the project config) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2


In [3]:
import torch
import torch.nn as nn
import pandas as pd

import numpy as np
import os

from sklearn import metrics
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel

# 2. 데이터 준비

In [4]:
# Build the train/test CSV paths from the directories named in the config
# and echo them so the selected dataset is visible in the notebook output.
train_file_path = os.path.join(config.train_data_dir, "train.csv")
test_file_path = os.path.join(config.test_data_dir, "test.csv")
print("train_file_path : ", train_file_path)
print("test_file_path : ", test_file_path)

train_file_path :  data/translated_kaggle_toxic_review/train.csv
test_file_path :  data/translated_kaggle_toxic_review/test.csv


## 2.1. 데이터 로딩 및 레이블 리스트 작성

In [5]:
def read_csv_data(naver_toxic_train_file_path, labels):
    """Load a CSV file and attach a per-row label vector.

    Args:
        naver_toxic_train_file_path: Path of the CSV file to read (despite
            the name, it is used for both the train and the test file).
        labels: Names of the label columns to collect.

    Returns:
        The loaded DataFrame with an extra ``list`` column holding, for each
        row, the values of the ``labels`` columns in the given order.
    """
    frame = pd.read_csv(naver_toxic_train_file_path)
    label_rows = frame[labels].values.tolist()
    frame['list'] = label_rows
    return frame

# The label column names are defined by the selected dataset config.
labels = config.labels

train_df = read_csv_data(train_file_path, labels)
test_df = read_csv_data(test_file_path, labels)

# Preview the first rows, including the derived `list` column.
train_df.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_en,list
0,0000997932d777bf,설명\n내 사용자 이름 하드코어 메탈리카 팬으로 편집한 내용이 되돌아간 이유는 무엇...,0,0,0,0,0,0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,000103f0d9cfb60f,다우!그는 제가 붙어있는 것처럼 보이는 이 배경색과 일치합니다.감사합니다.(토크) ...,0,0,0,0,0,0,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,000113f07ec002fd,"이봐, 난 정말 전쟁을 편집하려고 하는 게 아니야.이 사람은 제 토론 문서 대신 지...",0,0,0,0,0,0,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,0001b41b1c6bb37e,“\n더 보기\n개선에 대한 실질적인 제안을 할 수 없습니다. 섹션 통계가 나중에 ...,0,0,0,0,0,0,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,0001d958c54c6e35,"선생님, 당신은 제 영웅이에요.어떤 페이지에 있는지 기억할 수 있을까요?",0,0,0,0,0,0,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


## 2.2. 피처와 레이블 분리

In [6]:
def get_feature_label(df):
    """Split a frame into parallel lists of texts and label vectors.

    Args:
        df: DataFrame with a ``comment_text`` column and a ``list`` column.

    Returns:
        A ``(texts, labels)`` tuple: the ``comment_text`` values and the
        ``list`` values, each converted to a plain Python list.
    """
    return df.comment_text.tolist(), df.list.tolist()
    
# Separate model inputs (texts) from targets (label vectors) for each split.
train_texts, train_labels = get_feature_label(train_df)
test_texts, test_labels = get_feature_label(test_df)

In [7]:
# Show the first three training texts next to their label vectors.
train_texts[:3], train_labels[:3]

(['설명\n내 사용자 이름 하드코어 메탈리카 팬으로 편집한 내용이 되돌아간 이유는 무엇인가요?그들은 기물 파손이 아니라 뉴욕 인형 FAC에서 투표한 후 일부 GA를 폐쇄했습니다.그리고 제가 지금 은퇴했으니 토론 문서에서 템플릿을 제거하지 마십시오.89.205.38.27',
  '다우!그는 제가 붙어있는 것처럼 보이는 이 배경색과 일치합니다.감사합니다.(토크) 2016년 1월 11일 21:51 (UTC)',
  '이봐, 난 정말 전쟁을 편집하려고 하는 게 아니야.이 사람은 제 토론 문서 대신 지속적으로 관련 정보를 제거하고 편집을 통해 저와 대화하고 있다는 것입니다.그는 실제 정보보다 서식에 더 신경을 쓰는 것 같습니다.'],
 [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]])

# 3. 토크나이저 준비 및 트랜스포머 모델 인코딩 준비

## 3.1. 트랜스포머 모델 입력으로 변경

In [8]:
# Load the tokenizer named in the config. `model_max_length=64` is the
# length limit that `truncation=True` applies when texts are encoded.
tokenizer_id = config.tokenizer_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, model_max_length=64)



In [9]:
%%time 


train_encodings = tokenizer(train_texts, return_token_type_ids = False, truncation=True, padding=True)
# val_encodings = tokenizer(val_texts, return_token_type_ids = False, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, return_token_type_ids = False, truncation=True, padding=True)

CPU times: user 1min, sys: 5.1 s, total: 1min 5s
Wall time: 5.37 s


## 3.2. 사용자 데이터 세트 생성

In [10]:
class CustomerDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            with one entry per sample.
        labels: Sequence with one label vector per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with ``labels`` added last."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



In [11]:
# Wrap the encoded texts and their label vectors for each split.
train_dataset = CustomerDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomerDataset(encodings=test_encodings, labels=test_labels)

In [12]:
# Fetch the first sample of each dataset as a quick sanity check; the
# notebook only displays the value of the last expression (the test sample).
next(iter(train_dataset))
next(iter(test_dataset))

{'input_ids': tensor([    2,   146,  9571, 11655,  4279,  8553,    16, 10029,  4086,  9571,
         13227,  5012,  4576,  6390,    35,  2043,  4238, 13662,   147,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([0, 0, 0, 0, 0, 0])}

## 3.3. 데이터 로더 생성

In [13]:
# Batch sizes. Use TRAIN_BATCH_SIZE = 64 for the Naver Toxic data.
TRAIN_BATCH_SIZE = 256  # Kaggle Toxic
VALID_BATCH_SIZE = 16

# Training batches are reshuffled every epoch.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Evaluation gains nothing from shuffling. A fixed order keeps the collected
# predictions aligned with the order of the samples in `test_dataset`, and
# the computed metrics do not depend on sample order.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(train_dataset, **train_params)
testing_loader = DataLoader(test_dataset, **test_params)

In [14]:
# Re-inspect the first test sample. This reads the dataset directly, so the
# result is a single un-batched item rather than a batch from testing_loader.
next(iter(test_dataset))

{'input_ids': tensor([    2,   146,  9571, 11655,  4279,  8553,    16, 10029,  4086,  9571,
         13227,  5012,  4576,  6390,    35,  2043,  4238, 13662,   147,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([0, 0, 0, 0, 0, 0])}

# 4. 모델 생성

## 4.1. Pre-Trained Model 로딩

In [16]:
# Load the pre-trained encoder named in the config; it becomes the body of
# the custom classification model built below.
model_id = config.model_id
print("model_id: ", model_id)
plm = AutoModel.from_pretrained(model_id)

model_id:  monologg/koelectra-small-v3-discriminator


Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 4.2. Custom Classifier를 추가하여 Custom Model 생성하기

In [17]:
class CustomTransformerModel(nn.Module):
    """Pre-trained encoder followed by dropout and a linear classification head.

    Args:
        model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keywords, and element 0 of its output is
            used as the last hidden state.
        num_labels: Number of output logits (one per label).
        num_cls_vector: Width of the encoder's hidden vectors, i.e. the
            input size of the classification head.
    """

    def __init__(self, model, num_labels, num_cls_vector):
        super().__init__()
        self.num_labels = num_labels
        # Kept on the instance so forward() does not depend on a
        # module-level variable that happens to share this name.
        self.num_cls_vector = num_cls_vector

        self.model = model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(num_cls_vector, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return raw (pre-sigmoid) logits, one row per input sequence.

        ``labels`` is accepted for interface compatibility but is not used;
        the loss is computed by the caller.
        """
        # Run the encoder body.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is the last hidden state.
        sequence_output = self.dropout(outputs[0])

        # Classify from the vector at the first token position
        # (presumably the [CLS] token for this tokenizer).
        cls_vector = sequence_output[:, 0, :].view(-1, self.num_cls_vector)
        logits = self.classifier(cls_vector)

        return logits



In [18]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Head dimensions: encoder hidden width from the config, one logit per label.
num_cls_vector = config.num_cls_vector
num_labels = len(labels)

# Pre-trained encoder plus the custom classification head, moved to `device`.
model = CustomTransformerModel(
    model=plm,
    num_labels=num_labels,
    num_cls_vector=num_cls_vector,
).to(device)

## 4.3. optimizer, loss() 함수 정의
- BCEWithLogitsLoss 로 loss() 정의
    - [BCEWithLogitsLoss](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)

In [19]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and 0/1 targets.

    Args:
        outputs: Raw logits from the model (no sigmoid applied).
        targets: Float tensor of the same shape holding the target labels.

    Returns:
        A scalar tensor, identical to ``torch.nn.BCEWithLogitsLoss()`` with
        its default settings applied to the same arguments.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every parameter of the model (encoder and classification head).
LEARNING_RATE = 1e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)




# 5. 훈련 실행

In [20]:
EPOCHS = 1


def train(epoch, log_interval):
    """Run one pass over ``training_loader``, updating the global ``model``.

    Uses the module-level ``model``, ``training_loader``, ``optimizer``,
    ``device`` and ``loss_fn``.

    Args:
        epoch: Epoch number; used only in the progress message.
        log_interval: Print the current batch loss every this many steps
            (step 0 included).
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        # The loss expects float targets, so cast the integer labels.
        targets = data['labels'].to(device, dtype=torch.float)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        if step % log_interval == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        


In [21]:
%%time 
EPOCHS = 1
print("total steps: ", len(training_loader) * EPOCHS)
for epoch in range(EPOCHS):
    train(epoch, log_interval=100)        

total steps:  561
Epoch: 0, Loss:  0.7036914825439453
Epoch: 0, Loss:  0.26596951484680176
Epoch: 0, Loss:  0.20626245439052582
Epoch: 0, Loss:  0.1964389532804489
Epoch: 0, Loss:  0.183050736784935
Epoch: 0, Loss:  0.15583528578281403
CPU times: user 1min 8s, sys: 15.9 s, total: 1min 24s
Wall time: 1min 23s


# 6. 테스트 데이터로 검증

In [22]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the global ``model``.

    Uses the module-level ``model``, ``testing_loader`` and ``device``.

    Args:
        epoch: Not used by the function body.

    Returns:
        A ``(outputs, targets)`` pair of nested lists with one row per
        sample: the sigmoid of the model's logits, and the label vector
        cast to float.
    """
    model.eval()
    probabilities, references = [], []
    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.float)
            logits = model(ids, mask)
            references += targets.cpu().detach().numpy().tolist()
            probabilities += torch.sigmoid(logits).cpu().detach().numpy().tolist()

    return probabilities, references

In [23]:
EPOCHS = 1
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # Binarise: a label counts as predicted when its probability is >= 0.5.
    outputs = np.array(outputs) >= 0.5

    # NOTE: with multi-label inputs scikit-learn's accuracy_score is subset
    # accuracy, i.e. a sample counts only when all of its labels match.
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    report = metrics.classification_report(
        targets, outputs, output_dict=False, target_names=labels
    )
    print(report)

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

               precision    recall  f1-score   support

        toxic       0.83      0.55      0.66      1543
 severe_toxic       0.00      0.00      0.00       150
      obscene       0.77      0.63      0.69       864
       threat       0.00      0.00      0.00        50
       insult       0.69      0.61      0.65       817
identity_hate       0.00      0.00      0.00       144

    micro avg       0.77      0.53      0.63      3568
    macro avg       0.38      0.30      0.33      3568
 weighted avg       0.70      0.53      0.60      3568
  samples avg       0.05      0.05      0.04      3568

Accuracy Score = 0.9094441311023376
F1 Score (Micro) = 0.6297279362972794
F1 Score (Macro) = 0.3343413427479069


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 7. 데이터 세트 결과 비교

## 7.1. Korean Naver Toxic Data

```
                 precision    recall  f1-score   support

        toxic       0.99      1.00      1.00      1988
      obscene       0.00      0.00      0.00        16
       threat       0.00      0.00      0.00         3
       insult       0.00      0.00      0.00        27
identity_hate       0.00      0.00      0.00        42

    micro avg       0.99      0.96      0.98      2076
    macro avg       0.20      0.20      0.20      2076
 weighted avg       0.95      0.96      0.95      2076
  samples avg       0.99      0.97      0.98      2076

Accuracy Score = 0.9574787393696849
F1 Score (Micro) = 0.9757055214723926
F1 Score (Macro) = 0.19944820667168295
```

## 7.2. English Kaggle Toxic Data

```
               precision    recall  f1-score   support

        toxic       0.85      0.77      0.81      1543
 severe_toxic       0.71      0.08      0.14       150
      obscene       0.85      0.77      0.81       864
       threat       0.00      0.00      0.00        50
       insult       0.81      0.62      0.70       817
identity_hate       0.00      0.00      0.00       144

    micro avg       0.84      0.66      0.74      3568
    macro avg       0.54      0.37      0.41      3568
 weighted avg       0.79      0.66      0.71      3568
  samples avg       0.07      0.06      0.06      3568

Accuracy Score = 0.9211004574794761
F1 Score (Micro) = 0.7432241892527025
F1 Score (Macro) = 0.4108082686274194
```

## 7.3. Translated (Korean) Kaggle Toxic Data

```
                precision    recall  f1-score   support

        toxic       0.83      0.55      0.66      1543
 severe_toxic       0.00      0.00      0.00       150
      obscene       0.77      0.63      0.69       864
       threat       0.00      0.00      0.00        50
       insult       0.69      0.61      0.65       817
identity_hate       0.00      0.00      0.00       144

    micro avg       0.77      0.53      0.63      3568
    macro avg       0.38      0.30      0.33      3568
 weighted avg       0.70      0.53      0.60      3568
  samples avg       0.05      0.05      0.04      3568
```  