# 모델 선정
1. 영화 리뷰 데이터셋의 특성을 고려하여, KoBERT와 KoELECTRA 중 구어체·메신저·웹 데이터가 포함된 데이터셋으로 훈련된 KoELECTRA 모델을 선정
2. KoBERT, KoELECTRA 모델을 비교 실험한 결과 KoELECTRA 모델의 성능이 우수하며 훈련 시간 또한 단축됨

# 훈련 시킬 데이터셋 구축
1. EDA에서 발견한 데이터 불균형을 고려하여, 각 클래스마다 동일한 개수의 데이터를 사용하여 데이터셋 구축
2. 각 클래스 별 사용 데이터의 개수를 늘리면서 실험 진행
3. 중복 데이터는 제외한 후 훈련
4. EDA 결과 정수 인코딩된 문장 중 길이가 32 이하인 문장이 전체의 약 86%이므로, 훈련 시간을 고려하여 sequence 길이를 32로 고정 (32보다 짧은 문장은 패딩하고, 긴 문장은 32에서 절단)

# 실험 및 결과
1. 각각의 클래스마다 동일한 개수의 데이터를 사용하여 submission 제출 결과 0.37의 점수 도출
2. 클래스 별 사용 데이터의 개수를 최대 300만개 까지 늘린 후 submission 제출 결과 2 epoch에서 0.604의 점수 도출 -> TestSet 또한 데이터 불균형이 있을 가능성을 확인
3. 데이터 중복 처리를 한 결과가 그렇지 않은 결과보다 성능이 향상됨
4. sequence 길이를 64로 패딩할 때와 32로 패딩할 때를 비교한 결과 성능은 비슷하지만, 32로 패딩할 때의 훈련 시간이 단축됨 -> 모델이 입력 문장 전체를 보고 판단할 필요는 없음을 확인
5. scheduler의 적용 여부를 실험한 결과, 적용하지 않은 모델의 성능이 우수

# 추후 개선 가능성
1. 전체 데이터셋을 사용하여 훈련하지 않았으며, 데이터의 수가 늘어날수록 성능이 향상되는 양상을 보임
2. 다양한 scheduler를 통해 실험한다면 더 나은 결과를 보일 가능성이 있음
3. weight decay 적용을 통한 과적합 해결 기대

In [1]:
import mxnet
import numpy as np
import pandas as pd
import torch
import transformers
from torch import nn
from torch.utils.data import Dataset
from tqdm import tqdm, tqdm_notebook
from transformers import AdamW
from transformers import ElectraModel, ElectraTokenizer
from transformers.optimization import get_cosine_schedule_with_warmup

# Report the versions of the libraries imported above.
for _label, _version in (
    ("transformers.__version__", transformers.__version__),
    ("mxnet.__version__", mxnet.__version__),
):
    print(_label, _version)


# ============================================================================
# Fix the random seeds
seed = 42
# Apply the same seed to NumPy, torch and torch.cuda, in that order.
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# ============================================================================
# Load data
# Rows whose 0-based line index lies strictly between 8,000,000 and 9,000,000
# form the validation split; every other row goes to the training split.
# The same index rule is applied to the text file and to the label file so
# that sentences and labels stay aligned.
train_x = []
train_y = []
valid_x = []
valid_y = []
with open("./inputs/train_data", 'rt', encoding='UTF-8') as f:
    for e, i in enumerate(f):
        # Strip only the line terminator: slicing with [:-1] would cut a real
        # character off a final line that has no trailing newline.
        text = i.rstrip('\n')
        if 8000000 < e < 9000000:
            valid_x.append(text)
        else:
            train_x.append(text)
with open("./inputs/train_label", 'rt', encoding='UTF-8') as f:
    for e, i in enumerate(f):
        # Shift the label down by one. As above, only the newline is removed,
        # so a multi-digit label on an unterminated last line keeps all digits.
        label = int(i.rstrip('\n')) - 1
        if 8000000 < e < 9000000:
            valid_y.append(label)
        else:
            train_y.append(label)

print("len(set(train_y))", len(set(train_y)), set(train_y))
print("len(set(valid_y))", len(set(valid_y)), set(valid_y))

# Pair every sentence with its label as a (sentence, label) tuple.
trainSet = list(zip(train_x, train_y))
validSet = list(zip(valid_x, valid_y))

# ============================================================================
# Remove duplicate samples from each split

# Identical (sentence, label) tuples collapse into one entry; the resulting
# list follows whatever order the intermediate set yields.
trainSet = [*set(trainSet)]
validSet = [*set(validSet)]

for _label, _split in (("len(trainSet)", trainSet), ("len(validSet)", validSet)):
    print(_label, len(_split))

# ============================================================================
# Cap the number of samples kept per class, then rebuild the training set
print("preprocess")

# One bucket per class id 0..9; a bucket stops accepting samples once it
# holds 3,000,000 of them.
dic = {class_id: [] for class_id in range(10)}
for sample in trainSet:
    bucket = dic[sample[1]]
    if len(bucket) < 3000000:
        bucket.append(sample)

# Flatten the buckets back into one list, class by class.
trainSet = [sample for bucket in dic.values() for sample in bucket]

print(len(trainSet))
print(len(validSet))


# ============================================================================
# Device used when running on GPU
device = torch.device("cuda", 0)
print(device)

# ============================================================================
# Load model, tokenizer
# Both objects are loaded from the same pretrained checkpoint name.
_CHECKPOINT = "monologg/koelectra-base-v3-discriminator"
electra_model = ElectraModel.from_pretrained(_CHECKPOINT)
tokenizer = ElectraTokenizer.from_pretrained(_CHECKPOINT)
print("load model")

# ============================================================================
# Setting parameters
max_len, batch_size = 32, 820            # padded sequence length / samples per batch
num_epochs, log_interval = 10, 3000      # training epochs / batches between log lines
learning_rate, warmup_ratio = 5e-5, 0.1  # optimizer step size / warm-up share of all steps
max_grad_norm = 1                        # gradient-norm clipping threshold


# ============================================================================
# Padding helper for model inputs
def padding(inputs, pad_token, pad_length=0, pad=True):
    """Pad or truncate a token-id sequence to ``pad_length`` entries.

    Args:
        inputs: Sequence of token ids. It is copied, never modified.
        pad_token: Sequence whose first element is the id used as filler.
        pad_length: Target length of the returned array.
        pad: Only the literal ``True`` enables padding/truncation; with any
            other value the sequence is returned at its original length.

    Returns:
        A NumPy array holding the (possibly padded or truncated) ids.
    """
    pad_token = pad_token[0]
    # Work on a copy so the caller's list is not extended in place.
    tokens = list(inputs)

    if pad is True:
        missing = pad_length - len(tokens)
        if missing > 0:
            tokens.extend([pad_token] * missing)
        else:
            tokens = tokens[:pad_length]
    return np.array(tokens)


# ============================================================================
# Torch Dataset
class BERTDataset(Dataset):
    """Dataset of tokenized, fixed-length sentences with their labels.

    Indexing yields ``(token_ids, valid_length, segment_ids, label)``.
    """

    def __init__(self, dataset, sent_idx, label_idx, electra_tokenizer, max_len, pad):
        """Tokenize and pad every sample up front.

        Args:
            dataset: Iterable of records; it is iterated twice.
            sent_idx: Index of the sentence inside each record.
            label_idx: Index of the label inside each record.
            electra_tokenizer: Tokenizer providing ``tokenize``,
                ``convert_tokens_to_ids`` and ``pad_token_id``.
            max_len: Length every id sequence is padded/truncated to.
            pad: Forwarded to ``padding``.
        """
        # NOTE(review): tokenize() + convert_tokens_to_ids() inserts no
        # [CLS]/[SEP] tokens -- confirm this is intended, since the classifier
        # in this file reads the encoder output at position 0.
        token_ids = [
            electra_tokenizer.convert_tokens_to_ids(electra_tokenizer.tokenize(i[sent_idx]))
            for i in dataset
        ]
        # Token count measured before padding/truncation.
        self.length = [np.int32(len(ids)) for ids in token_ids]
        # Use the padding id of the tokenizer that was passed in, looked up
        # once. The previous code queried the module-level tokenizer for the
        # literal 'PAD', which is not the tokenizer's padding token.
        pad_ids = [electra_tokenizer.pad_token_id]
        self.sentences = [padding(ids, pad_ids, max_len, pad) for ids in token_ids]
        self.segment = np.zeros((len(self.sentences), max_len))
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        """Return the ``i``-th ``(token_ids, valid_length, segment_ids, label)``."""
        return self.sentences[i], self.length[i], self.segment[i], self.labels[i]

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


# Tokenize both splits: the sentence is field 0 and the label is field 1.
data_train = BERTDataset(dataset=trainSet, sent_idx=0, label_idx=1,
                         electra_tokenizer=tokenizer, max_len=max_len, pad=True)
data_test = BERTDataset(dataset=validSet, sent_idx=0, label_idx=1,
                        electra_tokenizer=tokenizer, max_len=max_len, pad=True)

# Both loaders share the batch size; only the training loader shuffles.
_loader_options = {"batch_size": batch_size, "num_workers": 0}
train_dataloader = torch.utils.data.DataLoader(data_train, shuffle=True, **_loader_options)
test_dataloader = torch.utils.data.DataLoader(data_test, **_loader_options)
print("create dataset")


# ============================================================================
# KoELECTRA Sentence Classification Model
class BERTClassifier(nn.Module):
    """Linear classification head on top of an encoder's position-0 output."""

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=10,
                 dr_rate=None,
                 params=None):
        """Store the encoder and build the classification head.

        Args:
            bert: Encoder called with ``input_ids``, ``token_type_ids`` and
                ``attention_mask``; element 0 of its output is used.
            hidden_size: Input width of the linear head.
            num_classes: Output width of the linear head.
            dr_rate: Dropout probability; dropout is skipped when falsy.
            params: Unused; kept for interface compatibility.
        """
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with 1 at the first ``valid_length[i]`` positions of row ``i``.

        The mask has the shape of ``token_ids`` (expected to be 2-D) and lives
        on its device. Lengths larger than the row width mark the whole row.
        """
        # Compare a row of position indices against a column of lengths
        # instead of filling the mask row by row in a Python loop.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class scores computed from the encoder output at position 0."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        output = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                           attention_mask=attention_mask.float().to(token_ids.device))
        first_token_tensor = output[0]

        # Always bind `out`: previously it was only assigned inside the dropout
        # branch, so a model built without dr_rate failed with UnboundLocalError.
        out = first_token_tensor[:, 0]
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)


model = BERTClassifier(electra_model, dr_rate=0.5).to(device)

# ============================================================================
# Prepare optimizer and schedule
# Neither weight decay nor a scheduler is used;
# the learning rate stays fixed at 5e-5.
no_decay = ['bias', 'LayerNorm.weight']
# Split the parameters by name into two groups, keeping the encounter order.
_decay_params = []
_no_decay_params = []
for n, p in model.named_parameters():
    if any(nd in n for nd in no_decay):
        _no_decay_params.append(p)
    else:
        _decay_params.append(p)
optimizer_grouped_parameters = [
    {'params': _decay_params, 'weight_decay': 0.0},
    {'params': _no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps and the warm-up share of them; both only
# feed the scheduler that is disabled below.
t_total = num_epochs * len(train_dataloader)
warmup_step = int(t_total * warmup_ratio)

# scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# ============================================================================
# Accuracy helper
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose highest-scoring column equals ``Y``.

    ``X`` holds one row of scores per sample, ``Y`` the expected column index
    of each row.
    """
    predicted = torch.max(X, 1)[1]
    hits = (predicted == Y).sum()
    batch_len = predicted.size()[0]
    return hits.data.cpu().numpy() / batch_len


# ============================================================================
# Loop
import os

# tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
from tqdm.notebook import tqdm as notebook_tqdm

print("loop start")
# Create the checkpoint directory up front so torch.save cannot fail on a
# missing path after a whole epoch of training.
os.makedirs("./output/2", exist_ok=True)
for e in range(num_epochs):
    # Running sums for this epoch; they are divided by the batch count below.
    train_acc = 0.0
    test_acc = 0.0
    test_loss = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        # scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e + 1, batch_id + 1, loss.data.cpu().numpy(),
                                                                     train_acc / (batch_id + 1)))
    print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))
    # One checkpoint per epoch; the whole module object is saved.
    PATH = f"./output/2/koelectrabert_epoch{e+1}.pt"
    torch.save(model, PATH)

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)

            test_loss += loss.item()
            test_acc += calc_accuracy(out, label)
        # Loss and accuracy are averaged over batches, not over samples.
        print("epoch {} valid loss {} acc {}\n".format(e + 1, test_loss / (batch_id + 1), test_acc / (batch_id + 1)))


transformers.__version__ 4.6.1
mxnet.__version__ 1.7.0
len(set(train_y)) 10 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
len(set(valid_y)) 10 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
len(trainSet) 7579756
len(validSet) 955010
preprocess
6809008
955010
cuda:0


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load model
create dataset
loop start


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.34513783454895 train acc 0.1
epoch 1 batch id 3001 loss 1.2640464305877686 train acc 0.5491945774172596
epoch 1 batch id 6001 loss 1.1447519063949585 train acc 0.5577541954389714
epoch 1 train acc 0.5611302977508213


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 1 valid loss 1.07378880286933 acc 0.6214429725720871



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.188226580619812 train acc 0.6048780487804878
epoch 2 batch id 3001 loss 1.1998754739761353 train acc 0.5759425719881766
epoch 2 batch id 6001 loss 1.163714051246643 train acc 0.576409622786448
epoch 2 train acc 0.5764831755581094


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 2 valid loss 1.071791572683359 acc 0.6216310197922628



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.146508812904358 train acc 0.5975609756097561
epoch 3 batch id 3001 loss 1.17511785030365 train acc 0.582225030680806
epoch 3 batch id 6001 loss 1.1705378293991089 train acc 0.5824147194979672
epoch 3 train acc 0.582276657094512


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 3 valid loss 1.0645165302211124 acc 0.6233195933708473



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.116074800491333 train acc 0.5817073170731707
epoch 4 batch id 3001 loss 1.0890370607376099 train acc 0.5874846595849914
epoch 4 batch id 6001 loss 1.1176083087921143 train acc 0.587340321328555
epoch 4 train acc 0.5871290871230153


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 4 valid loss 1.0689687622463242 acc 0.623153153509024



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.109312891960144 train acc 0.6085365853658536
epoch 5 batch id 3001 loss 1.0845779180526733 train acc 0.5929665721182106
epoch 5 batch id 6001 loss 1.124261498451233 train acc 0.5921029421925553
epoch 5 train acc 0.5916660556499522


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 5 valid loss 1.0787487955052453 acc 0.6219139300308717



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 1.0802961587905884 train acc 0.5963414634146341
epoch 6 batch id 3001 loss 1.0648103952407837 train acc 0.5974459732934285
epoch 6 batch id 6001 loss 1.0998690128326416 train acc 0.5965786190106455
epoch 6 train acc 0.596200813797095


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 6 valid loss 1.0888213877514197 acc 0.6196568696191476



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 1.0378354787826538 train acc 0.6060975609756097
epoch 7 batch id 3001 loss 1.0458824634552002 train acc 0.6018347542688768
epoch 7 batch id 6001 loss 1.0948565006256104 train acc 0.6010475896293637
epoch 7 train acc 0.6005545780717311


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 7 valid loss 1.1035275179940744 acc 0.6171889103294659



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 1.091403841972351 train acc 0.5926829268292683
epoch 8 batch id 3001 loss 1.1037347316741943 train acc 0.6060874017603651
epoch 8 batch id 6001 loss 1.0114260911941528 train acc 0.6055185924297081
epoch 8 train acc 0.6051211423191284


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 8 valid loss 1.1188026595524965 acc 0.6120692304501846



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 1.0141222476959229 train acc 0.6097560975609756
epoch 9 batch id 3001 loss 1.0666812658309937 train acc 0.6125458180606278
epoch 9 batch id 6001 loss 0.9962368607521057 train acc 0.6107435346141441
epoch 9 train acc 0.6098035692723996


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 9 valid loss 1.1275187568603156 acc 0.612319887657542



  0%|          | 0/8304 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 1.0461211204528809 train acc 0.598780487804878
epoch 10 batch id 3001 loss 1.0373928546905518 train acc 0.6176530587365047
epoch 10 batch id 6001 loss 0.9912645220756531 train acc 0.6160324905198683
epoch 10 train acc 0.6149914142502967


  0%|          | 0/1165 [00:00<?, ?it/s]

epoch 10 valid loss 1.1417690326727512 acc 0.6118533938760765

