# Fine-tuning model

In [None]:
import torch
# Notebook sanity check: evaluates to True when PyTorch can see a CUDA device.
torch.cuda.is_available()

Direct Access를 위한 data preprocess functions 정의하기

In [None]:
import re

def del_bracket(s):
  """Remove bracketed segments, delimiters included, from ``s``.

  Four passes run in order: ``(...)``, ``[...]``, ``<...>`` and ``{...}``.
  Each pass deletes the span from an opening delimiter up to the first
  matching closing delimiter; nested brackets of the same kind are not
  balanced.

  Args:
    s: Text to clean.

  Returns:
    ``s`` with every bracketed segment removed. Whitespace around a removed
    segment is left untouched.
  """
  # Every character class excludes that pair's OWN closing delimiter.
  # Excluding ')' for all four pairs made one match swallow everything between
  # the first '[' and the last ']' (same for <> and {}), and it skipped
  # segments that happened to contain ')'.
  patterns = (
      r'\([^)]*\)',    # ()
      r'\[[^\]]*\]',   # []
      r'<[^>]*>',      # <>
      r'\{[^}]*\}',    # {}
  )
  for pattern in patterns:
    s = re.sub(pattern=pattern, repl='', string=s)

  return s

def del_special_num(s):
  """Blank out everything that is not an ASCII letter or a Hangul syllable.

  Each character outside ``a-z``, ``A-Z`` and ``가-힣`` (digits, punctuation,
  whitespace, ...) is replaced one-for-one by a single space.
  """
  return re.sub(r'[^a-zA-Z가-힣]', ' ', s)

def del_unit(s):
  """Lower-case ``s`` and drop measurement units that follow a number.

  A unit (``mm``, ``cm``, ``km``, ``ml``, ``kg``, ``g``) is removed only when
  it comes directly after a digit (optional whitespace in between) and is not
  itself followed by another ASCII letter. The previous plain substring
  replacement also deleted these letters from ordinary words
  (e.g. every "g" in "good morning").

  Args:
    s: Text to clean.

  Returns:
    The lower-cased text with number-attached units removed; the whitespace
    between the number and the unit is removed as well.
  """
  units = ['mm', 'cm', 'km', 'ml', 'kg', 'g']
  # Lower-case once up front instead of once per unit.
  s = s.lower()
  unit_alternatives = '|'.join(re.escape(unit) for unit in units)
  pattern = r'(?<=\d)\s*(?:' + unit_alternatives + r')(?![a-z])'
  return re.sub(pattern, '', s)

def del_whitespace(s):
  return " ".join(s.split())

import io 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
# Stopword sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path="data/stopwords.txt"):
  """Return the whitespace-separated stopwords in ``path`` as a frozenset.

  The file is read once per path and then served from ``_STOPWORDS_CACHE``;
  later edits to the file are not picked up within the same session.
  """
  if path not in _STOPWORDS_CACHE:
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding="utf-8") as stopword_file:
      _STOPWORDS_CACHE[path] = frozenset(stopword_file.read().split())
  return _STOPWORDS_CACHE[path]


def del_stopwords(s):
  """Remove every whitespace-delimited token of ``s`` that is a stopword.

  Stopwords come from ``data/stopwords.txt`` (UTF-8, whitespace separated).

  Args:
    s: Text to filter.

  Returns:
    The remaining tokens joined by single spaces.

  Raises:
    OSError: If the stopword file cannot be opened on first use.
  """
  stop_set = _load_stopwords()
  # str.split() already yields tokens without surrounding whitespace.
  return " ".join(token for token in s.split() if token not in stop_set)

In [None]:
# Hugging Face checkpoint name; it is passed to both BertModel.from_pretrained
# and BertTokenizerFast.from_pretrained further down.
# Alternative checkpoint names are kept commented out below.
#modelname = "beomi/KcELECTRA-base"
#modelname = "klue/bert-base"
#modelname = "beomi/kcbert-base"
#modelname = "monologg/kobert"
#modelname = "kakaobrain/kogpt"
modelname = "kykim/bert-kor-base"

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_cosine_schedule_with_warmup, BertTokenizer, BertModel

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT model for the selected checkpoint
bertmodel = BertModel.from_pretrained(modelname)

## Data preparation process

In [None]:
import pandas as pd
#filter_values = ['기쁨', '슬픔', '분노', '불안']
#filter_values = ['기쁨', '불안', '슬픔', '분노', '상처']

# Spreadsheet column -> short column name used by the rest of the notebook.
_CORPUS_COLUMNS = {'감정_대분류': 's1', '감정_소분류': 's2', '사람문장1': 't1'}


def _load_corpus(path):
    """Read one spreadsheet and keep only the three columns, renamed to s1/s2/t1."""
    sheet = pd.read_excel(path)
    return sheet[list(_CORPUS_COLUMNS)].rename(columns=_CORPUS_COLUMNS)


train = _load_corpus('data/Training.xlsx')
#train = train[train['s1'].isin(filter_values)]
train.head()

val = _load_corpus('data/Validation.xlsx')
#val = val[val['s1'].isin(filter_values)]

In [None]:
data_1_path = "data/single.xlsx"
data_2_path = "data/continuous.xlsx"
# filter_values = ['공포', '분노', '슬픔', '행복', '혐오']
filter_values = ['공포', '놀람', '분노', '슬픔', '행복', '혐오', '중립']

# First corpus: keep the Sentence/Emotion columns under the short names t1/s1,
# then drop every row whose label is not one of filter_values.
data_1 = pd.read_excel(data_1_path)[['Sentence', 'Emotion']]
data_1 = data_1.rename(columns={'Sentence': 't1', 'Emotion': 's1'})
data_1 = data_1.loc[data_1['s1'].isin(filter_values)]

# Second corpus: spreadsheet columns B:C without a header row (first two rows
# skipped), named t1/s1 on load and filtered by label the same way.
data_2 = pd.read_excel(data_2_path, usecols="B:C", skiprows=2, header=None, names=["t1", "s1"])
data_2 = data_2.loc[data_2['s1'].isin(filter_values)]

# print(data_2[:4])

# print(data_2[:4])

In [None]:
def _clean_text(t):
  """Run the shared cleaning steps: brackets, non-letters, whitespace, stopwords."""
  t = del_bracket(t)
  t = del_special_num(t)
  t = del_whitespace(t)
  return del_stopwords(t)


def _append_labeled(rows, texts, labels, label_map):
  """Append ``[cleaned_text, class_id]`` to ``rows`` for each (text, label) pair.

  Labels are stripped before the lookup for every corpus, so a label with
  stray surrounding whitespace no longer raises ``KeyError`` in some corpora
  but not in others.
  """
  for t, s in zip(texts, labels):
    rows.append([_clean_text(t), label_map[s.strip()]])


# Class ids for the Training/Validation corpus:
# joy, anxiety, embarrassment, sadness, anger, hurt
s12label={'기쁨':1, '불안': 2, '당황': 3, '슬픔': 4, '분노': 5, '상처': 6}
# s12label={'기쁨':0, '불안': 1, '슬픔': 1, '분노': 1, '상처':1}
train_l=[]

_append_labeled(train_l, train['t1'], train['s1'], s12label)
_append_labeled(train_l, val['t1'], val['s1'], s12label)

# Class ids for the additional corpora:
# neutral, happiness, fear, sadness, anger, surprise, disgust.
# Sadness (4) and anger (5) use the same ids as in the first mapping.
s12label = {'중립': 0, '행복': 1, '공포' : 2, '슬픔': 4, '분노': 5, '놀람': 7, '혐오': 8}
# s12label = {'행복': 0, '공포': 1, '분노': 1, '슬픔': 1, '혐오': 1}
_append_labeled(train_l, data_1['t1'], data_1['s1'], s12label)
_append_labeled(train_l, data_2['t1'], data_2['s1'], s12label)

print(train_l[:10])

In [None]:
# Per-class augmentation settings, indexed by class id (0..8):
#   data_augment_flag[k]  -> num_aug_samples passed to augment_korean_text
#   data_augment_limit[k] -> maximum number of rows kept for class k
data_augment_flag = [1, 5, 5, 9, 5, 4, 9, 9, 9]
data_augment_limit = [150000, 150000, 150000, 150000, 150000, 150000, 150000, 150000, 150000]

# Separate the data by class in a single pass over train_l.
# Samples whose label is not one of the class ids are ignored.
_buckets = {class_id: [] for class_id in range(len(data_augment_flag))}
for sample in train_l:
    _bucket = _buckets.get(sample[1])
    if _bucket is not None:
        _bucket.append(sample)
data_flag = [_buckets[class_id] for class_id in range(len(data_augment_flag))]

# Keep the individual per-class names available for other cells.
(data_flag_0, data_flag_1, data_flag_2, data_flag_3, data_flag_4,
 data_flag_5, data_flag_6, data_flag_7, data_flag_8) = data_flag

# Print the class-0 size followed by each other class's size relative to it.
# When class 0 is empty the ratios are reported as NaN instead of raising
# ZeroDivisionError.
_base_count = len(data_flag_0)
_ratios = [
    len(class_samples) / _base_count if _base_count else float('nan')
    for class_samples in data_flag[1:]
]
print(_base_count, *_ratios)
augmented_data = []

In [None]:
# For unbalanced-class
import random
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

# Fit soynlp's WordExtractor on every cleaned sentence collected in train_l.
corpus = [row[0] for row in train_l]
word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0,
)
word_extractor.train(corpus)
words = word_extractor.extract()

# The extracted word keys become the replacement vocabulary. It is named
# "nouns", but no part-of-speech filtering is applied in this cell.
nouns = list(words.keys())

# Now we'll define our text augmentation function using these nouns
def augment_korean_text(text, nouns, num_aug_samples=1, tokenizer=None):
    """Create augmented variants of ``text`` by random word replacement.

    Every token of ``text`` that appears in ``nouns`` is replaced by a word
    drawn uniformly at random from ``nouns``; all other tokens are kept.

    Args:
        text: Sentence to augment.
        nouns: Sequence of replacement candidates (must support
            ``random.choice``).
        num_aug_samples: Number of augmented variants to generate.
        tokenizer: Optional object with a ``tokenize(text)`` method. When
            omitted, a fresh ``LTokenizer()`` is created, as before.

    Returns:
        A list whose first element is the original ``text``, followed by
        ``num_aug_samples`` augmented sentences (tokens joined by spaces).
    """
    augmented_samples = [text]  # Start with the original text
    if num_aug_samples <= 0:
        return augmented_samples

    if tokenizer is None:
        tokenizer = LTokenizer()

    # Tokenisation and the membership set do not depend on the loop index,
    # so compute them once instead of once per augmented sample.
    tokens = tokenizer.tokenize(text)
    noun_set = set(nouns)

    for _ in range(num_aug_samples):
        augmented_words = [
            random.choice(nouns) if token in noun_set else token
            for token in tokens
        ]
        augmented_samples.append(" ".join(augmented_words))

    return augmented_samples

In [None]:
# Augment each class with its own settings: the number of variants per
# sentence comes from data_augment_flag and the per-class row cap from
# data_augment_limit.
for class_samples, variants_per_sample, class_limit in zip(
        data_flag, data_augment_flag, data_augment_limit):
    count_sample = 0
    for sample in class_samples:
        # Stop generating as soon as the class is full; the old loop kept
        # calling augment_korean_text only to discard the results.
        if count_sample >= class_limit:
            break
        text = sample[0]
        label = sample[1]
        augmented_texts = augment_korean_text(text, nouns, num_aug_samples=variants_per_sample)
        for augmented_text in augmented_texts:
            if count_sample >= class_limit:
                break
            augmented_data.append([augmented_text, label])
            count_sample += 1

# Shuffle the data
random.shuffle(augmented_data)

# Convert the data to a DataFrame and save it to a new Excel file
# NOTE(review): the per-class limits allow up to 9 * 150000 rows in total,
# which is above the .xlsx sheet row limit (1,048,576) — confirm the real
# row count stays below it, or switch to CSV.
df_augmented = pd.DataFrame(augmented_data, columns=["document", "label"])
df_augmented.to_excel("data/train_l_augmentedplus_9sent.xlsx", index=False)

## Training Process

In [None]:
# Read the saved augmented set back from disk and turn it into a list of
# [document, label] rows, replacing the in-memory train_l.
df_augmented = pd.read_excel("data/train_l_augmentedplus_9sent.xlsx")
train_l = df_augmented.to_numpy().tolist()

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenises every sentence up front and serves tensors.

    Args:
        dataset: Sequence of records; each record is indexable.
        sent_idx: Position of the sentence inside a record.
        label_idx: Position of the integer label inside a record.
        tokenizer: Object with a Hugging Face style ``encode_plus`` method
            returning ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_len: Length every sentence is padded/truncated to.
    """

    def __init__(self, dataset, sent_idx, label_idx, tokenizer, max_len):
        self.sentences = [
            tokenizer.encode_plus(
                record[sent_idx],
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
            )
            for record in dataset
        ]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, idx):
        """Return ``(token_ids, valid_length, segment_ids, attention_mask, label)``."""
        encoded = self.sentences[idx]
        token_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        # Count the real (unpadded) tokens via the attention mask. With
        # padding='max_length', len(input_ids) is the padded length for
        # every sentence and therefore carried no information.
        valid_length = torch.tensor(int(sum(encoded['attention_mask'])), dtype=torch.long)
        segment_ids = torch.tensor(encoded['token_type_ids'], dtype=torch.long)
        attention_mask = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        label = torch.tensor(self.labels[idx])
        return token_ids, valid_length, segment_ids, attention_mask, label

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

In [None]:
# Setting parameters
max_len = 64  # length each sentence is padded/truncated to when tokenised
batch_size = 64  # examples per DataLoader batch
warmup_ratio = 0.1  # fraction of the total scheduler steps used for warm-up
num_epochs = 50  # upper bound on epochs; the training loop can stop earlier
max_grad_norm = 1  # max_norm passed to clip_grad_norm_
log_interval = 10  # training metrics are logged every log_interval batches
learning_rate =  1e-5  # lr passed to the AdamW optimizer

In [None]:
from transformers import AutoTokenizer, BertTokenizerFast
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same between runs.
dataset_train, dataset_test = train_test_split(train_l, test_size=0.2, random_state=0)
print(len(dataset_train), len(dataset_test))
len_data = len(dataset_train) + len(dataset_test)

# Tokenizer that belongs to the selected checkpoint
tokenizer = BertTokenizerFast.from_pretrained(modelname)


def _with_text_as_str(rows):
    """Return the rows with the text field coerced to ``str`` (label untouched)."""
    return [[str(row[0]), row[1]] for row in rows]


dataset_train = _with_text_as_str(dataset_train)
dataset_test = _with_text_as_str(dataset_test)

# Wrap both splits in BERTDataset (text at index 0, label at index 1)
data_train = BERTDataset(dataset_train, 0, 1, tokenizer, max_len)
data_test = BERTDataset(dataset_test, 0, 1, tokenizer, max_len)


In [None]:
# Notebook display: the first encoded training example
# (token_ids, valid_length, segment_ids, attention_mask, label).
data_train[0]

In [None]:
# Reshuffle the training set every epoch so batches differ between epochs;
# without shuffle=True the model saw the exact same batch sequence each time.
# The evaluation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

In [None]:
# Set num_of_classes
num_classes = 9

class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classifier.

    Args:
        bert: Encoder module; called with ``input_ids``, ``token_type_ids``
            and ``attention_mask``, and its output at index 1 is used as the
            pooled sentence representation.
        hidden_size: Size of the pooled representation.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied before the classifier; dropout
            is skipped when this is ``None`` or ``0``.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=num_classes,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones in the first ``valid_length[i]`` slots of row ``i``."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids, attention_mask):
        """Return class logits for a batch.

        ``valid_length`` is accepted for interface compatibility but is not
        used; the supplied ``attention_mask`` is passed to the encoder.
        """
        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so the classifier always has an input;
        # previously `out` was only assigned inside the dropout branch and
        # forward() raised UnboundLocalError when dr_rate was not set.
        out = outputs[1]
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

In [None]:
from collections import Counter
# Tally how many examples of each class the evaluation loader yields.
labels = []
for *_, label in tqdm(test_dataloader):
    labels.extend(label.numpy())
print(Counter(labels))

In [None]:
# Wrap the pretrained encoder in the classifier head and move it to the device
model = BERTClassifier(bertmodel,  dr_rate=0.7).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _split_by_weight_decay(named_params):
    """Return (decayed, undecayed) parameter lists, preserving iteration order."""
    decayed, undecayed = [], []
    for param_name, param in named_params:
        exempt = any(nd in param_name for nd in no_decay)
        (undecayed if exempt else decayed).append(param)
    return decayed, undecayed


_decayed_params, _undecayed_params = _split_by_weight_decay(model.named_parameters())
optimizer_grouped_parameters = [
    {'params': _decayed_params, 'weight_decay': 0.01},
    {'params': _undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per training batch over all epochs; the first
# warmup_ratio share of those steps is warm-up.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(out, label):
    """Return the fraction of rows in ``out`` whose argmax equals ``label``.

    ``out`` holds per-class scores with classes along dimension 1; ``label``
    holds the expected class index for each row.
    """
    hits = (out.argmax(1) == label).sum().item()
    return hits / label.size(0)

    
# Notebook display of the DataLoader object; has no effect on training.
train_dataloader

In [None]:
# For time-based tensorboard store
# Free-form tag embedded in the run name. Presumably it encodes the per-class
# augmentation counts — NOTE(review): it does not match data_augment_flag
# ([1, 5, 5, 9, 5, 4, 9, 9, 9]); confirm it is current.
num_augmentations="043532558"
from datetime import datetime

# Date stamp (YYYYMMDD) of when this cell runs
timestamp = datetime.now().strftime("%Y%m%d")
# Run directory: the first component holds date, model and dataset size; the
# second holds the hyperparameters. modelname contains a '/', so the path
# gains one extra directory level.
log_dir = f'./logs/{timestamp}-model{modelname}-data{len_data}/a{num_augmentations}-class{num_classes}-batch{batch_size}-lr{learning_rate}-epoch{num_epochs}-maxlen{max_len}'

In [None]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
import os

from tqdm.notebook import tqdm

best_acc = 0
max_patience = 5
patience = 0
PATH = log_dir
# Checkpoints are written into the run directory, so make sure it exists.
os.makedirs(PATH, exist_ok=True)
writer = SummaryWriter(PATH)


def _forward_batch(batch):
    """Move one DataLoader batch to ``device`` and return ``(logits, labels)``."""
    token_ids, valid_length, segment_ids, attention_mask, label = batch
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    attention_mask = attention_mask.to(device)
    label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids, attention_mask)
    return out, label


for e in range(num_epochs):
    # ---- training pass ----
    train_acc = 0.0
    steps_per_epoch = len(train_dataloader)
    model.train()
    for batch_id, batch in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        out, label = _forward_batch(batch)

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            global_step = e * steps_per_epoch + batch_id + 1
            loss_value = loss.item()
            running_acc = train_acc / (batch_id + 1)
            writer.add_scalar('loss/train_loss', loss_value, global_step)
            writer.add_scalar('acc/train_acc', running_acc, global_step)
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss_value, running_acc))
    # max(..., 1) keeps the averages defined even for an empty loader.
    print("epoch {} train acc {}".format(e+1, train_acc / max(steps_per_epoch, 1)))

    # ---- evaluation pass ----
    test_acc = 0.0
    test_loss = 0.0
    num_test_batches = len(test_dataloader)
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every evaluation batch.
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            out, label = _forward_batch(batch)
            test_acc += calc_accuracy(out, label)
            test_loss += loss_fn(out, label).item()

    # Report the mean of the per-batch values. Previously the logged test
    # loss was just the loss of the final batch.
    epoch_acc = test_acc / max(num_test_batches, 1)
    epoch_loss = test_loss / max(num_test_batches, 1)
    writer.add_scalar('acc/test_acc', epoch_acc, e+1)
    writer.add_scalar('loss/test_loss', epoch_loss, e+1)
    print("epoch {} loss {} test acc {}".format(e+1, epoch_loss, epoch_acc))

    # Early stopping: save on improvement, otherwise count a stale epoch and
    # stop once more than max_patience stale epochs occur in a row.
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        patience = 0
        # Join with a path separator so the checkpoint lands INSIDE the run
        # directory; plain string concatenation glued the file name onto the
        # directory name instead.
        torch.save(model, os.path.join(PATH, '{}_{}_model.pt'.format(e+1, epoch_acc)))
    else:
        patience += 1
    if patience > max_patience:
        break

# Make sure all pending TensorBoard events are written to disk.
writer.flush()