In [2]:
import transformers
from transformers import ElectraModel, ElectraTokenizer
from transformers.optimization import get_cosine_schedule_with_warmup
import torch
import pandas as pd
import torch.nn as nn
import random
import os
import optuna
import tqdm
import time

# 전처리

In [3]:
electra_version = 'base-v3' # can be base / small / base-v2 / small-v2 / base-v3 / small-v3 바꾸면 맨 위에 tokenizer도 바꿔줘야함
tokenizer = ElectraTokenizer.from_pretrained(f"monologg/koelectra-{electra_version}-discriminator")

In [4]:
# 데이터 받아서 문장 앞 뒤에 class 토큰 seperate 토큰 넣어주기
df = pd.read_csv('kodoli_v1.1.csv', encoding='CP949')
data_size = len(df)
data_list = []
for i in range(data_size):
  data_list.append(list(df.iloc[i, 1:5]))
for i in range(data_size):
  data_list[i][0] = '[CLS] ' + data_list[i][0] + ' [SEP]'

In [5]:
# 문장 토큰 개수 분포
len_list = [0 for i in range(250)]
for data in data_list:
  lenOfSen = len(tokenizer.tokenize(data[0]))
  len_list[lenOfSen] += 1
sum = 0
for i in range(1, 250):
  sum += len_list[i]
  print(f'N of Seq with len{i} : {len_list[i]} / cumulative rate: {sum / data_size *100}%')

N of Seq with len1 : 0 / cumulative rate: 0.0%
N of Seq with len2 : 0 / cumulative rate: 0.0%
N of Seq with len3 : 81 / cumulative rate: 0.21218074656188607%
N of Seq with len4 : 134 / cumulative rate: 0.5631958087753766%
N of Seq with len5 : 527 / cumulative rate: 1.9436804191224626%
N of Seq with len6 : 969 / cumulative rate: 4.4819908316961365%
N of Seq with len7 : 1227 / cumulative rate: 7.696136214800261%
N of Seq with len8 : 1634 / cumulative rate: 11.976424361493123%
N of Seq with len9 : 1931 / cumulative rate: 17.034708578912902%
N of Seq with len10 : 2080 / cumulative rate: 22.48330058939096%
N of Seq with len11 : 2150 / cumulative rate: 28.115258677144727%
N of Seq with len12 : 2119 / cumulative rate: 33.66601178781926%
N of Seq with len13 : 2033 / cumulative rate: 38.99148657498363%
N of Seq with len14 : 2007 / cumulative rate: 44.24885396201702%
N of Seq with len15 : 1873 / cumulative rate: 49.155206286836936%
N of Seq with len16 : 1708 / cumulative rate: 53.62933857236412%

In [6]:
# data의 약 95%를 수용하는 길이인 64로 설정해서 64보다 많은 토큰이 있는 data는 버리기
max_length = 64
# label들을 숫자로 바꿔주기
abuse_dict = {'NON': 0, 'ABS' : 1}
sentiment_dict = {'NEG': 0, 'NEU': 1, 'POS': 2}
offensive_dict = {'NOT': 0, 'LIKELY': 1, 'OFFEN': 2}
# 모든 문장을 토큰화하고 숫자로 바꾼다. max length 보다 작은 문장은 pad 토큰을 넣어준다.
sentence_ids_list = []
attn_masks_list = []
labels_list = []

for i in range(38175):
  temp = []
  tokened = tokenizer.tokenize(data_list[i][0])
  idx = tokenizer.convert_tokens_to_ids(tokened)
  idx_len = len(idx)
  attention_mask = [1 for i in range(idx_len)]
  if idx_len <= max_length:
    num_pad = max_length - idx_len
    for j in range(num_pad):
      idx.append(0)
      attention_mask.append(0)
  else: # max length 보다 긴 건 데이터셋에 안 넣기
    continue
  sentence_ids_list.append(idx)
  attn_masks_list.append(attention_mask)
  temp.append(abuse_dict[data_list[i][1]])
  temp.append(sentiment_dict[data_list[i][2]])
  temp.append(offensive_dict[data_list[i][3]])
  labels_list.append(temp)

In [7]:
combined_labels_list = []
for label in labels_list:
    # combined = label[0] * 9 + label[1] * 3 + label[2]
    combined = label[0] * 3 + label[2]
    combined_labels_list.append(combined)
    

In [8]:
# train / test 데이터셋 나누기

test_data_percent = 10

total_data_size = len(sentence_ids_list) #36,342
test_data_size = int(total_data_size * (test_data_percent / 100)) #3,634 10% test data
train_data_size = total_data_size - test_data_size #32,708 90% train data

train_sentence_ids_list = torch.tensor(sentence_ids_list[:train_data_size])
test_sentence_ids_list = torch.tensor(sentence_ids_list[train_data_size:])
train_attn_masks_list = torch.tensor(attn_masks_list[:train_data_size])
test_attn_masks_list = torch.tensor(attn_masks_list[train_data_size:])
train_labels_list = torch.tensor(combined_labels_list[:train_data_size])
test_labels_list = torch.tensor(labels_list[train_data_size:])

In [9]:
batch_size = 64

train_data = torch.utils.data.TensorDataset(train_sentence_ids_list, train_attn_masks_list, train_labels_list)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = torch.utils.data.TensorDataset(test_sentence_ids_list, test_attn_masks_list, test_labels_list)
test_sampler = torch.utils.data.RandomSampler(test_data)
test_dataloader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# 모델

In [10]:
class Classifier(nn.Module):
  def __init__(self, c_in, c_out):
    super().__init__()
    self.dropout = nn.Dropout(p = 0.5)
    self.fc1 = nn.Linear(c_in, c_out)
    self.norm = nn.LayerNorm(c_out)
    self.activation = nn.Tanh()
    self.init_weights()

  def forward(self, x):
    x = self.dropout(x)
    return self.activation(self.norm(self.fc1(x)))

  def init_weights(self):
    torch.nn.init.xavier_normal_(self.fc1.weight.data)
    torch.nn.init.zeros_(self.fc1.bias.data)

In [11]:
class NLPteam2_KoElectra(nn.Module):
    def __init__(self, electra_version = 'base-v3', num_hiddens = 256, freezing = '11'):
        super().__init__()
        self.electra = ElectraModel.from_pretrained(f"monologg/koelectra-{electra_version}-discriminator")
        electra_out_dim = 768 if 'base' in electra_version else 256
        
        self.layer1 = Classifier(electra_out_dim, 9)
        #self.layer2 = Classifier(num_hiddens, 18)

        for param in list(self.electra.named_parameters()):
            if freezing in param[0]:
                break
            param[1].requires_grad = False

    def forward(self, x, attention_mask):
        out = self.electra(x, attention_mask = attention_mask)
        cls_representation = out[0][:, 0, :]
        #cls_representation = torch.mean(out[0], dim=1)    
        
        #return self.layer2(self.layer1(cls_representation))
        return self.layer1(cls_representation)


# Optuna

In [12]:
# def objective(trial):
#     torch.manual_seed(42)
#     # 하이퍼파라미터 설정 - 제안받고 싶은 하이퍼파라미터 suggest 써서 다 선언하시면 됩니다.
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)  # 학습률을 log scale로 제안
#     batch_size = trial.suggest_categorical('batch_size', (32, 64, 128))  # 배치 크기 제안
#     max_epochs = trial.suggest_int('max_epochs', 10, 30)  # 에포크 수 제안
#     num_hiddens = trial.suggest_categorical('num_hiddens', (128, 256, 512, 768))
#     #electra_version  = trial.suggest_categorical('electra_version', ('base-v3', 'base-v2', 'base-v1'))
#     electra_version = 'base-v3'
#     freezing = trial.suggest_categorical('freezing', ('11', '10', '9', 'layer1.fc1.weight'))
#     #solver = trial.suggest_categorical('solver', (torch.optim.AdamW, torch.optim.Adam, torch.optim.Adadelta))

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     model = NLPteam2_KoElectra(electra_version=electra_version, num_hiddens=num_hiddens, freezing=freezing).to(device)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
#     train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
#     test_dataloader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
#     criterion = nn.CrossEntropyLoss()

#     model.train()
#     for epoch in tqdm.notebook.tqdm(range(max_epochs)):
#         for batch in train_dataloader:
#             input = batch[0].to(device)
#             attn_mask = batch[1].to(device)
#             out_true  = batch[2].to(device)

#             out = model(input, attention_mask = attn_mask)
#             loss = criterion(out, out_true)

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#     model.eval()
#     abu_acc, senti_acc, off_acc, total_count = 0, 0, 0, 0
#     for data in tqdm.notebook.tqdm(test_dataloader):
#         input = data[0].to(device)
#         attn_mask = data[1].to(device)
#         abu_true = data[2][:, 0].to(device)
#         senti_true = data[2][:, 1].to(device)
#         off_true = data[2][:, 2].to(device)

#         out = model(input, attention_mask = attn_mask)

#         predict_index = out.argmax(1)
#         abu_pred = predict_index // 9
#         senti_pred = (predict_index % 9) // 3
#         off_pred = (predict_index % 9) % 3

#         abu_acc += (abu_true == abu_pred).sum().item()
#         senti_acc += (senti_true == senti_pred).sum().item()
#         off_acc += (off_true == off_pred).sum().item()

#         total_count += 3 * senti_pred.size(0)

#     return (abu_acc + senti_acc + off_acc) / total_count

In [13]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20)

# print("Best trial:")
# trial = study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

# 학습

In [14]:
#torch.manual_seed(42)
max_epochs = 20
max_step_per_epoch = None
lr = 1e-3
num_hiddens = 512 # classifier hidden dim
electra_version = 'base-v3' # can be base / small / base-v2 / small-v2 / base-v3 / small-v3
saved_model_name = None # 불러올 모델 저장한 파일명
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NLPteam2_KoElectra(electra_version=electra_version, num_hiddens=num_hiddens, freezing='10')

warmup_ratio = 0.1
t_total = len(train_dataloader) * max_epochs
warmup_step = int(t_total * warmup_ratio)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_step, num_training_steps = t_total)


if saved_model_name:
    model.load_state_dict(torch.load(os.getcwd() + f'/{saved_model_name}.pth'))    
model.to(device)

criterion = nn.CrossEntropyLoss()

In [15]:
model_param_name = []
for param in list(model.named_parameters()):
    if param[1].requires_grad:
        model_param_name.append(param[0])
model_param_name

['electra.encoder.layer.10.attention.self.query.weight',
 'electra.encoder.layer.10.attention.self.query.bias',
 'electra.encoder.layer.10.attention.self.key.weight',
 'electra.encoder.layer.10.attention.self.key.bias',
 'electra.encoder.layer.10.attention.self.value.weight',
 'electra.encoder.layer.10.attention.self.value.bias',
 'electra.encoder.layer.10.attention.output.dense.weight',
 'electra.encoder.layer.10.attention.output.dense.bias',
 'electra.encoder.layer.10.attention.output.LayerNorm.weight',
 'electra.encoder.layer.10.attention.output.LayerNorm.bias',
 'electra.encoder.layer.10.intermediate.dense.weight',
 'electra.encoder.layer.10.intermediate.dense.bias',
 'electra.encoder.layer.10.output.dense.weight',
 'electra.encoder.layer.10.output.dense.bias',
 'electra.encoder.layer.10.output.LayerNorm.weight',
 'electra.encoder.layer.10.output.LayerNorm.bias',
 'electra.encoder.layer.11.attention.self.query.weight',
 'electra.encoder.layer.11.attention.self.query.bias',
 'electr

In [16]:
trial = 75

In [17]:
# Pre-train Evaluation
abu_acc, senti_acc, off_acc, total_count = 0, 0, 0, 0

print(f'before {trial}th trial\n')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # gpu가속을 위함
model = model.to(device)
model.eval()

for data in test_dataloader:
  input = data[0].to(device)
  attn_mask = data[1].to(device)
  senti_true = data[2][:, 1]
  off_true = data[2][:, 2]

  out = model(input, attention_mask = attn_mask)

  out = out.detach().cpu()
  senti_pred = out.argmax(1) // 3
  off_pred = out.argmax(1) % 3

  senti_acc += (senti_true == senti_pred).sum().item()
  off_acc += (off_true == off_pred).sum().item()

  total_count += off_pred.size(0)

print(f"초기화된 모델의 SA 정확도: {format(senti_acc/total_count * 100, '.2f')} %")
print(f"초기화된 모델의 OLD 정확도: {format(off_acc/total_count * 100, '.2f')} %")

before 75th trial

초기화된 모델의 SA 정확도: 33.35 %
초기화된 모델의 OLD 정확도: 12.91 %


In [18]:
#max_epochs = 1
eval_per_epoch = 2
#max_step_per_epoch = 12800
# training
model.train()
abu_acc, senti_acc, off_acc, total_count = 0, 0, 0, 0
for epoch in range(max_epochs):
  num_steps = 0
  
  for batch in train_dataloader:
    input = batch[0].to(device)
    attn_mask = batch[1].to(device)
    out_true  = batch[2].to(device)

    out = model(input, attention_mask = attn_mask)

    loss = criterion(out, out_true)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    num_steps += batch_size
    if (num_steps) % (batch_size * 100) == 0:
      print(f'Epoch {epoch+1}/{max_epochs}, Step [{num_steps}/{train_data_size}], Loss: {loss.item():.4f}')

    senti_true = out_true // 3
    off_true = out_true % 3

    senti_pred = out.argmax(1) // 3
    off_pred = out.argmax(1) % 3

    
    senti_acc += (senti_pred == senti_true).sum().item()
    off_acc += (off_pred == off_true).sum().item()
    total_count += off_pred.size(0)

    if max_step_per_epoch and num_steps > max_step_per_epoch:
      break
    
  if epoch % eval_per_epoch == eval_per_epoch-1:
    print(f"모델의 SA 정확도: {format(senti_acc/total_count * 100, '.2f')} %")
    print(f"모델의 OLD 정확도: {format(off_acc/total_count * 100, '.2f')} %")
    abu_acc, senti_acc, off_acc, total_count = 0, 0, 0, 0
    

Epoch 1/20, Step [6400/32708], Loss: 1.5308
Epoch 1/20, Step [12800/32708], Loss: 1.5635
Epoch 1/20, Step [19200/32708], Loss: 1.3479
Epoch 1/20, Step [25600/32708], Loss: 1.3640
Epoch 1/20, Step [32000/32708], Loss: 1.3176
Epoch 2/20, Step [6400/32708], Loss: 1.2654
Epoch 2/20, Step [12800/32708], Loss: 1.3163
Epoch 2/20, Step [19200/32708], Loss: 1.2961
Epoch 2/20, Step [25600/32708], Loss: 1.3471
Epoch 2/20, Step [32000/32708], Loss: 1.2657
모델의 SA 정확도: 83.79 %
모델의 OLD 정확도: 74.31 %
Epoch 3/20, Step [6400/32708], Loss: 1.1821
Epoch 3/20, Step [12800/32708], Loss: 1.1432
Epoch 3/20, Step [19200/32708], Loss: 1.1253
Epoch 3/20, Step [25600/32708], Loss: 1.2083
Epoch 3/20, Step [32000/32708], Loss: 1.1321
Epoch 4/20, Step [6400/32708], Loss: 1.1521
Epoch 4/20, Step [12800/32708], Loss: 1.1719
Epoch 4/20, Step [19200/32708], Loss: 1.2141
Epoch 4/20, Step [25600/32708], Loss: 1.1430
Epoch 4/20, Step [32000/32708], Loss: 1.2118
모델의 SA 정확도: 87.43 %
모델의 OLD 정확도: 77.34 %
Epoch 5/20, Step [6400

In [22]:
# Post-train Evaluation
abu_acc, senti_acc, off_acc, total_count = 0, 0, 0, 0

print(f'after {trial}th trial\n')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # gpu가속을 위함
model = model.to(device)
model.eval()

for data in test_dataloader:
  input = data[0].to(device)
  attn_mask = data[1].to(device)
  senti_true = data[2][:, 1]
  off_true = data[2][:, 2]

  out = model(input, attention_mask = attn_mask)

  out = out.detach().cpu()
  predict_index = out.argmax(1)
  senti_pred = predict_index // 3
  off_pred = predict_index % 3

  senti_acc += (senti_true == senti_pred).sum().item()
  off_acc += (off_true == off_pred).sum().item()
  total_count += off_pred.size(0)

print(f"학습된 모델의 SA 정확도: {format(senti_acc/total_count * 100, '.2f')} %")
print(f"학습된 모델의 OLD 정확도: {format(off_acc/total_count * 100, '.2f')} %")

after 75th trial

학습된 모델의 SA 정확도: 25.62 %
학습된 모델의 OLD 정확도: 79.61 %


In [20]:
# Post-train thorough Evaluation
TPFP_not, TP_not, TPFN_not = 0, 0, 0
TPFP_likely, TP_likely, TPFN_likely = 0, 0, 0
TPFP_off, TP_off, TPFN_off = 0, 0, 0
total_count = 0

print(f'after {trial}th trial / offensive detection thorough evaluation\n')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # gpu가속을 위함
model = model.to(device)
model.eval()

for data in test_dataloader:
  input = data[0].to(device)
  attn_mask = data[1].to(device)
  off_true = data[2][:, 2]

  out = model(input, attention_mask = attn_mask)

  out = out.detach().cpu()
  predict_index = out.argmax(1)

  off_pred = predict_index % 3#<- 이부분을 바꿔야 할듯
    
  TPFP_not += (off_pred == torch.tensor(0)).sum().item()
  TPFP_likely += (off_pred == torch.tensor(1)).sum().item()
  TPFP_off += (off_pred == torch.tensor(2)).sum().item()

  TP_not += (off_pred * 10 == off_true).sum().item()
  TP_likely += (((off_pred - torch.tensor(1))*10 + torch.tensor(1)) == off_true).sum().item() 
  TP_off += (((off_pred - torch.tensor(2)) *10 + torch.tensor(2)) == off_true).sum().item()

  TPFN_not += (off_true == torch.tensor(0)).sum().item()
  TPFN_likely += (off_true == torch.tensor(1)).sum().item() 
  TPFN_off += (off_true == torch.tensor(2)).sum().item()

  total_count += out.size(0)

if TP_likely == 0:
  TP_likely = 1
if TPFP_likely == 0:
  TPFP_likely = 1
if TPFP_off == 0:
  TPFP_off = 1
if TP_off ==0:
  TP_off = 1

print(TPFP_not, TP_not, TPFN_not,  '/', TPFP_likely, TP_likely, TPFN_likely, '/', TPFP_off, TP_off, TPFN_off, '/', total_count)
print(f"학습된 모델의 Not 정확도: Precision {format(TP_not/TPFP_not * 100, '.2f')} % / Recall {format(TP_not/TPFN_not * 100, '.2f')} % / F1 {format((2 * TP_not/TPFP_not * 100 * TP_not/TPFN_not * 100) / (TP_not/TPFP_not * 100 + TP_not/TPFN_not * 100), '.2f')}")
print(f"학습된 모델의 Likely 정확도: Precision {format((TP_likely/TPFP_likely) * 100, '.2f')} % / Recall {format((TP_likely/TPFN_likely) * 100, '.2f')} % / F1 {format((2 * (TP_likely/TPFP_likely) * 100 * (TP_likely/TPFN_likely) * 100) / ((TP_likely/TPFP_likely) * 100 + (TP_likely/TPFN_likely) * 100), '.2f')}")
print(f"학습된 모델의 Offensive 정확도: Precision {format(TP_off/TPFP_off * 100, '.2f')} % / Recall {format(TP_off/TPFN_off * 100, '.2f')} % / F1 {format((2 * TP_off/TPFP_off * 100 * TP_off/TPFN_off * 100) / (TP_off/TPFP_off * 100 + TP_off/TPFN_off * 100), '.2f')}")

after 75th trial / offensive detection thorough evaluation

2646 2262 2420 / 182 66 437 / 806 565 777 / 3634
학습된 모델의 Not 정확도: Precision 85.49 % / Recall 93.47 % / F1 89.30
학습된 모델의 Likely 정확도: Precision 36.26 % / Recall 15.10 % / F1 21.32
학습된 모델의 Offensive 정확도: Precision 70.10 % / Recall 72.72 % / F1 71.38


In [21]:
# # 결과 잘 나온 모델 저장하기
# savefile_name = f'{trial}th_trial'
# Path = os.getcwd() # <- 로컬
# #Path = '/content/gdrive/MyDrive/Colab Notebooks'# <- 코랩
# torch.save(model.state_dict(), Path + f'/{savefile_name}.pth')