# 패키지 가져오기

In [1]:
import pandas as pd
import numpy as np
import random
import os

import datetime
from pytz import timezone

import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

from tqdm.notebook import tqdm

In [2]:
def set_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (generic and CUDA
    seeding calls), then puts cuDNN into deterministic mode with its
    benchmark autotuner turned off.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Trade cuDNN speed for run-to-run repeatability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seeds(777)

# 디렉토리 설정

In [3]:
# Work from the competition folder so the relative dataset/ and checkpoint paths used below resolve.
PROJECT_DIR = '/content/drive/MyDrive/Competitions/emotion'
os.chdir(PROJECT_DIR)
os.getcwd()

'/content/drive/MyDrive/Competitions/emotion'

# 데이터 전처리

## 가져오기

In [4]:
# Read the training file and the competition test file (the latter is used for the submission).
train_org, submit_org = (
    pd.read_csv(f'dataset/{split_name}.csv') for split_name in ('train', 'test')
)
print(len(train_org))

9989


In [5]:
# Show every utterance that belongs to dialogue 0.
train_org[train_org['Dialogue_ID'] == 0]

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise
5,TRAIN_0005,"Now you’ll be heading a whole division, so you...",The Interviewer,0,neutral
6,TRAIN_0006,I see.,Chandler,0,neutral
7,TRAIN_0007,But there’ll be perhaps 30 people under you so...,The Interviewer,0,neutral
8,TRAIN_0008,Good to know.,Chandler,0,neutral
9,TRAIN_0009,We can go into detail,The Interviewer,0,neutral


## 데이터 살피기

In [6]:
# Number of distinct emotions: 7
train_org['Target'].value_counts()

neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: Target, dtype: int64

In [7]:
# Utterances per dialogue: 1 to 24
train_org['Dialogue_ID'].value_counts()

523    24
54     24
450    24
649    24
530    24
       ..
987     1
790     1
581     1
490     1
742     1
Name: Dialogue_ID, Length: 1038, dtype: int64

In [8]:
# Distribution of whitespace-separated word counts per utterance: 1 to 69
train_org['Utterance'].map(lambda utterance: len(utterance.split())).describe()

count    9989.000000
mean        7.948644
std         6.229899
min         1.000000
25%         3.000000
50%         6.000000
75%        11.000000
max        69.000000
Name: Utterance, dtype: float64

## Target Label Encoding

In [9]:
# Map emotion names to integer ids; `le` is kept so predictions can be decoded back to names later.
le = LabelEncoder()
le.fit(train_org['Target'])
train_org['EncodedTarget'] = le.transform(train_org['Target'])
train_org.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target,EncodedTarget
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral,4
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral,4
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral,4
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral,4
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise,6


In [10]:
# Class counts after encoding; they should mirror the counts of the string labels.
train_org['EncodedTarget'].value_counts()

4    4710
3    1743
6    1205
0    1109
5     683
1     271
2     268
Name: EncodedTarget, dtype: int64

In [11]:
# Emotion name for each encoded id, in id order.
le.inverse_transform(np.arange(len(le.classes_)))

array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness',
       'surprise'], dtype=object)

## Train-Test Split

In [12]:
# Stratified split: carve 20% off as a hold-out, then halve the hold-out into validation and test.
train_df, holdout_df = train_test_split(
    train_org,
    test_size=0.2,
    random_state=777,
    stratify=train_org['EncodedTarget'],
)
valid_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    random_state=777,
    stratify=holdout_df['EncodedTarget'],
)
print(len(train_df), len(valid_df), len(test_df))

7991 999 999


# Tokenizer / Model 가져오기

In [13]:
# Hugging Face checkpoint id shared by the tokenizer and the encoder backbone.
MODEL_NAME = 'tae898/emoberta-base'
# MODEL_NAME = 'SamLowe/roberta-base-go_emotions'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)
pretrained = AutoModel.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at tae898/emoberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Sanity-check the tokenizer output on a short sample.
sample_text = 'Hello'
tokenizer(sample_text)

{'input_ids': [0, 43998, 15722, 23133, 45209, 5782, 15722, 48589, 711, 43998, 11936, 18537, 43998, 15113, 10674, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Dataset / Dataloader 생성

In [15]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized utterance together with its encoded label.

    Args:
        df: DataFrame providing the 'Utterance' and 'EncodedTarget' columns.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword arguments.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=100):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        utterance = self.df['Utterance'].iloc[idx]
        target = self.df['EncodedTarget'].iloc[idx]

        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer(utterance, **tokenizer_kwargs)

        # Only one text is tokenized per call, so take the first (only) row of each tensor.
        return encoding['input_ids'][0], encoding['attention_mask'][0], target

In [16]:
BATCH_SIZE = 128
MAX_LEN = 128

# One dataset per split, all tokenized to the same length.
train_dataset, valid_dataset, test_dataset = (
    TrainDataset(split_df, tokenizer, max_len=MAX_LEN)
    for split_df in (train_df, valid_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for eval_dataset in (valid_dataset, test_dataset)
)

# for a, b, c in train_loader:
#     print(a.shape, b.shape, c.shape)
#     break

# Model 선언

In [17]:
class SwiGLU(nn.Module):
    """Gated activation: halve the last dimension and gate one half with SiLU of the other.

    The output's last dimension is therefore half the size of the input's.
    """

    def forward(self, x):
        # Split the last dimension in two: first half is the value path, second half the gate.
        value, gate = torch.chunk(x, chunks=2, dim=-1)
        # SiLU is Swish with beta = 1.
        return value * F.silu(gate)

class EmotionClassifier(nn.Module):
    """Frozen encoder backbone followed by a small trainable classification head.

    The head is Linear -> SwiGLU -> Dropout -> Linear. Its ``nn.Sequential``
    layout (indices 0..3) is unchanged, so state dicts saved from the earlier
    version of this class still load.

    Args:
        pretrained: Encoder module. Its output must expose ``last_hidden_state``.
        num_classes: Number of output logits (defaults to the 7 emotions).
        hidden_size: Width of the encoder's hidden states. When ``None`` it is
            read from ``pretrained.config.hidden_size``.
    """

    def __init__(self, pretrained, num_classes=7, hidden_size=None):
        super(EmotionClassifier, self).__init__()
        if hidden_size is None:
            hidden_size = pretrained.config.hidden_size

        self.pretrained = pretrained
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            SwiGLU(),  # halves the feature width: 512 -> 256
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from the first token's hidden state."""
        # No gradient ever reaches the backbone, so run it without autograd
        # instead of building a graph and detaching the result afterwards.
        # NOTE(review): the backbone still follows model.train()/eval(), so any
        # dropout inside it is presumably active during training -- confirm intended.
        with torch.no_grad():
            output = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
            features = output.last_hidden_state[:, 0]
        return self.classifier(features)

In [18]:
# model = EmotionClassifier(pretrained)
# # model = model.to(device)
# for a, b, c in train_loader:
#     tmp = model(input_ids=a, attention_mask=b)
#     print(tmp)
#     print(tmp.argmax(1))
#     break
# # tmp = model(input_ids=train_dataset[0][0], attention_mask=train_dataset[0][1])

# 학습

## 헬퍼 함수

In [19]:
def train(model, train_loader, valid_loader, optimizer, loss_fn, scheduler, device='cuda:0'):
    """Run one training epoch, then validate and step the LR scheduler.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword arguments.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        valid_loader: Loader handed to ``evaluate`` after the epoch.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)``.
        scheduler: Scheduler whose ``step`` receives the validation loss.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy, macro_f1)`` measured on the training batches.
    """
    print('[[[[ Train ]]]]')

    model.train()

    epoch_preds, epoch_labels, batch_losses = [], [], []

    for input_ids, attention_mask, labels in tqdm(train_loader):
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        labels = labels.view(-1).to(device)

        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Collect predictions and ground truth for the epoch-level metrics.
        epoch_preds += logits.argmax(1).tolist()
        epoch_labels += labels.int().tolist()

        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        batch_losses.append(batch_loss.item())

        optimizer.step()

    # Epoch-level training metrics.
    acc = accuracy_score(epoch_labels, epoch_preds)
    f1 = f1_score(epoch_labels, epoch_preds, average='macro')
    avg_loss = np.mean(batch_losses)

    print(f'accuracy: {acc*100:.6f}, f1-score: {f1*100:.6f}, loss:{avg_loss:.6f}')
    print()

    # Only the validation loss is used; it drives the learning-rate schedule.
    eval_loss, _eval_acc, _eval_f1 = evaluate(model, valid_loader, loss_fn, device)
    scheduler.step(eval_loss)

    return avg_loss, acc, f1

In [20]:
def evaluate(model, valid_loader, loss_fn, device='cuda:0', test=False):
    """Score ``model`` on a data loader without updating its weights.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword arguments.
        valid_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Callable ``loss_fn(logits, labels)``.
        device: Device the batches are moved to.
        test: Only changes the printed banner ("Test" instead of "Evaluation").

    Returns:
        ``(avg_loss, accuracy, macro_f1)`` over all batches of the loader.
    """
    if test:
        print('[[[[ Test ]]]]')
    else:
        print('[[[[ Evaluation ]]]]')

    model.eval()

    preds = []
    answers = []
    losses = []

    # No backward pass happens here, so disable autograd: without this every
    # forward pass would build (and keep alive) a graph that is never used.
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(valid_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.view(-1).to(device)

            #### Model output
            y_hat = model(input_ids=input_ids, attention_mask=attention_mask)

            #### Collect predictions / ground truth for the metrics
            preds.extend(y_hat.argmax(1).tolist())
            answers.extend(labels.int().tolist())

            #### Loss (recorded only, never back-propagated)
            loss = loss_fn(y_hat, labels)
            losses.append(loss.item())

    #### Aggregate metrics
    acc = accuracy_score(answers, preds)
    f1 = f1_score(answers, preds, average='macro')
    avg_loss = np.mean(losses)

    print(f'accuracy: {acc*100:.6f}, f1-score: {f1*100:.6f}, loss:{avg_loss:.6f}')
    print()

    #### Return loss, accuracy, f1-score
    return avg_loss, acc, f1

## 학습 시작 / 모델 저장

In [22]:
gc.collect()
torch.cuda.empty_cache()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
N_EPOCHS = 50
best_f1 = 0.0
best_epoch = 0

# Timestamp (Asia/Seoul) that tags every artifact written by this run.
start_time = datetime.datetime.now(timezone('Asia/Seoul'))
start_time = start_time.strftime('%y%m%d-%H%M%S')

model = EmotionClassifier(pretrained).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
loss_fn = nn.CrossEntropyLoss()

# File-system-safe model tag, reused in every output path below.
model_tag = MODEL_NAME.replace("/", "-")

# makedirs also creates a missing 'checkpoints/' parent; 'best_models/' must
# exist as well, otherwise the first torch.save below fails.
model_dir = f'checkpoints/{model_tag}_{start_time}'
os.makedirs(model_dir, exist_ok=True)
os.makedirs('best_models', exist_ok=True)

for epoch in range(N_EPOCHS):
    print(f'### EPOCH {epoch+1} ###')

    # train() also runs validation internally and steps the LR scheduler.
    loss, acc, f1 = train(model, train_loader, valid_loader, optimizer, loss_fn, scheduler, device)

    test_loss, test_acc, test_f1 = evaluate(model, test_loader, loss_fn, device, test=True)

    # NOTE(review): the "best" model is chosen by its score on the test split,
    # so the reported best F1 is optimistically biased. Consider selecting on
    # the validation split and reporting the test score once at the end.
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_epoch = epoch+1
        model_save_name = f'best_models/{model_tag}_{start_time}.pt'
        torch.save(model.state_dict(), model_save_name)
        print(f'best model saved! {model_save_name}')

    # Periodic checkpoint every 5 epochs, tagged with the current test F1.
    if (epoch+1) % 5 == 0:
        ckpt_name = f'{model_dir}/{model_tag}_{start_time}_{epoch+1}epoch_{test_f1*100:.4f}.ckpt'
        torch.save(model.state_dict(), ckpt_name)
        print(f'checkpoint saved! {ckpt_name}')

    print('=' * 50)
    print()

print(f'Best Epoch : {best_epoch}   /   Best F1 : {best_f1*100:.4f}')

### EPOCH 1 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 68.064072, f1-score: 49.639015, loss:1.607147

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.278278, f1-score: 59.479996, loss:0.754710

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.779780, f1-score: 68.457320, loss:0.739634

best model saved! best_models/tae898-emoberta-base_240308-121746.pt

### EPOCH 2 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 73.357527, f1-score: 56.291264, loss:0.908535

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 76.176176, f1-score: 57.939561, loss:0.797993

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 76.676677, f1-score: 59.393515, loss:0.806880


### EPOCH 3 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 73.720435, f1-score: 57.734568, loss:0.853638

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 77.177177, f1-score: 59.322587, loss:0.736113

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 77.577578, f1-score: 63.343967, loss:0.769530


### EPOCH 4 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 74.083344, f1-score: 57.797261, loss:0.845272

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 62.362362, f1-score: 52.957439, loss:1.099313

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 61.661662, f1-score: 54.703594, loss:1.177007


### EPOCH 5 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 72.794394, f1-score: 55.827828, loss:0.894039

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 75.575576, f1-score: 60.591842, loss:0.803343

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 75.275275, f1-score: 58.460649, loss:0.837836

checkpoint saved! checkpoints/tae898-emoberta-base_240308-121746/tae898-emoberta-base_240308-121746_5epoch_58.4606.ckpt

### EPOCH 6 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 74.133400, f1-score: 56.852577, loss:0.849276

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.178178, f1-score: 63.553188, loss:0.742230

Epoch 00006: reducing learning rate of group 0 to 5.0000e-03.
[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.478478, f1-score: 62.988756, loss:0.743353


### EPOCH 7 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.197097, f1-score: 60.204429, loss:0.802696

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.379379, f1-score: 64.959194, loss:0.690575

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 80.380380, f1-score: 65.496007, loss:0.670792


### EPOCH 8 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.997998, f1-score: 60.676636, loss:0.776340

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.178178, f1-score: 63.614501, loss:0.734047

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.179179, f1-score: 67.473922, loss:0.725187


### EPOCH 9 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.122012, f1-score: 60.125061, loss:0.774268

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.079079, f1-score: 64.564806, loss:0.667593

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 80.180180, f1-score: 67.300591, loss:0.675115


### EPOCH 10 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.972970, f1-score: 61.512353, loss:0.760567

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.978979, f1-score: 63.495995, loss:0.693109

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 81.381381, f1-score: 70.531757, loss:0.694477

best model saved! best_models/tae898-emoberta-base_240308-121746.pt
checkpoint saved! checkpoints/tae898-emoberta-base_240308-121746/tae898-emoberta-base_240308-121746_10epoch_70.5318.ckpt

### EPOCH 11 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.610061, f1-score: 60.923889, loss:0.766263

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 76.876877, f1-score: 61.350973, loss:0.719809

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.678679, f1-score: 66.056427, loss:0.722517


### EPOCH 12 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 75.284695, f1-score: 61.133386, loss:0.770828

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 77.377377, f1-score: 59.681393, loss:0.758296

Epoch 00012: reducing learning rate of group 0 to 2.5000e-03.
[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 78.078078, f1-score: 62.526531, loss:0.771640


### EPOCH 13 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 76.298336, f1-score: 62.674309, loss:0.744469

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.479479, f1-score: 65.798208, loss:0.669776

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 80.180180, f1-score: 68.003866, loss:0.670596


### EPOCH 14 ###
[[[[ Train ]]]]


  0%|          | 0/63 [00:00<?, ?it/s]

accuracy: 76.723814, f1-score: 63.209281, loss:0.729134

[[[[ Evaluation ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

accuracy: 79.479479, f1-score: 65.148803, loss:0.660128

[[[[ Test ]]]]


  0%|          | 0/8 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
## 이어서 학습할 시 사용

# gc.collect()
# torch.cuda.empty_cache()

# device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# N_EPOCHS = 20

# # model = EmotionClassifier(pretrained).to(device)

# optimizer = torch.optim.AdamW(model.parameters(), lr=1.25e-3, weight_decay=0.9)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
# # loss_fn = nn.BCEWithLogitsLoss()
# loss_fn = nn.CrossEntropyLoss()

# for epoch in range(N_EPOCHS, 2*N_EPOCHS):
#     print(f'### EPOCH {epoch+1} ###')

#     loss, acc, f1 = train(model, train_loader, valid_loader, optimizer, loss_fn, scheduler, device)

#     evaluate(model, test_loader, loss_fn, device, test=True)

#     print('=' * 50)
#     print()

# torch.save(model.state_dict(), f'models/{MODEL_NAME.replace("/", "-")}_40epoch_240306.ckpt')

# Submission

In [23]:
class SubmitDataset(Dataset):
    """Label-free dataset yielding one tokenized utterance per item.

    Args:
        df: DataFrame providing the 'Utterance' column.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword arguments.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=100):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the row at position ``idx``."""
        utterance = self.df['Utterance'].iloc[idx]

        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer(utterance, **tokenizer_kwargs)

        # Only one text is tokenized per call, so take the first (only) row of each tensor.
        return encoding['input_ids'][0], encoding['attention_mask'][0]

In [24]:
# Tokenize to the same length used for training (MAX_LEN); the class default
# of 100 would truncate long utterances differently from what the model saw.
submit_dataset = SubmitDataset(submit_org, tokenizer, max_len=MAX_LEN)
# Order must be preserved so predictions line up with the submission rows.
submit_loader = DataLoader(submit_dataset, batch_size=64, shuffle=False)

In [25]:
inference_model = EmotionClassifier(pretrained).to(device)
# map_location lets a checkpoint saved on GPU be restored on whatever device
# this session runs on (e.g. a CPU-only runtime).
best_state = torch.load('best_models/tae898-emoberta-base_240308-121746.pt', map_location=device)
inference_model.load_state_dict(best_state)

<All keys matched successfully>

In [26]:
inference_model.eval()

answer = []

# Inference only: a single no_grad scope covers the whole pass over the loader.
with torch.no_grad():
    for input_ids, attention_mask in tqdm(submit_loader):
        outputs = inference_model(input_ids.to(device), attention_mask.to(device))
        answer += outputs.argmax(1).tolist()

# Decode the predicted class ids back to emotion names.
submission = le.inverse_transform(answer)
submission

  0%|          | 0/41 [00:00<?, ?it/s]

array(['neutral', 'neutral', 'neutral', ..., 'neutral', 'surprise',
       'neutral'], dtype=object)

In [27]:
# Fill the sample submission's Target column with the decoded predictions.
submit = pd.read_csv('dataset/sample_submission.csv')
submit['Target'] = submission
# Create the output folder if needed; to_csv does not create missing directories.
os.makedirs('submission', exist_ok=True)
submit.to_csv(f'submission/{MODEL_NAME.replace("/", "-")}_{start_time}.csv', index=False)