# Baseline Bert 감정분류 모델 필사
competition : 월간 데이콘 발화자의 감정인식 AI 경진대회  
notebook link : https://dacon.io/competitions/official/236027/codeshare/6989?page=1&dtype=recent

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# 하이퍼 파라미터 설정

In [3]:
# Training hyperparameters read by the cells below via cfg[...].
cfg = dict(
    epochs=3,
    learning_rate=1e-5,
    batch_size=8,
    seed=42,
)

# 시드 고정

In [4]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # breaks reproducibility, so it has to be off when seeds are fixed.
    torch.backends.cudnn.benchmark = False

seed_everything(cfg['seed']) # fix all random seeds using the configured value

# 데이터 불러오기

In [5]:
# Load the training split; the path is relative to the notebook's working directory.
train = pd.read_csv("./Desktop/ff/code/data/speaker_sementic/train.csv")

In [6]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral


In [7]:
# Number of rows loaded from train.csv.
len(train)

9989

# 라벨인코딩

In [8]:
# Fit the encoder on the string labels, then overwrite "Target" with the
# integer class ids. `le` is kept so predictions can be decoded later.
le = LabelEncoder().fit(train["Target"])
encoded_target = le.transform(train["Target"])
train["Target"] = encoded_target

# 학습/ 검증셋 분리

In [9]:
# The original note states that dialogue IDs run up to 1038; dialogues
# 1016-1038 are held out as the validation set and the rest stay in train.
valid_dialogue_ids = list(range(1016, 1039))
is_valid_row = train["Dialogue_ID"].isin(valid_dialogue_ids)

valid = train[is_valid_row].reset_index(drop=True)
train = train[~is_valid_row].reset_index(drop=True)

train_len, val_len = len(train), len(valid)

train_len, val_len

(9725, 264)

# tokenizer 정의

In [10]:
# Pretrained bert-base-cased tokenizer; CustomDataset reads this module-level name.
tokenizers = BertTokenizer.from_pretrained('bert-base-cased')

# custom dataset 생성

In [11]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one utterance per item.

    Args:
        data: DataFrame with an 'Utterance' column and, in "train" mode,
            a 'Target' column holding the encoded label.
        mode: "train" to also return the label; any other value returns
            only the model inputs.
        tokenizer: Callable tokenizer. When None, the notebook-level
            `tokenizers` object is used (the original behaviour).
        max_length: Length every sequence is padded/truncated to.
            Defaults to 512 as before; utterances are usually much shorter,
            so a smaller value cuts compute considerably.
    """

    def __init__(self, data, mode="train", tokenizer=None, max_length=512):
        self.dataset = data
        # Resolve the global lazily so the class can be used with an
        # explicitly supplied tokenizer as well.
        self.tokenizer = tokenizers if tokenizer is None else tokenizer
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Positional access: `idx` comes from the sampler as 0..len-1, so
        # label-based lookup would fail (or pick the wrong row) whenever the
        # frame's index has not been reset.
        text = self.dataset['Utterance'].iloc[idx]

        # truncation=True cuts the sequence at max_length tokens;
        # return_tensors="pt" returns PyTorch tensors (other options: tf, np).
        inputs = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of size 1; drop that leading axis.
        input_ids = inputs['input_ids'][0]

        # Segment ids telling which sentence each token belongs to,
        # e.g. [0, 0, 0, ..., 1, 1, 1].
        token_type_ids = inputs['token_type_ids'][0]

        # 1 for real tokens, 0 for padding, so attention ignores the padding.
        # E.g. [1, 1, 1, 0, 0] means the first three positions are real tokens.
        attention_mask = inputs["attention_mask"][0]

        if self.mode == "train":
            y = self.dataset['Target'].iloc[idx]
            return input_ids, token_type_ids, attention_mask, y
        return input_ids, token_type_ids, attention_mask

In [12]:
# Wrap each split in a dataset and its loader. Both splits use mode="train"
# because validation also needs the labels. Only the training loader shuffles.
train = CustomDataset(train, mode="train")
train_dataloader = DataLoader(train, batch_size=cfg['batch_size'], shuffle=True)

valid = CustomDataset(valid, mode="train")
val_dataloader = DataLoader(valid, batch_size=cfg['batch_size'], shuffle=False)

# 모델 정의

In [13]:
class BaseModel(nn.Module):
    """bert-base-cased encoder followed by dropout and a linear classifier.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output classes. When None, it is taken from
            the notebook-level label encoder (`len(le.classes_)`) at
            construction time.
    """

    def __init__(self, dropout=0.5, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Resolved here rather than in the signature so the value reflects
            # the encoder as it is when the model is built.
            num_classes = len(le.classes_)

        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(dropout)

        # The head must accept BERT's hidden size (768 for bert-base); read it
        # from the config instead of hard-coding it.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return raw class logits of shape (batch, num_classes).

        Args:
            input_id: Token ids passed to BERT as `input_ids`.
            mask: Attention mask passed to BERT as `attention_mask`.
        """
        # With return_dict=False BERT returns a tuple; the second element is
        # the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        # No activation on the logits: CrossEntropyLoss applies log-softmax
        # itself, and a ReLU here would clamp logits at 0 and block gradients
        # for every negative logit.
        return self.linear(dropout_output)

# 학습

In [14]:
def train(model, optimizer, train_loader, test_loader, device):
    """Fine-tune `model` and return it with the best validation weights.

    Runs cfg["epochs"] epochs of cross-entropy training, evaluates on
    `test_loader` after every epoch, and keeps a snapshot of the weights
    from the epoch with the highest validation macro-F1.

    Args:
        model: Module called as model(input_ids, attention_mask) -> logits.
        optimizer: Optimizer built over `model`'s parameters.
        train_loader: Loader yielding
            (input_ids, token_type_ids, attention_mask, label) batches.
        test_loader: Validation loader with the same batch layout.
        device: Device the model and batches are moved to.

    Returns:
        `model` itself, loaded with the best-scoring epoch's weights
        (unchanged when no epoch was run).
    """
    import copy  # local import: only needed for the weight snapshot

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    # Start below any possible F1 so the first epoch is always recorded,
    # even when its score is exactly 0.
    best_score = float("-inf")
    best_state = None
    for epoch_num in range(cfg["epochs"]):
        # Switch the module to training mode (torch's Module.train, not this function).
        model.train()
        train_loss = []
        for input_ids, _token_type_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, test_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if best_score < val_score:
            best_score = val_score
            # Keeping a reference to `model` would track later epochs too;
            # copy the weights so the snapshot really is the best epoch.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [15]:
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of `pred` against `true`."""
    macro_f1 = f1_score(true, pred, average = "macro")
    return macro_f1

def validation(model, criterion, test_loader, device):
    """Evaluate `model` on `test_loader` without tracking gradients.

    Args:
        model: Module called as model(input_ids, attention_mask) -> logits.
        criterion: Loss called as criterion(logits, labels).
        test_loader: Loader yielding
            (input_ids, token_type_ids, attention_mask, label) batches.
        device: Device the batches are moved to.

    Returns:
        (list of per-batch loss values, macro-F1 over all samples).
    """
    model.eval()

    losses, predictions, targets = [], [], []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            # token_type_ids (second element) is not fed to the model.
            input_ids, _, attention_mask, valid_label = batch
            labels = valid_label.to(device)
            tokens = input_ids.to(device)
            mask = attention_mask.to(device)

            logits = model(tokens, mask)

            loss = criterion(logits, labels.long())
            losses.append(loss.item())

            # The model returns one score per class; argmax over dim 1 picks
            # the predicted class id for each sample.
            predictions.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            targets.extend(labels.detach().cpu().numpy().tolist())

        score = competition_metric(targets, predictions)
    return losses, score

In [16]:
# Build the classifier and its optimizer, then fine-tune.
# train() switches the model to training mode itself at the start of each epoch.
model = BaseModel()
model.eval()
optimizer = Adam(model.parameters(), lr=cfg["learning_rate"])

infer_model = train(
    model,
    optimizer,
    train_dataloader,
    val_dataloader,
    device,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1216 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 추론

In [17]:
# Load the test split; the path is relative to the notebook's working directory.
test = pd.read_csv("./Desktop/ff/code/data/speaker_sementic/test.csv")

In [18]:
# mode="test" makes the dataset return inputs only (no label). Order is kept
# (shuffle=False) so predictions line up with the rows of the test file.
test = CustomDataset(test, mode="test")
test_dataloader = DataLoader(
    test,
    batch_size=cfg['batch_size'],
    shuffle=False,
)

In [None]:
def inference(model, test_loader, device):
    """Predict class ids for every sample in `test_loader`.

    Args:
        model: Module called as model(input_ids, attention_mask) -> logits.
        test_loader: Loader yielding
            (input_ids, token_type_ids, attention_mask) batches.
        device: Device the model and batches are moved to.

    Returns:
        List of predicted class ids, in loader order.
    """
    model.to(device)
    model.eval()

    test_predict = []
    # Gradients are not needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        # The mask variable was previously misspelled in the loop header, so
        # the body referenced a name the loop never bound.
        for input_ids, _token_type_ids, attention_mask in tqdm(test_loader):
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)
            y_pred = model(input_id, mask)
            test_predict += y_pred.argmax(1).detach().cpu().numpy().tolist()
    return test_predict

In [None]:
# Predict encoded class ids for the test set with the model returned by train().
preds = inference(infer_model, test_dataloader, device)

In [None]:
# Decode the integer class ids back to the original label strings.
preds = le.inverse_transform(preds)

# 제출

In [None]:
# Fill the sample submission with the decoded predictions and write it out.
submit = pd.read_csv("./Desktop/ff/code/data/speaker_sementic/sample_submission.csv")
submit["Target"] = preds
display(submit.head())
# `submit_to_csv` was an undefined name; the DataFrame method is `to_csv`.
submit.to_csv("./submit.csv", index = False)