## To Do 📚
- 관련 논문 찾아보기 (https://github.com/tae898/erc)
- 관련 대회 찾아보고, 어떻게 풀었는지 보기
- 데이터 분석하기 (train, valid 부분 되짚기)

## Done ⭐
- 코드 간결화
- EarlyStop 적용
- AdamW 변경
- AMP 적용

## Google Drive Mount

In [1]:
# Mount Google Drive (Colab only) so that the paths under /content/drive used
# by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install Libraries

In [None]:
!pip install transformers wandb

## Import

In [2]:
import os
import torch
import random
import shutil
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch.nn.functional as F

from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

from torch import nn
from torch.optim import AdamW
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler
from torch.utils.data import DataLoader, Dataset

from transformers import BertTokenizer, BertModel, get_scheduler

## GPU Setting

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
!nvidia-smi

Wed Nov  9 09:40:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    14W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Hyperparameter Setting

In [4]:
# Central experiment configuration read by the seeding, tokenizer, model,
# scheduler and training cells.
CFG = dict(
    # reproducibility
    SEED=2022,

    # pretrained checkpoint and optimizer learning rate
    CHECKPOINT='bert-base-cased',
    LEARNING_RATE=1e-5,

    # learning-rate schedule (forwarded to transformers.get_scheduler)
    SCHEDULER_NAME='linear',
    NUM_WARMUP_STEPS=0,

    # training loop
    EPOCHS=100,        # upper bound; training can stop earlier via EARLY_STOP
    BATCH_SIZE=16,
    EARLY_STOP=3,      # epochs without validation-F1 improvement before stopping
    USE_AMP=True,      # automatic mixed precision
    MAX_NORM=5,        # gradient-clipping norm
)

## Seed Setting

In [5]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, the `PYTHONHASHSEED` environment variable, NumPy,
    and PyTorch (CPU and all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # breaks reproducibility, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

In [6]:
# Apply the configured seed before the data loaders and the model are created.
seed_everything(CFG['SEED'])

## Load Dataset

In [7]:
# Directory on the mounted Drive holding train.csv, test.csv and sample_submission.csv.
data_path = '/content/drive/MyDrive/dacon_sentiment_analysis/dataset'

In [8]:
# Load the labelled training table (the later cells read its Utterance, Dialogue_ID and Target columns).
data_csv = pd.read_csv(os.path.join(data_path, 'train.csv'))

## Label encoding

In [9]:
# Replace the `Target` labels with integer class ids. The fitted encoder `le`
# is kept because later cells use it for the class count and to map
# predictions back with `inverse_transform`.
# NOTE: this cell overwrites `Target` in place, so re-running it re-fits the
# encoder on the already-encoded integers; reload `data_csv` first.
le = LabelEncoder()
data_csv['Target'] = le.fit_transform(data_csv['Target'])

## Train/Validation split

In [15]:
# Hold out the dialogues with Dialogue_ID 1016..1038 for validation and train
# on all the others. Both splits get a fresh 0..n-1 index.
is_valid_row = data_csv['Dialogue_ID'].isin(list(range(1016, 1039)))
train_csv = data_csv[~is_valid_row].reset_index(drop=True)
valid_csv = data_csv[is_valid_row].reset_index(drop=True)

print(f'train_dataset: {len(train_csv)}')
print(f'valid_dataset: {len(valid_csv)}')

train_dataset: 9725
valid_dataset: 264


In [16]:
# Preview the first rows of the training split.
train_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,4
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,4
2,TRAIN_0002,That I did. That I did.,Chandler,0,4
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,4
4,TRAIN_0004,My duties? All right.,Chandler,0,6


## Tokenizer

In [10]:
# Load the tokenizer for the same checkpoint name that the model cell loads.
tokenizer = BertTokenizer.from_pretrained(CFG['CHECKPOINT'])

## CustomDataset

In [11]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one utterance per item.

    Args:
        data: pandas DataFrame with an `Utterance` column and, for the
            'train' / 'valid' modes, a `Target` column.
        tokenizer: Callable tokenizer returning a mapping with `input_ids`,
            `token_type_ids` and `attention_mask` tensors.
        mode: 'train' or 'valid' to also return the label; any other value
            (e.g. 'test') returns the inputs only.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, mode='train', max_length=512):
        self.dataset = data
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return `(input_ids, token_type_ids, attention_mask[, label])` for row `idx`."""
        # Positional access: `series[idx]` is label-based and raises KeyError
        # (or picks the wrong row) when the frame's index is not 0..n-1.
        text = self.dataset['Utterance'].iloc[idx]
        inputs = self.tokenizer(text, padding='max_length', max_length=self.max_length,
                                truncation=True, return_tensors='pt')

        # The tokenizer output has a leading batch dimension of size 1; drop it
        # so the DataLoader can stack the items itself.
        input_ids = inputs['input_ids'][0]
        token_type_ids = inputs['token_type_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if self.mode in ['train', 'valid']:
            y = self.dataset['Target'].iloc[idx]
            return input_ids, token_type_ids, attention_mask, y

        return input_ids, token_type_ids, attention_mask

In [19]:
# Wrap both splits; the 'train' and 'valid' modes both return the label with each item.
train_dataset = CustomDataset(train_csv, tokenizer, mode='train')
valid_dataset = CustomDataset(valid_csv, tokenizer, mode='valid')

In [20]:
# Batch both splits with the configured batch size; only the training loader
# shuffles (DataLoader's default is shuffle=False).
train_dataloader = DataLoader(dataset=train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=CFG['BATCH_SIZE'])

## Model

In [12]:
class BaseModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output classes; defaults to the number of
            classes known to the label encoder `le` when this class is defined.
    """

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):
        super().__init__()

        self.bert = BertModel.from_pretrained(CFG['CHECKPOINT'])
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded encoder instead of hard-coding
        # 768, so other checkpoint sizes work as well.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch.

        The logits are deliberately NOT passed through an activation:
        `nn.CrossEntropyLoss` applies log-softmax itself, and a ReLU here
        would clamp every negative logit to zero and block its gradient.
        ReLU has no parameters, so existing state dicts still load.
        """
        _, pooled_output = self.bert(input_ids=input_ids,
                                     attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

## Train

In [22]:
def train(model, optimizer, scheduler, train_loader, valid_loader, device):
    """Fine-tune `model` with early stopping and return it with its best weights.

    After every epoch the model is scored with `validation`. Whenever the
    validation F1 improves, the weights are snapshotted in memory and written
    to `<data_path>/best_model/best_model.pt`. Training stops once the score
    has not improved for `CFG['EARLY_STOP']` consecutive epochs or after
    `CFG['EPOCHS']` epochs. Metrics are logged with `wandb.log`, so a wandb
    run must already be initialised.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        optimizer: Optimizer over `model`'s parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        train_loader: Yields `(input_ids, token_type_ids, attention_mask, label)`.
        valid_loader: Same batch layout, passed to `validation`.
        device: Device the model and batches are moved to.

    Returns:
        `model`, loaded with the weights of the best-scoring epoch (or with its
        final weights if the validation score never rose above 0).
    """
    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    # Mixed precision is only meaningful on CUDA; enabling it on CPU just emits warnings.
    use_amp = bool(CFG['USE_AMP']) and torch.device(device).type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    best_score = 0
    best_state = None
    best_model_path = os.path.join(data_path, 'best_model')

    patience = 0

    for epoch_num in range(CFG["EPOCHS"]):
        model.train()
        train_loss = []

        for input_ids, _token_type_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                output = model(input_id, mask)
                batch_loss = criterion(output, train_label.long())

            scaler.scale(batch_loss).backward()
            # Gradients must be unscaled before clipping so that MAX_NORM
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=CFG['MAX_NORM'])
            scaler.step(optimizer)
            scaler.update()

            scheduler.step()

            train_loss.append(batch_loss.item())

        valid_loss, valid_score = validation(model, criterion, valid_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Valid Loss : [{np.mean(valid_loss) :.5f}] Valid F1 Score : [{valid_score:.5f}]')
        wandb.log({'train_loss': np.mean(train_loss), 'valid_loss': np.mean(valid_loss), 'valid_f1_score': valid_score})

        if best_score < valid_score:
            print(f'best F1 Score: {best_score} → {valid_score}')

            best_score = valid_score
            patience = 0

            # Snapshot the best weights on the CPU. `model` is a plain
            # nn.Module, so it has no `save_pretrained`; the state dict is
            # saved with torch.save instead.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            os.makedirs(best_model_path, exist_ok=True)
            torch.save(best_state, os.path.join(best_model_path, 'best_model.pt'))

        else:
            patience += 1

            if patience >= CFG['EARLY_STOP']:
                break

    # Restore the best epoch so the caller does not receive the weights of the
    # last, possibly worse, epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model

In [13]:
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of `pred` measured against `true`."""
    macro_f1 = f1_score(y_true=true, y_pred=pred, average='macro')
    return macro_f1

def validation(model, criterion, test_loader, device):
    """Evaluate `model` on a labelled loader without tracking gradients.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        criterion: Loss function applied to `(logits, labels)`.
        test_loader: Yields `(input_ids, token_type_ids, attention_mask, label)`.
        device: Device the batches are moved to.

    Returns:
        A tuple `(losses, f1)`: the list of per-batch loss values and the
        `competition_metric` score over all predictions.
    """
    model.eval()

    losses, predictions, targets = [], [], []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            # token_type_ids (second field) is not consumed by the model.
            input_ids, _, attention_mask, labels = batch
            labels = labels.to(device)

            logits = model(input_ids.to(device), attention_mask.to(device))

            losses.append(criterion(logits, labels.long()).item())
            predictions.extend(logits.argmax(1).tolist())
            targets.extend(labels.tolist())

    return losses, competition_metric(targets, predictions)

## Load Model

In [15]:
# Build the classifier from the pretrained checkpoint. The trailing
# `model.eval()` switches it to evaluation mode and, as the cell's last
# expression, displays the module structure; train() calls model.train() itself.
model = BaseModel()
model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

## Optimizer & Scheduler

In [27]:
# The schedule spans the maximum number of optimizer steps (all CFG['EPOCHS']
# epochs), even though early stopping may end training sooner.
num_training_steps = len(train_dataloader) * CFG['EPOCHS']
optimizer = AdamW(model.parameters(), lr=CFG["LEARNING_RATE"])
lr_scheduler = get_scheduler(
    CFG['SCHEDULER_NAME'],
    optimizer,
    num_warmup_steps=CFG['NUM_WARMUP_STEPS'],
    num_training_steps=num_training_steps,
)

## WandB Setting

In [28]:
# wandb is imported here rather than in the import cell at the top; train()
# calls wandb.log, so this cell must run before training starts.
import wandb

# Start a Weights & Biases run under this project name.
wandb.init(project='dacon_sentiment_analysis')

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Run 🔥

In [None]:
# Run training; the value returned by train() is kept as `infer_model` for the cells below.
infer_model = train(model, optimizer, lr_scheduler, train_dataloader, valid_dataloader, device)
# Close the wandb run opened by wandb.init.
wandb.finish()

## Inference

In [18]:
# Directory on the mounted Drive where the inference weights (baseline.pt) are written and read.
best_model_path = '/content/drive/MyDrive/dacon_sentiment_analysis/best_model'

In [49]:


# Persist the trained weights so that inference can run without retraining.
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists.
os.makedirs(best_model_path, exist_ok=True)

torch.save(infer_model.state_dict(), os.path.join(best_model_path, 'baseline.pt'))

In [19]:
# Rebuild the architecture and load the saved weights.
# map_location='cpu' lets a checkpoint saved from a GPU model load on a
# CPU-only runtime; inference() moves the model to the target device later.
infer_model = BaseModel()
infer_model.load_state_dict(
    torch.load(os.path.join(best_model_path, 'baseline.pt'), map_location='cpu')
)
infer_model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [21]:
# Read the unlabelled test table and batch it in file order (no shuffling), so
# predictions stay aligned with the rows. The DataFrame gets its own name so
# it is no longer shadowed by the Dataset bound to `test`.
test_csv = pd.read_csv(os.path.join(data_path, 'test.csv'))
test = CustomDataset(test_csv, tokenizer, mode='test')
test_dataloader = DataLoader(dataset=test, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [22]:
def inference(model, test_loader, device):
    """Predict a class id for every item of an unlabelled loader.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        test_loader: Yields `(input_ids, token_type_ids, attention_mask)`.
        device: Device the model and batches are moved to.

    Returns:
        List of predicted class indices (ints), in loader order.
    """
    model.to(device)
    model.eval()

    test_predict = []
    # Without no_grad every forward pass builds an autograd graph, which wastes
    # memory and time during pure prediction.
    with torch.no_grad():
        for input_ids, _token_type_ids, attention_mask in tqdm(test_loader):
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)
            y_pred = model(input_id, mask)
            test_predict += y_pred.argmax(1).cpu().numpy().tolist()

    print('Done.')
    return test_predict

In [23]:
# Release cached, unused GPU memory held by PyTorch's allocator before inference.
torch.cuda.empty_cache()

In [24]:
# Re-check GPU memory usage before running inference.
!nvidia-smi

Wed Nov  9 09:42:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    29W /  70W |    816MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
# Predict class ids for the test set, then map them back to the original
# labels with the encoder fitted on the training targets.
preds = inference(infer_model, test_dataloader, device)
preds = le.inverse_transform(preds)

  0%|          | 0/327 [00:00<?, ?it/s]

Done.


## Submit

In [26]:
# Load the submission template (ID and Target columns) and preview it.
submit = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
submit.head()

Unnamed: 0,ID,Target
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [27]:
# Overwrite the placeholder targets with the predicted labels; this relies on
# `preds` being in the same row order as the template — TODO confirm test.csv
# and sample_submission.csv share their ordering.
submit['Target'] = preds
submit.head()

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,joy
4,TEST_0004,anger


In [28]:
# Write the submission file to the current working directory, without the index column.
submit.to_csv('./submit.csv', index=False)